2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
24 * Authors: Dave Airlie
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
59 #ifdef CONFIG_DRM_AMDGPU_CIK
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_pmu.h"
75 #include "amdgpu_fru_eeprom.h"
76 #include "amdgpu_reset.h"
77 #include "amdgpu_virt.h"
78 #include "amdgpu_dev_coredump.h"
80 #include <linux/suspend.h>
81 #include <drm/task_barrier.h>
82 #include <linux/pm_runtime.h>
84 #include <drm/drm_drv.h>
86 #if IS_ENABLED(CONFIG_X86)
87 #include <asm/intel-family.h>
90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
98 #define AMDGPU_RESUME_MS 2000
99 #define AMDGPU_MAX_RETRY_LIMIT 2
100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
105 static const struct drm_driver amdgpu_kms_driver;
107 const char *amdgpu_asic_name[] = {
148 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
150 * Default init level where all blocks are expected to be initialized. This is
151 * the level of initialization expected by default and also after a full reset
154 struct amdgpu_init_level amdgpu_init_default = {
155 .level = AMDGPU_INIT_LEVEL_DEFAULT,
156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
159 struct amdgpu_init_level amdgpu_init_recovery = {
160 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
165 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
166 * is used for cases like reset on initialization where the entire hive needs to
167 * be reset before first use.
169 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
170 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
171 .hwini_ip_block_mask =
172 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
173 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
174 BIT(AMD_IP_BLOCK_TYPE_PSP)
177 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
178 enum amd_ip_block_type block)
180 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
183 void amdgpu_set_init_level(struct amdgpu_device *adev,
184 enum amdgpu_init_lvl_id lvl)
187 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
188 adev->init_lvl = &amdgpu_init_minimal_xgmi;
190 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
191 adev->init_lvl = &amdgpu_init_recovery;
193 case AMDGPU_INIT_LEVEL_DEFAULT:
196 adev->init_lvl = &amdgpu_init_default;
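/*
 * Illustrative sketch, not part of the driver: a reset-on-init style flow
 * would switch to the minimal XGMI level before resetting the whole hive and
 * restore the default level once the reset is done. The function name is
 * hypothetical.
 */
static void amdgpu_example_hive_reset_levels(struct amdgpu_device *adev)
{
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_MINIMAL_XGMI);

	/* ... reset and minimally re-init every device in the hive ... */

	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
}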
201 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
204 * DOC: pcie_replay_count
206 * The amdgpu driver provides a sysfs API for reporting the total number
207 * of PCIe replays (NAKs).
208 * The file pcie_replay_count is used for this and returns the total
209 * number of replays as a sum of the NAKs generated and NAKs received.
212 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
213 struct device_attribute *attr, char *buf)
215 struct drm_device *ddev = dev_get_drvdata(dev);
216 struct amdgpu_device *adev = drm_to_adev(ddev);
217 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
219 return sysfs_emit(buf, "%llu\n", cnt);
222 static DEVICE_ATTR(pcie_replay_count, 0444,
223 amdgpu_device_get_pcie_replay_count, NULL);
225 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
226 struct bin_attribute *attr, char *buf,
227 loff_t ppos, size_t count)
229 struct device *dev = kobj_to_dev(kobj);
230 struct drm_device *ddev = dev_get_drvdata(dev);
231 struct amdgpu_device *adev = drm_to_adev(ddev);
235 case AMDGPU_SYS_REG_STATE_XGMI:
236 bytes_read = amdgpu_asic_get_reg_state(
237 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
239 case AMDGPU_SYS_REG_STATE_WAFL:
240 bytes_read = amdgpu_asic_get_reg_state(
241 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
243 case AMDGPU_SYS_REG_STATE_PCIE:
244 bytes_read = amdgpu_asic_get_reg_state(
245 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
247 case AMDGPU_SYS_REG_STATE_USR:
248 bytes_read = amdgpu_asic_get_reg_state(
249 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
251 case AMDGPU_SYS_REG_STATE_USR_1:
252 bytes_read = amdgpu_asic_get_reg_state(
253 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
262 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
263 AMDGPU_SYS_REG_STATE_END);
265 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
269 if (!amdgpu_asic_get_reg_state_supported(adev))
272 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
277 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
279 if (!amdgpu_asic_get_reg_state_supported(adev))
281 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
284 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
288 if (ip_block->version->funcs->suspend) {
289 r = ip_block->version->funcs->suspend(ip_block);
291 dev_err(ip_block->adev->dev,
292 "suspend of IP block <%s> failed %d\n",
293 ip_block->version->funcs->name, r);
298 ip_block->status.hw = false;
302 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
306 if (ip_block->version->funcs->resume) {
307 r = ip_block->version->funcs->resume(ip_block);
309 dev_err(ip_block->adev->dev,
310 "resume of IP block <%s> failed %d\n",
311 ip_block->version->funcs->name, r);
316 ip_block->status.hw = true;
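/*
 * Illustrative sketch, not part of the driver: suspend a single IP block,
 * here GFX, through the helper above. The function name is hypothetical.
 */
static int amdgpu_example_suspend_gfx(struct amdgpu_device *adev)
{
	struct amdgpu_ip_block *ip_block;

	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
	if (!ip_block)
		return -ENODEV;

	return amdgpu_ip_block_suspend(ip_block);
}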
323 * The amdgpu driver provides a sysfs API for giving board related information.
324 * It provides the form factor information in the format
328 * Possible form factor values
330 * - "cem" - PCIE CEM card
331 * - "oam" - Open Compute Accelerator Module
332 * - "unknown" - Not known
336 static ssize_t amdgpu_device_get_board_info(struct device *dev,
337 struct device_attribute *attr,
340 struct drm_device *ddev = dev_get_drvdata(dev);
341 struct amdgpu_device *adev = drm_to_adev(ddev);
342 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
345 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
346 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
349 case AMDGPU_PKG_TYPE_CEM:
352 case AMDGPU_PKG_TYPE_OAM:
360 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
363 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
365 static struct attribute *amdgpu_board_attrs[] = {
366 &dev_attr_board_info.attr,
370 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
371 struct attribute *attr, int n)
373 struct device *dev = kobj_to_dev(kobj);
374 struct drm_device *ddev = dev_get_drvdata(dev);
375 struct amdgpu_device *adev = drm_to_adev(ddev);
377 if (adev->flags & AMD_IS_APU)
383 static const struct attribute_group amdgpu_board_attrs_group = {
384 .attrs = amdgpu_board_attrs,
385 .is_visible = amdgpu_board_attrs_is_visible
388 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
392 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
394 * @dev: drm_device pointer
396 * Returns true if the device is a dGPU with ATPX power control,
397 * otherwise returns false.
399 bool amdgpu_device_supports_px(struct drm_device *dev)
401 struct amdgpu_device *adev = drm_to_adev(dev);
403 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
409 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
411 * @dev: drm_device pointer
413 * Returns true if the device is a dGPU with ACPI power control,
414 * otherwise returns false.
416 bool amdgpu_device_supports_boco(struct drm_device *dev)
418 struct amdgpu_device *adev = drm_to_adev(dev);
420 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
424 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
430 * amdgpu_device_supports_baco - Does the device support BACO
432 * @dev: drm_device pointer
435 * 1 if the device supports BACO;
436 * 3 if the device supports MACO (only works if BACO is supported)
437 * otherwise return 0.
439 int amdgpu_device_supports_baco(struct drm_device *dev)
441 struct amdgpu_device *adev = drm_to_adev(dev);
443 return amdgpu_asic_supports_baco(adev);
446 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
448 struct drm_device *dev;
451 dev = adev_to_drm(adev);
453 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
454 bamaco_support = amdgpu_device_supports_baco(dev);
456 switch (amdgpu_runtime_pm) {
458 if (bamaco_support & MACO_SUPPORT) {
459 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
460 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
461 } else if (bamaco_support == BACO_SUPPORT) {
462 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
463 dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
467 if (bamaco_support & BACO_SUPPORT) {
468 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
469 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
474 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
475 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
476 dev_info(adev->dev, "Using ATPX for runtime pm\n");
477 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
478 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
479 dev_info(adev->dev, "Using BOCO for runtime pm\n");
484 switch (adev->asic_type) {
487 /* BACO is not supported on vega20 and arcturus */
490 /* enable BACO as runpm mode if noretry=0 */
491 if (!adev->gmc.noretry)
492 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
495 /* enable BACO as runpm mode on CI+ */
496 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
500 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
501 if (bamaco_support & MACO_SUPPORT) {
502 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
503 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
505 dev_info(adev->dev, "Using BACO for runtime pm\n");
511 dev_info(adev->dev, "runtime pm is manually disabled\n");
518 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
519 dev_info(adev->dev, "Runtime PM not available\n");
522 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
523 * smart shift support
525 * @dev: drm_device pointer
527 * Returns true if the device is a dGPU with Smart Shift support,
528 * otherwise returns false.
530 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
532 return (amdgpu_device_supports_boco(dev) &&
533 amdgpu_acpi_is_power_shift_control_supported());
537 * VRAM access helper functions
541 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
543 * @adev: amdgpu_device pointer
544 * @pos: offset of the buffer in vram
545 * @buf: virtual address of the buffer in system memory
546 * @size: read/write size; the buffer at @buf must be at least @size bytes
547 * @write: true - write to vram, otherwise - read from vram
549 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
550 void *buf, size_t size, bool write)
553 uint32_t hi = ~0, tmp = 0;
554 uint32_t *data = buf;
558 if (!drm_dev_enter(adev_to_drm(adev), &idx))
561 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
563 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
564 for (last = pos + size; pos < last; pos += 4) {
567 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
569 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
573 WREG32_NO_KIQ(mmMM_DATA, *data++);
575 *data++ = RREG32_NO_KIQ(mmMM_DATA);
578 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
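/*
 * Illustrative sketch, not part of the driver: read one dword from VRAM
 * through the MM_INDEX/MM_DATA window using the helper above. Both @pos and
 * the size must be dword aligned. The function name is hypothetical.
 */
static u32 amdgpu_example_read_vram_dword(struct amdgpu_device *adev, loff_t pos)
{
	u32 val = 0;

	amdgpu_device_mm_access(adev, pos, &val, sizeof(val), false);
	return val;
}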
583 * amdgpu_device_aper_access - access vram via the vram aperture
585 * @adev: amdgpu_device pointer
586 * @pos: offset of the buffer in vram
587 * @buf: virtual address of the buffer in system memory
588 * @size: read/write size; the buffer at @buf must be at least @size bytes
589 * @write: true - write to vram, otherwise - read from vram
591 * Returns the number of bytes transferred.
593 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
594 void *buf, size_t size, bool write)
601 if (!adev->mman.aper_base_kaddr)
604 last = min(pos + size, adev->gmc.visible_vram_size);
606 addr = adev->mman.aper_base_kaddr + pos;
610 memcpy_toio(addr, buf, count);
611 /* Make sure the HDP write cache flush happens without any reordering
612 * after the system memory contents are sent over PCIe to the device
615 amdgpu_device_flush_hdp(adev, NULL);
617 amdgpu_device_invalidate_hdp(adev, NULL);
618 /* Make sure HDP read cache is invalidated before issuing a read
622 memcpy_fromio(buf, addr, count);
634 * amdgpu_device_vram_access - read/write a buffer in vram
636 * @adev: amdgpu_device pointer
637 * @pos: offset of the buffer in vram
638 * @buf: virtual address of the buffer in system memory
639 * @size: read/write size; the buffer at @buf must be at least @size bytes
640 * @write: true - write to vram, otherwise - read from vram
642 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
643 void *buf, size_t size, bool write)
647 /* try using the vram aperture to access vram first */
648 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
651 /* use MM to access the rest of vram */
654 amdgpu_device_mm_access(adev, pos, buf, size, write);
659 * register access helper functions.
662 /* Check if hw access should be skipped because of hotplug or device error */
663 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
665 if (adev->no_hw_access)
668 #ifdef CONFIG_LOCKDEP
670 * This is a bit complicated to understand, so worth a comment. What we assert
671 * here is that the GPU reset is not running on another thread in parallel.
673 * For this we trylock the read side of the reset semaphore, if that succeeds
674 * we know that the reset is not running in parallel.
676 * If the trylock fails we assert that we are either already holding the read
677 * side of the lock or are the reset thread itself and hold the write side of
681 if (down_read_trylock(&adev->reset_domain->sem))
682 up_read(&adev->reset_domain->sem);
684 lockdep_assert_held(&adev->reset_domain->sem);
691 * amdgpu_device_rreg - read a memory mapped IO or indirect register
693 * @adev: amdgpu_device pointer
694 * @reg: dword aligned register offset
695 * @acc_flags: access flags which require special behavior
697 * Returns the 32 bit value from the offset specified.
699 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
700 uint32_t reg, uint32_t acc_flags)
704 if (amdgpu_device_skip_hw_access(adev))
707 if ((reg * 4) < adev->rmmio_size) {
708 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
709 amdgpu_sriov_runtime(adev) &&
710 down_read_trylock(&adev->reset_domain->sem)) {
711 ret = amdgpu_kiq_rreg(adev, reg, 0);
712 up_read(&adev->reset_domain->sem);
714 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
717 ret = adev->pcie_rreg(adev, reg * 4);
720 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
726 * Byte-wide MMIO register read helper functions
727 * @offset: byte offset from MMIO start
731 * amdgpu_mm_rreg8 - read a memory mapped IO register
733 * @adev: amdgpu_device pointer
734 * @offset: byte aligned register offset
736 * Returns the 8 bit value from the offset specified.
738 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
740 if (amdgpu_device_skip_hw_access(adev))
743 if (offset < adev->rmmio_size)
744 return (readb(adev->rmmio + offset));
750 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
752 * @adev: amdgpu_device pointer
753 * @reg: dword aligned register offset
754 * @acc_flags: access flags which require special behavior
755 * @xcc_id: xcc accelerated compute core id
757 * Returns the 32 bit value from the offset specified.
759 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
760 uint32_t reg, uint32_t acc_flags,
763 uint32_t ret, rlcg_flag;
765 if (amdgpu_device_skip_hw_access(adev))
768 if ((reg * 4) < adev->rmmio_size) {
769 if (amdgpu_sriov_vf(adev) &&
770 !amdgpu_sriov_runtime(adev) &&
771 adev->gfx.rlc.rlcg_reg_access_supported &&
772 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
775 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
776 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
777 amdgpu_sriov_runtime(adev) &&
778 down_read_trylock(&adev->reset_domain->sem)) {
779 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
780 up_read(&adev->reset_domain->sem);
782 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
785 ret = adev->pcie_rreg(adev, reg * 4);
792 * Byte-wide MMIO register write helper functions
793 * @offset: byte offset from MMIO start
794 * @value: the value to be written to the register
798 * amdgpu_mm_wreg8 - write a memory mapped IO register
800 * @adev: amdgpu_device pointer
801 * @offset: byte aligned register offset
802 * @value: 8 bit value to write
804 * Writes the value specified to the offset specified.
806 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
808 if (amdgpu_device_skip_hw_access(adev))
811 if (offset < adev->rmmio_size)
812 writeb(value, adev->rmmio + offset);
818 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
820 * @adev: amdgpu_device pointer
821 * @reg: dword aligned register offset
822 * @v: 32 bit value to write to the register
823 * @acc_flags: access flags which require special behavior
825 * Writes the value specified to the offset specified.
827 void amdgpu_device_wreg(struct amdgpu_device *adev,
828 uint32_t reg, uint32_t v,
831 if (amdgpu_device_skip_hw_access(adev))
834 if ((reg * 4) < adev->rmmio_size) {
835 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
836 amdgpu_sriov_runtime(adev) &&
837 down_read_trylock(&adev->reset_domain->sem)) {
838 amdgpu_kiq_wreg(adev, reg, v, 0);
839 up_read(&adev->reset_domain->sem);
841 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
844 adev->pcie_wreg(adev, reg * 4, v);
847 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
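/*
 * Illustrative sketch, not part of the driver: a read-modify-write of a
 * register field through the helpers above. The function name is
 * hypothetical; real callers normally go through the RREG32/WREG32 style
 * macros instead of calling these helpers directly.
 */
static void amdgpu_example_update_field(struct amdgpu_device *adev, uint32_t reg,
					uint32_t mask, uint32_t val)
{
	uint32_t tmp;

	tmp = amdgpu_device_rreg(adev, reg, 0);
	tmp &= ~mask;
	tmp |= (val & mask);
	amdgpu_device_wreg(adev, reg, tmp, 0);
}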
851 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
853 * @adev: amdgpu_device pointer
854 * @reg: mmio/rlc register
856 * @xcc_id: xcc accelerated compute core id
858 * This function is invoked only for debugfs register access.
860 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
861 uint32_t reg, uint32_t v,
864 if (amdgpu_device_skip_hw_access(adev))
867 if (amdgpu_sriov_fullaccess(adev) &&
868 adev->gfx.rlc.funcs &&
869 adev->gfx.rlc.funcs->is_rlcg_access_range) {
870 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
871 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
872 } else if ((reg * 4) >= adev->rmmio_size) {
873 adev->pcie_wreg(adev, reg * 4, v);
875 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
880 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
882 * @adev: amdgpu_device pointer
883 * @reg: dword aligned register offset
884 * @v: 32 bit value to write to the register
885 * @acc_flags: access flags which require special behavior
886 * @xcc_id: xcc accelerated compute core id
888 * Writes the value specified to the offset specified.
890 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
891 uint32_t reg, uint32_t v,
892 uint32_t acc_flags, uint32_t xcc_id)
896 if (amdgpu_device_skip_hw_access(adev))
899 if ((reg * 4) < adev->rmmio_size) {
900 if (amdgpu_sriov_vf(adev) &&
901 !amdgpu_sriov_runtime(adev) &&
902 adev->gfx.rlc.rlcg_reg_access_supported &&
903 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
906 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
907 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
908 amdgpu_sriov_runtime(adev) &&
909 down_read_trylock(&adev->reset_domain->sem)) {
910 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
911 up_read(&adev->reset_domain->sem);
913 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
916 adev->pcie_wreg(adev, reg * 4, v);
921 * amdgpu_device_indirect_rreg - read an indirect register
923 * @adev: amdgpu_device pointer
924 * @reg_addr: indirect register address to read from
926 * Returns the value of indirect register @reg_addr
928 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
931 unsigned long flags, pcie_index, pcie_data;
932 void __iomem *pcie_index_offset;
933 void __iomem *pcie_data_offset;
936 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
937 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
939 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
940 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
941 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
943 writel(reg_addr, pcie_index_offset);
944 readl(pcie_index_offset);
945 r = readl(pcie_data_offset);
946 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
951 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
954 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
956 void __iomem *pcie_index_offset;
957 void __iomem *pcie_index_hi_offset;
958 void __iomem *pcie_data_offset;
960 if (unlikely(!adev->nbio.funcs)) {
961 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
962 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
964 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
965 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
968 if (reg_addr >> 32) {
969 if (unlikely(!adev->nbio.funcs))
970 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
972 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
977 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
978 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
979 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
980 if (pcie_index_hi != 0)
981 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
984 writel(reg_addr, pcie_index_offset);
985 readl(pcie_index_offset);
986 if (pcie_index_hi != 0) {
987 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
988 readl(pcie_index_hi_offset);
990 r = readl(pcie_data_offset);
992 /* clear the high bits */
993 if (pcie_index_hi != 0) {
994 writel(0, pcie_index_hi_offset);
995 readl(pcie_index_hi_offset);
998 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1004 * amdgpu_device_indirect_rreg64 - read a 64-bit indirect register
1006 * @adev: amdgpu_device pointer
1007 * @reg_addr: indirect register address to read from
1009 * Returns the value of indirect register @reg_addr
1011 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1014 unsigned long flags, pcie_index, pcie_data;
1015 void __iomem *pcie_index_offset;
1016 void __iomem *pcie_data_offset;
1019 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1020 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1022 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1023 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1024 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1026 /* read low 32 bits */
1027 writel(reg_addr, pcie_index_offset);
1028 readl(pcie_index_offset);
1029 r = readl(pcie_data_offset);
1030 /* read high 32 bits */
1031 writel(reg_addr + 4, pcie_index_offset);
1032 readl(pcie_index_offset);
1033 r |= ((u64)readl(pcie_data_offset) << 32);
1034 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1039 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1042 unsigned long flags, pcie_index, pcie_data;
1043 unsigned long pcie_index_hi = 0;
1044 void __iomem *pcie_index_offset;
1045 void __iomem *pcie_index_hi_offset;
1046 void __iomem *pcie_data_offset;
1049 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1050 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1051 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1052 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1054 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1055 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1056 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1057 if (pcie_index_hi != 0)
1058 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1061 /* read low 32 bits */
1062 writel(reg_addr, pcie_index_offset);
1063 readl(pcie_index_offset);
1064 if (pcie_index_hi != 0) {
1065 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1066 readl(pcie_index_hi_offset);
1068 r = readl(pcie_data_offset);
1069 /* read high 32 bits */
1070 writel(reg_addr + 4, pcie_index_offset);
1071 readl(pcie_index_offset);
1072 if (pcie_index_hi != 0) {
1073 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1074 readl(pcie_index_hi_offset);
1076 r |= ((u64)readl(pcie_data_offset) << 32);
1078 /* clear the high bits */
1079 if (pcie_index_hi != 0) {
1080 writel(0, pcie_index_hi_offset);
1081 readl(pcie_index_hi_offset);
1084 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1090 * amdgpu_device_indirect_wreg - write to an indirect register
1092 * @adev: amdgpu_device pointer
1093 * @reg_addr: indirect register offset
1094 * @reg_data: indirect register data
1097 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1098 u32 reg_addr, u32 reg_data)
1100 unsigned long flags, pcie_index, pcie_data;
1101 void __iomem *pcie_index_offset;
1102 void __iomem *pcie_data_offset;
1104 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1105 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1107 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1108 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1109 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1111 writel(reg_addr, pcie_index_offset);
1112 readl(pcie_index_offset);
1113 writel(reg_data, pcie_data_offset);
1114 readl(pcie_data_offset);
1115 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1118 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1119 u64 reg_addr, u32 reg_data)
1121 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1122 void __iomem *pcie_index_offset;
1123 void __iomem *pcie_index_hi_offset;
1124 void __iomem *pcie_data_offset;
1126 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1127 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1128 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1129 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1133 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1134 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1135 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1136 if (pcie_index_hi != 0)
1137 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1140 writel(reg_addr, pcie_index_offset);
1141 readl(pcie_index_offset);
1142 if (pcie_index_hi != 0) {
1143 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1144 readl(pcie_index_hi_offset);
1146 writel(reg_data, pcie_data_offset);
1147 readl(pcie_data_offset);
1149 /* clear the high bits */
1150 if (pcie_index_hi != 0) {
1151 writel(0, pcie_index_hi_offset);
1152 readl(pcie_index_hi_offset);
1155 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1159 * amdgpu_device_indirect_wreg64 - write to a 64-bit indirect register
1161 * @adev: amdgpu_device pointer
1162 * @reg_addr: indirect register offset
1163 * @reg_data: indirect register data
1166 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1167 u32 reg_addr, u64 reg_data)
1169 unsigned long flags, pcie_index, pcie_data;
1170 void __iomem *pcie_index_offset;
1171 void __iomem *pcie_data_offset;
1173 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1174 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1176 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1177 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1178 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1180 /* write low 32 bits */
1181 writel(reg_addr, pcie_index_offset);
1182 readl(pcie_index_offset);
1183 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1184 readl(pcie_data_offset);
1185 /* write high 32 bits */
1186 writel(reg_addr + 4, pcie_index_offset);
1187 readl(pcie_index_offset);
1188 writel((u32)(reg_data >> 32), pcie_data_offset);
1189 readl(pcie_data_offset);
1190 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1193 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1194 u64 reg_addr, u64 reg_data)
1196 unsigned long flags, pcie_index, pcie_data;
1197 unsigned long pcie_index_hi = 0;
1198 void __iomem *pcie_index_offset;
1199 void __iomem *pcie_index_hi_offset;
1200 void __iomem *pcie_data_offset;
1202 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1203 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1204 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1205 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1207 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1208 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1209 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1210 if (pcie_index_hi != 0)
1211 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1214 /* write low 32 bits */
1215 writel(reg_addr, pcie_index_offset);
1216 readl(pcie_index_offset);
1217 if (pcie_index_hi != 0) {
1218 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1219 readl(pcie_index_hi_offset);
1221 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1222 readl(pcie_data_offset);
1223 /* write high 32 bits */
1224 writel(reg_addr + 4, pcie_index_offset);
1225 readl(pcie_index_offset);
1226 if (pcie_index_hi != 0) {
1227 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1228 readl(pcie_index_hi_offset);
1230 writel((u32)(reg_data >> 32), pcie_data_offset);
1231 readl(pcie_data_offset);
1233 /* clear the high bits */
1234 if (pcie_index_hi != 0) {
1235 writel(0, pcie_index_hi_offset);
1236 readl(pcie_index_hi_offset);
1239 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1243 * amdgpu_device_get_rev_id - query device rev_id
1245 * @adev: amdgpu_device pointer
1247 * Return device rev_id
1249 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1251 return adev->nbio.funcs->get_rev_id(adev);
1255 * amdgpu_invalid_rreg - dummy reg read function
1257 * @adev: amdgpu_device pointer
1258 * @reg: offset of register
1260 * Dummy register read function. Used for register blocks
1261 * that certain asics don't have (all asics).
1262 * Returns the value in the register.
1264 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1266 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1271 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1273 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1279 * amdgpu_invalid_wreg - dummy reg write function
1281 * @adev: amdgpu_device pointer
1282 * @reg: offset of register
1283 * @v: value to write to the register
1285 * Dummy register write function. Used for register blocks
1286 * that certain asics don't have (all asics).
1288 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1290 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1295 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1297 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1303 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1305 * @adev: amdgpu_device pointer
1306 * @reg: offset of register
1308 * Dummy register read function. Used for register blocks
1309 * that certain asics don't have (all asics).
1310 * Returns the value in the register.
1312 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1314 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1319 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1321 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1327 * amdgpu_invalid_wreg64 - dummy reg write function
1329 * @adev: amdgpu_device pointer
1330 * @reg: offset of register
1331 * @v: value to write to the register
1333 * Dummy register write function. Used for register blocks
1334 * that certain asics don't have (all asics).
1336 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1338 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1343 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1345 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1351 * amdgpu_block_invalid_rreg - dummy reg read function
1353 * @adev: amdgpu_device pointer
1354 * @block: offset of instance
1355 * @reg: offset of register
1357 * Dummy register read function. Used for register blocks
1358 * that certain asics don't have (all asics).
1359 * Returns the value in the register.
1361 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1362 uint32_t block, uint32_t reg)
1364 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1371 * amdgpu_block_invalid_wreg - dummy reg write function
1373 * @adev: amdgpu_device pointer
1374 * @block: offset of instance
1375 * @reg: offset of register
1376 * @v: value to write to the register
1378 * Dummy register write function. Used for register blocks
1379 * that certain asics don't have (all asics).
1381 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1383 uint32_t reg, uint32_t v)
1385 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1391 * amdgpu_device_asic_init - Wrapper for atom asic_init
1393 * @adev: amdgpu_device pointer
1395 * Does any asic specific work and then calls atom asic init.
1397 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1401 amdgpu_asic_pre_asic_init(adev);
1403 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1404 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1405 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1406 amdgpu_psp_wait_for_bootloader(adev);
1407 ret = amdgpu_atomfirmware_asic_init(adev, true);
1410 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1417 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1419 * @adev: amdgpu_device pointer
1421 * Allocates a scratch page of VRAM for use by various things in the
1424 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1426 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1427 AMDGPU_GEM_DOMAIN_VRAM |
1428 AMDGPU_GEM_DOMAIN_GTT,
1429 &adev->mem_scratch.robj,
1430 &adev->mem_scratch.gpu_addr,
1431 (void **)&adev->mem_scratch.ptr);
1435 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1437 * @adev: amdgpu_device pointer
1439 * Frees the VRAM scratch page.
1441 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1443 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1447 * amdgpu_device_program_register_sequence - program an array of registers.
1449 * @adev: amdgpu_device pointer
1450 * @registers: pointer to the register array
1451 * @array_size: size of the register array
1453 * Programs an array of registers with and/or masks.
1454 * This is a helper for setting golden registers.
1456 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1457 const u32 *registers,
1458 const u32 array_size)
1460 u32 tmp, reg, and_mask, or_mask;
1466 for (i = 0; i < array_size; i += 3) {
1467 reg = registers[i + 0];
1468 and_mask = registers[i + 1];
1469 or_mask = registers[i + 2];
1471 if (and_mask == 0xffffffff) {
1476 if (adev->family >= AMDGPU_FAMILY_AI)
1477 tmp |= (or_mask & and_mask);
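/*
 * Illustrative sketch, not part of the driver: callers pass a flat array of
 * {offset, and_mask, or_mask} triplets (the values below are made up):
 *
 *	static const u32 example_golden_regs[] = {
 *		0x0000262c, 0xffffffff, 0x00000800,
 *		0x00009834, 0xf00fffff, 0x00000400,
 *	};
 *	amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *						ARRAY_SIZE(example_golden_regs));
 */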
1486 * amdgpu_device_pci_config_reset - reset the GPU
1488 * @adev: amdgpu_device pointer
1490 * Resets the GPU using the pci config reset sequence.
1491 * Only applicable to asics prior to vega10.
1493 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1495 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1499 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1501 * @adev: amdgpu_device pointer
1503 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1505 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1507 return pci_reset_function(adev->pdev);
1511 * amdgpu_device_wb_*()
1512 * Writeback is the method by which the GPU updates special pages in memory
1513 * with the status of certain GPU events (fences, ring pointers, etc.).
1517 * amdgpu_device_wb_fini - Disable Writeback and free memory
1519 * @adev: amdgpu_device pointer
1521 * Disables Writeback and frees the Writeback memory (all asics).
1522 * Used at driver shutdown.
1524 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1526 if (adev->wb.wb_obj) {
1527 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1529 (void **)&adev->wb.wb);
1530 adev->wb.wb_obj = NULL;
1535 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1537 * @adev: amdgpu_device pointer
1539 * Initializes writeback and allocates writeback memory (all asics).
1540 * Used at driver startup.
1541 * Returns 0 on success or an -error on failure.
1543 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1547 if (adev->wb.wb_obj == NULL) {
1548 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1549 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1550 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1551 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1552 (void **)&adev->wb.wb);
1554 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1558 adev->wb.num_wb = AMDGPU_MAX_WB;
1559 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1561 /* clear wb memory */
1562 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1569 * amdgpu_device_wb_get - Allocate a wb entry
1571 * @adev: amdgpu_device pointer
1574 * Allocate a wb slot for use by the driver (all asics).
1575 * Returns 0 on success or -EINVAL on failure.
1577 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1579 unsigned long flags, offset;
1581 spin_lock_irqsave(&adev->wb.lock, flags);
1582 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1583 if (offset < adev->wb.num_wb) {
1584 __set_bit(offset, adev->wb.used);
1585 spin_unlock_irqrestore(&adev->wb.lock, flags);
1586 *wb = offset << 3; /* convert to dw offset */
1589 spin_unlock_irqrestore(&adev->wb.lock, flags);
1595 * amdgpu_device_wb_free - Free a wb entry
1597 * @adev: amdgpu_device pointer
1600 * Free a wb slot allocated for use by the driver (all asics)
1602 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1604 unsigned long flags;
1607 spin_lock_irqsave(&adev->wb.lock, flags);
1608 if (wb < adev->wb.num_wb)
1609 __clear_bit(wb, adev->wb.used);
1610 spin_unlock_irqrestore(&adev->wb.lock, flags);
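/*
 * Illustrative sketch, not part of the driver: allocate a writeback slot,
 * clear it through the CPU mapping and free it again. The dword offset
 * returned in @wb also gives the GPU address as adev->wb.gpu_addr + wb * 4.
 * The function name is hypothetical.
 */
static int amdgpu_example_use_wb(struct amdgpu_device *adev)
{
	u32 wb;
	int r;

	r = amdgpu_device_wb_get(adev, &wb);
	if (r)
		return r;

	adev->wb.wb[wb] = 0;
	/* ... point an engine at adev->wb.gpu_addr + wb * 4 and wait ... */

	amdgpu_device_wb_free(adev, wb);
	return 0;
}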
1614 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1616 * @adev: amdgpu_device pointer
1618 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1619 * to fail, but if any of the BARs is not accessible after the resize we abort
1620 * driver loading by returning -ENODEV.
1622 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1624 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1625 struct pci_bus *root;
1626 struct resource *res;
1631 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1635 if (amdgpu_sriov_vf(adev))
1638 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1639 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1640 DRM_WARN("System can't access extended configuration space, please check!!\n");
1642 /* skip if the bios has already enabled large BAR */
1643 if (adev->gmc.real_vram_size &&
1644 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1647 /* Check if the root BUS has 64bit memory resources */
1648 root = adev->pdev->bus;
1649 while (root->parent)
1650 root = root->parent;
1652 pci_bus_for_each_resource(root, res, i) {
1653 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1654 res->start > 0x100000000ull)
1658 /* Trying to resize is pointless without a root hub window above 4GB */
1662 /* Limit the BAR size to what is available */
1663 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1666 /* Disable memory decoding while we change the BAR addresses and size */
1667 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1668 pci_write_config_word(adev->pdev, PCI_COMMAND,
1669 cmd & ~PCI_COMMAND_MEMORY);
1671 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1672 amdgpu_doorbell_fini(adev);
1673 if (adev->asic_type >= CHIP_BONAIRE)
1674 pci_release_resource(adev->pdev, 2);
1676 pci_release_resource(adev->pdev, 0);
1678 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1680 DRM_INFO("Not enough PCI address space for a large BAR.");
1681 else if (r && r != -ENOTSUPP)
1682 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1684 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1686 /* When the doorbell or fb BAR isn't available we have no chance of
1689 r = amdgpu_doorbell_init(adev);
1690 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1693 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1698 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1700 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1707 * GPU helpers function.
1710 * amdgpu_device_need_post - check if the hw need post or not
1712 * @adev: amdgpu_device pointer
1714 * Check if the asic has been initialized (all asics) at driver startup
1715 * or post is needed if hw reset is performed.
1716 * Returns true if post is needed, false if not.
1718 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1722 if (amdgpu_sriov_vf(adev))
1725 if (!amdgpu_device_read_bios(adev))
1728 if (amdgpu_passthrough(adev)) {
1729 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1730 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
1731 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1732 * force vPost for SMC versions below 22.15
1734 if (adev->asic_type == CHIP_FIJI) {
1738 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1739 /* force vPost if an error occurred */
1743 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1744 release_firmware(adev->pm.fw);
1745 if (fw_ver < 0x00160e00)
1750 /* Don't post if we need to reset whole hive on init */
1751 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1754 if (adev->has_hw_reset) {
1755 adev->has_hw_reset = false;
1759 /* bios scratch used on CIK+ */
1760 if (adev->asic_type >= CHIP_BONAIRE)
1761 return amdgpu_atombios_scratch_need_asic_init(adev);
1763 /* check MEM_SIZE for older asics */
1764 reg = amdgpu_asic_get_config_memsize(adev);
1766 if ((reg != 0) && (reg != 0xffffffff))
1773 * Check whether seamless boot is supported.
1775 * So far we only support seamless boot on DCE 3.0 or later.
1776 * If users report that it works on older ASICS as well, we may
1779 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1781 switch (amdgpu_seamless) {
1789 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1794 if (!(adev->flags & AMD_IS_APU))
1797 if (adev->mman.keep_stolen_vga_memory)
1800 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1804 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1805 * don't support dynamic speed switching. Until we have confirmation from Intel
1806 * that a specific host supports it, it's safer to keep it disabled for all.
1808 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1809 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1811 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1813 #if IS_ENABLED(CONFIG_X86)
1814 struct cpuinfo_x86 *c = &cpu_data(0);
1816 /* eGPUs change speeds based on USB4 fabric conditions */
1817 if (dev_is_removable(adev->dev))
1820 if (c->x86_vendor == X86_VENDOR_INTEL)
1827 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1829 * @adev: amdgpu_device pointer
1831 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1832 * be set for this device.
1834 * Returns true if it should be used or false if not.
1836 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1838 switch (amdgpu_aspm) {
1848 if (adev->flags & AMD_IS_APU)
1850 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1852 return pcie_aspm_enabled(adev->pdev);
1855 /* if we get transitioned to only one device, take VGA back */
1857 * amdgpu_device_vga_set_decode - enable/disable vga decode
1859 * @pdev: PCI device pointer
1860 * @state: enable/disable vga decode
1862 * Enable/disable vga decode (all asics).
1863 * Returns VGA resource flags.
1865 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1868 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1870 amdgpu_asic_set_vga_state(adev, state);
1872 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1873 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1875 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1879 * amdgpu_device_check_block_size - validate the vm block size
1881 * @adev: amdgpu_device pointer
1883 * Validates the vm block size specified via module parameter.
1884 * The vm block size defines the number of bits in the page table versus the
1885 * page directory; a page is 4KB, so we have a 12-bit offset, a minimum of 9
1886 * bits in the page table, and the remaining bits in the page directory.
1888 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1890 /* defines the number of bits in the page table versus the page directory;
1891 * a page is 4KB, so we have a 12-bit offset, a minimum of 9 bits in the
1892 * page table, and the remaining bits in the page directory
1894 if (amdgpu_vm_block_size == -1)
1897 if (amdgpu_vm_block_size < 9) {
1898 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1899 amdgpu_vm_block_size);
1900 amdgpu_vm_block_size = -1;
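/*
 * Worked example (illustrative): with amdgpu_vm_block_size = 9, each page
 * table holds 2^9 entries of 4KB pages and therefore maps 2MB of address
 * space per page directory entry; larger values make each page table cover
 * proportionally more space.
 */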
1905 * amdgpu_device_check_vm_size - validate the vm size
1907 * @adev: amdgpu_device pointer
1909 * Validates the vm size in GB specified via module parameter.
1910 * The VM size is the size of the GPU virtual memory space in GB.
1912 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1914 /* no need to check the default value */
1915 if (amdgpu_vm_size == -1)
1918 if (amdgpu_vm_size < 1) {
1919 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1921 amdgpu_vm_size = -1;
1925 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1928 bool is_os_64 = (sizeof(void *) == 8);
1929 uint64_t total_memory;
1930 uint64_t dram_size_seven_GB = 0x1B8000000;
1931 uint64_t dram_size_three_GB = 0xB8000000;
1933 if (amdgpu_smu_memory_pool_size == 0)
1937 DRM_WARN("Not 64-bit OS, feature not supported\n");
1941 total_memory = (uint64_t)si.totalram * si.mem_unit;
1943 if ((amdgpu_smu_memory_pool_size == 1) ||
1944 (amdgpu_smu_memory_pool_size == 2)) {
1945 if (total_memory < dram_size_three_GB)
1947 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1948 (amdgpu_smu_memory_pool_size == 8)) {
1949 if (total_memory < dram_size_seven_GB)
1952 DRM_WARN("Smu memory pool size not supported\n");
1955 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1960 DRM_WARN("Not enough system memory\n");
1962 adev->pm.smu_prv_buffer_size = 0;
1965 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1967 if (!(adev->flags & AMD_IS_APU) ||
1968 adev->asic_type < CHIP_RAVEN)
1971 switch (adev->asic_type) {
1973 if (adev->pdev->device == 0x15dd)
1974 adev->apu_flags |= AMD_APU_IS_RAVEN;
1975 if (adev->pdev->device == 0x15d8)
1976 adev->apu_flags |= AMD_APU_IS_PICASSO;
1979 if ((adev->pdev->device == 0x1636) ||
1980 (adev->pdev->device == 0x164c))
1981 adev->apu_flags |= AMD_APU_IS_RENOIR;
1983 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1986 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1988 case CHIP_YELLOW_CARP:
1990 case CHIP_CYAN_SKILLFISH:
1991 if ((adev->pdev->device == 0x13FE) ||
1992 (adev->pdev->device == 0x143F))
1993 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2003 * amdgpu_device_check_arguments - validate module params
2005 * @adev: amdgpu_device pointer
2007 * Validates certain module parameters and updates
2008 * the associated values used by the driver (all asics).
2010 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2014 if (amdgpu_sched_jobs < 4) {
2015 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2017 amdgpu_sched_jobs = 4;
2018 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2019 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2021 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2024 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2025 /* gart size must be greater than or equal to 32M */
2026 dev_warn(adev->dev, "gart size (%d) too small\n",
2028 amdgpu_gart_size = -1;
2031 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2032 /* gtt size must be greater than or equal to 32M */
2033 dev_warn(adev->dev, "gtt size (%d) too small\n",
2035 amdgpu_gtt_size = -1;
2038 /* valid range is between 4 and 9 inclusive */
2039 if (amdgpu_vm_fragment_size != -1 &&
2040 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2041 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2042 amdgpu_vm_fragment_size = -1;
2045 if (amdgpu_sched_hw_submission < 2) {
2046 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2047 amdgpu_sched_hw_submission);
2048 amdgpu_sched_hw_submission = 2;
2049 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2050 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2051 amdgpu_sched_hw_submission);
2052 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2055 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2056 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2057 amdgpu_reset_method = -1;
2060 amdgpu_device_check_smu_prv_buffer_size(adev);
2062 amdgpu_device_check_vm_size(adev);
2064 amdgpu_device_check_block_size(adev);
2066 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2068 for (i = 0; i < MAX_XCP; i++)
2069 adev->enforce_isolation[i] = !!enforce_isolation;
2075 * amdgpu_switcheroo_set_state - set switcheroo state
2077 * @pdev: pci dev pointer
2078 * @state: vga_switcheroo state
2080 * Callback for the switcheroo driver. Suspends or resumes
2081 * the asic before or after it is powered up using ACPI methods.
2083 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2084 enum vga_switcheroo_state state)
2086 struct drm_device *dev = pci_get_drvdata(pdev);
2089 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2092 if (state == VGA_SWITCHEROO_ON) {
2093 pr_info("switched on\n");
2094 /* don't suspend or resume card normally */
2095 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2097 pci_set_power_state(pdev, PCI_D0);
2098 amdgpu_device_load_pci_state(pdev);
2099 r = pci_enable_device(pdev);
2101 DRM_WARN("pci_enable_device failed (%d)\n", r);
2102 amdgpu_device_resume(dev, true);
2104 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2106 pr_info("switched off\n");
2107 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2108 amdgpu_device_prepare(dev);
2109 amdgpu_device_suspend(dev, true);
2110 amdgpu_device_cache_pci_state(pdev);
2111 /* Shut down the device */
2112 pci_disable_device(pdev);
2113 pci_set_power_state(pdev, PCI_D3cold);
2114 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2119 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2121 * @pdev: pci dev pointer
2123 * Callback for the switcheroo driver. Check if the switcheroo
2124 * state can be changed.
2125 * Returns true if the state can be changed, false if not.
2127 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2129 struct drm_device *dev = pci_get_drvdata(pdev);
2132 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2133 * locking inversion with the driver load path. And the access here is
2134 * completely racy anyway. So don't bother with locking for now.
2136 return atomic_read(&dev->open_count) == 0;
2139 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2140 .set_gpu_state = amdgpu_switcheroo_set_state,
2142 .can_switch = amdgpu_switcheroo_can_switch,
2146 * amdgpu_device_ip_set_clockgating_state - set the CG state
2148 * @dev: amdgpu_device pointer
2149 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2150 * @state: clockgating state (gate or ungate)
2152 * Sets the requested clockgating state for all instances of
2153 * the hardware IP specified.
2154 * Returns the error code from the last instance.
2156 int amdgpu_device_ip_set_clockgating_state(void *dev,
2157 enum amd_ip_block_type block_type,
2158 enum amd_clockgating_state state)
2160 struct amdgpu_device *adev = dev;
2163 for (i = 0; i < adev->num_ip_blocks; i++) {
2164 if (!adev->ip_blocks[i].status.valid)
2166 if (adev->ip_blocks[i].version->type != block_type)
2168 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2170 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2171 (void *)adev, state);
2173 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2174 adev->ip_blocks[i].version->funcs->name, r);
2180 * amdgpu_device_ip_set_powergating_state - set the PG state
2182 * @dev: amdgpu_device pointer
2183 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2184 * @state: powergating state (gate or ungate)
2186 * Sets the requested powergating state for all instances of
2187 * the hardware IP specified.
2188 * Returns the error code from the last instance.
2190 int amdgpu_device_ip_set_powergating_state(void *dev,
2191 enum amd_ip_block_type block_type,
2192 enum amd_powergating_state state)
2194 struct amdgpu_device *adev = dev;
2197 for (i = 0; i < adev->num_ip_blocks; i++) {
2198 if (!adev->ip_blocks[i].status.valid)
2200 if (adev->ip_blocks[i].version->type != block_type)
2202 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2204 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2205 (void *)adev, state);
2207 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2208 adev->ip_blocks[i].version->funcs->name, r);
2214 * amdgpu_device_ip_get_clockgating_state - get the CG state
2216 * @adev: amdgpu_device pointer
2217 * @flags: clockgating feature flags
2219 * Walks the list of IPs on the device and updates the clockgating
2220 * flags for each IP.
2221 * Updates @flags with the feature flags for each hardware IP where
2222 * clockgating is enabled.
2224 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2229 for (i = 0; i < adev->num_ip_blocks; i++) {
2230 if (!adev->ip_blocks[i].status.valid)
2232 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2233 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
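/*
 * Usage sketch (illustrative; the u64 type of the flags argument is an
 * assumption based on the @flags kernel-doc above):
 *
 *	u64 flags = 0;
 *
 *	amdgpu_device_ip_get_clockgating_state(adev, &flags);
 *	dev_info(adev->dev, "clockgating flags: 0x%llx\n", flags);
 */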
2238 * amdgpu_device_ip_wait_for_idle - wait for idle
2240 * @adev: amdgpu_device pointer
2241 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2243 * Waits for the requested hardware IP to be idle.
2244 * Returns 0 for success or a negative error code on failure.
2246 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2247 enum amd_ip_block_type block_type)
2251 for (i = 0; i < adev->num_ip_blocks; i++) {
2252 if (!adev->ip_blocks[i].status.valid)
2254 if (adev->ip_blocks[i].version->type == block_type) {
2255 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2256 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2257 &adev->ip_blocks[i]);
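/*
 * Usage sketch (illustrative only): callers typically quiesce a block before
 * reprogramming it, e.g.:
 *
 *	r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
 *	if (r)
 *		dev_err(adev->dev, "GMC did not become idle (%d)\n", r);
 */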
2269 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2271 * @adev: amdgpu_device pointer
2272 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2274 * Check if the hardware IP is enabled or not.
2275 * Returns true if the IP is enabled, false if not.
2277 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2278 enum amd_ip_block_type block_type)
2282 for (i = 0; i < adev->num_ip_blocks; i++) {
2283 if (adev->ip_blocks[i].version->type == block_type)
2284 return adev->ip_blocks[i].status.valid;
2291 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2293 * @adev: amdgpu_device pointer
2294 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2296 * Returns a pointer to the hardware IP block structure
2297 * if it exists for the asic, otherwise NULL.
2299 struct amdgpu_ip_block *
2300 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2301 enum amd_ip_block_type type)
2305 for (i = 0; i < adev->num_ip_blocks; i++)
2306 if (adev->ip_blocks[i].version->type == type)
2307 return &adev->ip_blocks[i];
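/*
 * Usage sketch (illustrative): the returned pointer must be checked before
 * its status or callbacks are used, e.g.:
 *
 *	struct amdgpu_ip_block *ip_block;
 *
 *	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip_block && ip_block->status.valid)
 *		amdgpu_amdkfd_device_probe(adev);
 */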
2313 * amdgpu_device_ip_block_version_cmp
2315 * @adev: amdgpu_device pointer
2316 * @type: enum amd_ip_block_type
2317 * @major: major version
2318 * @minor: minor version
2320 * Return 0 if the IP block's version is equal to or greater than the one requested,
2321 * return 1 if it is smaller or the ip_block doesn't exist.
2323 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2324 enum amd_ip_block_type type,
2325 u32 major, u32 minor)
2327 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2329 if (ip_block && ((ip_block->version->major > major) ||
2330 ((ip_block->version->major == major) &&
2331 (ip_block->version->minor >= minor))))
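/*
 * Usage sketch (illustrative; the version numbers and helper name are made
 * up for this example):
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       7, 1) == 0)
 *		enable_new_smc_feature(adev);	// hypothetical helper
 */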
2338 * amdgpu_device_ip_block_add
2340 * @adev: amdgpu_device pointer
2341 * @ip_block_version: pointer to the IP to add
2343 * Adds the IP block driver information to the collection of IPs on the asic.
2346 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2347 const struct amdgpu_ip_block_version *ip_block_version)
2349 if (!ip_block_version)
2352 switch (ip_block_version->type) {
2353 case AMD_IP_BLOCK_TYPE_VCN:
2354 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2357 case AMD_IP_BLOCK_TYPE_JPEG:
2358 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2365 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2366 ip_block_version->funcs->name);
2368 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2370 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
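/*
 * Usage sketch (illustrative): ASIC setup code such as vi_set_ip_blocks()
 * registers each block in hardware init order, roughly:
 *
 *	r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	if (r)
 *		return r;
 */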
2376 * amdgpu_device_enable_virtual_display - enable virtual display feature
2378 * @adev: amdgpu_device pointer
2380 * Enables the virtual display feature if the user has enabled it via
2381 * the module parameter virtual_display. This feature provides a virtual
2382 * display hardware on headless boards or in virtualized environments.
2383 * This function parses and validates the configuration string specified by
2384 * the user and configures the virtual display configuration (number of
2385 * virtual connectors, crtcs, etc.) specified.
2387 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2389 adev->enable_virtual_display = false;
2391 if (amdgpu_virtual_display) {
2392 const char *pci_address_name = pci_name(adev->pdev);
2393 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2395 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2396 pciaddstr_tmp = pciaddstr;
2397 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2398 pciaddname = strsep(&pciaddname_tmp, ",");
2399 if (!strcmp("all", pciaddname)
2400 || !strcmp(pci_address_name, pciaddname)) {
2404 adev->enable_virtual_display = true;
2407 res = kstrtol(pciaddname_tmp, 10,
2415 adev->mode_info.num_crtc = num_crtc;
2417 adev->mode_info.num_crtc = 1;
2423 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2424 amdgpu_virtual_display, pci_address_name,
2425 adev->enable_virtual_display, adev->mode_info.num_crtc);
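/*
 * Illustrative examples of the string format parsed above: a ';' separated
 * list of "pci-address,num_crtc" entries, with "all" matching every device
 * and num_crtc falling back to 1 when it cannot be parsed:
 *
 *	amdgpu.virtual_display=0000:03:00.0,2;0000:04:00.0,1
 *	amdgpu.virtual_display=all,1
 */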
2431 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2433 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2434 adev->mode_info.num_crtc = 1;
2435 adev->enable_virtual_display = true;
2436 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2437 adev->enable_virtual_display, adev->mode_info.num_crtc);
2442 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2444 * @adev: amdgpu_device pointer
2446 * Parses the asic configuration parameters specified in the gpu info
2447 * firmware and makes them available to the driver for use in configuring
2449 * Returns 0 on success, -EINVAL on failure.
2451 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2453 const char *chip_name;
2455 const struct gpu_info_firmware_header_v1_0 *hdr;
2457 adev->firmware.gpu_info_fw = NULL;
2459 if (adev->mman.discovery_bin)
2462 switch (adev->asic_type) {
2466 chip_name = "vega10";
2469 chip_name = "vega12";
2472 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2473 chip_name = "raven2";
2474 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2475 chip_name = "picasso";
2477 chip_name = "raven";
2480 chip_name = "arcturus";
2483 chip_name = "navi12";
2487 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2488 "amdgpu/%s_gpu_info.bin", chip_name);
2491 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2496 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2497 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2499 switch (hdr->version_major) {
2502 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2503 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2504 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2507 * Should be dropped when DAL no longer needs it.
2509 if (adev->asic_type == CHIP_NAVI12)
2510 goto parse_soc_bounding_box;
2512 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2513 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2514 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2515 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2516 adev->gfx.config.max_texture_channel_caches =
2517 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2518 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2519 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2520 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2521 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2522 adev->gfx.config.double_offchip_lds_buf =
2523 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2524 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2525 adev->gfx.cu_info.max_waves_per_simd =
2526 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2527 adev->gfx.cu_info.max_scratch_slots_per_cu =
2528 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2529 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2530 if (hdr->version_minor >= 1) {
2531 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2532 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2533 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2534 adev->gfx.config.num_sc_per_sh =
2535 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2536 adev->gfx.config.num_packer_per_sc =
2537 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2540 parse_soc_bounding_box:
2542 * soc bounding box info is not integrated in the discovery table,
2543 * we always need to parse it from gpu info firmware if needed.
2545 if (hdr->version_minor == 2) {
2546 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2547 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2548 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2549 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2555 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2564 * amdgpu_device_ip_early_init - run early init for hardware IPs
2566 * @adev: amdgpu_device pointer
2568 * Early initialization pass for hardware IPs. The hardware IPs that make
2569 * up each asic are discovered and each IP's early_init callback is run. This
2570 * is the first stage in initializing the asic.
2571 * Returns 0 on success, negative error code on failure.
2573 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2575 struct amdgpu_ip_block *ip_block;
2576 struct pci_dev *parent;
2580 amdgpu_device_enable_virtual_display(adev);
2582 if (amdgpu_sriov_vf(adev)) {
2583 r = amdgpu_virt_request_full_gpu(adev, true);
2588 switch (adev->asic_type) {
2589 #ifdef CONFIG_DRM_AMDGPU_SI
2595 adev->family = AMDGPU_FAMILY_SI;
2596 r = si_set_ip_blocks(adev);
2601 #ifdef CONFIG_DRM_AMDGPU_CIK
2607 if (adev->flags & AMD_IS_APU)
2608 adev->family = AMDGPU_FAMILY_KV;
2610 adev->family = AMDGPU_FAMILY_CI;
2612 r = cik_set_ip_blocks(adev);
2620 case CHIP_POLARIS10:
2621 case CHIP_POLARIS11:
2622 case CHIP_POLARIS12:
2626 if (adev->flags & AMD_IS_APU)
2627 adev->family = AMDGPU_FAMILY_CZ;
2629 adev->family = AMDGPU_FAMILY_VI;
2631 r = vi_set_ip_blocks(adev);
2636 r = amdgpu_discovery_set_ip_blocks(adev);
2642 if (amdgpu_has_atpx() &&
2643 (amdgpu_is_atpx_hybrid() ||
2644 amdgpu_has_atpx_dgpu_power_cntl()) &&
2645 ((adev->flags & AMD_IS_APU) == 0) &&
2646 !dev_is_removable(&adev->pdev->dev))
2647 adev->flags |= AMD_IS_PX;
2649 if (!(adev->flags & AMD_IS_APU)) {
2650 parent = pcie_find_root_port(adev->pdev);
2651 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2655 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2656 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2657 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2658 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2659 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2660 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2661 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2664 for (i = 0; i < adev->num_ip_blocks; i++) {
2665 ip_block = &adev->ip_blocks[i];
2667 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2668 DRM_WARN("disabled ip block: %d <%s>\n",
2669 i, adev->ip_blocks[i].version->funcs->name);
2670 adev->ip_blocks[i].status.valid = false;
2671 } else if (ip_block->version->funcs->early_init) {
2672 r = ip_block->version->funcs->early_init(ip_block);
2674 adev->ip_blocks[i].status.valid = false;
2676 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2677 adev->ip_blocks[i].version->funcs->name, r);
2680 adev->ip_blocks[i].status.valid = true;
2683 adev->ip_blocks[i].status.valid = true;
2685 /* get the vbios after the asic_funcs are set up */
2686 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2687 r = amdgpu_device_parse_gpu_info_fw(adev);
2692 if (amdgpu_device_read_bios(adev)) {
2693 if (!amdgpu_get_bios(adev))
2696 r = amdgpu_atombios_init(adev);
2698 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2699 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2704 /* get pf2vf msg info at its earliest time */
2705 if (amdgpu_sriov_vf(adev))
2706 amdgpu_virt_init_data_exchange(adev);
2713 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2714 if (ip_block->status.valid != false)
2715 amdgpu_amdkfd_device_probe(adev);
2717 adev->cg_flags &= amdgpu_cg_mask;
2718 adev->pg_flags &= amdgpu_pg_mask;
2723 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2727 for (i = 0; i < adev->num_ip_blocks; i++) {
2728 if (!adev->ip_blocks[i].status.sw)
2730 if (adev->ip_blocks[i].status.hw)
2732 if (!amdgpu_ip_member_of_hwini(
2733 adev, adev->ip_blocks[i].version->type))
2735 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2736 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2738 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2740 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2741 adev->ip_blocks[i].version->funcs->name, r);
2744 adev->ip_blocks[i].status.hw = true;
2751 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2755 for (i = 0; i < adev->num_ip_blocks; i++) {
2756 if (!adev->ip_blocks[i].status.sw)
2758 if (adev->ip_blocks[i].status.hw)
2760 if (!amdgpu_ip_member_of_hwini(
2761 adev, adev->ip_blocks[i].version->type))
2763 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2765 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2766 adev->ip_blocks[i].version->funcs->name, r);
2769 adev->ip_blocks[i].status.hw = true;
2775 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2779 uint32_t smu_version;
2781 if (adev->asic_type >= CHIP_VEGA10) {
2782 for (i = 0; i < adev->num_ip_blocks; i++) {
2783 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2786 if (!amdgpu_ip_member_of_hwini(adev,
2787 AMD_IP_BLOCK_TYPE_PSP))
2790 if (!adev->ip_blocks[i].status.sw)
2793 /* no need to do the fw loading again if already done */
2794 if (adev->ip_blocks[i].status.hw == true)
2797 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2798 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2802 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2804 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2805 adev->ip_blocks[i].version->funcs->name, r);
2808 adev->ip_blocks[i].status.hw = true;
2814 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2815 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2820 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2825 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2826 struct amdgpu_ring *ring = adev->rings[i];
2828 /* No need to setup the GPU scheduler for rings that don't need it */
2829 if (!ring || ring->no_scheduler)
2832 switch (ring->funcs->type) {
2833 case AMDGPU_RING_TYPE_GFX:
2834 timeout = adev->gfx_timeout;
2836 case AMDGPU_RING_TYPE_COMPUTE:
2837 timeout = adev->compute_timeout;
2839 case AMDGPU_RING_TYPE_SDMA:
2840 timeout = adev->sdma_timeout;
2843 timeout = adev->video_timeout;
2847 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2848 DRM_SCHED_PRIORITY_COUNT,
2849 ring->num_hw_submission, 0,
2850 timeout, adev->reset_domain->wq,
2851 ring->sched_score, ring->name,
2854 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2858 r = amdgpu_uvd_entity_init(adev, ring);
2860 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2864 r = amdgpu_vce_entity_init(adev, ring);
2866 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2872 amdgpu_xcp_update_partition_sched_list(adev);
2879 * amdgpu_device_ip_init - run init for hardware IPs
2881 * @adev: amdgpu_device pointer
2883 * Main initialization pass for hardware IPs. The list of all the hardware
2884 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2885 * are run. sw_init initializes the software state associated with each IP
2886 * and hw_init initializes the hardware associated with each IP.
2887 * Returns 0 on success, negative error code on failure.
2889 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2894 r = amdgpu_ras_init(adev);
2898 for (i = 0; i < adev->num_ip_blocks; i++) {
2899 if (!adev->ip_blocks[i].status.valid)
2901 if (adev->ip_blocks[i].version->funcs->sw_init) {
2902 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
2904 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2905 adev->ip_blocks[i].version->funcs->name, r);
2909 adev->ip_blocks[i].status.sw = true;
2911 if (!amdgpu_ip_member_of_hwini(
2912 adev, adev->ip_blocks[i].version->type))
2915 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2916 /* need to do common hw init early so everything is set up for gmc */
2917 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2919 DRM_ERROR("hw_init %d failed %d\n", i, r);
2922 adev->ip_blocks[i].status.hw = true;
2923 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2924 /* need to do gmc hw init early so we can allocate gpu mem */
2925 /* Try to reserve bad pages early */
2926 if (amdgpu_sriov_vf(adev))
2927 amdgpu_virt_exchange_data(adev);
2929 r = amdgpu_device_mem_scratch_init(adev);
2931 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2934 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2936 DRM_ERROR("hw_init %d failed %d\n", i, r);
2939 r = amdgpu_device_wb_init(adev);
2941 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2944 adev->ip_blocks[i].status.hw = true;
2946 /* right after GMC hw init, we create CSA */
2947 if (adev->gfx.mcbp) {
2948 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2949 AMDGPU_GEM_DOMAIN_VRAM |
2950 AMDGPU_GEM_DOMAIN_GTT,
2953 DRM_ERROR("allocate CSA failed %d\n", r);
2958 r = amdgpu_seq64_init(adev);
2960 DRM_ERROR("allocate seq64 failed %d\n", r);
2966 if (amdgpu_sriov_vf(adev))
2967 amdgpu_virt_init_data_exchange(adev);
2969 r = amdgpu_ib_pool_init(adev);
2971 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2972 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2976 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2980 r = amdgpu_device_ip_hw_init_phase1(adev);
2984 r = amdgpu_device_fw_loading(adev);
2988 r = amdgpu_device_ip_hw_init_phase2(adev);
2993 * retired pages will be loaded from eeprom and reserved here;
2994 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2995 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2996 * functional for I2C communication, which is only true at this point.
2998 * amdgpu_ras_recovery_init may fail, but the upper layer only cares about
2999 * failures caused by a bad gpu situation and stops the amdgpu init process
3000 * accordingly. For other failure cases, it still releases all
3001 * the resources and prints an error message, rather than returning a
3002 * negative value to the upper level.
3004 * Note: theoretically, this should be called before all vram allocations
3005 * to protect retired pages from being abused.
3007 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3008 r = amdgpu_ras_recovery_init(adev, init_badpage);
3013 * In case of XGMI grab extra reference for reset domain for this device
3015 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3016 if (amdgpu_xgmi_add_device(adev) == 0) {
3017 if (!amdgpu_sriov_vf(adev)) {
3018 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3020 if (WARN_ON(!hive)) {
3025 if (!hive->reset_domain ||
3026 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3028 amdgpu_put_xgmi_hive(hive);
3032 /* Drop the early temporary reset domain we created for device */
3033 amdgpu_reset_put_reset_domain(adev->reset_domain);
3034 adev->reset_domain = hive->reset_domain;
3035 amdgpu_put_xgmi_hive(hive);
3040 r = amdgpu_device_init_schedulers(adev);
3044 if (adev->mman.buffer_funcs_ring->sched.ready)
3045 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3047 /* Don't init kfd if whole hive need to be reset during init */
3048 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3049 kgd2kfd_init_zone_device(adev);
3050 amdgpu_amdkfd_device_init(adev);
3053 amdgpu_fru_get_product_info(adev);
3061 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3063 * @adev: amdgpu_device pointer
3065 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3066 * this function before a GPU reset. If the value is retained after a
3067 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3069 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3071 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3075 * amdgpu_device_check_vram_lost - check if vram is valid
3077 * @adev: amdgpu_device pointer
3079 * Checks the reset magic value written to the gart pointer in VRAM.
3080 * The driver calls this after a GPU reset to see if the contents of
3081 * VRAM have been lost or not.
3082 * Returns true if vram is lost, false if not.
3084 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3086 if (memcmp(adev->gart.ptr, adev->reset_magic,
3087 AMDGPU_RESET_MAGIC_NUM))
3090 if (!amdgpu_in_reset(adev))
3094 * For all ASICs with baco/mode1 reset, the VRAM is
3095 * always assumed to be lost.
3097 switch (amdgpu_asic_reset_method(adev)) {
3098 case AMD_RESET_METHOD_BACO:
3099 case AMD_RESET_METHOD_MODE1:
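/*
 * Usage sketch (illustrative): amdgpu_device_fill_reset_magic() runs at late
 * init, and the recovery path then checks for VRAM loss roughly like this
 * (the bookkeeping step is an assumption for the example):
 *
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *	if (vram_lost) {
 *		DRM_INFO("VRAM is lost due to GPU reset!\n");
 *		amdgpu_inc_vram_lost(adev);
 *	}
 */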
3107 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3109 * @adev: amdgpu_device pointer
3110 * @state: clockgating state (gate or ungate)
3112 * The list of all the hardware IPs that make up the asic is walked and the
3113 * set_clockgating_state callbacks are run.
3114 * During late init this pass enables clockgating for the hardware IPs;
3115 * during fini or suspend it disables clockgating for the hardware IPs.
3116 * Returns 0 on success, negative error code on failure.
3119 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3120 enum amd_clockgating_state state)
3124 if (amdgpu_emu_mode == 1)
3127 for (j = 0; j < adev->num_ip_blocks; j++) {
3128 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3129 if (!adev->ip_blocks[i].status.late_initialized)
3131 /* skip CG for GFX, SDMA on S0ix */
3132 if (adev->in_s0ix &&
3133 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3136 /* skip CG for VCE/UVD, it's handled specially */
3137 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3138 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3139 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3140 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3141 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3142 /* enable clockgating to save power */
3143 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
3146 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3147 adev->ip_blocks[i].version->funcs->name, r);
3156 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3157 enum amd_powergating_state state)
3161 if (amdgpu_emu_mode == 1)
3164 for (j = 0; j < adev->num_ip_blocks; j++) {
3165 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3166 if (!adev->ip_blocks[i].status.late_initialized)
3168 /* skip PG for GFX, SDMA on S0ix */
3169 if (adev->in_s0ix &&
3170 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3173 /* skip PG for VCE/UVD, it's handled specially */
3174 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3175 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3177 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3178 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3179 /* enable powergating to save power */
3180 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3183 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3184 adev->ip_blocks[i].version->funcs->name, r);
3192 static int amdgpu_device_enable_mgpu_fan_boost(void)
3194 struct amdgpu_gpu_instance *gpu_ins;
3195 struct amdgpu_device *adev;
3198 mutex_lock(&mgpu_info.mutex);
3201 * MGPU fan boost feature should be enabled
3202 * only when there are two or more dGPUs in the system.
3205 if (mgpu_info.num_dgpu < 2)
3208 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3209 gpu_ins = &(mgpu_info.gpu_ins[i]);
3210 adev = gpu_ins->adev;
3211 if (!(adev->flags & AMD_IS_APU) &&
3212 !gpu_ins->mgpu_fan_enabled) {
3213 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3217 gpu_ins->mgpu_fan_enabled = 1;
3222 mutex_unlock(&mgpu_info.mutex);
3228 * amdgpu_device_ip_late_init - run late init for hardware IPs
3230 * @adev: amdgpu_device pointer
3232 * Late initialization pass for hardware IPs. The list of all the hardware
3233 * IPs that make up the asic is walked and the late_init callbacks are run.
3234 * late_init covers any special initialization that an IP requires
3235 * after all of the IPs have been initialized or something that needs to happen
3236 * late in the init process.
3237 * Returns 0 on success, negative error code on failure.
3239 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3241 struct amdgpu_gpu_instance *gpu_instance;
3244 for (i = 0; i < adev->num_ip_blocks; i++) {
3245 if (!adev->ip_blocks[i].status.hw)
3247 if (adev->ip_blocks[i].version->funcs->late_init) {
3248 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3250 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3251 adev->ip_blocks[i].version->funcs->name, r);
3255 adev->ip_blocks[i].status.late_initialized = true;
3258 r = amdgpu_ras_late_init(adev);
3260 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3264 if (!amdgpu_reset_in_recovery(adev))
3265 amdgpu_ras_set_error_query_ready(adev, true);
3267 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3268 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3270 amdgpu_device_fill_reset_magic(adev);
3272 r = amdgpu_device_enable_mgpu_fan_boost();
3274 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3276 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3277 if (amdgpu_passthrough(adev) &&
3278 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3279 adev->asic_type == CHIP_ALDEBARAN))
3280 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3282 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3283 mutex_lock(&mgpu_info.mutex);
3286 * Reset device p-state to low as this was booted with high.
3288 * This should be performed only after all devices from the same
3289 * hive get initialized.
3291 * However, the number of devices in a hive is not known in advance;
3292 * it is counted one by one as the devices initialize.
3294 * So, we wait until all XGMI interlinked devices have initialized.
3295 * This may introduce some delay as those devices may come from
3296 * different hives. But that should be OK.
3298 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3299 for (i = 0; i < mgpu_info.num_gpu; i++) {
3300 gpu_instance = &(mgpu_info.gpu_ins[i]);
3301 if (gpu_instance->adev->flags & AMD_IS_APU)
3304 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3305 AMDGPU_XGMI_PSTATE_MIN);
3307 DRM_ERROR("pstate setting failed (%d).\n", r);
3313 mutex_unlock(&mgpu_info.mutex);
3319 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3323 if (!ip_block->version->funcs->hw_fini) {
3324 DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3325 ip_block->version->funcs->name);
3327 r = ip_block->version->funcs->hw_fini(ip_block);
3328 /* XXX handle errors */
3330 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3331 ip_block->version->funcs->name, r);
3335 ip_block->status.hw = false;
3339 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3341 * @adev: amdgpu_device pointer
3343 * For ASICs that need to disable the SMC first
3345 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3349 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3352 for (i = 0; i < adev->num_ip_blocks; i++) {
3353 if (!adev->ip_blocks[i].status.hw)
3355 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3356 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3362 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3366 for (i = 0; i < adev->num_ip_blocks; i++) {
3367 if (!adev->ip_blocks[i].version->funcs->early_fini)
3370 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3372 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3373 adev->ip_blocks[i].version->funcs->name, r);
3377 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3378 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3380 amdgpu_amdkfd_suspend(adev, false);
3382 /* Workaround for ASICs that need to disable the SMC first */
3383 amdgpu_device_smu_fini_early(adev);
3385 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3386 if (!adev->ip_blocks[i].status.hw)
3389 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3392 if (amdgpu_sriov_vf(adev)) {
3393 if (amdgpu_virt_release_full_gpu(adev, false))
3394 DRM_ERROR("failed to release exclusive mode on fini\n");
3401 * amdgpu_device_ip_fini - run fini for hardware IPs
3403 * @adev: amdgpu_device pointer
3405 * Main teardown pass for hardware IPs. The list of all the hardware
3406 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3407 * are run. hw_fini tears down the hardware associated with each IP
3408 * and sw_fini tears down any software state associated with each IP.
3409 * Returns 0 on success, negative error code on failure.
3411 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3415 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3416 amdgpu_virt_release_ras_err_handler_data(adev);
3418 if (adev->gmc.xgmi.num_physical_nodes > 1)
3419 amdgpu_xgmi_remove_device(adev);
3421 amdgpu_amdkfd_device_fini_sw(adev);
3423 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3424 if (!adev->ip_blocks[i].status.sw)
3427 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3428 amdgpu_ucode_free_bo(adev);
3429 amdgpu_free_static_csa(&adev->virt.csa_obj);
3430 amdgpu_device_wb_fini(adev);
3431 amdgpu_device_mem_scratch_fini(adev);
3432 amdgpu_ib_pool_fini(adev);
3433 amdgpu_seq64_fini(adev);
3435 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3436 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3437 /* XXX handle errors */
3439 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3440 adev->ip_blocks[i].version->funcs->name, r);
3443 adev->ip_blocks[i].status.sw = false;
3444 adev->ip_blocks[i].status.valid = false;
3447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3448 if (!adev->ip_blocks[i].status.late_initialized)
3450 if (adev->ip_blocks[i].version->funcs->late_fini)
3451 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3452 adev->ip_blocks[i].status.late_initialized = false;
3455 amdgpu_ras_fini(adev);
3461 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3463 * @work: work_struct.
3465 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3467 struct amdgpu_device *adev =
3468 container_of(work, struct amdgpu_device, delayed_init_work.work);
3471 r = amdgpu_ib_ring_tests(adev);
3473 DRM_ERROR("ib ring test failed (%d).\n", r);
3476 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3478 struct amdgpu_device *adev =
3479 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3481 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3482 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3484 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3485 adev->gfx.gfx_off_state = true;
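/*
 * Usage sketch (illustrative): this delayed work is not queued directly;
 * callers toggle GFXOFF through the request counter via amdgpu_gfx_off_ctrl()
 * (in amdgpu_gfx.c), roughly:
 *
 *	amdgpu_gfx_off_ctrl(adev, false);	// bump gfx_off_req_count, leave GFXOFF
 *	// ... touch GFX registers ...
 *	amdgpu_gfx_off_ctrl(adev, true);	// drop the count, re-arm this delayed work
 *
 * The exact queueing details are assumptions here.
 */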
3489 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3491 * @adev: amdgpu_device pointer
3493 * Main suspend function for hardware IPs. The list of all the hardware
3494 * IPs that make up the asic is walked, clockgating is disabled and the
3495 * suspend callbacks are run. suspend puts the hardware and software state
3496 * in each IP into a state suitable for suspend.
3497 * Returns 0 on success, negative error code on failure.
3499 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3503 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3504 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3507 * Per PMFW team's suggestion, driver needs to handle gfxoff
3508 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
3509 * scenario. Add the missing df cstate disablement here.
3511 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3512 dev_warn(adev->dev, "Failed to disallow df cstate");
3514 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3515 if (!adev->ip_blocks[i].status.valid)
3518 /* displays are handled separately */
3519 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3522 /* XXX handle errors */
3523 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3532 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3534 * @adev: amdgpu_device pointer
3536 * Main suspend function for hardware IPs. The list of all the hardware
3537 * IPs that make up the asic is walked, clockgating is disabled and the
3538 * suspend callbacks are run. suspend puts the hardware and software state
3539 * in each IP into a state suitable for suspend.
3540 * Returns 0 on success, negative error code on failure.
3542 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3547 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3549 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3550 if (!adev->ip_blocks[i].status.valid)
3552 /* displays are handled in phase1 */
3553 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3555 /* PSP lost connection when err_event_athub occurs */
3556 if (amdgpu_ras_intr_triggered() &&
3557 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3558 adev->ip_blocks[i].status.hw = false;
3562 /* skip unnecessary suspend if we have not initialized them yet */
3563 if (!amdgpu_ip_member_of_hwini(
3564 adev, adev->ip_blocks[i].version->type))
3567 /* skip suspend of gfx/mes and psp for S0ix
3568 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3569 * like at runtime. PSP is also part of the always on hardware
3570 * so no need to suspend it.
3572 if (adev->in_s0ix &&
3573 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3574 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3575 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3578 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3579 if (adev->in_s0ix &&
3580 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3581 IP_VERSION(5, 0, 0)) &&
3582 (adev->ip_blocks[i].version->type ==
3583 AMD_IP_BLOCK_TYPE_SDMA))
3586 /* Once swPSP has provided the IMU and RLC FW binaries to TOS during cold-boot,
3587 * these live in the TMR and hence are expected to be reused by PSP-TOS to reload
3588 * from this location; RLC autoload also gets loaded automatically
3589 * from here based on the PMFW -> PSP message during the re-init sequence.
3590 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3591 * the TMR and reloading the FWs again for IMU enabled APU ASICs.
3593 if (amdgpu_in_reset(adev) &&
3594 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3595 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3598 /* XXX handle errors */
3599 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3600 adev->ip_blocks[i].status.hw = false;
3602 /* handle putting the SMC in the appropriate state */
3603 if (!amdgpu_sriov_vf(adev)) {
3604 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3605 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3607 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3608 adev->mp1_state, r);
3619 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3621 * @adev: amdgpu_device pointer
3623 * Main suspend function for hardware IPs. The list of all the hardware
3624 * IPs that make up the asic is walked, clockgating is disabled and the
3625 * suspend callbacks are run. suspend puts the hardware and software state
3626 * in each IP into a state suitable for suspend.
3627 * Returns 0 on success, negative error code on failure.
3629 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3633 if (amdgpu_sriov_vf(adev)) {
3634 amdgpu_virt_fini_data_exchange(adev);
3635 amdgpu_virt_request_full_gpu(adev, false);
3638 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3640 r = amdgpu_device_ip_suspend_phase1(adev);
3643 r = amdgpu_device_ip_suspend_phase2(adev);
3645 if (amdgpu_sriov_vf(adev))
3646 amdgpu_virt_release_full_gpu(adev, false);
3651 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3655 static enum amd_ip_block_type ip_order[] = {
3656 AMD_IP_BLOCK_TYPE_COMMON,
3657 AMD_IP_BLOCK_TYPE_GMC,
3658 AMD_IP_BLOCK_TYPE_PSP,
3659 AMD_IP_BLOCK_TYPE_IH,
3662 for (i = 0; i < adev->num_ip_blocks; i++) {
3664 struct amdgpu_ip_block *block;
3666 block = &adev->ip_blocks[i];
3667 block->status.hw = false;
3669 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3671 if (block->version->type != ip_order[j] ||
3672 !block->status.valid)
3675 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3677 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3678 block->version->funcs->name);
3681 block->status.hw = true;
3688 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3690 struct amdgpu_ip_block *block;
3693 static enum amd_ip_block_type ip_order[] = {
3694 AMD_IP_BLOCK_TYPE_SMC,
3695 AMD_IP_BLOCK_TYPE_DCE,
3696 AMD_IP_BLOCK_TYPE_GFX,
3697 AMD_IP_BLOCK_TYPE_SDMA,
3698 AMD_IP_BLOCK_TYPE_MES,
3699 AMD_IP_BLOCK_TYPE_UVD,
3700 AMD_IP_BLOCK_TYPE_VCE,
3701 AMD_IP_BLOCK_TYPE_VCN,
3702 AMD_IP_BLOCK_TYPE_JPEG
3705 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3706 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3711 if (block->status.valid && !block->status.hw) {
3712 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3713 r = amdgpu_ip_block_resume(block);
3715 r = block->version->funcs->hw_init(block);
3719 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3720 block->version->funcs->name);
3723 block->status.hw = true;
3731 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3733 * @adev: amdgpu_device pointer
3735 * First resume function for hardware IPs. The list of all the hardware
3736 * IPs that make up the asic is walked and the resume callbacks are run for
3737 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3738 * after a suspend and updates the software state as necessary. This
3739 * function is also used for restoring the GPU after a GPU reset.
3740 * Returns 0 on success, negative error code on failure.
3742 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3746 for (i = 0; i < adev->num_ip_blocks; i++) {
3747 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3749 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3750 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3751 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3752 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3754 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3764 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3766 * @adev: amdgpu_device pointer
3768 * Second resume function for hardware IPs. The list of all the hardware
3769 * IPs that make up the asic is walked and the resume callbacks are run for
3770 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3771 * functional state after a suspend and updates the software state as
3772 * necessary. This function is also used for restoring the GPU after a GPU
3774 * Returns 0 on success, negative error code on failure.
3776 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3780 for (i = 0; i < adev->num_ip_blocks; i++) {
3781 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3783 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3785 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3786 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3787 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3789 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3798 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3800 * @adev: amdgpu_device pointer
3802 * Third resume function for hardware IPs. The list of all the hardware
3803 * IPs that make up the asic is walked and the resume callbacks are run for
3804 * all DCE. resume puts the hardware into a functional state after a suspend
3805 * and updates the software state as necessary. This function is also used
3806 * for restoring the GPU after a GPU reset.
3808 * Returns 0 on success, negative error code on failure.
3810 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3814 for (i = 0; i < adev->num_ip_blocks; i++) {
3815 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3817 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3818 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3828 * amdgpu_device_ip_resume - run resume for hardware IPs
3830 * @adev: amdgpu_device pointer
3832 * Main resume function for hardware IPs. The hardware IPs
3833 * are split into two resume functions because they are
3834 * also used in recovering from a GPU reset and some additional
3835 * steps need to be taken between them. In this case (S3/S4) they are run sequentially.
3837 * Returns 0 on success, negative error code on failure.
3839 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3843 r = amdgpu_device_ip_resume_phase1(adev);
3847 r = amdgpu_device_fw_loading(adev);
3851 r = amdgpu_device_ip_resume_phase2(adev);
3853 if (adev->mman.buffer_funcs_ring->sched.ready)
3854 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3859 amdgpu_fence_driver_hw_init(adev);
3861 r = amdgpu_device_ip_resume_phase3(adev);
3867 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3869 * @adev: amdgpu_device pointer
3871 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3873 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3875 if (amdgpu_sriov_vf(adev)) {
3876 if (adev->is_atom_fw) {
3877 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3878 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3880 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3881 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3884 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3885 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3890 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3892 * @asic_type: AMD asic type
3894 * Check if there is DC (new modesetting infrastructure) support for an asic.
3895 * Returns true if DC has support, false if not.
3897 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3899 switch (asic_type) {
3900 #ifdef CONFIG_DRM_AMDGPU_SI
3904 /* chips with no display hardware */
3906 #if defined(CONFIG_DRM_AMD_DC)
3912 * We have systems in the wild with these ASICs that require
3913 * LVDS and VGA support which is not supported with DC.
3915 * Fallback to the non-DC driver here by default so as not to
3916 * cause regressions.
3918 #if defined(CONFIG_DRM_AMD_DC_SI)
3919 return amdgpu_dc > 0;
3928 * We have systems in the wild with these ASICs that require
3929 * VGA support which is not supported with DC.
3931 * Fallback to the non-DC driver here by default so as not to
3932 * cause regressions.
3934 return amdgpu_dc > 0;
3936 return amdgpu_dc != 0;
3940 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3947 * amdgpu_device_has_dc_support - check if dc is supported
3949 * @adev: amdgpu_device pointer
3951 * Returns true for supported, false for not supported
3953 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3955 if (adev->enable_virtual_display ||
3956 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3959 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3962 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3964 struct amdgpu_device *adev =
3965 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3966 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3968 /* It's a bug to not have a hive within this function */
3973 * Use task barrier to synchronize all xgmi reset works across the
3974 * hive. task_barrier_enter and task_barrier_exit will block
3975 * until all the threads running the xgmi reset works reach
3976 * those points. task_barrier_full will do both blocks.
3978 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3980 task_barrier_enter(&hive->tb);
3981 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3983 if (adev->asic_reset_res)
3986 task_barrier_exit(&hive->tb);
3987 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3989 if (adev->asic_reset_res)
3992 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3995 task_barrier_full(&hive->tb);
3996 adev->asic_reset_res = amdgpu_asic_reset(adev);
4000 if (adev->asic_reset_res)
4001 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4002 adev->asic_reset_res, adev_to_drm(adev)->unique);
4003 amdgpu_put_xgmi_hive(hive);
4006 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4008 char *input = amdgpu_lockup_timeout;
4009 char *timeout_setting = NULL;
4015 * By default the timeout for non-compute jobs is 10000 ms
4016 * and 60000 ms for compute jobs.
4017 * In SR-IOV or passthrough mode, the timeout for compute
4018 * jobs is 60000 ms by default.
4020 adev->gfx_timeout = msecs_to_jiffies(10000);
4021 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4022 if (amdgpu_sriov_vf(adev))
4023 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
4024 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4026 adev->compute_timeout = msecs_to_jiffies(60000);
4028 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4029 while ((timeout_setting = strsep(&input, ",")) &&
4030 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4031 ret = kstrtol(timeout_setting, 0, &timeout);
4038 } else if (timeout < 0) {
4039 timeout = MAX_SCHEDULE_TIMEOUT;
4040 dev_warn(adev->dev, "lockup timeout disabled");
4041 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4043 timeout = msecs_to_jiffies(timeout);
4048 adev->gfx_timeout = timeout;
4051 adev->compute_timeout = timeout;
4054 adev->sdma_timeout = timeout;
4057 adev->video_timeout = timeout;
4064 * There is only one value specified and
4065 * it should apply to all non-compute jobs.
4068 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4069 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4070 adev->compute_timeout = adev->gfx_timeout;
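/*
 * Illustrative examples of the lockup_timeout strings handled above (values
 * in milliseconds; with several values the parse order is gfx, compute,
 * sdma, video):
 *
 *	amdgpu.lockup_timeout=10000			(one value: all non-compute jobs)
 *	amdgpu.lockup_timeout=10000,60000,10000,10000	(gfx,compute,sdma,video)
 *	amdgpu.lockup_timeout=-1			(negative disables the timeout)
 */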
4078 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4080 * @adev: amdgpu_device pointer
4082 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in pass-through mode
4084 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4086 struct iommu_domain *domain;
4088 domain = iommu_get_domain_for_dev(adev->dev);
4089 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4090 adev->ram_is_direct_mapped = true;
4093 #if defined(CONFIG_HSA_AMD_P2P)
4095 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4097 * @adev: amdgpu_device pointer
4099 * Returns true if the IOMMU is remapping the BAR address, false otherwise.
4101 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4103 struct iommu_domain *domain;
4105 domain = iommu_get_domain_for_dev(adev->dev);
4106 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4107 domain->type == IOMMU_DOMAIN_DMA_FQ))
4114 static const struct attribute *amdgpu_dev_attributes[] = {
4115 &dev_attr_pcie_replay_count.attr,
4119 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4121 if (amdgpu_mcbp == 1)
4122 adev->gfx.mcbp = true;
4123 else if (amdgpu_mcbp == 0)
4124 adev->gfx.mcbp = false;
4126 if (amdgpu_sriov_vf(adev))
4127 adev->gfx.mcbp = true;
4130 DRM_INFO("MCBP is enabled\n");
4134 * amdgpu_device_init - initialize the driver
4136 * @adev: amdgpu_device pointer
4137 * @flags: driver flags
4139 * Initializes the driver info and hw (all asics).
4140 * Returns 0 for success or an error on failure.
4141 * Called at driver startup.
4143 int amdgpu_device_init(struct amdgpu_device *adev,
4146 struct drm_device *ddev = adev_to_drm(adev);
4147 struct pci_dev *pdev = adev->pdev;
4153 adev->shutdown = false;
4154 adev->flags = flags;
4156 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4157 adev->asic_type = amdgpu_force_asic_type;
4159 adev->asic_type = flags & AMD_ASIC_MASK;
4161 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4162 if (amdgpu_emu_mode == 1)
4163 adev->usec_timeout *= 10;
4164 adev->gmc.gart_size = 512 * 1024 * 1024;
4165 adev->accel_working = false;
4166 adev->num_rings = 0;
4167 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4168 adev->mman.buffer_funcs = NULL;
4169 adev->mman.buffer_funcs_ring = NULL;
4170 adev->vm_manager.vm_pte_funcs = NULL;
4171 adev->vm_manager.vm_pte_num_scheds = 0;
4172 adev->gmc.gmc_funcs = NULL;
4173 adev->harvest_ip_mask = 0x0;
4174 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4175 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4177 adev->smc_rreg = &amdgpu_invalid_rreg;
4178 adev->smc_wreg = &amdgpu_invalid_wreg;
4179 adev->pcie_rreg = &amdgpu_invalid_rreg;
4180 adev->pcie_wreg = &amdgpu_invalid_wreg;
4181 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4182 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4183 adev->pciep_rreg = &amdgpu_invalid_rreg;
4184 adev->pciep_wreg = &amdgpu_invalid_wreg;
4185 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4186 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4187 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4188 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4189 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4190 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4191 adev->didt_rreg = &amdgpu_invalid_rreg;
4192 adev->didt_wreg = &amdgpu_invalid_wreg;
4193 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4194 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4195 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4196 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4198 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4199 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4200 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4202 /* mutex initializations are all done here so we
4203 * can recall functions without having locking issues
4205 mutex_init(&adev->firmware.mutex);
4206 mutex_init(&adev->pm.mutex);
4207 mutex_init(&adev->gfx.gpu_clock_mutex);
4208 mutex_init(&adev->srbm_mutex);
4209 mutex_init(&adev->gfx.pipe_reserve_mutex);
4210 mutex_init(&adev->gfx.gfx_off_mutex);
4211 mutex_init(&adev->gfx.partition_mutex);
4212 mutex_init(&adev->grbm_idx_mutex);
4213 mutex_init(&adev->mn_lock);
4214 mutex_init(&adev->virt.vf_errors.lock);
4215 mutex_init(&adev->virt.rlcg_reg_lock);
4216 hash_init(adev->mn_hash);
4217 mutex_init(&adev->psp.mutex);
4218 mutex_init(&adev->notifier_lock);
4219 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4220 mutex_init(&adev->benchmark_mutex);
4221 mutex_init(&adev->gfx.reset_sem_mutex);
4222 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4223 mutex_init(&adev->enforce_isolation_mutex);
4224 mutex_init(&adev->gfx.kfd_sch_mutex);
4226 amdgpu_device_init_apu_flags(adev);
4228 r = amdgpu_device_check_arguments(adev);
4232 spin_lock_init(&adev->mmio_idx_lock);
4233 spin_lock_init(&adev->smc_idx_lock);
4234 spin_lock_init(&adev->pcie_idx_lock);
4235 spin_lock_init(&adev->uvd_ctx_idx_lock);
4236 spin_lock_init(&adev->didt_idx_lock);
4237 spin_lock_init(&adev->gc_cac_idx_lock);
4238 spin_lock_init(&adev->se_cac_idx_lock);
4239 spin_lock_init(&adev->audio_endpt_idx_lock);
4240 spin_lock_init(&adev->mm_stats.lock);
4241 spin_lock_init(&adev->wb.lock);
4243 INIT_LIST_HEAD(&adev->reset_list);
4245 INIT_LIST_HEAD(&adev->ras_list);
4247 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4249 INIT_DELAYED_WORK(&adev->delayed_init_work,
4250 amdgpu_device_delayed_init_work_handler);
4251 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4252 amdgpu_device_delay_enable_gfx_off);
4254 * Initialize the enforce_isolation work structures for each XCP
4255 * partition. This work handler is responsible for enforcing shader
4256 * isolation on AMD GPUs. It counts the number of emitted fences for
4257 * each GFX and compute ring. If there are any fences, it schedules
4258 * the `enforce_isolation_work` to be run after a delay. If there are
4259 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the runqueue.
4262 for (i = 0; i < MAX_XCP; i++) {
4263 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4264 amdgpu_gfx_enforce_isolation_handler);
4265 adev->gfx.enforce_isolation[i].adev = adev;
4266 adev->gfx.enforce_isolation[i].xcp_id = i;
4269 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4271 adev->gfx.gfx_off_req_count = 1;
4272 adev->gfx.gfx_off_residency = 0;
4273 adev->gfx.gfx_off_entrycount = 0;
4274 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4276 atomic_set(&adev->throttling_logging_enabled, 1);
4278 * If throttling continues, logging will be performed every minute
4279 * to avoid log flooding. "-1" is subtracted since the thermal
4280 * throttling interrupt comes every second. Thus, the total logging
4281 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4282 * for throttling interrupt) = 60 seconds.
4284 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4285 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
4287 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4288 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
4290 /* Registers mapping */
4291 /* TODO: block userspace mapping of io register */
4292 if (adev->asic_type >= CHIP_BONAIRE) {
4293 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4294 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4296 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4297 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4300 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4301 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4303 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4307 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4308 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4311 * Reset domain needs to be present early, before the XGMI hive is discovered
4312 * (if any) and initialized, so that the reset sem and in_gpu_reset flag can be used
4313 * early on during init and before calling RREG32.
4315 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4316 if (!adev->reset_domain)
4319 /* detect hw virtualization here */
4320 amdgpu_detect_virtualization(adev);
4322 amdgpu_device_get_pcie_info(adev);
4324 r = amdgpu_device_get_job_timeout_settings(adev);
4326 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4330 amdgpu_device_set_mcbp(adev);
4333 * By default, use default mode where all blocks are expected to be
4334 * initialized. At present a 'swinit' of blocks is required to be
4335 * completed before the need for a different level is detected.
4337 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4338 /* early init functions */
4339 r = amdgpu_device_ip_early_init(adev);
4343 /* Get rid of things like offb */
4344 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4348 /* Enable TMZ based on IP_VERSION */
4349 amdgpu_gmc_tmz_set(adev);
4351 if (amdgpu_sriov_vf(adev) &&
4352 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4353 /* VF MMIO access (except mailbox range) from CPU
4354 * will be blocked during sriov runtime
4356 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4358 amdgpu_gmc_noretry_set(adev);
4359 /* Need to get xgmi info early to decide the reset behavior*/
4360 if (adev->gmc.xgmi.supported) {
4361 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4366 /* enable PCIE atomic ops */
4367 if (amdgpu_sriov_vf(adev)) {
4368 if (adev->virt.fw_reserve.p_pf2vf)
4369 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4370 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4371 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4372 /* APUs with gfx9 onwards don't rely on PCIe atomics; an internal
4373 * path natively supports atomics, so set have_atomics_support to true.
4375 } else if ((adev->flags & AMD_IS_APU) &&
4376 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4377 IP_VERSION(9, 0, 0))) {
4378 adev->have_atomics_support = true;
4380 adev->have_atomics_support =
4381 !pci_enable_atomic_ops_to_root(adev->pdev,
4382 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4383 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4386 if (!adev->have_atomics_support)
4387 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
4389 /* doorbell bar mapping and doorbell index init */
4390 amdgpu_doorbell_init(adev);
4392 if (amdgpu_emu_mode == 1) {
4393 /* post the asic on emulation mode */
4394 emu_soc_asic_init(adev);
4395 goto fence_driver_init;
4398 amdgpu_reset_init(adev);
4400 /* detect if we are with an SRIOV vbios */
4402 amdgpu_device_detect_sriov_bios(adev);
4404 /* check if we need to reset the asic
4405 * E.g., driver was not cleanly unloaded previously, etc.
4407 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4408 if (adev->gmc.xgmi.num_physical_nodes) {
4409 dev_info(adev->dev, "Pending hive reset.\n");
4410 amdgpu_set_init_level(adev,
4411 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4412 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4413 !amdgpu_device_has_display_hardware(adev)) {
4414 r = psp_gpu_reset(adev);
4416 tmp = amdgpu_reset_method;
4417 /* It should do a default reset when loading or reloading the driver,
4418 * regardless of the module parameter reset_method.
4420 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4421 r = amdgpu_asic_reset(adev);
4422 amdgpu_reset_method = tmp;
4426 dev_err(adev->dev, "asic reset on init failed\n");
4431 /* Post card if necessary */
4432 if (amdgpu_device_need_post(adev)) {
4434 dev_err(adev->dev, "no vBIOS found\n");
4438 DRM_INFO("GPU posting now...\n");
4439 r = amdgpu_device_asic_init(adev);
4441 dev_err(adev->dev, "gpu post error!\n");
4447 if (adev->is_atom_fw) {
4448 /* Initialize clocks */
4449 r = amdgpu_atomfirmware_get_clock_info(adev);
4451 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4452 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4456 /* Initialize clocks */
4457 r = amdgpu_atombios_get_clock_info(adev);
4459 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4460 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4463 /* init i2c buses */
4464 if (!amdgpu_device_has_dc_support(adev))
4465 amdgpu_atombios_i2c_init(adev);
4471 r = amdgpu_fence_driver_sw_init(adev);
4473 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4474 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4478 /* init the mode config */
4479 drm_mode_config_init(adev_to_drm(adev));
4481 r = amdgpu_device_ip_init(adev);
4483 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4484 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4485 goto release_ras_con;
4488 amdgpu_fence_driver_hw_init(adev);
4491 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4492 adev->gfx.config.max_shader_engines,
4493 adev->gfx.config.max_sh_per_se,
4494 adev->gfx.config.max_cu_per_sh,
4495 adev->gfx.cu_info.number);
4497 adev->accel_working = true;
4499 amdgpu_vm_check_compute_bug(adev);
4501 /* Initialize the buffer migration limit. */
4502 if (amdgpu_moverate >= 0)
4503 max_MBps = amdgpu_moverate;
4505 max_MBps = 8; /* Allow 8 MB/s. */
4506 /* Get a log2 for easy divisions. */
4507 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
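/* Editorial sketch (assumption, not from this file): storing log2(max_MBps)
 * lets accounting code elsewhere in the driver (the CS throttling path)
 * turn a byte count into an approximate time budget with a shift instead of
 * a 64-bit division, since 1 MB/s is roughly 1 byte/us:
 *
 *   u64 budget_us = bytes >> adev->mm_stats.log2_max_MBps;
 */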
4510 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4511 * Otherwise the mgpu fan boost feature will be skipped because the
4512 * gpu instance count would be too low.
4514 amdgpu_register_gpu_instance(adev);
4516 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4517 * explicit gating rather than handling it automatically.
4519 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4520 r = amdgpu_device_ip_late_init(adev);
4522 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4523 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4524 goto release_ras_con;
4527 amdgpu_ras_resume(adev);
4528 queue_delayed_work(system_wq, &adev->delayed_init_work,
4529 msecs_to_jiffies(AMDGPU_RESUME_MS));
4532 if (amdgpu_sriov_vf(adev)) {
4533 amdgpu_virt_release_full_gpu(adev, true);
4534 flush_delayed_work(&adev->delayed_init_work);
4538 * Register these sysfs interfaces after `late_init`, since some of the
4539 * operations performed in `late_init` might affect the creation of the
4540 * sysfs interfaces.
4542 r = amdgpu_atombios_sysfs_init(adev);
4544 drm_err(&adev->ddev,
4545 "registering atombios sysfs failed (%d).\n", r);
4547 r = amdgpu_pm_sysfs_init(adev);
4549 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4551 r = amdgpu_ucode_sysfs_init(adev);
4553 adev->ucode_sysfs_en = false;
4554 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4556 adev->ucode_sysfs_en = true;
4558 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4560 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4562 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4565 "Could not create amdgpu board attributes\n");
4567 amdgpu_fru_sysfs_init(adev);
4568 amdgpu_reg_state_sysfs_init(adev);
4569 amdgpu_xcp_cfg_sysfs_init(adev);
4571 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4572 r = amdgpu_pmu_init(adev);
4574 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4576 /* Keep the stored PCI config space at hand for restore in case of a sudden PCI error */
4577 if (amdgpu_device_cache_pci_state(adev->pdev))
4578 pci_restore_state(pdev);
4580 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4581 /* this will fail for cards that aren't VGA class devices, just ignore it */
4584 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4585 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4587 px = amdgpu_device_supports_px(ddev);
4589 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4590 apple_gmux_detect(NULL, NULL)))
4591 vga_switcheroo_register_client(adev->pdev,
4592 &amdgpu_switcheroo_ops, px);
4595 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4597 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4598 amdgpu_xgmi_reset_on_init(adev);
4600 amdgpu_device_check_iommu_direct_map(adev);
4605 if (amdgpu_sriov_vf(adev))
4606 amdgpu_virt_release_full_gpu(adev, true);
4608 /* failed in exclusive mode due to timeout */
4609 if (amdgpu_sriov_vf(adev) &&
4610 !amdgpu_sriov_runtime(adev) &&
4611 amdgpu_virt_mmio_blocked(adev) &&
4612 !amdgpu_virt_wait_reset(adev)) {
4613 dev_err(adev->dev, "VF exclusive mode timeout\n");
4614 /* Don't send request since VF is inactive. */
4615 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4616 adev->virt.ops = NULL;
4619 amdgpu_release_ras_context(adev);
4622 amdgpu_vf_error_trans_all(adev);
4627 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4630 /* Clear all CPU mappings pointing to this device */
4631 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4633 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4634 amdgpu_doorbell_fini(adev);
4636 iounmap(adev->rmmio);
4638 if (adev->mman.aper_base_kaddr)
4639 iounmap(adev->mman.aper_base_kaddr);
4640 adev->mman.aper_base_kaddr = NULL;
4642 /* Memory manager related */
4643 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4644 arch_phys_wc_del(adev->gmc.vram_mtrr);
4645 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4650 * amdgpu_device_fini_hw - tear down the driver
4652 * @adev: amdgpu_device pointer
4654 * Tear down the driver info (all asics).
4655 * Called at driver shutdown.
4657 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4659 dev_info(adev->dev, "amdgpu: finishing device.\n");
4660 flush_delayed_work(&adev->delayed_init_work);
4662 if (adev->mman.initialized)
4663 drain_workqueue(adev->mman.bdev.wq);
4664 adev->shutdown = true;
4666 /* make sure the IB tests have finished before entering exclusive mode
4667 * to avoid preemption during the IB tests
4669 if (amdgpu_sriov_vf(adev)) {
4670 amdgpu_virt_request_full_gpu(adev, false);
4671 amdgpu_virt_fini_data_exchange(adev);
4674 /* disable all interrupts */
4675 amdgpu_irq_disable_all(adev);
4676 if (adev->mode_info.mode_config_initialized) {
4677 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4678 drm_helper_force_disable_all(adev_to_drm(adev));
4680 drm_atomic_helper_shutdown(adev_to_drm(adev));
4682 amdgpu_fence_driver_hw_fini(adev);
4684 if (adev->pm.sysfs_initialized)
4685 amdgpu_pm_sysfs_fini(adev);
4686 if (adev->ucode_sysfs_en)
4687 amdgpu_ucode_sysfs_fini(adev);
4688 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4689 amdgpu_fru_sysfs_fini(adev);
4691 amdgpu_reg_state_sysfs_fini(adev);
4692 amdgpu_xcp_cfg_sysfs_fini(adev);
4694 /* RAS features must be disabled before hw fini */
4695 amdgpu_ras_pre_fini(adev);
4697 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4699 amdgpu_device_ip_fini_early(adev);
4701 amdgpu_irq_fini_hw(adev);
4703 if (adev->mman.initialized)
4704 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4706 amdgpu_gart_dummy_page_fini(adev);
4708 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4709 amdgpu_device_unmap_mmio(adev);
4713 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4718 amdgpu_device_ip_fini(adev);
4719 amdgpu_fence_driver_sw_fini(adev);
4720 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4721 adev->accel_working = false;
4722 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4724 amdgpu_reset_fini(adev);
4726 /* free i2c buses */
4727 if (!amdgpu_device_has_dc_support(adev))
4728 amdgpu_i2c_fini(adev);
4730 if (amdgpu_emu_mode != 1)
4731 amdgpu_atombios_fini(adev);
4736 kfree(adev->fru_info);
4737 adev->fru_info = NULL;
4739 px = amdgpu_device_supports_px(adev_to_drm(adev));
4741 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4742 apple_gmux_detect(NULL, NULL)))
4743 vga_switcheroo_unregister_client(adev->pdev);
4746 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4748 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4749 vga_client_unregister(adev->pdev);
4751 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4753 iounmap(adev->rmmio);
4755 amdgpu_doorbell_fini(adev);
4759 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4760 amdgpu_pmu_fini(adev);
4761 if (adev->mman.discovery_bin)
4762 amdgpu_discovery_fini(adev);
4764 amdgpu_reset_put_reset_domain(adev->reset_domain);
4765 adev->reset_domain = NULL;
4767 kfree(adev->pci_state);
4772 * amdgpu_device_evict_resources - evict device resources
4773 * @adev: amdgpu device object
4775 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4776 * of the vram memory type. Mainly used for evicting device resources
4780 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4784 /* No need to evict vram on APUs for suspend to ram or s2idle */
4785 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4788 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4790 DRM_WARN("evicting device resources failed\n");
4798 * amdgpu_device_prepare - prepare for device suspend
4800 * @dev: drm dev pointer
4802 * Prepare to put the hw in the suspend state (all asics).
4803 * Returns 0 for success or an error on failure.
4804 * Called at driver suspend.
4806 int amdgpu_device_prepare(struct drm_device *dev)
4808 struct amdgpu_device *adev = drm_to_adev(dev);
4811 amdgpu_choose_low_power_state(adev);
4813 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4816 /* Evict the majority of BOs before starting suspend sequence */
4817 r = amdgpu_device_evict_resources(adev);
4821 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4823 for (i = 0; i < adev->num_ip_blocks; i++) {
4824 if (!adev->ip_blocks[i].status.valid)
4826 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4828 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
4836 adev->in_s0ix = adev->in_s3 = false;
4842 * amdgpu_device_suspend - initiate device suspend
4844 * @dev: drm dev pointer
4845 * @notify_clients: notify in-kernel DRM clients
4847 * Puts the hw in the suspend state (all asics).
4848 * Returns 0 for success or an error on failure.
4849 * Called at driver suspend.
4851 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
4853 struct amdgpu_device *adev = drm_to_adev(dev);
4856 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4859 adev->in_suspend = true;
4861 if (amdgpu_sriov_vf(adev)) {
4862 amdgpu_virt_fini_data_exchange(adev);
4863 r = amdgpu_virt_request_full_gpu(adev, false);
4868 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4869 DRM_WARN("smart shift update failed\n");
4872 drm_client_dev_suspend(adev_to_drm(adev), false);
4874 cancel_delayed_work_sync(&adev->delayed_init_work);
4876 amdgpu_ras_suspend(adev);
4878 amdgpu_device_ip_suspend_phase1(adev);
4881 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4883 r = amdgpu_device_evict_resources(adev);
4887 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4889 amdgpu_fence_driver_hw_fini(adev);
4891 amdgpu_device_ip_suspend_phase2(adev);
4893 if (amdgpu_sriov_vf(adev))
4894 amdgpu_virt_release_full_gpu(adev, false);
4896 r = amdgpu_dpm_notify_rlc_state(adev, false);
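/* Editorial sketch (assumption): how dev_pm_ops hooks would typically call
 * into the prepare/suspend entry points above. The actual wiring for amdgpu
 * lives in amdgpu_drv.c and may differ in detail; the function names below
 * are illustrative only.
 *
 *   static int example_pmops_prepare(struct device *dev)
 *   {
 *       struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *       return amdgpu_device_prepare(drm_dev);
 *   }
 *
 *   static int example_pmops_suspend(struct device *dev)
 *   {
 *       struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *       return amdgpu_device_suspend(drm_dev, true);
 *   }
 */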
4904 * amdgpu_device_resume - initiate device resume
4906 * @dev: drm dev pointer
4907 * @notify_clients: notify in-kernel DRM clients
4909 * Bring the hw back to operating state (all asics).
4910 * Returns 0 for success or an error on failure.
4911 * Called at driver resume.
4913 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
4915 struct amdgpu_device *adev = drm_to_adev(dev);
4918 if (amdgpu_sriov_vf(adev)) {
4919 r = amdgpu_virt_request_full_gpu(adev, true);
4924 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4928 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4931 if (amdgpu_device_need_post(adev)) {
4932 r = amdgpu_device_asic_init(adev);
4934 dev_err(adev->dev, "amdgpu asic init failed\n");
4937 r = amdgpu_device_ip_resume(adev);
4940 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4944 if (!adev->in_s0ix) {
4945 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4950 r = amdgpu_device_ip_late_init(adev);
4954 queue_delayed_work(system_wq, &adev->delayed_init_work,
4955 msecs_to_jiffies(AMDGPU_RESUME_MS));
4957 if (amdgpu_sriov_vf(adev)) {
4958 amdgpu_virt_init_data_exchange(adev);
4959 amdgpu_virt_release_full_gpu(adev, true);
4965 /* Make sure IB tests flushed */
4966 flush_delayed_work(&adev->delayed_init_work);
4969 drm_client_dev_resume(adev_to_drm(adev), false);
4971 amdgpu_ras_resume(adev);
4973 if (adev->mode_info.num_crtc) {
4975 * Most of the connector probing functions try to acquire runtime pm
4976 * refs to ensure that the GPU is powered on when connector polling is
4977 * performed. Since we're calling this from a runtime PM callback,
4978 * trying to acquire rpm refs will cause us to deadlock.
4980 * Since we're guaranteed to be holding the rpm lock, it's safe to
4981 * temporarily disable the rpm helpers so this doesn't deadlock us.
4984 dev->dev->power.disable_depth++;
4986 if (!adev->dc_enabled)
4987 drm_helper_hpd_irq_event(dev);
4989 drm_kms_helper_hotplug_event(dev);
4991 dev->dev->power.disable_depth--;
4994 adev->in_suspend = false;
4996 if (adev->enable_mes)
4997 amdgpu_mes_self_test(adev);
4999 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5000 DRM_WARN("smart shift update failed\n");
5006 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5008 * @adev: amdgpu_device pointer
5010 * The list of all the hardware IPs that make up the asic is walked and
5011 * the check_soft_reset callbacks are run. check_soft_reset determines
5012 * if the asic is still hung or not.
5013 * Returns true if any of the IPs are still in a hung state, false if not.
5015 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5018 bool asic_hang = false;
5020 if (amdgpu_sriov_vf(adev))
5023 if (amdgpu_asic_need_full_reset(adev))
5026 for (i = 0; i < adev->num_ip_blocks; i++) {
5027 if (!adev->ip_blocks[i].status.valid)
5029 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5030 adev->ip_blocks[i].status.hang =
5031 adev->ip_blocks[i].version->funcs->check_soft_reset(
5032 &adev->ip_blocks[i]);
5033 if (adev->ip_blocks[i].status.hang) {
5034 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5042 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5044 * @adev: amdgpu_device pointer
5046 * The list of all the hardware IPs that make up the asic is walked and the
5047 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5048 * handles any IP specific hardware or software state changes that are
5049 * necessary for a soft reset to succeed.
5050 * Returns 0 on success, negative error code on failure.
5052 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5056 for (i = 0; i < adev->num_ip_blocks; i++) {
5057 if (!adev->ip_blocks[i].status.valid)
5059 if (adev->ip_blocks[i].status.hang &&
5060 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5061 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5071 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5073 * @adev: amdgpu_device pointer
5075 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5076 * reset is necessary to recover.
5077 * Returns true if a full asic reset is required, false if not.
5079 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5083 if (amdgpu_asic_need_full_reset(adev))
5086 for (i = 0; i < adev->num_ip_blocks; i++) {
5087 if (!adev->ip_blocks[i].status.valid)
5089 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5090 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5091 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5092 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5093 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5094 if (adev->ip_blocks[i].status.hang) {
5095 dev_info(adev->dev, "Some block need full reset!\n");
5104 * amdgpu_device_ip_soft_reset - do a soft reset
5106 * @adev: amdgpu_device pointer
5108 * The list of all the hardware IPs that make up the asic is walked and the
5109 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5110 * IP specific hardware or software state changes that are necessary to soft
5112 * Returns 0 on success, negative error code on failure.
5114 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5118 for (i = 0; i < adev->num_ip_blocks; i++) {
5119 if (!adev->ip_blocks[i].status.valid)
5121 if (adev->ip_blocks[i].status.hang &&
5122 adev->ip_blocks[i].version->funcs->soft_reset) {
5123 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5133 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5135 * @adev: amdgpu_device pointer
5137 * The list of all the hardware IPs that make up the asic is walked and the
5138 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5139 * handles any IP specific hardware or software state changes that are
5140 * necessary after the IP has been soft reset.
5141 * Returns 0 on success, negative error code on failure.
5143 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5147 for (i = 0; i < adev->num_ip_blocks; i++) {
5148 if (!adev->ip_blocks[i].status.valid)
5150 if (adev->ip_blocks[i].status.hang &&
5151 adev->ip_blocks[i].version->funcs->post_soft_reset)
5152 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5161 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5163 * @adev: amdgpu_device pointer
5164 * @reset_context: amdgpu reset context pointer
5166 * Do a VF FLR and reinitialize the ASIC.
5167 * Returns 0 on success, otherwise an error code.
5169 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5170 struct amdgpu_reset_context *reset_context)
5173 struct amdgpu_hive_info *hive = NULL;
5175 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5176 if (!amdgpu_ras_get_fed_status(adev))
5177 amdgpu_virt_ready_to_reset(adev);
5178 amdgpu_virt_wait_reset(adev);
5179 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5180 r = amdgpu_virt_request_full_gpu(adev, true);
5182 r = amdgpu_virt_reset_gpu(adev);
5187 amdgpu_ras_set_fed(adev, false);
5188 amdgpu_irq_gpu_reset_resume_helper(adev);
5190 /* some SW cleanup the VF needs to do before recovery */
5191 amdgpu_virt_post_reset(adev);
5193 /* Resume IP prior to SMC */
5194 r = amdgpu_device_ip_reinit_early_sriov(adev);
5198 amdgpu_virt_init_data_exchange(adev);
5200 r = amdgpu_device_fw_loading(adev);
5204 /* now we are okay to resume SMC/CP/SDMA */
5205 r = amdgpu_device_ip_reinit_late_sriov(adev);
5209 hive = amdgpu_get_xgmi_hive(adev);
5210 /* Update PSP FW topology after reset */
5211 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5212 r = amdgpu_xgmi_update_topology(hive, adev);
5214 amdgpu_put_xgmi_hive(hive);
5218 r = amdgpu_ib_ring_tests(adev);
5222 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5223 amdgpu_inc_vram_lost(adev);
5225 /* need to be called during full access so we can't do it later like
5228 amdgpu_amdkfd_post_reset(adev);
5229 amdgpu_virt_release_full_gpu(adev, true);
5231 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5232 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5233 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5234 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5235 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5236 amdgpu_ras_resume(adev);
5238 amdgpu_virt_ras_telemetry_post_reset(adev);
5244 * amdgpu_device_has_job_running - check if there is any job in mirror list
5246 * @adev: amdgpu_device pointer
5248 * check if there is any job in mirror list
5250 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5253 struct drm_sched_job *job;
5255 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5256 struct amdgpu_ring *ring = adev->rings[i];
5258 if (!amdgpu_ring_sched_ready(ring))
5261 spin_lock(&ring->sched.job_list_lock);
5262 job = list_first_entry_or_null(&ring->sched.pending_list,
5263 struct drm_sched_job, list);
5264 spin_unlock(&ring->sched.job_list_lock);
5272 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5274 * @adev: amdgpu_device pointer
5276 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5279 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5282 if (amdgpu_gpu_recovery == 0)
5285 /* Skip soft reset check in fatal error mode */
5286 if (!amdgpu_ras_is_poison_mode_supported(adev))
5289 if (amdgpu_sriov_vf(adev))
5292 if (amdgpu_gpu_recovery == -1) {
5293 switch (adev->asic_type) {
5294 #ifdef CONFIG_DRM_AMDGPU_SI
5301 #ifdef CONFIG_DRM_AMDGPU_CIK
5308 case CHIP_CYAN_SKILLFISH:
5318 dev_info(adev->dev, "GPU recovery disabled.\n");
5322 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5327 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5329 dev_info(adev->dev, "GPU mode1 reset\n");
5331 /* Cache the state before bus master disable. The saved config space
5332 * values are used in other cases like restore after mode-2 reset.
5334 amdgpu_device_cache_pci_state(adev->pdev);
5337 pci_clear_master(adev->pdev);
5339 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5340 dev_info(adev->dev, "GPU smu mode1 reset\n");
5341 ret = amdgpu_dpm_mode1_reset(adev);
5343 dev_info(adev->dev, "GPU psp mode1 reset\n");
5344 ret = psp_gpu_reset(adev);
5348 goto mode1_reset_failed;
5350 amdgpu_device_load_pci_state(adev->pdev);
5351 ret = amdgpu_psp_wait_for_bootloader(adev);
5353 goto mode1_reset_failed;
5355 /* wait for asic to come out of reset */
5356 for (i = 0; i < adev->usec_timeout; i++) {
5357 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5359 if (memsize != 0xffffffff)
5364 if (i >= adev->usec_timeout) {
5366 goto mode1_reset_failed;
5369 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5374 dev_err(adev->dev, "GPU mode1 reset failed\n");
5378 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5379 struct amdgpu_reset_context *reset_context)
5382 struct amdgpu_job *job = NULL;
5383 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5384 bool need_full_reset =
5385 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5387 if (reset_context->reset_req_dev == adev)
5388 job = reset_context->job;
5390 if (amdgpu_sriov_vf(adev))
5391 amdgpu_virt_pre_reset(adev);
5393 amdgpu_fence_driver_isr_toggle(adev, true);
5395 /* block all schedulers and reset given job's ring */
5396 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5397 struct amdgpu_ring *ring = adev->rings[i];
5399 if (!amdgpu_ring_sched_ready(ring))
5402 /* Clear job fence from fence drv to avoid force_completion
5403 * leave NULL and vm flush fence in fence drv
5405 amdgpu_fence_driver_clear_job_fences(ring);
5407 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5408 amdgpu_fence_driver_force_completion(ring);
5411 amdgpu_fence_driver_isr_toggle(adev, false);
5414 drm_sched_increase_karma(&job->base);
5416 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5417 /* If reset handler not implemented, continue; otherwise return */
5418 if (r == -EOPNOTSUPP)
5423 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5424 if (!amdgpu_sriov_vf(adev)) {
5426 if (!need_full_reset)
5427 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5429 if (!need_full_reset && amdgpu_gpu_recovery &&
5430 amdgpu_device_ip_check_soft_reset(adev)) {
5431 amdgpu_device_ip_pre_soft_reset(adev);
5432 r = amdgpu_device_ip_soft_reset(adev);
5433 amdgpu_device_ip_post_soft_reset(adev);
5434 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5435 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5436 need_full_reset = true;
5440 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5441 dev_info(tmp_adev->dev, "Dumping IP State\n");
5442 /* Trigger ip dump before we reset the asic */
5443 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5444 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5445 tmp_adev->ip_blocks[i].version->funcs
5446 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5447 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5450 if (need_full_reset)
5451 r = amdgpu_device_ip_suspend(adev);
5452 if (need_full_reset)
5453 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5455 clear_bit(AMDGPU_NEED_FULL_RESET,
5456 &reset_context->flags);
5462 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5464 struct list_head *device_list_handle;
5465 bool full_reset, vram_lost = false;
5466 struct amdgpu_device *tmp_adev;
5469 device_list_handle = reset_context->reset_device_list;
5471 if (!device_list_handle)
5474 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5477 * If this is a reset on init, use the default init level; otherwise keep
5478 * the level as the recovery level.
5480 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5481 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5483 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5486 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5487 amdgpu_set_init_level(tmp_adev, init_level);
5490 amdgpu_ras_set_fed(tmp_adev, false);
5491 r = amdgpu_device_asic_init(tmp_adev);
5493 dev_warn(tmp_adev->dev, "asic atom init failed!");
5495 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5497 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5501 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5503 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5504 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5507 DRM_INFO("VRAM is lost due to GPU reset!\n");
5508 amdgpu_inc_vram_lost(tmp_adev);
5511 r = amdgpu_device_fw_loading(tmp_adev);
5515 r = amdgpu_xcp_restore_partition_mode(
5520 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5524 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5525 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5527 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5532 amdgpu_device_fill_reset_magic(tmp_adev);
5535 * Add this ASIC back as tracked since the reset has already
5536 * completed successfully.
5538 amdgpu_register_gpu_instance(tmp_adev);
5540 if (!reset_context->hive &&
5541 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5542 amdgpu_xgmi_add_device(tmp_adev);
5544 r = amdgpu_device_ip_late_init(tmp_adev);
5548 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5551 * The GPU enters a bad state once the number of faulty pages
5552 * detected by ECC reaches the threshold, and RAS
5553 * recovery is scheduled next. So add a check
5554 * here to break recovery if the threshold has indeed been
5555 * exceeded, and remind the user to
5556 * retire this GPU or set a bigger
5557 * bad_page_threshold value to work around this the next
5558 * time the driver is probed.
5560 if (!amdgpu_ras_is_rma(tmp_adev)) {
5562 amdgpu_ras_resume(tmp_adev);
5568 /* Update PSP FW topology after reset */
5569 if (reset_context->hive &&
5570 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5571 r = amdgpu_xgmi_update_topology(
5572 reset_context->hive, tmp_adev);
5578 /* IP init is complete now, set level as default */
5579 amdgpu_set_init_level(tmp_adev,
5580 AMDGPU_INIT_LEVEL_DEFAULT);
5581 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5582 r = amdgpu_ib_ring_tests(tmp_adev);
5584 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5591 tmp_adev->asic_reset_res = r;
5598 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5599 struct amdgpu_reset_context *reset_context)
5601 struct amdgpu_device *tmp_adev = NULL;
5602 bool need_full_reset, skip_hw_reset;
5605 /* Try reset handler method first */
5606 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5609 reset_context->reset_device_list = device_list_handle;
5610 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5611 /* If reset handler not implemented, continue; otherwise return */
5612 if (r == -EOPNOTSUPP)
5617 /* Reset handler not implemented, use the default method */
5619 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5620 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5623 * ASIC reset has to be done on all XGMI hive nodes ASAP
5624 * to allow proper link negotiation in FW (within 1 sec)
5626 if (!skip_hw_reset && need_full_reset) {
5627 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5628 /* For XGMI run all resets in parallel to speed up the process */
5629 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5630 if (!queue_work(system_unbound_wq,
5631 &tmp_adev->xgmi_reset_work))
5634 r = amdgpu_asic_reset(tmp_adev);
5637 dev_err(tmp_adev->dev,
5638 "ASIC reset failed with error, %d for drm dev, %s",
5639 r, adev_to_drm(tmp_adev)->unique);
5644 /* For XGMI wait for all resets to complete before proceeding */
5646 list_for_each_entry(tmp_adev, device_list_handle,
5648 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5649 flush_work(&tmp_adev->xgmi_reset_work);
5650 r = tmp_adev->asic_reset_res;
5658 if (!r && amdgpu_ras_intr_triggered()) {
5659 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5660 amdgpu_ras_reset_error_count(tmp_adev,
5661 AMDGPU_RAS_BLOCK__MMHUB);
5664 amdgpu_ras_intr_cleared();
5667 r = amdgpu_device_reinit_after_reset(reset_context);
5669 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5671 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5677 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5680 switch (amdgpu_asic_reset_method(adev)) {
5681 case AMD_RESET_METHOD_MODE1:
5682 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5684 case AMD_RESET_METHOD_MODE2:
5685 adev->mp1_state = PP_MP1_STATE_RESET;
5688 adev->mp1_state = PP_MP1_STATE_NONE;
5693 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5695 amdgpu_vf_error_trans_all(adev);
5696 adev->mp1_state = PP_MP1_STATE_NONE;
5699 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5701 struct pci_dev *p = NULL;
5703 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5704 adev->pdev->bus->number, 1);
5706 pm_runtime_enable(&(p->dev));
5707 pm_runtime_resume(&(p->dev));
5713 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5715 enum amd_reset_method reset_method;
5716 struct pci_dev *p = NULL;
5720 * For now, only BACO and mode1 reset are confirmed
5721 * to suffer from the audio issue if the audio device is not properly suspended.
5723 reset_method = amdgpu_asic_reset_method(adev);
5724 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5725 (reset_method != AMD_RESET_METHOD_MODE1))
5728 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5729 adev->pdev->bus->number, 1);
5733 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5736 * If we cannot get the audio device autosuspend delay,
5737 * a fixed 4s interval is used. Since 3s is
5738 * the audio controller's default autosuspend delay setting,
5739 * the 4s used here is guaranteed to cover it.
5741 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5743 while (!pm_runtime_status_suspended(&(p->dev))) {
5744 if (!pm_runtime_suspend(&(p->dev)))
5747 if (expires < ktime_get_mono_fast_ns()) {
5748 dev_warn(adev->dev, "failed to suspend display audio\n");
5750 /* TODO: abort the succeeding gpu reset? */
5755 pm_runtime_disable(&(p->dev));
5761 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5763 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5765 #if defined(CONFIG_DEBUG_FS)
5766 if (!amdgpu_sriov_vf(adev))
5767 cancel_work(&adev->reset_work);
5771 cancel_work(&adev->kfd.reset_work);
5773 if (amdgpu_sriov_vf(adev))
5774 cancel_work(&adev->virt.flr_work);
5776 if (con && adev->ras_enabled)
5777 cancel_work(&con->recovery_work);
5781 static int amdgpu_device_health_check(struct list_head *device_list_handle)
5783 struct amdgpu_device *tmp_adev;
5787 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5788 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5789 if (PCI_POSSIBLE_ERROR(status)) {
5790 dev_err(tmp_adev->dev, "device lost from bus!");
5799 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5801 * @adev: amdgpu_device pointer
5802 * @job: which job trigger hang
5803 * @reset_context: amdgpu reset context pointer
5805 * Attempt to reset the GPU if it has hung (all asics).
5806 * Attempt to do soft-reset or full-reset and reinitialize Asic
5807 * Returns 0 for success or an error on failure.
5810 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5811 struct amdgpu_job *job,
5812 struct amdgpu_reset_context *reset_context)
5814 struct list_head device_list, *device_list_handle = NULL;
5815 bool job_signaled = false;
5816 struct amdgpu_hive_info *hive = NULL;
5817 struct amdgpu_device *tmp_adev = NULL;
5819 bool need_emergency_restart = false;
5820 bool audio_suspended = false;
5821 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5824 * Special case: RAS triggered and full reset isn't supported
5826 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5829 * Flush RAM to disk so that after reboot
5830 * the user can read the log and see why the system rebooted.
5832 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5833 amdgpu_ras_get_context(adev)->reboot) {
5834 DRM_WARN("Emergency reboot.");
5837 emergency_restart();
5840 dev_info(adev->dev, "GPU %s begin!\n",
5841 need_emergency_restart ? "jobs stop":"reset");
5843 if (!amdgpu_sriov_vf(adev))
5844 hive = amdgpu_get_xgmi_hive(adev);
5846 mutex_lock(&hive->hive_lock);
5848 reset_context->job = job;
5849 reset_context->hive = hive;
5851 * Build list of devices to reset.
5852 * In case we are in XGMI hive mode, resort the device list
5853 * to put adev in the 1st position.
5855 INIT_LIST_HEAD(&device_list);
5856 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5857 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5858 list_add_tail(&tmp_adev->reset_list, &device_list);
5860 tmp_adev->shutdown = true;
5862 if (!list_is_first(&adev->reset_list, &device_list))
5863 list_rotate_to_front(&adev->reset_list, &device_list);
5864 device_list_handle = &device_list;
5866 list_add_tail(&adev->reset_list, &device_list);
5867 device_list_handle = &device_list;
5870 if (!amdgpu_sriov_vf(adev)) {
5871 r = amdgpu_device_health_check(device_list_handle);
5876 /* We need to lock reset domain only once both for XGMI and single device */
5877 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5879 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5881 /* block all schedulers and reset given job's ring */
5882 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5884 amdgpu_device_set_mp1_state(tmp_adev);
5887 * Try to put the audio codec into a suspended state
5888 * before the gpu reset starts.
5890 * The power domain of the graphics device
5891 * is shared with the AZ power domain. Without this,
5892 * we may change the audio hardware from behind
5893 * the audio driver's back, which will trigger
5894 * some audio codec errors.
5896 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5897 audio_suspended = true;
5899 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5901 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5903 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5906 * Mark the ASICs to be reset as untracked first,
5907 * and add them back after the reset has completed.
5909 amdgpu_unregister_gpu_instance(tmp_adev);
5911 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
5913 /* disable ras on ALL IPs */
5914 if (!need_emergency_restart &&
5915 amdgpu_device_ip_need_full_reset(tmp_adev))
5916 amdgpu_ras_suspend(tmp_adev);
5918 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5919 struct amdgpu_ring *ring = tmp_adev->rings[i];
5921 if (!amdgpu_ring_sched_ready(ring))
5924 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5926 if (need_emergency_restart)
5927 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5929 atomic_inc(&tmp_adev->gpu_reset_counter);
5932 if (need_emergency_restart)
5933 goto skip_sched_resume;
5936 * Must check guilty signal here since after this point all old
5937 * HW fences are force signaled.
5939 * job->base holds a reference to parent fence
5941 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5942 job_signaled = true;
5943 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5947 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5948 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5949 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5950 /* TODO: Should we stop? */
5952 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5953 r, adev_to_drm(tmp_adev)->unique);
5954 tmp_adev->asic_reset_res = r;
5958 /* Actual ASIC resets if needed.*/
5959 /* Host driver will handle XGMI hive reset for SRIOV */
5960 if (amdgpu_sriov_vf(adev)) {
5961 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5962 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5963 amdgpu_ras_set_fed(adev, true);
5964 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5967 r = amdgpu_device_reset_sriov(adev, reset_context);
5968 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5969 amdgpu_virt_release_full_gpu(adev, true);
5973 adev->asic_reset_res = r;
5975 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5976 if (r && r == -EAGAIN)
5980 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5982 * Drop any pending non-scheduler resets queued before reset is done.
5983 * Any reset scheduled after this point would be valid. Scheduler resets
5984 * were already dropped during drm_sched_stop and no new ones can come
5985 * in before drm_sched_start.
5987 amdgpu_device_stop_pending_resets(tmp_adev);
5992 /* Post ASIC reset for all devs. */
5993 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5995 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5996 struct amdgpu_ring *ring = tmp_adev->rings[i];
5998 if (!amdgpu_ring_sched_ready(ring))
6001 drm_sched_start(&ring->sched, 0);
6004 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6005 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6007 if (tmp_adev->asic_reset_res)
6008 r = tmp_adev->asic_reset_res;
6010 tmp_adev->asic_reset_res = 0;
6013 /* bad news, how to tell it to userspace?
6014 * for ras error, we should report GPU bad status instead of
6017 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6018 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6019 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6020 atomic_read(&tmp_adev->gpu_reset_counter));
6021 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6023 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6024 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6025 DRM_WARN("smart shift update failed\n");
6030 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6031 /* unlock kfd: SRIOV would do it separately */
6032 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6033 amdgpu_amdkfd_post_reset(tmp_adev);
6035 /* kfd_post_reset will do nothing if the kfd device is not initialized;
6036 * we need to bring up kfd here if it wasn't initialized before
6038 if (!adev->kfd.init_complete)
6039 amdgpu_amdkfd_device_init(adev);
6041 if (audio_suspended)
6042 amdgpu_device_resume_display_audio(tmp_adev);
6044 amdgpu_device_unset_mp1_state(tmp_adev);
6046 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6049 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6051 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6055 mutex_unlock(&hive->hive_lock);
6056 amdgpu_put_xgmi_hive(hive);
6060 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6062 atomic_set(&adev->reset_domain->reset_res, r);
6067 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6069 * @adev: amdgpu_device pointer
6070 * @speed: pointer to the speed of the link
6071 * @width: pointer to the width of the link
6073 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6074 * first physical partner to an AMD dGPU.
6075 * This will exclude any virtual switches and links.
6077 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6078 enum pci_bus_speed *speed,
6079 enum pcie_link_width *width)
6081 struct pci_dev *parent = adev->pdev;
6083 if (!speed || !width)
6086 *speed = PCI_SPEED_UNKNOWN;
6087 *width = PCIE_LNK_WIDTH_UNKNOWN;
6089 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6090 while ((parent = pci_upstream_bridge(parent))) {
6091 /* skip upstream/downstream switches internal to dGPU */
6092 if (parent->vendor == PCI_VENDOR_ID_ATI)
6094 *speed = pcie_get_speed_cap(parent);
6095 *width = pcie_get_width_cap(parent);
6099 /* use the current speeds rather than max if switching is not supported */
6100 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6105 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
6107 * @adev: amdgpu_device pointer
6109 * Fetches and stores in the driver the PCIE capabilities (gen speed
6110 * and lanes) of the slot the device is in. Handles APUs and
6111 * virtualized environments where PCIE config space may not be available.
6113 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6115 struct pci_dev *pdev;
6116 enum pci_bus_speed speed_cap, platform_speed_cap;
6117 enum pcie_link_width platform_link_width;
6119 if (amdgpu_pcie_gen_cap)
6120 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6122 if (amdgpu_pcie_lane_cap)
6123 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6125 /* covers APUs as well */
6126 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6127 if (adev->pm.pcie_gen_mask == 0)
6128 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6129 if (adev->pm.pcie_mlw_mask == 0)
6130 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6134 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6137 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6138 &platform_link_width);
6140 if (adev->pm.pcie_gen_mask == 0) {
6143 speed_cap = pcie_get_speed_cap(pdev);
6144 if (speed_cap == PCI_SPEED_UNKNOWN) {
6145 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6146 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6147 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6149 if (speed_cap == PCIE_SPEED_32_0GT)
6150 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6151 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6152 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6153 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6154 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6155 else if (speed_cap == PCIE_SPEED_16_0GT)
6156 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6157 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6158 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6159 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6160 else if (speed_cap == PCIE_SPEED_8_0GT)
6161 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6162 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6163 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6164 else if (speed_cap == PCIE_SPEED_5_0GT)
6165 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6166 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6168 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6171 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6172 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6173 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6175 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6176 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6177 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6178 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6179 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6180 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6181 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6182 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6183 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6184 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6185 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6186 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6187 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6188 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6189 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6190 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6191 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6192 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6194 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6198 if (adev->pm.pcie_mlw_mask == 0) {
6199 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6200 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6202 switch (platform_link_width) {
6204 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6205 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6206 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6207 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6208 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6209 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6210 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6213 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6214 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6215 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6216 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6217 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6218 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6221 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6222 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6223 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6224 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6225 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6228 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6229 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6230 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6231 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6234 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6235 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6236 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6239 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6240 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6243 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
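/* Editorial worked example: with no amdgpu_pcie_gen_cap/amdgpu_pcie_lane_cap
 * override, a Gen3-capable dGPU sitting in a Gen3 x8 slot ends up with
 *
 *   pcie_gen_mask = CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1/2/3
 *                 | CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1/2/3
 *   pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X8/X4/X2/X1
 *
 * i.e. every speed and width up to the detected maximum, which the power
 * management code can then intersect with what the ASIC itself supports.
 */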
6253 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6255 * @adev: amdgpu_device pointer
6256 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6258 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6259 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6262 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6263 struct amdgpu_device *peer_adev)
6265 #ifdef CONFIG_HSA_AMD_P2P
6267 !adev->gmc.xgmi.connected_to_cpu &&
6268 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6270 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6271 pci_name(peer_adev->pdev));
6273 bool is_large_bar = adev->gmc.visible_vram_size &&
6274 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6275 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6277 if (!p2p_addressable) {
6278 uint64_t address_mask = peer_adev->dev->dma_mask ?
6279 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6280 resource_size_t aper_limit =
6281 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6283 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6284 aper_limit & address_mask);
6286 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
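/* Editorial worked example: a peer whose DMA mask is only 32 bits wide yields
 * address_mask = ~0xffffffffULL, so a visible-VRAM aperture placed above
 * 4 GiB (aper_base or aper_limit with bits >= 32 set) fails the
 * p2p_addressable check and peer access is denied even on a large-BAR board.
 */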
6292 int amdgpu_device_baco_enter(struct drm_device *dev)
6294 struct amdgpu_device *adev = drm_to_adev(dev);
6295 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6297 if (!amdgpu_device_supports_baco(dev))
6300 if (ras && adev->ras_enabled &&
6301 adev->nbio.funcs->enable_doorbell_interrupt)
6302 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6304 return amdgpu_dpm_baco_enter(adev);
6307 int amdgpu_device_baco_exit(struct drm_device *dev)
6309 struct amdgpu_device *adev = drm_to_adev(dev);
6310 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6313 if (!amdgpu_device_supports_baco(dev))
6316 ret = amdgpu_dpm_baco_exit(adev);
6320 if (ras && adev->ras_enabled &&
6321 adev->nbio.funcs->enable_doorbell_interrupt)
6322 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6324 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6325 adev->nbio.funcs->clear_doorbell_interrupt)
6326 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6332 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6333 * @pdev: PCI device struct
6334 * @state: PCI channel state
6336 * Description: Called when a PCI error is detected.
6338 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6340 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6342 struct drm_device *dev = pci_get_drvdata(pdev);
6343 struct amdgpu_device *adev = drm_to_adev(dev);
6346 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6348 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6349 DRM_WARN("No support for XGMI hive yet...");
6350 return PCI_ERS_RESULT_DISCONNECT;
6353 adev->pci_channel_state = state;
6356 case pci_channel_io_normal:
6357 return PCI_ERS_RESULT_CAN_RECOVER;
6358 /* Fatal error, prepare for slot reset */
6359 case pci_channel_io_frozen:
6361 * Locking adev->reset_domain->sem will prevent any external access
6362 * to GPU during PCI error recovery
6364 amdgpu_device_lock_reset_domain(adev->reset_domain);
6365 amdgpu_device_set_mp1_state(adev);
6368 * Block any work scheduling as we do for regular GPU reset
6369 * for the duration of the recovery
6371 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6372 struct amdgpu_ring *ring = adev->rings[i];
6374 if (!amdgpu_ring_sched_ready(ring))
6377 drm_sched_stop(&ring->sched, NULL);
6379 atomic_inc(&adev->gpu_reset_counter);
6380 return PCI_ERS_RESULT_NEED_RESET;
6381 case pci_channel_io_perm_failure:
6382 /* Permanent error, prepare for device removal */
6383 return PCI_ERS_RESULT_DISCONNECT;
6386 return PCI_ERS_RESULT_NEED_RESET;
6390 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6391 * @pdev: pointer to PCI device
6393 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6396 DRM_INFO("PCI error: mmio enabled callback!!\n");
6398 /* TODO - dump whatever for debugging purposes */
6400 /* This is called only if amdgpu_pci_error_detected returns
6401 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6402 * works, no need to reset slot.
6405 return PCI_ERS_RESULT_RECOVERED;
6409 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6410 * @pdev: PCI device struct
6412 * Description: This routine is called by the pci error recovery
6413 * code after the PCI slot has been reset, just before we
6414 * should resume normal operations.
6416 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6418 struct drm_device *dev = pci_get_drvdata(pdev);
6419 struct amdgpu_device *adev = drm_to_adev(dev);
6421 struct amdgpu_reset_context reset_context;
6423 struct list_head device_list;
6425 /* PCI error slot reset should be skipped during RAS recovery */
6426 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6427 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6428 amdgpu_ras_in_recovery(adev))
6429 return PCI_ERS_RESULT_RECOVERED;
6431 DRM_INFO("PCI error: slot reset callback!!\n");
6433 memset(&reset_context, 0, sizeof(reset_context));
6435 INIT_LIST_HEAD(&device_list);
6436 list_add_tail(&adev->reset_list, &device_list);
6438 /* wait for asic to come out of reset */
6441 /* Restore PCI confspace */
6442 amdgpu_device_load_pci_state(pdev);
6444 /* confirm ASIC came out of reset */
6445 for (i = 0; i < adev->usec_timeout; i++) {
6446 memsize = amdgpu_asic_get_config_memsize(adev);
6448 if (memsize != 0xffffffff)
6452 if (memsize == 0xffffffff) {
6457 reset_context.method = AMD_RESET_METHOD_NONE;
6458 reset_context.reset_req_dev = adev;
6459 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6460 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6462 adev->no_hw_access = true;
6463 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6464 adev->no_hw_access = false;
6468 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6472 if (amdgpu_device_cache_pci_state(adev->pdev))
6473 pci_restore_state(adev->pdev);
6475 DRM_INFO("PCIe error recovery succeeded\n");
6477 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6478 amdgpu_device_unset_mp1_state(adev);
6479 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6482 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6486 * amdgpu_pci_resume() - resume normal ops after PCI reset
6487 * @pdev: pointer to PCI device
6489 * Called when the error recovery driver tells us that it's
6490 * OK to resume normal operation.
6492 void amdgpu_pci_resume(struct pci_dev *pdev)
6494 struct drm_device *dev = pci_get_drvdata(pdev);
6495 struct amdgpu_device *adev = drm_to_adev(dev);
6499 DRM_INFO("PCI error: resume callback!!\n");
6501 /* Only continue execution for the case of pci_channel_io_frozen */
6502 if (adev->pci_channel_state != pci_channel_io_frozen)
6505 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6506 struct amdgpu_ring *ring = adev->rings[i];
6508 if (!amdgpu_ring_sched_ready(ring))
6511 drm_sched_start(&ring->sched, 0);
6514 amdgpu_device_unset_mp1_state(adev);
6515 amdgpu_device_unlock_reset_domain(adev->reset_domain);
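/* Editorial sketch (assumption): the four callbacks above are normally
 * exposed to the PCI core through a struct pci_error_handlers referenced by
 * the driver's struct pci_driver. The actual hookup for amdgpu lives in
 * amdgpu_drv.c and may differ in detail.
 *
 *   static const struct pci_error_handlers example_pci_err_handlers = {
 *       .error_detected = amdgpu_pci_error_detected,
 *       .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *       .slot_reset     = amdgpu_pci_slot_reset,
 *       .resume         = amdgpu_pci_resume,
 *   };
 */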
6518 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6520 struct drm_device *dev = pci_get_drvdata(pdev);
6521 struct amdgpu_device *adev = drm_to_adev(dev);
6524 if (amdgpu_sriov_vf(adev))
6527 r = pci_save_state(pdev);
6529 kfree(adev->pci_state);
6531 adev->pci_state = pci_store_saved_state(pdev);
6533 if (!adev->pci_state) {
6534 DRM_ERROR("Failed to store PCI saved state");
6538 DRM_WARN("Failed to save PCI state, err:%d\n", r);
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}
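/*
 * For illustration only (sketch, kept out of the build): HDP flush and
 * invalidate bracket CPU accesses to VRAM through the BAR so CPU and GPU see
 * coherent data. Assuming a mapped visible-VRAM aperture, a typical pattern
 * looks like this:
 */
#if 0	/* sketch only */
	/* CPU wrote into VRAM, make it visible to the GPU */
	memcpy_toio(adev->mman.aper_base_kaddr + offset, src, size);
	amdgpu_device_flush_hdp(adev, NULL);

	/* GPU wrote into VRAM, make it visible to the CPU */
	amdgpu_device_invalidate_hdp(adev, NULL);
	memcpy_fromio(dst, adev->mman.aper_base_kaddr + offset, size);
#endif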
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
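/*
 * For illustration only (sketch, kept out of the build): the two helpers
 * above implement an index/data style indirect access: the port register
 * number is written to the NBIO index offset, then the value is read or
 * written through the data offset under pcie_idx_lock. A read/modify/write
 * with hypothetical register and bit names would look like:
 */
#if 0	/* sketch only */
	u32 val;

	val = amdgpu_device_pcie_port_rreg(adev, some_port_reg);
	val |= SOME_FEATURE_EN;		/* hypothetical bit */
	amdgpu_device_pcie_port_wreg(adev, some_port_reg, val);
#endif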
/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}
/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/* Drop the reference exchanged in adev and the one from get_gang() */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}
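/*
 * For illustration only (sketch, kept out of the build): a caller that wants
 * to install a new gang leader typically loops until the previous,
 * still-unsignaled leader has completed and the switch succeeds. new_gang is
 * a placeholder fence here.
 */
#if 0	/* sketch only */
	struct dma_fence *old;
	int r;

	while ((old = amdgpu_device_switch_gang(adev, new_gang))) {
		r = dma_fence_wait(old, true);
		dma_fence_put(old);
		if (r)
			return r;
	}
#endif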
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
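/*
 * For illustration only (sketch, kept out of the build): a typical caller
 * polls a status register until a ready bit is set, giving up after
 * adev->usec_timeout. The register and mask names here are hypothetical.
 */
#if 0	/* sketch only */
	r = amdgpu_device_wait_on_rreg(adev, 0, regSOME_STATUS, "SOME_STATUS",
				       SOME_STATUS_READY_MASK,
				       SOME_STATUS_READY_MASK);
	if (r)
		return r;
#endif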
ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}
ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");

	return size;
}
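/*
 * For illustration only (sketch, kept out of the build): the two helpers
 * above feed per-IP sysfs attributes. A show() callback built on them, with
 * an assumed supported-reset field on the gfx block, would look roughly like
 * this:
 */
#if 0	/* sketch only */
static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev,
					     struct device_attribute *attr,
					     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
}
static DEVICE_ATTR(gfx_reset_mask, 0444,
		   amdgpu_gfx_get_gfx_reset_mask, NULL);
#endif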