X-Git-Url: https://repo.jachan.dev/linux.git/blobdiff_plain/929ae7c2e3adbbb2c2bddcd16854a6b11b56e95a..8fa76350587b6deb8a95d83f9cb23ce7599587b5:/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6298e3c1de39..1787602fe582 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -81,6 +81,10 @@
 
 #include <drm/drm_drv.h>
 
+#if IS_ENABLED(CONFIG_X86)
+#include <asm/intel-family.h>
+#endif
+
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
@@ -598,7 +602,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
-	if (index < adev->doorbell.num_doorbells) {
+	if (index < adev->doorbell.num_kernel_doorbells) {
 		return readl(adev->doorbell.ptr + index);
 	} else {
 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
@@ -621,7 +625,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
-	if (index < adev->doorbell.num_doorbells) {
+	if (index < adev->doorbell.num_kernel_doorbells) {
 		writel(v, adev->doorbell.ptr + index);
 	} else {
 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
@@ -642,7 +646,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 	if (amdgpu_device_skip_hw_access(adev))
 		return 0;
 
-	if (index < adev->doorbell.num_doorbells) {
+	if (index < adev->doorbell.num_kernel_doorbells) {
 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 	} else {
 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
@@ -665,7 +669,7 @@ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
-	if (index < adev->doorbell.num_doorbells) {
+	if (index < adev->doorbell.num_kernel_doorbells) {
 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 	} else {
 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
@@ -994,7 +998,7 @@ void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 	if (array_size % 3)
 		return;
 
-	for (i = 0; i < array_size; i +=3) {
+	for (i = 0; i < array_size; i += 3) {
 		reg = registers[i + 0];
 		and_mask = registers[i + 1];
 		or_mask = registers[i + 2];
@@ -1056,7 +1060,7 @@ static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 	if (adev->asic_type < CHIP_BONAIRE) {
 		adev->doorbell.base = 0;
 		adev->doorbell.size = 0;
-		adev->doorbell.num_doorbells = 0;
+		adev->doorbell.num_kernel_doorbells = 0;
 		adev->doorbell.ptr = NULL;
 		return 0;
 	}
@@ -1071,27 +1075,27 @@ static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 
 	if (adev->enable_mes) {
-		adev->doorbell.num_doorbells =
+		adev->doorbell.num_kernel_doorbells =
 			adev->doorbell.size / sizeof(u32);
 	} else {
-		adev->doorbell.num_doorbells =
+		adev->doorbell.num_kernel_doorbells =
 			min_t(u32, adev->doorbell.size / sizeof(u32),
 			      adev->doorbell_index.max_assignment+1);
-		if (adev->doorbell.num_doorbells == 0)
+		if (adev->doorbell.num_kernel_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbell use the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
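
The doorbell accessors in the hunks above all share one pattern: every read or write is bounds-checked against num_kernel_doorbells before the BAR is touched. A minimal user-space sketch of that pattern, with plain array stores standing in for the MMIO readl()/writel() calls and a local struct doorbell standing in for the driver's struct amdgpu_doorbell (all names here are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for struct amdgpu_doorbell: a mapped aperture plus the
 * number of doorbell slots the kernel is allowed to touch. */
struct doorbell {
	uint32_t *ptr;                  /* would be an ioremap()ed region */
	uint32_t num_kernel_doorbells;  /* valid slots, in dwords */
};

/* Mirrors amdgpu_mm_wdoorbell(): refuse writes past the aperture. */
static void wdoorbell(struct doorbell *db, uint32_t index, uint32_t v)
{
	if (index < db->num_kernel_doorbells)
		db->ptr[index] = v;     /* the driver uses writel() here */
	else
		fprintf(stderr, "writing beyond doorbell aperture: 0x%08x!\n", index);
}

int main(void)
{
	uint32_t slots[8] = {0};
	struct doorbell db = { .ptr = slots, .num_kernel_doorbells = 8 };

	wdoorbell(&db, 3, 0xcafe);  /* in range: stored */
	wdoorbell(&db, 9, 0xdead);  /* out of range: rejected */
	printf("slot 3 = 0x%x\n", slots[3]);
	return 0;
}

The rename from num_doorbells to num_kernel_doorbells does not change this logic; it only makes explicit that the count covers the kernel-owned slots.
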
-		 * the max num_doorbells should + 1 page (0x400 in dword)
+		 * the max num_kernel_doorbells should + 1 page (0x400 in dword)
 		 */
 		if (adev->asic_type >= CHIP_VEGA10)
-			adev->doorbell.num_doorbells += 0x400;
+			adev->doorbell.num_kernel_doorbells += 0x400;
 	}
 
 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
-				     adev->doorbell.num_doorbells *
+				     adev->doorbell.num_kernel_doorbells *
 				     sizeof(u32));
 	if (adev->doorbell.ptr == NULL)
 		return -ENOMEM;
@@ -1373,6 +1377,17 @@ bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
 	return pcie_aspm_enabled(adev->pdev);
 }
 
+bool amdgpu_device_aspm_support_quirk(void)
+{
+#if IS_ENABLED(CONFIG_X86)
+	struct cpuinfo_x86 *c = &cpu_data(0);
+
+	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
+#else
+	return true;
+#endif
+}
+
 /* if we get transitioned to only one device, take VGA back */
 /**
  * amdgpu_device_vga_set_decode - enable/disable vga decode
@@ -1532,7 +1547,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
 			 amdgpu_sched_jobs);
 		amdgpu_sched_jobs = 4;
-	} else if (!is_power_of_2(amdgpu_sched_jobs)){
+	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
 			 amdgpu_sched_jobs);
 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
@@ -2169,7 +2184,6 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
 	}
 
-	amdgpu_amdkfd_device_probe(adev);
 
 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
@@ -2225,6 +2239,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 	if (!total)
 		return -ENODEV;
 
+	amdgpu_amdkfd_device_probe(adev);
 	adev->cg_flags &= amdgpu_cg_mask;
 	adev->pg_flags &= amdgpu_pg_mask;
 
@@ -2350,7 +2365,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
 		}
 
 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-				   ring->num_hw_submission, amdgpu_job_hang_limit,
+				   ring->num_hw_submission, 0,
 				   timeout, adev->reset_domain->wq,
 				   ring->sched_score, ring->name,
 				   adev->dev);
@@ -2524,8 +2539,6 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	amdgpu_fru_get_product_info(adev);
 
 init_failed:
-	if (amdgpu_sriov_vf(adev))
-		amdgpu_virt_release_full_gpu(adev, true);
 
 	return r;
 }
@@ -2746,8 +2759,9 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 			DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
 
 	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
-	if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)||
-			       adev->asic_type == CHIP_ALDEBARAN ))
+	if (amdgpu_passthrough(adev) &&
+	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
+	     adev->asic_type == CHIP_ALDEBARAN))
 		amdgpu_dpm_handle_passthrough_sbr(adev, true);
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3076,7 +3090,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
 		}
 		adev->ip_blocks[i].status.hw = false;
 		/* handle putting the SMC in the appropriate state */
-		if(!amdgpu_sriov_vf(adev)){
+		if (!amdgpu_sriov_vf(adev)) {
 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
 				if (r) {
@@ -3166,9 +3180,11 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
 		AMD_IP_BLOCK_TYPE_DCE,
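
The new amdgpu_device_aspm_support_quirk() above reports false only on family-6 Alder Lake CPUs, so the driver can veto ASPM on those hosts. A stand-alone sketch of the same check; struct cpuinfo and the 0x97 model constant mirror the kernel's cpuinfo_x86 and INTEL_FAM6_ALDERLAKE, but everything in this snippet is illustrative rather than taken from the patch:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for INTEL_FAM6_ALDERLAKE (0x97 upstream). */
#define FAM6_ALDERLAKE 0x97

/* Stand-in for the fields of struct cpuinfo_x86 the quirk reads. */
struct cpuinfo { int family; int model; };

/* Mirrors amdgpu_device_aspm_support_quirk(): ASPM is allowed on
 * everything except family-6 Alder Lake parts. */
static bool aspm_support_quirk(const struct cpuinfo *c)
{
	return !(c->family == 6 && c->model == FAM6_ALDERLAKE);
}

int main(void)
{
	struct cpuinfo adl = { .family = 6, .model = FAM6_ALDERLAKE };
	struct cpuinfo other = { .family = 6, .model = 0x8e };

	printf("alder lake: %d, other: %d\n",
	       aspm_support_quirk(&adl), aspm_support_quirk(&other));
	return 0;
}

On non-x86 builds the kernel function compiles to a plain "return true", which is why the CONFIG_X86 guard also wraps the asm/intel-family.h include at the top of the diff.
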
 		AMD_IP_BLOCK_TYPE_GFX,
 		AMD_IP_BLOCK_TYPE_SDMA,
+		AMD_IP_BLOCK_TYPE_MES,
 		AMD_IP_BLOCK_TYPE_UVD,
 		AMD_IP_BLOCK_TYPE_VCE,
-		AMD_IP_BLOCK_TYPE_VCN
+		AMD_IP_BLOCK_TYPE_VCN,
+		AMD_IP_BLOCK_TYPE_JPEG
 	};
 
 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
@@ -3288,9 +3304,11 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
 {
 	int r;
 
-	r = amdgpu_amdkfd_resume_iommu(adev);
-	if (r)
-		return r;
+	if (!adev->in_s0ix) {
+		r = amdgpu_amdkfd_resume_iommu(adev);
+		if (r)
+			return r;
+	}
 
 	r = amdgpu_device_ip_resume_phase1(adev);
 	if (r)
@@ -3561,6 +3579,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	int r, i;
 	bool px = false;
 	u32 max_MBps;
+	int tmp;
 
 	adev->shutdown = false;
 	adev->flags = flags;
@@ -3782,7 +3801,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 			}
 		}
 	} else {
+		tmp = amdgpu_reset_method;
+		/* It should do a default reset when loading or reloading the driver,
+		 * regardless of the module parameter reset_method.
+		 */
+		amdgpu_reset_method = AMD_RESET_METHOD_NONE;
 		r = amdgpu_asic_reset(adev);
+		amdgpu_reset_method = tmp;
 		if (r) {
 			dev_err(adev->dev, "asic reset on init failed\n");
 			goto failed;
@@ -3840,18 +3865,6 @@ fence_driver_init:
 
 	r = amdgpu_device_ip_init(adev);
 	if (r) {
-		/* failed in exclusive mode due to timeout */
-		if (amdgpu_sriov_vf(adev) &&
-		    !amdgpu_sriov_runtime(adev) &&
-		    amdgpu_virt_mmio_blocked(adev) &&
-		    !amdgpu_virt_wait_reset(adev)) {
-			dev_err(adev->dev, "VF exclusive mode timeout\n");
-			/* Don't send request since VF is inactive. */
-			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
-			adev->virt.ops = NULL;
-			r = -EAGAIN;
-			goto release_ras_con;
-		}
 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
 		goto release_ras_con;
@@ -3920,8 +3933,10 @@ fence_driver_init:
 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
 	}
 
-	if (amdgpu_sriov_vf(adev))
+	if (amdgpu_sriov_vf(adev)) {
+		amdgpu_virt_release_full_gpu(adev, true);
 		flush_delayed_work(&adev->delayed_init_work);
+	}
 
 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
 	if (r)
@@ -3961,6 +3976,20 @@ fence_driver_init:
 	return 0;
 
 release_ras_con:
+	if (amdgpu_sriov_vf(adev))
+		amdgpu_virt_release_full_gpu(adev, true);
+
+	/* failed in exclusive mode due to timeout */
+	if (amdgpu_sriov_vf(adev) &&
+	    !amdgpu_sriov_runtime(adev) &&
+	    amdgpu_virt_mmio_blocked(adev) &&
+	    !amdgpu_virt_wait_reset(adev)) {
+		dev_err(adev->dev, "VF exclusive mode timeout\n");
+		/* Don't send request since VF is inactive. */
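
The reset-on-init hunk above saves amdgpu_reset_method, forces AMD_RESET_METHOD_NONE (let the reset path pick the ASIC default) for the one reset done while loading or reloading the driver, then restores the user's choice. A compilable sketch of that save/override/restore pattern, with illustrative constants in place of the real enum amd_reset_method values:

#include <stdio.h>

/* Illustrative stand-ins for the real enum amd_reset_method values. */
enum reset_method { METHOD_DEFAULT = -1, METHOD_USER = 2 };

static int reset_method = METHOD_USER;	/* stand-in for the module param */

static int asic_reset(void)
{
	/* METHOD_DEFAULT tells the reset path to pick the ASIC default. */
	printf("resetting with method %d\n", reset_method);
	return 0;
}

/* Mirrors the init-time hunk: force the default method for the
 * boot-time reset, then restore whatever the user requested for
 * every later reset. */
static int reset_on_init(void)
{
	int tmp = reset_method;
	int r;

	reset_method = METHOD_DEFAULT;
	r = asic_reset();
	reset_method = tmp;
	return r;
}

int main(void)
{
	reset_on_init();
	printf("restored method %d\n", reset_method);
	return 0;
}
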
+		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+		adev->virt.ops = NULL;
+		r = -EAGAIN;
+	}
 	amdgpu_release_ras_context(adev);
 
 failed:
@@ -4015,7 +4044,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 	/* disable all interrupts */
 	amdgpu_irq_disable_all(adev);
 
-	if (adev->mode_info.mode_config_initialized){
+	if (adev->mode_info.mode_config_initialized) {
 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
 			drm_helper_force_disable_all(adev_to_drm(adev));
 		else
@@ -4676,42 +4705,42 @@ disabled:
 
 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
 {
-        u32 i;
-        int ret = 0;
+	u32 i;
+	int ret = 0;
 
-        amdgpu_atombios_scratch_regs_engine_hung(adev, true);
+	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
 
-        dev_info(adev->dev, "GPU mode1 reset\n");
+	dev_info(adev->dev, "GPU mode1 reset\n");
 
-        /* disable BM */
-        pci_clear_master(adev->pdev);
+	/* disable BM */
+	pci_clear_master(adev->pdev);
 
-        amdgpu_device_cache_pci_state(adev->pdev);
+	amdgpu_device_cache_pci_state(adev->pdev);
 
-        if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
-                dev_info(adev->dev, "GPU smu mode1 reset\n");
-                ret = amdgpu_dpm_mode1_reset(adev);
-        } else {
-                dev_info(adev->dev, "GPU psp mode1 reset\n");
-                ret = psp_gpu_reset(adev);
-        }
+	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
+		dev_info(adev->dev, "GPU smu mode1 reset\n");
+		ret = amdgpu_dpm_mode1_reset(adev);
+	} else {
+		dev_info(adev->dev, "GPU psp mode1 reset\n");
+		ret = psp_gpu_reset(adev);
+	}
 
-        if (ret)
-                dev_err(adev->dev, "GPU mode1 reset failed\n");
+	if (ret)
+		dev_err(adev->dev, "GPU mode1 reset failed\n");
 
-        amdgpu_device_load_pci_state(adev->pdev);
+	amdgpu_device_load_pci_state(adev->pdev);
 
-        /* wait for asic to come out of reset */
-        for (i = 0; i < adev->usec_timeout; i++) {
-                u32 memsize = adev->nbio.funcs->get_memsize(adev);
+	/* wait for asic to come out of reset */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		u32 memsize = adev->nbio.funcs->get_memsize(adev);
 
-                if (memsize != 0xffffffff)
-                        break;
-                udelay(1);
-        }
+		if (memsize != 0xffffffff)
+			break;
+		udelay(1);
+	}
 
-        amdgpu_atombios_scratch_regs_engine_hung(adev, false);
-        return ret;
+	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
+	return ret;
 }
 
 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
@@ -5161,6 +5190,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
  *
  * @adev: amdgpu_device pointer
  * @job: which job trigger hang
+ * @reset_context: amdgpu reset context pointer
  *
  * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize Asic
@@ -5330,8 +5360,9 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 		if (r)
 			adev->asic_reset_res = r;
 
-		/* Aldebaran supports ras in SRIOV, so need resume ras during reset */
-		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+		/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
+		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
+		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
 			amdgpu_ras_resume(adev);
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
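
The re-indented amdgpu_device_mode1_reset() above waits for the ASIC by polling the NBIO memsize register until it stops reading back as all-ones. A user-space sketch of that poll loop; fake_get_memsize() and usleep() stand in for the driver's get_memsize() callback and udelay(), and the timeout constant is illustrative:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define USEC_TIMEOUT 100000	/* stand-in for adev->usec_timeout */

/* Stand-in for nbio get_memsize(): all-ones means the register is not
 * yet readable, i.e. the ASIC is still in reset. */
static uint32_t fake_get_memsize(int *counter)
{
	return (*counter)++ < 5 ? 0xffffffff : 16384;
}

/* Mirrors the mode1-reset wait loop: poll up to USEC_TIMEOUT times,
 * sleeping 1us per iteration, until the register reads back sanely. */
static int wait_for_asic(void)
{
	int counter = 0;
	uint32_t i;

	for (i = 0; i < USEC_TIMEOUT; i++) {
		uint32_t memsize = fake_get_memsize(&counter);

		if (memsize != 0xffffffff)
			return 0;	/* ASIC is out of reset */
		usleep(1);		/* the driver uses udelay(1) */
	}
	return -1;			/* timed out */
}

int main(void)
{
	printf("wait_for_asic() = %d\n", wait_for_asic());
	return 0;
}
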