#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
+#include "amdgpu_fru_eeprom.h"
#include <linux/suspend.h>
#include <drm/task_barrier.h>
+#include <linux/pm_runtime.h>
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
#define AMDGPU_RESUME_MS 2000
"NAVI10",
"NAVI14",
"NAVI12",
+ "SIENNA_CICHLID",
+ "NAVY_FLOUNDER",
"LAST",
};
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
+/**
+ * DOC: product_name
+ *
+ * The amdgpu driver provides a sysfs API for reporting the product name
+ * for the device
+ * The file serial_number is used for this and returns the product name
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_name(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct drm_device *ddev = dev_get_drvdata(dev);
+ struct amdgpu_device *adev = ddev->dev_private;
+
+ return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
+}
+
+static DEVICE_ATTR(product_name, S_IRUGO,
+ amdgpu_device_get_product_name, NULL);
+
+/**
+ * DOC: product_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the part number
+ * for the device
+ * The file serial_number is used for this and returns the part number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_number(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct drm_device *ddev = dev_get_drvdata(dev);
+ struct amdgpu_device *adev = ddev->dev_private;
+
+ return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
+}
+
+static DEVICE_ATTR(product_number, S_IRUGO,
+ amdgpu_device_get_product_number, NULL);
+
+/**
+ * DOC: serial_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the serial number
+ * for the device
+ * The file serial_number is used for this and returns the serial number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_serial_number(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct drm_device *ddev = dev_get_drvdata(dev);
+ struct amdgpu_device *adev = ddev->dev_private;
+
+ return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
+}
+
+static DEVICE_ATTR(serial_number, S_IRUGO,
+ amdgpu_device_get_serial_number, NULL);
+
/**
* amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
*
{
uint32_t ret;
- if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+ if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
return amdgpu_kiq_rreg(adev, reg);
- if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
+ if ((reg * 4) < adev->rmmio_size)
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
else {
unsigned long flags;
{
trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
- if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
+ if ((reg * 4) < adev->rmmio_size)
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
else {
unsigned long flags;
writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}
-
- if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
- udelay(500);
- }
}
/**
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
uint32_t acc_flags)
{
- if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
- adev->last_mm_index = v;
- }
-
- if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+ if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
return amdgpu_kiq_wreg(adev, reg, v);
amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
*/
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
- if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
- adev->last_mm_index = v;
- }
-
if ((reg * 4) < adev->rio_mem_size)
iowrite32(v, adev->rio_mem + (reg * 4));
else {
iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
}
-
- if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
- udelay(500);
- }
}
/**
if (amdgpu_sriov_vf(adev))
return 0;
+ /* skip if the bios has already enabled large BAR */
+ if (adev->gmc.real_vram_size &&
+ (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
+ return 0;
+
/* Check if the root BUS has 64bit memory resources */
root = adev->pdev->bus;
while (root->parent)
amdgpu_vm_fragment_size = -1;
}
+ if (amdgpu_sched_hw_submission < 2) {
+ dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
+ amdgpu_sched_hw_submission);
+ amdgpu_sched_hw_submission = 2;
+ } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
+ dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
+ amdgpu_sched_hw_submission);
+ amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
+ }
+
amdgpu_device_check_smu_prv_buffer_size(adev);
amdgpu_device_check_vm_size(adev);
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
+ amdgpu_gmc_tmz_set(adev);
+
return 0;
}
return;
if (state == VGA_SWITCHEROO_ON) {
- pr_info("amdgpu: switched on\n");
+ pr_info("switched on\n");
/* don't suspend or resume card normally */
dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
dev->switch_power_state = DRM_SWITCH_POWER_ON;
drm_kms_helper_poll_enable(dev);
} else {
- pr_info("amdgpu: switched off\n");
+ pr_info("switched off\n");
drm_kms_helper_poll_disable(dev);
dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
amdgpu_device_suspend(dev, true);
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
const char *chip_name;
- char fw_name[30];
+ char fw_name[40];
int err;
const struct gpu_info_firmware_header_v1_0 *hdr;
adev->firmware.gpu_info_fw = NULL;
+ if (adev->discovery_bin) {
+ amdgpu_discovery_get_gfx_info(adev);
+
+ /*
+ * FIXME: The bounding box is still needed by Navi12, so
+ * temporarily read it from gpu_info firmware. Should be droped
+ * when DAL no longer needs it.
+ */
+ if (adev->asic_type != CHIP_NAVI12)
+ return 0;
+ }
+
switch (adev->asic_type) {
- case CHIP_TOPAZ:
- case CHIP_TONGA:
- case CHIP_FIJI:
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- case CHIP_VEGAM:
- case CHIP_CARRIZO:
- case CHIP_STONEY:
#ifdef CONFIG_DRM_AMDGPU_SI
case CHIP_VERDE:
case CHIP_TAHITI:
case CHIP_KABINI:
case CHIP_MULLINS:
#endif
+ case CHIP_TOPAZ:
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ case CHIP_POLARIS10:
+ case CHIP_POLARIS11:
+ case CHIP_POLARIS12:
+ case CHIP_VEGAM:
+ case CHIP_CARRIZO:
+ case CHIP_STONEY:
case CHIP_VEGA20:
default:
return 0;
chip_name = "vega12";
break;
case CHIP_RAVEN:
- if (adev->rev_id >= 8)
+ if (adev->apu_flags & AMD_APU_IS_RAVEN2)
chip_name = "raven2";
- else if (adev->pdev->device == 0x15d8)
+ else if (adev->apu_flags & AMD_APU_IS_PICASSO)
chip_name = "picasso";
else
chip_name = "raven";
case CHIP_NAVI12:
chip_name = "navi12";
break;
+ case CHIP_SIENNA_CICHLID:
+ chip_name = "sienna_cichlid";
+ break;
+ case CHIP_NAVY_FLOUNDER:
+ chip_name = "navy_flounder";
+ break;
}
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
le32_to_cpu(hdr->header.ucode_array_offset_bytes));
- if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
+ /*
+ * Should be droped when DAL no longer needs it.
+ */
+ if (adev->asic_type == CHIP_NAVI12)
goto parse_soc_bounding_box;
adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
parse_soc_bounding_box:
/*
* soc bounding box info is not integrated in disocovery table,
- * we always need to parse it from gpu info firmware.
+ * we always need to parse it from gpu info firmware if needed.
*/
if (hdr->version_minor == 2) {
const struct gpu_info_firmware_v1_2 *gpu_info_fw =
amdgpu_device_enable_virtual_display(adev);
- switch (adev->asic_type) {
- case CHIP_TOPAZ:
- case CHIP_TONGA:
- case CHIP_FIJI:
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- case CHIP_VEGAM:
- case CHIP_CARRIZO:
- case CHIP_STONEY:
- if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
- adev->family = AMDGPU_FAMILY_CZ;
- else
- adev->family = AMDGPU_FAMILY_VI;
-
- r = vi_set_ip_blocks(adev);
+ if (amdgpu_sriov_vf(adev)) {
+ r = amdgpu_virt_request_full_gpu(adev, true);
if (r)
return r;
- break;
+ }
+
+ switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
case CHIP_VERDE:
case CHIP_TAHITI:
case CHIP_KAVERI:
case CHIP_KABINI:
case CHIP_MULLINS:
- if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
- adev->family = AMDGPU_FAMILY_CI;
- else
+ if (adev->flags & AMD_IS_APU)
adev->family = AMDGPU_FAMILY_KV;
+ else
+ adev->family = AMDGPU_FAMILY_CI;
r = cik_set_ip_blocks(adev);
if (r)
return r;
break;
#endif
+ case CHIP_TOPAZ:
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ case CHIP_POLARIS10:
+ case CHIP_POLARIS11:
+ case CHIP_POLARIS12:
+ case CHIP_VEGAM:
+ case CHIP_CARRIZO:
+ case CHIP_STONEY:
+ if (adev->flags & AMD_IS_APU)
+ adev->family = AMDGPU_FAMILY_CZ;
+ else
+ adev->family = AMDGPU_FAMILY_VI;
+
+ r = vi_set_ip_blocks(adev);
+ if (r)
+ return r;
+ break;
case CHIP_VEGA10:
case CHIP_VEGA12:
case CHIP_VEGA20:
case CHIP_RAVEN:
case CHIP_ARCTURUS:
case CHIP_RENOIR:
- if (adev->asic_type == CHIP_RAVEN ||
- adev->asic_type == CHIP_RENOIR)
+ if (adev->flags & AMD_IS_APU)
adev->family = AMDGPU_FAMILY_RV;
else
adev->family = AMDGPU_FAMILY_AI;
case CHIP_NAVI10:
case CHIP_NAVI14:
case CHIP_NAVI12:
+ case CHIP_SIENNA_CICHLID:
+ case CHIP_NAVY_FLOUNDER:
adev->family = AMDGPU_FAMILY_NV;
r = nv_set_ip_blocks(adev);
return -EINVAL;
}
- r = amdgpu_device_parse_gpu_info_fw(adev);
- if (r)
- return r;
-
- if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
- amdgpu_discovery_get_gfx_info(adev);
-
amdgpu_amdkfd_device_probe(adev);
- if (amdgpu_sriov_vf(adev)) {
- r = amdgpu_virt_request_full_gpu(adev, true);
- if (r)
- return -EAGAIN;
- }
-
adev->pm.pp_feature = amdgpu_pp_feature_mask;
if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
}
/* get the vbios after the asic_funcs are set up */
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
+ r = amdgpu_device_parse_gpu_info_fw(adev);
+ if (r)
+ return r;
+
/* Read BIOS */
if (!amdgpu_get_bios(adev))
return -EINVAL;
amdgpu_xgmi_add_device(adev);
amdgpu_amdkfd_device_init(adev);
+ amdgpu_fru_get_product_info(adev);
+
init_failed:
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, true);
adev->ip_blocks[i].status.late_initialized = true;
}
+ amdgpu_ras_set_error_query_ready(adev, true);
+
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
if (gpu_instance->adev->flags & AMD_IS_APU)
continue;
- r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+ r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
+ AMDGPU_XGMI_PSTATE_MIN);
if (r) {
DRM_ERROR("pstate setting failed (%d).\n", r);
break;
{
int i, r;
+ if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
+ amdgpu_virt_release_ras_err_handler_data(adev);
+
amdgpu_ras_pre_fini(adev);
if (adev->gmc.xgmi.num_physical_nodes > 1)
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.valid)
continue;
+
/* displays are handled separately */
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
- /* XXX handle errors */
- r = adev->ip_blocks[i].version->funcs->suspend(adev);
- /* XXX handle errors */
- if (r) {
- DRM_ERROR("suspend of IP block <%s> failed %d\n",
- adev->ip_blocks[i].version->funcs->name, r);
- return r;
- }
- adev->ip_blocks[i].status.hw = false;
+ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
+ continue;
+
+ /* XXX handle errors */
+ r = adev->ip_blocks[i].version->funcs->suspend(adev);
+ /* XXX handle errors */
+ if (r) {
+ DRM_ERROR("suspend of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name, r);
+ return r;
}
+
+ adev->ip_blocks[i].status.hw = false;
}
return 0;
AMD_IP_BLOCK_TYPE_IH,
};
+ for (i = 0; i < adev->num_ip_blocks; i++)
+ adev->ip_blocks[i].status.hw = false;
+
for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
int j;
struct amdgpu_ip_block *block;
for (j = 0; j < adev->num_ip_blocks; j++) {
block = &adev->ip_blocks[j];
- block->status.hw = false;
if (block->version->type != ip_order[i] ||
!block->status.valid)
continue;
case CHIP_NAVI14:
case CHIP_NAVI12:
case CHIP_RENOIR:
+#endif
+#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
+ case CHIP_SIENNA_CICHLID:
+ case CHIP_NAVY_FLOUNDER:
#endif
return amdgpu_dc != 0;
#endif
* By default timeout for non compute jobs is 10000.
* And there is no timeout enforced on compute jobs.
* In SR-IOV or passthrough mode, timeout for compute
- * jobs are 10000 by default.
+ * jobs are 60000 by default.
*/
adev->gfx_timeout = msecs_to_jiffies(10000);
adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
- adev->compute_timeout = adev->gfx_timeout;
+ adev->compute_timeout = msecs_to_jiffies(60000);
else
adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
return ret;
}
+static const struct attribute *amdgpu_dev_attributes[] = {
+ &dev_attr_product_name.attr,
+ &dev_attr_product_number.attr,
+ &dev_attr_serial_number.attr,
+ &dev_attr_pcie_replay_count.attr,
+ NULL
+};
+
/**
* amdgpu_device_init - initialize the driver
*
INIT_LIST_HEAD(&adev->shadow_list);
mutex_init(&adev->shadow_list_lock);
- INIT_LIST_HEAD(&adev->ring_lru_list);
- spin_lock_init(&adev->ring_lru_list_lock);
-
INIT_DELAYED_WORK(&adev->delayed_init_work,
amdgpu_device_delayed_init_work_handler);
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
adev->gfx.gfx_off_req_count = 1;
- adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
+ adev->pm.ac_power = power_supply_is_system_supplied() > 0;
+
+ atomic_set(&adev->throttling_logging_enabled, 1);
+ /*
+ * If throttling continues, logging will be performed every minute
+ * to avoid log flooding. "-1" is subtracted since the thermal
+ * throttling interrupt comes every second. Thus, the total logging
+ * interval is 59 seconds(retelimited printk interval) + 1(waiting
+ * for throttling interrupt) = 60 seconds.
+ */
+ ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
+ ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
/* Registers mapping */
/* TODO: block userspace mapping of io register */
if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
adev->enable_mes = true;
- if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) {
- r = amdgpu_discovery_init(adev);
- if (r) {
- dev_err(adev->dev, "amdgpu_discovery_init failed\n");
- return r;
- }
- }
-
- /* early init functions */
- r = amdgpu_device_ip_early_init(adev);
- if (r)
- return r;
+ /* detect hw virtualization here */
+ amdgpu_detect_virtualization(adev);
r = amdgpu_device_get_job_timeout_settings(adev);
if (r) {
return r;
}
+ /* early init functions */
+ r = amdgpu_device_ip_early_init(adev);
+ if (r)
+ return r;
+
/* doorbell bar mapping and doorbell index init*/
amdgpu_device_doorbell_init(adev);
goto failed;
}
- DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
+ dev_info(adev->dev,
+ "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
adev->gfx.config.max_shader_engines,
adev->gfx.config.max_sh_per_se,
adev->gfx.config.max_cu_per_sh,
adev->gfx.cu_info.number);
- amdgpu_ctx_init_sched(adev);
-
adev->accel_working = true;
amdgpu_vm_check_compute_bug(adev);
queue_delayed_work(system_wq, &adev->delayed_init_work,
msecs_to_jiffies(AMDGPU_RESUME_MS));
- r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
+ if (amdgpu_sriov_vf(adev))
+ flush_delayed_work(&adev->delayed_init_work);
+
+ r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
if (r) {
- dev_err(adev->dev, "Could not create pcie_replay_count");
+ dev_err(adev->dev, "Could not create amdgpu device attr\n");
return r;
}
amdgpu_pm_sysfs_fini(adev);
amdgpu_fbdev_fini(adev);
r = amdgpu_device_ip_fini(adev);
- if (adev->firmware.gpu_info_fw) {
- release_firmware(adev->firmware.gpu_info_fw);
- adev->firmware.gpu_info_fw = NULL;
- }
+ release_firmware(adev->firmware.gpu_info_fw);
+ adev->firmware.gpu_info_fw = NULL;
adev->accel_working = false;
/* free i2c buses */
if (!amdgpu_device_has_dc_support(adev))
adev->rmmio = NULL;
amdgpu_device_doorbell_fini(adev);
- device_remove_file(adev->dev, &dev_attr_pcie_replay_count);
if (adev->ucode_sysfs_en)
amdgpu_ucode_sysfs_fini(adev);
+
+ sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
if (IS_ENABLED(CONFIG_PERF_EVENTS))
amdgpu_pmu_fini(adev);
- if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
+ if (adev->discovery_bin)
amdgpu_discovery_fini(adev);
}
* amdgpu_device_suspend - initiate device suspend
*
* @dev: drm dev pointer
- * @suspend: suspend state
* @fbcon : notify the fbdev of suspend
*
* Puts the hw in the suspend state (all asics).
* amdgpu_device_resume - initiate device resume
*
* @dev: drm dev pointer
- * @resume: resume state
* @fbcon : notify the fbdev of resume
*
* Bring the hw back to operating state (all asics).
if (r)
return r;
+ amdgpu_amdkfd_pre_reset(adev);
+
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)
amdgpu_virt_init_data_exchange(adev);
/* we need recover gart prior to run SMC/CP/SDMA resume */
- amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
+ amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
r = amdgpu_device_fw_loading(adev);
if (r)
case CHIP_NAVI10:
case CHIP_NAVI14:
case CHIP_NAVI12:
+ case CHIP_SIENNA_CICHLID:
break;
default:
goto disabled;
int i, r = 0;
bool need_full_reset = *need_full_reset_arg;
+ amdgpu_debugfs_wait_dump(adev);
+
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
amdgpu_inc_vram_lost(tmp_adev);
}
- r = amdgpu_gtt_mgr_recover(
- &tmp_adev->mman.bdev.man[TTM_PL_TT]);
+ r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
if (r)
goto out;
mutex_unlock(&adev->lock_reset);
}
+static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
+{
+ struct pci_dev *p = NULL;
+
+ p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
+ adev->pdev->bus->number, 1);
+ if (p) {
+ pm_runtime_enable(&(p->dev));
+ pm_runtime_resume(&(p->dev));
+ }
+}
+
+static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
+{
+ enum amd_reset_method reset_method;
+ struct pci_dev *p = NULL;
+ u64 expires;
+
+ /*
+ * For now, only BACO and mode1 reset are confirmed
+ * to suffer the audio issue without proper suspended.
+ */
+ reset_method = amdgpu_asic_reset_method(adev);
+ if ((reset_method != AMD_RESET_METHOD_BACO) &&
+ (reset_method != AMD_RESET_METHOD_MODE1))
+ return -EINVAL;
+
+ p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
+ adev->pdev->bus->number, 1);
+ if (!p)
+ return -ENODEV;
+
+ expires = pm_runtime_autosuspend_expiration(&(p->dev));
+ if (!expires)
+ /*
+ * If we cannot get the audio device autosuspend delay,
+ * a fixed 4S interval will be used. Considering 3S is
+ * the audio controller default autosuspend delay setting.
+ * 4S used here is guaranteed to cover that.
+ */
+ expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
+
+ while (!pm_runtime_status_suspended(&(p->dev))) {
+ if (!pm_runtime_suspend(&(p->dev)))
+ break;
+
+ if (expires < ktime_get_mono_fast_ns()) {
+ dev_warn(adev->dev, "failed to suspend display audio\n");
+ /* TODO: abort the succeeding gpu reset? */
+ return -ETIMEDOUT;
+ }
+ }
+
+ pm_runtime_disable(&(p->dev));
+
+ return 0;
+}
+
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
struct amdgpu_job *job)
{
struct list_head device_list, *device_list_handle = NULL;
- bool need_full_reset, job_signaled;
+ bool need_full_reset = false;
+ bool job_signaled = false;
struct amdgpu_hive_info *hive = NULL;
struct amdgpu_device *tmp_adev = NULL;
int i, r = 0;
- bool in_ras_intr = amdgpu_ras_intr_triggered();
- bool use_baco =
- (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
- true : false;
+ bool need_emergency_restart = false;
+ bool audio_suspended = false;
+
+ /**
+ * Special case: RAS triggered and full reset isn't supported
+ */
+ need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
/*
* Flush RAM to disk so that after reboot
* the user can read log and see why the system rebooted.
*/
- if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
-
+ if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
DRM_WARN("Emergency reboot.");
ksys_sync_helper();
emergency_restart();
}
- need_full_reset = job_signaled = false;
- INIT_LIST_HEAD(&device_list);
-
dev_info(adev->dev, "GPU %s begin!\n",
- (in_ras_intr && !use_baco) ? "jobs stop":"reset");
-
- cancel_delayed_work_sync(&adev->delayed_init_work);
-
- hive = amdgpu_get_xgmi_hive(adev, false);
+ need_emergency_restart ? "jobs stop":"reset");
/*
* Here we trylock to avoid chain of resets executing from
* We always reset all schedulers for device and all devices for XGMI
* hive so that should take care of them too.
*/
-
+ hive = amdgpu_get_xgmi_hive(adev, true);
if (hive && !mutex_trylock(&hive->reset_lock)) {
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
job ? job->base.id : -1, hive->hive_id);
+ mutex_unlock(&hive->hive_lock);
return 0;
}
- /* Start with adev pre asic reset first for soft reset check.*/
- if (!amdgpu_device_lock_adev(adev, !hive)) {
- DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
- job ? job->base.id : -1);
- return 0;
- }
-
- /* Block kfd: SRIOV would do it separately */
- if (!amdgpu_sriov_vf(adev))
- amdgpu_amdkfd_pre_reset(adev);
-
- /* Build list of devices to reset */
- if (adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!hive) {
- /*unlock kfd: SRIOV would do it separately */
- if (!amdgpu_sriov_vf(adev))
- amdgpu_amdkfd_post_reset(adev);
- amdgpu_device_unlock_adev(adev);
+ /*
+ * Build list of devices to reset.
+ * In case we are in XGMI hive mode, resort the device list
+ * to put adev in the 1st position.
+ */
+ INIT_LIST_HEAD(&device_list);
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ if (!hive)
return -ENODEV;
- }
-
- /*
- * In case we are in XGMI hive mode device reset is done for all the
- * nodes in the hive to retrain all XGMI links and hence the reset
- * sequence is executed in loop on all nodes.
- */
+ if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
+ list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
device_list_handle = &hive->device_list;
} else {
list_add_tail(&adev->gmc.xgmi.head, &device_list);
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- if (tmp_adev != adev) {
- amdgpu_device_lock_adev(tmp_adev, false);
- if (!amdgpu_sriov_vf(tmp_adev))
- amdgpu_amdkfd_pre_reset(tmp_adev);
+ if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
+ DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
+ job ? job->base.id : -1);
+ mutex_unlock(&hive->hive_lock);
+ return 0;
}
+ /*
+ * Try to put the audio codec into suspend state
+ * before gpu reset started.
+ *
+ * Due to the power domain of the graphics device
+ * is shared with AZ power domain. Without this,
+ * we may change the audio hardware from behind
+ * the audio driver's back. That will trigger
+ * some audio codec errors.
+ */
+ if (!amdgpu_device_suspend_display_audio(tmp_adev))
+ audio_suspended = true;
+
+ amdgpu_ras_set_error_query_ready(tmp_adev, false);
+
+ cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
+
+ if (!amdgpu_sriov_vf(tmp_adev))
+ amdgpu_amdkfd_pre_reset(tmp_adev);
+
/*
* Mark these ASICs to be reseted as untracked first
* And add them back after reset completed
*/
amdgpu_unregister_gpu_instance(tmp_adev);
- amdgpu_fbdev_set_suspend(adev, 1);
+ amdgpu_fbdev_set_suspend(tmp_adev, 1);
/* disable ras on ALL IPs */
- if (!(in_ras_intr && !use_baco) &&
+ if (!need_emergency_restart &&
amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev);
drm_sched_stop(&ring->sched, job ? &job->base : NULL);
- if (in_ras_intr && !use_baco)
+ if (need_emergency_restart)
amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
}
}
-
- if (in_ras_intr && !use_baco)
+ if (need_emergency_restart)
goto skip_sched_resume;
/*
* job->base holds a reference to parent fence
*/
if (job && job->base.s_fence->parent &&
- dma_fence_is_signaled(job->base.s_fence->parent))
+ dma_fence_is_signaled(job->base.s_fence->parent)) {
job_signaled = true;
-
- if (job_signaled) {
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
goto skip_hw_reset;
}
-
- /* Guilty job will be freed after this*/
- r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
- if (r) {
- /*TODO Should we stop ?*/
- DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
- r, adev->ddev->unique);
- adev->asic_reset_res = r;
- }
-
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-
- if (tmp_adev == adev)
- continue;
-
r = amdgpu_device_pre_asic_reset(tmp_adev,
NULL,
&need_full_reset);
skip_sched_resume:
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/*unlock kfd: SRIOV would do it separately */
- if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
+ if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_post_reset(tmp_adev);
+ if (audio_suspended)
+ amdgpu_device_resume_display_audio(tmp_adev);
amdgpu_device_unlock_adev(tmp_adev);
}
- if (hive)
+ if (hive) {
mutex_unlock(&hive->reset_lock);
+ mutex_unlock(&hive->hive_lock);
+ }
if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);