amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
GC_HWIP, false,
&rlcg_flag)) {
- ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
+ ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
GC_HWIP, true,
&rlcg_flag)) {
- amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
+ amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_asic_pre_asic_init(adev);
if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
const char *chip_name;
- char fw_name[40];
int err;
const struct gpu_info_firmware_header_v1_0 *hdr;
break;
}
- snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
- err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
+ err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
+ "amdgpu/%s_gpu_info.bin", chip_name);
if (err) {
dev_err(adev->dev,
- "Failed to get gpu_info firmware \"%s\"\n",
- fw_name);
+ "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
+ chip_name);
goto out;
}
mutex_init(&adev->grbm_idx_mutex);
mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock);
+ mutex_init(&adev->virt.rlcg_reg_lock);
hash_init(adev->mn_hash);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
shadow = vmbo->shadow;
/* No need to recover an evicted BO */
- if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
+ if (!shadow->tbo.resource ||
+ shadow->tbo.resource->mem_type != TTM_PL_TT ||
shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
continue;
* amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
*
* @adev: amdgpu_device pointer
- * @from_hypervisor: request from hypervisor
+ * @reset_context: amdgpu reset context pointer
*
* do VF FLR and reinitialize Asic
* return 0 means succeeded otherwise failed
*/
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
- bool from_hypervisor)
+ struct amdgpu_reset_context *reset_context)
{
int r;
struct amdgpu_hive_info *hive = NULL;
- int retry_limit = 0;
-
-retry:
- amdgpu_amdkfd_pre_reset(adev);
- amdgpu_device_stop_pending_resets(adev);
-
- if (from_hypervisor)
+ if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
+ amdgpu_virt_ready_to_reset(adev);
+ amdgpu_virt_wait_reset(adev);
+ clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
r = amdgpu_virt_request_full_gpu(adev, true);
- else
+ } else {
r = amdgpu_virt_reset_gpu(adev);
+ }
if (r)
return r;
+
amdgpu_ras_set_fed(adev, false);
amdgpu_irq_gpu_reset_resume_helper(adev);
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)
- goto error;
+ return r;
amdgpu_virt_init_data_exchange(adev);
/* now we are okay to resume SMC/CP/SDMA */
r = amdgpu_device_ip_reinit_late_sriov(adev);
if (r)
- goto error;
+ return r;
hive = amdgpu_get_xgmi_hive(adev);
/* Update PSP FW topology after reset */
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
r = amdgpu_xgmi_update_topology(hive, adev);
-
if (hive)
amdgpu_put_xgmi_hive(hive);
+ if (r)
+ return r;
- if (!r) {
- r = amdgpu_ib_ring_tests(adev);
-
- amdgpu_amdkfd_post_reset(adev);
- }
+ r = amdgpu_ib_ring_tests(adev);
+ if (r)
+ return r;
-error:
- if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
+ if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
amdgpu_inc_vram_lost(adev);
r = amdgpu_device_recover_vram(adev);
}
- amdgpu_virt_release_full_gpu(adev, true);
+ if (r)
+ return r;
- if (AMDGPU_RETRY_SRIOV_RESET(r)) {
- if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
- retry_limit++;
- goto retry;
- } else
- DRM_ERROR("GPU reset retry is beyond the retry limit\n");
- }
+ /* need to be called during full access so we can't do it later like
+ * bare-metal does.
+ */
+ amdgpu_amdkfd_post_reset(adev);
+ amdgpu_virt_release_full_gpu(adev, true);
- return r;
+ /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
+ if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
+ amdgpu_ras_resume(adev);
+ return 0;
}
/**
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
amdgpu_reset_reg_dumps(tmp_adev);
+ dev_info(tmp_adev->dev, "Dumping IP State\n");
/* Trigger ip dump before we reset the asic */
for (i = 0; i < tmp_adev->num_ip_blocks; i++)
if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
tmp_adev->ip_blocks[i].version->funcs
->dump_ip_state((void *)tmp_adev);
+ dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
}
reset_context->reset_device_list = device_list_handle;
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
+ int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
/*
* Special case: RAS triggered and full reset isn't supported
* to put adev in the 1st position.
*/
INIT_LIST_HEAD(&device_list);
- if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
+ if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
list_add_tail(&tmp_adev->reset_list, &device_list);
if (adev->shutdown)
cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
- if (!amdgpu_sriov_vf(tmp_adev))
- amdgpu_amdkfd_pre_reset(tmp_adev);
+ amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
/*
* Mark these ASICs to be reseted as untracked first
r, adev_to_drm(tmp_adev)->unique);
tmp_adev->asic_reset_res = r;
}
-
- if (!amdgpu_sriov_vf(tmp_adev))
- /*
- * Drop all pending non scheduler resets. Scheduler resets
- * were already dropped during drm_sched_stop
- */
- amdgpu_device_stop_pending_resets(tmp_adev);
}
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
- r = amdgpu_device_reset_sriov(adev, job ? false : true);
+ r = amdgpu_device_reset_sriov(adev, reset_context);
+ if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
+ amdgpu_virt_release_full_gpu(adev, true);
+ goto retry;
+ }
if (r)
adev->asic_reset_res = r;
-
- /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
- if (amdgpu_ip_version(adev, GC_HWIP, 0) ==
- IP_VERSION(9, 4, 2) ||
- amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
- amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
- amdgpu_ras_resume(adev);
} else {
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
if (r && r == -EAGAIN)
goto retry;
}
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ /*
+ * Drop any pending non scheduler resets queued before reset is done.
+ * Any reset scheduled after this point would be valid. Scheduler resets
+ * were already dropped during drm_sched_stop and no new ones can come
+ * in before drm_sched_start.
+ */
+ amdgpu_device_stop_pending_resets(tmp_adev);
+ }
+
skip_hw_reset:
/* Post ASIC reset for all devs .*/
adev->nbio.funcs->enable_doorbell_interrupt)
adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
- if (amdgpu_passthrough(adev) &&
+ if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
adev->nbio.funcs->clear_doorbell_interrupt)
adev->nbio.funcs->clear_doorbell_interrupt(adev);
struct amdgpu_reset_context reset_context;
u32 memsize;
struct list_head device_list;
- struct amdgpu_hive_info *hive;
- int hive_ras_recovery = 0;
- struct amdgpu_ras *ras;
/* PCI error slot reset should be skipped During RAS recovery */
- hive = amdgpu_get_xgmi_hive(adev);
- if (hive) {
- hive_ras_recovery = atomic_read(&hive->ras_recovery);
- amdgpu_put_xgmi_hive(hive);
- }
- ras = amdgpu_ras_get_context(adev);
- if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
- ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+ if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+ amdgpu_ras_in_recovery(adev))
return PCI_ERS_RESULT_RECOVERED;
DRM_INFO("PCI error: slot reset callback!!\n");
spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
+/**
+ * amdgpu_device_get_gang - return a reference to the current gang
+ * @adev: amdgpu_device pointer
+ *
+ * Returns: A new reference to the current gang leader.
+ */
+struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
+{
+ struct dma_fence *fence;
+
+ rcu_read_lock();
+ fence = dma_fence_get_rcu_safe(&adev->gang_submit);
+ rcu_read_unlock();
+ return fence;
+}
+
/**
* amdgpu_device_switch_gang - switch to a new gang
* @adev: amdgpu_device pointer
do {
dma_fence_put(old);
- rcu_read_lock();
- old = dma_fence_get_rcu_safe(&adev->gang_submit);
- rcu_read_unlock();
-
+ old = amdgpu_device_get_gang(adev);
if (old == gang)
break;