if (ret)
return ret;
- /* gfx block ras dsiable cmd must send to ras-ta */
+ /* gfx block ras disable cmd must send to ras-ta */
if (head->block == AMDGPU_RAS_BLOCK__GFX)
con->features |= BIT(head->block);
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
amdgpu_ras_error_statistic_de_count(&obj->err_data,
- &err_info->mcm_info, NULL, err_info->de_count);
+ &err_info->mcm_info, err_info->de_count);
amdgpu_ras_error_statistic_ce_count(&obj->err_data,
- &err_info->mcm_info, NULL, err_info->ce_count);
+ &err_info->mcm_info, err_info->ce_count);
amdgpu_ras_error_statistic_ue_count(&obj->err_data,
- &err_info->mcm_info, NULL, err_info->ue_count);
+ &err_info->mcm_info, err_info->ue_count);
}
} else {
/* for legacy asic path which doesn't has error source info */
/* gpu reset is fallback for failed and default cases.
* For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
*/
- if (poison_stat && !con->is_rma) {
+ if (poison_stat && !amdgpu_ras_is_rma(adev)) {
event_id = amdgpu_ras_acquire_event_id(adev, type);
RAS_EVENT_LOG(adev, event_id,
"GPU reset for %s RAS poison consumption is issued!\n",
{
mutex_init(&ecc_log->lock);
- /* Set any value as siphash key */
- memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
-
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
ecc_log->de_queried_count = 0;
ecc_log->prev_de_queried_count = 0;
amdgpu_ras_error_data_fini(&err_data);
- if (err_cnt && con->is_rma)
+ if (err_cnt && amdgpu_ras_is_rma(adev))
amdgpu_ras_reset_gpu(adev);
amdgpu_ras_schedule_retirement_dwork(con,
}
/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
- if (reset_flags && !con->is_rma) {
+ if (reset_flags && !amdgpu_ras_is_rma(adev)) {
if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
* This calling fails when is_rma is true or
* ret != 0.
*/
- if (con->is_rma || ret)
+ if (amdgpu_ras_is_rma(adev) || ret)
goto free;
if (con->eeprom_control.ras_num_recs) {
* Except error threshold exceeding case, other failure cases in this
* function would not fail amdgpu driver init.
*/
- if (!con->is_rma)
+ if (!amdgpu_ras_is_rma(adev))
ret = 0;
else
ret = -EINVAL;
/* aca is disabled by default */
adev->aca.is_enabled = false;
+
+ /* bad page feature is not applicable to specific app platform */
+ if (adev->gmc.is_app_apu &&
+ amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
+ amdgpu_bad_page_threshold = 0;
}
static void amdgpu_ras_counte_dw(struct work_struct *work)
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
/* mode1 is the only selection for RMA status */
- if (ras->is_rma) {
+ if (amdgpu_ras_is_rma(adev)) {
ras->gpu_reset_flags = 0;
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
}
if (!err_node)
return NULL;
- INIT_LIST_HEAD(&err_node->err_info.err_addr_list);
-
memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
err_data->err_list_count++;
return &err_node->err_info;
}
-void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
-{
- /* This function will be retired. */
- return;
-}
-
-void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr)
-{
- list_del(&mca_err_addr->node);
- kfree(mca_err_addr);
-}
-
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info,
- struct ras_err_addr *err_addr, u64 count)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count)
{
struct ras_err_info *err_info;
if (!err_info)
return -EINVAL;
- if (err_addr && err_addr->err_status)
- amdgpu_ras_add_mca_err_addr(err_info, err_addr);
-
err_info->ue_count += count;
err_data->ue_count += count;
}
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info,
- struct ras_err_addr *err_addr, u64 count)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count)
{
struct ras_err_info *err_info;
}
int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info,
- struct ras_err_addr *err_addr, u64 count)
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count)
{
struct ras_err_info *err_info;
if (!err_info)
return -EINVAL;
- if (err_addr && err_addr->err_status)
- amdgpu_ras_add_mca_err_addr(err_info, err_addr);
-
err_info->de_count += count;
err_data->de_count += count;
dev_info(adev->dev,
"socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n",
socket_id, aid_id, hbm_id, fw_status);
+
+ if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error))
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n",
+ socket_id, aid_id, fw_status);
+
+ if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error))
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, unknown boot time errors\n",
+ socket_id, aid_id, fw_status);
}
static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
va_end(args);
}
+
+bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!con)
+ return false;
+
+ return con->is_rma;
+}