static inline void put_obj(struct ras_manager *obj)
{
- if (obj && (--obj->use == 0))
+ if (obj && (--obj->use == 0)) {
list_del(&obj->node);
+ amdgpu_ras_error_data_fini(&obj->err_data);
+ }
+
if (obj && (obj->use < 0))
DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
}
if (alive_obj(obj))
return NULL;
+ if (amdgpu_ras_error_data_init(&obj->err_data))
+ return NULL;
+
obj->head = *head;
obj->adev = adev;
list_add(&obj->node, &con->head);
}
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
- struct ras_query_if *query_if,
+ struct ras_manager *ras_mgr,
struct ras_err_data *err_data,
+ const char *blk_name,
bool is_ue)
{
- struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
- const char *blk_name = get_ras_block_str(&query_if->head);
struct amdgpu_smuio_mcm_config_info *mcm_info;
struct ras_err_node *err_node;
struct ras_err_info *err_info;
- if (is_ue)
- dev_info(adev->dev, "%ld uncorrectable hardware errors detected in %s block\n",
- ras_mgr->err_data.ue_count, blk_name);
- else
- dev_info(adev->dev, "%ld correctable hardware errors detected in %s block\n",
- ras_mgr->err_data.ue_count, blk_name);
+ if (is_ue) {
+ for_each_ras_error(err_node, err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ if (err_info->ue_count) {
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld new uncorrectable hardware errors detected in %s block\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ue_count,
+ blk_name);
+ }
+ }
- for_each_ras_error(err_node, err_data) {
- err_info = &err_node->err_info;
- mcm_info = &err_info->mcm_info;
- if (is_ue && err_info->ue_count) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%lld uncorrectable hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->ue_count,
- blk_name);
- } else if (!is_ue && err_info->ce_count) {
- dev_info(adev->dev, "socket: %d, die: %d "
- "%lld correctable hardware errors detected in %s block\n",
- mcm_info->socket_id,
- mcm_info->die_id,
- err_info->ue_count,
- blk_name);
+ for_each_ras_error(err_node, &ras_mgr->err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld uncorrectable hardware errors detected in total in %s block\n",
+ mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
+ }
+
+ } else {
+ for_each_ras_error(err_node, err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ if (err_info->ce_count) {
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld new correctable hardware errors detected in %s block, "
+ "no user action is needed\n",
+ mcm_info->socket_id,
+ mcm_info->die_id,
+ err_info->ce_count,
+ blk_name);
+ }
+ }
+
+ for_each_ras_error(err_node, &ras_mgr->err_data) {
+ err_info = &err_node->err_info;
+ mcm_info = &err_info->mcm_info;
+ dev_info(adev->dev, "socket: %d, die: %d, "
+ "%lld correctable hardware errors detected in total in %s block, "
+ "no user action is needed\n",
+ mcm_info->socket_id, mcm_info->die_id, err_info->ce_count, blk_name);
}
}
}
+static inline bool err_data_has_source_info(struct ras_err_data *data)
+{
+ return !list_empty(&data->err_node_list);
+}
+
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
struct ras_query_if *query_if,
struct ras_err_data *err_data)
const char *blk_name = get_ras_block_str(&query_if->head);
if (err_data->ce_count) {
- if (!list_empty(&err_data->err_node_list)) {
- amdgpu_ras_error_print_error_data(adev, query_if,
- err_data, false);
+ if (err_data_has_source_info(err_data)) {
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
}
if (err_data->ue_count) {
- if (!list_empty(&err_data->err_node_list)) {
- amdgpu_ras_error_print_error_data(adev, query_if,
- err_data, true);
+ if (err_data_has_source_info(err_data)) {
+ amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, blk_name, true);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
}
+static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
+{
+ struct ras_err_node *err_node;
+ struct ras_err_info *err_info;
+
+ if (err_data_has_source_info(err_data)) {
+ for_each_ras_error(err_node, err_data) {
+ err_info = &err_node->err_info;
+
+ amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count);
+ amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count);
+ }
+ } else {
+ /* for legacy asic path which doesn't has error source info */
+ obj->err_data.ue_count += err_data->ue_count;
+ obj->err_data.ce_count += err_data->ce_count;
+ }
+}
+
/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info)
}
}
- obj->err_data.ue_count += err_data.ue_count;
- obj->err_data.ce_count += err_data.ce_count;
+ amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
info->ue_count = obj->err_data.ue_count;
info->ce_count = obj->err_data.ce_count;
enum amdgpu_ras_block block)
{
struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!block_obj || !block_obj->hw_ops) {
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
return -EOPNOTSUPP;
}
- if (!amdgpu_ras_is_supported(adev, block))
+ /* skip ras error reset in gpu reset */
+ if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery)) &&
+ mca_funcs && mca_funcs->mca_set_debug_mode)
+ return -EOPNOTSUPP;
+
+ if (!amdgpu_ras_is_supported(adev, block) ||
+ !amdgpu_ras_get_mca_debug_mode(adev))
return -EOPNOTSUPP;
if (block_obj->hw_ops->reset_ras_error_count)
/* hw_supported needs to be aligned with RAS block mask. */
adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
-
- /*
- * Disable ras feature for aqua vanjaram
- * by default on apu platform.
- */
- if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6) &&
- adev->gmc.is_app_apu)
- adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 :
- adev->ras_hw_enabled & amdgpu_ras_mask;
- else
- adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
- adev->ras_hw_enabled & amdgpu_ras_mask;
+ adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
+ adev->ras_hw_enabled & amdgpu_ras_mask;
}
static void amdgpu_ras_counte_dw(struct work_struct *work)
return;
/* Init poison supported flag, the default value is false */
- if (adev->gmc.xgmi.connected_to_cpu) {
+ if (adev->gmc.xgmi.connected_to_cpu ||
+ adev->gmc.is_app_apu) {
/* enabled by default when GPU is connected to CPU */
con->poison_supported = true;
} else if (adev->df.funcs &&
return 0;
}
+void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (con)
+ con->is_mca_debug_mode = enable;
+}
+
+bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+ if (!con)
+ return false;
+
+ if (mca_funcs && mca_funcs->mca_set_debug_mode)
+ return con->is_mca_debug_mode;
+ else
+ return true;
+}
/* Register each ip ras block into amdgpu ras */
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
{
struct ras_err_node *err_node, *tmp;
- list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) {
+ list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node)
amdgpu_ras_error_node_release(err_node);
- list_del(&err_node->node);
- }
}
static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
for_each_ras_error(err_node, err_data) {
ref_id = &err_node->err_info.mcm_info;
- if ((mcm_info->socket_id >= 0 && mcm_info->socket_id != ref_id->socket_id) ||
- (mcm_info->die_id >= 0 && mcm_info->die_id != ref_id->die_id))
- continue;
- return err_node;
+ if (mcm_info->socket_id == ref_id->socket_id &&
+ mcm_info->die_id == ref_id->die_id)
+ return err_node;
}
return NULL;