#include <linux/reboot.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>
+#include <linux/list_sort.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
}
}
-/* query/inject/cure begin */
-int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
- struct ras_query_if *info)
+static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
+ struct ras_query_if *info,
+ struct ras_err_data *err_data,
+ unsigned int error_query_mode)
{
+ enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL;
+
+ if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
+ return -EINVAL;
+
+ if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
+ if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
+ amdgpu_ras_get_ecc_info(adev, err_data);
+ } else {
+ block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+ if (!block_obj || !block_obj->hw_ops) {
+ dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+ get_ras_block_str(&info->head));
+ return -EINVAL;
+ }
+
+ if (block_obj->hw_ops->query_ras_error_count)
+ block_obj->hw_ops->query_ras_error_count(adev, err_data);
+
+ if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+ (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+ (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
+ if (block_obj->hw_ops->query_ras_error_status)
+ block_obj->hw_ops->query_ras_error_status(adev);
+ }
+ }
+ } else {
+ /* FIXME: add code to check return value later */
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+ }
+
+ return 0;
+}
+
+/* query/inject/cure begin */
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
+{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data;
+ unsigned int error_query_mode;
int ret;
if (!obj)
if (ret)
return ret;
- if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
- amdgpu_ras_get_ecc_info(adev, &err_data);
- } else {
- block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
- if (!block_obj || !block_obj->hw_ops) {
- dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
- get_ras_block_str(&info->head));
- ret = -EINVAL;
- goto out_fini_err_data;
- }
-
- if (block_obj->hw_ops->query_ras_error_count)
- block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+ if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
+ return -EINVAL;
- if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
- (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
- (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
- if (block_obj->hw_ops->query_ras_error_status)
- block_obj->hw_ops->query_ras_error_status(adev);
- }
- }
+ ret = amdgpu_ras_query_error_status_helper(adev, info,
+ &err_data,
+ error_query_mode);
+ if (ret)
+ goto out_fini_err_data;
amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- sysfs_remove_file_from_group(&adev->dev->kobj,
+ if (adev->dev->kobj.sd)
+ sysfs_remove_file_from_group(&adev->dev->kobj,
&con->badpages_attr.attr,
RAS_FS_NAME);
}
.attrs = attrs,
};
- sysfs_remove_group(&adev->dev->kobj, &group);
+ if (adev->dev->kobj.sd)
+ sysfs_remove_group(&adev->dev->kobj, &group);
return 0;
}
if (!obj || !obj->attr_inuse)
return -EINVAL;
- sysfs_remove_file_from_group(&adev->dev->kobj,
+ if (adev->dev->kobj.sd)
+ sysfs_remove_file_from_group(&adev->dev->kobj,
&obj->sysfs_attr.attr,
RAS_FS_NAME);
obj->attr_inuse = 0;
return true;
}
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+ unsigned int *error_query_mode)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+ if (!con) {
+ *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
+ return false;
+ }
+
+ if (mca_funcs && mca_funcs->mca_set_debug_mode)
+ *error_query_mode =
+ (con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
+ else
+ *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
+
+ return true;
+}
+
/* Register each ip ras block into amdgpu ras */
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj)
return err_node;
}
+static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct list_head *b)
+{
+ struct ras_err_node *nodea = container_of(a, struct ras_err_node, node);
+ struct ras_err_node *nodeb = container_of(b, struct ras_err_node, node);
+ struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info;
+ struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info;
+
+ if (unlikely(infoa->socket_id != infob->socket_id))
+ return infoa->socket_id - infob->socket_id;
+ else
+ return infoa->die_id - infob->die_id;
+
+ return 0;
+}
+
static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info)
{
err_data->err_list_count++;
list_add_tail(&err_node->node, &err_data->err_node_list);
+ list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
return &err_node->err_info;
}