return false;
}
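+/* Mark the VRAM page containing @address as bad and, when bad page
+ * retirement is enabled, persist the record to the bad page list and the
+ * RAS EEPROM. Backs the debugfs "retire_page" command; for testing only.
+ */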
+static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
+{
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct eeprom_table_record err_rec;
+
+ if ((address >= adev->gmc.mc_vram_size) ||
+ (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+ dev_warn(adev->dev,
+ "RAS WARN: input address 0x%llx is invalid.\n",
+ address);
+ return -EINVAL;
+ }
+
+ if (amdgpu_ras_check_bad_page(adev, address)) {
+ dev_warn(adev->dev,
+ "RAS WARN: 0x%llx has been marked as bad page!\n",
+ address);
+ return 0;
+ }
+
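+ /* build a single eeprom record describing the retired page */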
+ memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+ err_rec.address = address;
+ err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
+ err_rec.ts = (uint64_t)ktime_get_real_seconds();
+ err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+
+ err_data.err_addr = &err_rec;
+ err_data.err_addr_cnt = 1;
+
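+ /* add the record to the bad page list and save it to the RAS EEPROM,
+ * unless bad page retirement is disabled (amdgpu_bad_page_threshold == 0)
+ */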
+ if (amdgpu_bad_page_threshold != 0) {
+ amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+ err_data.err_addr_cnt);
+ amdgpu_ras_save_bad_pages(adev);
+ }
+
+ dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
+ dev_warn(adev->dev, "Clear EEPROM:\n");
+ dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
+
+ return 0;
+}
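+
+/* Usage sketch (assuming DRM card index 0; adjust to the actual index):
+ *   echo "retire_page 0x1000" > /sys/kernel/debug/dri/0/ras/ras_ctrl
+ * The ras_ctrl parser below accepts the address in hex (0x-prefixed) or
+ * decimal form.
+ */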
+
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
{
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
+ else if (strstr(str, "retire_page") != NULL)
+ op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;
if (op != -1) {
+
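+ /* "retire_page <address>" accepts the address in hex (0x-prefixed)
+ * or decimal form
+ */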
+ if (op == 3) {
+ if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
+ sscanf(str, "%*s %llu", &address) != 1)
+ return -EINVAL;
+
+ data->op = op;
+ data->inject.address = address;
+
+ return 0;
+ }
+
if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
return -EINVAL;
if (ret)
return -EINVAL;
+ if (data.op == 3) {
+ ret = amdgpu_reserve_page_direct(adev, data.inject.address);
+
+ if (!ret)
+ return size;
+ else
+ return ret;
+ }
+
if (!amdgpu_ras_is_supported(adev, data.head.block))
return -EINVAL;
};
if (!amdgpu_ras_get_error_query_ready(obj->adev))
- return snprintf(buf, PAGE_SIZE,
- "Query currently inaccessible\n");
+ return sysfs_emit(buf, "Query currently inaccessible\n");
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
- return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
- "ue", info.ue_count,
- "ce", info.ce_count);
+ return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
+ "ce", info.ce_count);
}
/* obj begin */
static inline void put_obj(struct ras_manager *obj)
{
- if (obj && --obj->use == 0)
+ if (obj && (--obj->use == 0))
list_del(&obj->node);
- if (obj && obj->use < 0) {
- DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
- }
+ if (obj && (obj->use < 0))
+ DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
}
/* make one obj and return it. */
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!con)
+ if (!adev->ras_features || !con)
return NULL;
if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
struct ras_manager *obj;
int i;
- if (!con)
+ if (!adev->ras_features || !con)
return NULL;
if (head) {
con->features |= BIT(head->block);
} else {
if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
- con->features &= ~BIT(head->block);
+ /* skip clearing the gfx ras context feature for VEGA20 Gaming;
+ * it will be cleaned up later in amdgpu_release_ras_context().
+ */
+ if (!(!adev->ras_features && (con->features & BIT(AMDGPU_RAS_BLOCK__GFX))))
+ con->features &= ~BIT(head->block);
put_obj(obj);
}
}
if (ret)
return ret;
+ /* the gfx block ras disable cmd must be sent to the ras-ta */
+ if (head->block == AMDGPU_RAS_BLOCK__GFX)
+ con->features |= BIT(head->block);
+
ret = amdgpu_ras_feature_enable(adev, head, 0);
}
} else
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
- if (adev->umc.funcs->query_ras_error_count)
- adev->umc.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
/* umc query_ras_error_address is also responsible for clearing
* error status
*/
- if (adev->umc.funcs->query_ras_error_address)
- adev->umc.funcs->query_ras_error_address(adev, &err_data);
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address)
+ adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) {
}
break;
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->query_ras_error_count)
- adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_count)
+ adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
- if (adev->gfx.funcs->query_ras_error_status)
- adev->gfx.funcs->query_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_status)
+ adev->gfx.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->query_ras_error_count)
- adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_count)
+ adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
- if (adev->mmhub.funcs->query_ras_error_status)
- adev->mmhub.funcs->query_ras_error_status(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_status)
+ adev->mmhub.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__PCIE_BIF:
- if (adev->nbio.funcs->query_ras_error_count)
- adev->nbio.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->query_ras_error_count)
+ adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
- amdgpu_xgmi_query_ras_error_count(adev, &err_data);
+ if (adev->gmc.xgmi.ras_funcs &&
+ adev->gmc.xgmi.ras_funcs->query_ras_error_count)
+ adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
break;
default:
break;
switch (block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->reset_ras_error_count)
- adev->gfx.funcs->reset_ras_error_count(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->reset_ras_error_count)
+ adev->gfx.ras_funcs->reset_ras_error_count(adev);
- if (adev->gfx.funcs->reset_ras_error_status)
- adev->gfx.funcs->reset_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->reset_ras_error_status)
+ adev->gfx.ras_funcs->reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->reset_ras_error_count)
- adev->mmhub.funcs->reset_ras_error_count(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->reset_ras_error_count)
+ adev->mmhub.ras_funcs->reset_ras_error_count(adev);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->reset_ras_error_count)
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->ras_error_inject)
- ret = adev->gfx.funcs->ras_error_inject(adev, info);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->ras_error_inject)
+ ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
else
ret = -EINVAL;
break;
case AMDGPU_RAS_BLOCK__UMC:
+ case AMDGPU_RAS_BLOCK__SDMA:
case AMDGPU_RAS_BLOCK__MMHUB:
case AMDGPU_RAS_BLOCK__PCIE_BIF:
ret = psp_ras_trigger_error(&adev->psp, &block_info);
struct ras_manager *obj;
struct ras_err_data data = {0, 0};
- if (!con)
+ if (!adev->ras_features || !con)
return 0;
list_for_each_entry(obj, &con->head, node) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!con)
+ if (!adev->ras_features || !con)
return;
list_for_each_entry(obj, &con->head, node) {
*/
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->query_ras_error_status)
- adev->gfx.funcs->query_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_status)
+ adev->gfx.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->query_ras_error_status)
- adev->mmhub.funcs->query_ras_error_status(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_status)
+ adev->mmhub.ras_funcs->query_ras_error_status(adev);
break;
default:
break;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!con)
+ if (!adev->ras_features || !con)
return;
list_for_each_entry(obj, &con->head, node) {
return ret;
}
-static uint32_t
-amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+ uint32_t max_length)
{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int tmp_threshold = amdgpu_bad_page_threshold;
u64 val;
- uint32_t max_length = 0;
- max_length = amdgpu_ras_eeprom_get_record_max_length();
/*
* Justification of value bad_page_cnt_threshold in ras structure
*
tmp_threshold = max_length;
if (tmp_threshold == -1) {
- val = adev->gmc.real_vram_size;
+ val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
- tmp_threshold = min(lower_32_bits(val), max_length);
+ con->bad_page_cnt_threshold = min(lower_32_bits(val),
+ max_length);
+ } else {
+ con->bad_page_cnt_threshold = tmp_threshold;
}
-
- return tmp_threshold;
}
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
+ uint32_t max_eeprom_records_len = 0;
bool exc_err_limit = false;
int ret;
- if (con)
+ if (adev->ras_features && con)
data = &con->eh_data;
else
return 0;
atomic_set(&con->in_recovery, 0);
con->adev = adev;
- if (!con->bad_page_cnt_threshold) {
- con->bad_page_cnt_threshold =
- amdgpu_ras_calculate_badpags_threshold(adev);
-
- ret = amdgpu_vram_mgr_reserve_backup_pages(
- ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
- con->bad_page_cnt_threshold);
- if (ret)
- goto out;
- }
+ max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+ amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+ /* Todo: during testing, the SMU might fail to read the eeprom through I2C
+ * when the GPU is pending an XGMI reset during probe time
+ * (mostly after the second bus reset); skip it for now.
+ */
+ if (adev->gmc.xgmi.pending_reset)
+ return 0;
ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
/*
* This calling fails when exc_err_limit is true or
return 0;
}
-static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
+static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
{
- if (adev->asic_type != CHIP_VEGA10 &&
- adev->asic_type != CHIP_VEGA20 &&
- adev->asic_type != CHIP_ARCTURUS &&
- adev->asic_type != CHIP_SIENNA_CICHLID)
- return 1;
- else
- return 0;
+ return adev->asic_type == CHIP_VEGA10 ||
+ adev->asic_type == CHIP_VEGA20 ||
+ adev->asic_type == CHIP_ARCTURUS ||
+ adev->asic_type == CHIP_ALDEBARAN ||
+ adev->asic_type == CHIP_SIENNA_CICHLID;
}
/*
*supported = 0;
if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
- amdgpu_ras_check_asic_type(adev))
+ !amdgpu_ras_asic_supported(adev))
return;
- if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
- dev_info(adev->dev, "HBM ECC is active.\n");
- *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
- 1 << AMDGPU_RAS_BLOCK__DF);
- } else
- dev_info(adev->dev, "HBM ECC is not presented.\n");
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+ dev_info(adev->dev, "MEM ECC is active.\n");
+ *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ } else {
+ dev_info(adev->dev, "MEM ECC is not presented.\n");
+ }
- if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
- dev_info(adev->dev, "SRAM ECC is active.\n");
- *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
- 1 << AMDGPU_RAS_BLOCK__DF);
- } else
- dev_info(adev->dev, "SRAM ECC is not presented.\n");
+ if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+ dev_info(adev->dev, "SRAM ECC is active.\n");
+ *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ } else {
+ dev_info(adev->dev, "SRAM ECC is not presented.\n");
+ }
+ } else {
+ /* the driver only manages RAS features for a few IP blocks
+ * when the GPU is connected to the CPU through XGMI */
+ *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX |
+ 1 << AMDGPU_RAS_BLOCK__SDMA |
+ 1 << AMDGPU_RAS_BLOCK__MMHUB);
+ }
/* hw_supported needs to be aligned with RAS block mask. */
*hw_supported &= AMDGPU_RAS_BLOCK_MASK;
amdgpu_ras_check_supported(adev, &con->hw_supported,
&con->supported);
if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
+ /* set the gfx block ras context feature for VEGA20 Gaming;
+ * the ras disable cmd will be sent to the ras ta during ras late init.
+ */
+ if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) {
+ con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
+
+ return 0;
+ }
+
r = 0;
goto release_con;
}
/* Might need get this flag from vbios. */
con->flags = RAS_DEFAULT_FLAGS;
- if (adev->nbio.funcs->init_ras_controller_interrupt) {
- r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
+ /* initialize nbio ras function ahead of any other
+ * ras functions so hardware fatal error interrupt
+ * can be enabled as early as possible */
+ switch (adev->asic_type) {
+ case CHIP_VEGA20:
+ case CHIP_ARCTURUS:
+ case CHIP_ALDEBARAN:
+ if (!adev->gmc.xgmi.connected_to_cpu)
+ adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+ break;
+ default:
+ /* nbio ras is not available */
+ break;
+ }
+
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->init_ras_controller_interrupt) {
+ r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
if (r)
goto release_con;
}
- if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
- r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
+ r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
if (r)
goto release_con;
}
return r;
}
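+/* On ASICs where the GPU is connected to the CPU over XGMI, EDC error counts
+ * can persist across a warm reset; query and clear them during RAS late init
+ * so stale counts are not carried over.
+ */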
+static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+{
+ if (adev->gmc.xgmi.connected_to_cpu)
+ return 1;
+ return 0;
+}
+
+static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
+{
+ struct ras_query_if info = {
+ .head = *ras_block,
+ };
+
+ if (!amdgpu_persistent_edc_harvesting_supported(adev))
+ return 0;
+
+ if (amdgpu_ras_query_error_status(adev, &info) != 0)
+ DRM_WARN("RAS init harvest failure");
+
+ if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
+ DRM_WARN("RAS init harvest reset failure");
+
+ return 0;
+}
+
/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block,
return r;
}
+ /* check for errors on warm reset for ASICs that support persistent EDC */
+ amdgpu_persistent_edc_harvesting(adev, ras_block);
+
/* in resume phase, no need to create ras fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
return 0;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj, *tmp;
- if (!con)
+ if (!adev->ras_features || !con) {
+ /* clean the ras context for VEGA20 Gaming after sending the ras disable cmd */
+ amdgpu_release_ras_context(adev);
+
return;
+ }
if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
/* Set up all other IPs which are not implemented. There is a
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!con)
+ if (!adev->ras_features || !con)
return;
amdgpu_ras_disable_all_features(adev, 0);
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!con)
+ if (!adev->ras_features || !con)
return 0;
/* Need disable ras on all IPs here before ip [hw/sw]fini */
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!con)
+ if (!adev->ras_features || !con)
return 0;
amdgpu_ras_fs_fini(adev);
return false;
}
+
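+/* Release the RAS context that was kept alive on VEGA20 Gaming only so the
+ * gfx ras disable cmd could be sent to the ras ta.
+ */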
+void amdgpu_release_ras_context(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!con)
+ return;
+
+ if (!adev->ras_features && (con->features & BIT(AMDGPU_RAS_BLOCK__GFX))) {
+ con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
+ amdgpu_ras_set_context(adev, NULL);
+ kfree(con);
+ }
+}