X-Git-Url: https://repo.jachan.dev/linux.git/blobdiff_plain/e53a3250f76b8a0dd5b533bd0ce0dc821055e77d..901bdf5ea1a836400ee69aa32b04e9c209271ec7:/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8a16a06cb78a..a6c3265cdbc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -256,6 +256,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, int block_id; uint32_t sub_block; u64 address, value; + /* default value is 0 if the mask is not set by user */ + u32 instance_mask = 0; if (*pos) return -EINVAL; @@ -306,7 +308,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->op = op; if (op == 2) { - if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", + if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x", + &sub_block, &address, &value, &instance_mask) != 4 && + sscanf(str, "%*s %*s %*s %u %llu %llu %u", + &sub_block, &address, &value, &instance_mask) != 4 && + sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx", &sub_block, &address, &value) != 3 && sscanf(str, "%*s %*s %*s %u %llu %llu", &sub_block, &address, &value) != 3) @@ -314,6 +320,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->head.sub_block_index = sub_block; data->inject.address = address; data->inject.value = value; + data->inject.instance_mask = instance_mask; } } else { if (size < sizeof(*data)) @@ -326,6 +333,46 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } +static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev, + struct ras_debug_if *data) +{ + int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; + uint32_t mask, inst_mask = data->inject.instance_mask; + + /* no need to set instance mask if there is only one instance */ + if (num_xcc <= 1 && inst_mask) { + data->inject.instance_mask = 0; + dev_dbg(adev->dev, + "RAS inject mask(0x%x) isn't supported and force it to 0.\n", + inst_mask); + + return; + } + + switch (data->head.block) { + case AMDGPU_RAS_BLOCK__GFX: + mask = GENMASK(num_xcc - 1, 0); + break; + case AMDGPU_RAS_BLOCK__SDMA: + mask = GENMASK(adev->sdma.num_instances - 1, 0); + break; + case AMDGPU_RAS_BLOCK__VCN: + case AMDGPU_RAS_BLOCK__JPEG: + mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); + break; + default: + mask = inst_mask; + break; + } + + /* remove invalid bits in instance mask */ + data->inject.instance_mask &= mask; + if (inst_mask != data->inject.instance_mask) + dev_dbg(adev->dev, + "Adjust RAS inject mask 0x%x to 0x%x\n", + inst_mask, data->inject.instance_mask); +} + /** * DOC: AMDGPU RAS debugfs control interface * @@ -341,7 +388,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * sub_block_index: some IPs have subcomponets. say, GFX, sDMA. * name: the name of IP. * - * inject has two more members than head, they are address, value. + * inject has three more members than head, they are address, value and mask. * As their names indicate, inject operation will write the * value to the address. * @@ -365,7 +412,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * * echo "disable " > /sys/kernel/debug/dri//ras/ras_ctrl * echo "enable " > /sys/kernel/debug/dri//ras/ras_ctrl - * echo "inject
> /sys/kernel/debug/dri//ras/ras_ctrl + * echo "inject
" > /sys/kernel/debug/dri//ras/ras_ctrl * * Where N, is the card which you want to affect. * @@ -382,13 +429,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * * The sub-block is a the sub-block index, pass 0 if there is no sub-block. * The address and value are hexadecimal numbers, leading 0x is optional. + * The mask means instance mask, is optional, default value is 0x1. * * For instance, * * .. code-block:: bash * * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl - * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl + * echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl * * How to check the result of the operation? @@ -460,6 +508,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, break; } + amdgpu_ras_instance_mask_check(adev, &data); + /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; @@ -1115,15 +1165,15 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, block_info.address); } - if (info->head.block == AMDGPU_RAS_BLOCK__GFX) { - if (block_obj->hw_ops->ras_error_inject) - ret = block_obj->hw_ops->ras_error_inject(adev, info); + if (block_obj->hw_ops->ras_error_inject) { + if (info->head.block == AMDGPU_RAS_BLOCK__GFX) + ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); + else /* Special ras_error_inject is defined (e.g: xgmi) */ + ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, + info->instance_mask); } else { - /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */ - if (block_obj->hw_ops->ras_error_inject) - ret = block_obj->hw_ops->ras_error_inject(adev, &block_info); - else /*If not defined .ras_error_inject, use default ras_error_inject*/ - ret = psp_ras_trigger_error(&adev->psp, &block_info); + /* default path */ + ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); } if (ret) @@ -2007,9 +2057,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) /* Perform full reset in fatal error mode */ if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - else + else { clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { + ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; + reset_context.method = AMD_RESET_METHOD_MODE2; + } + } + amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } atomic_set(&ras->in_recovery, 0); @@ -2258,7 +2314,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->eeprom_control.bad_channel_bitmap = 0; - max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); + max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); /* Todo: During test the SMU might fail to read the eeprom through I2C @@ -2624,7 +2680,8 @@ release_con: int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) { - if (adev->gmc.xgmi.connected_to_cpu) + if (adev->gmc.xgmi.connected_to_cpu || + adev->gmc.is_app_apu) return 1; return 0; } @@ -3163,7 +3220,8 @@ bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG)) - return false; + /* keep the check here in case we need to refer to the result later */ + dev_dbg(adev->dev, "Invalid err_info field\n"); /* read err count */ *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT); @@ -3186,17 +3244,17 @@ void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, uint32_t i, j; for (i = 0; i < reg_list_size; i++) { + /* query memory_id from err_status_lo */ + if (!amdgpu_ras_inst_get_memory_id_field(adev, ®_list[i], + instance, &memory_id)) + continue; + /* query err_cnt from err_status_hi */ if (!amdgpu_ras_inst_get_err_cnt_field(adev, ®_list[i], instance, &err_cnt) || !err_cnt) continue; - /* query memory_id from err_status_lo */ - if (!amdgpu_ras_inst_get_memory_id_field(adev, ®_list[i], - instance, &memory_id)) - continue; - *err_count += err_cnt; /* log the errors */