Git Repo - J-linux.git/commitdiff
drm/amdgpu: add RAS is_rma flag
author: Tao Zhou <[email protected]>
Thu, 23 May 2024 03:23:20 +0000 (11:23 +0800)
committer: Alex Deucher <[email protected]>
Wed, 5 Jun 2024 15:25:14 +0000 (11:25 -0400)
Set the flag to true if the number of bad pages reaches the threshold.

Signed-off-by: Tao Zhou <[email protected]>
Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index 8dbfdb767f943159bbb8f6a8892f25502b3a519c..b3d11703df042c0cd535500de73d6a3c150b53c3 100644 (file)
@@ -2926,7 +2926,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data **data;
        u32  max_eeprom_records_count = 0;
-       bool exc_err_limit = false;
        int ret;
 
        if (!con || amdgpu_sriov_vf(adev))
@@ -2963,12 +2962,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         */
        if (adev->gmc.xgmi.pending_reset)
                return 0;
-       ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+       ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
        /*
-        * This calling fails when exc_err_limit is true or
+        * This calling fails when is_rma is true or
         * ret != 0.
         */
-       if (exc_err_limit || ret)
+       if (con->is_rma || ret)
                goto free;
 
        if (con->eeprom_control.ras_num_recs) {
@@ -3016,7 +3015,7 @@ out:
         * Except error threshold exceeding case, other failure cases in this
         * function would not fail amdgpu driver init.
         */
-       if (!exc_err_limit)
+       if (!con->is_rma)
                ret = 0;
        else
                ret = -EINVAL;
index 56b9bf63b67f810a53db7c1f7049aa5530cab2f5..e70c45712ddb4a0b26cb59ae2489cbbd01f8cba2 100644 (file)
@@ -522,6 +522,7 @@ struct amdgpu_ras {
        bool update_channel_flag;
        /* Record status of smu mca debug mode */
        bool is_aca_debug_mode;
+       bool is_rma;
 
        /* Record special requirements of gpu reset caller */
        uint32_t  gpu_reset_flags;
index 9b789dcc2bd170df6e54359cedbdd7df1270b6ac..eae0a555df3c482f4535ff6f45ed9f7d88845801 100644 (file)
@@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                        control->tbl_rai.health_percent = 0;
                }
 
+               if (amdgpu_bad_page_threshold != -1)
+                       ras->is_rma = true;
+
                /* ignore the -ENOTSUPP return value */
                amdgpu_dpm_send_rma_reason(adev);
        }
@@ -1321,8 +1324,7 @@ Out:
        return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
 }
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-                          bool *exceed_err_limit)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 {
        struct amdgpu_device *adev = to_amdgpu_device(control);
        unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
        int res;
 
-       *exceed_err_limit = false;
+       ras->is_rma = false;
 
        if (!__is_ras_eeprom_supported(adev))
                return 0;
@@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
                                dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
                                res = 0;
                        } else {
-                               *exceed_err_limit = true;
+                               ras->is_rma = true;
                                dev_err(adev->dev,
                                        "RAS records:%d exceed threshold:%d, "
                                        "GPU will not be initialized. Replace this GPU or increase the threshold",
index 6dfd667f3013d0fd0c990bef7c819a8eeafa563f..b9ebda577797dc12ed6f0af1c74ecca5db4818b5 100644 (file)
@@ -129,8 +129,7 @@ struct eeprom_table_record {
        unsigned char mcumc_id;
 } __packed;
 
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-                          bool *exceed_err_limit);
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
 
 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
 
This page took 0.068066 seconds and 4 git commands to generate.