Merge tag 'amd-drm-next-6.5-2023-06-09' of https://gitlab.freedesktop.org/agd5f/linux...

[linux.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_ras.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 8a16a06cb78a115f640720002837cda41ffb67a5..a6c3265cdbc46c1540cd559428c091da3d4b3132 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -256,6 +256,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
         int block_id;
         uint32_t sub_block;
         u64 address, value;
+       /* default value is 0 if the mask is not set by user */
+       u32 instance_mask = 0;
  
         if (*pos)
                 return -EINVAL;
@@ -306,7 +308,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
                 data->op = op;
  
                 if (op == 2) {
-                       if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
+                       if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
+                                  &sub_block, &address, &value, &instance_mask) != 4 &&
+                           sscanf(str, "%*s %*s %*s %u %llu %llu %u",
+                                  &sub_block, &address, &value, &instance_mask) != 4 &&
+                               sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
                                    &sub_block, &address, &value) != 3 &&
                             sscanf(str, "%*s %*s %*s %u %llu %llu",
                                    &sub_block, &address, &value) != 3)
@@ -314,6 +320,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
                         data->head.sub_block_index = sub_block;
                         data->inject.address = address;
                         data->inject.value = value;
+                       data->inject.instance_mask = instance_mask;
                 }
         } else {
                 if (size < sizeof(*data))
@@ -326,6 +333,46 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
         return 0;
  }
  
+static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,
+                               struct ras_debug_if *data)
+{
+       int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
+       uint32_t mask, inst_mask = data->inject.instance_mask;
+
+       /* with at most one XCC a non-zero mask is not supported: clear it, log and return */
+       if (num_xcc <= 1 && inst_mask) {
+               data->inject.instance_mask = 0;
+               dev_dbg(adev->dev,
+                       "RAS inject mask(0x%x) isn't supported and force it to 0.\n",
+                       inst_mask);
+
+               return;
+       }
+
+       switch (data->head.block) {
+       case AMDGPU_RAS_BLOCK__GFX:
+               mask = GENMASK(num_xcc - 1, 0);
+               break;
+       case AMDGPU_RAS_BLOCK__SDMA:
+               mask = GENMASK(adev->sdma.num_instances - 1, 0);
+               break;
+       case AMDGPU_RAS_BLOCK__VCN:
+       case AMDGPU_RAS_BLOCK__JPEG:
+               mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0);
+               break;
+       default:
+               mask = inst_mask;       /* other blocks: keep the user mask unchanged */
+               break;
+       }
+
+       /* drop mask bits beyond the block's instance count; log when the mask changed */
+       data->inject.instance_mask &= mask;
+       if (inst_mask != data->inject.instance_mask)
+               dev_dbg(adev->dev,
+                       "Adjust RAS inject mask 0x%x to 0x%x\n",
+                       inst_mask, data->inject.instance_mask);
+}
+
  /**
   * DOC: AMDGPU RAS debugfs control interface
   *
@@ -341,7 +388,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
   * sub_block_index: some IPs have subcomponets. say, GFX, sDMA.
   * name: the name of IP.
   *
- * inject has two more members than head, they are address, value.
+ * inject has three more members than head: address, value and mask.
   * As their names indicate, inject operation will write the
   * value to the address.
   *
@@ -365,7 +412,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
   *
   *     echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
   *     echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
- *     echo "inject  <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *     echo "inject  <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
   *
   * Where N, is the card which you want to affect.
   *
@@ -382,13 +429,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
   *
   * The sub-block is a the sub-block index, pass 0 if there is no sub-block.
   * The address and value are hexadecimal numbers, leading 0x is optional.
+ * The mask is the instance mask; it is optional and its default value is 0x1.
   *
   * For instance,
   *
   * .. code-block:: bash
   *
   *     echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
- *     echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
+ *     echo inject umc ce 0 0 0 3 > /sys/kernel/debug/dri/0/ras/ras_ctrl
   *     echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
   *
   * How to check the result of the operation?
@@ -460,6 +508,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
                         break;
                 }
  
+               amdgpu_ras_instance_mask_check(adev, &data);
+
                 /* data.inject.address is offset instead of absolute gpu address */
                 ret = amdgpu_ras_error_inject(adev, &data.inject);
                 break;
@@ -1115,15 +1165,15 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
                                                           block_info.address);
         }
  
-       if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
-               if (block_obj->hw_ops->ras_error_inject)
-                       ret = block_obj->hw_ops->ras_error_inject(adev, info);
+       if (block_obj->hw_ops->ras_error_inject) {
+               if (info->head.block == AMDGPU_RAS_BLOCK__GFX)
+                       ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
+               else /* Special ras_error_inject is defined (e.g: xgmi) */
+                       ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
+                                               info->instance_mask);
         } else {
-               /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
-               if (block_obj->hw_ops->ras_error_inject)
-                       ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
-               else  /*If not defined .ras_error_inject, use default ras_error_inject*/
-                       ret = psp_ras_trigger_error(&adev->psp, &block_info);
+               /* default path */
+               ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
         }
  
         if (ret)
@@ -2007,9 +2057,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
                 /* Perform full reset in fatal error mode */
                 if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-               else
+               else {
                         clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
  
+                       if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+                               ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+                               reset_context.method = AMD_RESET_METHOD_MODE2;
+                       }
+               }
+
                 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
         }
         atomic_set(&ras->in_recovery, 0);
@@ -2258,7 +2314,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         atomic_set(&con->in_recovery, 0);
         con->eeprom_control.bad_channel_bitmap = 0;
  
-       max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
+       max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
         amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
  
         /* Todo: During test the SMU might fail to read the eeprom through I2C
@@ -2624,7 +2680,8 @@ release_con:
  
  int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
  {
-       if (adev->gmc.xgmi.connected_to_cpu)
+       if (adev->gmc.xgmi.connected_to_cpu ||
+           adev->gmc.is_app_apu)
                 return 1;
         return 0;
  }
@@ -3163,7 +3220,8 @@ bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev,
  
         if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) &&
             !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG))
-               return false;
+               /* keep the check here in case we need to refer to the result later */
+               dev_dbg(adev->dev, "Invalid err_info field\n");
  
         /* read err count */
         *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT);
@@ -3186,17 +3244,17 @@ void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev,
         uint32_t i, j;
  
         for (i = 0; i < reg_list_size; i++) {
+               /* query memory_id from err_status_lo */
+               if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
+                                                        instance, &memory_id))
+                       continue;
+
                 /* query err_cnt from err_status_hi */
                 if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i],
                                                        instance, &err_cnt) ||
                     !err_cnt)
                         continue;
  
-               /* query memory_id from err_status_lo */
-               if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i],
-                                                        instance, &memory_id))
-                       continue;
-
                 *err_count += err_cnt;
  
                 /* log the errors */