Merge tag 'drm-misc-next-2021-04-01' of git://anongit.freedesktop.org/drm/drm-misc...
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1fb2a91ad30ad797e06d4ae1b26b91b289f30bd5..0e16683876aa482128ebd6f9333872debb06789a 100644
@@ -109,7 +109,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
        ssize_t s;
        char val[128];
 
-       if (amdgpu_ras_error_query(obj->adev, &info))
+       if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;
 
        s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
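
A side note on the node being read here: the format string builds two
"label: count" lines. Assuming the standard "ue"/"ce" labels this file passes
as the string arguments (they sit outside this hunk), a read of the debugfs
node would produce, for example:

	ue: 0
	ce: 2
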
@@ -434,7 +434,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
                return snprintf(buf, PAGE_SIZE,
                                "Query currently inaccessible\n");
 
-       if (amdgpu_ras_error_query(obj->adev, &info))
+       if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;
 
        return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
@@ -463,7 +463,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;
 
-       if (!con)
+       if (!adev->ras_features || !con)
                return NULL;
 
        if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
@@ -490,7 +490,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
        struct ras_manager *obj;
        int i;
 
-       if (!con)
+       if (!adev->ras_features || !con)
                return NULL;
 
        if (head) {
@@ -590,7 +590,11 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
                con->features |= BIT(head->block);
        } else {
                if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
-                       con->features &= ~BIT(head->block);
+                       /* skip clearing the gfx ras context feature for
+                        * VEGA20 Gaming; it will be cleared later.
+                        */
+                       if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)))
+                               con->features &= ~BIT(head->block);
                        put_obj(obj);
                }
        }
@@ -693,6 +697,10 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
                        if (ret)
                                return ret;
 
+                       /* gfx block ras disable cmd must be sent to ras-ta */
+                       if (head->block == AMDGPU_RAS_BLOCK__GFX)
+                               con->features |= BIT(head->block);
+
                        ret = amdgpu_ras_feature_enable(adev, head, 0);
                }
        } else
@@ -757,8 +765,8 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 /* feature ctl end */
 
 /* query/inject/cure begin */
-int amdgpu_ras_error_query(struct amdgpu_device *adev,
-               struct ras_query_if *info)
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
+       struct ras_query_if *info)
 {
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
        struct ras_err_data err_data = {0, 0, 0, NULL};
@@ -787,10 +795,16 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
        case AMDGPU_RAS_BLOCK__GFX:
                if (adev->gfx.funcs->query_ras_error_count)
                        adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+
+               if (adev->gfx.funcs->query_ras_error_status)
+                       adev->gfx.funcs->query_ras_error_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__MMHUB:
                if (adev->mmhub.funcs->query_ras_error_count)
                        adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
+
+               if (adev->mmhub.funcs->query_ras_error_status)
+                       adev->mmhub.funcs->query_ras_error_status(adev);
                break;
        case AMDGPU_RAS_BLOCK__PCIE_BIF:
                if (adev->nbio.funcs->query_ras_error_count)
@@ -826,6 +840,35 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
        return 0;
 }
 
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+               enum amdgpu_ras_block block)
+{
+       if (!amdgpu_ras_is_supported(adev, block))
+               return -EINVAL;
+
+       switch (block) {
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->reset_ras_error_count)
+                       adev->gfx.funcs->reset_ras_error_count(adev);
+
+               if (adev->gfx.funcs->reset_ras_error_status)
+                       adev->gfx.funcs->reset_ras_error_status(adev);
+               break;
+       case AMDGPU_RAS_BLOCK__MMHUB:
+               if (adev->mmhub.funcs->reset_ras_error_count)
+                       adev->mmhub.funcs->reset_ras_error_count(adev);
+               break;
+       case AMDGPU_RAS_BLOCK__SDMA:
+               if (adev->sdma.funcs->reset_ras_error_count)
+                       adev->sdma.funcs->reset_ras_error_count(adev);
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
 /* Trigger XGMI/WAFL error */
 static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
                                 struct ta_ras_trigger_error_input *block_info)
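
As an aside, a minimal caller sketch of the renamed query entry point paired
with the new reset helper; the choice of block and the surrounding setup are
illustrative only, not part of this patch:

	struct ras_query_if info = {
		.head = { .block = AMDGPU_RAS_BLOCK__GFX },
	};

	/* Query error counts (and, where implemented, per-block status). */
	if (!amdgpu_ras_query_error_status(adev, &info))
		dev_info(adev->dev, "ce: %lu, ue: %lu\n",
			 info.ce_count, info.ue_count);

	/* Clear the block's error count (and, for gfx, status) registers. */
	amdgpu_ras_reset_error_status(adev, AMDGPU_RAS_BLOCK__GFX);
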
@@ -913,7 +956,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
        struct ras_manager *obj;
        struct ras_err_data data = {0, 0};
 
-       if (!con)
+       if (!adev->ras_features || !con)
                return 0;
 
        list_for_each_entry(obj, &con->head, node) {
@@ -921,7 +964,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
                        .head = obj->head,
                };
 
-               if (amdgpu_ras_error_query(adev, &info))
+               if (amdgpu_ras_query_error_status(adev, &info))
                        return 0;
 
                data.ce_count += info.ce_count;
@@ -1137,16 +1180,17 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
  *
  */
 /* debugfs begin */
-static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
+static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct dentry *dir;
        struct drm_minor *minor = adev_to_drm(adev)->primary;
 
-       con->dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
-       debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
-                               adev, &amdgpu_ras_debugfs_ctrl_ops);
-       debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
-                               adev, &amdgpu_ras_debugfs_eeprom_ops);
+       dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
+       debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
+                           &amdgpu_ras_debugfs_ctrl_ops);
+       debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
+                           &amdgpu_ras_debugfs_eeprom_ops);
 
        /*
         * After one uncorrectable error happens, usually GPU recovery will
@@ -1156,24 +1200,24 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
         * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine
         * will never be called.
         */
-       debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
-                               &con->reboot);
+       debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
 
        /*
         * User could set this not to clean up hardware's error count register
         * of RAS IPs during ras recovery.
         */
-       debugfs_create_bool("disable_ras_err_cnt_harvest", 0644,
-                       con->dir, &con->disable_ras_err_cnt_harvest);
+       debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
+                           &con->disable_ras_err_cnt_harvest);
+       return dir;
 }
 
 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
-               struct ras_fs_if *head)
+                                     struct ras_fs_if *head,
+                                     struct dentry *dir)
 {
-       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
 
-       if (!obj || obj->ent)
+       if (!obj || !dir)
                return;
 
        get_obj(obj);
@@ -1182,14 +1226,14 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
                        head->debugfs_name,
                        sizeof(obj->fs_data.debugfs_name));
 
-       obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
-                                      S_IWUGO | S_IRUGO, con->dir, obj,
-                                      &amdgpu_ras_debugfs_ops);
+       debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
+                           obj, &amdgpu_ras_debugfs_ops);
 }
 
 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct dentry *dir;
        struct ras_manager *obj;
        struct ras_fs_if fs_info;
 
@@ -1200,7 +1244,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
        if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
                return;
 
-       amdgpu_ras_debugfs_create_ctrl_node(adev);
+       dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
 
        list_for_each_entry(obj, &con->head, node) {
                if (amdgpu_ras_is_supported(adev, obj->head.block) &&
@@ -1208,34 +1252,11 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
                        sprintf(fs_info.debugfs_name, "%s_err_inject",
                                        ras_block_str(obj->head.block));
                        fs_info.head = obj->head;
-                       amdgpu_ras_debugfs_create(adev, &fs_info);
+                       amdgpu_ras_debugfs_create(adev, &fs_info, dir);
                }
        }
 }
 
-static void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
-               struct ras_common_if *head)
-{
-       struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
-
-       if (!obj || !obj->ent)
-               return;
-
-       obj->ent = NULL;
-       put_obj(obj);
-}
-
-static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
-{
-       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       struct ras_manager *obj, *tmp;
-
-       list_for_each_entry_safe(obj, tmp, &con->head, node) {
-               amdgpu_ras_debugfs_remove(adev, &obj->head);
-       }
-
-       con->dir = NULL;
-}
 /* debugfs end */
 
 /* ras fs */
@@ -1282,8 +1303,17 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 
 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 {
-       if (IS_ENABLED(CONFIG_DEBUG_FS))
-               amdgpu_ras_debugfs_remove_all(adev);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_manager *con_obj, *ip_obj, *tmp;
+
+       if (IS_ENABLED(CONFIG_DEBUG_FS)) {
+               list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
+                       ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
+                       if (ip_obj)
+                               put_obj(ip_obj);
+               }
+       }
+
        amdgpu_ras_sysfs_remove_all(adev);
        return 0;
 }
@@ -1447,7 +1477,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;
 
-       if (!con)
+       if (!adev->ras_features || !con)
                return;
 
        list_for_each_entry(obj, &con->head, node) {
@@ -1464,7 +1494,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
                if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
                        continue;
 
-               amdgpu_ras_error_query(adev, &info);
+               amdgpu_ras_query_error_status(adev, &info);
        }
 }
 
@@ -1495,7 +1525,7 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;
 
-       if (!con)
+       if (!adev->ras_features || !con)
                return;
 
        list_for_each_entry(obj, &con->head, node) {
@@ -1809,7 +1839,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
        bool exc_err_limit = false;
        int ret;
 
-       if (con)
+       if (adev->ras_features && con)
                data = &con->eh_data;
        else
                return 0;
@@ -1828,6 +1858,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
        max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
        amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
+       /* Todo: During testing, the SMU might fail to read the eeprom through
+        * I2C when the GPU is pending an XGMI reset during probe time
+        * (mostly after the second bus reset); skip it for now.
+        */
+       if (adev->gmc.xgmi.pending_reset)
+               return 0;
        ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
        /*
         * This calling fails when exc_err_limit is true or
@@ -1928,11 +1964,11 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
                return;
 
        if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
-               dev_info(adev->dev, "HBM ECC is active.\n");
+               dev_info(adev->dev, "MEM ECC is active.\n");
                *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
                                1 << AMDGPU_RAS_BLOCK__DF);
        } else
-               dev_info(adev->dev, "HBM ECC is not presented.\n");
+               dev_info(adev->dev, "MEM ECC is not present.\n");
 
        if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
                dev_info(adev->dev, "SRAM ECC is active.\n");
@@ -1970,6 +2006,15 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
        amdgpu_ras_check_supported(adev, &con->hw_supported,
                        &con->supported);
        if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
+               /* set the gfx block ras context feature for VEGA20 Gaming;
+                * send the ras disable cmd to the ras-ta during ras late init.
+                */
+               if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) {
+                       con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
+
+                       return 0;
+               }
+
                r = 0;
                goto release_con;
        }
@@ -2083,8 +2128,12 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj, *tmp;
 
-       if (!con)
+       if (!adev->ras_features || !con) {
+               /* clean ras context for VEGA20 Gaming after sending ras disable cmd */
+               amdgpu_release_ras_context(adev);
+
                return;
+       }
 
        if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
                /* Set up all other IPs which are not implemented. There is a
@@ -2125,7 +2174,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-       if (!con)
+       if (!adev->ras_features || !con)
                return;
 
        amdgpu_ras_disable_all_features(adev, 0);
@@ -2139,7 +2188,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-       if (!con)
+       if (!adev->ras_features || !con)
                return 0;
 
        /* Need disable ras on all IPs here before ip [hw/sw]fini */
@@ -2152,7 +2201,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-       if (!con)
+       if (!adev->ras_features || !con)
                return 0;
 
        amdgpu_ras_fs_fini(adev);
@@ -2196,18 +2245,16 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
        return false;
 }
 
-bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
+void amdgpu_release_ras_context(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       bool exc_err_limit = false;
 
-       if (con && (amdgpu_bad_page_threshold != 0))
-               amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
-                                               &exc_err_limit);
+       if (!con)
+               return;
 
-       /*
-        * We are only interested in variable exc_err_limit,
-        * as it says if GPU is in bad state or not.
-        */
-       return exc_err_limit;
+       if (!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
+               con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
+               amdgpu_ras_set_context(adev, NULL);
+               kfree(con);
+       }
 }
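
For completeness, a hedged sketch of how an IP block might populate the
callbacks this patch probes for; the gfx_vX_0_* names are placeholders, not
taken from this change, and only the RAS-related members are shown:

	static const struct amdgpu_gfx_funcs gfx_vX_0_gfx_funcs = {
		/* Blocks that leave the *_status hooks NULL are simply
		 * skipped by the checks added above.
		 */
		.query_ras_error_count  = gfx_vX_0_query_ras_error_count,
		.query_ras_error_status = gfx_vX_0_query_ras_error_status,
		.reset_ras_error_count  = gfx_vX_0_reset_ras_error_count,
		.reset_ras_error_status = gfx_vX_0_reset_ras_error_status,
	};
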