#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
+#include <linux/pm_runtime.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
* "disable" requires only the block.
* "enable" requires the block and error type.
* "inject" requires the block, error type, address, and value.
+ *
* The block is one of: umc, sdma, gfx, etc.
* see ras_block_string[] for details
+ *
* The error type is one of: ue, ce, where,
* ue is multi-uncorrectable
* ce is single-correctable
+ *
* The sub-block is the sub-block index, pass 0 if there is no sub-block.
* The address and value are hexadecimal numbers, leading 0x is optional.
*
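+ * Illustrative usage (the debugfs path below is an assumption; adjust it to
+ * the actual DRI device index on the system):
+ *
+ *   echo "disable umc" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *   echo "enable umc ce" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *   echo "inject umc ue 0 0x0 0x0" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl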
}
/* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- bool is_ce)
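+/* If non-NULL, @ce_count and @ue_count are set to the total correctable and
+ * uncorrectable error counts summed over all registered RAS blocks; they are
+ * left untouched if RAS is disabled or a block query fails.
+ */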
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- struct ras_err_data data = {0, 0};
+ unsigned long ce, ue;
if (!adev->ras_enabled || !con)
- return 0;
+ return;
+ ce = 0;
+ ue = 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
.head = obj->head,
};
if (amdgpu_ras_query_error_status(adev, &info))
- return 0;
+ return;
- data.ce_count += info.ce_count;
- data.ue_count += info.ue_count;
+ ce += info.ce_count;
+ ue += info.ue_count;
}
- return is_ce ? data.ce_count : data.ue_count;
+ if (ce_count)
+ *ce_count = ce;
+
+ if (ue_count)
+ *ue_count = ue;
}
/* query/inject/cure end */
adev->ras_hw_enabled & amdgpu_ras_mask;
}
+static void amdgpu_ras_counte_dw(struct work_struct *work)
+{
+ struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+ ras_counte_delay_work.work);
+ struct amdgpu_device *adev = con->adev;
+ struct drm_device *dev = adev_to_drm(adev);
+ unsigned long ce_count, ue_count;
+ int res;
+
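+ /* Note: pm_runtime_get_sync() raises the device usage count even when it
+  * fails, so the error path still drops the reference through
+  * pm_runtime_put_autosuspend() at the Out label.
+  */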
+ res = pm_runtime_get_sync(dev->dev);
+ if (res < 0)
+ goto Out;
+
+ /* Cache new values.
+ */
+ amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
+
+ pm_runtime_mark_last_busy(dev->dev);
+Out:
+ pm_runtime_put_autosuspend(dev->dev);
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
if (!con)
return -ENOMEM;
+ con->adev = adev;
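+ /* amdgpu_ras_counte_dw() refreshes the cached CE/UE counts below by
+  * re-querying the error status of every RAS-managed block.
+  */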
+ INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
+ atomic_set(&con->ras_ce_count, 0);
+ atomic_set(&con->ras_ue_count, 0);
+
con->objs = (struct ras_manager *)(con + 1);
amdgpu_ras_set_context(adev, con);
return r;
}
-static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
{
if (adev->gmc.xgmi.connected_to_cpu)
return 1;
struct ras_fs_if *fs_info,
struct ras_ih_if *ih_info)
{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ unsigned long ue_count, ce_count;
int r;
/* disable RAS feature per IP block if it is not supported */
if (r)
goto sysfs;
+ /* These are the cached values at init.
+ */
+ amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
+
return 0;
cleanup:
amdgpu_ras_sysfs_remove(adev, ras_block);
if (!adev->ras_enabled || !con)
return 0;
+
/* Need to disable RAS on all IPs here before ip [hw/sw]fini */
amdgpu_ras_disable_all_features(adev, 0);
amdgpu_ras_recovery_fini(adev);
if (con->features)
amdgpu_ras_disable_all_features(adev, 1);
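+ /* Stop the cached-count refresh work before the RAS context is freed. */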
+ cancel_delayed_work_sync(&con->ras_counte_delay_work);
+
amdgpu_ras_set_context(adev, NULL);
kfree(con);