Merge tag 'drm-misc-next-2021-04-01' of git://anongit.freedesktop.org/drm/drm-misc...

[linux.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_drv.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index e39d81b68169338a3d2e9387888f3bb79a319c1b..33991b4a5627e37842bbd63e29395acb81587b7d 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -45,6 +45,7 @@
  #include "amdgpu_amdkfd.h"
  
  #include "amdgpu_ras.h"
+#include "amdgpu_xgmi.h"
  
  /*
   * KMS wrapper.
@@ -169,15 +170,20 @@ uint amdgpu_freesync_vid_mode;
  int amdgpu_reset_method = -1; /* auto */
  int amdgpu_num_kcq = -1;
  
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
+
  struct amdgpu_mgpu_info mgpu_info = {
         .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
+       .delayed_reset_work = __DELAYED_WORK_INITIALIZER(
+                       mgpu_info.delayed_reset_work,
+                       amdgpu_drv_delayed_reset_work_handler, 0), /* handler bound at definition time */
  };
  int amdgpu_ras_enable = -1;
  uint amdgpu_ras_mask = 0xffffffff;
-int amdgpu_bad_page_threshold = 100;
+int amdgpu_bad_page_threshold = -1;
  struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
         .timeout_fatal_disable = false,
-       .period = 0x3f, /* about 8s */
+       .period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */
  };
  
  /**
@@ -545,7 +551,7 @@ module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_di
   * DOC: timeout_period (uint)
   * Modify the watchdog timeout max_cycles as (1 << period)
   */
-MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x1F = default), timeout maxCycles = (1 << period)");
+MODULE_PARM_DESC(timeout_period, "watchdog timeout period (1 to 0x23 (default)), timeout maxCycles = (1 << period)");
  module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
  
  /**
@@ -848,7 +854,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444);
   * faulty pages by ECC exceed threshold value and leave it for user's further
   * check.
   */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
  module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
  
  MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
@@ -1173,9 +1179,9 @@ static const struct pci_device_id pciidlist[] = {
         {0x1002, 0x73FF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
  
         /* Aldebaran */
-       {0x1002, 0x7408, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN},
-       {0x1002, 0x740C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN},
-       {0x1002, 0x740F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN},
+       {0x1002, 0x7408, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
+       {0x1002, 0x740C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
+       {0x1002, 0x740F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
  
         {0, 0, 0}
  };
@@ -1333,6 +1339,69 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
         adev->mp1_state = PP_MP1_STATE_NONE;
  }
  
+/**
+ * amdgpu_drv_delayed_reset_work_handler - work handler for reset
+ *
+ * @work: the work item this handler was scheduled through (not used here).
+ *
+ * Resets every dGPU recorded in mgpu_info in one pass. The first invocation
+ * claims mgpu_info.pending_reset under mgpu_info.mutex; an invocation that
+ * finds the flag already set returns without doing anything.
+ *
+ * Sequence: run amdgpu_device_pre_asic_reset() and queue xgmi_reset_work for
+ * each device, wait for all of that work to finish, remove the devices from
+ * the XGMI hive and from the GPU instance table, then hand the whole list to
+ * amdgpu_do_asic_reset(). If that succeeds, KFD is initialised on devices
+ * where it has not completed yet and amdgpu_ttm_set_buffer_funcs_status()
+ * is called with true for every device.
+ */
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
+{
+       struct list_head device_list;
+       struct amdgpu_device *adev;
+       int i, r;
+       bool need_full_reset = true;
+
+       /* Only the first caller proceeds; the flag is not cleared here. */
+       mutex_lock(&mgpu_info.mutex);
+       if (mgpu_info.pending_reset) {
+               mutex_unlock(&mgpu_info.mutex);
+               return;
+       }
+       mgpu_info.pending_reset = true;
+       mutex_unlock(&mgpu_info.mutex);
+
+       for (i = 0; i < mgpu_info.num_dgpu; i++) {
+               adev = mgpu_info.gpu_ins[i].adev;
+               r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
+               /* A failure is reported but does not stop the reset. */
+               if (r) {
+                       dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s\n",
+                               r, adev_to_drm(adev)->unique);
+               }
+               /*
+                * The return value is deliberately ignored: whether or not
+                * the work was already queued, flush_work() below waits
+                * for it.
+                */
+               queue_work(system_unbound_wq, &adev->xgmi_reset_work);
+       }
+       for (i = 0; i < mgpu_info.num_dgpu; i++) {
+               adev = mgpu_info.gpu_ins[i].adev;
+               flush_work(&adev->xgmi_reset_work);
+               adev->gmc.xgmi.pending_reset = false;
+       }
+
+       /* reset function will rebuild the xgmi hive info, clear it now */
+       for (i = 0; i < mgpu_info.num_dgpu; i++)
+               amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
+
+       INIT_LIST_HEAD(&device_list);
+
+       for (i = 0; i < mgpu_info.num_dgpu; i++)
+               list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list, &device_list);
+
+       /* unregister the GPU first, reset function will add them back */
+       list_for_each_entry(adev, &device_list, reset_list)
+               amdgpu_unregister_gpu_instance(adev);
+
+       r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
+       if (r) {
+               DRM_ERROR("reinit gpus failure\n");
+               return;
+       }
+       for (i = 0; i < mgpu_info.num_dgpu; i++) {
+               adev = mgpu_info.gpu_ins[i].adev;
+               if (!adev->kfd.init_complete)
+                       amdgpu_amdkfd_device_init(adev);
+               amdgpu_ttm_set_buffer_funcs_status(adev, true);
+       }
+}
+
  static int amdgpu_pmops_suspend(struct device *dev)
  {
         struct drm_device *drm_dev = dev_get_drvdata(dev);