Merge tag 'drm-misc-next-2021-04-01' of git://anongit.freedesktop.org/drm/drm-misc...

[linux.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_drv.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index db03be054fad5abdc2728b982bb6aaa5a2c84763..33991b4a5627e37842bbd63e29395acb81587b7d 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -45,6 +45,7 @@
  #include "amdgpu_amdkfd.h"
  
  #include "amdgpu_ras.h"
+#include "amdgpu_xgmi.h"
  
  /*
   * KMS wrapper.
@@ -146,6 +147,7 @@ int amdgpu_compute_multipipe = -1;
  int amdgpu_gpu_recovery = -1; /* auto */
  int amdgpu_emu_mode;
  uint amdgpu_smu_memory_pool_size;
+int amdgpu_smu_pptable_id = -1;
  /*
   * FBC (bit 0) disabled by default
   * MULTI_MON_PP_MCLK_SWITCH (bit 1) enabled by default
@@ -168,12 +170,21 @@ uint amdgpu_freesync_vid_mode;
  int amdgpu_reset_method = -1; /* auto */
  int amdgpu_num_kcq = -1;
  
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
+
  struct amdgpu_mgpu_info mgpu_info = {
         .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
+       .delayed_reset_work = __DELAYED_WORK_INITIALIZER(
+                       mgpu_info.delayed_reset_work,
+                       amdgpu_drv_delayed_reset_work_handler, 0),
  };
  int amdgpu_ras_enable = -1;
  uint amdgpu_ras_mask = 0xffffffff;
-int amdgpu_bad_page_threshold = 100;
+int amdgpu_bad_page_threshold = -1;
+struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
+       .timeout_fatal_disable = false, /* fatal error on watchdog timeout stays enabled by default */
+       .period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */
+};
  
  /**
   * DOC: vramlimit (int)
@@ -529,6 +540,20 @@ module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
  MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1");
  module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444);
  
+/**
+ * DOC: timeout_fatal_disable (bool)
+ * Disable Watchdog timeout fatal error event
+ */
+MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)");
+module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644);
+
+/**
+ * DOC: timeout_period (uint)
+ * Modify the watchdog timeout max_cycles as (1 << period)
+ */
+MODULE_PARM_DESC(timeout_period, "watchdog timeout period (1 to 0x23(default), timeout maxCycles = (1 << period)");
+module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
+
  /**
   * DOC: si_support (int)
   * Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled,
@@ -750,6 +775,13 @@ bool no_system_mem_limit;
  module_param(no_system_mem_limit, bool, 0644);
  MODULE_PARM_DESC(no_system_mem_limit, "disable system memory limit (false = default)");
  
+/**
+ * DOC: no_queue_eviction_on_vm_fault (int)
+ * If set, process queues will not be evicted on gpuvm fault. This is to keep the wavefront context for debugging (0 = queue eviction, 1 = no queue eviction). The default is 0 (queue eviction).
+ */
+int amdgpu_no_queue_eviction_on_vm_fault = 0;
+MODULE_PARM_DESC(no_queue_eviction_on_vm_fault, "No queue eviction on VM fault (0 = queue eviction, 1 = no queue eviction)");
+module_param_named(no_queue_eviction_on_vm_fault, amdgpu_no_queue_eviction_on_vm_fault, int, 0444);
  #endif
  
  /**
@@ -783,6 +815,10 @@ uint amdgpu_dm_abm_level;
  MODULE_PARM_DESC(abmlevel, "ABM level (0 = off (default), 1-4 = backlight reduction level) ");
  module_param_named(abmlevel, amdgpu_dm_abm_level, uint, 0444);
  
+int amdgpu_backlight = -1;
+MODULE_PARM_DESC(backlight, "Backlight control (0 = pwm, 1 = aux, -1 auto (default))");
+module_param_named(backlight, amdgpu_backlight, bint, 0444);
+
  /**
   * DOC: tmz (int)
   * Trusted Memory Zone (TMZ) is a method to protect data being written
@@ -818,12 +854,21 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444);
   * faulty pages by ECC exceed threshold value and leave it for user's further
   * check.
   */
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
  module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
  
  MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
  module_param_named(num_kcq, amdgpu_num_kcq, int, 0444);
  
+/**
+ * DOC: smu_pptable_id (int)
+ * Used to override pptable id. id = 0 use VBIOS pptable.
+ * id > 0 use the soft pptable with specified id.
+ */
+MODULE_PARM_DESC(smu_pptable_id,
+       "specify pptable id to be used (-1 = auto(default) value, 0 = use pptable from vbios, > 0 = soft pptable id)");
+module_param_named(smu_pptable_id, amdgpu_smu_pptable_id, int, 0444);
+
  static const struct pci_device_id pciidlist[] = {
  #ifdef  CONFIG_DRM_AMDGPU_SI
         {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
@@ -1133,6 +1178,11 @@ static const struct pci_device_id pciidlist[] = {
         {0x1002, 0x73E2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
         {0x1002, 0x73FF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
  
+       /* Aldebaran */
+       {0x1002, 0x7408, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
+       {0x1002, 0x740C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
+       {0x1002, 0x740F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
+
         {0, 0, 0}
  };
  
@@ -1289,6 +1339,69 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
         adev->mp1_state = PP_MP1_STATE_NONE;
  }
  
+/**
+ * amdgpu_drv_delayed_reset_work_handler - work handler for reset
+ *
+ * @work: work_struct (not referenced in the body; the handler operates on
+ *        the global mgpu_info).
+ *
+ * Handler bound to mgpu_info.delayed_reset_work. The first invocation sets
+ * mgpu_info.pending_reset under mgpu_info.mutex; an invocation that finds
+ * the flag already set returns immediately. This function itself never
+ * clears that flag.
+ *
+ * Sequence, over the mgpu_info.num_dgpu entries of mgpu_info.gpu_ins[]:
+ *
+ * 1. Call amdgpu_device_pre_asic_reset() on each device (a failure is
+ *    logged but does not abort the sequence) and queue the device's
+ *    xgmi_reset_work on system_unbound_wq.
+ * 2. flush_work() each xgmi_reset_work, then clear the device's
+ *    gmc.xgmi.pending_reset.
+ * 3. Call amdgpu_xgmi_remove_device() on each device, link all devices
+ *    into a local list through their reset_list member and unregister
+ *    each GPU instance.
+ * 4. Call amdgpu_do_asic_reset() on that list; on error, log and return
+ *    without performing step 5.
+ * 5. For each device, call amdgpu_amdkfd_device_init() unless
+ *    kfd.init_complete is already set, then call
+ *    amdgpu_ttm_set_buffer_funcs_status(adev, true).
+ */
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
+{
+       struct list_head device_list;
+       struct amdgpu_device *adev;
+       int i, r;
+       bool need_full_reset = true; /* passed by pointer to the pre-reset and reset calls below */
+
+       mutex_lock(&mgpu_info.mutex);
+       if (mgpu_info.pending_reset == true) {
+               mutex_unlock(&mgpu_info.mutex);
+               return;
+       }
+       mgpu_info.pending_reset = true;
+       mutex_unlock(&mgpu_info.mutex);
+
+       for (i = 0; i < mgpu_info.num_dgpu; i++) {
+               adev = mgpu_info.gpu_ins[i].adev;
+               r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
+               if (r) {
+                       dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
+                               r, adev_to_drm(adev)->unique);
+               }
+               if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
+                       r = -EALREADY; /* NOTE(review): dead store - r is reassigned before it is read again */
+       }
+       for (i = 0; i < mgpu_info.num_dgpu; i++) {
+               adev = mgpu_info.gpu_ins[i].adev;
+               flush_work(&adev->xgmi_reset_work);
+               adev->gmc.xgmi.pending_reset = false;
+       }
+
+       /* reset function will rebuild the xgmi hive info, clear it now */
+       for (i = 0; i < mgpu_info.num_dgpu; i++)
+               amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
+
+       INIT_LIST_HEAD(&device_list);
+
+       for (i = 0; i < mgpu_info.num_dgpu; i++)
+               list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list, &device_list);
+
+       /* unregister the GPU first, reset function will add them back */
+       list_for_each_entry(adev, &device_list, reset_list)
+               amdgpu_unregister_gpu_instance(adev);
+
+       r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
+       if (r) {
+               DRM_ERROR("reinit gpus failure");
+               return;
+       }
+       for (i = 0; i < mgpu_info.num_dgpu; i++) {
+               adev = mgpu_info.gpu_ins[i].adev;
+               if (!adev->kfd.init_complete)
+                       amdgpu_amdkfd_device_init(adev);
+               amdgpu_ttm_set_buffer_funcs_status(adev, true);
+       }
+       return;
+}
+
  static int amdgpu_pmops_suspend(struct device *dev)
  {
         struct drm_device *drm_dev = dev_get_drvdata(dev);