#include "amdgpu_amdkfd.h"
#include "amdgpu_ras.h"
+#include "amdgpu_xgmi.h"
/*
* KMS wrapper.
int amdgpu_reset_method = -1; /* auto */
int amdgpu_num_kcq = -1;
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
+
struct amdgpu_mgpu_info mgpu_info = {
.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
+ .delayed_reset_work = __DELAYED_WORK_INITIALIZER(
+ mgpu_info.delayed_reset_work,
+ amdgpu_drv_delayed_reset_work_handler, 0),
};
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
-int amdgpu_bad_page_threshold = 100;
+int amdgpu_bad_page_threshold = -1;
struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
.timeout_fatal_disable = false,
- .period = 0x3f, /* about 8s */
+ .period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */
};
/**
* DOC: timeout_period (uint)
* Modify the watchdog timeout max_cycles as (1 << period)
*/
-MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x1F = default), timeout maxCycles = (1 << period)");
+MODULE_PARM_DESC(timeout_period, "watchdog timeout period (1 to 0x23(default), timeout maxCycles = (1 << period)");
module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
/**
* faulty pages by ECC exceed threshold value and leave it for user's further
* check.
*/
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
{0x1002, 0x73FF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_DIMGREY_CAVEFISH},
/* Aldebaran */
- {0x1002, 0x7408, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN},
- {0x1002, 0x740C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN},
- {0x1002, 0x740F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN},
+ {0x1002, 0x7408, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
+ {0x1002, 0x740C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
+ {0x1002, 0x740F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN|AMD_EXP_HW_SUPPORT},
{0, 0, 0}
};
adev->mp1_state = PP_MP1_STATE_NONE;
}
+/**
+ * amdgpu_drv_delayed_reset_work_handler - work handler for reset
+ *
+ * @work: work_struct.
+ */
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
+{
+ struct list_head device_list;
+ struct amdgpu_device *adev;
+ int i, r;
+ bool need_full_reset = true;
+
+ mutex_lock(&mgpu_info.mutex);
+ if (mgpu_info.pending_reset == true) {
+ mutex_unlock(&mgpu_info.mutex);
+ return;
+ }
+ mgpu_info.pending_reset = true;
+ mutex_unlock(&mgpu_info.mutex);
+
+ for (i = 0; i < mgpu_info.num_dgpu; i++) {
+ adev = mgpu_info.gpu_ins[i].adev;
+ r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
+ if (r) {
+ dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
+ r, adev_to_drm(adev)->unique);
+ }
+ if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
+ r = -EALREADY;
+ }
+ for (i = 0; i < mgpu_info.num_dgpu; i++) {
+ adev = mgpu_info.gpu_ins[i].adev;
+ flush_work(&adev->xgmi_reset_work);
+ adev->gmc.xgmi.pending_reset = false;
+ }
+
+ /* reset function will rebuild the xgmi hive info , clear it now */
+ for (i = 0; i < mgpu_info.num_dgpu; i++)
+ amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
+
+ INIT_LIST_HEAD(&device_list);
+
+ for (i = 0; i < mgpu_info.num_dgpu; i++)
+ list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list, &device_list);
+
+ /* unregister the GPU first, reset function will add them back */
+ list_for_each_entry(adev, &device_list, reset_list)
+ amdgpu_unregister_gpu_instance(adev);
+
+ r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
+ if (r) {
+ DRM_ERROR("reinit gpus failure");
+ return;
+ }
+ for (i = 0; i < mgpu_info.num_dgpu; i++) {
+ adev = mgpu_info.gpu_ins[i].adev;
+ if (!adev->kfd.init_complete)
+ amdgpu_amdkfd_device_init(adev);
+ amdgpu_ttm_set_buffer_funcs_status(adev, true);
+ }
+ return;
+}
+
static int amdgpu_pmops_suspend(struct device *dev)
{
struct drm_device *drm_dev = dev_get_drvdata(dev);