#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
+#include <linux/pm_runtime.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
* "disable" requires only the block.
* "enable" requires the block and error type.
* "inject" requires the block, error type, address, and value.
+ *
* The block is one of: umc, sdma, gfx, etc.
* see ras_block_string[] for details
+ *
* The error type is one of: ue, ce, where,
* ue is multi-uncorrectable
* ce is single-correctable
+ *
* The sub-block is the sub-block index, pass 0 if there is no sub-block.
* The address and value are hexadecimal numbers, leading 0x is optional.
*
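+ * Illustrative usage (the debugfs path below is an assumption; adjust it to
+ * the actual DRI device index on the system):
+ *
+ *   echo "disable umc" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *   echo "enable umc ce" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *   echo "inject umc ue 0 0x0 0x0" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl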
}
/* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- bool is_ce)
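+/* If non-NULL, @ce_count and @ue_count are set to the total correctable and
+ * uncorrectable error counts summed over all registered RAS blocks; they are
+ * left untouched if RAS is disabled or a block query fails.
+ */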
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- struct ras_err_data data = {0, 0};
+ unsigned long ce, ue;
if (!adev->ras_enabled || !con)
- return 0;
+ return;
+ ce = 0;
+ ue = 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
.head = obj->head,
};
if (amdgpu_ras_query_error_status(adev, &info))
- return 0;
+ return;
- data.ce_count += info.ce_count;
- data.ue_count += info.ue_count;
+ ce += info.ce_count;
+ ue += info.ue_count;
}
- return is_ce ? data.ce_count : data.ue_count;
+ if (ce_count)
+ *ce_count = ce;
+
+ if (ue_count)
+ *ue_count = ue;
}
/* query/inject/cure end */
adev->ras_hw_enabled & amdgpu_ras_mask;
}
+static void amdgpu_ras_counte_dw(struct work_struct *work)
+{
+ struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+ ras_counte_delay_work.work);
+ struct amdgpu_device *adev = con->adev;
+ struct drm_device *dev = adev_to_drm(adev);
+ unsigned long ce_count, ue_count;
+ int res;
+
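+ /* Note: pm_runtime_get_sync() raises the device usage count even when it
+  * fails, so the error path still drops the reference through
+  * pm_runtime_put_autosuspend() at the Out label.
+  */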
+ res = pm_runtime_get_sync(dev->dev);
+ if (res < 0)
+ goto Out;
+
+ /* Cache new values.
+ */
+ amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
+
+ pm_runtime_mark_last_busy(dev->dev);
+Out:
+ pm_runtime_put_autosuspend(dev->dev);
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
if (!con)
return -ENOMEM;
+ con->adev = adev;
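+ /* amdgpu_ras_counte_dw() refreshes the cached CE/UE counts below by
+  * re-querying the error status of every RAS-managed block.
+  */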
+ INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
+ atomic_set(&con->ras_ce_count, 0);
+ atomic_set(&con->ras_ue_count, 0);
+
con->objs = (struct ras_manager *)(con + 1);
amdgpu_ras_set_context(adev, con);
return r;
}
-static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
{
if (adev->gmc.xgmi.connected_to_cpu)
return 1;
struct ras_fs_if *fs_info,
struct ras_ih_if *ih_info)
{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ unsigned long ue_count, ce_count;
int r;
/* disable RAS feature per IP block if it is not supported */
if (r)
goto sysfs;
+ /* These are the cached values at init.
+ */
+ amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
+
return 0;
cleanup:
amdgpu_ras_sysfs_remove(adev, ras_block);
if (!adev->ras_enabled || !con)
return 0;
+
/* Need to disable RAS on all IPs here before ip [hw/sw]fini */
amdgpu_ras_disable_all_features(adev, 0);
amdgpu_ras_recovery_fini(adev);
if (con->features)
amdgpu_ras_disable_all_features(adev, 1);
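+ /* Stop the cached-count refresh work before the RAS context is freed. */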
+ cancel_delayed_work_sync(&con->ras_counte_delay_work);
+
amdgpu_ras_set_context(adev, NULL);
kfree(con);