diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 035891ec59d5f73a1d5f8dd40f2005880af5203e..dac202ae864dd224d14b7975d67dbfce1e9ca3d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
        if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;
 
+       /* Hardware counters are reset automatically after the query on Vega20 and Arcturus */
+       if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+           obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+               if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
+                       dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
+       }
+
        s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
                        "ue", info.ue_count,
                        "ce", info.ce_count);
@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
        if (amdgpu_ras_query_error_status(obj->adev, &info))
                return -EINVAL;
 
-       if (obj->adev->asic_type == CHIP_ALDEBARAN) {
+       if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+           obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
                if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
-                       DRM_WARN("Failed to reset error counter and error status");
+                       dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
        }
 
        return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
@@ -726,7 +734,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
        /* Do not enable if it is not allowed. */
        WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
 
-       if (!amdgpu_ras_intr_triggered()) {
+       /* Only enable RAS features on the host side */
+       if (!amdgpu_sriov_vf(adev) &&
+               !amdgpu_ras_intr_triggered()) {
                ret = psp_ras_enable_features(&adev->psp, info, enable);
                if (ret) {
                        dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
@@ -1025,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                }
        }
 
-       if (!amdgpu_persistent_edc_harvesting_supported(adev))
-               amdgpu_ras_reset_error_status(adev, info->head.block);
-
        return 0;
 }
 
@@ -1147,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
                if (res)
                        return res;
 
+               if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+                   adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+                       if (amdgpu_ras_reset_error_status(adev, info.head.block))
+                               dev_warn(adev->dev, "Failed to reset error counter and error status");
+               }
+
                ce += info.ce_count;
                ue += info.ue_count;
        }
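
The MP0 IP-version gate above is repeated verbatim in the debugfs read, sysfs read, and error-count paths (and again in the error-counter logging hunk below). As a sketch only, not part of this patch, the check could be factored into a helper along the following lines; the helper name is invented here, while adev->ip_versions[MP0_HWIP][0], IP_VERSION(), amdgpu_ras_reset_error_status() and dev_warn() are the kernel symbols already used in the hunks.

static bool amdgpu_ras_counters_auto_reset(struct amdgpu_device *adev)
{
	/* Vega20 (MP0 11.0.2) and Arcturus (MP0 11.0.4) clear their hardware
	 * counters automatically after a query, so no explicit reset is needed.
	 */
	return adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 2) ||
	       adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 4);
}

/* Each call site above would then reduce to: */
if (!amdgpu_ras_counters_auto_reset(adev) &&
    amdgpu_ras_reset_error_status(adev, info.head.block))
	dev_warn(adev->dev, "Failed to reset error counter and error status");
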
@@ -1523,7 +1536,9 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
  */
 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
 {
-       if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
+       /* Fatal error events are handled on the host side */
+       if (amdgpu_sriov_vf(adev) ||
+               !amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
                return;
 
        if (adev->nbio.ras &&
@@ -1788,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
                        continue;
 
                amdgpu_ras_query_error_status(adev, &info);
+
+               if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+                   adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+                       if (amdgpu_ras_reset_error_status(adev, info.head.block))
+                               dev_warn(adev->dev, "Failed to reset error counter and error status");
+               }
        }
 }
 
@@ -2270,10 +2291,15 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 {
        adev->ras_hw_enabled = adev->ras_enabled = 0;
 
-       if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
+       if (!adev->is_atom_fw ||
            !amdgpu_ras_asic_supported(adev))
                return;
 
+       /* When running as an SR-IOV guest, only enable RAS for Aldebaran */
+       if (amdgpu_sriov_vf(adev) &&
+               adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2))
+               return;
+
        if (!adev->gmc.xgmi.connected_to_cpu) {
                if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
                        dev_info(adev->dev, "MEM ECC is active.\n");
@@ -2285,15 +2311,21 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 
                if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
                        dev_info(adev->dev, "SRAM ECC is active.\n");
-                       adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
-                                                   1 << AMDGPU_RAS_BLOCK__DF);
-
-                       if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
-                               adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
-                                               1 << AMDGPU_RAS_BLOCK__JPEG);
-                       else
-                               adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
-                                               1 << AMDGPU_RAS_BLOCK__JPEG);
+                       if (!amdgpu_sriov_vf(adev)) {
+                               adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+                                                           1 << AMDGPU_RAS_BLOCK__DF);
+
+                               if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
+                                       adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+                                                       1 << AMDGPU_RAS_BLOCK__JPEG);
+                               else
+                                       adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+                                                       1 << AMDGPU_RAS_BLOCK__JPEG);
+                       } else {
+                               adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
+                                                               1 << AMDGPU_RAS_BLOCK__SDMA |
+                                                               1 << AMDGPU_RAS_BLOCK__GFX);
+                       }
                } else {
                        dev_info(adev->dev, "SRAM ECC is not presented.\n");
                }
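
For the SRAM ECC branch above, the following is a minimal standalone sketch of the resulting ras_hw_enabled masks on bare metal versus an SR-IOV guest. The block bit positions are placeholders chosen for illustration and do not necessarily match the AMDGPU_RAS_BLOCK__* enum values in amdgpu_ras.h.

#include <stdio.h>

/* Placeholder bit positions for illustration only; the real values come from
 * the AMDGPU_RAS_BLOCK__* enum in amdgpu_ras.h and may differ. */
enum { BLK_UMC = 0, BLK_SDMA = 1, BLK_GFX = 2, BLK_PCIE_BIF = 5, BLK_DF = 8 };

int main(void)
{
	unsigned int bare_metal = 0, sriov_guest = 0;

	/* Bare metal: advertise every block except UMC and DF. */
	bare_metal |= ~(1u << BLK_UMC | 1u << BLK_DF);

	/* SR-IOV guest: advertise only PCIE_BIF, SDMA and GFX. */
	sriov_guest |= 1u << BLK_PCIE_BIF | 1u << BLK_SDMA | 1u << BLK_GFX;

	printf("bare-metal mask:   0x%08x\n", bare_metal);  /* 0xfffffefe */
	printf("SR-IOV guest mask: 0x%08x\n", sriov_guest); /* 0x00000026 */
	return 0;
}
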
@@ -2637,6 +2669,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
        struct amdgpu_ras_block_object *obj;
        int r;
 
+       /* The guest side does not need to initialize RAS features */
+       if (amdgpu_sriov_vf(adev))
+               return 0;
+
        list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
                if (!node->ras_obj) {
                        dev_warn(adev->dev, "Warning: abnormal ras list node.\n");