return false;
}
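+/* Mark the VRAM page containing @address as bad and, when bad page
+ * retirement is enabled, persist the record to the bad page list and the
+ * RAS EEPROM. Backs the debugfs "retire_page" command; for testing only.
+ */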
+static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
+{
+ struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct eeprom_table_record err_rec;
+
+ if ((address >= adev->gmc.mc_vram_size) ||
+ (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+ dev_warn(adev->dev,
+ "RAS WARN: input address 0x%llx is invalid.\n",
+ address);
+ return -EINVAL;
+ }
+
+ if (amdgpu_ras_check_bad_page(adev, address)) {
+ dev_warn(adev->dev,
+ "RAS WARN: 0x%llx has been marked as bad page!\n",
+ address);
+ return 0;
+ }
+
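+ /* build a single eeprom record describing the retired page */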
+ memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+ err_rec.address = address;
+ err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
+ err_rec.ts = (uint64_t)ktime_get_real_seconds();
+ err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+
+ err_data.err_addr = &err_rec;
+ err_data.err_addr_cnt = 1;
+
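+ /* add the record to the bad page list and save it to the RAS EEPROM,
+ * unless bad page retirement is disabled (amdgpu_bad_page_threshold == 0)
+ */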
+ if (amdgpu_bad_page_threshold != 0) {
+ amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+ err_data.err_addr_cnt);
+ amdgpu_ras_save_bad_pages(adev);
+ }
+
+ dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
+ dev_warn(adev->dev, "Clear EEPROM:\n");
+ dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
+
+ return 0;
+}
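+
+/* Usage sketch (assuming DRM card index 0; adjust to the actual index):
+ *   echo "retire_page 0x1000" > /sys/kernel/debug/dri/0/ras/ras_ctrl
+ * The ras_ctrl parser below accepts the address in hex (0x-prefixed) or
+ * decimal form.
+ */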
+
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
{
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
+ else if (strstr(str, "retire_page") != NULL)
+ op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;
if (op != -1) {
+
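+ /* "retire_page <address>" accepts the address in hex (0x-prefixed)
+ * or decimal form
+ */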
+ if (op == 3) {
+ if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
+ sscanf(str, "%*s %llu", &address) != 1)
+ return -EINVAL;
+
+ data->op = op;
+ data->inject.address = address;
+
+ return 0;
+ }
+
if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
return -EINVAL;
if (ret)
return -EINVAL;
+ if (data.op == 3) {
+ ret = amdgpu_reserve_page_direct(adev, data.inject.address);
+
+ if (!ret)
+ return size;
+ else
+ return ret;
+ }
+
if (!amdgpu_ras_is_supported(adev, data.head.block))
return -EINVAL;
};
if (!amdgpu_ras_get_error_query_ready(obj->adev))
- return snprintf(buf, PAGE_SIZE,
- "Query currently inaccessible\n");
+ return sysfs_emit(buf, "Query currently inaccessible\n");
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
- return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
- "ue", info.ue_count,
- "ce", info.ce_count);
+ return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
+ "ce", info.ce_count);
}
/* obj begin */
static inline void put_obj(struct ras_manager *obj)
{
- if (obj && --obj->use == 0)
+ if (obj && (--obj->use == 0))
list_del(&obj->node);
- if (obj && obj->use < 0) {
- DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
- }
+ if (obj && (obj->use < 0))
+ DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
}
/* make one obj and return it. */
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!con)
+ if (!adev->ras_features || !con)
return NULL;
if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
struct ras_manager *obj;
int i;
- if (!con)
+ if (!adev->ras_features || !con)
return NULL;
if (head) {
con->features |= BIT(head->block);
} else {
if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
- con->features &= ~BIT(head->block);
+ /* skip clearing the gfx ras context feature for VEGA20 Gaming;
+ * it will be cleaned up later in amdgpu_release_ras_context().
+ */
+ if (!(!adev->ras_features && (con->features & BIT(AMDGPU_RAS_BLOCK__GFX))))
+ con->features &= ~BIT(head->block);
put_obj(obj);
}
}
if (ret)
return ret;
+ /* the gfx block ras disable cmd must be sent to the ras-ta */
+ if (head->block == AMDGPU_RAS_BLOCK__GFX)
+ con->features |= BIT(head->block);
+
ret = amdgpu_ras_feature_enable(adev, head, 0);
}
} else
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__UMC:
- if (adev->umc.funcs->query_ras_error_count)
- adev->umc.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_count)
+ adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
/* umc query_ras_error_address is also responsible for clearing
* error status
*/
- if (adev->umc.funcs->query_ras_error_address)
- adev->umc.funcs->query_ras_error_address(adev, &err_data);
+ if (adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_error_address)
+ adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->query_ras_error_count) {
}
break;
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->query_ras_error_count)
- adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_count)
+ adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
- if (adev->gfx.funcs->query_ras_error_status)
- adev->gfx.funcs->query_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_status)
+ adev->gfx.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->query_ras_error_count)
- adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_count)
+ adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
- if (adev->mmhub.funcs->query_ras_error_status)
- adev->mmhub.funcs->query_ras_error_status(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_status)
+ adev->mmhub.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__PCIE_BIF:
- if (adev->nbio.funcs->query_ras_error_count)
- adev->nbio.funcs->query_ras_error_count(adev, &err_data);
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->query_ras_error_count)
+ adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
break;
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
- amdgpu_xgmi_query_ras_error_count(adev, &err_data);
+ if (adev->gmc.xgmi.ras_funcs &&
+ adev->gmc.xgmi.ras_funcs->query_ras_error_count)
+ adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
break;
default:
break;
switch (block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->reset_ras_error_count)
- adev->gfx.funcs->reset_ras_error_count(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->reset_ras_error_count)
+ adev->gfx.ras_funcs->reset_ras_error_count(adev);
- if (adev->gfx.funcs->reset_ras_error_status)
- adev->gfx.funcs->reset_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->reset_ras_error_status)
+ adev->gfx.ras_funcs->reset_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->reset_ras_error_count)
- adev->mmhub.funcs->reset_ras_error_count(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->reset_ras_error_count)
+ adev->mmhub.ras_funcs->reset_ras_error_count(adev);
break;
case AMDGPU_RAS_BLOCK__SDMA:
if (adev->sdma.funcs->reset_ras_error_count)
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->ras_error_inject)
- ret = adev->gfx.funcs->ras_error_inject(adev, info);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->ras_error_inject)
+ ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
else
ret = -EINVAL;
break;
case AMDGPU_RAS_BLOCK__UMC:
+ case AMDGPU_RAS_BLOCK__SDMA:
case AMDGPU_RAS_BLOCK__MMHUB:
case AMDGPU_RAS_BLOCK__PCIE_BIF:
ret = psp_ras_trigger_error(&adev->psp, &block_info);
struct ras_manager *obj;
struct ras_err_data data = {0, 0};
- if (!con)
+ if (!adev->ras_features || !con)
return 0;
list_for_each_entry(obj, &con->head, node) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!con)
+ if (!adev->ras_features || !con)
return;
list_for_each_entry(obj, &con->head, node) {
*/
switch (info->head.block) {
case AMDGPU_RAS_BLOCK__GFX:
- if (adev->gfx.funcs->query_ras_error_status)
- adev->gfx.funcs->query_ras_error_status(adev);
+ if (adev->gfx.ras_funcs &&
+ adev->gfx.ras_funcs->query_ras_error_status)
+ adev->gfx.ras_funcs->query_ras_error_status(adev);
break;
case AMDGPU_RAS_BLOCK__MMHUB:
- if (adev->mmhub.funcs->query_ras_error_status)
- adev->mmhub.funcs->query_ras_error_status(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->query_ras_error_status)
+ adev->mmhub.ras_funcs->query_ras_error_status(adev);
break;
default:
break;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- if (!con)
+ if (!adev->ras_features || !con)
return;
list_for_each_entry(obj, &con->head, node) {
return ret;
}
-static uint32_t
-amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+ uint32_t max_length)
{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int tmp_threshold = amdgpu_bad_page_threshold;
u64 val;
- uint32_t max_length = 0;
- max_length = amdgpu_ras_eeprom_get_record_max_length();
/*
* Justification of value bad_page_cnt_threshold in ras structure
*
tmp_threshold = max_length;
if (tmp_threshold == -1) {
- val = adev->gmc.real_vram_size;
+ val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
- tmp_threshold = min(lower_32_bits(val), max_length);
+ con->bad_page_cnt_threshold = min(lower_32_bits(val),
+ max_length);
+ } else {
+ con->bad_page_cnt_threshold = tmp_threshold;
}
-
- return tmp_threshold;
}
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
+ uint32_t max_eeprom_records_len = 0;
bool exc_err_limit = false;
int ret;
- if (con)
+ if (adev->ras_features && con)
data = &con->eh_data;
else
return 0;
atomic_set(&con->in_recovery, 0);
con->adev = adev;
- if (!con->bad_page_cnt_threshold) {
- con->bad_page_cnt_threshold =
- amdgpu_ras_calculate_badpags_threshold(adev);
-
- ret = amdgpu_vram_mgr_reserve_backup_pages(
- ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
- con->bad_page_cnt_threshold);
- if (ret)
- goto out;
- }
+ max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+ amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+ /* Todo: during testing, the SMU might fail to read the eeprom through I2C
+ * when the GPU is pending an XGMI reset during probe time
+ * (mostly after the second bus reset); skip it for now.
+ */
+ if (adev->gmc.xgmi.pending_reset)
+ return 0;
ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
/*
* This calling fails when exc_err_limit is true or
return 0;
}
-static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
+static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
{
- if (adev->asic_type != CHIP_VEGA10 &&
- adev->asic_type != CHIP_VEGA20 &&
- adev->asic_type != CHIP_ARCTURUS &&
- adev->asic_type != CHIP_SIENNA_CICHLID)
- return 1;
- else
- return 0;
+ return adev->asic_type == CHIP_VEGA10 ||
+ adev->asic_type == CHIP_VEGA20 ||
+ adev->asic_type == CHIP_ARCTURUS ||
+ adev->asic_type == CHIP_ALDEBARAN ||
+ adev->asic_type == CHIP_SIENNA_CICHLID;
}
/*
*supported = 0;
if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
- amdgpu_ras_check_asic_type(adev))
+ !amdgpu_ras_asic_supported(adev))
return;
- if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
- dev_info(adev->dev, "HBM ECC is active.\n");
- *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
- 1 << AMDGPU_RAS_BLOCK__DF);
- } else
- dev_info(adev->dev, "HBM ECC is not presented.\n");
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+ dev_info(adev->dev, "MEM ECC is active.\n");
+ *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ } else {
+ dev_info(adev->dev, "MEM ECC is not presented.\n");
+ }
- if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
- dev_info(adev->dev, "SRAM ECC is active.\n");
- *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
- 1 << AMDGPU_RAS_BLOCK__DF);
- } else
- dev_info(adev->dev, "SRAM ECC is not presented.\n");
+ if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+ dev_info(adev->dev, "SRAM ECC is active.\n");
+ *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+ } else {
+ dev_info(adev->dev, "SRAM ECC is not presented.\n");
+ }
+ } else {
+ /* the driver only manages RAS features for a few IP blocks
+ * when the GPU is connected to the CPU through XGMI */
+ *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX |
+ 1 << AMDGPU_RAS_BLOCK__SDMA |
+ 1 << AMDGPU_RAS_BLOCK__MMHUB);
+ }
/* hw_supported needs to be aligned with RAS block mask. */
*hw_supported &= AMDGPU_RAS_BLOCK_MASK;
amdgpu_ras_check_supported(adev, &con->hw_supported,
&con->supported);
if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
+ /* set the gfx block ras context feature for VEGA20 Gaming;
+ * the ras disable cmd will be sent to the ras ta during ras late init.
+ */
+ if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) {
+ con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
+
+ return 0;
+ }
+
r = 0;
goto release_con;
}
/* Might need get this flag from vbios. */
con->flags = RAS_DEFAULT_FLAGS;
- if (adev->nbio.funcs->init_ras_controller_interrupt) {
- r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
+ /* initialize nbio ras function ahead of any other
+ * ras functions so hardware fatal error interrupt
+ * can be enabled as early as possible */
+ switch (adev->asic_type) {
+ case CHIP_VEGA20:
+ case CHIP_ARCTURUS:
+ case CHIP_ALDEBARAN:
+ if (!adev->gmc.xgmi.connected_to_cpu)
+ adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+ break;
+ default:
+ /* nbio ras is not available */
+ break;
+ }
+
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->init_ras_controller_interrupt) {
+ r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
if (r)
goto release_con;
}
- if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
- r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
+ if (adev->nbio.ras_funcs &&
+ adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
+ r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
if (r)
goto release_con;
}
return r;
}
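+/* On ASICs where the GPU is connected to the CPU over XGMI, EDC error counts
+ * can persist across a warm reset; query and clear them during RAS late init
+ * so stale counts are not carried over.
+ */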
+static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+{
+ if (adev->gmc.xgmi.connected_to_cpu)
+ return 1;
+ return 0;
+}
+
+static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block)
+{
+ struct ras_query_if info = {
+ .head = *ras_block,
+ };
+
+ if (!amdgpu_persistent_edc_harvesting_supported(adev))
+ return 0;
+
+ if (amdgpu_ras_query_error_status(adev, &info) != 0)
+ DRM_WARN("RAS init harvest failure");
+
+ if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
+ DRM_WARN("RAS init harvest reset failure");
+
+ return 0;
+}
+
/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block,
return r;
}
+ /* check for errors on warm reset for ASICs that support persistent EDC */
+ amdgpu_persistent_edc_harvesting(adev, ras_block);
+
/* in resume phase, no need to create ras fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
return 0;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj, *tmp;
- if (!con)
+ if (!adev->ras_features || !con) {
+ /* clean the ras context for VEGA20 Gaming after sending the ras disable cmd */
+ amdgpu_release_ras_context(adev);
+
return;
+ }
if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
/* Set up all other IPs which are not implemented. There is a
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!con)
+ if (!adev->ras_features || !con)
return;
amdgpu_ras_disable_all_features(adev, 0);
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!con)
+ if (!adev->ras_features || !con)
return 0;
/* Need disable ras on all IPs here before ip [hw/sw]fini */
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- if (!con)
+ if (!adev->ras_features || !con)
return 0;
amdgpu_ras_fs_fini(adev);
return false;
}
+
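+/* Release the RAS context that was kept alive on VEGA20 Gaming only so the
+ * gfx ras disable cmd could be sent to the ras ta.
+ */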
+void amdgpu_release_ras_context(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!con)
+ return;
+
+ if (!adev->ras_features && (con->features & BIT(AMDGPU_RAS_BLOCK__GFX))) {
+ con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
+ amdgpu_ras_set_context(adev, NULL);
+ kfree(con);
+ }
+}