drm/amdgpu/amdgpu: improve code indentation and alignment
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 40614ac9a1117556eb50b00809e1f35b5cc673a4..b96267068a72d0081377c3298bcb25266aea9313 100644
@@ -80,6 +80,8 @@ enum amdgpu_ras_retire_page_reservation {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+                               uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
                                uint64_t addr);
 
@@ -516,9 +518,9 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 /* obj end */
 
 static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev,
-                                 const char*           invoke_type,
-                                 const char*           block_name,
-                                 enum ta_ras_status    ret)
+                                        const char* invoke_type,
+                                        const char* block_name,
+                                        enum ta_ras_status ret)
 {
        switch (ret) {
        case TA_RAS_STATUS__SUCCESS:
@@ -607,7 +609,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
        if (!con)
                return -EINVAL;
 
-        info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
+       info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
        if (!info)
                return -ENOMEM;
 
@@ -953,7 +955,7 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
        case AMDGPU_RAS_RETIRE_PAGE_FAULT:
        default:
                return "F";
-       };
+       }
 }
 
 /**
@@ -1027,58 +1029,6 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
        return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 }
 
-static void amdgpu_ras_sysfs_add_bad_page_node(struct amdgpu_device *adev)
-{
-       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       struct attribute_group group;
-       struct bin_attribute *bin_attrs[] = {
-               &con->badpages_attr,
-               NULL,
-       };
-
-       con->badpages_attr = (struct bin_attribute) {
-               .attr = {
-                       .name = "gpu_vram_bad_pages",
-                       .mode = S_IRUGO,
-               },
-               .size = 0,
-               .private = NULL,
-               .read = amdgpu_ras_sysfs_badpages_read,
-       };
-
-       group.name = RAS_FS_NAME;
-       group.bin_attrs = bin_attrs;
-
-       sysfs_bin_attr_init(bin_attrs[0]);
-
-       sysfs_update_group(&adev->dev->kobj, &group);
-}
-
-static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
-{
-       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       struct attribute *attrs[] = {
-               &con->features_attr.attr,
-               NULL
-       };
-       struct attribute_group group = {
-               .name = RAS_FS_NAME,
-               .attrs = attrs,
-       };
-
-       con->features_attr = (struct device_attribute) {
-               .attr = {
-                       .name = "features",
-                       .mode = S_IRUGO,
-               },
-                       .show = amdgpu_ras_sysfs_features_read,
-       };
-
-       sysfs_attr_init(attrs[0]);
-
-       return sysfs_create_group(&adev->dev->kobj, &group);
-}
-
 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1300,13 +1250,43 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 /* debugfs end */
 
 /* ras fs */
-
+static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
+               amdgpu_ras_sysfs_badpages_read, NULL, 0);
+static DEVICE_ATTR(features, S_IRUGO,
+               amdgpu_ras_sysfs_features_read, NULL);
 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 {
-       amdgpu_ras_sysfs_create_feature_node(adev);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct attribute_group group = {
+               .name = RAS_FS_NAME,
+       };
+       struct attribute *attrs[] = {
+               &con->features_attr.attr,
+               NULL
+       };
+       struct bin_attribute *bin_attrs[] = {
+               NULL,
+               NULL,
+       };
+       int r;
 
-       if (amdgpu_bad_page_threshold != 0)
-               amdgpu_ras_sysfs_add_bad_page_node(adev);
+       /* add features entry */
+       con->features_attr = dev_attr_features;
+       group.attrs = attrs;
+       sysfs_attr_init(attrs[0]);
+
+       if (amdgpu_bad_page_threshold != 0) {
+               /* add bad_page_features entry */
+               bin_attr_gpu_vram_bad_pages.private = NULL;
+               con->badpages_attr = bin_attr_gpu_vram_bad_pages;
+               bin_attrs[0] = &con->badpages_attr;
+               group.bin_attrs = bin_attrs;
+               sysfs_bin_attr_init(bin_attrs[0]);
+       }
+
+       r = sysfs_create_group(&adev->dev->kobj, &group);
+       if (r)
+               dev_err(adev->dev, "Failed to create RAS sysfs group!");
 
        return 0;
 }
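
As an aside, here is a minimal, self-contained sketch (not part of the patch) of the single-group sysfs pattern the reworked amdgpu_ras_fs_init() follows: declare static attribute templates with DEVICE_ATTR()/BIN_ATTR(), gather them into one named attribute_group, and publish everything with a single sysfs_create_group() call. All ras_demo names below are hypothetical.

/*
 * Illustrative sketch only -- the ras_demo names are made up for this
 * example and do not exist in the driver.
 */
#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t ras_demo_features_show(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	/* report a dummy feature mask */
	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", 0);
}

/* creates the dev_attr_features template used below */
static DEVICE_ATTR(features, S_IRUGO, ras_demo_features_show, NULL);

static struct attribute *ras_demo_attrs[] = {
	&dev_attr_features.attr,
	NULL,
};

static const struct attribute_group ras_demo_group = {
	.name = "ras_demo",	/* nodes appear under <device>/ras_demo/ */
	.attrs = ras_demo_attrs,
};

static int ras_demo_fs_init(struct device *dev)
{
	/* one call creates every node that belongs to the group */
	return sysfs_create_group(&dev->kobj, &ras_demo_group);
}

The patch itself copies the static templates into per-device fields (con->features_attr, con->badpages_attr) before registering, so each GPU instance carries its own attribute objects rather than sharing the global ones.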
@@ -1573,10 +1553,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
                        .size = AMDGPU_GPU_PAGE_SIZE,
                        .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
                };
-
-               if (data->last_reserved <= i)
+               ret = amdgpu_vram_mgr_query_page_status(
+                               ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+                               data->bps[i].retired_page);
+               if (ret == -EBUSY)
                        (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
-               else if (data->bps_bo[i] == NULL)
+               else if (ret == -ENOENT)
                        (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
        }
 
@@ -1628,12 +1610,9 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
        unsigned int new_space = old_space + pages;
        unsigned int align_space = ALIGN(new_space, 512);
        void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-       struct amdgpu_bo **bps_bo =
-                       kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
 
-       if (!bps || !bps_bo) {
+       if (!bps) {
                kfree(bps);
-               kfree(bps_bo);
                return -ENOMEM;
        }
 
@@ -1642,14 +1621,8 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
                                data->count * sizeof(*data->bps));
                kfree(data->bps);
        }
-       if (data->bps_bo) {
-               memcpy(bps_bo, data->bps_bo,
-                               data->count * sizeof(*data->bps_bo));
-               kfree(data->bps_bo);
-       }
 
        data->bps = bps;
-       data->bps_bo = bps_bo;
        data->space_left += align_space - old_space;
        return 0;
 }
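
For reference, a hedged sketch (assumed demo names, not from the patch) of the aligned-growth scheme amdgpu_ras_realloc_eh_data_space() keeps using once the bps_bo array is gone: round the requested capacity up to a 512-entry boundary, allocate, copy the existing records over, and credit the extra headroom to space_left.

/* Illustrative only -- demo_records/demo_grow are hypothetical names. */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>

struct demo_records {
	u64 *recs;		 /* retired-page records */
	unsigned int count;	 /* records in use */
	unsigned int space_left; /* free slots already allocated */
};

static int demo_grow(struct demo_records *d, unsigned int extra)
{
	unsigned int old_space = d->count + d->space_left;
	unsigned int align_space = ALIGN(old_space + extra, 512);
	u64 *recs = kmalloc_array(align_space, sizeof(*recs), GFP_KERNEL);

	if (!recs)
		return -ENOMEM;

	if (d->recs) {
		/* preserve the records collected so far */
		memcpy(recs, d->recs, d->count * sizeof(*recs));
		kfree(d->recs);
	}

	d->recs = recs;
	d->space_left += align_space - old_space;

	return 0;
}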
@@ -1661,6 +1634,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
        int ret = 0;
+       uint32_t i;
 
        if (!con || !con->eh_data || !bps || pages <= 0)
                return 0;
@@ -1670,16 +1644,26 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
        if (!data)
                goto out;
 
-       if (data->space_left <= pages)
-               if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
+       for (i = 0; i < pages; i++) {
+               if (amdgpu_ras_check_bad_page_unlock(con,
+                       bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+                       continue;
+
+               if (!data->space_left &&
+                       amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
                        ret = -ENOMEM;
                        goto out;
                }
 
-       memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
-       data->count += pages;
-       data->space_left -= pages;
+               amdgpu_vram_mgr_reserve_range(
+                       ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+                       bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
+                       AMDGPU_GPU_PAGE_SIZE);
 
+               memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
+               data->count++;
+               data->space_left--;
+       }
 out:
        mutex_unlock(&con->recovery_lock);
 
@@ -1690,7 +1674,7 @@ out:
  * write error record array to eeprom, the function should be
  * protected by recovery_lock
  */
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
@@ -1752,6 +1736,20 @@ out:
        return ret;
 }
 
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+                               uint64_t addr)
+{
+       struct ras_err_handler_data *data = con->eh_data;
+       int i;
+
+       addr >>= AMDGPU_GPU_PAGE_SHIFT;
+       for (i = 0; i < data->count; i++)
+               if (addr == data->bps[i].retired_page)
+                       return true;
+
+       return false;
+}
+
 /*
  * check if an address belongs to bad page
  *
@@ -1761,26 +1759,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
                                uint64_t addr)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       struct ras_err_handler_data *data;
-       int i;
        bool ret = false;
 
        if (!con || !con->eh_data)
                return ret;
 
        mutex_lock(&con->recovery_lock);
-       data = con->eh_data;
-       if (!data)
-               goto out;
-
-       addr >>= AMDGPU_GPU_PAGE_SHIFT;
-       for (i = 0; i < data->count; i++)
-               if (addr == data->bps[i].retired_page) {
-                       ret = true;
-                       goto out;
-               }
-
-out:
+       ret = amdgpu_ras_check_bad_page_unlock(con, addr);
        mutex_unlock(&con->recovery_lock);
        return ret;
 }
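
The split above follows the common locked-wrapper idiom: a *_unlock helper that assumes the caller already holds the mutex, plus a public wrapper that takes the lock and delegates. A minimal sketch under assumed demo names:

/* Illustrative only -- demo_table/demo_lookup are hypothetical names. */
#include <linux/mutex.h>
#include <linux/types.h>

struct demo_table {
	struct mutex lock;
	unsigned int count;
	u64 *entries;
};

/* caller must hold tbl->lock */
static bool demo_lookup_unlock(struct demo_table *tbl, u64 key)
{
	unsigned int i;

	for (i = 0; i < tbl->count; i++)
		if (tbl->entries[i] == key)
			return true;

	return false;
}

static bool demo_lookup(struct demo_table *tbl, u64 key)
{
	bool hit;

	mutex_lock(&tbl->lock);
	hit = demo_lookup_unlock(tbl, key);
	mutex_unlock(&tbl->lock);

	return hit;
}

This lets amdgpu_ras_add_bad_pages(), which already holds recovery_lock, reuse the lookup without taking the mutex twice.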
@@ -1826,80 +1811,6 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
        }
 }
 
-/* called in gpu recovery/init */
-int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
-{
-       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       struct ras_err_handler_data *data;
-       uint64_t bp;
-       struct amdgpu_bo *bo = NULL;
-       int i, ret = 0;
-
-       /* Not reserve bad page when amdgpu_bad_page_threshold == 0. */
-       if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0))
-               return 0;
-
-       mutex_lock(&con->recovery_lock);
-       data = con->eh_data;
-       if (!data)
-               goto out;
-       /* reserve vram at driver post stage. */
-       for (i = data->last_reserved; i < data->count; i++) {
-               bp = data->bps[i].retired_page;
-
-               /* There are two cases of reserve error should be ignored:
-                * 1) a ras bad page has been allocated (used by someone);
-                * 2) a ras bad page has been reserved (duplicate error injection
-                *    for one page);
-                */
-               if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
-                                              AMDGPU_GPU_PAGE_SIZE,
-                                              AMDGPU_GEM_DOMAIN_VRAM,
-                                              &bo, NULL))
-                       dev_warn(adev->dev, "RAS WARN: reserve vram for "
-                                       "retired page %llx fail\n", bp);
-
-               data->bps_bo[i] = bo;
-               data->last_reserved = i + 1;
-               bo = NULL;
-       }
-
-       /* continue to save bad pages to eeprom even reesrve_vram fails */
-       ret = amdgpu_ras_save_bad_pages(adev);
-out:
-       mutex_unlock(&con->recovery_lock);
-       return ret;
-}
-
-/* called when driver unload */
-static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
-{
-       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       struct ras_err_handler_data *data;
-       struct amdgpu_bo *bo;
-       int i;
-
-       if (!con || !con->eh_data)
-               return 0;
-
-       mutex_lock(&con->recovery_lock);
-       data = con->eh_data;
-       if (!data)
-               goto out;
-
-       for (i = data->last_reserved - 1; i >= 0; i--) {
-               bo = data->bps_bo[i];
-
-               amdgpu_bo_free_kernel(&bo, NULL, NULL);
-
-               data->bps_bo[i] = bo;
-               data->last_reserved = i;
-       }
-out:
-       mutex_unlock(&con->recovery_lock);
-       return 0;
-}
-
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1939,18 +1850,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
                ret = amdgpu_ras_load_bad_pages(adev);
                if (ret)
                        goto free;
-               ret = amdgpu_ras_reserve_bad_pages(adev);
-               if (ret)
-                       goto release;
        }
 
        return 0;
 
-release:
-       amdgpu_ras_release_bad_pages(adev);
 free:
        kfree((*data)->bps);
-       kfree((*data)->bps_bo);
        kfree(*data);
        con->eh_data = NULL;
 out:
@@ -1978,12 +1883,10 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
                return 0;
 
        cancel_work_sync(&con->recovery_work);
-       amdgpu_ras_release_bad_pages(adev);
 
        mutex_lock(&con->recovery_lock);
        con->eh_data = NULL;
        kfree(data->bps);
-       kfree(data->bps_bo);
        kfree(data);
        mutex_unlock(&con->recovery_lock);
 
@@ -2178,7 +2081,7 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
 
        amdgpu_ras_sysfs_remove(adev, ras_block);
        if (ih_info->cb)
-                amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+               amdgpu_ras_interrupt_remove_handler(adev, ih_info);
        amdgpu_ras_feature_enable(adev, ras_block, 0);
 }
 