Git Repo - linux.git/commitdiff
drm/amdgpu: refine poison consumption interrupt handler
author: YiPeng Chai <[email protected]>
Mon, 24 Jun 2024 03:33:19 +0000 (11:33 +0800)
committer: Alex Deucher <[email protected]>
Thu, 27 Jun 2024 21:32:06 +0000 (17:32 -0400)
1. The poison fifo is only used for poison consumption
   requests.
2. Merge reset requests when the poison fifo caches multiple
   poison consumption messages.

Signed-off-by: YiPeng Chai <[email protected]>
Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

index 6e7c4f1f86da536928ab975d65d6fdf25c2f6f11..d3247533d15e34c0e30bc4792302caa480095505 100644 (file)
@@ -2911,23 +2911,41 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
 }
 
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
-                       struct ras_poison_msg *poison_msg)
+                       uint32_t msg_count, uint32_t *gpu_reset)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       uint32_t reset = poison_msg->reset;
-       uint16_t pasid = poison_msg->pasid;
+       uint32_t reset_flags = 0, reset = 0;
+       struct ras_poison_msg msg;
+       int ret, i;
 
        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 
-       if (poison_msg->pasid_fn)
-               poison_msg->pasid_fn(adev, pasid, poison_msg->data);
+       for (i = 0; i < msg_count; i++) {
+               ret = amdgpu_ras_get_poison_req(adev, &msg);
+               if (!ret)
+                       continue;
+
+               if (msg.pasid_fn)
+                       msg.pasid_fn(adev, msg.pasid, msg.data);
+
+               reset_flags |= msg.reset;
+       }
 
        /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
-       if (reset && !con->is_rma) {
+       if (reset_flags && !con->is_rma) {
+               if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
+                       reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+               else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
+                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               else
+                       reset = reset_flags;
+
                flush_delayed_work(&con->page_retirement_dwork);
 
                con->gpu_reset_flags |= reset;
                amdgpu_ras_reset_gpu(adev);
+
+               *gpu_reset = reset;
        }
 
        return 0;
@@ -2937,10 +2955,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)param;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       uint32_t poison_creation_count;
+       uint32_t poison_creation_count, msg_count;
+       uint32_t gpu_reset;
        int ret;
-       struct ras_poison_msg poison_msg;
-       enum amdgpu_ras_block ras_block;
 
        while (!kthread_should_stop()) {
 
@@ -2951,6 +2968,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                if (kthread_should_stop())
                        break;
 
+               gpu_reset = 0;
 
                do {
                        poison_creation_count = atomic_read(&con->poison_creation_count);
@@ -2964,15 +2982,16 @@ static int amdgpu_ras_page_retirement_thread(void *param)
                        }
                } while (atomic_read(&con->poison_creation_count));
 
-               if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
-                       continue;
-
-               ras_block = poison_msg.block;
-
-               dev_dbg(adev->dev, "Start processing ras block %s(%d)\n",
-                               ras_block_str(ras_block), ras_block);
-
-                       amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
+               if (ret != -EIO) {
+                       msg_count = kfifo_len(&con->poison_fifo);
+                       if (msg_count) {
+                               ret = amdgpu_ras_poison_consumption_handler(adev,
+                                               msg_count, &gpu_reset);
+                               if ((ret != -EIO) &&
+                                   (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET))
+                                       atomic_sub(msg_count, &con->page_retirement_req_cnt);
+                       }
+               }
        }
 
        return 0;
index 20e0e522fb51de90b8d4ac0d3475f2b029170ffd..2f84bdb8c594d59a15151781dda6cfde69e15382 100644 (file)
@@ -293,14 +293,15 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
 
                        amdgpu_ras_error_data_fini(&err_data);
                } else {
-                               struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
-                               amdgpu_ras_put_poison_req(adev,
-                                       block, pasid, pasid_fn, data, reset);
+                       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+                       int ret;
 
+                       ret = amdgpu_ras_put_poison_req(adev,
+                               block, pasid, pasid_fn, data, reset);
+                       if (!ret) {
                                atomic_inc(&con->page_retirement_req_cnt);
-
                                wake_up(&con->page_retirement_wq);
+                       }
                }
        } else {
                if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
This page took 0.067445 seconds and 4 git commands to generate.