drivers/gpu/drm/v3d/v3d_sched.c

   1 // SPDX-License-Identifier: GPL-2.0+
   2 /* Copyright (C) 2018 Broadcom */
   3
   4 /**
   5  * DOC: Broadcom V3D scheduling
   6  *
   7  * The shared DRM GPU scheduler is used to coordinate submitting jobs
   8  * to the hardware.  Each DRM fd (roughly a client process) gets its
   9  * own scheduler entity, which will process jobs in order.  The GPU
  10  * scheduler will round-robin between clients to submit the next job.
  11  *
  12  * For simplicity, and in order to keep latency low for interactive
  13  * jobs when bulk background jobs are queued up, we submit a new job
  14  * to the HW only when it has completed the last one, instead of
  15  * filling up the CT[01]Q FIFOs with jobs.  Similarly, we use
  16  * drm_sched_job_add_dependency() to manage the dependency between bin and
  17  * render, instead of having the clients submit jobs using the HW's
  18  * semaphores to interlock between them.
  19  */
  20
  21 #include <linux/sched/clock.h>
  22 #include <linux/kthread.h>
  23
  24 #include <drm/drm_syncobj.h>
  25
  26 #include "v3d_drv.h"
  27 #include "v3d_regs.h"
  28 #include "v3d_trace.h"
  29
/*
 * Left shift applied to a workgroup count before it is stored in the CSD
 * cfg[0], cfg[1] and cfg[2] words when an indirect dispatch is rewritten.
 */
#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
  31
  32 static struct v3d_job *
  33 to_v3d_job(struct drm_sched_job *sched_job)
  34 {
  35         return container_of(sched_job, struct v3d_job, base);
  36 }
  37
  38 static struct v3d_bin_job *
  39 to_bin_job(struct drm_sched_job *sched_job)
  40 {
  41         return container_of(sched_job, struct v3d_bin_job, base.base);
  42 }
  43
  44 static struct v3d_render_job *
  45 to_render_job(struct drm_sched_job *sched_job)
  46 {
  47         return container_of(sched_job, struct v3d_render_job, base.base);
  48 }
  49
  50 static struct v3d_tfu_job *
  51 to_tfu_job(struct drm_sched_job *sched_job)
  52 {
  53         return container_of(sched_job, struct v3d_tfu_job, base.base);
  54 }
  55
  56 static struct v3d_csd_job *
  57 to_csd_job(struct drm_sched_job *sched_job)
  58 {
  59         return container_of(sched_job, struct v3d_csd_job, base.base);
  60 }
  61
  62 static struct v3d_cpu_job *
  63 to_cpu_job(struct drm_sched_job *sched_job)
  64 {
  65         return container_of(sched_job, struct v3d_cpu_job, base.base);
  66 }
  67
  68 static void
  69 v3d_sched_job_free(struct drm_sched_job *sched_job)
  70 {
  71         struct v3d_job *job = to_v3d_job(sched_job);
  72
  73         v3d_job_cleanup(job);
  74 }
  75
  76 void
  77 v3d_timestamp_query_info_free(struct v3d_timestamp_query_info *query_info,
  78                               unsigned int count)
  79 {
  80         if (query_info->queries) {
  81                 unsigned int i;
  82
  83                 for (i = 0; i < count; i++)
  84                         drm_syncobj_put(query_info->queries[i].syncobj);
  85
  86                 kvfree(query_info->queries);
  87         }
  88 }
  89
  90 void
  91 v3d_performance_query_info_free(struct v3d_performance_query_info *query_info,
  92                                 unsigned int count)
  93 {
  94         if (query_info->queries) {
  95                 unsigned int i;
  96
  97                 for (i = 0; i < count; i++) {
  98                         drm_syncobj_put(query_info->queries[i].syncobj);
  99                         kvfree(query_info->queries[i].kperfmon_ids);
 100                 }
 101
 102                 kvfree(query_info->queries);
 103         }
 104 }
 105
 106 static void
 107 v3d_cpu_job_free(struct drm_sched_job *sched_job)
 108 {
 109         struct v3d_cpu_job *job = to_cpu_job(sched_job);
 110
 111         v3d_timestamp_query_info_free(&job->timestamp_query,
 112                                       job->timestamp_query.count);
 113
 114         v3d_performance_query_info_free(&job->performance_query,
 115                                         job->performance_query.count);
 116
 117         v3d_job_cleanup(&job->base);
 118 }
 119
 120 static void
 121 v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
 122 {
 123         if (job->perfmon != v3d->active_perfmon)
 124                 v3d_perfmon_stop(v3d, v3d->active_perfmon, true);
 125
 126         if (job->perfmon && v3d->active_perfmon != job->perfmon)
 127                 v3d_perfmon_start(v3d, job->perfmon);
 128 }
 129
 130 static void
 131 v3d_job_start_stats(struct v3d_job *job, enum v3d_queue queue)
 132 {
 133         struct v3d_dev *v3d = job->v3d;
 134         struct v3d_file_priv *file = job->file->driver_priv;
 135         struct v3d_stats *global_stats = &v3d->queue[queue].stats;
 136         struct v3d_stats *local_stats = &file->stats[queue];
 137         u64 now = local_clock();
 138         unsigned long flags;
 139
 140         /*
 141          * We only need to disable local interrupts to appease lockdep who
 142          * otherwise would think v3d_job_start_stats vs v3d_stats_update has an
 143          * unsafe in-irq vs no-irq-off usage problem. This is a false positive
 144          * because all the locks are per queue and stats type, and all jobs are
 145          * completely one at a time serialised. More specifically:
 146          *
 147          * 1. Locks for GPU queues are updated from interrupt handlers under a
 148          *    spin lock and started here with preemption disabled.
 149          *
 150          * 2. Locks for CPU queues are updated from the worker with preemption
 151          *    disabled and equally started here with preemption disabled.
 152          *
 153          * Therefore both are consistent.
 154          *
 155          * 3. Because next job can only be queued after the previous one has
 156          *    been signaled, and locks are per queue, there is also no scope for
 157          *    the start part to race with the update part.
 158          */
 159         if (IS_ENABLED(CONFIG_LOCKDEP))
 160                 local_irq_save(flags);
 161         else
 162                 preempt_disable();
 163
 164         write_seqcount_begin(&local_stats->lock);
 165         local_stats->start_ns = now;
 166         write_seqcount_end(&local_stats->lock);
 167
 168         write_seqcount_begin(&global_stats->lock);
 169         global_stats->start_ns = now;
 170         write_seqcount_end(&global_stats->lock);
 171
 172         if (IS_ENABLED(CONFIG_LOCKDEP))
 173                 local_irq_restore(flags);
 174         else
 175                 preempt_enable();
 176 }
 177
 178 static void
 179 v3d_stats_update(struct v3d_stats *stats, u64 now)
 180 {
 181         write_seqcount_begin(&stats->lock);
 182         stats->enabled_ns += now - stats->start_ns;
 183         stats->jobs_completed++;
 184         stats->start_ns = 0;
 185         write_seqcount_end(&stats->lock);
 186 }
 187
 188 void
 189 v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue)
 190 {
 191         struct v3d_dev *v3d = job->v3d;
 192         struct v3d_file_priv *file = job->file->driver_priv;
 193         struct v3d_stats *global_stats = &v3d->queue[queue].stats;
 194         struct v3d_stats *local_stats = &file->stats[queue];
 195         u64 now = local_clock();
 196         unsigned long flags;
 197
 198         /* See comment in v3d_job_start_stats() */
 199         if (IS_ENABLED(CONFIG_LOCKDEP))
 200                 local_irq_save(flags);
 201         else
 202                 preempt_disable();
 203
 204         v3d_stats_update(local_stats, now);
 205         v3d_stats_update(global_stats, now);
 206
 207         if (IS_ENABLED(CONFIG_LOCKDEP))
 208                 local_irq_restore(flags);
 209         else
 210                 preempt_enable();
 211 }
 212
 213 static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
 214 {
 215         struct v3d_bin_job *job = to_bin_job(sched_job);
 216         struct v3d_dev *v3d = job->base.v3d;
 217         struct drm_device *dev = &v3d->drm;
 218         struct dma_fence *fence;
 219         unsigned long irqflags;
 220
 221         if (unlikely(job->base.base.s_fence->finished.error))
 222                 return NULL;
 223
 224         /* Lock required around bin_job update vs
 225          * v3d_overflow_mem_work().
 226          */
 227         spin_lock_irqsave(&v3d->job_lock, irqflags);
 228         v3d->bin_job = job;
 229         /* Clear out the overflow allocation, so we don't
 230          * reuse the overflow attached to a previous job.
 231          */
 232         V3D_CORE_WRITE(0, V3D_PTB_BPOS, 0);
 233         spin_unlock_irqrestore(&v3d->job_lock, irqflags);
 234
 235         v3d_invalidate_caches(v3d);
 236
 237         fence = v3d_fence_create(v3d, V3D_BIN);
 238         if (IS_ERR(fence))
 239                 return NULL;
 240
 241         if (job->base.irq_fence)
 242                 dma_fence_put(job->base.irq_fence);
 243         job->base.irq_fence = dma_fence_get(fence);
 244
 245         trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
 246                             job->start, job->end);
 247
 248         v3d_job_start_stats(&job->base, V3D_BIN);
 249         v3d_switch_perfmon(v3d, &job->base);
 250
 251         /* Set the current and end address of the control list.
 252          * Writing the end register is what starts the job.
 253          */
 254         if (job->qma) {
 255                 V3D_CORE_WRITE(0, V3D_CLE_CT0QMA, job->qma);
 256                 V3D_CORE_WRITE(0, V3D_CLE_CT0QMS, job->qms);
 257         }
 258         if (job->qts) {
 259                 V3D_CORE_WRITE(0, V3D_CLE_CT0QTS,
 260                                V3D_CLE_CT0QTS_ENABLE |
 261                                job->qts);
 262         }
 263         V3D_CORE_WRITE(0, V3D_CLE_CT0QBA, job->start);
 264         V3D_CORE_WRITE(0, V3D_CLE_CT0QEA, job->end);
 265
 266         return fence;
 267 }
 268
 269 static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
 270 {
 271         struct v3d_render_job *job = to_render_job(sched_job);
 272         struct v3d_dev *v3d = job->base.v3d;
 273         struct drm_device *dev = &v3d->drm;
 274         struct dma_fence *fence;
 275
 276         if (unlikely(job->base.base.s_fence->finished.error))
 277                 return NULL;
 278
 279         v3d->render_job = job;
 280
 281         /* Can we avoid this flush?  We need to be careful of
 282          * scheduling, though -- imagine job0 rendering to texture and
 283          * job1 reading, and them being executed as bin0, bin1,
 284          * render0, render1, so that render1's flush at bin time
 285          * wasn't enough.
 286          */
 287         v3d_invalidate_caches(v3d);
 288
 289         fence = v3d_fence_create(v3d, V3D_RENDER);
 290         if (IS_ERR(fence))
 291                 return NULL;
 292
 293         if (job->base.irq_fence)
 294                 dma_fence_put(job->base.irq_fence);
 295         job->base.irq_fence = dma_fence_get(fence);
 296
 297         trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
 298                             job->start, job->end);
 299
 300         v3d_job_start_stats(&job->base, V3D_RENDER);
 301         v3d_switch_perfmon(v3d, &job->base);
 302
 303         /* XXX: Set the QCFG */
 304
 305         /* Set the current and end address of the control list.
 306          * Writing the end register is what starts the job.
 307          */
 308         V3D_CORE_WRITE(0, V3D_CLE_CT1QBA, job->start);
 309         V3D_CORE_WRITE(0, V3D_CLE_CT1QEA, job->end);
 310
 311         return fence;
 312 }
 313
 314 static struct dma_fence *
 315 v3d_tfu_job_run(struct drm_sched_job *sched_job)
 316 {
 317         struct v3d_tfu_job *job = to_tfu_job(sched_job);
 318         struct v3d_dev *v3d = job->base.v3d;
 319         struct drm_device *dev = &v3d->drm;
 320         struct dma_fence *fence;
 321
 322         fence = v3d_fence_create(v3d, V3D_TFU);
 323         if (IS_ERR(fence))
 324                 return NULL;
 325
 326         v3d->tfu_job = job;
 327         if (job->base.irq_fence)
 328                 dma_fence_put(job->base.irq_fence);
 329         job->base.irq_fence = dma_fence_get(fence);
 330
 331         trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
 332
 333         v3d_job_start_stats(&job->base, V3D_TFU);
 334
 335         V3D_WRITE(V3D_TFU_IIA(v3d->ver), job->args.iia);
 336         V3D_WRITE(V3D_TFU_IIS(v3d->ver), job->args.iis);
 337         V3D_WRITE(V3D_TFU_ICA(v3d->ver), job->args.ica);
 338         V3D_WRITE(V3D_TFU_IUA(v3d->ver), job->args.iua);
 339         V3D_WRITE(V3D_TFU_IOA(v3d->ver), job->args.ioa);
 340         if (v3d->ver >= 71)
 341                 V3D_WRITE(V3D_V7_TFU_IOC, job->args.v71.ioc);
 342         V3D_WRITE(V3D_TFU_IOS(v3d->ver), job->args.ios);
 343         V3D_WRITE(V3D_TFU_COEF0(v3d->ver), job->args.coef[0]);
 344         if (v3d->ver >= 71 || (job->args.coef[0] & V3D_TFU_COEF0_USECOEF)) {
 345                 V3D_WRITE(V3D_TFU_COEF1(v3d->ver), job->args.coef[1]);
 346                 V3D_WRITE(V3D_TFU_COEF2(v3d->ver), job->args.coef[2]);
 347                 V3D_WRITE(V3D_TFU_COEF3(v3d->ver), job->args.coef[3]);
 348         }
 349         /* ICFG kicks off the job. */
 350         V3D_WRITE(V3D_TFU_ICFG(v3d->ver), job->args.icfg | V3D_TFU_ICFG_IOC);
 351
 352         return fence;
 353 }
 354
 355 static struct dma_fence *
 356 v3d_csd_job_run(struct drm_sched_job *sched_job)
 357 {
 358         struct v3d_csd_job *job = to_csd_job(sched_job);
 359         struct v3d_dev *v3d = job->base.v3d;
 360         struct drm_device *dev = &v3d->drm;
 361         struct dma_fence *fence;
 362         int i, csd_cfg0_reg;
 363
 364         v3d->csd_job = job;
 365
 366         v3d_invalidate_caches(v3d);
 367
 368         fence = v3d_fence_create(v3d, V3D_CSD);
 369         if (IS_ERR(fence))
 370                 return NULL;
 371
 372         if (job->base.irq_fence)
 373                 dma_fence_put(job->base.irq_fence);
 374         job->base.irq_fence = dma_fence_get(fence);
 375
 376         trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
 377
 378         v3d_job_start_stats(&job->base, V3D_CSD);
 379         v3d_switch_perfmon(v3d, &job->base);
 380
 381         csd_cfg0_reg = V3D_CSD_QUEUED_CFG0(v3d->ver);
 382         for (i = 1; i <= 6; i++)
 383                 V3D_CORE_WRITE(0, csd_cfg0_reg + 4 * i, job->args.cfg[i]);
 384
 385         /* Although V3D 7.1 has an eighth configuration register, we are not
 386          * using it. Therefore, make sure it remains unused.
 387          *
 388          * XXX: Set the CFG7 register
 389          */
 390         if (v3d->ver >= 71)
 391                 V3D_CORE_WRITE(0, V3D_V7_CSD_QUEUED_CFG7, 0);
 392
 393         /* CFG0 write kicks off the job. */
 394         V3D_CORE_WRITE(0, csd_cfg0_reg, job->args.cfg[0]);
 395
 396         return fence;
 397 }
 398
 399 static void
 400 v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job)
 401 {
 402         struct v3d_indirect_csd_info *indirect_csd = &job->indirect_csd;
 403         struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
 404         struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect);
 405         struct drm_v3d_submit_csd *args = &indirect_csd->job->args;
 406         u32 *wg_counts;
 407
 408         v3d_get_bo_vaddr(bo);
 409         v3d_get_bo_vaddr(indirect);
 410
 411         wg_counts = (uint32_t *)(bo->vaddr + indirect_csd->offset);
 412
 413         if (wg_counts[0] == 0 || wg_counts[1] == 0 || wg_counts[2] == 0)
 414                 return;
 415
 416         args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
 417         args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
 418         args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
 419         args->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
 420                        (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
 421
 422         for (int i = 0; i < 3; i++) {
 423                 /* 0xffffffff indicates that the uniform rewrite is not needed */
 424                 if (indirect_csd->wg_uniform_offsets[i] != 0xffffffff) {
 425                         u32 uniform_idx = indirect_csd->wg_uniform_offsets[i];
 426                         ((uint32_t *)indirect->vaddr)[uniform_idx] = wg_counts[i];
 427                 }
 428         }
 429
 430         v3d_put_bo_vaddr(indirect);
 431         v3d_put_bo_vaddr(bo);
 432 }
 433
 434 static void
 435 v3d_timestamp_query(struct v3d_cpu_job *job)
 436 {
 437         struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
 438         struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
 439         u8 *value_addr;
 440
 441         v3d_get_bo_vaddr(bo);
 442
 443         for (int i = 0; i < timestamp_query->count; i++) {
 444                 value_addr = ((u8 *)bo->vaddr) + timestamp_query->queries[i].offset;
 445                 *((u64 *)value_addr) = i == 0 ? ktime_get_ns() : 0ull;
 446
 447                 drm_syncobj_replace_fence(timestamp_query->queries[i].syncobj,
 448                                           job->base.done_fence);
 449         }
 450
 451         v3d_put_bo_vaddr(bo);
 452 }
 453
 454 static void
 455 v3d_reset_timestamp_queries(struct v3d_cpu_job *job)
 456 {
 457         struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
 458         struct v3d_timestamp_query *queries = timestamp_query->queries;
 459         struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
 460         u8 *value_addr;
 461
 462         v3d_get_bo_vaddr(bo);
 463
 464         for (int i = 0; i < timestamp_query->count; i++) {
 465                 value_addr = ((u8 *)bo->vaddr) + queries[i].offset;
 466                 *((u64 *)value_addr) = 0;
 467
 468                 drm_syncobj_replace_fence(queries[i].syncobj, NULL);
 469         }
 470
 471         v3d_put_bo_vaddr(bo);
 472 }
 473
 474 static void write_to_buffer_32(u32 *dst, unsigned int idx, u32 value)
 475 {
 476         dst[idx] = value;
 477 }
 478
 479 static void write_to_buffer_64(u64 *dst, unsigned int idx, u64 value)
 480 {
 481         dst[idx] = value;
 482 }
 483
 484 static void
 485 write_to_buffer(void *dst, unsigned int idx, bool do_64bit, u64 value)
 486 {
 487         if (do_64bit)
 488                 write_to_buffer_64(dst, idx, value);
 489         else
 490                 write_to_buffer_32(dst, idx, value);
 491 }
 492
 493 static void
 494 v3d_copy_query_results(struct v3d_cpu_job *job)
 495 {
 496         struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
 497         struct v3d_timestamp_query *queries = timestamp_query->queries;
 498         struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
 499         struct v3d_bo *timestamp = to_v3d_bo(job->base.bo[1]);
 500         struct v3d_copy_query_results_info *copy = &job->copy;
 501         struct dma_fence *fence;
 502         u8 *query_addr;
 503         bool available, write_result;
 504         u8 *data;
 505         int i;
 506
 507         v3d_get_bo_vaddr(bo);
 508         v3d_get_bo_vaddr(timestamp);
 509
 510         data = ((u8 *)bo->vaddr) + copy->offset;
 511
 512         for (i = 0; i < timestamp_query->count; i++) {
 513                 fence = drm_syncobj_fence_get(queries[i].syncobj);
 514                 available = fence ? dma_fence_is_signaled(fence) : false;
 515
 516                 write_result = available || copy->do_partial;
 517                 if (write_result) {
 518                         query_addr = ((u8 *)timestamp->vaddr) + queries[i].offset;
 519                         write_to_buffer(data, 0, copy->do_64bit, *((u64 *)query_addr));
 520                 }
 521
 522                 if (copy->availability_bit)
 523                         write_to_buffer(data, 1, copy->do_64bit, available ? 1u : 0u);
 524
 525                 data += copy->stride;
 526
 527                 dma_fence_put(fence);
 528         }
 529
 530         v3d_put_bo_vaddr(timestamp);
 531         v3d_put_bo_vaddr(bo);
 532 }
 533
 534 static void
 535 v3d_reset_performance_queries(struct v3d_cpu_job *job)
 536 {
 537         struct v3d_performance_query_info *performance_query = &job->performance_query;
 538         struct v3d_file_priv *v3d_priv = job->base.file->driver_priv;
 539         struct v3d_dev *v3d = job->base.v3d;
 540         struct v3d_perfmon *perfmon;
 541
 542         for (int i = 0; i < performance_query->count; i++) {
 543                 for (int j = 0; j < performance_query->nperfmons; j++) {
 544                         perfmon = v3d_perfmon_find(v3d_priv,
 545                                                    performance_query->queries[i].kperfmon_ids[j]);
 546                         if (!perfmon) {
 547                                 DRM_DEBUG("Failed to find perfmon.");
 548                                 continue;
 549                         }
 550
 551                         v3d_perfmon_stop(v3d, perfmon, false);
 552
 553                         memset(perfmon->values, 0, perfmon->ncounters * sizeof(u64));
 554
 555                         v3d_perfmon_put(perfmon);
 556                 }
 557
 558                 drm_syncobj_replace_fence(performance_query->queries[i].syncobj, NULL);
 559         }
 560 }
 561
 562 static void
 563 v3d_write_performance_query_result(struct v3d_cpu_job *job, void *data,
 564                                    unsigned int query)
 565 {
 566         struct v3d_performance_query_info *performance_query =
 567                                                 &job->performance_query;
 568         struct v3d_file_priv *v3d_priv = job->base.file->driver_priv;
 569         struct v3d_performance_query *perf_query =
 570                         &performance_query->queries[query];
 571         struct v3d_dev *v3d = job->base.v3d;
 572         unsigned int i, j, offset;
 573
 574         for (i = 0, offset = 0;
 575              i < performance_query->nperfmons;
 576              i++, offset += DRM_V3D_MAX_PERF_COUNTERS) {
 577                 struct v3d_perfmon *perfmon;
 578
 579                 perfmon = v3d_perfmon_find(v3d_priv,
 580                                            perf_query->kperfmon_ids[i]);
 581                 if (!perfmon) {
 582                         DRM_DEBUG("Failed to find perfmon.");
 583                         continue;
 584                 }
 585
 586                 v3d_perfmon_stop(v3d, perfmon, true);
 587
 588                 if (job->copy.do_64bit) {
 589                         for (j = 0; j < perfmon->ncounters; j++)
 590                                 write_to_buffer_64(data, offset + j,
 591                                                    perfmon->values[j]);
 592                 } else {
 593                         for (j = 0; j < perfmon->ncounters; j++)
 594                                 write_to_buffer_32(data, offset + j,
 595                                                    perfmon->values[j]);
 596                 }
 597
 598                 v3d_perfmon_put(perfmon);
 599         }
 600 }
 601
 602 static void
 603 v3d_copy_performance_query(struct v3d_cpu_job *job)
 604 {
 605         struct v3d_performance_query_info *performance_query = &job->performance_query;
 606         struct v3d_copy_query_results_info *copy = &job->copy;
 607         struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
 608         struct dma_fence *fence;
 609         bool available, write_result;
 610         u8 *data;
 611
 612         v3d_get_bo_vaddr(bo);
 613
 614         data = ((u8 *)bo->vaddr) + copy->offset;
 615
 616         for (int i = 0; i < performance_query->count; i++) {
 617                 fence = drm_syncobj_fence_get(performance_query->queries[i].syncobj);
 618                 available = fence ? dma_fence_is_signaled(fence) : false;
 619
 620                 write_result = available || copy->do_partial;
 621                 if (write_result)
 622                         v3d_write_performance_query_result(job, data, i);
 623
 624                 if (copy->availability_bit)
 625                         write_to_buffer(data, performance_query->ncounters,
 626                                         copy->do_64bit, available ? 1u : 0u);
 627
 628                 data += copy->stride;
 629
 630                 dma_fence_put(fence);
 631         }
 632
 633         v3d_put_bo_vaddr(bo);
 634 }
 635
 636 static const v3d_cpu_job_fn cpu_job_function[] = {
 637         [V3D_CPU_JOB_TYPE_INDIRECT_CSD] = v3d_rewrite_csd_job_wg_counts_from_indirect,
 638         [V3D_CPU_JOB_TYPE_TIMESTAMP_QUERY] = v3d_timestamp_query,
 639         [V3D_CPU_JOB_TYPE_RESET_TIMESTAMP_QUERY] = v3d_reset_timestamp_queries,
 640         [V3D_CPU_JOB_TYPE_COPY_TIMESTAMP_QUERY] = v3d_copy_query_results,
 641         [V3D_CPU_JOB_TYPE_RESET_PERFORMANCE_QUERY] = v3d_reset_performance_queries,
 642         [V3D_CPU_JOB_TYPE_COPY_PERFORMANCE_QUERY] = v3d_copy_performance_query,
 643 };
 644
 645 static struct dma_fence *
 646 v3d_cpu_job_run(struct drm_sched_job *sched_job)
 647 {
 648         struct v3d_cpu_job *job = to_cpu_job(sched_job);
 649         struct v3d_dev *v3d = job->base.v3d;
 650
 651         v3d->cpu_job = job;
 652
 653         if (job->job_type >= ARRAY_SIZE(cpu_job_function)) {
 654                 DRM_DEBUG_DRIVER("Unknown CPU job: %d\n", job->job_type);
 655                 return NULL;
 656         }
 657
 658         v3d_job_start_stats(&job->base, V3D_CPU);
 659         trace_v3d_cpu_job_begin(&v3d->drm, job->job_type);
 660
 661         cpu_job_function[job->job_type](job);
 662
 663         trace_v3d_cpu_job_end(&v3d->drm, job->job_type);
 664         v3d_job_update_stats(&job->base, V3D_CPU);
 665
 666         return NULL;
 667 }
 668
 669 static struct dma_fence *
 670 v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
 671 {
 672         struct v3d_job *job = to_v3d_job(sched_job);
 673         struct v3d_dev *v3d = job->v3d;
 674
 675         v3d_job_start_stats(job, V3D_CACHE_CLEAN);
 676
 677         v3d_clean_caches(v3d);
 678
 679         v3d_job_update_stats(job, V3D_CACHE_CLEAN);
 680
 681         return NULL;
 682 }
 683
 684 static enum drm_gpu_sched_stat
 685 v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
 686 {
 687         enum v3d_queue q;
 688
 689         mutex_lock(&v3d->reset_lock);
 690
 691         /* block scheduler */
 692         for (q = 0; q < V3D_MAX_QUEUES; q++)
 693                 drm_sched_stop(&v3d->queue[q].sched, sched_job);
 694
 695         if (sched_job)
 696                 drm_sched_increase_karma(sched_job);
 697
 698         /* get the GPU back into the init state */
 699         v3d_reset(v3d);
 700
 701         for (q = 0; q < V3D_MAX_QUEUES; q++)
 702                 drm_sched_resubmit_jobs(&v3d->queue[q].sched);
 703
 704         /* Unblock schedulers and restart their jobs. */
 705         for (q = 0; q < V3D_MAX_QUEUES; q++) {
 706                 drm_sched_start(&v3d->queue[q].sched, 0);
 707         }
 708
 709         mutex_unlock(&v3d->reset_lock);
 710
 711         return DRM_GPU_SCHED_STAT_NOMINAL;
 712 }
 713
 714 /* If the current address or return address have changed, then the GPU
 715  * has probably made progress and we should delay the reset.  This
 716  * could fail if the GPU got in an infinite loop in the CL, but that
 717  * is pretty unlikely outside of an i-g-t testcase.
 718  */
 719 static enum drm_gpu_sched_stat
 720 v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
 721                     u32 *timedout_ctca, u32 *timedout_ctra)
 722 {
 723         struct v3d_job *job = to_v3d_job(sched_job);
 724         struct v3d_dev *v3d = job->v3d;
 725         u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
 726         u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));
 727
 728         if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
 729                 *timedout_ctca = ctca;
 730                 *timedout_ctra = ctra;
 731                 return DRM_GPU_SCHED_STAT_NOMINAL;
 732         }
 733
 734         return v3d_gpu_reset_for_timeout(v3d, sched_job);
 735 }
 736
 737 static enum drm_gpu_sched_stat
 738 v3d_bin_job_timedout(struct drm_sched_job *sched_job)
 739 {
 740         struct v3d_bin_job *job = to_bin_job(sched_job);
 741
 742         return v3d_cl_job_timedout(sched_job, V3D_BIN,
 743                                    &job->timedout_ctca, &job->timedout_ctra);
 744 }
 745
 746 static enum drm_gpu_sched_stat
 747 v3d_render_job_timedout(struct drm_sched_job *sched_job)
 748 {
 749         struct v3d_render_job *job = to_render_job(sched_job);
 750
 751         return v3d_cl_job_timedout(sched_job, V3D_RENDER,
 752                                    &job->timedout_ctca, &job->timedout_ctra);
 753 }
 754
 755 static enum drm_gpu_sched_stat
 756 v3d_generic_job_timedout(struct drm_sched_job *sched_job)
 757 {
 758         struct v3d_job *job = to_v3d_job(sched_job);
 759
 760         return v3d_gpu_reset_for_timeout(job->v3d, sched_job);
 761 }
 762
 763 static enum drm_gpu_sched_stat
 764 v3d_csd_job_timedout(struct drm_sched_job *sched_job)
 765 {
 766         struct v3d_csd_job *job = to_csd_job(sched_job);
 767         struct v3d_dev *v3d = job->base.v3d;
 768         u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4(v3d->ver));
 769
 770         /* If we've made progress, skip reset and let the timer get
 771          * rearmed.
 772          */
 773         if (job->timedout_batches != batches) {
 774                 job->timedout_batches = batches;
 775                 return DRM_GPU_SCHED_STAT_NOMINAL;
 776         }
 777
 778         return v3d_gpu_reset_for_timeout(v3d, sched_job);
 779 }
 780
 781 static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
 782         .run_job = v3d_bin_job_run,
 783         .timedout_job = v3d_bin_job_timedout,
 784         .free_job = v3d_sched_job_free,
 785 };
 786
 787 static const struct drm_sched_backend_ops v3d_render_sched_ops = {
 788         .run_job = v3d_render_job_run,
 789         .timedout_job = v3d_render_job_timedout,
 790         .free_job = v3d_sched_job_free,
 791 };
 792
 793 static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
 794         .run_job = v3d_tfu_job_run,
 795         .timedout_job = v3d_generic_job_timedout,
 796         .free_job = v3d_sched_job_free,
 797 };
 798
 799 static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
 800         .run_job = v3d_csd_job_run,
 801         .timedout_job = v3d_csd_job_timedout,
 802         .free_job = v3d_sched_job_free
 803 };
 804
 805 static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
 806         .run_job = v3d_cache_clean_job_run,
 807         .timedout_job = v3d_generic_job_timedout,
 808         .free_job = v3d_sched_job_free
 809 };
 810
 811 static const struct drm_sched_backend_ops v3d_cpu_sched_ops = {
 812         .run_job = v3d_cpu_job_run,
 813         .timedout_job = v3d_generic_job_timedout,
 814         .free_job = v3d_cpu_job_free
 815 };
 816
 817 int
 818 v3d_sched_init(struct v3d_dev *v3d)
 819 {
 820         int hw_jobs_limit = 1;
 821         int job_hang_limit = 0;
 822         int hang_limit_ms = 500;
 823         int ret;
 824
 825         ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
 826                              &v3d_bin_sched_ops, NULL,
 827                              DRM_SCHED_PRIORITY_COUNT,
 828                              hw_jobs_limit, job_hang_limit,
 829                              msecs_to_jiffies(hang_limit_ms), NULL,
 830                              NULL, "v3d_bin", v3d->drm.dev);
 831         if (ret)
 832                 return ret;
 833
 834         ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
 835                              &v3d_render_sched_ops, NULL,
 836                              DRM_SCHED_PRIORITY_COUNT,
 837                              hw_jobs_limit, job_hang_limit,
 838                              msecs_to_jiffies(hang_limit_ms), NULL,
 839                              NULL, "v3d_render", v3d->drm.dev);
 840         if (ret)
 841                 goto fail;
 842
 843         ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
 844                              &v3d_tfu_sched_ops, NULL,
 845                              DRM_SCHED_PRIORITY_COUNT,
 846                              hw_jobs_limit, job_hang_limit,
 847                              msecs_to_jiffies(hang_limit_ms), NULL,
 848                              NULL, "v3d_tfu", v3d->drm.dev);
 849         if (ret)
 850                 goto fail;
 851
 852         if (v3d_has_csd(v3d)) {
 853                 ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
 854                                      &v3d_csd_sched_ops, NULL,
 855                                      DRM_SCHED_PRIORITY_COUNT,
 856                                      hw_jobs_limit, job_hang_limit,
 857                                      msecs_to_jiffies(hang_limit_ms), NULL,
 858                                      NULL, "v3d_csd", v3d->drm.dev);
 859                 if (ret)
 860                         goto fail;
 861
 862                 ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
 863                                      &v3d_cache_clean_sched_ops, NULL,
 864                                      DRM_SCHED_PRIORITY_COUNT,
 865                                      hw_jobs_limit, job_hang_limit,
 866                                      msecs_to_jiffies(hang_limit_ms), NULL,
 867                                      NULL, "v3d_cache_clean", v3d->drm.dev);
 868                 if (ret)
 869                         goto fail;
 870         }
 871
 872         ret = drm_sched_init(&v3d->queue[V3D_CPU].sched,
 873                              &v3d_cpu_sched_ops, NULL,
 874                              DRM_SCHED_PRIORITY_COUNT,
 875                              1, job_hang_limit,
 876                              msecs_to_jiffies(hang_limit_ms), NULL,
 877                              NULL, "v3d_cpu", v3d->drm.dev);
 878         if (ret)
 879                 goto fail;
 880
 881         return 0;
 882
 883 fail:
 884         v3d_sched_fini(v3d);
 885         return ret;
 886 }
 887
 888 void
 889 v3d_sched_fini(struct v3d_dev *v3d)
 890 {
 891         enum v3d_queue q;
 892
 893         for (q = 0; q < V3D_MAX_QUEUES; q++) {
 894                 if (v3d->queue[q].sched.ready)
 895                         drm_sched_fini(&v3d->queue[q].sched);
 896         }
 897 }