drivers/gpu/drm/scheduler/sched_entity.c

   1 /*
   2  * Copyright 2015 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  */
  23
  24 #include <linux/kthread.h>
  25 #include <linux/slab.h>
  26 #include <linux/completion.h>
  27
  28 #include <drm/drm_print.h>
  29 #include <drm/gpu_scheduler.h>
  30
  31 #include "gpu_scheduler_trace.h"
  32
  33 #define to_drm_sched_job(sched_job)             \
  34                 container_of((sched_job), struct drm_sched_job, queue_node)
  35
  36 /**
  37  * drm_sched_entity_init - Init a context entity used by scheduler when
  38  * submit to HW ring.
  39  *
  40  * @entity: scheduler entity to init
  41  * @priority: priority of the entity
  42  * @sched_list: the list of drm scheds on which jobs from this
  43  *           entity can be submitted
  44  * @num_sched_list: number of drm sched in sched_list
  45  * @guilty: atomic_t set to 1 when a job on this queue
  46  *          is found to be guilty causing a timeout
  47  *
  48  * Note that the &sched_list must have at least one element to schedule the entity.
  49  *
  50  * For changing @priority later on at runtime see
  51  * drm_sched_entity_set_priority(). For changing the set of schedulers
  52  * @sched_list at runtime see drm_sched_entity_modify_sched().
  53  *
  54  * An entity is cleaned up by callind drm_sched_entity_fini(). See also
  55  * drm_sched_entity_destroy().
  56  *
  57  * Returns 0 on success or a negative error code on failure.
  58  */
  59 int drm_sched_entity_init(struct drm_sched_entity *entity,
  60                           enum drm_sched_priority priority,
  61                           struct drm_gpu_scheduler **sched_list,
  62                           unsigned int num_sched_list,
  63                           atomic_t *guilty)
  64 {
  65         if (!(entity && sched_list && (num_sched_list == 0 || sched_list[0])))
  66                 return -EINVAL;
  67
  68         memset(entity, 0, sizeof(struct drm_sched_entity));
  69         INIT_LIST_HEAD(&entity->list);
  70         entity->rq = NULL;
  71         entity->guilty = guilty;
  72         entity->num_sched_list = num_sched_list;
  73         entity->priority = priority;
  74         entity->sched_list = num_sched_list > 1 ? sched_list : NULL;
  75         RCU_INIT_POINTER(entity->last_scheduled, NULL);
  76         RB_CLEAR_NODE(&entity->rb_tree_node);
  77
  78         if (!sched_list[0]->sched_rq) {
  79                 /* Warn drivers not to do this and to fix their DRM
  80                  * calling order.
  81                  */
  82                 pr_warn("%s: called with uninitialized scheduler\n", __func__);
  83         } else if (num_sched_list) {
  84                 /* The "priority" of an entity cannot exceed the number of run-queues of a
  85                  * scheduler. Protect against num_rqs being 0, by converting to signed. Choose
  86                  * the lowest priority available.
  87                  */
  88                 if (entity->priority >= sched_list[0]->num_rqs) {
  89                         drm_err(sched_list[0], "entity with out-of-bounds priority:%u num_rqs:%u\n",
  90                                 entity->priority, sched_list[0]->num_rqs);
  91                         entity->priority = max_t(s32, (s32) sched_list[0]->num_rqs - 1,
  92                                                  (s32) DRM_SCHED_PRIORITY_KERNEL);
  93                 }
  94                 entity->rq = sched_list[0]->sched_rq[entity->priority];
  95         }
  96
  97         init_completion(&entity->entity_idle);
  98
  99         /* We start in an idle state. */
 100         complete_all(&entity->entity_idle);
 101
 102         spin_lock_init(&entity->rq_lock);
 103         spsc_queue_init(&entity->job_queue);
 104
 105         atomic_set(&entity->fence_seq, 0);
 106         entity->fence_context = dma_fence_context_alloc(2);
 107
 108         return 0;
 109 }
 110 EXPORT_SYMBOL(drm_sched_entity_init);
 111
 112 /**
 113  * drm_sched_entity_modify_sched - Modify sched of an entity
 114  * @entity: scheduler entity to init
 115  * @sched_list: the list of new drm scheds which will replace
 116  *               existing entity->sched_list
 117  * @num_sched_list: number of drm sched in sched_list
 118  *
 119  * Note that this must be called under the same common lock for @entity as
 120  * drm_sched_job_arm() and drm_sched_entity_push_job(), or the driver needs to
 121  * guarantee through some other means that this is never called while new jobs
 122  * can be pushed to @entity.
 123  */
 124 void drm_sched_entity_modify_sched(struct drm_sched_entity *entity,
 125                                     struct drm_gpu_scheduler **sched_list,
 126                                     unsigned int num_sched_list)
 127 {
 128         WARN_ON(!num_sched_list || !sched_list);
 129
 130         entity->sched_list = sched_list;
 131         entity->num_sched_list = num_sched_list;
 132 }
 133 EXPORT_SYMBOL(drm_sched_entity_modify_sched);
 134
 135 static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
 136 {
 137         rmb(); /* for list_empty to work without lock */
 138
 139         if (list_empty(&entity->list) ||
 140             spsc_queue_count(&entity->job_queue) == 0 ||
 141             entity->stopped)
 142                 return true;
 143
 144         return false;
 145 }
 146
 147 /* Return true if entity could provide a job. */
 148 bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
 149 {
 150         if (spsc_queue_peek(&entity->job_queue) == NULL)
 151                 return false;
 152
 153         if (READ_ONCE(entity->dependency))
 154                 return false;
 155
 156         return true;
 157 }
 158
 159 /**
 160  * drm_sched_entity_error - return error of last scheduled job
 161  * @entity: scheduler entity to check
 162  *
 163  * Opportunistically return the error of the last scheduled job. Result can
 164  * change any time when new jobs are pushed to the hw.
 165  */
 166 int drm_sched_entity_error(struct drm_sched_entity *entity)
 167 {
 168         struct dma_fence *fence;
 169         int r;
 170
 171         rcu_read_lock();
 172         fence = rcu_dereference(entity->last_scheduled);
 173         r = fence ? fence->error : 0;
 174         rcu_read_unlock();
 175
 176         return r;
 177 }
 178 EXPORT_SYMBOL(drm_sched_entity_error);
 179
 180 static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk)
 181 {
 182         struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
 183
 184         drm_sched_fence_finished(job->s_fence, -ESRCH);
 185         WARN_ON(job->s_fence->parent);
 186         job->sched->ops->free_job(job);
 187 }
 188
 189 /* Signal the scheduler finished fence when the entity in question is killed. */
 190 static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
 191                                           struct dma_fence_cb *cb)
 192 {
 193         struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
 194                                                  finish_cb);
 195         unsigned long index;
 196
 197         dma_fence_put(f);
 198
 199         /* Wait for all dependencies to avoid data corruptions */
 200         xa_for_each(&job->dependencies, index, f) {
 201                 struct drm_sched_fence *s_fence = to_drm_sched_fence(f);
 202
 203                 if (s_fence && f == &s_fence->scheduled) {
 204                         /* The dependencies array had a reference on the scheduled
 205                          * fence, and the finished fence refcount might have
 206                          * dropped to zero. Use dma_fence_get_rcu() so we get
 207                          * a NULL fence in that case.
 208                          */
 209                         f = dma_fence_get_rcu(&s_fence->finished);
 210
 211                         /* Now that we have a reference on the finished fence,
 212                          * we can release the reference the dependencies array
 213                          * had on the scheduled fence.
 214                          */
 215                         dma_fence_put(&s_fence->scheduled);
 216                 }
 217
 218                 xa_erase(&job->dependencies, index);
 219                 if (f && !dma_fence_add_callback(f, &job->finish_cb,
 220                                                  drm_sched_entity_kill_jobs_cb))
 221                         return;
 222
 223                 dma_fence_put(f);
 224         }
 225
 226         INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work);
 227         schedule_work(&job->work);
 228 }
 229
 230 /* Remove the entity from the scheduler and kill all pending jobs */
 231 static void drm_sched_entity_kill(struct drm_sched_entity *entity)
 232 {
 233         struct drm_sched_job *job;
 234         struct dma_fence *prev;
 235
 236         if (!entity->rq)
 237                 return;
 238
 239         spin_lock(&entity->rq_lock);
 240         entity->stopped = true;
 241         drm_sched_rq_remove_entity(entity->rq, entity);
 242         spin_unlock(&entity->rq_lock);
 243
 244         /* Make sure this entity is not used by the scheduler at the moment */
 245         wait_for_completion(&entity->entity_idle);
 246
 247         /* The entity is guaranteed to not be used by the scheduler */
 248         prev = rcu_dereference_check(entity->last_scheduled, true);
 249         dma_fence_get(prev);
 250         while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
 251                 struct drm_sched_fence *s_fence = job->s_fence;
 252
 253                 dma_fence_get(&s_fence->finished);
 254                 if (!prev || dma_fence_add_callback(prev, &job->finish_cb,
 255                                            drm_sched_entity_kill_jobs_cb))
 256                         drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
 257
 258                 prev = &s_fence->finished;
 259         }
 260         dma_fence_put(prev);
 261 }
 262
 263 /**
 264  * drm_sched_entity_flush - Flush a context entity
 265  *
 266  * @entity: scheduler entity
 267  * @timeout: time to wait in for Q to become empty in jiffies.
 268  *
 269  * Splitting drm_sched_entity_fini() into two functions, The first one does the
 270  * waiting, removes the entity from the runqueue and returns an error when the
 271  * process was killed.
 272  *
 273  * Returns the remaining time in jiffies left from the input timeout
 274  */
 275 long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
 276 {
 277         struct drm_gpu_scheduler *sched;
 278         struct task_struct *last_user;
 279         long ret = timeout;
 280
 281         if (!entity->rq)
 282                 return 0;
 283
 284         sched = entity->rq->sched;
 285         /**
 286          * The client will not queue more IBs during this fini, consume existing
 287          * queued IBs or discard them on SIGKILL
 288          */
 289         if (current->flags & PF_EXITING) {
 290                 if (timeout)
 291                         ret = wait_event_timeout(
 292                                         sched->job_scheduled,
 293                                         drm_sched_entity_is_idle(entity),
 294                                         timeout);
 295         } else {
 296                 wait_event_killable(sched->job_scheduled,
 297                                     drm_sched_entity_is_idle(entity));
 298         }
 299
 300         /* For killed process disable any more IBs enqueue right now */
 301         last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
 302         if ((!last_user || last_user == current->group_leader) &&
 303             (current->flags & PF_EXITING) && (current->exit_code == SIGKILL))
 304                 drm_sched_entity_kill(entity);
 305
 306         return ret;
 307 }
 308 EXPORT_SYMBOL(drm_sched_entity_flush);
 309
 310 /**
 311  * drm_sched_entity_fini - Destroy a context entity
 312  *
 313  * @entity: scheduler entity
 314  *
 315  * Cleanups up @entity which has been initialized by drm_sched_entity_init().
 316  *
 317  * If there are potentially job still in flight or getting newly queued
 318  * drm_sched_entity_flush() must be called first. This function then goes over
 319  * the entity and signals all jobs with an error code if the process was killed.
 320  */
 321 void drm_sched_entity_fini(struct drm_sched_entity *entity)
 322 {
 323         /*
 324          * If consumption of existing IBs wasn't completed. Forcefully remove
 325          * them here. Also makes sure that the scheduler won't touch this entity
 326          * any more.
 327          */
 328         drm_sched_entity_kill(entity);
 329
 330         if (entity->dependency) {
 331                 dma_fence_remove_callback(entity->dependency, &entity->cb);
 332                 dma_fence_put(entity->dependency);
 333                 entity->dependency = NULL;
 334         }
 335
 336         dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
 337         RCU_INIT_POINTER(entity->last_scheduled, NULL);
 338 }
 339 EXPORT_SYMBOL(drm_sched_entity_fini);
 340
 341 /**
 342  * drm_sched_entity_destroy - Destroy a context entity
 343  * @entity: scheduler entity
 344  *
 345  * Calls drm_sched_entity_flush() and drm_sched_entity_fini() as a
 346  * convenience wrapper.
 347  */
 348 void drm_sched_entity_destroy(struct drm_sched_entity *entity)
 349 {
 350         drm_sched_entity_flush(entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY);
 351         drm_sched_entity_fini(entity);
 352 }
 353 EXPORT_SYMBOL(drm_sched_entity_destroy);
 354
 355 /* drm_sched_entity_clear_dep - callback to clear the entities dependency */
 356 static void drm_sched_entity_clear_dep(struct dma_fence *f,
 357                                        struct dma_fence_cb *cb)
 358 {
 359         struct drm_sched_entity *entity =
 360                 container_of(cb, struct drm_sched_entity, cb);
 361
 362         entity->dependency = NULL;
 363         dma_fence_put(f);
 364 }
 365
 366 /*
 367  * drm_sched_entity_clear_dep - callback to clear the entities dependency and
 368  * wake up scheduler
 369  */
 370 static void drm_sched_entity_wakeup(struct dma_fence *f,
 371                                     struct dma_fence_cb *cb)
 372 {
 373         struct drm_sched_entity *entity =
 374                 container_of(cb, struct drm_sched_entity, cb);
 375
 376         drm_sched_entity_clear_dep(f, cb);
 377         drm_sched_wakeup(entity->rq->sched, entity);
 378 }
 379
 380 /**
 381  * drm_sched_entity_set_priority - Sets priority of the entity
 382  *
 383  * @entity: scheduler entity
 384  * @priority: scheduler priority
 385  *
 386  * Update the priority of runqueus used for the entity.
 387  */
 388 void drm_sched_entity_set_priority(struct drm_sched_entity *entity,
 389                                    enum drm_sched_priority priority)
 390 {
 391         spin_lock(&entity->rq_lock);
 392         entity->priority = priority;
 393         spin_unlock(&entity->rq_lock);
 394 }
 395 EXPORT_SYMBOL(drm_sched_entity_set_priority);
 396
 397 /*
 398  * Add a callback to the current dependency of the entity to wake up the
 399  * scheduler when the entity becomes available.
 400  */
 401 static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
 402 {
 403         struct drm_gpu_scheduler *sched = entity->rq->sched;
 404         struct dma_fence *fence = entity->dependency;
 405         struct drm_sched_fence *s_fence;
 406
 407         if (fence->context == entity->fence_context ||
 408             fence->context == entity->fence_context + 1) {
 409                 /*
 410                  * Fence is a scheduled/finished fence from a job
 411                  * which belongs to the same entity, we can ignore
 412                  * fences from ourself
 413                  */
 414                 dma_fence_put(entity->dependency);
 415                 return false;
 416         }
 417
 418         s_fence = to_drm_sched_fence(fence);
 419         if (!fence->error && s_fence && s_fence->sched == sched &&
 420             !test_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &fence->flags)) {
 421
 422                 /*
 423                  * Fence is from the same scheduler, only need to wait for
 424                  * it to be scheduled
 425                  */
 426                 fence = dma_fence_get(&s_fence->scheduled);
 427                 dma_fence_put(entity->dependency);
 428                 entity->dependency = fence;
 429                 if (!dma_fence_add_callback(fence, &entity->cb,
 430                                             drm_sched_entity_clear_dep))
 431                         return true;
 432
 433                 /* Ignore it when it is already scheduled */
 434                 dma_fence_put(fence);
 435                 return false;
 436         }
 437
 438         if (!dma_fence_add_callback(entity->dependency, &entity->cb,
 439                                     drm_sched_entity_wakeup))
 440                 return true;
 441
 442         dma_fence_put(entity->dependency);
 443         return false;
 444 }
 445
 446 static struct dma_fence *
 447 drm_sched_job_dependency(struct drm_sched_job *job,
 448                          struct drm_sched_entity *entity)
 449 {
 450         struct dma_fence *f;
 451
 452         /* We keep the fence around, so we can iterate over all dependencies
 453          * in drm_sched_entity_kill_jobs_cb() to ensure all deps are signaled
 454          * before killing the job.
 455          */
 456         f = xa_load(&job->dependencies, job->last_dependency);
 457         if (f) {
 458                 job->last_dependency++;
 459                 return dma_fence_get(f);
 460         }
 461
 462         if (job->sched->ops->prepare_job)
 463                 return job->sched->ops->prepare_job(job, entity);
 464
 465         return NULL;
 466 }
 467
 468 struct drm_sched_job *drm_sched_entity_pop_job(struct drm_sched_entity *entity)
 469 {
 470         struct drm_sched_job *sched_job;
 471
 472         sched_job = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
 473         if (!sched_job)
 474                 return NULL;
 475
 476         while ((entity->dependency =
 477                         drm_sched_job_dependency(sched_job, entity))) {
 478                 trace_drm_sched_job_wait_dep(sched_job, entity->dependency);
 479
 480                 if (drm_sched_entity_add_dependency_cb(entity))
 481                         return NULL;
 482         }
 483
 484         /* skip jobs from entity that marked guilty */
 485         if (entity->guilty && atomic_read(entity->guilty))
 486                 dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
 487
 488         dma_fence_put(rcu_dereference_check(entity->last_scheduled, true));
 489         rcu_assign_pointer(entity->last_scheduled,
 490                            dma_fence_get(&sched_job->s_fence->finished));
 491
 492         /*
 493          * If the queue is empty we allow drm_sched_entity_select_rq() to
 494          * locklessly access ->last_scheduled. This only works if we set the
 495          * pointer before we dequeue and if we a write barrier here.
 496          */
 497         smp_wmb();
 498
 499         spsc_queue_pop(&entity->job_queue);
 500
 501         /*
 502          * Update the entity's location in the min heap according to
 503          * the timestamp of the next job, if any.
 504          */
 505         if (drm_sched_policy == DRM_SCHED_POLICY_FIFO) {
 506                 struct drm_sched_job *next;
 507
 508                 next = to_drm_sched_job(spsc_queue_peek(&entity->job_queue));
 509                 if (next)
 510                         drm_sched_rq_update_fifo(entity, next->submit_ts);
 511         }
 512
 513         /* Jobs and entities might have different lifecycles. Since we're
 514          * removing the job from the entities queue, set the jobs entity pointer
 515          * to NULL to prevent any future access of the entity through this job.
 516          */
 517         sched_job->entity = NULL;
 518
 519         return sched_job;
 520 }
 521
 522 void drm_sched_entity_select_rq(struct drm_sched_entity *entity)
 523 {
 524         struct dma_fence *fence;
 525         struct drm_gpu_scheduler *sched;
 526         struct drm_sched_rq *rq;
 527
 528         /* single possible engine and already selected */
 529         if (!entity->sched_list)
 530                 return;
 531
 532         /* queue non-empty, stay on the same engine */
 533         if (spsc_queue_count(&entity->job_queue))
 534                 return;
 535
 536         /*
 537          * Only when the queue is empty are we guaranteed that the scheduler
 538          * thread cannot change ->last_scheduled. To enforce ordering we need
 539          * a read barrier here. See drm_sched_entity_pop_job() for the other
 540          * side.
 541          */
 542         smp_rmb();
 543
 544         fence = rcu_dereference_check(entity->last_scheduled, true);
 545
 546         /* stay on the same engine if the previous job hasn't finished */
 547         if (fence && !dma_fence_is_signaled(fence))
 548                 return;
 549
 550         spin_lock(&entity->rq_lock);
 551         sched = drm_sched_pick_best(entity->sched_list, entity->num_sched_list);
 552         rq = sched ? sched->sched_rq[entity->priority] : NULL;
 553         if (rq != entity->rq) {
 554                 drm_sched_rq_remove_entity(entity->rq, entity);
 555                 entity->rq = rq;
 556         }
 557         spin_unlock(&entity->rq_lock);
 558
 559         if (entity->num_sched_list == 1)
 560                 entity->sched_list = NULL;
 561 }
 562
 563 /**
 564  * drm_sched_entity_push_job - Submit a job to the entity's job queue
 565  * @sched_job: job to submit
 566  *
 567  * Note: To guarantee that the order of insertion to queue matches the job's
 568  * fence sequence number this function should be called with drm_sched_job_arm()
 569  * under common lock for the struct drm_sched_entity that was set up for
 570  * @sched_job in drm_sched_job_init().
 571  *
 572  * Returns 0 for success, negative error code otherwise.
 573  */
 574 void drm_sched_entity_push_job(struct drm_sched_job *sched_job)
 575 {
 576         struct drm_sched_entity *entity = sched_job->entity;
 577         bool first;
 578         ktime_t submit_ts;
 579
 580         trace_drm_sched_job(sched_job, entity);
 581         atomic_inc(entity->rq->sched->score);
 582         WRITE_ONCE(entity->last_user, current->group_leader);
 583
 584         /*
 585          * After the sched_job is pushed into the entity queue, it may be
 586          * completed and freed up at any time. We can no longer access it.
 587          * Make sure to set the submit_ts first, to avoid a race.
 588          */
 589         sched_job->submit_ts = submit_ts = ktime_get();
 590         first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 591
 592         /* first job wakes up scheduler */
 593         if (first) {
 594                 /* Add the entity to the run queue */
 595                 spin_lock(&entity->rq_lock);
 596                 if (entity->stopped) {
 597                         spin_unlock(&entity->rq_lock);
 598
 599                         DRM_ERROR("Trying to push to a killed entity\n");
 600                         return;
 601                 }
 602
 603                 drm_sched_rq_add_entity(entity->rq, entity);
 604                 spin_unlock(&entity->rq_lock);
 605
 606                 if (drm_sched_policy == DRM_SCHED_POLICY_FIFO)
 607                         drm_sched_rq_update_fifo(entity, submit_ts);
 608
 609                 drm_sched_wakeup(entity->rq->sched, entity);
 610         }
 611 }
 612 EXPORT_SYMBOL(drm_sched_entity_push_job);