tools/sched_ext/scx_qmap.bpf.c

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * A simple five-level FIFO queue scheduler.
   4  *
   5  * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
   6  * assigned to one depending on its compound weight. Each CPU round robins
   7  * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
   8  * queue0, 2 from queue1, 4 from queue2 and so on.
   9  *
  10  * This scheduler demonstrates:
  11  *
  12  * - BPF-side queueing using PIDs.
  13  * - Sleepable per-task storage allocation using ops.init_task().
  14  * - Using ops.cpu_release() to handle a higher priority scheduling class taking
  15  *   the CPU away.
  16  * - Core-sched support.
  17  *
  18  * This scheduler is primarily for demonstration and testing of sched_ext
  19  * features and unlikely to be useful for actual workloads.
  20  *
  21  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  22  * Copyright (c) 2022 Tejun Heo <[email protected]>
  23  * Copyright (c) 2022 David Vernet <[email protected]>
  24  */
  25 #include <scx/common.bpf.h>
  26
  27 enum consts {
  28         ONE_SEC_IN_NS           = 1000000000,
  29         SHARED_DSQ              = 0,
  30         HIGHPRI_DSQ             = 1,
  31         HIGHPRI_WEIGHT          = 8668,         /* this is what -20 maps to */
  32 };
  33
  34 char _license[] SEC("license") = "GPL";
  35
  36 const volatile u64 slice_ns = SCX_SLICE_DFL;
  37 const volatile u32 stall_user_nth;
  38 const volatile u32 stall_kernel_nth;
  39 const volatile u32 dsp_inf_loop_after;
  40 const volatile u32 dsp_batch;
  41 const volatile bool highpri_boosting;
  42 const volatile bool print_shared_dsq;
  43 const volatile s32 disallow_tgid;
  44 const volatile bool suppress_dump;
  45
  46 u64 nr_highpri_queued;
  47 u32 test_error_cnt;
  48
  49 UEI_DEFINE(uei);
  50
  51 struct qmap {
  52         __uint(type, BPF_MAP_TYPE_QUEUE);
  53         __uint(max_entries, 4096);
  54         __type(value, u32);
  55 } queue0 SEC(".maps"),
  56   queue1 SEC(".maps"),
  57   queue2 SEC(".maps"),
  58   queue3 SEC(".maps"),
  59   queue4 SEC(".maps");
  60
  61 struct {
  62         __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
  63         __uint(max_entries, 5);
  64         __type(key, int);
  65         __array(values, struct qmap);
  66 } queue_arr SEC(".maps") = {
  67         .values = {
  68                 [0] = &queue0,
  69                 [1] = &queue1,
  70                 [2] = &queue2,
  71                 [3] = &queue3,
  72                 [4] = &queue4,
  73         },
  74 };
  75
  76 /*
  77  * If enabled, CPU performance target is set according to the queue index
  78  * according to the following table.
  79  */
  80 static const u32 qidx_to_cpuperf_target[] = {
  81         [0] = SCX_CPUPERF_ONE * 0 / 4,
  82         [1] = SCX_CPUPERF_ONE * 1 / 4,
  83         [2] = SCX_CPUPERF_ONE * 2 / 4,
  84         [3] = SCX_CPUPERF_ONE * 3 / 4,
  85         [4] = SCX_CPUPERF_ONE * 4 / 4,
  86 };
  87
  88 /*
  89  * Per-queue sequence numbers to implement core-sched ordering.
  90  *
  91  * Tail seq is assigned to each queued task and incremented. Head seq tracks the
  92  * sequence number of the latest dispatched task. The distance between the a
  93  * task's seq and the associated queue's head seq is called the queue distance
  94  * and used when comparing two tasks for ordering. See qmap_core_sched_before().
  95  */
  96 static u64 core_sched_head_seqs[5];
  97 static u64 core_sched_tail_seqs[5];
  98
  99 /* Per-task scheduling context */
 100 struct task_ctx {
 101         bool    force_local;    /* Dispatch directly to local_dsq */
 102         bool    highpri;
 103         u64     core_sched_seq;
 104 };
 105
 106 struct {
 107         __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 108         __uint(map_flags, BPF_F_NO_PREALLOC);
 109         __type(key, int);
 110         __type(value, struct task_ctx);
 111 } task_ctx_stor SEC(".maps");
 112
 113 struct cpu_ctx {
 114         u64     dsp_idx;        /* dispatch index */
 115         u64     dsp_cnt;        /* remaining count */
 116         u32     avg_weight;
 117         u32     cpuperf_target;
 118 };
 119
 120 struct {
 121         __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 122         __uint(max_entries, 1);
 123         __type(key, u32);
 124         __type(value, struct cpu_ctx);
 125 } cpu_ctx_stor SEC(".maps");
 126
 127 /* Statistics */
 128 u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
 129 u64 nr_core_sched_execed;
 130 u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
 131 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
 132 u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
 133
 134 static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 135 {
 136         s32 cpu;
 137
 138         if (p->nr_cpus_allowed == 1 ||
 139             scx_bpf_test_and_clear_cpu_idle(prev_cpu))
 140                 return prev_cpu;
 141
 142         cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 143         if (cpu >= 0)
 144                 return cpu;
 145
 146         return -1;
 147 }
 148
 149 static struct task_ctx *lookup_task_ctx(struct task_struct *p)
 150 {
 151         struct task_ctx *tctx;
 152
 153         if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
 154                 scx_bpf_error("task_ctx lookup failed");
 155                 return NULL;
 156         }
 157         return tctx;
 158 }
 159
 160 s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 161                    s32 prev_cpu, u64 wake_flags)
 162 {
 163         struct task_ctx *tctx;
 164         s32 cpu;
 165
 166         if (!(tctx = lookup_task_ctx(p)))
 167                 return -ESRCH;
 168
 169         cpu = pick_direct_dispatch_cpu(p, prev_cpu);
 170
 171         if (cpu >= 0) {
 172                 tctx->force_local = true;
 173                 return cpu;
 174         } else {
 175                 return prev_cpu;
 176         }
 177 }
 178
 179 static int weight_to_idx(u32 weight)
 180 {
 181         /* Coarsely map the compound weight to a FIFO. */
 182         if (weight <= 25)
 183                 return 0;
 184         else if (weight <= 50)
 185                 return 1;
 186         else if (weight < 200)
 187                 return 2;
 188         else if (weight < 400)
 189                 return 3;
 190         else
 191                 return 4;
 192 }
 193
 194 void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 195 {
 196         static u32 user_cnt, kernel_cnt;
 197         struct task_ctx *tctx;
 198         u32 pid = p->pid;
 199         int idx = weight_to_idx(p->scx.weight);
 200         void *ring;
 201         s32 cpu;
 202
 203         if (p->flags & PF_KTHREAD) {
 204                 if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
 205                         return;
 206         } else {
 207                 if (stall_user_nth && !(++user_cnt % stall_user_nth))
 208                         return;
 209         }
 210
 211         if (test_error_cnt && !--test_error_cnt)
 212                 scx_bpf_error("test triggering error");
 213
 214         if (!(tctx = lookup_task_ctx(p)))
 215                 return;
 216
 217         /*
 218          * All enqueued tasks must have their core_sched_seq updated for correct
 219          * core-sched ordering. Also, take a look at the end of qmap_dispatch().
 220          */
 221         tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
 222
 223         /*
 224          * If qmap_select_cpu() is telling us to or this is the last runnable
 225          * task on the CPU, enqueue locally.
 226          */
 227         if (tctx->force_local) {
 228                 tctx->force_local = false;
 229                 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
 230                 return;
 231         }
 232
 233         /* if select_cpu() wasn't called, try direct dispatch */
 234         if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
 235             (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
 236                 __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
 237                 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
 238                 return;
 239         }
 240
 241         /*
 242          * If the task was re-enqueued due to the CPU being preempted by a
 243          * higher priority scheduling class, just re-enqueue the task directly
 244          * on the global DSQ. As we want another CPU to pick it up, find and
 245          * kick an idle CPU.
 246          */
 247         if (enq_flags & SCX_ENQ_REENQ) {
 248                 s32 cpu;
 249
 250                 scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
 251                 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 252                 if (cpu >= 0)
 253                         scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 254                 return;
 255         }
 256
 257         ring = bpf_map_lookup_elem(&queue_arr, &idx);
 258         if (!ring) {
 259                 scx_bpf_error("failed to find ring %d", idx);
 260                 return;
 261         }
 262
 263         /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
 264         if (bpf_map_push_elem(ring, &pid, 0)) {
 265                 scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
 266                 return;
 267         }
 268
 269         if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
 270                 tctx->highpri = true;
 271                 __sync_fetch_and_add(&nr_highpri_queued, 1);
 272         }
 273         __sync_fetch_and_add(&nr_enqueued, 1);
 274 }
 275
 276 /*
 277  * The BPF queue map doesn't support removal and sched_ext can handle spurious
 278  * dispatches. qmap_dequeue() is only used to collect statistics.
 279  */
 280 void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
 281 {
 282         __sync_fetch_and_add(&nr_dequeued, 1);
 283         if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
 284                 __sync_fetch_and_add(&nr_core_sched_execed, 1);
 285 }
 286
 287 static void update_core_sched_head_seq(struct task_struct *p)
 288 {
 289         int idx = weight_to_idx(p->scx.weight);
 290         struct task_ctx *tctx;
 291
 292         if ((tctx = lookup_task_ctx(p)))
 293                 core_sched_head_seqs[idx] = tctx->core_sched_seq;
 294 }
 295
 296 /*
 297  * To demonstrate the use of scx_bpf_dsq_move(), implement silly selective
 298  * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
 299  * moving them to HIGHPRI_DSQ and then consuming them first. This makes minor
 300  * difference only when dsp_batch is larger than 1.
 301  *
 302  * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
 303  * non-rq-lock holding BPF programs. As demonstration, this function is called
 304  * from qmap_dispatch() and monitor_timerfn().
 305  */
 306 static bool dispatch_highpri(bool from_timer)
 307 {
 308         struct task_struct *p;
 309         s32 this_cpu = bpf_get_smp_processor_id();
 310
 311         /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
 312         bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
 313                 static u64 highpri_seq;
 314                 struct task_ctx *tctx;
 315
 316                 if (!(tctx = lookup_task_ctx(p)))
 317                         return false;
 318
 319                 if (tctx->highpri) {
 320                         /* exercise the set_*() and vtime interface too */
 321                         __COMPAT_scx_bpf_dsq_move_set_slice(
 322                                 BPF_FOR_EACH_ITER, slice_ns * 2);
 323                         __COMPAT_scx_bpf_dsq_move_set_vtime(
 324                                 BPF_FOR_EACH_ITER, highpri_seq++);
 325                         __COMPAT_scx_bpf_dsq_move_vtime(
 326                                 BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
 327                 }
 328         }
 329
 330         /*
 331          * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
 332          * is found.
 333          */
 334         bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
 335                 bool dispatched = false;
 336                 s32 cpu;
 337
 338                 if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
 339                         cpu = this_cpu;
 340                 else
 341                         cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
 342
 343                 if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
 344                                               SCX_DSQ_LOCAL_ON | cpu,
 345                                               SCX_ENQ_PREEMPT)) {
 346                         if (cpu == this_cpu) {
 347                                 dispatched = true;
 348                                 __sync_fetch_and_add(&nr_expedited_local, 1);
 349                         } else {
 350                                 __sync_fetch_and_add(&nr_expedited_remote, 1);
 351                         }
 352                         if (from_timer)
 353                                 __sync_fetch_and_add(&nr_expedited_from_timer, 1);
 354                 } else {
 355                         __sync_fetch_and_add(&nr_expedited_lost, 1);
 356                 }
 357
 358                 if (dispatched)
 359                         return true;
 360         }
 361
 362         return false;
 363 }
 364
 365 void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 366 {
 367         struct task_struct *p;
 368         struct cpu_ctx *cpuc;
 369         struct task_ctx *tctx;
 370         u32 zero = 0, batch = dsp_batch ?: 1;
 371         void *fifo;
 372         s32 i, pid;
 373
 374         if (dispatch_highpri(false))
 375                 return;
 376
 377         if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
 378                 return;
 379
 380         if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
 381                 /*
 382                  * PID 2 should be kthreadd which should mostly be idle and off
 383                  * the scheduler. Let's keep dispatching it to force the kernel
 384                  * to call this function over and over again.
 385                  */
 386                 p = bpf_task_from_pid(2);
 387                 if (p) {
 388                         scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
 389                         bpf_task_release(p);
 390                         return;
 391                 }
 392         }
 393
 394         if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
 395                 scx_bpf_error("failed to look up cpu_ctx");
 396                 return;
 397         }
 398
 399         for (i = 0; i < 5; i++) {
 400                 /* Advance the dispatch cursor and pick the fifo. */
 401                 if (!cpuc->dsp_cnt) {
 402                         cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
 403                         cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
 404                 }
 405
 406                 fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
 407                 if (!fifo) {
 408                         scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
 409                         return;
 410                 }
 411
 412                 /* Dispatch or advance. */
 413                 bpf_repeat(BPF_MAX_LOOPS) {
 414                         struct task_ctx *tctx;
 415
 416                         if (bpf_map_pop_elem(fifo, &pid))
 417                                 break;
 418
 419                         p = bpf_task_from_pid(pid);
 420                         if (!p)
 421                                 continue;
 422
 423                         if (!(tctx = lookup_task_ctx(p))) {
 424                                 bpf_task_release(p);
 425                                 return;
 426                         }
 427
 428                         if (tctx->highpri)
 429                                 __sync_fetch_and_sub(&nr_highpri_queued, 1);
 430
 431                         update_core_sched_head_seq(p);
 432                         __sync_fetch_and_add(&nr_dispatched, 1);
 433
 434                         scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
 435                         bpf_task_release(p);
 436
 437                         batch--;
 438                         cpuc->dsp_cnt--;
 439                         if (!batch || !scx_bpf_dispatch_nr_slots()) {
 440                                 if (dispatch_highpri(false))
 441                                         return;
 442                                 scx_bpf_dsq_move_to_local(SHARED_DSQ);
 443                                 return;
 444                         }
 445                         if (!cpuc->dsp_cnt)
 446                                 break;
 447                 }
 448
 449                 cpuc->dsp_cnt = 0;
 450         }
 451
 452         /*
 453          * No other tasks. @prev will keep running. Update its core_sched_seq as
 454          * if the task were enqueued and dispatched immediately.
 455          */
 456         if (prev) {
 457                 tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
 458                 if (!tctx) {
 459                         scx_bpf_error("task_ctx lookup failed");
 460                         return;
 461                 }
 462
 463                 tctx->core_sched_seq =
 464                         core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
 465         }
 466 }
 467
 468 void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
 469 {
 470         struct cpu_ctx *cpuc;
 471         u32 zero = 0;
 472         int idx;
 473
 474         if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
 475                 scx_bpf_error("failed to look up cpu_ctx");
 476                 return;
 477         }
 478
 479         /*
 480          * Use the running avg of weights to select the target cpuperf level.
 481          * This is a demonstration of the cpuperf feature rather than a
 482          * practical strategy to regulate CPU frequency.
 483          */
 484         cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
 485         idx = weight_to_idx(cpuc->avg_weight);
 486         cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
 487
 488         scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
 489 }
 490
 491 /*
 492  * The distance from the head of the queue scaled by the weight of the queue.
 493  * The lower the number, the older the task and the higher the priority.
 494  */
 495 static s64 task_qdist(struct task_struct *p)
 496 {
 497         int idx = weight_to_idx(p->scx.weight);
 498         struct task_ctx *tctx;
 499         s64 qdist;
 500
 501         tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
 502         if (!tctx) {
 503                 scx_bpf_error("task_ctx lookup failed");
 504                 return 0;
 505         }
 506
 507         qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
 508
 509         /*
 510          * As queue index increments, the priority doubles. The queue w/ index 3
 511          * is dispatched twice more frequently than 2. Reflect the difference by
 512          * scaling qdists accordingly. Note that the shift amount needs to be
 513          * flipped depending on the sign to avoid flipping priority direction.
 514          */
 515         if (qdist >= 0)
 516                 return qdist << (4 - idx);
 517         else
 518                 return qdist << idx;
 519 }
 520
 521 /*
 522  * This is called to determine the task ordering when core-sched is picking
 523  * tasks to execute on SMT siblings and should encode about the same ordering as
 524  * the regular scheduling path. Use the priority-scaled distances from the head
 525  * of the queues to compare the two tasks which should be consistent with the
 526  * dispatch path behavior.
 527  */
 528 bool BPF_STRUCT_OPS(qmap_core_sched_before,
 529                     struct task_struct *a, struct task_struct *b)
 530 {
 531         return task_qdist(a) > task_qdist(b);
 532 }
 533
 534 void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
 535 {
 536         u32 cnt;
 537
 538         /*
 539          * Called when @cpu is taken by a higher priority scheduling class. This
 540          * makes @cpu no longer available for executing sched_ext tasks. As we
 541          * don't want the tasks in @cpu's local dsq to sit there until @cpu
 542          * becomes available again, re-enqueue them into the global dsq. See
 543          * %SCX_ENQ_REENQ handling in qmap_enqueue().
 544          */
 545         cnt = scx_bpf_reenqueue_local();
 546         if (cnt)
 547                 __sync_fetch_and_add(&nr_reenqueued, cnt);
 548 }
 549
 550 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
 551                    struct scx_init_task_args *args)
 552 {
 553         if (p->tgid == disallow_tgid)
 554                 p->scx.disallow = true;
 555
 556         /*
 557          * @p is new. Let's ensure that its task_ctx is available. We can sleep
 558          * in this function and the following will automatically use GFP_KERNEL.
 559          */
 560         if (bpf_task_storage_get(&task_ctx_stor, p, 0,
 561                                  BPF_LOCAL_STORAGE_GET_F_CREATE))
 562                 return 0;
 563         else
 564                 return -ENOMEM;
 565 }
 566
 567 void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
 568 {
 569         s32 i, pid;
 570
 571         if (suppress_dump)
 572                 return;
 573
 574         bpf_for(i, 0, 5) {
 575                 void *fifo;
 576
 577                 if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
 578                         return;
 579
 580                 scx_bpf_dump("QMAP FIFO[%d]:", i);
 581                 bpf_repeat(4096) {
 582                         if (bpf_map_pop_elem(fifo, &pid))
 583                                 break;
 584                         scx_bpf_dump(" %d", pid);
 585                 }
 586                 scx_bpf_dump("\n");
 587         }
 588 }
 589
 590 void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
 591 {
 592         u32 zero = 0;
 593         struct cpu_ctx *cpuc;
 594
 595         if (suppress_dump || idle)
 596                 return;
 597         if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
 598                 return;
 599
 600         scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
 601                      cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
 602                      cpuc->cpuperf_target);
 603 }
 604
 605 void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
 606 {
 607         struct task_ctx *taskc;
 608
 609         if (suppress_dump)
 610                 return;
 611         if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
 612                 return;
 613
 614         scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
 615                      taskc->force_local, taskc->core_sched_seq);
 616 }
 617
 618 /*
 619  * Print out the online and possible CPU map using bpf_printk() as a
 620  * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
 621  */
 622 static void print_cpus(void)
 623 {
 624         const struct cpumask *possible, *online;
 625         s32 cpu;
 626         char buf[128] = "", *p;
 627         int idx;
 628
 629         possible = scx_bpf_get_possible_cpumask();
 630         online = scx_bpf_get_online_cpumask();
 631
 632         idx = 0;
 633         bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
 634                 if (!(p = MEMBER_VPTR(buf, [idx++])))
 635                         break;
 636                 if (bpf_cpumask_test_cpu(cpu, online))
 637                         *p++ = 'O';
 638                 else if (bpf_cpumask_test_cpu(cpu, possible))
 639                         *p++ = 'X';
 640                 else
 641                         *p++ = ' ';
 642
 643                 if ((cpu & 7) == 7) {
 644                         if (!(p = MEMBER_VPTR(buf, [idx++])))
 645                                 break;
 646                         *p++ = '|';
 647                 }
 648         }
 649         buf[sizeof(buf) - 1] = '\0';
 650
 651         scx_bpf_put_cpumask(online);
 652         scx_bpf_put_cpumask(possible);
 653
 654         bpf_printk("CPUS: |%s", buf);
 655 }
 656
 657 void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
 658 {
 659         bpf_printk("CPU %d coming online", cpu);
 660         /* @cpu is already online at this point */
 661         print_cpus();
 662 }
 663
 664 void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
 665 {
 666         bpf_printk("CPU %d going offline", cpu);
 667         /* @cpu is still online at this point */
 668         print_cpus();
 669 }
 670
 671 struct monitor_timer {
 672         struct bpf_timer timer;
 673 };
 674
 675 struct {
 676         __uint(type, BPF_MAP_TYPE_ARRAY);
 677         __uint(max_entries, 1);
 678         __type(key, u32);
 679         __type(value, struct monitor_timer);
 680 } monitor_timer SEC(".maps");
 681
 682 /*
 683  * Print out the min, avg and max performance levels of CPUs every second to
 684  * demonstrate the cpuperf interface.
 685  */
 686 static void monitor_cpuperf(void)
 687 {
 688         u32 zero = 0, nr_cpu_ids;
 689         u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
 690         u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
 691         const struct cpumask *online;
 692         int i, nr_online_cpus = 0;
 693
 694         nr_cpu_ids = scx_bpf_nr_cpu_ids();
 695         online = scx_bpf_get_online_cpumask();
 696
 697         bpf_for(i, 0, nr_cpu_ids) {
 698                 struct cpu_ctx *cpuc;
 699                 u32 cap, cur;
 700
 701                 if (!bpf_cpumask_test_cpu(i, online))
 702                         continue;
 703                 nr_online_cpus++;
 704
 705                 /* collect the capacity and current cpuperf */
 706                 cap = scx_bpf_cpuperf_cap(i);
 707                 cur = scx_bpf_cpuperf_cur(i);
 708
 709                 cur_min = cur < cur_min ? cur : cur_min;
 710                 cur_max = cur > cur_max ? cur : cur_max;
 711
 712                 /*
 713                  * $cur is relative to $cap. Scale it down accordingly so that
 714                  * it's in the same scale as other CPUs and $cur_sum/$cap_sum
 715                  * makes sense.
 716                  */
 717                 cur_sum += cur * cap / SCX_CPUPERF_ONE;
 718                 cap_sum += cap;
 719
 720                 if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
 721                         scx_bpf_error("failed to look up cpu_ctx");
 722                         goto out;
 723                 }
 724
 725                 /* collect target */
 726                 cur = cpuc->cpuperf_target;
 727                 target_sum += cur;
 728                 target_min = cur < target_min ? cur : target_min;
 729                 target_max = cur > target_max ? cur : target_max;
 730         }
 731
 732         cpuperf_min = cur_min;
 733         cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
 734         cpuperf_max = cur_max;
 735
 736         cpuperf_target_min = target_min;
 737         cpuperf_target_avg = target_sum / nr_online_cpus;
 738         cpuperf_target_max = target_max;
 739 out:
 740         scx_bpf_put_cpumask(online);
 741 }
 742
 743 /*
 744  * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
 745  * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
 746  * see meaningful dumps in the trace pipe.
 747  */
 748 static void dump_shared_dsq(void)
 749 {
 750         struct task_struct *p;
 751         s32 nr;
 752
 753         if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
 754                 return;
 755
 756         bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
 757
 758         bpf_rcu_read_lock();
 759         bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
 760                 bpf_printk("%s[%d]", p->comm, p->pid);
 761         bpf_rcu_read_unlock();
 762 }
 763
 764 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 765 {
 766         bpf_rcu_read_lock();
 767         dispatch_highpri(true);
 768         bpf_rcu_read_unlock();
 769
 770         monitor_cpuperf();
 771
 772         if (print_shared_dsq)
 773                 dump_shared_dsq();
 774
 775         bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
 776         return 0;
 777 }
 778
 779 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 780 {
 781         u32 key = 0;
 782         struct bpf_timer *timer;
 783         s32 ret;
 784
 785         print_cpus();
 786
 787         ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
 788         if (ret)
 789                 return ret;
 790
 791         ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
 792         if (ret)
 793                 return ret;
 794
 795         timer = bpf_map_lookup_elem(&monitor_timer, &key);
 796         if (!timer)
 797                 return -ESRCH;
 798
 799         bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
 800         bpf_timer_set_callback(timer, monitor_timerfn);
 801
 802         return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
 803 }
 804
 805 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
 806 {
 807         UEI_RECORD(uei, ei);
 808 }
 809
 810 SCX_OPS_DEFINE(qmap_ops,
 811                .select_cpu              = (void *)qmap_select_cpu,
 812                .enqueue                 = (void *)qmap_enqueue,
 813                .dequeue                 = (void *)qmap_dequeue,
 814                .dispatch                = (void *)qmap_dispatch,
 815                .tick                    = (void *)qmap_tick,
 816                .core_sched_before       = (void *)qmap_core_sched_before,
 817                .cpu_release             = (void *)qmap_cpu_release,
 818                .init_task               = (void *)qmap_init_task,
 819                .dump                    = (void *)qmap_dump,
 820                .dump_cpu                = (void *)qmap_dump_cpu,
 821                .dump_task               = (void *)qmap_dump_task,
 822                .cpu_online              = (void *)qmap_cpu_online,
 823                .cpu_offline             = (void *)qmap_cpu_offline,
 824                .init                    = (void *)qmap_init,
 825                .exit                    = (void *)qmap_exit,
 826                .timeout_ms              = 5000U,
 827                .name                    = "qmap");