arch/s390/kernel/hiperdispatch.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright IBM Corp. 2024
   4  */
   5
   6 #define KMSG_COMPONENT "hd"
   7 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
   8
   9 /*
  10  * Hiperdispatch:
  11  * Dynamically calculates the optimum number of high capacity COREs
  12  * by considering the state the system is in. When hiperdispatch decides
  13  * that a capacity update is necessary, it schedules a topology update.
  14  * During topology updates the CPU capacities are always re-adjusted.
  15  *
  16  * There are two places where CPU capacities are accessed within
  17  * hiperdispatch:
  18  * -> hiperdispatch's recurring work function reads CPU capacities to
  19  *    determine the high capacity CPU count.
  20  * -> during a topology update hiperdispatch's adjustment function
  21  *    updates CPU capacities.
  22  * These two can run on different CPUs in parallel which can cause
  23  * hiperdispatch to make wrong decisions. This can potentially cause
  24  * some overhead by leading to extra rebuild_sched_domains() calls
  25  * for correction. Access to capacities within hiperdispatch has to be
  26  * serialized to prevent the overhead.
  27  *
  28  * Hiperdispatch decision making revolves around steal time.
  29  * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
  30  * crosses the threshold value hiperdispatch falls back to giving high
  31  * capacities to entitled CPUs. When steal time drops below the
  32  * threshold boundary, hiperdispatch utilizes all CPUs by giving all
  33  * of them high capacity.
  34  *
  35  * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread
  36  * performance. Comparing the throughput of:
  37  * - a single CORE, with N threads, running N tasks
  38  * - N separate COREs running N tasks,
  39  * using individual COREs for individual tasks yields better
  40  * performance. This performance difference is roughly 30% (can change
  41  * between machine generations).
  42  *
  43  * Hiperdispatch tries to hint the scheduler to use individual COREs for
  44  * each task, as long as steal time on those COREs is less than 30%,
  45  * thereby delaying the throughput loss caused by using SMP threads.
  46  */
  47
  48 #include <linux/cpumask.h>
  49 #include <linux/debugfs.h>
  50 #include <linux/device.h>
  51 #include <linux/kernel_stat.h>
  52 #include <linux/kstrtox.h>
  53 #include <linux/ktime.h>
  54 #include <linux/sysctl.h>
  55 #include <linux/types.h>
  56 #include <linux/workqueue.h>
  57 #include <asm/hiperdispatch.h>
  58 #include <asm/setup.h>
  59 #include <asm/smp.h>
  60 #include <asm/topology.h>
  61
  62 #define CREATE_TRACE_POINTS
  63 #include <asm/trace/hiperdispatch.h>
  64
  65 #define HD_DELAY_FACTOR                 (4)
  66 #define HD_DELAY_INTERVAL               (HZ / 4)
  67 #define HD_STEAL_THRESHOLD              30
  68 #define HD_STEAL_AVG_WEIGHT             16
  69
  70 static cpumask_t hd_vl_coremask;        /* Mask containing all vertical low COREs */
  71 static cpumask_t hd_vmvl_cpumask;       /* Mask containing vertical medium and low CPUs */
  72 static int hd_high_capacity_cores;      /* Current CORE count with high capacity */
  73 static int hd_entitled_cores;           /* Total vertical high and medium CORE count */
  74 static int hd_online_cores;             /* Current online CORE count */
  75
  76 static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */
  77 static unsigned long hd_high_time;      /* Total time spent while all cpus have high capacity */
  78 static unsigned long hd_low_time;       /* Total time spent while vl cpus have low capacity */
  79 static atomic64_t hd_adjustments;       /* Total occurrence count of hiperdispatch adjustments */
  80
  81 static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
  82 static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
  83 static int hd_enabled;
  84
  85 static void hd_capacity_work_fn(struct work_struct *work);
  86 static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
  87
  88 static int hd_set_hiperdispatch_mode(int enable)
  89 {
  90         if (!MACHINE_HAS_TOPOLOGY)
  91                 enable = 0;
  92         if (hd_enabled == enable)
  93                 return 0;
  94         hd_enabled = enable;
  95         return 1;
  96 }
  97
  98 void hd_reset_state(void)
  99 {
 100         cpumask_clear(&hd_vl_coremask);
 101         cpumask_clear(&hd_vmvl_cpumask);
 102         hd_entitled_cores = 0;
 103         hd_online_cores = 0;
 104 }
 105
 106 void hd_add_core(int cpu)
 107 {
 108         const struct cpumask *siblings;
 109         int polarization;
 110
 111         hd_online_cores++;
 112         polarization = smp_cpu_get_polarization(cpu);
 113         siblings = topology_sibling_cpumask(cpu);
 114         switch (polarization) {
 115         case POLARIZATION_VH:
 116                 hd_entitled_cores++;
 117                 break;
 118         case POLARIZATION_VM:
 119                 hd_entitled_cores++;
 120                 cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
 121                 break;
 122         case POLARIZATION_VL:
 123                 cpumask_set_cpu(cpu, &hd_vl_coremask);
 124                 cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
 125                 break;
 126         }
 127 }
 128
 129 /* Serialize update and read operations of debug counters. */
 130 static DEFINE_MUTEX(hd_counter_mutex);
 131
 132 static void hd_update_times(void)
 133 {
 134         static ktime_t prev;
 135         ktime_t now;
 136
 137         /*
 138          * Check if hiperdispatch is active, if not set the prev to 0.
 139          * This way it is possible to differentiate the first update iteration after
 140          * enabling hiperdispatch.
 141          */
 142         if (hd_entitled_cores == 0 || hd_enabled == 0) {
 143                 prev = ktime_set(0, 0);
 144                 return;
 145         }
 146         now = ktime_get();
 147         if (ktime_after(prev, 0)) {
 148                 if (hd_high_capacity_cores == hd_online_cores)
 149                         hd_high_time += ktime_ms_delta(now, prev);
 150                 else
 151                         hd_low_time += ktime_ms_delta(now, prev);
 152         }
 153         prev = now;
 154 }
 155
 156 static void hd_update_capacities(void)
 157 {
 158         int cpu, upscaling_cores;
 159         unsigned long capacity;
 160
 161         upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
 162         capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
 163         hd_high_capacity_cores = hd_entitled_cores;
 164         for_each_cpu(cpu, &hd_vl_coremask) {
 165                 smp_set_core_capacity(cpu, capacity);
 166                 if (capacity != CPU_CAPACITY_HIGH)
 167                         continue;
 168                 hd_high_capacity_cores++;
 169                 upscaling_cores--;
 170                 if (upscaling_cores == 0)
 171                         capacity = CPU_CAPACITY_LOW;
 172         }
 173 }
 174
 175 void hd_disable_hiperdispatch(void)
 176 {
 177         cancel_delayed_work_sync(&hd_capacity_work);
 178         hd_high_capacity_cores = hd_online_cores;
 179         hd_previous_steal = 0;
 180 }
 181
 182 int hd_enable_hiperdispatch(void)
 183 {
 184         mutex_lock(&hd_counter_mutex);
 185         hd_update_times();
 186         mutex_unlock(&hd_counter_mutex);
 187         if (hd_enabled == 0)
 188                 return 0;
 189         if (hd_entitled_cores == 0)
 190                 return 0;
 191         if (hd_online_cores <= hd_entitled_cores)
 192                 return 0;
 193         mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
 194         hd_update_capacities();
 195         return 1;
 196 }
 197
 198 static unsigned long hd_steal_avg(unsigned long new)
 199 {
 200         static unsigned long steal;
 201
 202         steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
 203         return steal;
 204 }
 205
 206 static unsigned long hd_calculate_steal_percentage(void)
 207 {
 208         unsigned long time_delta, steal_delta, steal, percentage;
 209         static ktime_t prev;
 210         int cpus, cpu;
 211         ktime_t now;
 212
 213         cpus = 0;
 214         steal = 0;
 215         percentage = 0;
 216         for_each_cpu(cpu, &hd_vmvl_cpumask) {
 217                 steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
 218                 cpus++;
 219         }
 220         /*
 221          * If there is no vertical medium and low CPUs steal time
 222          * is 0 as vertical high CPUs shouldn't experience steal time.
 223          */
 224         if (cpus == 0)
 225                 return percentage;
 226         now = ktime_get();
 227         time_delta = ktime_to_ns(ktime_sub(now, prev));
 228         if (steal > hd_previous_steal && hd_previous_steal != 0) {
 229                 steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
 230                 percentage = steal_delta / cpus;
 231         }
 232         hd_previous_steal = steal;
 233         prev = now;
 234         return percentage;
 235 }
 236
 237 static void hd_capacity_work_fn(struct work_struct *work)
 238 {
 239         unsigned long steal_percentage, new_cores;
 240
 241         mutex_lock(&smp_cpu_state_mutex);
 242         /*
 243          * If online cores are less or equal to entitled cores hiperdispatch
 244          * does not need to make any adjustments, call a topology update to
 245          * disable hiperdispatch.
 246          * Normally this check is handled on topology update, but during cpu
 247          * unhotplug, topology and cpu mask updates are done in reverse
 248          * order, causing hd_enable_hiperdispatch() to get stale data.
 249          */
 250         if (hd_online_cores <= hd_entitled_cores) {
 251                 topology_schedule_update();
 252                 mutex_unlock(&smp_cpu_state_mutex);
 253                 return;
 254         }
 255         steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
 256         if (steal_percentage < hd_steal_threshold)
 257                 new_cores = hd_online_cores;
 258         else
 259                 new_cores = hd_entitled_cores;
 260         if (hd_high_capacity_cores != new_cores) {
 261                 trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
 262                 hd_high_capacity_cores = new_cores;
 263                 atomic64_inc(&hd_adjustments);
 264                 topology_schedule_update();
 265         }
 266         trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
 267         mutex_unlock(&smp_cpu_state_mutex);
 268         schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
 269 }
 270
 271 static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
 272                                      void *buffer, size_t *lenp, loff_t *ppos)
 273 {
 274         int hiperdispatch;
 275         int rc;
 276         struct ctl_table ctl_entry = {
 277                 .procname       = ctl->procname,
 278                 .data           = &hiperdispatch,
 279                 .maxlen         = sizeof(int),
 280                 .extra1         = SYSCTL_ZERO,
 281                 .extra2         = SYSCTL_ONE,
 282         };
 283
 284         hiperdispatch = hd_enabled;
 285         rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
 286         if (rc < 0 || !write)
 287                 return rc;
 288         mutex_lock(&smp_cpu_state_mutex);
 289         if (hd_set_hiperdispatch_mode(hiperdispatch))
 290                 topology_schedule_update();
 291         mutex_unlock(&smp_cpu_state_mutex);
 292         return 0;
 293 }
 294
 295 static struct ctl_table hiperdispatch_ctl_table[] = {
 296         {
 297                 .procname       = "hiperdispatch",
 298                 .mode           = 0644,
 299                 .proc_handler   = hiperdispatch_ctl_handler,
 300         },
 301 };
 302
 303 static ssize_t hd_steal_threshold_show(struct device *dev,
 304                                        struct device_attribute *attr,
 305                                        char *buf)
 306 {
 307         return sysfs_emit(buf, "%u\n", hd_steal_threshold);
 308 }
 309
 310 static ssize_t hd_steal_threshold_store(struct device *dev,
 311                                         struct device_attribute *attr,
 312                                         const char *buf,
 313                                         size_t count)
 314 {
 315         unsigned int val;
 316         int rc;
 317
 318         rc = kstrtouint(buf, 0, &val);
 319         if (rc)
 320                 return rc;
 321         if (val > 100)
 322                 return -ERANGE;
 323         hd_steal_threshold = val;
 324         return count;
 325 }
 326
 327 static DEVICE_ATTR_RW(hd_steal_threshold);
 328
 329 static ssize_t hd_delay_factor_show(struct device *dev,
 330                                     struct device_attribute *attr,
 331                                     char *buf)
 332 {
 333         return sysfs_emit(buf, "%u\n", hd_delay_factor);
 334 }
 335
 336 static ssize_t hd_delay_factor_store(struct device *dev,
 337                                      struct device_attribute *attr,
 338                                      const char *buf,
 339                                      size_t count)
 340 {
 341         unsigned int val;
 342         int rc;
 343
 344         rc = kstrtouint(buf, 0, &val);
 345         if (rc)
 346                 return rc;
 347         if (!val)
 348                 return -ERANGE;
 349         hd_delay_factor = val;
 350         return count;
 351 }
 352
 353 static DEVICE_ATTR_RW(hd_delay_factor);
 354
 355 static struct attribute *hd_attrs[] = {
 356         &dev_attr_hd_steal_threshold.attr,
 357         &dev_attr_hd_delay_factor.attr,
 358         NULL,
 359 };
 360
 361 static const struct attribute_group hd_attr_group = {
 362         .name  = "hiperdispatch",
 363         .attrs = hd_attrs,
 364 };
 365
 366 static int hd_greedy_time_get(void *unused, u64 *val)
 367 {
 368         mutex_lock(&hd_counter_mutex);
 369         hd_update_times();
 370         *val = hd_high_time;
 371         mutex_unlock(&hd_counter_mutex);
 372         return 0;
 373 }
 374
 375 DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");
 376
 377 static int hd_conservative_time_get(void *unused, u64 *val)
 378 {
 379         mutex_lock(&hd_counter_mutex);
 380         hd_update_times();
 381         *val = hd_low_time;
 382         mutex_unlock(&hd_counter_mutex);
 383         return 0;
 384 }
 385
 386 DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");
 387
 388 static int hd_adjustment_count_get(void *unused, u64 *val)
 389 {
 390         *val = atomic64_read(&hd_adjustments);
 391         return 0;
 392 }
 393
 394 DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");
 395
 396 static void __init hd_create_debugfs_counters(void)
 397 {
 398         struct dentry *dir;
 399
 400         dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
 401         debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
 402         debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
 403         debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
 404 }
 405
 406 static void __init hd_create_attributes(void)
 407 {
 408         struct device *dev;
 409
 410         dev = bus_get_dev_root(&cpu_subsys);
 411         if (!dev)
 412                 return;
 413         if (sysfs_create_group(&dev->kobj, &hd_attr_group))
 414                 pr_warn("Unable to create hiperdispatch attribute group\n");
 415         put_device(dev);
 416 }
 417
 418 static int __init hd_init(void)
 419 {
 420         if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
 421                 hd_set_hiperdispatch_mode(1);
 422                 topology_schedule_update();
 423         }
 424         if (!register_sysctl("s390", hiperdispatch_ctl_table))
 425                 pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
 426         hd_create_debugfs_counters();
 427         hd_create_attributes();
 428         return 0;
 429 }
 430 late_initcall(hd_init);