// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <[email protected]>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
        seqcount_t      seq;
        unsigned long   last_update;
        u64             acnt;
        u64             mcnt;
        u64             aperf;
        u64             mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
        .seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{
        u64 aperf, mperf;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);

        this_cpu_write(cpu_samples.aperf, aperf);
        this_cpu_write(cpu_samples.mperf, mperf);
}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */
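/*
 * Worked example with illustrative numbers (assumed, not taken from any
 * particular part): with freq_base = 2000 MHz and, over one sampling
 * window, delta_APERF = 3000 while delta_MPERF = 2000, we get
 * BusyMHz = 3000 / 2000 * 2000 = 3000 MHz, i.e. the CPU averaged 3 GHz
 * while not idle. With freq_max set to a hypothetical 4C turbo level of
 * 2800 MHz, the resulting freq_curr / freq_max ratio exceeds 1 and is
 * clipped to 1.
 */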

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{
        arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
                                        arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{
        u64 misc_en;
        int err;

        err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
        if (err)
                return false;

        return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        int err;

        err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 16) & 0x3F;     /* max P state */
        *turbo_freq = *turbo_freq & 0x3F;           /* 1C turbo    */

        return true;
}

#define X86_MATCH(vfm)                                          \
        X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
        X86_MATCH(INTEL_XEON_PHI_KNL),
        X86_MATCH(INTEL_XEON_PHI_KNM),
        {}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
        X86_MATCH(INTEL_SKYLAKE_X),
        {}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
        X86_MATCH(INTEL_ATOM_GOLDMONT),
        X86_MATCH(INTEL_ATOM_GOLDMONT_D),
        X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
        {}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
                                          int num_delta_fratio)
{
        int fratio, delta_fratio, found;
        int err, i;
        u64 msr;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        fratio = (msr >> 8) & 0xFF;
        i = 16;
        found = 0;
        do {
                if (found >= num_delta_fratio) {
                        *turbo_freq = fratio;
                        return true;
                }

                delta_fratio = (msr >> (i + 5)) & 0x7;

                if (delta_fratio) {
                        found += 1;
                        fratio -= delta_fratio;
                }

                i += 8;
        } while (i < 64);

        return true;
}

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
        u64 ratios, counts;
        u32 group_size;
        int err, i;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;      /* max P state */

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
        if (err)
                return false;

        for (i = 0; i < 64; i += 8) {
                group_size = (counts >> i) & 0xFF;
                if (group_size >= size) {
                        *turbo_freq = (ratios >> i) & 0xFF;
                        return true;
                }
        }

        return false;
}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
        u64 msr;
        int err;

        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
        if (err)
                return false;

        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
        if (err)
                return false;

        *base_freq = (*base_freq >> 8) & 0xFF;    /* max P state */
        *turbo_freq = (msr >> 24) & 0xFF;         /* 4C turbo    */

        /* The CPU may have less than 4 cores */
        if (!*turbo_freq)
                *turbo_freq = msr & 0xFF;         /* 1C turbo    */

        return true;
}
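/*
 * Example for core_set_max_freq_ratio() above, with assumed MSR contents
 * (not from real hardware): if MSR_PLATFORM_INFO bits 15:8 read 24 and
 * MSR_TURBO_RATIO_LIMIT bits 31:24 read 30, then base_freq = 24 and
 * turbo_freq = 30, i.e. the 4C turbo level is 1.25x the max non-turbo
 * P-state.
 */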

static bool __init intel_set_max_freq_ratio(void)
{
        u64 base_freq, turbo_freq;
        u64 turbo_ratio;

        if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
            knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
                goto out;

        if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
                goto out;

        if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
                goto out;

        return false;

out:
        /*
         * Some hypervisors advertise X86_FEATURE_APERFMPERF
         * but then fill all MSRs with zeroes.
         * Some CPUs have turbo boost but don't declare any turbo ratio
         * in MSR_TURBO_RATIO_LIMIT.
         */
        if (!base_freq || !turbo_freq) {
                pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
                return false;
        }

        turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
        if (!turbo_ratio) {
                pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
                return false;
        }

        arch_turbo_freq_ratio = turbo_ratio;
        arch_set_max_freq_ratio(turbo_disabled());

        return true;
}
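/*
 * Continuing the assumed numbers above (base ratio 24, 4C turbo ratio 30):
 * turbo_ratio = div_u64(30 * SCHED_CAPACITY_SCALE, 24) = 1280, so
 * arch_max_freq_ratio reports freq_max as 1.25x freq_base, times 1024.
 */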

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
        .resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
        register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
        if (static_branch_unlikely(&arch_scale_freq_key)) {
                WARN_ON_ONCE(1);
                return;
        }
        static_branch_enable_cpuslocked(&arch_scale_freq_key);
        register_freq_invariance_syscore_ops();
        pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
        arch_turbo_freq_ratio = ratio;
        arch_set_max_freq_ratio(turbo_disabled);
        freq_invariance_enable();
}

static void __init bp_init_freq_invariance(void)
{
        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
                return;

        if (intel_set_max_freq_ratio()) {
                guard(cpus_read_lock)();
                freq_invariance_enable();
        }
}

static void disable_freq_invariance_workfn(struct work_struct *work)
{
        int cpu;

        static_branch_disable(&arch_scale_freq_key);

        /*
         * Set arch_freq_scale to a default value on all CPUs.
         * This negates the effect of scaling.
         */
        for_each_possible_cpu(cpu)
                per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
                    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
        unsigned long capacity;
        unsigned long freq_ratio;
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{
        int cpu;

        if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
                WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
                return true;
        }

        arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
        if (!arch_cpu_scale)
                return false;

        for_each_possible_cpu(cpu) {
                per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
                per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
        }

        static_branch_enable(&arch_hybrid_cap_scale_key);

        pr_info("Hybrid CPU capacity scaling enabled\n");

        return true;
}

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter.  Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
                           unsigned long cap_freq, unsigned long base_freq)
{
        if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
                WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
                           div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
                WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
                           div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
        } else {
                WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
        }
}
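/*
 * Hedged usage sketch (hypothetical caller, e.g. a cpufreq driver; the
 * per-CPU cap/freq arrays are assumptions) following the calling order
 * documented above:
 *
 *      if (arch_enable_hybrid_capacity_scale()) {
 *              for_each_possible_cpu(cpu)
 *                      arch_set_cpu_capacity(cpu, cap[cpu], max_cap,
 *                                            cap_freq[cpu], base_freq[cpu]);
 *              arch_rebuild_sched_domains();
 *      }
 */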

unsigned long arch_scale_cpu_capacity(int cpu)
{
        if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
                return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);

        return SCHED_CAPACITY_SCALE;
}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

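/*
 * scale_freq_tick() below computes, in fixed point,
 *
 *   arch_freq_scale = freq_curr / freq_max * SCHED_CAPACITY_SCALE
 *                   = (acnt << 2 * SCHED_CAPACITY_SHIFT) / (mcnt * freq_ratio)
 *
 * because freq_ratio already carries one factor of SCHED_CAPACITY_SCALE.
 * Illustrative numbers (assumed): acnt/mcnt = 1 (running flat out at
 * freq_base) and freq_ratio = 1280 (freq_max = 1.25 * freq_base) give
 * 1048576 / 1280 = 819, i.e. ~80% of SCHED_CAPACITY_SCALE.
 */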
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
        u64 freq_scale, freq_ratio;

        if (!arch_scale_freq_invariant())
                return;

        if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
                goto error;

        if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
                freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
        else
                freq_ratio = arch_max_freq_ratio;

        if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
                goto error;

        freq_scale = div64_u64(acnt, mcnt);
        if (!freq_scale)
                goto error;

        if (freq_scale > SCHED_CAPACITY_SCALE)
                freq_scale = SCHED_CAPACITY_SCALE;

        this_cpu_write(arch_freq_scale, freq_scale);
        return;

error:
        pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
        schedule_work(&disable_freq_invariance_work);
}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

void arch_scale_freq_tick(void)
{
        struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
        u64 acnt, mcnt, aperf, mperf;

        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                return;

        rdmsrl(MSR_IA32_APERF, aperf);
        rdmsrl(MSR_IA32_MPERF, mperf);
        acnt = aperf - s->aperf;
        mcnt = mperf - s->mperf;

        s->aperf = aperf;
        s->mperf = mperf;

        raw_write_seqcount_begin(&s->seq);
        s->last_update = jiffies;
        s->acnt = acnt;
        s->mcnt = mcnt;
        raw_write_seqcount_end(&s->seq);

        scale_freq_tick(acnt, mcnt);
}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE  ((unsigned long)HZ / 50)
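/*
 * Quick sanity check of the arithmetic: with HZ=1000 this is 20 jiffies,
 * with HZ=250 it is 5 jiffies; either way 20 ms of wall time.
 */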

unsigned int arch_freq_get_on_cpu(int cpu)
{
        struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
        unsigned int seq, freq;
        unsigned long last;
        u64 acnt, mcnt;

        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                goto fallback;

        do {
                seq = raw_read_seqcount_begin(&s->seq);
                last = s->last_update;
                acnt = s->acnt;
                mcnt = s->mcnt;
        } while (read_seqcount_retry(&s->seq, seq));

        /*
         * Bail on invalid count and when the last update was too long ago,
         * which covers idle and NOHZ full CPUs.
         */
        if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
                goto fallback;

        return div64_u64((cpu_khz * acnt), mcnt);

fallback:
        freq = cpufreq_quick_get(cpu);
        return freq ? freq : cpu_khz;
}
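/*
 * Illustrative numbers for the calculation above (assumed, not measured):
 * with cpu_khz = 2400000 (a 2.4 GHz base clock) and acnt/mcnt = 5/4 over
 * the last tick, the reported frequency is 2400000 * 5 / 4 = 3000000 kHz,
 * i.e. 3.0 GHz.
 */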

static int __init bp_init_aperfmperf(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                return 0;

        init_counter_refs();
        bp_init_freq_invariance();
        return 0;
}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{
        if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
                init_counter_refs();
}