// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012-2023, Intel Corporation.
 *
 * TODO:
 * 1. Better handle wakeups from external interrupts. Currently a fixed
 *    compensation is added to the clamping duration when an excessive
 *    number of wakeups is observed during idle time. The reason is that,
 *    for external interrupts that need no ack, clamping down the CPU in
 *    non-irq context does not reduce the interrupt rate. In the majority
 *    of cases clamping down the CPU does help reduce interrupts as well,
 *    so we should be able to differentiate the two cases and give a
 *    quantitative solution for the irqs that we can control, perhaps
 *    based on get_cpu_iowait_time_us().
 *
 * 2. Synchronization with other hardware blocks.
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/idle_inject.h>
#include <linux/tick.h>

#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>

#define MAX_TARGET_RATIO (100U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts sleep time to meet the
 * target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
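
/*
 * Note: jiffies_to_usecs(DEFAULT_DURATION_JIFFIES) depends on CONFIG_HZ;
 * with HZ=1000 the default injection duration is 6 ms, which is the lower
 * bound of the 6-25 ms range accepted by the "duration" module parameter
 * below.
 */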

static unsigned int target_mwait;
static struct dentry *debug_dir;
static bool poll_pkg_cstate_enable;

/* Idle ratio observed using package C-state counters */
static unsigned int current_ratio;

/* Skip idle injection for the current cycle when set to true */
static bool should_skip;

struct powerclamp_data {
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	bool clamping;
};

static struct powerclamp_data powerclamp_data;

static struct thermal_cooling_device *cooling_dev;

static DEFINE_MUTEX(powerclamp_lock);

/* This duration is in microseconds */
static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		return ret;

	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
		       new_duration);
		return -EINVAL;
	}

	mutex_lock(&powerclamp_lock);
	duration = clamp(new_duration, 6ul, 25ul) * 1000;
	mutex_unlock(&powerclamp_lock);

	return 0;
}

static int duration_get(char *buf, const struct kernel_param *kp)
{
	int ret;

	mutex_lock(&powerclamp_lock);
	ret = sysfs_emit(buf, "%d\n", duration / 1000);
	mutex_unlock(&powerclamp_lock);

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = duration_get,
};

module_param_cb(duration, &duration_ops, NULL, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
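
/*
 * The "duration" parameter can typically be adjusted at runtime through
 * /sys/module/intel_powerclamp/parameters/duration (value in milliseconds,
 * clamped to the 6-25 ms range above), assuming the standard sysfs layout
 * for module parameters.
 */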

#define DEFAULT_MAX_IDLE	50
#define MAX_ALL_CPU_IDLE	75

static u8 max_idle = DEFAULT_MAX_IDLE;

static cpumask_var_t idle_injection_cpu_mask;

static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask)
{
	if (cpumask_available(idle_injection_cpu_mask))
		goto copy_mask;

	/* This mask is allocated only one time and freed during module exit */
	if (!alloc_cpumask_var(&idle_injection_cpu_mask, GFP_KERNEL))
		return -ENOMEM;

copy_mask:
	cpumask_copy(idle_injection_cpu_mask, copy_mask);

	return 0;
}

/* Return true if the cpumask and idle percent combination is invalid */
static bool check_invalid(cpumask_var_t mask, u8 idle)
{
	if (cpumask_equal(cpu_present_mask, mask) && idle > MAX_ALL_CPU_IDLE)
		return true;

	return false;
}

static int cpumask_set(const char *arg, const struct kernel_param *kp)
{
	cpumask_var_t new_mask;
	int ret;

	mutex_lock(&powerclamp_lock);

	/* Can't set mask when cooling device is in use */
	if (powerclamp_data.clamping) {
		ret = -EAGAIN;
		goto skip_cpumask_set;
	}

	ret = alloc_cpumask_var(&new_mask, GFP_KERNEL);
	if (!ret)
		goto skip_cpumask_set;

	ret = bitmap_parse(arg, strlen(arg), cpumask_bits(new_mask),
			   nr_cpumask_bits);
	if (ret)
		goto free_cpumask_set;

	if (cpumask_empty(new_mask) || check_invalid(new_mask, max_idle)) {
		ret = -EINVAL;
		goto free_cpumask_set;
	}

	/*
	 * When module parameters are passed from kernel command line
	 * during insmod, the module parameter callback is called
	 * before powerclamp_init(), so we can't assume that some
	 * cpumask can be allocated and copied before here. Also
	 * in this case this cpumask is used as the default mask.
	 */
	ret = allocate_copy_idle_injection_mask(new_mask);

free_cpumask_set:
	free_cpumask_var(new_mask);
skip_cpumask_set:
	mutex_unlock(&powerclamp_lock);

	return ret;
}

static int cpumask_get(char *buf, const struct kernel_param *kp)
{
	if (!cpumask_available(idle_injection_cpu_mask))
		return -ENODEV;

	return bitmap_print_to_pagebuf(false, buf, cpumask_bits(idle_injection_cpu_mask),
				       nr_cpumask_bits);
}

static const struct kernel_param_ops cpumask_ops = {
	.set = cpumask_set,
	.get = cpumask_get,
};

module_param_cb(cpumask, &cpumask_ops, NULL, 0644);
MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection.");

static int max_idle_set(const char *arg, const struct kernel_param *kp)
{
	u8 new_max_idle;
	int ret;

	mutex_lock(&powerclamp_lock);

	/* Can't set the limit when the cooling device is in use */
	if (powerclamp_data.clamping) {
		ret = -EAGAIN;
		goto skip_limit_set;
	}

	ret = kstrtou8(arg, 10, &new_max_idle);
	if (ret)
		goto skip_limit_set;

	if (new_max_idle > MAX_TARGET_RATIO) {
		ret = -EINVAL;
		goto skip_limit_set;
	}

	if (!cpumask_available(idle_injection_cpu_mask)) {
		ret = allocate_copy_idle_injection_mask(cpu_present_mask);
		if (ret)
			goto skip_limit_set;
	}

	if (check_invalid(idle_injection_cpu_mask, new_max_idle)) {
		ret = -EINVAL;
		goto skip_limit_set;
	}

	max_idle = new_max_idle;

skip_limit_set:
	mutex_unlock(&powerclamp_lock);

	return ret;
}

static const struct kernel_param_ops max_idle_ops = {
	.set = max_idle_set,
	.get = param_get_byte,
};

module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644);
MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100");
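
/*
 * Example (assuming the standard sysfs layout for module parameters):
 * limiting idle injection to CPUs 0-3 with at most 40% injected idle could
 * look like
 *
 *   # echo f > /sys/module/intel_powerclamp/parameters/cpumask
 *   # echo 40 > /sys/module/intel_powerclamp/parameters/max_idle
 *
 * Both writes are rejected while the cooling device is actively clamping.
 */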

struct powerclamp_calibration_data {
	unsigned long confidence;	/* used for calibration: a counter that
					 * gets incremented each time a clamping
					 * period is completed without extra
					 * wakeups; once the counter reaches a
					 * given level, compensation is deemed
					 * usable.
					 */
	unsigned long steady_comp;	/* steady state compensation used when
					 * no extra wakeups occurred.
					 */
	unsigned long dynamic_comp;	/* compensation for excessive wakeups
					 * from idle, mostly from external
					 * interrupts.
					 */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		return ret;

	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
		       new_window_size);
		return -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);

	return 0;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but more smooth\n"
	"\tclamping results. default to 2.");

static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
		}
		info++;
	}

	return count;
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	if (!poll_pkg_cstate_enable)
		return 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
	    cal_data[ratio].confidence >= CONFIDENCE_OK &&
	    cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
	    cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		   cal_data[ratio].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		   cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}
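
/*
 * Example (illustrative numbers): with a target ratio of 40 and calibrated
 * steady_comp values of 3, 4 and 5 for ratios 39, 40 and 41, all with
 * confidence >= CONFIDENCE_OK, the compensation is (3 + 4 + 5) / 3 = 4, so
 * the driver injects idle as if the target were 44% in order to reach 40%
 * measured package C-state residency.
 */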

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * adjust compensations if confidence level has not been reached.
	 */
	if (d->confidence >= CONFIDENCE_OK)
		return;

	delta = powerclamp_data.target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				       unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);

	/* if we are above target+guard, skip */
	return powerclamp_data.target_ratio + guard <= current_ratio;
}

/*
 * This function calculates runtime from the current target ratio.
 * This function gets called under powerclamp_lock.
 */
static unsigned int get_run_time(void)
{
	unsigned int compensated_ratio;
	unsigned int runtime;

	/*
	 * make sure user selected ratio does not take effect until
	 * the next round. adjust target_ratio if user has changed
	 * target such that we can converge quickly.
	 */
	powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
	powerclamp_data.window_size_now = window_size;

	/*
	 * systems may have different ability to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = powerclamp_data.target_ratio +
		get_compensation(powerclamp_data.target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;

	runtime = duration * 100 / compensated_ratio - duration;

	return runtime;
}
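
/*
 * Example (illustrative): with duration = 6000 us and a compensated ratio of
 * 50, runtime = 6000 * 100 / 50 - 6000 = 6000 us, i.e. equal run and idle
 * periods; a compensated ratio of 25 gives 18000 us of run time per 6000 us
 * of injected idle.
 */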

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	mutex_lock(&powerclamp_lock);
	if (powerclamp_data.clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
	mutex_unlock(&powerclamp_lock);
}

static struct idle_inject_device *ii_dev;

/*
 * This function is called from idle injection core on timer expiry
 * for the run duration. This allows powerclamp to readjust or skip
 * injecting idle for this cycle.
 */
static bool idle_inject_update(void)
{
	bool update = false;

	/* We can't sleep in this callback */
	if (!mutex_trylock(&powerclamp_lock))
		return true;

	if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {

		should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio,
							 powerclamp_data.guard,
							 powerclamp_data.window_size_now);
		update = true;
	}

	if (update) {
		unsigned int runtime = get_run_time();

		idle_inject_set_duration(ii_dev, runtime, duration);
	}

	powerclamp_data.count++;

	mutex_unlock(&powerclamp_lock);

	if (should_skip)
		return false;

	return true;
}

/* This function starts idle injection by calling idle_inject_start() */
static void trigger_idle_injection(void)
{
	unsigned int runtime = get_run_time();

	idle_inject_set_duration(ii_dev, runtime, duration);
	idle_inject_start(ii_dev);
	powerclamp_data.clamping = true;
}

/*
 * This function is called from start_power_clamp() to register
 * CPUs with the powercap idle injection core and set the default
 * idle duration and latency.
 */
static int powerclamp_idle_injection_register(void)
{
	poll_pkg_cstate_enable = false;
	if (cpumask_equal(cpu_present_mask, idle_injection_cpu_mask)) {
		ii_dev = idle_inject_register_full(idle_injection_cpu_mask, idle_inject_update);
		if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
			poll_pkg_cstate_enable = true;
	} else {
		ii_dev = idle_inject_register(idle_injection_cpu_mask);
	}

	if (!ii_dev) {
		pr_err("powerclamp: idle_inject_register failed\n");
		return -EAGAIN;
	}

	idle_inject_set_duration(ii_dev, TICK_USEC, duration);
	idle_inject_set_latency(ii_dev, UINT_MAX);

	return 0;
}

/*
 * This function is called from end_power_clamp() to stop idle injection
 * and unregister CPUs from the powercap idle injection core.
 */
static void remove_idle_injection(void)
{
	if (!powerclamp_data.clamping)
		return;

	powerclamp_data.clamping = false;
	idle_inject_stop(ii_dev);
}

/*
 * This function is called when the user changes the cooling device
 * state from zero to some other value.
 */
static int start_power_clamp(void)
{
	int ret;

	ret = powerclamp_idle_injection_register();
	if (!ret) {
		trigger_idle_injection();
		if (poll_pkg_cstate_enable)
			schedule_delayed_work(&poll_pkg_cstate_work, 0);
	}

	return ret;
}

/*
 * This function is called when the user changes the cooling device
 * state from a non-zero value to zero.
 */
static void end_power_clamp(void)
{
	if (powerclamp_data.clamping) {
		remove_idle_injection();
		idle_inject_unregister(ii_dev);
	}
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				    unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				    unsigned long *state)
{
	mutex_lock(&powerclamp_lock);
	*state = powerclamp_data.target_ratio;
	mutex_unlock(&powerclamp_lock);

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				    unsigned long new_target_ratio)
{
	int ret = 0;

	mutex_lock(&powerclamp_lock);

	new_target_ratio = clamp(new_target_ratio, 0UL,
				 (unsigned long) (max_idle - 1));

	if (powerclamp_data.target_ratio == new_target_ratio)
		goto exit_set;

	if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		powerclamp_data.target_ratio = new_target_ratio;
		ret = start_power_clamp();
		if (ret)
			powerclamp_data.target_ratio = 0;
		goto exit_set;
	} else if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		powerclamp_data.target_ratio = 0;
	} else /* adjust currently running */ {
		unsigned int runtime;

		powerclamp_data.target_ratio = new_target_ratio;
		runtime = get_run_time();
		idle_inject_set_duration(ii_dev, runtime, duration);
	}

exit_set:
	mutex_unlock(&powerclamp_lock);

	return ret;
}

/* bind to generic thermal layer as cooling device */
static const struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};
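
/*
 * Once registered, the cooling device typically shows up as
 * /sys/class/thermal/cooling_deviceX with type "intel_powerclamp"; writing
 * a value in the range 0 to (max_idle - 1) to its cur_state selects the
 * target idle injection percentage handled by powerclamp_set_cur_state().
 */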

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i;

	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			   i,
			   cal_data[i].confidence,
			   cal_data[i].steady_comp,
			   cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static int __init powerclamp_init(void)
{
	int retval;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		return retval;

	mutex_lock(&powerclamp_lock);
	if (!cpumask_available(idle_injection_cpu_mask))
		retval = allocate_copy_idle_injection_mask(cpu_present_mask);
	mutex_unlock(&powerclamp_lock);

	if (retval)
		return retval;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						      &powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev))
		return -ENODEV;

	if (!duration)
		duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	mutex_lock(&powerclamp_lock);
	end_power_clamp();
	mutex_unlock(&powerclamp_lock);

	thermal_cooling_device_unregister(cooling_dev);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);

	if (cpumask_available(idle_injection_cpu_mask))
		free_cpumask_var(idle_injection_cpu_mask);
}
module_exit(powerclamp_exit);

MODULE_IMPORT_NS(IDLE_INJECT);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");