Git Repo - linux.git/commitdiff
Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <[email protected]>
Tue, 17 Sep 2019 19:35:15 +0000 (12:35 -0700)
committer Linus Torvalds <[email protected]>
Tue, 17 Sep 2019 19:35:15 +0000 (12:35 -0700)
Pull core timer updates from Thomas Gleixner:
 "Timers and timekeeping updates:

   - A large overhaul of the posix CPU timer code which is a preparation
     for moving the CPU timer expiry out into task work so it can be
     properly accounted on the task/process.

     An update to the bogus permission checks will come later during the
     merge window, as feedback was not complete before heading off for
     travel.

   - Switch the timerqueue code to use cached rbtrees and get rid of the
     homebrew caching of the leftmost node (a minimal sketch of the
     cached-rbtree pattern follows below).

   - Consolidate hrtimer_init() + hrtimer_init_sleeper() calls into a
     single function (see the sleeper sketch below).

   - Implement the separation of hrtimers which must be forced to expire
     in hard interrupt context even when PREEMPT_RT is enabled, and mark
     the affected timers accordingly (illustrated below).

   - Implement a mechanism for hrtimers and the timer wheel to protect
     RT against priority inversion and livelock issues when a (hr)timer
     which should be canceled is currently executing its callback.
     Instead of spinning indefinitely, the task which tries to cancel the
     timer blocks on a per-CPU base expiry lock which is held and
     released by the (hr)timer expiry code (a conceptual sketch follows
     below).

   - Enable the Hyper-V TSC-page-based sched_clock for Hyper-V guests,
     resulting in faster access to timekeeping functions.

   - Updates to various clocksource/clockevent drivers and their device
     tree bindings.

   - The usual small improvements all over the place"
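
A minimal sketch of the cached-rbtree pattern referenced above, assuming the kernel's rb_root_cached helpers (rb_first_cached(), rb_insert_color_cached()); the example_* names are illustrative and this is not the actual timerqueue code:

#include <linux/rbtree.h>
#include <linux/ktime.h>
#include <linux/types.h>

struct example_timer {
	struct rb_node	node;
	ktime_t		expires;
};

static struct rb_root_cached example_head = RB_ROOT_CACHED;

/* Insert while letting the rbtree itself track the leftmost node. */
static void example_insert(struct example_timer *t)
{
	struct rb_node **link = &example_head.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct example_timer *cur;

		parent = *link;
		cur = rb_entry(parent, struct example_timer, node);
		if (t->expires < cur->expires) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}
	rb_link_node(&t->node, parent, link);
	rb_insert_color_cached(&t->node, &example_head, leftmost);
}

/* O(1) lookup of the earliest expiry, no separate cache field needed. */
static struct example_timer *example_earliest(void)
{
	struct rb_node *first = rb_first_cached(&example_head);

	return first ? rb_entry(first, struct example_timer, node) : NULL;
}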
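
A sketch of the consolidated sleeper setup, mirroring the blk-mq.c and wait.h conversions further down; example_sleep_until() is a made-up name and the retry/interruption handling real callers need is omitted:

#include <linux/hrtimer.h>
#include <linux/sched.h>

static void example_sleep_until(ktime_t expires)
{
	struct hrtimer_sleeper hs;

	/*
	 * One call now replaces the former pair:
	 *   hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *   hrtimer_init_sleeper(&hs, current);
	 */
	hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_set_expires(&hs.timer, expires);

	set_current_state(TASK_UNINTERRUPTIBLE);
	hrtimer_sleeper_start_expires(&hs, HRTIMER_MODE_REL);
	if (hs.task)
		schedule();
	hrtimer_cancel(&hs.timer);
	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&hs.timer);
}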
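
Marking a timer for hard interrupt expiry only requires the new *_HARD mode variants at init and start time, as the KVM lapic, perf and scheduler hunks below do; the example_hard_* names and the 10ms period are placeholders:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_hard_timer;

static enum hrtimer_restart example_hard_fn(struct hrtimer *timer)
{
	/* Runs in hard interrupt context even on PREEMPT_RT. */
	return HRTIMER_NORESTART;
}

static void example_hard_setup(void)
{
	hrtimer_init(&example_hard_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	example_hard_timer.function = example_hard_fn;
	hrtimer_start(&example_hard_timer, ms_to_ktime(10), HRTIMER_MODE_REL_HARD);
}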
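
A conceptual sketch of the expiry-lock idea described above, using made-up names, userspace locking and hand-waved memory ordering (this is not the kernel implementation). The expiry path holds a per-base lock while callbacks run; a canceler that finds its callback executing blocks on that lock instead of spinning:

#include <pthread.h>
#include <stdbool.h>

struct example_timer;

struct expiry_base {
	pthread_mutex_t		expiry_lock;	/* held while callbacks run */
	struct example_timer	*running;	/* callback currently executing */
};

struct example_timer {
	struct expiry_base	*base;
	bool			queued;
	void			(*fn)(struct example_timer *t);
};

/* Expiry side: run an expired timer's callback under the expiry lock. */
static void expire_one(struct expiry_base *b, struct example_timer *t)
{
	pthread_mutex_lock(&b->expiry_lock);
	b->running = t;
	t->fn(t);
	b->running = NULL;
	pthread_mutex_unlock(&b->expiry_lock);
}

/*
 * Cancel side: dequeue the timer and, if its callback is currently
 * running, block on the expiry lock until the callback has finished
 * instead of busy-waiting, which is what causes priority inversion
 * and livelocks on RT.
 */
static void cancel_sync(struct example_timer *t)
{
	struct expiry_base *b = t->base;

	t->queued = false;
	while (b->running == t) {
		pthread_mutex_lock(&b->expiry_lock);
		pthread_mutex_unlock(&b->expiry_lock);
	}
}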

* 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (101 commits)
  posix-cpu-timers: Fix permission check regression
  posix-cpu-timers: Always clear head pointer on dequeue
  hrtimer: Add a missing bracket and hide `migration_base' on !SMP
  posix-cpu-timers: Make expiry_active check actually work correctly
  posix-timers: Unbreak CONFIG_POSIX_TIMERS=n build
  tick: Mark sched_timer to expire in hard interrupt context
  hrtimer: Add kernel doc annotation for HRTIMER_MODE_HARD
  x86/hyperv: Hide pv_ops access for CONFIG_PARAVIRT=n
  posix-cpu-timers: Utilize timerqueue for storage
  posix-cpu-timers: Move state tracking to struct posix_cputimers
  posix-cpu-timers: Deduplicate rlimit handling
  posix-cpu-timers: Remove pointless comparisons
  posix-cpu-timers: Get rid of 64bit divisions
  posix-cpu-timers: Consolidate timer expiry further
  posix-cpu-timers: Get rid of zero checks
  rlimit: Rewrite non-sensical RLIMIT_CPU comment
  posix-cpu-timers: Respect INFINITY for hard RTTIME limit
  posix-cpu-timers: Switch thread group sampling to array
  posix-cpu-timers: Restructure expiry array
  posix-cpu-timers: Remove cputime_expires
  ...

16 files changed:
arch/arm64/boot/dts/freescale/imx8mm.dtsi
arch/arm64/boot/dts/freescale/imx8mq.dtsi
arch/x86/hyperv/hv_init.c
arch/x86/include/asm/vdso/gettimeofday.h
arch/x86/kvm/lapic.c
block/blk-mq.c
include/linux/sched.h
include/linux/wait.h
init/init_task.c
kernel/events/core.c
kernel/fork.c
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/rt.c
kernel/sys.c
kernel/time/alarmtimer.c

diff --combined arch/arm64/boot/dts/freescale/imx8mm.dtsi
index 984ea7b3fd9f171e606cd9a51646cf675808e677,89ef22a8f81e04091766b5fdc6ff650cc6c1ebdd..5f9d0da196e13c695ccc340a2f5ab5bc96985f67
                #address-cells = <1>;
                #size-cells = <0>;
  
 +              idle-states {
 +                      entry-method = "psci";
 +
 +                      cpu_pd_wait: cpu-pd-wait {
 +                              compatible = "arm,idle-state";
 +                              arm,psci-suspend-param = <0x0010033>;
 +                              local-timer-stop;
 +                              entry-latency-us = <1000>;
 +                              exit-latency-us = <700>;
 +                              min-residency-us = <2700>;
 +                      };
 +              };
 +
                A53_0: cpu@0 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a53";
@@@ -68,7 -55,6 +68,7 @@@
                        operating-points-v2 = <&a53_opp_table>;
                        nvmem-cells = <&cpu_speed_grade>;
                        nvmem-cell-names = "speed_grade";
 +                      cpu-idle-states = <&cpu_pd_wait>;
                };
  
                A53_1: cpu@1 {
@@@ -80,7 -66,6 +80,7 @@@
                        enable-method = "psci";
                        next-level-cache = <&A53_L2>;
                        operating-points-v2 = <&a53_opp_table>;
 +                      cpu-idle-states = <&cpu_pd_wait>;
                };
  
                A53_2: cpu@2 {
@@@ -92,7 -77,6 +92,7 @@@
                        enable-method = "psci";
                        next-level-cache = <&A53_L2>;
                        operating-points-v2 = <&a53_opp_table>;
 +                      cpu-idle-states = <&cpu_pd_wait>;
                };
  
                A53_3: cpu@3 {
                        enable-method = "psci";
                        next-level-cache = <&A53_L2>;
                        operating-points-v2 = <&a53_opp_table>;
 +                      cpu-idle-states = <&cpu_pd_wait>;
                };
  
                A53_L2: l2-cache0 {
                        opp-microvolt = <850000>;
                        opp-supported-hw = <0xe>, <0x7>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1600000000 {
                        opp-microvolt = <900000>;
                        opp-supported-hw = <0xc>, <0x7>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1800000000 {
                        opp-hz = /bits/ 64 <1800000000>;
                        opp-microvolt = <1000000>;
 -                      /* Consumer only but rely on speed grading */
 -                      opp-supported-hw = <0x8>, <0x7>;
 +                      opp-supported-hw = <0x8>, <0x3>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
        };
  
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 10 30>;
                        };
  
                        gpio2: gpio@30210000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 40 21>;
                        };
  
                        gpio3: gpio@30220000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 61 26>;
                        };
  
                        gpio4: gpio@30230000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 87 32>;
                        };
  
                        gpio5: gpio@30240000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 119 30>;
                        };
  
                        wdog1: watchdog@30280000 {
                                         <&clk_ext3>, <&clk_ext4>;
                                clock-names = "osc_32k", "osc_24m", "clk_ext1", "clk_ext2",
                                              "clk_ext3", "clk_ext4";
 +                              assigned-clocks = <&clk IMX8MM_CLK_NOC>,
 +                                              <&clk IMX8MM_CLK_AUDIO_AHB>,
 +                                              <&clk IMX8MM_CLK_IPG_AUDIO_ROOT>,
 +                                              <&clk IMX8MM_SYS_PLL3>,
 +                                              <&clk IMX8MM_VIDEO_PLL1>;
 +                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL3_OUT>,
 +                                                       <&clk IMX8MM_SYS_PLL1_800M>;
 +                              assigned-clock-rates = <0>,
 +                                                      <400000000>,
 +                                                      <400000000>,
 +                                                      <750000000>,
 +                                                      <594000000>;
                        };
  
                        src: reset-controller@30390000 {
 -                              compatible = "fsl,imx8mm-src", "syscon";
 +                              compatible = "fsl,imx8mm-src", "fsl,imx8mq-src", "syscon";
                                reg = <0x30390000 0x10000>;
                                interrupts = <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>;
                                #reset-cells = <1>;
                                #pwm-cells = <2>;
                                status = "disabled";
                        };
+                       system_counter: timer@306a0000 {
+                               compatible = "nxp,sysctr-timer";
+                               reg = <0x306a0000 0x20000>;
+                               interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&osc_24m>;
+                               clock-names = "per";
+                       };
                };
  
                aips3: bus@30800000 {
                                interrupts = <GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>;
                                clocks = <&clk IMX8MM_CLK_USB1_CTRL_ROOT>;
                                clock-names = "usb1_ctrl_root_clk";
 -                              assigned-clocks = <&clk IMX8MM_CLK_USB_BUS>,
 -                                                <&clk IMX8MM_CLK_USB_CORE_REF>;
 -                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>,
 -                                                       <&clk IMX8MM_SYS_PLL1_100M>;
 +                              assigned-clocks = <&clk IMX8MM_CLK_USB_BUS>;
 +                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>;
                                fsl,usbphy = <&usbphynop1>;
                                fsl,usbmisc = <&usbmisc1 0>;
                                status = "disabled";
                                interrupts = <GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>;
                                clocks = <&clk IMX8MM_CLK_USB1_CTRL_ROOT>;
                                clock-names = "usb1_ctrl_root_clk";
 -                              assigned-clocks = <&clk IMX8MM_CLK_USB_BUS>,
 -                                                <&clk IMX8MM_CLK_USB_CORE_REF>;
 -                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>,
 -                                                       <&clk IMX8MM_SYS_PLL1_100M>;
 +                              assigned-clocks = <&clk IMX8MM_CLK_USB_BUS>;
 +                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>;
                                fsl,usbphy = <&usbphynop2>;
                                fsl,usbmisc = <&usbmisc2 0>;
                                status = "disabled";
                        interrupt-controller;
                        interrupts = <GIC_PPI 9 IRQ_TYPE_LEVEL_HIGH>;
                };
 +
 +              ddr-pmu@3d800000 {
 +                      compatible = "fsl,imx8mm-ddr-pmu", "fsl,imx8m-ddr-pmu";
 +                      reg = <0x3d800000 0x400000>;
 +                      interrupt-parent = <&gic>;
 +                      interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
 +              };
        };
  };
diff --combined arch/arm64/boot/dts/freescale/imx8mq.dtsi
index 046a0c8c8dd5607adf835c055b3415efe8361d2f,b4529773af51836107da4ff69d57c54b1850c741..3f3594d9485cec30c34730255595c76812621f3f
                        /* Industrial only */
                        opp-supported-hw = <0xf>, <0x4>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1000000000 {
                        /* Consumer only */
                        opp-supported-hw = <0xe>, <0x3>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1300000000 {
                        opp-hz = /bits/ 64 <1300000000>;
                        opp-microvolt = <1000000>;
 -                      opp-supported-hw = <0xc>, <0x7>;
 +                      opp-supported-hw = <0xc>, <0x4>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1500000000 {
                        opp-hz = /bits/ 64 <1500000000>;
                        opp-microvolt = <1000000>;
 -                      /* Consumer only but rely on speed grading */
 -                      opp-supported-hw = <0x8>, <0x7>;
 +                      opp-supported-hw = <0x8>, <0x3>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
        };
  
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 10 30>;
                        };
  
                        gpio2: gpio@30210000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 40 21>;
                        };
  
                        gpio3: gpio@30220000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 61 26>;
                        };
  
                        gpio4: gpio@30230000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 87 32>;
                        };
  
                        gpio5: gpio@30240000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 119 30>;
                        };
  
                        tmu: tmu@30260000 {
                                compatible = "fsl,imx8mq-tmu";
                                reg = <0x30260000 0x10000>;
                                interrupt = <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>;
 +                              clocks = <&clk IMX8MQ_CLK_TMU_ROOT>;
                                little-endian;
                                fsl,tmu-range = <0xb0000 0xa0026 0x80048 0x70061>;
                                fsl,tmu-calibration = <0x00000000 0x00000023
                        };
  
                        iomuxc_gpr: syscon@30340000 {
 -                              compatible = "fsl,imx8mq-iomuxc-gpr", "fsl,imx6q-iomuxc-gpr", "syscon";
 +                              compatible = "fsl,imx8mq-iomuxc-gpr", "fsl,imx6q-iomuxc-gpr",
 +                                           "syscon", "simple-mfd";
                                reg = <0x30340000 0x10000>;
 +
 +                              mux: mux-controller {
 +                                      compatible = "mmio-mux";
 +                                      #mux-control-cells = <1>;
 +                                      mux-reg-masks = <0x34 0x00000004>; /* MIPI_MUX_SEL */
 +                              };
                        };
  
                        ocotp: ocotp-ctrl@30350000 {
                                #pwm-cells = <2>;
                                status = "disabled";
                        };
+                       system_counter: timer@306a0000 {
+                               compatible = "nxp,sysctr-timer";
+                               reg = <0x306a0000 0x20000>;
+                               interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&osc_25m>;
+                               clock-names = "per";
+                       };
                };
  
                bus@30800000 { /* AIPS3 */
  
                        sai2: sai@308b0000 {
                                #sound-dai-cells = <0>;
 -                              compatible = "fsl,imx8mq-sai",
 -                                           "fsl,imx6sx-sai";
 +                              compatible = "fsl,imx8mq-sai";
                                reg = <0x308b0000 0x10000>;
                                interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>;
                                clocks = <&clk IMX8MQ_CLK_SAI2_IPG>,
                                status = "disabled";
                        };
  
 +                      dphy: dphy@30a00300 {
 +                              compatible = "fsl,imx8mq-mipi-dphy";
 +                              reg = <0x30a00300 0x100>;
 +                              clocks = <&clk IMX8MQ_CLK_DSI_PHY_REF>;
 +                              clock-names = "phy_ref";
 +                              assigned-clocks = <&clk IMX8MQ_CLK_DSI_PHY_REF>;
 +                              assigned-clock-parents = <&clk IMX8MQ_VIDEO_PLL1_OUT>;
 +                              assigned-clock-rates = <24000000>;
 +                              #phy-cells = <0>;
 +                              power-domains = <&pgc_mipi>;
 +                              status = "disabled";
 +                      };
 +
                        i2c1: i2c@30a20000 {
                                compatible = "fsl,imx8mq-i2c", "fsl,imx21-i2c";
                                reg = <0x30a20000 0x10000>;
                usb_dwc3_0: usb@38100000 {
                        compatible = "fsl,imx8mq-dwc3", "snps,dwc3";
                        reg = <0x38100000 0x10000>;
 -                      clocks = <&clk IMX8MQ_CLK_USB_BUS>,
 +                      clocks = <&clk IMX8MQ_CLK_USB1_CTRL_ROOT>,
                                 <&clk IMX8MQ_CLK_USB_CORE_REF>,
 -                               <&clk IMX8MQ_CLK_USB1_CTRL_ROOT>;
 +                               <&clk IMX8MQ_CLK_32K>;
                        clock-names = "bus_early", "ref", "suspend";
                        assigned-clocks = <&clk IMX8MQ_CLK_USB_BUS>,
                                          <&clk IMX8MQ_CLK_USB_CORE_REF>;
                usb_dwc3_1: usb@38200000 {
                        compatible = "fsl,imx8mq-dwc3", "snps,dwc3";
                        reg = <0x38200000 0x10000>;
 -                      clocks = <&clk IMX8MQ_CLK_USB_BUS>,
 +                      clocks = <&clk IMX8MQ_CLK_USB2_CTRL_ROOT>,
                                 <&clk IMX8MQ_CLK_USB_CORE_REF>,
 -                               <&clk IMX8MQ_CLK_USB2_CTRL_ROOT>;
 +                               <&clk IMX8MQ_CLK_32K>;
                        clock-names = "bus_early", "ref", "suspend";
                        assigned-clocks = <&clk IMX8MQ_CLK_USB_BUS>,
                                          <&clk IMX8MQ_CLK_USB_CORE_REF>;
                        interrupts = <GIC_PPI 9 IRQ_TYPE_LEVEL_HIGH>;
                        interrupt-parent = <&gic>;
                };
 +
 +              ddr-pmu@3d800000 {
 +                      compatible = "fsl,imx8mq-ddr-pmu", "fsl,imx8m-ddr-pmu";
 +                      reg = <0x3d800000 0x400000>;
 +                      interrupt-parent = <&gic>;
 +                      interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
 +              };
        };
  };
diff --combined arch/x86/hyperv/hv_init.c
index d314cf1e15fd607cb18becf5c19910533f89f4ed,866dfb3dca4893349fc474d36a6624c92a750f78..2db3972c0e0ff047621cb6ab173e34ec914e257b
@@@ -37,20 -37,6 +37,20 @@@ EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg
  u32 hv_max_vp_index;
  EXPORT_SYMBOL_GPL(hv_max_vp_index);
  
 +void *hv_alloc_hyperv_page(void)
 +{
 +      BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
 +
 +      return (void *)__get_free_page(GFP_KERNEL);
 +}
 +EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
 +
 +void hv_free_hyperv_page(unsigned long addr)
 +{
 +      free_page(addr);
 +}
 +EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
 +
  static int hv_cpu_init(unsigned int cpu)
  {
        u64 msr_vp_index;
@@@ -315,8 -301,6 +315,6 @@@ void __init hyperv_init(void
  
        x86_init.pci.arch_init = hv_pci_init;
  
-       /* Register Hyper-V specific clocksource */
-       hv_init_clocksource();
        return;
  
  remove_cpuhp_state:
diff --combined arch/x86/include/asm/vdso/gettimeofday.h
index ba71a63cdac479d6428159ffbb0e24ba6c5a40f3,bcbf901befbe07dcc3867ab05a612b9410103f26..e9ee139cf29e05ab861d614209601b13d281cf4c
@@@ -51,7 -51,7 +51,7 @@@ extern struct pvclock_vsyscall_time_inf
        __attribute__((visibility("hidden")));
  #endif
  
- #ifdef CONFIG_HYPERV_TSCPAGE
+ #ifdef CONFIG_HYPERV_TIMER
  extern struct ms_hyperv_tsc_page hvclock_page
        __attribute__((visibility("hidden")));
  #endif
@@@ -96,8 -96,6 +96,8 @@@ long clock_getres_fallback(clockid_t _c
  
  #else
  
 +#define VDSO_HAS_32BIT_FALLBACK       1
 +
  static __always_inline
  long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
  {
        return ret;
  }
  
 +static __always_inline
 +long clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
 +{
 +      long ret;
 +
 +      asm (
 +              "mov %%ebx, %%edx \n"
 +              "mov %[clock], %%ebx \n"
 +              "call __kernel_vsyscall \n"
 +              "mov %%edx, %%ebx \n"
 +              : "=a" (ret), "=m" (*_ts)
 +              : "0" (__NR_clock_gettime), [clock] "g" (_clkid), "c" (_ts)
 +              : "edx");
 +
 +      return ret;
 +}
 +
  static __always_inline
  long gettimeofday_fallback(struct __kernel_old_timeval *_tv,
                           struct timezone *_tz)
@@@ -167,23 -148,6 +167,23 @@@ clock_getres_fallback(clockid_t _clkid
        return ret;
  }
  
 +static __always_inline
 +long clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
 +{
 +      long ret;
 +
 +      asm (
 +              "mov %%ebx, %%edx \n"
 +              "mov %[clock], %%ebx \n"
 +              "call __kernel_vsyscall \n"
 +              "mov %%edx, %%ebx \n"
 +              : "=a" (ret), "=m" (*_ts)
 +              : "0" (__NR_clock_getres), [clock] "g" (_clkid), "c" (_ts)
 +              : "edx");
 +
 +      return ret;
 +}
 +
  #endif
  
  #ifdef CONFIG_PARAVIRT_CLOCK
@@@ -228,7 -192,7 +228,7 @@@ static u64 vread_pvclock(void
  }
  #endif
  
- #ifdef CONFIG_HYPERV_TSCPAGE
+ #ifdef CONFIG_HYPERV_TIMER
  static u64 vread_hvclock(void)
  {
        return hv_read_tsc_page(&hvclock_page);
@@@ -251,7 -215,7 +251,7 @@@ static inline u64 __arch_get_hw_counter
                return vread_pvclock();
        }
  #endif
- #ifdef CONFIG_HYPERV_TSCPAGE
+ #ifdef CONFIG_HYPERV_TIMER
        if (clock_mode == VCLOCK_HVCLOCK) {
                barrier();
                return vread_hvclock();
diff --combined arch/x86/kvm/lapic.c
index e904ff06a83d84c9ab5ccd08bdfcc14b1cc3768a,b9e516099d07e30ec64622902625fbd6d40cedd9..2a4f278f3b56842ecdedc1573f5f9cb796fb56fc
@@@ -216,9 -216,6 +216,9 @@@ static void recalculate_apic_map(struc
                if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
                        new->phys_map[xapic_id] = apic;
  
 +              if (!kvm_apic_sw_enabled(apic))
 +                      continue;
 +
                ldr = kvm_lapic_get_reg(apic, APIC_LDR);
  
                if (apic_x2apic_mode(apic)) {
@@@ -261,8 -258,6 +261,8 @@@ static inline void apic_set_spiv(struc
                        static_key_slow_dec_deferred(&apic_sw_disabled);
                else
                        static_key_slow_inc(&apic_sw_disabled.key);
 +
 +              recalculate_apic_map(apic->vcpu->kvm);
        }
  }
  
@@@ -1553,6 -1548,7 +1553,6 @@@ static void kvm_apic_inject_pending_tim
  static void apic_timer_expired(struct kvm_lapic *apic)
  {
        struct kvm_vcpu *vcpu = apic->vcpu;
 -      struct swait_queue_head *q = &vcpu->wq;
        struct kvm_timer *ktimer = &apic->lapic_timer;
  
        if (atomic_read(&apic->lapic_timer.pending))
  
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
 -
 -      /*
 -       * For x86, the atomic_inc() is serialized, thus
 -       * using swait_active() is safe.
 -       */
 -      if (swait_active(q))
 -              swake_up_one(q);
  }
  
  static void start_sw_tscdeadline(struct kvm_lapic *apic)
            likely(ns > apic->lapic_timer.timer_advance_ns)) {
                expire = ktime_add_ns(now, ns);
                expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
-               hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS);
+               hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
        } else
                apic_timer_expired(apic);
  
@@@ -2299,7 -2302,7 +2299,7 @@@ int kvm_create_lapic(struct kvm_vcpu *v
        apic->vcpu = vcpu;
  
        hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_ABS);
+                    HRTIMER_MODE_ABS_HARD);
        apic->lapic_timer.timer.function = apic_timer_fn;
        if (timer_advance_ns == -1) {
                apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
@@@ -2484,7 -2487,7 +2484,7 @@@ void __kvm_migrate_apic_timer(struct kv
  
        timer = &vcpu->arch.apic->lapic_timer.timer;
        if (hrtimer_cancel(timer))
-               hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
  }
  
  /*
diff --combined block/blk-mq.c
index 0835f4d8d42e7e34c043acc040d8f5406b2e2268,f567146f9ed7e680b7351e009d85f1da71a5c3a9..e0b849bfe74d8d03dac595b145b847e6d73a8726
@@@ -669,6 -669,8 +669,6 @@@ void blk_mq_start_request(struct reques
  {
        struct request_queue *q = rq->q;
  
 -      blk_mq_sched_started_request(rq);
 -
        trace_block_rq_issue(q, rq);
  
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
@@@ -2662,6 -2664,8 +2662,6 @@@ void blk_mq_release(struct request_queu
        struct blk_mq_hw_ctx *hctx, *next;
        int i;
  
 -      cancel_delayed_work_sync(&q->requeue_work);
 -
        queue_for_each_hw_ctx(q, hctx, i)
                WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
  
@@@ -3411,15 -3415,14 +3411,14 @@@ static bool blk_mq_poll_hybrid_sleep(st
        kt = nsecs;
  
        mode = HRTIMER_MODE_REL;
-       hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+       hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
        hrtimer_set_expires(&hs.timer, kt);
  
-       hrtimer_init_sleeper(&hs, current);
        do {
                if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
                        break;
                set_current_state(TASK_UNINTERRUPTIBLE);
-               hrtimer_start_expires(&hs.timer, mode);
+               hrtimer_sleeper_start_expires(&hs, mode);
                if (hs.task)
                        io_schedule();
                hrtimer_cancel(&hs.timer);
diff --combined include/linux/sched.h
index f0edee94834a8262db98f3abe61721121ff1c625,8cc8e323093f7e79f22401fb187db42e6a3fbd78..b75b282870053e083a1f79ae9b55cba6a95caa74
  #include <linux/resource.h>
  #include <linux/latencytop.h>
  #include <linux/sched/prio.h>
+ #include <linux/sched/types.h>
  #include <linux/signal_types.h>
  #include <linux/mm_types_task.h>
  #include <linux/task_io_accounting.h>
+ #include <linux/posix-timers.h>
  #include <linux/rseq.h>
  
  /* task_struct member predeclarations (sorted alphabetically): */
@@@ -244,27 -246,6 +246,6 @@@ struct prev_cputime 
  #endif
  };
  
- /**
-  * struct task_cputime - collected CPU time counts
-  * @utime:            time spent in user mode, in nanoseconds
-  * @stime:            time spent in kernel mode, in nanoseconds
-  * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
-  *
-  * This structure groups together three kinds of CPU time that are tracked for
-  * threads and thread groups.  Most things considering CPU time want to group
-  * these counts together and treat all three of them in parallel.
-  */
- struct task_cputime {
-       u64                             utime;
-       u64                             stime;
-       unsigned long long              sum_exec_runtime;
- };
- /* Alternate field names when used on cache expirations: */
- #define virt_exp                      utime
- #define prof_exp                      stime
- #define sched_exp                     sum_exec_runtime
  enum vtime_state {
        /* Task is sleeping or running in a CPU with VTIME inactive: */
        VTIME_INACTIVE = 0,
@@@ -295,11 -276,6 +276,11 @@@ enum uclamp_id 
        UCLAMP_CNT
  };
  
 +#ifdef CONFIG_SMP
 +extern struct root_domain def_root_domain;
 +extern struct mutex sched_domains_mutex;
 +#endif
 +
  struct sched_info {
  #ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */
@@@ -881,10 -857,8 +862,8 @@@ struct task_struct 
        unsigned long                   min_flt;
        unsigned long                   maj_flt;
  
- #ifdef CONFIG_POSIX_TIMERS
-       struct task_cputime             cputime_expires;
-       struct list_head                cpu_timers[3];
- #endif
+       /* Empty if CONFIG_POSIX_CPUTIMERS=n */
+       struct posix_cputimers          posix_cputimers;
  
        /* Process credentials: */
  
        u64                             last_sum_exec_runtime;
        struct callback_head            numa_work;
  
 -      struct numa_group               *numa_group;
 +      /*
 +       * This pointer is only modified for current in syscall and
 +       * pagefault context (and for tasks being destroyed), so it can be read
 +       * from any of the following contexts:
 +       *  - RCU read-side critical section
 +       *  - current->numa_group from everywhere
 +       *  - task's runqueue locked, task not running
 +       */
 +      struct numa_group __rcu         *numa_group;
  
        /*
         * numa_faults is an array split into four regions:
@@@ -1772,7 -1738,7 +1751,7 @@@ static inline int test_tsk_need_resched
   * value indicates whether a reschedule was done in fact.
   * cond_resched_lock() will drop the spinlock before scheduling,
   */
 -#ifndef CONFIG_PREEMPT
 +#ifndef CONFIG_PREEMPTION
  extern int _cond_resched(void);
  #else
  static inline int _cond_resched(void) { return 0; }
@@@ -1801,12 -1767,12 +1780,12 @@@ static inline void cond_resched_rcu(voi
  
  /*
   * Does a critical section need to be broken due to another
 - * task waiting?: (technically does not depend on CONFIG_PREEMPT,
 + * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
   * but a general need for low latency)
   */
  static inline int spin_needbreak(spinlock_t *lock)
  {
 -#ifdef CONFIG_PREEMPT
 +#ifdef CONFIG_PREEMPTION
        return spin_is_contended(lock);
  #else
        return 0;
diff --combined include/linux/wait.h
index 30c515520fb28c69873aad6d0b498a59679f46ab,4707543ef5752a6bd321fde91b9be49eaef4a856..3eb7cae8206c38d153ecdb72f0c7666b4fec42b7
@@@ -126,19 -126,6 +126,19 @@@ static inline int waitqueue_active(stru
        return !list_empty(&wq_head->head);
  }
  
 +/**
 + * wq_has_single_sleeper - check if there is only one sleeper
 + * @wq_head: wait queue head
 + *
 + * Returns true of wq_head has only one sleeper on the list.
 + *
 + * Please refer to the comment for waitqueue_active.
 + */
 +static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
 +{
 +      return list_is_singular(&wq_head->head);
 +}
 +
  /**
   * wq_has_sleeper - check if there are any waiting processes
   * @wq_head: wait queue head
@@@ -501,8 -488,8 +501,8 @@@ do {                                                                               
        int __ret = 0;                                                          \
        struct hrtimer_sleeper __t;                                             \
                                                                                \
-       hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);   \
-       hrtimer_init_sleeper(&__t, current);                                    \
+       hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC,                    \
+                                     HRTIMER_MODE_REL);                        \
        if ((timeout) != KTIME_MAX)                                             \
                hrtimer_start_range_ns(&__t.timer, timeout,                     \
                                       current->timer_slack_ns,                 \
diff --combined init/init_task.c
index bfe06c53b14e06674a5fddde3944a9fdd9b96c35,d49692a0ec516b0d6b95d6272551903fbbdb14c2..9e5cbe5eab7b1143791c834afa6ece0ccdbd148f
@@@ -30,8 -30,6 +30,6 @@@ static struct signal_struct init_signal
        .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
        .cputimer       = {
                .cputime_atomic = INIT_CPUTIME_ATOMIC,
-               .running        = false,
-               .checking_timer = false,
        },
  #endif
        INIT_CPU_TIMERS(init_signals)
@@@ -174,7 -172,7 +172,7 @@@ struct task_struct init_tas
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        .ret_stack      = NULL,
  #endif
 -#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT)
 +#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
        .trace_recursion = 0,
  #endif
  #ifdef CONFIG_LIVEPATCH
diff --combined kernel/events/core.c
index 1c414b8866b454aed555aafdf34e823256f0c8ba,9d623e257a514acae29d8da3ce065616141f3923..4f08b17d642672f9822e3d842f07f2d836af6f9f
@@@ -1103,7 -1103,7 +1103,7 @@@ static void __perf_mux_hrtimer_init(str
        cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
  
        raw_spin_lock_init(&cpuctx->hrtimer_lock);
-       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        timer->function = perf_mux_hrtimer_handler;
  }
  
@@@ -1121,7 -1121,7 +1121,7 @@@ static int perf_mux_hrtimer_restart(str
        if (!cpuctx->hrtimer_active) {
                cpuctx->hrtimer_active = 1;
                hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
-               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
        }
        raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
  
@@@ -1887,89 -1887,6 +1887,89 @@@ list_del_event(struct perf_event *event
        ctx->generation++;
  }
  
 +static int
 +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
 +{
 +      if (!has_aux(aux_event))
 +              return 0;
 +
 +      if (!event->pmu->aux_output_match)
 +              return 0;
 +
 +      return event->pmu->aux_output_match(aux_event);
 +}
 +
 +static void put_event(struct perf_event *event);
 +static void event_sched_out(struct perf_event *event,
 +                          struct perf_cpu_context *cpuctx,
 +                          struct perf_event_context *ctx);
 +
 +static void perf_put_aux_event(struct perf_event *event)
 +{
 +      struct perf_event_context *ctx = event->ctx;
 +      struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 +      struct perf_event *iter;
 +
 +      /*
 +       * If event uses aux_event tear down the link
 +       */
 +      if (event->aux_event) {
 +              iter = event->aux_event;
 +              event->aux_event = NULL;
 +              put_event(iter);
 +              return;
 +      }
 +
 +      /*
 +       * If the event is an aux_event, tear down all links to
 +       * it from other events.
 +       */
 +      for_each_sibling_event(iter, event->group_leader) {
 +              if (iter->aux_event != event)
 +                      continue;
 +
 +              iter->aux_event = NULL;
 +              put_event(event);
 +
 +              /*
 +               * If it's ACTIVE, schedule it out and put it into ERROR
 +               * state so that we don't try to schedule it again. Note
 +               * that perf_event_enable() will clear the ERROR status.
 +               */
 +              event_sched_out(iter, cpuctx, ctx);
 +              perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +      }
 +}
 +
 +static int perf_get_aux_event(struct perf_event *event,
 +                            struct perf_event *group_leader)
 +{
 +      /*
 +       * Our group leader must be an aux event if we want to be
 +       * an aux_output. This way, the aux event will precede its
 +       * aux_output events in the group, and therefore will always
 +       * schedule first.
 +       */
 +      if (!group_leader)
 +              return 0;
 +
 +      if (!perf_aux_output_match(event, group_leader))
 +              return 0;
 +
 +      if (!atomic_long_inc_not_zero(&group_leader->refcount))
 +              return 0;
 +
 +      /*
 +       * Link aux_outputs to their aux event; this is undone in
 +       * perf_group_detach() by perf_put_aux_event(). When the
 +       * group in torn down, the aux_output events loose their
 +       * link to the aux_event and can't schedule any more.
 +       */
 +      event->aux_event = group_leader;
 +
 +      return 1;
 +}
 +
  static void perf_group_detach(struct perf_event *event)
  {
        struct perf_event *sibling, *tmp;
  
        event->attach_state &= ~PERF_ATTACH_GROUP;
  
 +      perf_put_aux_event(event);
 +
        /*
         * If this is a sibling, remove it from its group.
         */
@@@ -4174,8 -4089,10 +4174,8 @@@ alloc_perf_context(struct pmu *pmu, str
                return NULL;
  
        __perf_event_init_context(ctx);
 -      if (task) {
 -              ctx->task = task;
 -              get_task_struct(task);
 -      }
 +      if (task)
 +              ctx->task = get_task_struct(task);
        ctx->pmu = pmu;
  
        return ctx;
@@@ -9574,7 -9491,7 +9574,7 @@@ static void perf_swevent_start_hrtimer(
                period = max_t(u64, 10000, hwc->sample_period);
        }
        hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
-                     HRTIMER_MODE_REL_PINNED);
+                     HRTIMER_MODE_REL_PINNED_HARD);
  }
  
  static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@@ -9596,7 -9513,7 +9596,7 @@@ static void perf_swevent_init_hrtimer(s
        if (!is_sampling_event(event))
                return;
  
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hwc->hrtimer.function = perf_swevent_hrtimer;
  
        /*
@@@ -10438,7 -10355,8 +10438,7 @@@ perf_event_alloc(struct perf_event_att
                 * and we cannot use the ctx information because we need the
                 * pmu before we get a ctx.
                 */
 -              get_task_struct(task);
 -              event->hw.target = task;
 +              event->hw.target = get_task_struct(task);
        }
  
        event->clock = &local_clock;
                goto err_ns;
        }
  
 +      if (event->attr.aux_output &&
 +          !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
 +              err = -EOPNOTSUPP;
 +              goto err_pmu;
 +      }
 +
        err = exclusive_event_init(event);
        if (err)
                goto err_pmu;
@@@ -11170,8 -11082,6 +11170,8 @@@ SYSCALL_DEFINE5(perf_event_open
                }
        }
  
 +      if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
 +              goto err_locked;
  
        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
@@@ -11364,7 -11274,7 +11364,7 @@@ perf_event_create_kernel_counter(struc
                goto err_unlock;
        }
  
 -      perf_install_in_context(ctx, event, cpu);
 +      perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);
  
diff --combined kernel/fork.c
index 1d1cd06edbc178daa6347b85b447af3bf83e8d90,f1228d9f0b11747ca1b030cb831ecb0863e7af9c..53e780748fe3367973182edd594c3e27fc9108fd
@@@ -726,7 -726,7 +726,7 @@@ void __put_task_struct(struct task_stru
        WARN_ON(tsk == current);
  
        cgroup_free(tsk);
 -      task_numa_free(tsk);
 +      task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
@@@ -768,7 -768,6 +768,7 @@@ static void set_max_threads(unsigned in
  int arch_task_struct_size __read_mostly;
  #endif
  
 +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
  {
        /* Fetch thread_struct whitelist for the architecture. */
        else
                *offset += offsetof(struct task_struct, thread);
  }
 +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
  
  void __init fork_init(void)
  {
@@@ -1519,28 -1517,17 +1519,17 @@@ void __cleanup_sighand(struct sighand_s
        }
  }
  
- #ifdef CONFIG_POSIX_TIMERS
  /*
   * Initialize POSIX timer handling for a thread group.
   */
  static void posix_cpu_timers_init_group(struct signal_struct *sig)
  {
+       struct posix_cputimers *pct = &sig->posix_cputimers;
        unsigned long cpu_limit;
  
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
-       if (cpu_limit != RLIM_INFINITY) {
-               sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
-               sig->cputimer.running = true;
-       }
-       /* The timer lists. */
-       INIT_LIST_HEAD(&sig->cpu_timers[0]);
-       INIT_LIST_HEAD(&sig->cpu_timers[1]);
-       INIT_LIST_HEAD(&sig->cpu_timers[2]);
+       posix_cputimers_group_init(pct, cpu_limit);
  }
- #else
- static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
- #endif
  
  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
  {
@@@ -1642,23 -1629,6 +1631,6 @@@ static void rt_mutex_init_task(struct t
  #endif
  }
  
- #ifdef CONFIG_POSIX_TIMERS
- /*
-  * Initialize POSIX timer handling for a single task.
-  */
- static void posix_cpu_timers_init(struct task_struct *tsk)
- {
-       tsk->cputime_expires.prof_exp = 0;
-       tsk->cputime_expires.virt_exp = 0;
-       tsk->cputime_expires.sched_exp = 0;
-       INIT_LIST_HEAD(&tsk->cpu_timers[0]);
-       INIT_LIST_HEAD(&tsk->cpu_timers[1]);
-       INIT_LIST_HEAD(&tsk->cpu_timers[2]);
- }
- #else
- static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
- #endif
  static inline void init_task_pid_links(struct task_struct *task)
  {
        enum pid_type type;
@@@ -1692,14 -1662,6 +1664,14 @@@ static inline void rcu_copy_process(str
  #endif /* #ifdef CONFIG_TASKS_RCU */
  }
  
 +struct pid *pidfd_pid(const struct file *file)
 +{
 +      if (file->f_op == &pidfd_fops)
 +              return file->private_data;
 +
 +      return ERR_PTR(-EBADF);
 +}
 +
  static int pidfd_release(struct inode *inode, struct file *file)
  {
        struct pid *pid = file->private_data;
@@@ -1945,7 -1907,7 +1917,7 @@@ static __latent_entropy struct task_str
        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);
  
-       posix_cpu_timers_init(p);
+       posix_cputimers_init(&p->posix_cputimers);
  
        p->io_context = NULL;
        audit_set_context(p, NULL);
@@@ -2348,8 -2310,6 +2320,8 @@@ struct mm_struct *copy_init_mm(void
   *
   * It copies the process, and if successful kick-starts
   * it and waits for it to finish using the VM if required.
 + *
 + * args->exit_signal is expected to be checked for sanity by the caller.
   */
  long _do_fork(struct kernel_clone_args *args)
  {
@@@ -2574,14 -2534,6 +2546,14 @@@ noinline static int copy_clone_args_fro
        if (copy_from_user(&args, uargs, size))
                return -EFAULT;
  
 +      /*
 +       * Verify that higher 32bits of exit_signal are unset and that
 +       * it is a valid signal
 +       */
 +      if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
 +                   !valid_signal(args.exit_signal)))
 +              return -EINVAL;
 +
        *kargs = (struct kernel_clone_args){
                .flags          = args.flags,
                .pidfd          = u64_to_user_ptr(args.pidfd),
diff --combined kernel/sched/core.c
index 06961b997ed6d8c13ced5558520f75b07c85aedc,389e0993fbb4ae87c45ad8c8a72cb93348114a06..5e8387bdd09c65c9b804534afba93a654d39d8a3
@@@ -255,7 -255,7 +255,7 @@@ static void __hrtick_restart(struct rq 
  {
        struct hrtimer *timer = &rq->hrtick_timer;
  
-       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
  }
  
  /*
@@@ -314,7 -314,7 +314,7 @@@ void hrtick_start(struct rq *rq, u64 de
         */
        delay = max_t(u64, delay, 10000LL);
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
-                     HRTIMER_MODE_REL_PINNED);
+                     HRTIMER_MODE_REL_PINNED_HARD);
  }
  #endif /* CONFIG_SMP */
  
@@@ -328,7 -328,7 +328,7 @@@ static void hrtick_rq_init(struct rq *r
        rq->hrtick_csd.info = rq;
  #endif
  
-       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
  }
  #else /* CONFIG_SCHED_HRTICK */
@@@ -773,18 -773,6 +773,18 @@@ static void set_load_weight(struct task
  }
  
  #ifdef CONFIG_UCLAMP_TASK
 +/*
 + * Serializes updates of utilization clamp values
 + *
 + * The (slow-path) user-space triggers utilization clamp value updates which
 + * can require updates on (fast-path) scheduler's data structures used to
 + * support enqueue/dequeue operations.
 + * While the per-CPU rq lock protects fast-path update operations, user-space
 + * requests are serialized using a mutex to reduce the risk of conflicting
 + * updates or API abuses.
 + */
 +static DEFINE_MUTEX(uclamp_mutex);
 +
  /* Max allowed minimum utilization */
  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  
@@@ -810,7 -798,7 +810,7 @@@ static inline unsigned int uclamp_bucke
        return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
  }
  
 -static inline unsigned int uclamp_none(int clamp_id)
 +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
  {
        if (clamp_id == UCLAMP_MIN)
                return 0;
@@@ -826,7 -814,7 +826,7 @@@ static inline void uclamp_se_set(struc
  }
  
  static inline unsigned int
 -uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
 +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                  unsigned int clamp_value)
  {
        /*
        return uclamp_none(UCLAMP_MIN);
  }
  
 -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
 +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                     unsigned int clamp_value)
  {
        /* Reset max-clamp retention only on idle exit */
  }
  
  static inline
 -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
 -                               unsigned int clamp_value)
 +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
 +                                 unsigned int clamp_value)
  {
        struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
        int bucket_id = UCLAMP_BUCKETS - 1;
        return uclamp_idle_value(rq, clamp_id, clamp_value);
  }
  
 +static inline struct uclamp_se
 +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 +{
 +      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      struct uclamp_se uc_max;
 +
 +      /*
 +       * Tasks in autogroups or root task group will be
 +       * restricted by system defaults.
 +       */
 +      if (task_group_is_autogroup(task_group(p)))
 +              return uc_req;
 +      if (task_group(p) == &root_task_group)
 +              return uc_req;
 +
 +      uc_max = task_group(p)->uclamp[clamp_id];
 +      if (uc_req.value > uc_max.value || !uc_req.user_defined)
 +              return uc_max;
 +#endif
 +
 +      return uc_req;
 +}
 +
  /*
   * The effective clamp bucket index of a task depends on, by increasing
   * priority:
   * - the task specific clamp value, when explicitly requested from userspace
 + * - the task group effective clamp value, for tasks not either in the root
 + *   group or in an autogroup
   * - the system default clamp value, defined by the sysadmin
   */
  static inline struct uclamp_se
 -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
 +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
  {
 -      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 +      struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
        struct uclamp_se uc_max = uclamp_default[clamp_id];
  
        /* System default restrictions always apply */
        return uc_req;
  }
  
 -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
 +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  {
        struct uclamp_se uc_eff;
  
   * for each bucket when all its RUNNABLE tasks require the same clamp.
   */
  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
 -                                  unsigned int clamp_id)
 +                                  enum uclamp_id clamp_id)
  {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
   * enforce the expected state and warn.
   */
  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 -                                  unsigned int clamp_id)
 +                                  enum uclamp_id clamp_id)
  {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
  
  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
  
  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
                uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
 +static inline void
 +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
 +{
 +      struct rq_flags rf;
 +      struct rq *rq;
 +
 +      /*
 +       * Lock the task and the rq where the task is (or was) queued.
 +       *
 +       * We might lock the (previous) rq of a !RUNNABLE task, but that's the
 +       * price to pay to safely serialize util_{min,max} updates with
 +       * enqueues, dequeues and migration operations.
 +       * This is the same locking schema used by __set_cpus_allowed_ptr().
 +       */
 +      rq = task_rq_lock(p, &rf);
 +
 +      /*
 +       * Setting the clamp bucket is serialized by task_rq_lock().
 +       * If the task is not yet RUNNABLE and its task_struct is not
 +       * affecting a valid clamp bucket, the next time it's enqueued,
 +       * it will already see the updated clamp bucket value.
 +       */
 +      if (!p->uclamp[clamp_id].active) {
 +              uclamp_rq_dec_id(rq, p, clamp_id);
 +              uclamp_rq_inc_id(rq, p, clamp_id);
 +      }
 +
 +      task_rq_unlock(rq, p, &rf);
 +}
 +
 +static inline void
 +uclamp_update_active_tasks(struct cgroup_subsys_state *css,
 +                         unsigned int clamps)
 +{
 +      enum uclamp_id clamp_id;
 +      struct css_task_iter it;
 +      struct task_struct *p;
 +
 +      css_task_iter_start(css, 0, &it);
 +      while ((p = css_task_iter_next(&it))) {
 +              for_each_clamp_id(clamp_id) {
 +                      if ((0x1 << clamp_id) & clamps)
 +                              uclamp_update_active(p, clamp_id);
 +              }
 +      }
 +      css_task_iter_end(&it);
 +}
 +
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +static void cpu_util_update_eff(struct cgroup_subsys_state *css);
 +static void uclamp_update_root_tg(void)
 +{
 +      struct task_group *tg = &root_task_group;
 +
 +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
 +                    sysctl_sched_uclamp_util_min, false);
 +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
 +                    sysctl_sched_uclamp_util_max, false);
 +
 +      rcu_read_lock();
 +      cpu_util_update_eff(&root_task_group.css);
 +      rcu_read_unlock();
 +}
 +#else
 +static void uclamp_update_root_tg(void) { }
 +#endif
 +
  int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp,
                                loff_t *ppos)
  {
 +      bool update_root_tg = false;
        int old_min, old_max;
 -      static DEFINE_MUTEX(mutex);
        int result;
  
 -      mutex_lock(&mutex);
 +      mutex_lock(&uclamp_mutex);
        old_min = sysctl_sched_uclamp_util_min;
        old_max = sysctl_sched_uclamp_util_max;
  
        if (old_min != sysctl_sched_uclamp_util_min) {
                uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                              sysctl_sched_uclamp_util_min, false);
 +              update_root_tg = true;
        }
        if (old_max != sysctl_sched_uclamp_util_max) {
                uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                              sysctl_sched_uclamp_util_max, false);
 +              update_root_tg = true;
        }
  
 +      if (update_root_tg)
 +              uclamp_update_root_tg();
 +
        /*
 -       * Updating all the RUNNABLE task is expensive, keep it simple and do
 -       * just a lazy update at each next enqueue time.
 +       * We update all RUNNABLE tasks only when task groups are in use.
 +       * Otherwise, keep it simple and do just a lazy update at each next
 +       * task enqueue time.
         */
 +
        goto done;
  
  undo:
        sysctl_sched_uclamp_util_min = old_min;
        sysctl_sched_uclamp_util_max = old_max;
  done:
 -      mutex_unlock(&mutex);
 +      mutex_unlock(&uclamp_mutex);
  
        return result;
  }
@@@ -1187,7 -1075,7 +1187,7 @@@ static int uclamp_validate(struct task_
  static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        /*
         * On scheduling class change, reset to default clamps for tasks
  
  static void uclamp_fork(struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        for_each_clamp_id(clamp_id)
                p->uclamp[clamp_id].active = false;
  static void __init init_uclamp(void)
  {
        struct uclamp_se uc_max = {};
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
        int cpu;
  
 +      mutex_init(&uclamp_mutex);
 +
        for_each_possible_cpu(cpu) {
                memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                cpu_rq(cpu)->uclamp_flags = 0;
  
        /* System defaults allow max clamp values for both indexes */
        uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
 -      for_each_clamp_id(clamp_id)
 +      for_each_clamp_id(clamp_id) {
                uclamp_default[clamp_id] = uc_max;
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +              root_task_group.uclamp_req[clamp_id] = uc_max;
 +              root_task_group.uclamp[clamp_id] = uc_max;
 +#endif
 +      }
  }
  
  #else /* CONFIG_UCLAMP_TASK */
@@@ -1613,7 -1494,7 +1613,7 @@@ void do_set_cpus_allowed(struct task_st
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  }
  
  /*
@@@ -3333,8 -3214,12 +3333,8 @@@ static __always_inline struct rq 
  context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
  {
 -      struct mm_struct *mm, *oldmm;
 -
        prepare_task_switch(rq, prev, next);
  
 -      mm = next->mm;
 -      oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
        arch_start_context_switch(prev);
  
        /*
 -       * If mm is non-NULL, we pass through switch_mm(). If mm is
 -       * NULL, we will pass through mmdrop() in finish_task_switch().
 -       * Both of these contain the full memory barrier required by
 -       * membarrier after storing to rq->curr, before returning to
 -       * user-space.
 +       * kernel -> kernel   lazy + transfer active
 +       *   user -> kernel   lazy + mmgrab() active
 +       *
 +       * kernel ->   user   switch + mmdrop() active
 +       *   user ->   user   switch
         */
 -      if (!mm) {
 -              next->active_mm = oldmm;
 -              mmgrab(oldmm);
 -              enter_lazy_tlb(oldmm, next);
 -      } else
 -              switch_mm_irqs_off(oldmm, mm, next);
 +      if (!next->mm) {                                // to kernel
 +              enter_lazy_tlb(prev->active_mm, next);
 +
 +              next->active_mm = prev->active_mm;
 +              if (prev->mm)                           // from user
 +                      mmgrab(prev->active_mm);
 +              else
 +                      prev->active_mm = NULL;
 +      } else {                                        // to user
 +              /*
 +               * sys_membarrier() requires an smp_mb() between setting
 +               * rq->curr and returning to userspace.
 +               *
 +               * The below provides this either through switch_mm(), or in
 +               * case 'prev->active_mm == next->mm' through
 +               * finish_task_switch()'s mmdrop().
 +               */
  
 -      if (!prev->mm) {
 -              prev->active_mm = NULL;
 -              rq->prev_mm = oldmm;
 +              switch_mm_irqs_off(prev->active_mm, next->mm, next);
 +
 +              if (!prev->mm) {                        // from kernel
 +                      /* will mmdrop() in finish_task_switch(). */
 +                      rq->prev_mm = prev->active_mm;
 +                      prev->active_mm = NULL;
 +              }
        }
  
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
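
The four-way table in the new comment boils down to one rule: a kernel thread (next->mm == NULL) borrows the previous task's active_mm, taking a reference only when that mm belonged to a user task. A toy refcount model of the two "to kernel" rows, with invented struct names rather than the real mm_struct/task_struct, compiles as plain C:

#include <assert.h>
#include <stddef.h>

struct mm   { int refs; };			/* stand-in for mm_struct   */
struct task { struct mm *mm, *active_mm; };	/* stand-in for task_struct */

/* "to kernel" rows of the table above: borrow prev's address space */
static void switch_to_kthread(struct task *prev, struct task *next)
{
	next->active_mm = prev->active_mm;
	if (prev->mm)				/* user -> kernel: mmgrab()   */
		prev->active_mm->refs++;
	else					/* kernel -> kernel: transfer */
		prev->active_mm = NULL;
}

int main(void)
{
	struct mm user_mm = { .refs = 1 };
	struct task user = { .mm = &user_mm, .active_mm = &user_mm };
	struct task kthread = { .mm = NULL, .active_mm = NULL };

	switch_to_kthread(&user, &kthread);
	assert(user_mm.refs == 2);		/* pinned while borrowed */
	return 0;
}
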
@@@ -3616,36 -3486,8 +3616,36 @@@ void scheduler_tick(void
  
  struct tick_work {
        int                     cpu;
 +      atomic_t                state;
        struct delayed_work     work;
  };
 +/* Values for ->state, see diagram below. */
 +#define TICK_SCHED_REMOTE_OFFLINE     0
 +#define TICK_SCHED_REMOTE_OFFLINING   1
 +#define TICK_SCHED_REMOTE_RUNNING     2
 +
 +/*
 + * State diagram for ->state:
 + *
 + *
 + *          TICK_SCHED_REMOTE_OFFLINE
 + *                    |   ^
 + *                    |   |
 + *                    |   | sched_tick_remote()
 + *                    |   |
 + *                    |   |
 + *                    +--TICK_SCHED_REMOTE_OFFLINING
 + *                    |   ^
 + *                    |   |
 + * sched_tick_start() |   | sched_tick_stop()
 + *                    |   |
 + *                    V   |
 + *          TICK_SCHED_REMOTE_RUNNING
 + *
 + *
 + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
 + * and sched_tick_start() are happy to leave the state in RUNNING.
 + */
  
  static struct tick_work __percpu *tick_work_cpu;
  
@@@ -3658,7 -3500,6 +3658,7 @@@ static void sched_tick_remote(struct wo
        struct task_struct *curr;
        struct rq_flags rf;
        u64 delta;
 +      int os;
  
        /*
         * Handle the tick only if it appears the remote CPU is running in full
  
        rq_lock_irq(rq, &rf);
        curr = rq->curr;
 -      if (is_idle_task(curr))
 +      if (is_idle_task(curr) || cpu_is_offline(cpu))
                goto out_unlock;
  
        update_rq_clock(rq);
@@@ -3692,18 -3533,13 +3692,18 @@@ out_requeue
        /*
         * Run the remote tick once per second (1Hz). This arbitrary
         * frequency is large enough to avoid overload but short enough
 -       * to keep scheduler internal stats reasonably up to date.
 +       * to keep scheduler internal stats reasonably up to date.  But
 +       * first update state to reflect hotplug activity if required.
         */
 -      queue_delayed_work(system_unbound_wq, dwork, HZ);
 +      os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
 +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
 +      if (os == TICK_SCHED_REMOTE_RUNNING)
 +              queue_delayed_work(system_unbound_wq, dwork, HZ);
  }
  
  static void sched_tick_start(int cpu)
  {
 +      int os;
        struct tick_work *twork;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
        WARN_ON_ONCE(!tick_work_cpu);
  
        twork = per_cpu_ptr(tick_work_cpu, cpu);
 -      twork->cpu = cpu;
 -      INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
 -      queue_delayed_work(system_unbound_wq, &twork->work, HZ);
 +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
 +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
 +      if (os == TICK_SCHED_REMOTE_OFFLINE) {
 +              twork->cpu = cpu;
 +              INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
 +              queue_delayed_work(system_unbound_wq, &twork->work, HZ);
 +      }
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
  static void sched_tick_stop(int cpu)
  {
        struct tick_work *twork;
 +      int os;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
                return;
        WARN_ON_ONCE(!tick_work_cpu);
  
        twork = per_cpu_ptr(tick_work_cpu, cpu);
 -      cancel_delayed_work_sync(&twork->work);
 +      /* There cannot be competing actions, but don't rely on stop-machine. */
 +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
 +      WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
 +      /* Don't cancel, as this would mess up the state machine. */
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
@@@ -3744,6 -3572,7 +3744,6 @@@ int __init sched_tick_offload_init(void
  {
        tick_work_cpu = alloc_percpu(struct tick_work);
        BUG_ON(!tick_work_cpu);
 -
        return 0;
  }
  
@@@ -3752,7 -3581,7 +3752,7 @@@ static inline void sched_tick_start(in
  static inline void sched_tick_stop(int cpu) { }
  #endif
  
 -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_TRACE_PREEMPT_TOGGLE))
  /*
   * If the value passed in is equal to the current preempt count
@@@ -3910,7 -3739,7 +3910,7 @@@ pick_next_task(struct rq *rq, struct ta
  
                p = fair_sched_class.pick_next_task(rq, prev, rf);
                if (unlikely(p == RETRY_TASK))
 -                      goto again;
 +                      goto restart;
  
                /* Assumes fair_sched_class->next == idle_sched_class */
                if (unlikely(!p))
                return p;
        }
  
 -again:
 +restart:
 +      /*
 +       * Ensure that we put DL/RT tasks before the pick loop, such that they
 +       * can PULL higher prio tasks when we lower the RQ 'priority'.
 +       */
 +      prev->sched_class->put_prev_task(rq, prev, rf);
 +      if (!rq->nr_running)
 +              newidle_balance(rq, rf);
 +
        for_each_class(class) {
 -              p = class->pick_next_task(rq, prev, rf);
 -              if (p) {
 -                      if (unlikely(p == RETRY_TASK))
 -                              goto again;
 +              p = class->pick_next_task(rq, NULL, NULL);
 +              if (p)
                        return p;
 -              }
        }
  
        /* The idle class should always have a runnable task: */
   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   *      called on the nearest possible occasion:
   *
 - *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 + *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
   *
   *         - in syscall or exception context, at the next outmost
   *           preempt_enable(). (this might be as soon as the wake_up()'s
   *         - in IRQ context, return from interrupt-handler to
   *           preemptible context
   *
 - *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 + *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
   *         then at the next:
   *
   *          - cond_resched() call
@@@ -4080,7 -3904,7 +4080,7 @@@ void __noreturn do_task_dead(void
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
 -      if (!tsk->state || tsk_is_pi_blocked(tsk))
 +      if (!tsk->state)
                return;
  
        /*
                preempt_enable_no_resched();
        }
  
 +      if (tsk_is_pi_blocked(tsk))
 +              return;
 +
        /*
         * If we are going to sleep and we have plugged IO queued,
         * make sure to submit it to avoid deadlocks.
@@@ -4212,7 -4033,7 +4212,7 @@@ static void __sched notrace preempt_sch
        } while (need_resched());
  }
  
 -#ifdef CONFIG_PREEMPT
 +#ifdef CONFIG_PREEMPTION
  /*
   * this is the entry point to schedule() from in-kernel preemption
   * off of preempt_enable. Kernel preemptions off return from interrupt
@@@ -4284,7 -4105,7 +4284,7 @@@ asmlinkage __visible void __sched notra
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
 -#endif /* CONFIG_PREEMPT */
 +#endif /* CONFIG_PREEMPTION */
  
  /*
   * this is the entry point to schedule() from kernel preemption
@@@ -4452,7 -4273,7 +4452,7 @@@ void rt_mutex_setprio(struct task_struc
        if (queued)
                enqueue_task(rq, p, queue_flag);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  
        check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@@ -4519,7 -4340,7 +4519,7 @@@ void set_user_nice(struct task_struct *
                        resched_curr(rq);
        }
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  out_unlock:
        task_rq_unlock(rq, p, &rf);
  }
@@@ -4836,9 -4657,6 +4836,9 @@@ recheck
                        return retval;
        }
  
 +      if (pi)
 +              cpuset_read_lock();
 +
        /*
         * Make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
         * Changing the policy of the stop threads its a very bad idea:
         */
        if (p == rq->stop) {
 -              task_rq_unlock(rq, p, &rf);
 -              return -EINVAL;
 +              retval = -EINVAL;
 +              goto unlock;
        }
  
        /*
                        goto change;
  
                p->sched_reset_on_fork = reset_on_fork;
 -              task_rq_unlock(rq, p, &rf);
 -              return 0;
 +              retval = 0;
 +              goto unlock;
        }
  change:
  
                if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                !task_group_is_autogroup(task_group(p))) {
 -                      task_rq_unlock(rq, p, &rf);
 -                      return -EPERM;
 +                      retval = -EPERM;
 +                      goto unlock;
                }
  #endif
  #ifdef CONFIG_SMP
                         */
                        if (!cpumask_subset(span, p->cpus_ptr) ||
                            rq->rd->dl_bw.bw == 0) {
 -                              task_rq_unlock(rq, p, &rf);
 -                              return -EPERM;
 +                              retval = -EPERM;
 +                              goto unlock;
                        }
                }
  #endif
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
                task_rq_unlock(rq, p, &rf);
 +              if (pi)
 +                      cpuset_read_unlock();
                goto recheck;
        }
  
         * is available.
         */
        if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
 -              task_rq_unlock(rq, p, &rf);
 -              return -EBUSY;
 +              retval = -EBUSY;
 +              goto unlock;
        }
  
        p->sched_reset_on_fork = reset_on_fork;
                enqueue_task(rq, p, queue_flags);
        }
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  
        check_class_changed(rq, p, prev_class, oldprio);
  
        preempt_disable();
        task_rq_unlock(rq, p, &rf);
  
 -      if (pi)
 +      if (pi) {
 +              cpuset_read_unlock();
                rt_mutex_adjust_pi(p);
 +      }
  
        /* Run balance callbacks after we've adjusted the PI chain: */
        balance_callback(rq);
        preempt_enable();
  
        return 0;
 +
 +unlock:
 +      task_rq_unlock(rq, p, &rf);
 +      if (pi)
 +              cpuset_read_unlock();
 +      return retval;
  }
  
  static int _sched_setscheduler(struct task_struct *p, int policy,
@@@ -5074,15 -4882,10 +5074,15 @@@ do_sched_setscheduler(pid_t pid, int po
        rcu_read_lock();
        retval = -ESRCH;
        p = find_process_by_pid(pid);
 -      if (p != NULL)
 -              retval = sched_setscheduler(p, policy, &lparam);
 +      if (likely(p))
 +              get_task_struct(p);
        rcu_read_unlock();
  
 +      if (likely(p)) {
 +              retval = sched_setscheduler(p, policy, &lparam);
 +              put_task_struct(p);
 +      }
 +
        return retval;
  }
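
do_sched_setscheduler() now pins the task with get_task_struct() while still inside the RCU read section and only calls sched_setscheduler() after rcu_read_unlock(), so the potentially sleeping path no longer runs under RCU. The same lookup-pin-use shape, with an invented lock-protected table standing in for the task list, looks like:

#include <pthread.h>
#include <stdio.h>

struct obj { int id; int refs; };

static struct obj table[2] = { { .id = 1, .refs = 1 }, { .id = 2, .refs = 1 } };
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *lookup(int id)		/* call with table_lock held */
{
	for (unsigned i = 0; i < 2; i++)
		if (table[i].id == id)
			return &table[i];
	return NULL;
}

static void slow_operation(struct obj *o)	/* may block; lock not held  */
{
	printf("operating on %d\n", o->id);
}

static int operate_on(int id)
{
	struct obj *o;

	pthread_mutex_lock(&table_lock);
	o = lookup(id);
	if (o)
		o->refs++;		/* pin, like get_task_struct()   */
	pthread_mutex_unlock(&table_lock);

	if (!o)
		return -1;		/* -ESRCH counterpart            */

	slow_operation(o);		/* safe: we hold our own reference */
	o->refs--;			/* put_task_struct() counterpart */
	return 0;
}

int main(void)
{
	return operate_on(1);
}
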
  
@@@ -5299,40 -5102,37 +5299,40 @@@ out_unlock
        return retval;
  }
  
 -static int sched_read_attr(struct sched_attr __user *uattr,
 -                         struct sched_attr *attr,
 -                         unsigned int usize)
 +/*
 + * Copy the kernel size attribute structure (which might be larger
 + * than what user-space knows about) to user-space.
 + *
 + * Note that all cases are valid: user-space buffer can be larger or
 + * smaller than the kernel-space buffer. The usual case is that both
 + * have the same size.
 + */
 +static int
 +sched_attr_copy_to_user(struct sched_attr __user *uattr,
 +                      struct sched_attr *kattr,
 +                      unsigned int usize)
  {
 -      int ret;
 +      unsigned int ksize = sizeof(*kattr);
  
        if (!access_ok(uattr, usize))
                return -EFAULT;
  
        /*
 -       * If we're handed a smaller struct than we know of,
 -       * ensure all the unknown bits are 0 - i.e. old
 -       * user-space does not get uncomplete information.
 +       * sched_getattr() ABI forwards and backwards compatibility:
 +       *
 +       * If usize == ksize then we just copy everything to user-space and all is good.
 +       *
 +       * If usize < ksize then we only copy as much as user-space has space for,
 +       * this keeps ABI compatibility as well. We skip the rest.
 +       *
 +       * If usize > ksize then user-space is using a newer version of the ABI,
 +       * which part the kernel doesn't know about. Just ignore it - tooling can
 +       * detect the kernel's knowledge of attributes from the attr->size value
 +       * which is set to ksize in this case.
         */
 -      if (usize < sizeof(*attr)) {
 -              unsigned char *addr;
 -              unsigned char *end;
 -
 -              addr = (void *)attr + usize;
 -              end  = (void *)attr + sizeof(*attr);
 -
 -              for (; addr < end; addr++) {
 -                      if (*addr)
 -                              return -EFBIG;
 -              }
 -
 -              attr->size = usize;
 -      }
 +      kattr->size = min(usize, ksize);
  
 -      ret = copy_to_user(uattr, attr, attr->size);
 -      if (ret)
 +      if (copy_to_user(uattr, kattr, kattr->size))
                return -EFAULT;
  
        return 0;
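
The new helper clamps the copy size to min(usize, ksize) and stores that value in kattr->size, so both older and newer user-space see a consistent struct. A freestanding illustration of the same forwards/backwards-compatible copy, using a dummy struct and memcpy() in place of copy_to_user():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct kern_attr {			/* what the "kernel" knows about */
	uint32_t size;
	uint32_t flags;
	uint64_t newer_field;		/* unknown to old user-space     */
};

/* Copy at most what the caller has room for; report how much we know. */
static int copy_attr_out(void *ubuf, unsigned int usize, struct kern_attr *k)
{
	unsigned int ksize = sizeof(*k);

	k->size = usize < ksize ? usize : ksize;
	memcpy(ubuf, k, k->size);	/* stands in for copy_to_user()  */
	return 0;
}

int main(void)
{
	struct kern_attr k = { .flags = 1, .newer_field = 42 };
	unsigned char old_userspace_buf[8];	/* pretend VER0-sized buffer */

	copy_attr_out(old_userspace_buf, sizeof(old_userspace_buf), &k);
	printf("copied %u of %zu bytes\n", (unsigned)k.size, sizeof(k));
	return 0;
}
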
   * sys_sched_getattr - similar to sched_getparam, but with sched_attr
   * @pid: the pid in question.
   * @uattr: structure containing the extended parameters.
 - * @size: sizeof(attr) for fwd/bwd comp.
 + * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
   * @flags: for future extension.
   */
  SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 -              unsigned int, size, unsigned int, flags)
 +              unsigned int, usize, unsigned int, flags)
  {
 -      struct sched_attr attr = {
 -              .size = sizeof(struct sched_attr),
 -      };
 +      struct sched_attr kattr = { };
        struct task_struct *p;
        int retval;
  
 -      if (!uattr || pid < 0 || size > PAGE_SIZE ||
 -          size < SCHED_ATTR_SIZE_VER0 || flags)
 +      if (!uattr || pid < 0 || usize > PAGE_SIZE ||
 +          usize < SCHED_ATTR_SIZE_VER0 || flags)
                return -EINVAL;
  
        rcu_read_lock();
        if (retval)
                goto out_unlock;
  
 -      attr.sched_policy = p->policy;
 +      kattr.sched_policy = p->policy;
        if (p->sched_reset_on_fork)
 -              attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 +              kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
        if (task_has_dl_policy(p))
 -              __getparam_dl(p, &attr);
 +              __getparam_dl(p, &kattr);
        else if (task_has_rt_policy(p))
 -              attr.sched_priority = p->rt_priority;
 +              kattr.sched_priority = p->rt_priority;
        else
 -              attr.sched_nice = task_nice(p);
 +              kattr.sched_nice = task_nice(p);
  
  #ifdef CONFIG_UCLAMP_TASK
 -      attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 -      attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 +      kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 +      kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
  #endif
  
        rcu_read_unlock();
  
 -      retval = sched_read_attr(uattr, &attr, size);
 -      return retval;
 +      return sched_attr_copy_to_user(uattr, &kattr, usize);
  
  out_unlock:
        rcu_read_unlock();
@@@ -5613,7 -5416,7 +5613,7 @@@ SYSCALL_DEFINE0(sched_yield
        return 0;
  }
  
 -#ifndef CONFIG_PREEMPT
 +#ifndef CONFIG_PREEMPTION
  int __sched _cond_resched(void)
  {
        if (should_resched(0)) {
@@@ -5630,7 -5433,7 +5630,7 @@@ EXPORT_SYMBOL(_cond_resched)
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
 - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
@@@ -6169,7 -5972,7 +6169,7 @@@ void sched_setnuma(struct task_struct *
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
        task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@@ -6209,22 -6012,21 +6209,22 @@@ static void calc_load_migrate(struct r
                atomic_long_add(delta, &calc_load_tasks);
  }
  
 -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
 +static struct task_struct *__pick_migrate_task(struct rq *rq)
  {
 -}
 +      const struct sched_class *class;
 +      struct task_struct *next;
  
 -static const struct sched_class fake_sched_class = {
 -      .put_prev_task = put_prev_task_fake,
 -};
 +      for_each_class(class) {
 +              next = class->pick_next_task(rq, NULL, NULL);
 +              if (next) {
 +                      next->sched_class->put_prev_task(rq, next, NULL);
 +                      return next;
 +              }
 +      }
  
 -static struct task_struct fake_task = {
 -      /*
 -       * Avoid pull_{rt,dl}_task()
 -       */
 -      .prio = MAX_PRIO + 1,
 -      .sched_class = &fake_sched_class,
 -};
 +      /* The idle class should always have a runnable task */
 +      BUG();
 +}
  
  /*
   * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@@ -6267,7 -6069,12 +6267,7 @@@ static void migrate_tasks(struct rq *de
                if (rq->nr_running == 1)
                        break;
  
 -              /*
 -               * pick_next_task() assumes pinned rq->lock:
 -               */
 -              next = pick_next_task(rq, &fake_task, rf);
 -              BUG_ON(!next);
 -              put_prev_task(rq, next);
 +              next = __pick_migrate_task(rq);
  
                /*
                 * Rules for changing task_struct::cpus_mask are holding
@@@ -6564,19 -6371,19 +6564,19 @@@ DECLARE_PER_CPU(cpumask_var_t, select_i
  
  void __init sched_init(void)
  {
 -      unsigned long alloc_size = 0, ptr;
 +      unsigned long ptr = 0;
        int i;
  
        wait_bit_init();
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 +      ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
 -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 +      ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
 -      if (alloc_size) {
 -              ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 +      if (ptr) {
 +              ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.se = (struct sched_entity **)ptr;
@@@ -6895,7 -6702,7 +6895,7 @@@ struct task_struct *curr_task(int cpu
  
  #ifdef CONFIG_IA64
  /**
 - * set_curr_task - set the current task for a given CPU.
 + * ia64_set_curr_task - set the current task for a given CPU.
   * @cpu: the processor in question.
   * @p: the task pointer to set.
   *
@@@ -6920,20 -6727,6 +6920,20 @@@ void ia64_set_curr_task(int cpu, struc
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
 +static inline void alloc_uclamp_sched_group(struct task_group *tg,
 +                                          struct task_group *parent)
 +{
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      enum uclamp_id clamp_id;
 +
 +      for_each_clamp_id(clamp_id) {
 +              uclamp_se_set(&tg->uclamp_req[clamp_id],
 +                            uclamp_none(clamp_id), false);
 +              tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
 +      }
 +#endif
 +}
 +
  static void sched_free_group(struct task_group *tg)
  {
        free_fair_sched_group(tg);
@@@ -6957,8 -6750,6 +6957,8 @@@ struct task_group *sched_create_group(s
        if (!alloc_rt_sched_group(tg, parent))
                goto err;
  
 +      alloc_uclamp_sched_group(tg, parent);
 +
        return tg;
  
  err:
@@@ -7062,7 -6853,7 +7062,7 @@@ void sched_move_task(struct task_struc
        if (queued)
                enqueue_task(rq, tsk, queue_flags);
        if (running)
 -              set_curr_task(rq, tsk);
 +              set_next_task(rq, tsk);
  
        task_rq_unlock(rq, tsk, &rf);
  }
@@@ -7145,6 -6936,10 +7145,6 @@@ static int cpu_cgroup_can_attach(struc
  #ifdef CONFIG_RT_GROUP_SCHED
                if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
 -#else
 -              /* We don't support RT-tasks being in separate groups */
 -              if (task->sched_class != &fair_sched_class)
 -                      return -EINVAL;
  #endif
                /*
                 * Serialize against wake_up_new_task() such that if its
@@@ -7175,178 -6970,6 +7175,178 @@@ static void cpu_cgroup_attach(struct cg
                sched_move_task(task);
  }
  
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +static void cpu_util_update_eff(struct cgroup_subsys_state *css)
 +{
 +      struct cgroup_subsys_state *top_css = css;
 +      struct uclamp_se *uc_parent = NULL;
 +      struct uclamp_se *uc_se = NULL;
 +      unsigned int eff[UCLAMP_CNT];
 +      enum uclamp_id clamp_id;
 +      unsigned int clamps;
 +
 +      css_for_each_descendant_pre(css, top_css) {
 +              uc_parent = css_tg(css)->parent
 +                      ? css_tg(css)->parent->uclamp : NULL;
 +
 +              for_each_clamp_id(clamp_id) {
 +                      /* Assume effective clamps matches requested clamps */
 +                      eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
 +                      /* Cap effective clamps with parent's effective clamps */
 +                      if (uc_parent &&
 +                          eff[clamp_id] > uc_parent[clamp_id].value) {
 +                              eff[clamp_id] = uc_parent[clamp_id].value;
 +                      }
 +              }
 +              /* Ensure protection is always capped by limit */
 +              eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
 +
 +              /* Propagate most restrictive effective clamps */
 +              clamps = 0x0;
 +              uc_se = css_tg(css)->uclamp;
 +              for_each_clamp_id(clamp_id) {
 +                      if (eff[clamp_id] == uc_se[clamp_id].value)
 +                              continue;
 +                      uc_se[clamp_id].value = eff[clamp_id];
 +                      uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
 +                      clamps |= (0x1 << clamp_id);
 +              }
 +              if (!clamps) {
 +                      css = css_rightmost_descendant(css);
 +                      continue;
 +              }
 +
 +              /* Immediately update descendants RUNNABLE tasks */
 +              uclamp_update_active_tasks(css, clamps);
 +      }
 +}
 +
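
cpu_util_update_eff() walks the cgroup subtree top-down and caps each group's requested clamp by its parent's already-computed effective clamp, finally making sure the MIN clamp never exceeds the MAX clamp. The core of that propagation, reduced to one parent/child pair with plain integers (not the kernel data structures):

#include <stdio.h>

enum { UC_MIN, UC_MAX, UC_CNT };

struct group {
	unsigned int req[UC_CNT];	/* requested clamps              */
	unsigned int eff[UC_CNT];	/* effective (propagated) clamps */
};

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* Effective clamp = request capped by the parent's effective clamp. */
static void update_eff(struct group *g, const struct group *parent)
{
	for (int id = 0; id < UC_CNT; id++) {
		g->eff[id] = g->req[id];
		if (parent)
			g->eff[id] = min_u(g->eff[id], parent->eff[id]);
	}
	/* protection (MIN) can never exceed the limit (MAX) */
	g->eff[UC_MIN] = min_u(g->eff[UC_MIN], g->eff[UC_MAX]);
}

int main(void)
{
	struct group root  = { .req = { 1024, 1024 }, .eff = { 1024, 1024 } };
	struct group child = { .req = {  800,  512 } };

	update_eff(&child, &root);
	printf("child eff: min=%u max=%u\n", child.eff[UC_MIN], child.eff[UC_MAX]);
	return 0;	/* prints min=512 max=512 */
}
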
 +/*
 + * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
 + * C expression. Since there is no way to convert a macro argument (N) into a
 + * character constant, use two levels of macros.
 + */
 +#define _POW10(exp) ((unsigned int)1e##exp)
 +#define POW10(exp) _POW10(exp)
 +
 +struct uclamp_request {
 +#define UCLAMP_PERCENT_SHIFT  2
 +#define UCLAMP_PERCENT_SCALE  (100 * POW10(UCLAMP_PERCENT_SHIFT))
 +      s64 percent;
 +      u64 util;
 +      int ret;
 +};
 +
 +static inline struct uclamp_request
 +capacity_from_percent(char *buf)
 +{
 +      struct uclamp_request req = {
 +              .percent = UCLAMP_PERCENT_SCALE,
 +              .util = SCHED_CAPACITY_SCALE,
 +              .ret = 0,
 +      };
 +
 +      buf = strim(buf);
 +      if (strcmp(buf, "max")) {
 +              req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
 +                                           &req.percent);
 +              if (req.ret)
 +                      return req;
 +              if (req.percent > UCLAMP_PERCENT_SCALE) {
 +                      req.ret = -ERANGE;
 +                      return req;
 +              }
 +
 +              req.util = req.percent << SCHED_CAPACITY_SHIFT;
 +              req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
 +      }
 +
 +      return req;
 +}
 +
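
capacity_from_percent() parses a percentage with two fractional digits (so "max" corresponds to 10000 in that fixed-point scale) and converts it into a utilization value out of SCHED_CAPACITY_SCALE (1024) with a rounded division. The arithmetic, detached from the cgroup plumbing:

#include <stdint.h>
#include <stdio.h>

#define PERCENT_SCALE	10000U			/* 100.00% with two fractional digits */
#define CAPACITY_SHIFT	10
#define CAPACITY_SCALE	(1U << CAPACITY_SHIFT)	/* 1024 */

/* percent is in hundredths of a percent, e.g. 5000 == "50.00" */
static unsigned int percent_to_util(uint64_t percent)
{
	uint64_t util = percent << CAPACITY_SHIFT;

	/* rounded division, like DIV_ROUND_CLOSEST_ULL() */
	return (unsigned int)((util + PERCENT_SCALE / 2) / PERCENT_SCALE);
}

int main(void)
{
	printf("50.00%%  -> %u\n", percent_to_util(5000));	/* 512  */
	printf("12.50%%  -> %u\n", percent_to_util(1250));	/* 128  */
	printf("100.00%% -> %u\n", percent_to_util(10000));	/* 1024 */
	return 0;
}
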
 +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
 +                              size_t nbytes, loff_t off,
 +                              enum uclamp_id clamp_id)
 +{
 +      struct uclamp_request req;
 +      struct task_group *tg;
 +
 +      req = capacity_from_percent(buf);
 +      if (req.ret)
 +              return req.ret;
 +
 +      mutex_lock(&uclamp_mutex);
 +      rcu_read_lock();
 +
 +      tg = css_tg(of_css(of));
 +      if (tg->uclamp_req[clamp_id].value != req.util)
 +              uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
 +
 +      /*
 +       * Because the conversion rounding is not recoverable, we keep track
 +       * of the exact requested value.
 +       */
 +      tg->uclamp_pct[clamp_id] = req.percent;
 +
 +      /* Update effective clamps to track the most restrictive value */
 +      cpu_util_update_eff(of_css(of));
 +
 +      rcu_read_unlock();
 +      mutex_unlock(&uclamp_mutex);
 +
 +      return nbytes;
 +}
 +
 +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
 +                                  char *buf, size_t nbytes,
 +                                  loff_t off)
 +{
 +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
 +}
 +
 +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
 +                                  char *buf, size_t nbytes,
 +                                  loff_t off)
 +{
 +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
 +}
 +
 +static inline void cpu_uclamp_print(struct seq_file *sf,
 +                                  enum uclamp_id clamp_id)
 +{
 +      struct task_group *tg;
 +      u64 util_clamp;
 +      u64 percent;
 +      u32 rem;
 +
 +      rcu_read_lock();
 +      tg = css_tg(seq_css(sf));
 +      util_clamp = tg->uclamp_req[clamp_id].value;
 +      rcu_read_unlock();
 +
 +      if (util_clamp == SCHED_CAPACITY_SCALE) {
 +              seq_puts(sf, "max\n");
 +              return;
 +      }
 +
 +      percent = tg->uclamp_pct[clamp_id];
 +      percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
 +      seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
 +}
 +
 +static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
 +{
 +      cpu_uclamp_print(sf, UCLAMP_MIN);
 +      return 0;
 +}
 +
 +static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
 +{
 +      cpu_uclamp_print(sf, UCLAMP_MAX);
 +      return 0;
 +}
 +#endif /* CONFIG_UCLAMP_TASK_GROUP */
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
@@@ -7691,20 -7314,6 +7691,20 @@@ static struct cftype cpu_legacy_files[
                .read_u64 = cpu_rt_period_read_uint,
                .write_u64 = cpu_rt_period_write_uint,
        },
 +#endif
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      {
 +              .name = "uclamp.min",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_min_show,
 +              .write = cpu_uclamp_min_write,
 +      },
 +      {
 +              .name = "uclamp.max",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_max_show,
 +              .write = cpu_uclamp_max_write,
 +      },
  #endif
        { }     /* Terminate */
  };
@@@ -7872,20 -7481,6 +7872,20 @@@ static struct cftype cpu_files[] = 
                .seq_show = cpu_max_show,
                .write = cpu_max_write,
        },
 +#endif
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      {
 +              .name = "uclamp.min",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_min_show,
 +              .write = cpu_uclamp_min_write,
 +      },
 +      {
 +              .name = "uclamp.max",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_max_show,
 +              .write = cpu_uclamp_max_write,
 +      },
  #endif
        { }     /* terminate */
  };
diff --combined kernel/sched/deadline.c
index 39dc9f74f2898f13b56837f8073f49043275a5d2,83a663a34196b9a8288414223c3b7c241459be82..2dc48720f18914a9f828f5cd653b751a045c4556
@@@ -287,7 -287,7 +287,7 @@@ static void task_non_contending(struct 
  
        dl_se->dl_non_contending = 1;
        get_task_struct(p);
-       hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+       hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
  }
  
  static void task_contending(struct sched_dl_entity *dl_se, int flags)
@@@ -529,7 -529,6 +529,7 @@@ static struct rq *find_lock_later_rq(st
  static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
  {
        struct rq *later_rq = NULL;
 +      struct dl_bw *dl_b;
  
        later_rq = find_lock_later_rq(p, rq);
        if (!later_rq) {
                double_lock_balance(rq, later_rq);
        }
  
 +      if (p->dl.dl_non_contending || p->dl.dl_throttled) {
 +              /*
 +               * Inactive timer is armed (or callback is running, but
 +               * waiting for us to release rq locks). In any case, when it
 +               * will fire (or continue), it will see running_bw of this
 +               * task migrated to later_rq (and correctly handle it).
 +               */
 +              sub_running_bw(&p->dl, &rq->dl);
 +              sub_rq_bw(&p->dl, &rq->dl);
 +
 +              add_rq_bw(&p->dl, &later_rq->dl);
 +              add_running_bw(&p->dl, &later_rq->dl);
 +      } else {
 +              sub_rq_bw(&p->dl, &rq->dl);
 +              add_rq_bw(&p->dl, &later_rq->dl);
 +      }
 +
 +      /*
 +       * And we finally need to fixup root_domain(s) bandwidth accounting,
 +       * since p is still hanging out in the old (now moved to default) root
 +       * domain.
 +       */
 +      dl_b = &rq->rd->dl_bw;
 +      raw_spin_lock(&dl_b->lock);
 +      __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
 +      raw_spin_unlock(&dl_b->lock);
 +
 +      dl_b = &later_rq->rd->dl_bw;
 +      raw_spin_lock(&dl_b->lock);
 +      __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
 +      raw_spin_unlock(&dl_b->lock);
 +
        set_task_cpu(p, later_rq->cpu);
        double_unlock_balance(later_rq, rq);
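
When a deadline task is migrated off a dying CPU, its admitted bandwidth has to be removed from the old root domain's dl_bw and added to the new one, each side under its own lock. That bookkeeping, boiled down to two locked accumulators with invented names rather than the kernel structures:

#include <pthread.h>
#include <stdio.h>

struct dl_pool {
	pthread_mutex_t lock;
	long long total_bw;		/* sum of admitted DL bandwidth */
};

static void pool_sub(struct dl_pool *p, long long bw)
{
	pthread_mutex_lock(&p->lock);
	p->total_bw -= bw;
	pthread_mutex_unlock(&p->lock);
}

static void pool_add(struct dl_pool *p, long long bw)
{
	pthread_mutex_lock(&p->lock);
	p->total_bw += bw;
	pthread_mutex_unlock(&p->lock);
}

/* Move one task's bandwidth from the old domain to the new one. */
static void migrate_bw(struct dl_pool *from, struct dl_pool *to, long long bw)
{
	pool_sub(from, bw);
	pool_add(to, bw);
}

int main(void)
{
	struct dl_pool old_rd = { PTHREAD_MUTEX_INITIALIZER, 300 };
	struct dl_pool new_rd = { PTHREAD_MUTEX_INITIALIZER, 100 };

	migrate_bw(&old_rd, &new_rd, 50);
	printf("old=%lld new=%lld\n", old_rd.total_bw, new_rd.total_bw);
	return 0;
}
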
  
@@@ -956,7 -923,7 +956,7 @@@ static int start_dl_timer(struct task_s
         */
        if (!hrtimer_is_queued(timer)) {
                get_task_struct(p);
-               hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+               hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
        }
  
        return 1;
@@@ -1086,7 -1053,7 +1086,7 @@@ void init_dl_task_timer(struct sched_dl
  {
        struct hrtimer *timer = &dl_se->dl_timer;
  
-       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        timer->function = dl_task_timer;
  }
  
@@@ -1325,7 -1292,7 +1325,7 @@@ void init_dl_inactive_task_timer(struc
  {
        struct hrtimer *timer = &dl_se->inactive_timer;
  
-       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        timer->function = inactive_task_timer;
  }
  
@@@ -1727,20 -1694,12 +1727,20 @@@ static void start_hrtick_dl(struct rq *
  }
  #endif
  
 -static inline void set_next_task(struct rq *rq, struct task_struct *p)
 +static void set_next_task_dl(struct rq *rq, struct task_struct *p)
  {
        p->se.exec_start = rq_clock_task(rq);
  
        /* You can't push away the running task */
        dequeue_pushable_dl_task(rq, p);
 +
 +      if (hrtick_enabled(rq))
 +              start_hrtick_dl(rq, p);
 +
 +      if (rq->curr->sched_class != &dl_sched_class)
 +              update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 +
 +      deadline_queue_push_tasks(rq);
  }
  
  static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@@ -1761,42 -1720,64 +1761,42 @@@ pick_next_task_dl(struct rq *rq, struc
        struct task_struct *p;
        struct dl_rq *dl_rq;
  
 -      dl_rq = &rq->dl;
 -
 -      if (need_pull_dl_task(rq, prev)) {
 -              /*
 -               * This is OK, because current is on_cpu, which avoids it being
 -               * picked for load-balance and preemption/IRQs are still
 -               * disabled avoiding further scheduler activity on it and we're
 -               * being very careful to re-start the picking loop.
 -               */
 -              rq_unpin_lock(rq, rf);
 -              pull_dl_task(rq);
 -              rq_repin_lock(rq, rf);
 -              /*
 -               * pull_dl_task() can drop (and re-acquire) rq->lock; this
 -               * means a stop task can slip in, in which case we need to
 -               * re-start task selection.
 -               */
 -              if (rq->stop && task_on_rq_queued(rq->stop))
 -                      return RETRY_TASK;
 -      }
 +      WARN_ON_ONCE(prev || rf);
  
 -      /*
 -       * When prev is DL, we may throttle it in put_prev_task().
 -       * So, we update time before we check for dl_nr_running.
 -       */
 -      if (prev->sched_class == &dl_sched_class)
 -              update_curr_dl(rq);
 +      dl_rq = &rq->dl;
  
        if (unlikely(!dl_rq->dl_nr_running))
                return NULL;
  
 -      put_prev_task(rq, prev);
 -
        dl_se = pick_next_dl_entity(rq, dl_rq);
        BUG_ON(!dl_se);
  
        p = dl_task_of(dl_se);
  
 -      set_next_task(rq, p);
 -
 -      if (hrtick_enabled(rq))
 -              start_hrtick_dl(rq, p);
 -
 -      deadline_queue_push_tasks(rq);
 -
 -      if (rq->curr->sched_class != &dl_sched_class)
 -              update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 +      set_next_task_dl(rq, p);
  
        return p;
  }
  
 -static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  {
        update_curr_dl(rq);
  
        update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
        if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
                enqueue_pushable_dl_task(rq, p);
 +
 +      if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
 +              /*
 +               * This is OK, because current is on_cpu, which avoids it being
 +               * picked for load-balance and preemption/IRQs are still
 +               * disabled avoiding further scheduler activity on it and we've
 +               * not yet started the picking loop.
 +               */
 +              rq_unpin_lock(rq, rf);
 +              pull_dl_task(rq);
 +              rq_repin_lock(rq, rf);
 +      }
  }
  
  /*
@@@ -1830,6 -1811,11 +1830,6 @@@ static void task_fork_dl(struct task_st
         */
  }
  
 -static void set_curr_task_dl(struct rq *rq)
 -{
 -      set_next_task(rq, rq->curr);
 -}
 -
  #ifdef CONFIG_SMP
  
  /* Only try algorithms three times */
@@@ -2102,13 -2088,17 +2102,13 @@@ retry
        }
  
        deactivate_task(rq, next_task, 0);
 -      sub_running_bw(&next_task->dl, &rq->dl);
 -      sub_rq_bw(&next_task->dl, &rq->dl);
        set_task_cpu(next_task, later_rq->cpu);
 -      add_rq_bw(&next_task->dl, &later_rq->dl);
  
        /*
         * Update the later_rq clock here, because the clock is used
         * by the cpufreq_update_util() inside __add_running_bw().
         */
        update_rq_clock(later_rq);
 -      add_running_bw(&next_task->dl, &later_rq->dl);
        activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
        ret = 1;
  
@@@ -2196,7 -2186,11 +2196,7 @@@ static void pull_dl_task(struct rq *thi
                        resched = true;
  
                        deactivate_task(src_rq, p, 0);
 -                      sub_running_bw(&p->dl, &src_rq->dl);
 -                      sub_rq_bw(&p->dl, &src_rq->dl);
                        set_task_cpu(p, this_cpu);
 -                      add_rq_bw(&p->dl, &this_rq->dl);
 -                      add_running_bw(&p->dl, &this_rq->dl);
                        activate_task(this_rq, p, 0);
                        dmin = p->dl.deadline;
  
@@@ -2289,36 -2283,6 +2289,36 @@@ void __init init_sched_dl_class(void
                                        GFP_KERNEL, cpu_to_node(i));
  }
  
 +void dl_add_task_root_domain(struct task_struct *p)
 +{
 +      struct rq_flags rf;
 +      struct rq *rq;
 +      struct dl_bw *dl_b;
 +
 +      rq = task_rq_lock(p, &rf);
 +      if (!dl_task(p))
 +              goto unlock;
 +
 +      dl_b = &rq->rd->dl_bw;
 +      raw_spin_lock(&dl_b->lock);
 +
 +      __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
 +
 +      raw_spin_unlock(&dl_b->lock);
 +
 +unlock:
 +      task_rq_unlock(rq, p, &rf);
 +}
 +
 +void dl_clear_root_domain(struct root_domain *rd)
 +{
 +      unsigned long flags;
 +
 +      raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
 +      rd->dl_bw.total_bw = 0;
 +      raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
 +}
 +
  #endif /* CONFIG_SMP */
  
  static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@@ -2439,7 -2403,6 +2439,7 @@@ const struct sched_class dl_sched_clas
  
        .pick_next_task         = pick_next_task_dl,
        .put_prev_task          = put_prev_task_dl,
 +      .set_next_task          = set_next_task_dl,
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_dl,
        .task_woken             = task_woken_dl,
  #endif
  
 -      .set_curr_task          = set_curr_task_dl,
        .task_tick              = task_tick_dl,
        .task_fork              = task_fork_dl,
  
diff --combined kernel/sched/rt.c
index 858c4cc6f99bccd888b4388c87c217052b33560a,d6678f773c966fe28bd4de1ba166fc091e21e3aa..ebaa4e619684112cc6c19bc6ba69fe15c3f2b52a
@@@ -45,8 -45,8 +45,8 @@@ void init_rt_bandwidth(struct rt_bandwi
  
        raw_spin_lock_init(&rt_b->rt_runtime_lock);
  
-       hrtimer_init(&rt_b->rt_period_timer,
-                       CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_REL_HARD);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
  }
  
@@@ -67,7 -67,8 +67,8 @@@ static void start_rt_bandwidth(struct r
                 * to update the period.
                 */
                hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
-               hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+               hrtimer_start_expires(&rt_b->rt_period_timer,
+                                     HRTIMER_MODE_ABS_PINNED_HARD);
        }
        raw_spin_unlock(&rt_b->rt_runtime_lock);
  }
@@@ -1498,22 -1499,12 +1499,22 @@@ static void check_preempt_curr_rt(struc
  #endif
  }
  
 -static inline void set_next_task(struct rq *rq, struct task_struct *p)
 +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
  {
        p->se.exec_start = rq_clock_task(rq);
  
        /* The running task is never eligible for pushing */
        dequeue_pushable_task(rq, p);
 +
 +      /*
 +       * If prev task was rt, put_prev_task() has already updated the
 +       * utilization. We only care of the case where we start to schedule a
 +       * rt task
 +       */
 +      if (rq->curr->sched_class != &rt_sched_class)
 +              update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 +
 +      rt_queue_push_tasks(rq);
  }
  
  static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@@ -1553,19 -1544,56 +1554,19 @@@ pick_next_task_rt(struct rq *rq, struc
        struct task_struct *p;
        struct rt_rq *rt_rq = &rq->rt;
  
 -      if (need_pull_rt_task(rq, prev)) {
 -              /*
 -               * This is OK, because current is on_cpu, which avoids it being
 -               * picked for load-balance and preemption/IRQs are still
 -               * disabled avoiding further scheduler activity on it and we're
 -               * being very careful to re-start the picking loop.
 -               */
 -              rq_unpin_lock(rq, rf);
 -              pull_rt_task(rq);
 -              rq_repin_lock(rq, rf);
 -              /*
 -               * pull_rt_task() can drop (and re-acquire) rq->lock; this
 -               * means a dl or stop task can slip in, in which case we need
 -               * to re-start task selection.
 -               */
 -              if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
 -                           rq->dl.dl_nr_running))
 -                      return RETRY_TASK;
 -      }
 -
 -      /*
 -       * We may dequeue prev's rt_rq in put_prev_task().
 -       * So, we update time before rt_queued check.
 -       */
 -      if (prev->sched_class == &rt_sched_class)
 -              update_curr_rt(rq);
 +      WARN_ON_ONCE(prev || rf);
  
        if (!rt_rq->rt_queued)
                return NULL;
  
 -      put_prev_task(rq, prev);
 -
        p = _pick_next_task_rt(rq);
  
 -      set_next_task(rq, p);
 -
 -      rt_queue_push_tasks(rq);
 -
 -      /*
 -       * If prev task was rt, put_prev_task() has already updated the
 -       * utilization. We only care of the case where we start to schedule a
 -       * rt task
 -       */
 -      if (rq->curr->sched_class != &rt_sched_class)
 -              update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 +      set_next_task_rt(rq, p);
  
        return p;
  }
  
 -static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  {
        update_curr_rt(rq);
  
         */
        if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
 +
 +      if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
 +              /*
 +               * This is OK, because current is on_cpu, which avoids it being
 +               * picked for load-balance and preemption/IRQs are still
 +               * disabled avoiding further scheduler activity on it and we've
 +               * not yet started the picking loop.
 +               */
 +              rq_unpin_lock(rq, rf);
 +              pull_rt_task(rq);
 +              rq_repin_lock(rq, rf);
 +      }
  }
  
  #ifdef CONFIG_SMP
@@@ -2289,8 -2305,10 +2290,10 @@@ static void watchdog(struct rq *rq, str
                }
  
                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
-               if (p->rt.timeout > next)
-                       p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+               if (p->rt.timeout > next) {
+                       posix_cputimers_rt_watchdog(&p->posix_cputimers,
+                                                   p->se.sum_exec_runtime);
+               }
        }
  }
  #else
@@@ -2339,6 -2357,11 +2342,6 @@@ static void task_tick_rt(struct rq *rq
        }
  }
  
 -static void set_curr_task_rt(struct rq *rq)
 -{
 -      set_next_task(rq, rq->curr);
 -}
 -
  static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
  {
        /*
@@@ -2360,7 -2383,6 +2363,7 @@@ const struct sched_class rt_sched_clas
  
        .pick_next_task         = pick_next_task_rt,
        .put_prev_task          = put_prev_task_rt,
 +      .set_next_task          = set_next_task_rt,
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_rt,
        .switched_from          = switched_from_rt,
  #endif
  
 -      .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,
  
        .get_rr_interval        = get_rr_interval_rt,
diff --combined kernel/sys.c
index d605fe5e58a5805410ec74d1e9bffaddff0dd169,2462aa84247fc9afea9b56c25331d84c1953bc1e..a611d1d58c7d00525247edff32211cc9586f2c9e
  #ifndef SET_TSC_CTL
  # define SET_TSC_CTL(a)               (-EINVAL)
  #endif
 -#ifndef MPX_ENABLE_MANAGEMENT
 -# define MPX_ENABLE_MANAGEMENT()      (-EINVAL)
 -#endif
 -#ifndef MPX_DISABLE_MANAGEMENT
 -# define MPX_DISABLE_MANAGEMENT()     (-EINVAL)
 -#endif
  #ifndef GET_FP_MODE
  # define GET_FP_MODE(a)               (-EINVAL)
  #endif
  #ifndef PAC_RESET_KEYS
  # define PAC_RESET_KEYS(a, b) (-EINVAL)
  #endif
 +#ifndef SET_TAGGED_ADDR_CTRL
 +# define SET_TAGGED_ADDR_CTRL(a)      (-EINVAL)
 +#endif
 +#ifndef GET_TAGGED_ADDR_CTRL
 +# define GET_TAGGED_ADDR_CTRL()               (-EINVAL)
 +#endif
  
  /*
   * this is where the system-wide overflow UID and GID are defined, for
@@@ -1557,15 -1557,6 +1557,6 @@@ int do_prlimit(struct task_struct *tsk
                        retval = -EPERM;
                if (!retval)
                        retval = security_task_setrlimit(tsk, resource, new_rlim);
-               if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
-                       /*
-                        * The caller is asking for an immediate RLIMIT_CPU
-                        * expiry.  But we use the zero value to mean "it was
-                        * never set".  So let's cheat and make it one second
-                        * instead
-                        */
-                       new_rlim->rlim_cur = 1;
-               }
        }
        if (!retval) {
                if (old_rlim)
        task_unlock(tsk->group_leader);
  
        /*
-        * RLIMIT_CPU handling.   Note that the kernel fails to return an error
-        * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a
-        * very long-standing error, and fixing it now risks breakage of
-        * applications, so we live with it
+        * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
 +        * infinite. In case of RLIM_INFINITY the posix CPU timer code
+        * ignores the rlimit.
         */
         if (!retval && new_rlim && resource == RLIMIT_CPU &&
             new_rlim->rlim_cur != RLIM_INFINITY &&
@@@ -2456,9 -2446,15 +2446,9 @@@ SYSCALL_DEFINE5(prctl, int, option, uns
                up_write(&me->mm->mmap_sem);
                break;
        case PR_MPX_ENABLE_MANAGEMENT:
 -              if (arg2 || arg3 || arg4 || arg5)
 -                      return -EINVAL;
 -              error = MPX_ENABLE_MANAGEMENT();
 -              break;
        case PR_MPX_DISABLE_MANAGEMENT:
 -              if (arg2 || arg3 || arg4 || arg5)
 -                      return -EINVAL;
 -              error = MPX_DISABLE_MANAGEMENT();
 -              break;
 +              /* No longer implemented: */
 +              return -EINVAL;
        case PR_SET_FP_MODE:
                error = SET_FP_MODE(me, arg2);
                break;
                        return -EINVAL;
                error = PAC_RESET_KEYS(me, arg2);
                break;
 +      case PR_SET_TAGGED_ADDR_CTRL:
 +              if (arg3 || arg4 || arg5)
 +                      return -EINVAL;
 +              error = SET_TAGGED_ADDR_CTRL(arg2);
 +              break;
 +      case PR_GET_TAGGED_ADDR_CTRL:
 +              if (arg2 || arg3 || arg4 || arg5)
 +                      return -EINVAL;
 +              error = GET_TAGGED_ADDR_CTRL();
 +              break;
        default:
                error = -EINVAL;
                break;
diff --combined kernel/time/alarmtimer.c
index b7d75a9e8ccf17c7b616b649e8d76bd5522561b4,ec32876e284daf300f18f32d0d6bf5ab276b2d71..271ce6c12907860bc2db9ae94fab4e70dbedbbfb
@@@ -432,7 -432,7 +432,7 @@@ int alarm_cancel(struct alarm *alarm
                int ret = alarm_try_to_cancel(alarm);
                if (ret >= 0)
                        return ret;
-               cpu_relax();
+               hrtimer_cancel_wait_running(&alarm->timer);
        }
  }
  EXPORT_SYMBOL_GPL(alarm_cancel);
@@@ -605,6 -605,19 +605,19 @@@ static int alarm_timer_try_to_cancel(st
        return alarm_try_to_cancel(&timr->it.alarm.alarmtimer);
  }
  
+ /**
+  * alarm_timer_wait_running - Posix timer callback to wait for a timer
+  * @timr:     Pointer to the posixtimer data struct
+  *
+  * Called from the core code when timer cancel detected that the callback
+  * is running. @timr is unlocked and rcu read lock is held to prevent it
+  * from being freed.
+  */
+ static void alarm_timer_wait_running(struct k_itimer *timr)
+ {
+       hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer);
+ }
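
alarm_cancel() used to spin with cpu_relax() while the timer callback was running; on PREEMPT_RT that can live-lock against a lower-priority softirq thread, so it now blocks in hrtimer_cancel_wait_running() until the callback finishes. The retry shape, modelled independently of any timer API (try_cancel() and wait_until_idle() are placeholders, not kernel functions):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static bool callback_running;		/* set/cleared by the expiry side */

/* Returns true when the timer can be removed, false while its callback runs. */
static bool try_cancel(void)
{
	pthread_mutex_lock(&lock);
	bool ok = !callback_running;
	pthread_mutex_unlock(&lock);
	return ok;
}

/* Block until the expiry side signals completion, instead of spinning. */
static void wait_until_idle(void)
{
	pthread_mutex_lock(&lock);
	while (callback_running)
		pthread_cond_wait(&idle, &lock);
	pthread_mutex_unlock(&lock);
}

static void cancel_sync(void)
{
	while (!try_cancel())
		wait_until_idle();	/* was: cpu_relax() busy loop */
}

int main(void)
{
	cancel_sync();			/* nothing running: returns at once */
	puts("cancelled");
	return 0;
}
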
  /**
   * alarm_timer_arm - Posix timer callback to arm a timer
   * @timr:     Pointer to the posixtimer data struct
@@@ -672,7 -685,7 +685,7 @@@ static int alarm_timer_create(struct k_
        enum  alarmtimer_type type;
  
        if (!alarmtimer_get_rtcdev())
 -              return -ENOTSUPP;
 +              return -EOPNOTSUPP;
  
        if (!capable(CAP_WAKE_ALARM))
                return -EPERM;
@@@ -790,7 -803,7 +803,7 @@@ static int alarm_timer_nsleep(const clo
        int ret = 0;
  
        if (!alarmtimer_get_rtcdev())
 -              return -ENOTSUPP;
 +              return -EOPNOTSUPP;
  
        if (flags & ~TIMER_ABSTIME)
                return -EINVAL;
@@@ -834,6 -847,7 +847,7 @@@ const struct k_clock alarm_clock = 
        .timer_forward          = alarm_timer_forward,
        .timer_remaining        = alarm_timer_remaining,
        .timer_try_to_cancel    = alarm_timer_try_to_cancel,
+       .timer_wait_running     = alarm_timer_wait_running,
        .nsleep                 = alarm_timer_nsleep,
  };
  #endif /* CONFIG_POSIX_TIMERS */