Git Repo - linux.git/commitdiff
Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <[email protected]>
Tue, 17 Sep 2019 19:35:15 +0000 (12:35 -0700)
committer Linus Torvalds <[email protected]>
Tue, 17 Sep 2019 19:35:15 +0000 (12:35 -0700)
Pull core timer updates from Thomas Gleixner:
 "Timers and timekeeping updates:

   - A large overhaul of the posix CPU timer code which is a preparation
     for moving the CPU timer expiry out into task work so it can be
     properly accounted on the task/process.

     An update to the bogus permission checks will come later during the
     merge window, as feedback was not complete before heading off for
     travel.

   - Switch the timerqueue code to use cached rbtrees and get rid of the
     homebrew caching of the leftmost node (a minimal sketch of the
     cached-rbtree pattern follows below).

   - Consolidate hrtimer_init() + hrtimer_init_sleeper() calls into a
     single function (see the sleeper sketch below).

   - Implement the separation of hrtimers which must be forced to expire
     in hard interrupt context even when PREEMPT_RT is enabled, and mark
     the affected timers accordingly (illustrated below).

   - Implement a mechanism for hrtimers and the timer wheel to protect
     RT against priority inversion and livelock issues when a (hr)timer
     which should be canceled is currently executing its callback.
     Instead of spinning indefinitely, the task which tries to cancel the
     timer blocks on a per-CPU base expiry lock which is held and
     released by the (hr)timer expiry code (a conceptual sketch follows
     below).

   - Enable the Hyper-V TSC-page-based sched_clock for Hyper-V guests,
     resulting in faster access to timekeeping functions.

   - Updates to various clocksource/clockevent drivers and their device
     tree bindings.

   - The usual small improvements all over the place"
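
A minimal sketch of the cached-rbtree pattern referenced above, assuming the kernel's rb_root_cached helpers (rb_first_cached(), rb_insert_color_cached()); the example_* names are illustrative and this is not the actual timerqueue code:

#include <linux/rbtree.h>
#include <linux/ktime.h>
#include <linux/types.h>

struct example_timer {
	struct rb_node	node;
	ktime_t		expires;
};

static struct rb_root_cached example_head = RB_ROOT_CACHED;

/* Insert while letting the rbtree itself track the leftmost node. */
static void example_insert(struct example_timer *t)
{
	struct rb_node **link = &example_head.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct example_timer *cur;

		parent = *link;
		cur = rb_entry(parent, struct example_timer, node);
		if (t->expires < cur->expires) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}
	rb_link_node(&t->node, parent, link);
	rb_insert_color_cached(&t->node, &example_head, leftmost);
}

/* O(1) lookup of the earliest expiry, no separate cache field needed. */
static struct example_timer *example_earliest(void)
{
	struct rb_node *first = rb_first_cached(&example_head);

	return first ? rb_entry(first, struct example_timer, node) : NULL;
}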
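
A sketch of the consolidated sleeper setup, mirroring the blk-mq.c and wait.h conversions further down; example_sleep_until() is a made-up name and the retry/interruption handling real callers need is omitted:

#include <linux/hrtimer.h>
#include <linux/sched.h>

static void example_sleep_until(ktime_t expires)
{
	struct hrtimer_sleeper hs;

	/*
	 * One call now replaces the former pair:
	 *   hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *   hrtimer_init_sleeper(&hs, current);
	 */
	hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_set_expires(&hs.timer, expires);

	set_current_state(TASK_UNINTERRUPTIBLE);
	hrtimer_sleeper_start_expires(&hs, HRTIMER_MODE_REL);
	if (hs.task)
		schedule();
	hrtimer_cancel(&hs.timer);
	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&hs.timer);
}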
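
Marking a timer for hard interrupt expiry only requires the new *_HARD mode variants at init and start time, as the KVM lapic, perf and scheduler hunks below do; the example_hard_* names and the 10ms period are placeholders:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_hard_timer;

static enum hrtimer_restart example_hard_fn(struct hrtimer *timer)
{
	/* Runs in hard interrupt context even on PREEMPT_RT. */
	return HRTIMER_NORESTART;
}

static void example_hard_setup(void)
{
	hrtimer_init(&example_hard_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	example_hard_timer.function = example_hard_fn;
	hrtimer_start(&example_hard_timer, ms_to_ktime(10), HRTIMER_MODE_REL_HARD);
}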
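
A conceptual sketch of the expiry-lock idea described above, using made-up names, userspace locking and hand-waved memory ordering (this is not the kernel implementation). The expiry path holds a per-base lock while callbacks run; a canceler that finds its callback executing blocks on that lock instead of spinning:

#include <pthread.h>
#include <stdbool.h>

struct example_timer;

struct expiry_base {
	pthread_mutex_t		expiry_lock;	/* held while callbacks run */
	struct example_timer	*running;	/* callback currently executing */
};

struct example_timer {
	struct expiry_base	*base;
	bool			queued;
	void			(*fn)(struct example_timer *t);
};

/* Expiry side: run an expired timer's callback under the expiry lock. */
static void expire_one(struct expiry_base *b, struct example_timer *t)
{
	pthread_mutex_lock(&b->expiry_lock);
	b->running = t;
	t->fn(t);
	b->running = NULL;
	pthread_mutex_unlock(&b->expiry_lock);
}

/*
 * Cancel side: dequeue the timer and, if its callback is currently
 * running, block on the expiry lock until the callback has finished
 * instead of busy-waiting, which is what causes priority inversion
 * and livelocks on RT.
 */
static void cancel_sync(struct example_timer *t)
{
	struct expiry_base *b = t->base;

	t->queued = false;
	while (b->running == t) {
		pthread_mutex_lock(&b->expiry_lock);
		pthread_mutex_unlock(&b->expiry_lock);
	}
}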

* 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (101 commits)
  posix-cpu-timers: Fix permission check regression
  posix-cpu-timers: Always clear head pointer on dequeue
  hrtimer: Add a missing bracket and hide `migration_base' on !SMP
  posix-cpu-timers: Make expiry_active check actually work correctly
  posix-timers: Unbreak CONFIG_POSIX_TIMERS=n build
  tick: Mark sched_timer to expire in hard interrupt context
  hrtimer: Add kernel doc annotation for HRTIMER_MODE_HARD
  x86/hyperv: Hide pv_ops access for CONFIG_PARAVIRT=n
  posix-cpu-timers: Utilize timerqueue for storage
  posix-cpu-timers: Move state tracking to struct posix_cputimers
  posix-cpu-timers: Deduplicate rlimit handling
  posix-cpu-timers: Remove pointless comparisons
  posix-cpu-timers: Get rid of 64bit divisions
  posix-cpu-timers: Consolidate timer expiry further
  posix-cpu-timers: Get rid of zero checks
  rlimit: Rewrite non-sensical RLIMIT_CPU comment
  posix-cpu-timers: Respect INFINITY for hard RTTIME limit
  posix-cpu-timers: Switch thread group sampling to array
  posix-cpu-timers: Restructure expiry array
  posix-cpu-timers: Remove cputime_expires
  ...

16 files changed:
arch/arm64/boot/dts/freescale/imx8mm.dtsi
arch/arm64/boot/dts/freescale/imx8mq.dtsi
arch/x86/hyperv/hv_init.c
arch/x86/include/asm/vdso/gettimeofday.h
arch/x86/kvm/lapic.c
block/blk-mq.c
include/linux/sched.h
include/linux/wait.h
init/init_task.c
kernel/events/core.c
kernel/fork.c
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/rt.c
kernel/sys.c
kernel/time/alarmtimer.c

diff --combined arch/arm64/boot/dts/freescale/imx8mm.dtsi
index 984ea7b3fd9f171e606cd9a51646cf675808e677,89ef22a8f81e04091766b5fdc6ff650cc6c1ebdd..5f9d0da196e13c695ccc340a2f5ab5bc96985f67
                #address-cells = <1>;
                #size-cells = <0>;
  
 +              idle-states {
 +                      entry-method = "psci";
 +
 +                      cpu_pd_wait: cpu-pd-wait {
 +                              compatible = "arm,idle-state";
 +                              arm,psci-suspend-param = <0x0010033>;
 +                              local-timer-stop;
 +                              entry-latency-us = <1000>;
 +                              exit-latency-us = <700>;
 +                              min-residency-us = <2700>;
 +                      };
 +              };
 +
                A53_0: cpu@0 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a53";
@@@ -68,7 -55,6 +68,7 @@@
                        operating-points-v2 = <&a53_opp_table>;
                        nvmem-cells = <&cpu_speed_grade>;
                        nvmem-cell-names = "speed_grade";
 +                      cpu-idle-states = <&cpu_pd_wait>;
                };
  
                A53_1: cpu@1 {
@@@ -80,7 -66,6 +80,7 @@@
                        enable-method = "psci";
                        next-level-cache = <&A53_L2>;
                        operating-points-v2 = <&a53_opp_table>;
 +                      cpu-idle-states = <&cpu_pd_wait>;
                };
  
                A53_2: cpu@2 {
@@@ -92,7 -77,6 +92,7 @@@
                        enable-method = "psci";
                        next-level-cache = <&A53_L2>;
                        operating-points-v2 = <&a53_opp_table>;
 +                      cpu-idle-states = <&cpu_pd_wait>;
                };
  
                A53_3: cpu@3 {
                        enable-method = "psci";
                        next-level-cache = <&A53_L2>;
                        operating-points-v2 = <&a53_opp_table>;
 +                      cpu-idle-states = <&cpu_pd_wait>;
                };
  
                A53_L2: l2-cache0 {
                        opp-microvolt = <850000>;
                        opp-supported-hw = <0xe>, <0x7>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1600000000 {
                        opp-microvolt = <900000>;
                        opp-supported-hw = <0xc>, <0x7>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1800000000 {
                        opp-hz = /bits/ 64 <1800000000>;
                        opp-microvolt = <1000000>;
 -                      /* Consumer only but rely on speed grading */
 -                      opp-supported-hw = <0x8>, <0x7>;
 +                      opp-supported-hw = <0x8>, <0x3>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
        };
  
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 10 30>;
                        };
  
                        gpio2: gpio@30210000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 40 21>;
                        };
  
                        gpio3: gpio@30220000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 61 26>;
                        };
  
                        gpio4: gpio@30230000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 87 32>;
                        };
  
                        gpio5: gpio@30240000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 119 30>;
                        };
  
                        wdog1: watchdog@30280000 {
                                         <&clk_ext3>, <&clk_ext4>;
                                clock-names = "osc_32k", "osc_24m", "clk_ext1", "clk_ext2",
                                              "clk_ext3", "clk_ext4";
 +                              assigned-clocks = <&clk IMX8MM_CLK_NOC>,
 +                                              <&clk IMX8MM_CLK_AUDIO_AHB>,
 +                                              <&clk IMX8MM_CLK_IPG_AUDIO_ROOT>,
 +                                              <&clk IMX8MM_SYS_PLL3>,
 +                                              <&clk IMX8MM_VIDEO_PLL1>;
 +                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL3_OUT>,
 +                                                       <&clk IMX8MM_SYS_PLL1_800M>;
 +                              assigned-clock-rates = <0>,
 +                                                      <400000000>,
 +                                                      <400000000>,
 +                                                      <750000000>,
 +                                                      <594000000>;
                        };
  
                        src: reset-controller@30390000 {
 -                              compatible = "fsl,imx8mm-src", "syscon";
 +                              compatible = "fsl,imx8mm-src", "fsl,imx8mq-src", "syscon";
                                reg = <0x30390000 0x10000>;
                                interrupts = <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>;
                                #reset-cells = <1>;
                                #pwm-cells = <2>;
                                status = "disabled";
                        };
+                       system_counter: timer@306a0000 {
+                               compatible = "nxp,sysctr-timer";
+                               reg = <0x306a0000 0x20000>;
+                               interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&osc_24m>;
+                               clock-names = "per";
+                       };
                };
  
                aips3: bus@30800000 {
                                interrupts = <GIC_SPI 40 IRQ_TYPE_LEVEL_HIGH>;
                                clocks = <&clk IMX8MM_CLK_USB1_CTRL_ROOT>;
                                clock-names = "usb1_ctrl_root_clk";
 -                              assigned-clocks = <&clk IMX8MM_CLK_USB_BUS>,
 -                                                <&clk IMX8MM_CLK_USB_CORE_REF>;
 -                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>,
 -                                                       <&clk IMX8MM_SYS_PLL1_100M>;
 +                              assigned-clocks = <&clk IMX8MM_CLK_USB_BUS>;
 +                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>;
                                fsl,usbphy = <&usbphynop1>;
                                fsl,usbmisc = <&usbmisc1 0>;
                                status = "disabled";
                                interrupts = <GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>;
                                clocks = <&clk IMX8MM_CLK_USB1_CTRL_ROOT>;
                                clock-names = "usb1_ctrl_root_clk";
 -                              assigned-clocks = <&clk IMX8MM_CLK_USB_BUS>,
 -                                                <&clk IMX8MM_CLK_USB_CORE_REF>;
 -                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>,
 -                                                       <&clk IMX8MM_SYS_PLL1_100M>;
 +                              assigned-clocks = <&clk IMX8MM_CLK_USB_BUS>;
 +                              assigned-clock-parents = <&clk IMX8MM_SYS_PLL2_500M>;
                                fsl,usbphy = <&usbphynop2>;
                                fsl,usbmisc = <&usbmisc2 0>;
                                status = "disabled";
                        interrupt-controller;
                        interrupts = <GIC_PPI 9 IRQ_TYPE_LEVEL_HIGH>;
                };
 +
 +              ddr-pmu@3d800000 {
 +                      compatible = "fsl,imx8mm-ddr-pmu", "fsl,imx8m-ddr-pmu";
 +                      reg = <0x3d800000 0x400000>;
 +                      interrupt-parent = <&gic>;
 +                      interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
 +              };
        };
  };
diff --combined arch/arm64/boot/dts/freescale/imx8mq.dtsi
index 046a0c8c8dd5607adf835c055b3415efe8361d2f,b4529773af51836107da4ff69d57c54b1850c741..3f3594d9485cec30c34730255595c76812621f3f
                        /* Industrial only */
                        opp-supported-hw = <0xf>, <0x4>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1000000000 {
                        /* Consumer only */
                        opp-supported-hw = <0xe>, <0x3>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1300000000 {
                        opp-hz = /bits/ 64 <1300000000>;
                        opp-microvolt = <1000000>;
 -                      opp-supported-hw = <0xc>, <0x7>;
 +                      opp-supported-hw = <0xc>, <0x4>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
  
                opp-1500000000 {
                        opp-hz = /bits/ 64 <1500000000>;
                        opp-microvolt = <1000000>;
 -                      /* Consumer only but rely on speed grading */
 -                      opp-supported-hw = <0x8>, <0x7>;
 +                      opp-supported-hw = <0x8>, <0x3>;
                        clock-latency-ns = <150000>;
 +                      opp-suspend;
                };
        };
  
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 10 30>;
                        };
  
                        gpio2: gpio@30210000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 40 21>;
                        };
  
                        gpio3: gpio@30220000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 61 26>;
                        };
  
                        gpio4: gpio@30230000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 87 32>;
                        };
  
                        gpio5: gpio@30240000 {
                                #gpio-cells = <2>;
                                interrupt-controller;
                                #interrupt-cells = <2>;
 +                              gpio-ranges = <&iomuxc 0 119 30>;
                        };
  
                        tmu: tmu@30260000 {
                                compatible = "fsl,imx8mq-tmu";
                                reg = <0x30260000 0x10000>;
                                interrupt = <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>;
 +                              clocks = <&clk IMX8MQ_CLK_TMU_ROOT>;
                                little-endian;
                                fsl,tmu-range = <0xb0000 0xa0026 0x80048 0x70061>;
                                fsl,tmu-calibration = <0x00000000 0x00000023
                        };
  
                        iomuxc_gpr: syscon@30340000 {
 -                              compatible = "fsl,imx8mq-iomuxc-gpr", "fsl,imx6q-iomuxc-gpr", "syscon";
 +                              compatible = "fsl,imx8mq-iomuxc-gpr", "fsl,imx6q-iomuxc-gpr",
 +                                           "syscon", "simple-mfd";
                                reg = <0x30340000 0x10000>;
 +
 +                              mux: mux-controller {
 +                                      compatible = "mmio-mux";
 +                                      #mux-control-cells = <1>;
 +                                      mux-reg-masks = <0x34 0x00000004>; /* MIPI_MUX_SEL */
 +                              };
                        };
  
                        ocotp: ocotp-ctrl@30350000 {
                                #pwm-cells = <2>;
                                status = "disabled";
                        };
+                       system_counter: timer@306a0000 {
+                               compatible = "nxp,sysctr-timer";
+                               reg = <0x306a0000 0x20000>;
+                               interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>;
+                               clocks = <&osc_25m>;
+                               clock-names = "per";
+                       };
                };
  
                bus@30800000 { /* AIPS3 */
  
                        sai2: sai@308b0000 {
                                #sound-dai-cells = <0>;
 -                              compatible = "fsl,imx8mq-sai",
 -                                           "fsl,imx6sx-sai";
 +                              compatible = "fsl,imx8mq-sai";
                                reg = <0x308b0000 0x10000>;
                                interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>;
                                clocks = <&clk IMX8MQ_CLK_SAI2_IPG>,
                                status = "disabled";
                        };
  
 +                      dphy: dphy@30a00300 {
 +                              compatible = "fsl,imx8mq-mipi-dphy";
 +                              reg = <0x30a00300 0x100>;
 +                              clocks = <&clk IMX8MQ_CLK_DSI_PHY_REF>;
 +                              clock-names = "phy_ref";
 +                              assigned-clocks = <&clk IMX8MQ_CLK_DSI_PHY_REF>;
 +                              assigned-clock-parents = <&clk IMX8MQ_VIDEO_PLL1_OUT>;
 +                              assigned-clock-rates = <24000000>;
 +                              #phy-cells = <0>;
 +                              power-domains = <&pgc_mipi>;
 +                              status = "disabled";
 +                      };
 +
                        i2c1: i2c@30a20000 {
                                compatible = "fsl,imx8mq-i2c", "fsl,imx21-i2c";
                                reg = <0x30a20000 0x10000>;
                usb_dwc3_0: usb@38100000 {
                        compatible = "fsl,imx8mq-dwc3", "snps,dwc3";
                        reg = <0x38100000 0x10000>;
 -                      clocks = <&clk IMX8MQ_CLK_USB_BUS>,
 +                      clocks = <&clk IMX8MQ_CLK_USB1_CTRL_ROOT>,
                                 <&clk IMX8MQ_CLK_USB_CORE_REF>,
 -                               <&clk IMX8MQ_CLK_USB1_CTRL_ROOT>;
 +                               <&clk IMX8MQ_CLK_32K>;
                        clock-names = "bus_early", "ref", "suspend";
                        assigned-clocks = <&clk IMX8MQ_CLK_USB_BUS>,
                                          <&clk IMX8MQ_CLK_USB_CORE_REF>;
                usb_dwc3_1: usb@38200000 {
                        compatible = "fsl,imx8mq-dwc3", "snps,dwc3";
                        reg = <0x38200000 0x10000>;
 -                      clocks = <&clk IMX8MQ_CLK_USB_BUS>,
 +                      clocks = <&clk IMX8MQ_CLK_USB2_CTRL_ROOT>,
                                 <&clk IMX8MQ_CLK_USB_CORE_REF>,
 -                               <&clk IMX8MQ_CLK_USB2_CTRL_ROOT>;
 +                               <&clk IMX8MQ_CLK_32K>;
                        clock-names = "bus_early", "ref", "suspend";
                        assigned-clocks = <&clk IMX8MQ_CLK_USB_BUS>,
                                          <&clk IMX8MQ_CLK_USB_CORE_REF>;
                        interrupts = <GIC_PPI 9 IRQ_TYPE_LEVEL_HIGH>;
                        interrupt-parent = <&gic>;
                };
 +
 +              ddr-pmu@3d800000 {
 +                      compatible = "fsl,imx8mq-ddr-pmu", "fsl,imx8m-ddr-pmu";
 +                      reg = <0x3d800000 0x400000>;
 +                      interrupt-parent = <&gic>;
 +                      interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>;
 +              };
        };
  };
diff --combined arch/x86/hyperv/hv_init.c
index d314cf1e15fd607cb18becf5c19910533f89f4ed,866dfb3dca4893349fc474d36a6624c92a750f78..2db3972c0e0ff047621cb6ab173e34ec914e257b
@@@ -37,20 -37,6 +37,20 @@@ EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg
  u32 hv_max_vp_index;
  EXPORT_SYMBOL_GPL(hv_max_vp_index);
  
 +void *hv_alloc_hyperv_page(void)
 +{
 +      BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
 +
 +      return (void *)__get_free_page(GFP_KERNEL);
 +}
 +EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
 +
 +void hv_free_hyperv_page(unsigned long addr)
 +{
 +      free_page(addr);
 +}
 +EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
 +
  static int hv_cpu_init(unsigned int cpu)
  {
        u64 msr_vp_index;
@@@ -315,8 -301,6 +315,6 @@@ void __init hyperv_init(void
  
        x86_init.pci.arch_init = hv_pci_init;
  
-       /* Register Hyper-V specific clocksource */
-       hv_init_clocksource();
        return;
  
  remove_cpuhp_state:
diff --combined arch/x86/include/asm/vdso/gettimeofday.h
index ba71a63cdac479d6428159ffbb0e24ba6c5a40f3,bcbf901befbe07dcc3867ab05a612b9410103f26..e9ee139cf29e05ab861d614209601b13d281cf4c
@@@ -51,7 -51,7 +51,7 @@@ extern struct pvclock_vsyscall_time_inf
        __attribute__((visibility("hidden")));
  #endif
  
- #ifdef CONFIG_HYPERV_TSCPAGE
+ #ifdef CONFIG_HYPERV_TIMER
  extern struct ms_hyperv_tsc_page hvclock_page
        __attribute__((visibility("hidden")));
  #endif
@@@ -96,8 -96,6 +96,8 @@@ long clock_getres_fallback(clockid_t _c
  
  #else
  
 +#define VDSO_HAS_32BIT_FALLBACK       1
 +
  static __always_inline
  long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
  {
        return ret;
  }
  
 +static __always_inline
 +long clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
 +{
 +      long ret;
 +
 +      asm (
 +              "mov %%ebx, %%edx \n"
 +              "mov %[clock], %%ebx \n"
 +              "call __kernel_vsyscall \n"
 +              "mov %%edx, %%ebx \n"
 +              : "=a" (ret), "=m" (*_ts)
 +              : "0" (__NR_clock_gettime), [clock] "g" (_clkid), "c" (_ts)
 +              : "edx");
 +
 +      return ret;
 +}
 +
  static __always_inline
  long gettimeofday_fallback(struct __kernel_old_timeval *_tv,
                           struct timezone *_tz)
@@@ -167,23 -148,6 +167,23 @@@ clock_getres_fallback(clockid_t _clkid
        return ret;
  }
  
 +static __always_inline
 +long clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
 +{
 +      long ret;
 +
 +      asm (
 +              "mov %%ebx, %%edx \n"
 +              "mov %[clock], %%ebx \n"
 +              "call __kernel_vsyscall \n"
 +              "mov %%edx, %%ebx \n"
 +              : "=a" (ret), "=m" (*_ts)
 +              : "0" (__NR_clock_getres), [clock] "g" (_clkid), "c" (_ts)
 +              : "edx");
 +
 +      return ret;
 +}
 +
  #endif
  
  #ifdef CONFIG_PARAVIRT_CLOCK
@@@ -228,7 -192,7 +228,7 @@@ static u64 vread_pvclock(void
  }
  #endif
  
- #ifdef CONFIG_HYPERV_TSCPAGE
+ #ifdef CONFIG_HYPERV_TIMER
  static u64 vread_hvclock(void)
  {
        return hv_read_tsc_page(&hvclock_page);
@@@ -251,7 -215,7 +251,7 @@@ static inline u64 __arch_get_hw_counter
                return vread_pvclock();
        }
  #endif
- #ifdef CONFIG_HYPERV_TSCPAGE
+ #ifdef CONFIG_HYPERV_TIMER
        if (clock_mode == VCLOCK_HVCLOCK) {
                barrier();
                return vread_hvclock();
diff --combined arch/x86/kvm/lapic.c
index e904ff06a83d84c9ab5ccd08bdfcc14b1cc3768a,b9e516099d07e30ec64622902625fbd6d40cedd9..2a4f278f3b56842ecdedc1573f5f9cb796fb56fc
@@@ -216,9 -216,6 +216,9 @@@ static void recalculate_apic_map(struc
                if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
                        new->phys_map[xapic_id] = apic;
  
 +              if (!kvm_apic_sw_enabled(apic))
 +                      continue;
 +
                ldr = kvm_lapic_get_reg(apic, APIC_LDR);
  
                if (apic_x2apic_mode(apic)) {
@@@ -261,8 -258,6 +261,8 @@@ static inline void apic_set_spiv(struc
                        static_key_slow_dec_deferred(&apic_sw_disabled);
                else
                        static_key_slow_inc(&apic_sw_disabled.key);
 +
 +              recalculate_apic_map(apic->vcpu->kvm);
        }
  }
  
@@@ -1553,6 -1548,7 +1553,6 @@@ static void kvm_apic_inject_pending_tim
  static void apic_timer_expired(struct kvm_lapic *apic)
  {
        struct kvm_vcpu *vcpu = apic->vcpu;
 -      struct swait_queue_head *q = &vcpu->wq;
        struct kvm_timer *ktimer = &apic->lapic_timer;
  
        if (atomic_read(&apic->lapic_timer.pending))
  
        atomic_inc(&apic->lapic_timer.pending);
        kvm_set_pending_timer(vcpu);
 -
 -      /*
 -       * For x86, the atomic_inc() is serialized, thus
 -       * using swait_active() is safe.
 -       */
 -      if (swait_active(q))
 -              swake_up_one(q);
  }
  
  static void start_sw_tscdeadline(struct kvm_lapic *apic)
            likely(ns > apic->lapic_timer.timer_advance_ns)) {
                expire = ktime_add_ns(now, ns);
                expire = ktime_sub_ns(expire, ktimer->timer_advance_ns);
-               hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS);
+               hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD);
        } else
                apic_timer_expired(apic);
  
@@@ -2299,7 -2302,7 +2299,7 @@@ int kvm_create_lapic(struct kvm_vcpu *v
        apic->vcpu = vcpu;
  
        hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
-                    HRTIMER_MODE_ABS);
+                    HRTIMER_MODE_ABS_HARD);
        apic->lapic_timer.timer.function = apic_timer_fn;
        if (timer_advance_ns == -1) {
                apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
@@@ -2484,7 -2487,7 +2484,7 @@@ void __kvm_migrate_apic_timer(struct kv
  
        timer = &vcpu->arch.apic->lapic_timer.timer;
        if (hrtimer_cancel(timer))
-               hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
  }
  
  /*
diff --combined block/blk-mq.c
index 0835f4d8d42e7e34c043acc040d8f5406b2e2268,f567146f9ed7e680b7351e009d85f1da71a5c3a9..e0b849bfe74d8d03dac595b145b847e6d73a8726
@@@ -669,6 -669,8 +669,6 @@@ void blk_mq_start_request(struct reques
  {
        struct request_queue *q = rq->q;
  
 -      blk_mq_sched_started_request(rq);
 -
        trace_block_rq_issue(q, rq);
  
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
@@@ -2662,6 -2664,8 +2662,6 @@@ void blk_mq_release(struct request_queu
        struct blk_mq_hw_ctx *hctx, *next;
        int i;
  
 -      cancel_delayed_work_sync(&q->requeue_work);
 -
        queue_for_each_hw_ctx(q, hctx, i)
                WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
  
@@@ -3411,15 -3415,14 +3411,14 @@@ static bool blk_mq_poll_hybrid_sleep(st
        kt = nsecs;
  
        mode = HRTIMER_MODE_REL;
-       hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+       hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
        hrtimer_set_expires(&hs.timer, kt);
  
-       hrtimer_init_sleeper(&hs, current);
        do {
                if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
                        break;
                set_current_state(TASK_UNINTERRUPTIBLE);
-               hrtimer_start_expires(&hs.timer, mode);
+               hrtimer_sleeper_start_expires(&hs, mode);
                if (hs.task)
                        io_schedule();
                hrtimer_cancel(&hs.timer);
diff --combined include/linux/sched.h
index f0edee94834a8262db98f3abe61721121ff1c625,8cc8e323093f7e79f22401fb187db42e6a3fbd78..b75b282870053e083a1f79ae9b55cba6a95caa74
  #include <linux/resource.h>
  #include <linux/latencytop.h>
  #include <linux/sched/prio.h>
+ #include <linux/sched/types.h>
  #include <linux/signal_types.h>
  #include <linux/mm_types_task.h>
  #include <linux/task_io_accounting.h>
+ #include <linux/posix-timers.h>
  #include <linux/rseq.h>
  
  /* task_struct member predeclarations (sorted alphabetically): */
@@@ -244,27 -246,6 +246,6 @@@ struct prev_cputime 
  #endif
  };
  
- /**
-  * struct task_cputime - collected CPU time counts
-  * @utime:            time spent in user mode, in nanoseconds
-  * @stime:            time spent in kernel mode, in nanoseconds
-  * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
-  *
-  * This structure groups together three kinds of CPU time that are tracked for
-  * threads and thread groups.  Most things considering CPU time want to group
-  * these counts together and treat all three of them in parallel.
-  */
- struct task_cputime {
-       u64                             utime;
-       u64                             stime;
-       unsigned long long              sum_exec_runtime;
- };
- /* Alternate field names when used on cache expirations: */
- #define virt_exp                      utime
- #define prof_exp                      stime
- #define sched_exp                     sum_exec_runtime
  enum vtime_state {
        /* Task is sleeping or running in a CPU with VTIME inactive: */
        VTIME_INACTIVE = 0,
@@@ -295,11 -276,6 +276,11 @@@ enum uclamp_id 
        UCLAMP_CNT
  };
  
 +#ifdef CONFIG_SMP
 +extern struct root_domain def_root_domain;
 +extern struct mutex sched_domains_mutex;
 +#endif
 +
  struct sched_info {
  #ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */
@@@ -881,10 -857,8 +862,8 @@@ struct task_struct 
        unsigned long                   min_flt;
        unsigned long                   maj_flt;
  
- #ifdef CONFIG_POSIX_TIMERS
-       struct task_cputime             cputime_expires;
-       struct list_head                cpu_timers[3];
- #endif
+       /* Empty if CONFIG_POSIX_CPUTIMERS=n */
+       struct posix_cputimers          posix_cputimers;
  
        /* Process credentials: */
  
        u64                             last_sum_exec_runtime;
        struct callback_head            numa_work;
  
 -      struct numa_group               *numa_group;
 +      /*
 +       * This pointer is only modified for current in syscall and
 +       * pagefault context (and for tasks being destroyed), so it can be read
 +       * from any of the following contexts:
 +       *  - RCU read-side critical section
 +       *  - current->numa_group from everywhere
 +       *  - task's runqueue locked, task not running
 +       */
 +      struct numa_group __rcu         *numa_group;
  
        /*
         * numa_faults is an array split into four regions:
@@@ -1772,7 -1738,7 +1751,7 @@@ static inline int test_tsk_need_resched
   * value indicates whether a reschedule was done in fact.
   * cond_resched_lock() will drop the spinlock before scheduling,
   */
 -#ifndef CONFIG_PREEMPT
 +#ifndef CONFIG_PREEMPTION
  extern int _cond_resched(void);
  #else
  static inline int _cond_resched(void) { return 0; }
@@@ -1801,12 -1767,12 +1780,12 @@@ static inline void cond_resched_rcu(voi
  
  /*
   * Does a critical section need to be broken due to another
 - * task waiting?: (technically does not depend on CONFIG_PREEMPT,
 + * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
   * but a general need for low latency)
   */
  static inline int spin_needbreak(spinlock_t *lock)
  {
 -#ifdef CONFIG_PREEMPT
 +#ifdef CONFIG_PREEMPTION
        return spin_is_contended(lock);
  #else
        return 0;
diff --combined include/linux/wait.h
index 30c515520fb28c69873aad6d0b498a59679f46ab,4707543ef5752a6bd321fde91b9be49eaef4a856..3eb7cae8206c38d153ecdb72f0c7666b4fec42b7
@@@ -126,19 -126,6 +126,19 @@@ static inline int waitqueue_active(stru
        return !list_empty(&wq_head->head);
  }
  
 +/**
 + * wq_has_single_sleeper - check if there is only one sleeper
 + * @wq_head: wait queue head
 + *
 + * Returns true of wq_head has only one sleeper on the list.
 + *
 + * Please refer to the comment for waitqueue_active.
 + */
 +static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
 +{
 +      return list_is_singular(&wq_head->head);
 +}
 +
  /**
   * wq_has_sleeper - check if there are any waiting processes
   * @wq_head: wait queue head
@@@ -501,8 -488,8 +501,8 @@@ do {                                                                               
        int __ret = 0;                                                          \
        struct hrtimer_sleeper __t;                                             \
                                                                                \
-       hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);   \
-       hrtimer_init_sleeper(&__t, current);                                    \
+       hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC,                    \
+                                     HRTIMER_MODE_REL);                        \
        if ((timeout) != KTIME_MAX)                                             \
                hrtimer_start_range_ns(&__t.timer, timeout,                     \
                                       current->timer_slack_ns,                 \
diff --combined init/init_task.c
index bfe06c53b14e06674a5fddde3944a9fdd9b96c35,d49692a0ec516b0d6b95d6272551903fbbdb14c2..9e5cbe5eab7b1143791c834afa6ece0ccdbd148f
@@@ -30,8 -30,6 +30,6 @@@ static struct signal_struct init_signal
        .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
        .cputimer       = {
                .cputime_atomic = INIT_CPUTIME_ATOMIC,
-               .running        = false,
-               .checking_timer = false,
        },
  #endif
        INIT_CPU_TIMERS(init_signals)
@@@ -174,7 -172,7 +172,7 @@@ struct task_struct init_tas
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        .ret_stack      = NULL,
  #endif
 -#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT)
 +#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
        .trace_recursion = 0,
  #endif
  #ifdef CONFIG_LIVEPATCH
diff --combined kernel/events/core.c
index 1c414b8866b454aed555aafdf34e823256f0c8ba,9d623e257a514acae29d8da3ce065616141f3923..4f08b17d642672f9822e3d842f07f2d836af6f9f
@@@ -1103,7 -1103,7 +1103,7 @@@ static void __perf_mux_hrtimer_init(str
        cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
  
        raw_spin_lock_init(&cpuctx->hrtimer_lock);
-       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        timer->function = perf_mux_hrtimer_handler;
  }
  
@@@ -1121,7 -1121,7 +1121,7 @@@ static int perf_mux_hrtimer_restart(str
        if (!cpuctx->hrtimer_active) {
                cpuctx->hrtimer_active = 1;
                hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
-               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
        }
        raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
  
@@@ -1887,89 -1887,6 +1887,89 @@@ list_del_event(struct perf_event *event
        ctx->generation++;
  }
  
 +static int
 +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
 +{
 +      if (!has_aux(aux_event))
 +              return 0;
 +
 +      if (!event->pmu->aux_output_match)
 +              return 0;
 +
 +      return event->pmu->aux_output_match(aux_event);
 +}
 +
 +static void put_event(struct perf_event *event);
 +static void event_sched_out(struct perf_event *event,
 +                          struct perf_cpu_context *cpuctx,
 +                          struct perf_event_context *ctx);
 +
 +static void perf_put_aux_event(struct perf_event *event)
 +{
 +      struct perf_event_context *ctx = event->ctx;
 +      struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 +      struct perf_event *iter;
 +
 +      /*
 +       * If event uses aux_event tear down the link
 +       */
 +      if (event->aux_event) {
 +              iter = event->aux_event;
 +              event->aux_event = NULL;
 +              put_event(iter);
 +              return;
 +      }
 +
 +      /*
 +       * If the event is an aux_event, tear down all links to
 +       * it from other events.
 +       */
 +      for_each_sibling_event(iter, event->group_leader) {
 +              if (iter->aux_event != event)
 +                      continue;
 +
 +              iter->aux_event = NULL;
 +              put_event(event);
 +
 +              /*
 +               * If it's ACTIVE, schedule it out and put it into ERROR
 +               * state so that we don't try to schedule it again. Note
 +               * that perf_event_enable() will clear the ERROR status.
 +               */
 +              event_sched_out(iter, cpuctx, ctx);
 +              perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +      }
 +}
 +
 +static int perf_get_aux_event(struct perf_event *event,
 +                            struct perf_event *group_leader)
 +{
 +      /*
 +       * Our group leader must be an aux event if we want to be
 +       * an aux_output. This way, the aux event will precede its
 +       * aux_output events in the group, and therefore will always
 +       * schedule first.
 +       */
 +      if (!group_leader)
 +              return 0;
 +
 +      if (!perf_aux_output_match(event, group_leader))
 +              return 0;
 +
 +      if (!atomic_long_inc_not_zero(&group_leader->refcount))
 +              return 0;
 +
 +      /*
 +       * Link aux_outputs to their aux event; this is undone in
 +       * perf_group_detach() by perf_put_aux_event(). When the
 +       * group in torn down, the aux_output events loose their
 +       * link to the aux_event and can't schedule any more.
 +       */
 +      event->aux_event = group_leader;
 +
 +      return 1;
 +}
 +
  static void perf_group_detach(struct perf_event *event)
  {
        struct perf_event *sibling, *tmp;
  
        event->attach_state &= ~PERF_ATTACH_GROUP;
  
 +      perf_put_aux_event(event);
 +
        /*
         * If this is a sibling, remove it from its group.
         */
@@@ -4174,8 -4089,10 +4174,8 @@@ alloc_perf_context(struct pmu *pmu, str
                return NULL;
  
        __perf_event_init_context(ctx);
 -      if (task) {
 -              ctx->task = task;
 -              get_task_struct(task);
 -      }
 +      if (task)
 +              ctx->task = get_task_struct(task);
        ctx->pmu = pmu;
  
        return ctx;
@@@ -9574,7 -9491,7 +9574,7 @@@ static void perf_swevent_start_hrtimer(
                period = max_t(u64, 10000, hwc->sample_period);
        }
        hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
-                     HRTIMER_MODE_REL_PINNED);
+                     HRTIMER_MODE_REL_PINNED_HARD);
  }
  
  static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@@ -9596,7 -9513,7 +9596,7 @@@ static void perf_swevent_init_hrtimer(s
        if (!is_sampling_event(event))
                return;
  
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hwc->hrtimer.function = perf_swevent_hrtimer;
  
        /*
@@@ -10438,7 -10355,8 +10438,7 @@@ perf_event_alloc(struct perf_event_att
                 * and we cannot use the ctx information because we need the
                 * pmu before we get a ctx.
                 */
 -              get_task_struct(task);
 -              event->hw.target = task;
 +              event->hw.target = get_task_struct(task);
        }
  
        event->clock = &local_clock;
                goto err_ns;
        }
  
 +      if (event->attr.aux_output &&
 +          !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
 +              err = -EOPNOTSUPP;
 +              goto err_pmu;
 +      }
 +
        err = exclusive_event_init(event);
        if (err)
                goto err_pmu;
@@@ -11170,8 -11082,6 +11170,8 @@@ SYSCALL_DEFINE5(perf_event_open
                }
        }
  
 +      if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
 +              goto err_locked;
  
        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
@@@ -11364,7 -11274,7 +11364,7 @@@ perf_event_create_kernel_counter(struc
                goto err_unlock;
        }
  
 -      perf_install_in_context(ctx, event, cpu);
 +      perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);
  
diff --combined kernel/fork.c
index 1d1cd06edbc178daa6347b85b447af3bf83e8d90,f1228d9f0b11747ca1b030cb831ecb0863e7af9c..53e780748fe3367973182edd594c3e27fc9108fd
@@@ -726,7 -726,7 +726,7 @@@ void __put_task_struct(struct task_stru
        WARN_ON(tsk == current);
  
        cgroup_free(tsk);
 -      task_numa_free(tsk);
 +      task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
@@@ -768,7 -768,6 +768,7 @@@ static void set_max_threads(unsigned in
  int arch_task_struct_size __read_mostly;
  #endif
  
 +#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
  {
        /* Fetch thread_struct whitelist for the architecture. */
        else
                *offset += offsetof(struct task_struct, thread);
  }
 +#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
  
  void __init fork_init(void)
  {
@@@ -1519,28 -1517,17 +1519,17 @@@ void __cleanup_sighand(struct sighand_s
        }
  }
  
- #ifdef CONFIG_POSIX_TIMERS
  /*
   * Initialize POSIX timer handling for a thread group.
   */
  static void posix_cpu_timers_init_group(struct signal_struct *sig)
  {
+       struct posix_cputimers *pct = &sig->posix_cputimers;
        unsigned long cpu_limit;
  
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
-       if (cpu_limit != RLIM_INFINITY) {
-               sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
-               sig->cputimer.running = true;
-       }
-       /* The timer lists. */
-       INIT_LIST_HEAD(&sig->cpu_timers[0]);
-       INIT_LIST_HEAD(&sig->cpu_timers[1]);
-       INIT_LIST_HEAD(&sig->cpu_timers[2]);
+       posix_cputimers_group_init(pct, cpu_limit);
  }
- #else
- static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
- #endif
  
  static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
  {
@@@ -1642,23 -1629,6 +1631,6 @@@ static void rt_mutex_init_task(struct t
  #endif
  }
  
- #ifdef CONFIG_POSIX_TIMERS
- /*
-  * Initialize POSIX timer handling for a single task.
-  */
- static void posix_cpu_timers_init(struct task_struct *tsk)
- {
-       tsk->cputime_expires.prof_exp = 0;
-       tsk->cputime_expires.virt_exp = 0;
-       tsk->cputime_expires.sched_exp = 0;
-       INIT_LIST_HEAD(&tsk->cpu_timers[0]);
-       INIT_LIST_HEAD(&tsk->cpu_timers[1]);
-       INIT_LIST_HEAD(&tsk->cpu_timers[2]);
- }
- #else
- static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
- #endif
  static inline void init_task_pid_links(struct task_struct *task)
  {
        enum pid_type type;
@@@ -1692,14 -1662,6 +1664,14 @@@ static inline void rcu_copy_process(str
  #endif /* #ifdef CONFIG_TASKS_RCU */
  }
  
 +struct pid *pidfd_pid(const struct file *file)
 +{
 +      if (file->f_op == &pidfd_fops)
 +              return file->private_data;
 +
 +      return ERR_PTR(-EBADF);
 +}
 +
  static int pidfd_release(struct inode *inode, struct file *file)
  {
        struct pid *pid = file->private_data;
@@@ -1945,7 -1907,7 +1917,7 @@@ static __latent_entropy struct task_str
        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);
  
-       posix_cpu_timers_init(p);
+       posix_cputimers_init(&p->posix_cputimers);
  
        p->io_context = NULL;
        audit_set_context(p, NULL);
@@@ -2348,8 -2310,6 +2320,8 @@@ struct mm_struct *copy_init_mm(void
   *
   * It copies the process, and if successful kick-starts
   * it and waits for it to finish using the VM if required.
 + *
 + * args->exit_signal is expected to be checked for sanity by the caller.
   */
  long _do_fork(struct kernel_clone_args *args)
  {
@@@ -2574,14 -2534,6 +2546,14 @@@ noinline static int copy_clone_args_fro
        if (copy_from_user(&args, uargs, size))
                return -EFAULT;
  
 +      /*
 +       * Verify that higher 32bits of exit_signal are unset and that
 +       * it is a valid signal
 +       */
 +      if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
 +                   !valid_signal(args.exit_signal)))
 +              return -EINVAL;
 +
        *kargs = (struct kernel_clone_args){
                .flags          = args.flags,
                .pidfd          = u64_to_user_ptr(args.pidfd),
diff --combined kernel/sched/core.c
index 06961b997ed6d8c13ced5558520f75b07c85aedc,389e0993fbb4ae87c45ad8c8a72cb93348114a06..5e8387bdd09c65c9b804534afba93a654d39d8a3
@@@ -255,7 -255,7 +255,7 @@@ static void __hrtick_restart(struct rq 
  {
        struct hrtimer *timer = &rq->hrtick_timer;
  
-       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
  }
  
  /*
@@@ -314,7 -314,7 +314,7 @@@ void hrtick_start(struct rq *rq, u64 de
         */
        delay = max_t(u64, delay, 10000LL);
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
-                     HRTIMER_MODE_REL_PINNED);
+                     HRTIMER_MODE_REL_PINNED_HARD);
  }
  #endif /* CONFIG_SMP */
  
@@@ -328,7 -328,7 +328,7 @@@ static void hrtick_rq_init(struct rq *r
        rq->hrtick_csd.info = rq;
  #endif
  
-       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
  }
  #else /* CONFIG_SCHED_HRTICK */
@@@ -773,18 -773,6 +773,18 @@@ static void set_load_weight(struct task
  }
  
  #ifdef CONFIG_UCLAMP_TASK
 +/*
 + * Serializes updates of utilization clamp values
 + *
 + * The (slow-path) user-space triggers utilization clamp value updates which
 + * can require updates on (fast-path) scheduler's data structures used to
 + * support enqueue/dequeue operations.
 + * While the per-CPU rq lock protects fast-path update operations, user-space
 + * requests are serialized using a mutex to reduce the risk of conflicting
 + * updates or API abuses.
 + */
 +static DEFINE_MUTEX(uclamp_mutex);
 +
  /* Max allowed minimum utilization */
  unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
  
@@@ -810,7 -798,7 +810,7 @@@ static inline unsigned int uclamp_bucke
        return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
  }
  
 -static inline unsigned int uclamp_none(int clamp_id)
 +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
  {
        if (clamp_id == UCLAMP_MIN)
                return 0;
@@@ -826,7 -814,7 +826,7 @@@ static inline void uclamp_se_set(struc
  }
  
  static inline unsigned int
 -uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
 +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                  unsigned int clamp_value)
  {
        /*
        return uclamp_none(UCLAMP_MIN);
  }
  
 -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
 +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                     unsigned int clamp_value)
  {
        /* Reset max-clamp retention only on idle exit */
  }
  
  static inline
 -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
 -                               unsigned int clamp_value)
 +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
 +                                 unsigned int clamp_value)
  {
        struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
        int bucket_id = UCLAMP_BUCKETS - 1;
        return uclamp_idle_value(rq, clamp_id, clamp_value);
  }
  
 +static inline struct uclamp_se
 +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 +{
 +      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      struct uclamp_se uc_max;
 +
 +      /*
 +       * Tasks in autogroups or root task group will be
 +       * restricted by system defaults.
 +       */
 +      if (task_group_is_autogroup(task_group(p)))
 +              return uc_req;
 +      if (task_group(p) == &root_task_group)
 +              return uc_req;
 +
 +      uc_max = task_group(p)->uclamp[clamp_id];
 +      if (uc_req.value > uc_max.value || !uc_req.user_defined)
 +              return uc_max;
 +#endif
 +
 +      return uc_req;
 +}
 +
  /*
   * The effective clamp bucket index of a task depends on, by increasing
   * priority:
   * - the task specific clamp value, when explicitly requested from userspace
 + * - the task group effective clamp value, for tasks not either in the root
 + *   group or in an autogroup
   * - the system default clamp value, defined by the sysadmin
   */
  static inline struct uclamp_se
 -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
 +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
  {
 -      struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 +      struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
        struct uclamp_se uc_max = uclamp_default[clamp_id];
  
        /* System default restrictions always apply */
        return uc_req;
  }
  
 -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
 +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  {
        struct uclamp_se uc_eff;
  
   * for each bucket when all its RUNNABLE tasks require the same clamp.
   */
  static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
 -                                  unsigned int clamp_id)
 +                                  enum uclamp_id clamp_id)
  {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
   * enforce the expected state and warn.
   */
  static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 -                                  unsigned int clamp_id)
 +                                  enum uclamp_id clamp_id)
  {
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
  
  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
  
  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
                uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
 +static inline void
 +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
 +{
 +      struct rq_flags rf;
 +      struct rq *rq;
 +
 +      /*
 +       * Lock the task and the rq where the task is (or was) queued.
 +       *
 +       * We might lock the (previous) rq of a !RUNNABLE task, but that's the
 +       * price to pay to safely serialize util_{min,max} updates with
 +       * enqueues, dequeues and migration operations.
 +       * This is the same locking schema used by __set_cpus_allowed_ptr().
 +       */
 +      rq = task_rq_lock(p, &rf);
 +
 +      /*
 +       * Setting the clamp bucket is serialized by task_rq_lock().
 +       * If the task is not yet RUNNABLE and its task_struct is not
 +       * affecting a valid clamp bucket, the next time it's enqueued,
 +       * it will already see the updated clamp bucket value.
 +       */
 +      if (!p->uclamp[clamp_id].active) {
 +              uclamp_rq_dec_id(rq, p, clamp_id);
 +              uclamp_rq_inc_id(rq, p, clamp_id);
 +      }
 +
 +      task_rq_unlock(rq, p, &rf);
 +}
 +
 +static inline void
 +uclamp_update_active_tasks(struct cgroup_subsys_state *css,
 +                         unsigned int clamps)
 +{
 +      enum uclamp_id clamp_id;
 +      struct css_task_iter it;
 +      struct task_struct *p;
 +
 +      css_task_iter_start(css, 0, &it);
 +      while ((p = css_task_iter_next(&it))) {
 +              for_each_clamp_id(clamp_id) {
 +                      if ((0x1 << clamp_id) & clamps)
 +                              uclamp_update_active(p, clamp_id);
 +              }
 +      }
 +      css_task_iter_end(&it);
 +}
 +
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +static void cpu_util_update_eff(struct cgroup_subsys_state *css);
 +static void uclamp_update_root_tg(void)
 +{
 +      struct task_group *tg = &root_task_group;
 +
 +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
 +                    sysctl_sched_uclamp_util_min, false);
 +      uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
 +                    sysctl_sched_uclamp_util_max, false);
 +
 +      rcu_read_lock();
 +      cpu_util_update_eff(&root_task_group.css);
 +      rcu_read_unlock();
 +}
 +#else
 +static void uclamp_update_root_tg(void) { }
 +#endif
 +
  int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp,
                                loff_t *ppos)
  {
 +      bool update_root_tg = false;
        int old_min, old_max;
 -      static DEFINE_MUTEX(mutex);
        int result;
  
 -      mutex_lock(&mutex);
 +      mutex_lock(&uclamp_mutex);
        old_min = sysctl_sched_uclamp_util_min;
        old_max = sysctl_sched_uclamp_util_max;
  
        if (old_min != sysctl_sched_uclamp_util_min) {
                uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                              sysctl_sched_uclamp_util_min, false);
 +              update_root_tg = true;
        }
        if (old_max != sysctl_sched_uclamp_util_max) {
                uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                              sysctl_sched_uclamp_util_max, false);
 +              update_root_tg = true;
        }
  
 +      if (update_root_tg)
 +              uclamp_update_root_tg();
 +
        /*
 -       * Updating all the RUNNABLE task is expensive, keep it simple and do
 -       * just a lazy update at each next enqueue time.
 +       * We update all RUNNABLE tasks only when task groups are in use.
 +       * Otherwise, keep it simple and do just a lazy update at each next
 +       * task enqueue time.
         */
 +
        goto done;
  
  undo:
        sysctl_sched_uclamp_util_min = old_min;
        sysctl_sched_uclamp_util_max = old_max;
  done:
 -      mutex_unlock(&mutex);
 +      mutex_unlock(&uclamp_mutex);
  
        return result;
  }
@@@ -1187,7 -1075,7 +1187,7 @@@ static int uclamp_validate(struct task_
  static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        /*
         * On scheduling class change, reset to default clamps for tasks
  
  static void uclamp_fork(struct task_struct *p)
  {
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
  
        for_each_clamp_id(clamp_id)
                p->uclamp[clamp_id].active = false;
  static void __init init_uclamp(void)
  {
        struct uclamp_se uc_max = {};
 -      unsigned int clamp_id;
 +      enum uclamp_id clamp_id;
        int cpu;
  
 +      mutex_init(&uclamp_mutex);
 +
        for_each_possible_cpu(cpu) {
                memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
                cpu_rq(cpu)->uclamp_flags = 0;
  
        /* System defaults allow max clamp values for both indexes */
        uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
 -      for_each_clamp_id(clamp_id)
 +      for_each_clamp_id(clamp_id) {
                uclamp_default[clamp_id] = uc_max;
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +              root_task_group.uclamp_req[clamp_id] = uc_max;
 +              root_task_group.uclamp[clamp_id] = uc_max;
 +#endif
 +      }
  }
  
  #else /* CONFIG_UCLAMP_TASK */
@@@ -1613,7 -1494,7 +1613,7 @@@ void do_set_cpus_allowed(struct task_st
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  }
  
  /*
@@@ -3333,8 -3214,12 +3333,8 @@@ static __always_inline struct rq 
  context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
  {
 -      struct mm_struct *mm, *oldmm;
 -
        prepare_task_switch(rq, prev, next);
  
 -      mm = next->mm;
 -      oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
        arch_start_context_switch(prev);
  
        /*
 -       * If mm is non-NULL, we pass through switch_mm(). If mm is
 -       * NULL, we will pass through mmdrop() in finish_task_switch().
 -       * Both of these contain the full memory barrier required by
 -       * membarrier after storing to rq->curr, before returning to
 -       * user-space.
 +       * kernel -> kernel   lazy + transfer active
 +       *   user -> kernel   lazy + mmgrab() active
 +       *
 +       * kernel ->   user   switch + mmdrop() active
 +       *   user ->   user   switch
         */
 -      if (!mm) {
 -              next->active_mm = oldmm;
 -              mmgrab(oldmm);
 -              enter_lazy_tlb(oldmm, next);
 -      } else
 -              switch_mm_irqs_off(oldmm, mm, next);
 +      if (!next->mm) {                                // to kernel
 +              enter_lazy_tlb(prev->active_mm, next);
 +
 +              next->active_mm = prev->active_mm;
 +              if (prev->mm)                           // from user
 +                      mmgrab(prev->active_mm);
 +              else
 +                      prev->active_mm = NULL;
 +      } else {                                        // to user
 +              /*
 +               * sys_membarrier() requires an smp_mb() between setting
 +               * rq->curr and returning to userspace.
 +               *
 +               * The below provides this either through switch_mm(), or in
 +               * case 'prev->active_mm == next->mm' through
 +               * finish_task_switch()'s mmdrop().
 +               */
  
 -      if (!prev->mm) {
 -              prev->active_mm = NULL;
 -              rq->prev_mm = oldmm;
 +              switch_mm_irqs_off(prev->active_mm, next->mm, next);
 +
 +              if (!prev->mm) {                        // from kernel
 +                      /* will mmdrop() in finish_task_switch(). */
 +                      rq->prev_mm = prev->active_mm;
 +                      prev->active_mm = NULL;
 +              }
        }
  
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
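
The four-way table in the new comment boils down to one rule: a kernel thread (next->mm == NULL) borrows the previous task's active_mm, taking a reference only when that mm belonged to a user task. A toy refcount model of the two "to kernel" rows, with invented struct names rather than the real mm_struct/task_struct, compiles as plain C:

#include <assert.h>
#include <stddef.h>

struct mm   { int refs; };			/* stand-in for mm_struct   */
struct task { struct mm *mm, *active_mm; };	/* stand-in for task_struct */

/* "to kernel" rows of the table above: borrow prev's address space */
static void switch_to_kthread(struct task *prev, struct task *next)
{
	next->active_mm = prev->active_mm;
	if (prev->mm)				/* user -> kernel: mmgrab()   */
		prev->active_mm->refs++;
	else					/* kernel -> kernel: transfer */
		prev->active_mm = NULL;
}

int main(void)
{
	struct mm user_mm = { .refs = 1 };
	struct task user = { .mm = &user_mm, .active_mm = &user_mm };
	struct task kthread = { .mm = NULL, .active_mm = NULL };

	switch_to_kthread(&user, &kthread);
	assert(user_mm.refs == 2);		/* pinned while borrowed */
	return 0;
}
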
@@@ -3616,36 -3486,8 +3616,36 @@@ void scheduler_tick(void
  
  struct tick_work {
        int                     cpu;
 +      atomic_t                state;
        struct delayed_work     work;
  };
 +/* Values for ->state, see diagram below. */
 +#define TICK_SCHED_REMOTE_OFFLINE     0
 +#define TICK_SCHED_REMOTE_OFFLINING   1
 +#define TICK_SCHED_REMOTE_RUNNING     2
 +
 +/*
 + * State diagram for ->state:
 + *
 + *
 + *          TICK_SCHED_REMOTE_OFFLINE
 + *                    |   ^
 + *                    |   |
 + *                    |   | sched_tick_remote()
 + *                    |   |
 + *                    |   |
 + *                    +--TICK_SCHED_REMOTE_OFFLINING
 + *                    |   ^
 + *                    |   |
 + * sched_tick_start() |   | sched_tick_stop()
 + *                    |   |
 + *                    V   |
 + *          TICK_SCHED_REMOTE_RUNNING
 + *
 + *
 + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
 + * and sched_tick_start() are happy to leave the state in RUNNING.
 + */
  
  static struct tick_work __percpu *tick_work_cpu;
  
@@@ -3658,7 -3500,6 +3658,7 @@@ static void sched_tick_remote(struct wo
        struct task_struct *curr;
        struct rq_flags rf;
        u64 delta;
 +      int os;
  
        /*
         * Handle the tick only if it appears the remote CPU is running in full
  
        rq_lock_irq(rq, &rf);
        curr = rq->curr;
 -      if (is_idle_task(curr))
 +      if (is_idle_task(curr) || cpu_is_offline(cpu))
                goto out_unlock;
  
        update_rq_clock(rq);
@@@ -3692,18 -3533,13 +3692,18 @@@ out_requeue
        /*
         * Run the remote tick once per second (1Hz). This arbitrary
         * frequency is large enough to avoid overload but short enough
 -       * to keep scheduler internal stats reasonably up to date.
 +       * to keep scheduler internal stats reasonably up to date.  But
 +       * first update state to reflect hotplug activity if required.
         */
 -      queue_delayed_work(system_unbound_wq, dwork, HZ);
 +      os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
 +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
 +      if (os == TICK_SCHED_REMOTE_RUNNING)
 +              queue_delayed_work(system_unbound_wq, dwork, HZ);
  }
  
  static void sched_tick_start(int cpu)
  {
 +      int os;
        struct tick_work *twork;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
        WARN_ON_ONCE(!tick_work_cpu);
  
        twork = per_cpu_ptr(tick_work_cpu, cpu);
 -      twork->cpu = cpu;
 -      INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
 -      queue_delayed_work(system_unbound_wq, &twork->work, HZ);
 +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
 +      WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
 +      if (os == TICK_SCHED_REMOTE_OFFLINE) {
 +              twork->cpu = cpu;
 +              INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
 +              queue_delayed_work(system_unbound_wq, &twork->work, HZ);
 +      }
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
  static void sched_tick_stop(int cpu)
  {
        struct tick_work *twork;
 +      int os;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TICK))
                return;
        WARN_ON_ONCE(!tick_work_cpu);
  
        twork = per_cpu_ptr(tick_work_cpu, cpu);
 -      cancel_delayed_work_sync(&twork->work);
 +      /* There cannot be competing actions, but don't rely on stop-machine. */
 +      os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
 +      WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
 +      /* Don't cancel, as this would mess up the state machine. */
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
@@@ -3744,6 -3572,7 +3744,6 @@@ int __init sched_tick_offload_init(void
  {
        tick_work_cpu = alloc_percpu(struct tick_work);
        BUG_ON(!tick_work_cpu);
 -
        return 0;
  }
  
@@@ -3752,7 -3581,7 +3752,7 @@@ static inline void sched_tick_start(in
  static inline void sched_tick_stop(int cpu) { }
  #endif
  
 -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_TRACE_PREEMPT_TOGGLE))
  /*
   * If the value passed in is equal to the current preempt count
@@@ -3910,7 -3739,7 +3910,7 @@@ pick_next_task(struct rq *rq, struct ta
  
                p = fair_sched_class.pick_next_task(rq, prev, rf);
                if (unlikely(p == RETRY_TASK))
 -                      goto again;
 +                      goto restart;
  
                /* Assumes fair_sched_class->next == idle_sched_class */
                if (unlikely(!p))
                return p;
        }
  
 -again:
 +restart:
 +      /*
 +       * Ensure that we put DL/RT tasks before the pick loop, such that they
 +       * can PULL higher prio tasks when we lower the RQ 'priority'.
 +       */
 +      prev->sched_class->put_prev_task(rq, prev, rf);
 +      if (!rq->nr_running)
 +              newidle_balance(rq, rf);
 +
        for_each_class(class) {
 -              p = class->pick_next_task(rq, prev, rf);
 -              if (p) {
 -                      if (unlikely(p == RETRY_TASK))
 -                              goto again;
 +              p = class->pick_next_task(rq, NULL, NULL);
 +              if (p)
                        return p;
 -              }
        }
  
        /* The idle class should always have a runnable task: */
   *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   *      called on the nearest possible occasion:
   *
 - *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 + *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
   *
   *         - in syscall or exception context, at the next outmost
   *           preempt_enable(). (this might be as soon as the wake_up()'s
   *         - in IRQ context, return from interrupt-handler to
   *           preemptible context
   *
 - *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 + *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
   *         then at the next:
   *
   *          - cond_resched() call
@@@ -4080,7 -3904,7 +4080,7 @@@ void __noreturn do_task_dead(void
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
 -      if (!tsk->state || tsk_is_pi_blocked(tsk))
 +      if (!tsk->state)
                return;
  
        /*
                preempt_enable_no_resched();
        }
  
 +      if (tsk_is_pi_blocked(tsk))
 +              return;
 +
        /*
         * If we are going to sleep and we have plugged IO queued,
         * make sure to submit it to avoid deadlocks.
@@@ -4212,7 -4033,7 +4212,7 @@@ static void __sched notrace preempt_sch
        } while (need_resched());
  }
  
 -#ifdef CONFIG_PREEMPT
 +#ifdef CONFIG_PREEMPTION
  /*
   * this is the entry point to schedule() from in-kernel preemption
   * off of preempt_enable. Kernel preemptions off return from interrupt
@@@ -4284,7 -4105,7 +4284,7 @@@ asmlinkage __visible void __sched notra
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
 -#endif /* CONFIG_PREEMPT */
 +#endif /* CONFIG_PREEMPTION */
  
  /*
   * this is the entry point to schedule() from kernel preemption
@@@ -4452,7 -4273,7 +4452,7 @@@ void rt_mutex_setprio(struct task_struc
        if (queued)
                enqueue_task(rq, p, queue_flag);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  
        check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@@ -4519,7 -4340,7 +4519,7 @@@ void set_user_nice(struct task_struct *
                        resched_curr(rq);
        }
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  out_unlock:
        task_rq_unlock(rq, p, &rf);
  }
@@@ -4836,9 -4657,6 +4836,9 @@@ recheck
                        return retval;
        }
  
 +      if (pi)
 +              cpuset_read_lock();
 +
        /*
         * Make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
         * Changing the policy of the stop threads its a very bad idea:
         */
        if (p == rq->stop) {
 -              task_rq_unlock(rq, p, &rf);
 -              return -EINVAL;
 +              retval = -EINVAL;
 +              goto unlock;
        }
  
        /*
                        goto change;
  
                p->sched_reset_on_fork = reset_on_fork;
 -              task_rq_unlock(rq, p, &rf);
 -              return 0;
 +              retval = 0;
 +              goto unlock;
        }
  change:
  
                if (rt_bandwidth_enabled() && rt_policy(policy) &&
                                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                                !task_group_is_autogroup(task_group(p))) {
 -                      task_rq_unlock(rq, p, &rf);
 -                      return -EPERM;
 +                      retval = -EPERM;
 +                      goto unlock;
                }
  #endif
  #ifdef CONFIG_SMP
                         */
                        if (!cpumask_subset(span, p->cpus_ptr) ||
                            rq->rd->dl_bw.bw == 0) {
 -                              task_rq_unlock(rq, p, &rf);
 -                              return -EPERM;
 +                              retval = -EPERM;
 +                              goto unlock;
                        }
                }
  #endif
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
                task_rq_unlock(rq, p, &rf);
 +              if (pi)
 +                      cpuset_read_unlock();
                goto recheck;
        }
  
         * is available.
         */
        if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
 -              task_rq_unlock(rq, p, &rf);
 -              return -EBUSY;
 +              retval = -EBUSY;
 +              goto unlock;
        }
  
        p->sched_reset_on_fork = reset_on_fork;
                enqueue_task(rq, p, queue_flags);
        }
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
  
        check_class_changed(rq, p, prev_class, oldprio);
  
        preempt_disable();
        task_rq_unlock(rq, p, &rf);
  
 -      if (pi)
 +      if (pi) {
 +              cpuset_read_unlock();
                rt_mutex_adjust_pi(p);
 +      }
  
        /* Run balance callbacks after we've adjusted the PI chain: */
        balance_callback(rq);
        preempt_enable();
  
        return 0;
 +
 +unlock:
 +      task_rq_unlock(rq, p, &rf);
 +      if (pi)
 +              cpuset_read_unlock();
 +      return retval;
  }
  
  static int _sched_setscheduler(struct task_struct *p, int policy,
@@@ -5074,15 -4882,10 +5074,15 @@@ do_sched_setscheduler(pid_t pid, int po
        rcu_read_lock();
        retval = -ESRCH;
        p = find_process_by_pid(pid);
 -      if (p != NULL)
 -              retval = sched_setscheduler(p, policy, &lparam);
 +      if (likely(p))
 +              get_task_struct(p);
        rcu_read_unlock();
  
 +      if (likely(p)) {
 +              retval = sched_setscheduler(p, policy, &lparam);
 +              put_task_struct(p);
 +      }
 +
        return retval;
  }
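
do_sched_setscheduler() now pins the task with get_task_struct() while still inside the RCU read section and only calls sched_setscheduler() after rcu_read_unlock(), so the potentially sleeping path no longer runs under RCU. The same lookup-pin-use shape, with an invented lock-protected table standing in for the task list, looks like:

#include <pthread.h>
#include <stdio.h>

struct obj { int id; int refs; };

static struct obj table[2] = { { .id = 1, .refs = 1 }, { .id = 2, .refs = 1 } };
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *lookup(int id)		/* call with table_lock held */
{
	for (unsigned i = 0; i < 2; i++)
		if (table[i].id == id)
			return &table[i];
	return NULL;
}

static void slow_operation(struct obj *o)	/* may block; lock not held  */
{
	printf("operating on %d\n", o->id);
}

static int operate_on(int id)
{
	struct obj *o;

	pthread_mutex_lock(&table_lock);
	o = lookup(id);
	if (o)
		o->refs++;		/* pin, like get_task_struct()   */
	pthread_mutex_unlock(&table_lock);

	if (!o)
		return -1;		/* -ESRCH counterpart            */

	slow_operation(o);		/* safe: we hold our own reference */
	o->refs--;			/* put_task_struct() counterpart */
	return 0;
}

int main(void)
{
	return operate_on(1);
}
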
  
@@@ -5299,40 -5102,37 +5299,40 @@@ out_unlock
        return retval;
  }
  
 -static int sched_read_attr(struct sched_attr __user *uattr,
 -                         struct sched_attr *attr,
 -                         unsigned int usize)
 +/*
 + * Copy the kernel size attribute structure (which might be larger
 + * than what user-space knows about) to user-space.
 + *
 + * Note that all cases are valid: user-space buffer can be larger or
 + * smaller than the kernel-space buffer. The usual case is that both
 + * have the same size.
 + */
 +static int
 +sched_attr_copy_to_user(struct sched_attr __user *uattr,
 +                      struct sched_attr *kattr,
 +                      unsigned int usize)
  {
 -      int ret;
 +      unsigned int ksize = sizeof(*kattr);
  
        if (!access_ok(uattr, usize))
                return -EFAULT;
  
        /*
 -       * If we're handed a smaller struct than we know of,
 -       * ensure all the unknown bits are 0 - i.e. old
 -       * user-space does not get uncomplete information.
 +       * sched_getattr() ABI forwards and backwards compatibility:
 +       *
 +       * If usize == ksize then we just copy everything to user-space and all is good.
 +       *
 +       * If usize < ksize then we only copy as much as user-space has space for,
 +       * this keeps ABI compatibility as well. We skip the rest.
 +       *
 +       * If usize > ksize then user-space is using a newer version of the ABI,
 +       * which part the kernel doesn't know about. Just ignore it - tooling can
 +       * detect the kernel's knowledge of attributes from the attr->size value
 +       * which is set to ksize in this case.
         */
 -      if (usize < sizeof(*attr)) {
 -              unsigned char *addr;
 -              unsigned char *end;
 -
 -              addr = (void *)attr + usize;
 -              end  = (void *)attr + sizeof(*attr);
 -
 -              for (; addr < end; addr++) {
 -                      if (*addr)
 -                              return -EFBIG;
 -              }
 -
 -              attr->size = usize;
 -      }
 +      kattr->size = min(usize, ksize);
  
 -      ret = copy_to_user(uattr, attr, attr->size);
 -      if (ret)
 +      if (copy_to_user(uattr, kattr, kattr->size))
                return -EFAULT;
  
        return 0;
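
The new helper clamps the copy size to min(usize, ksize) and stores that value in kattr->size, so both older and newer user-space see a consistent struct. A freestanding illustration of the same forwards/backwards-compatible copy, using a dummy struct and memcpy() in place of copy_to_user():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct kern_attr {			/* what the "kernel" knows about */
	uint32_t size;
	uint32_t flags;
	uint64_t newer_field;		/* unknown to old user-space     */
};

/* Copy at most what the caller has room for; report how much we know. */
static int copy_attr_out(void *ubuf, unsigned int usize, struct kern_attr *k)
{
	unsigned int ksize = sizeof(*k);

	k->size = usize < ksize ? usize : ksize;
	memcpy(ubuf, k, k->size);	/* stands in for copy_to_user()  */
	return 0;
}

int main(void)
{
	struct kern_attr k = { .flags = 1, .newer_field = 42 };
	unsigned char old_userspace_buf[8];	/* pretend VER0-sized buffer */

	copy_attr_out(old_userspace_buf, sizeof(old_userspace_buf), &k);
	printf("copied %u of %zu bytes\n", (unsigned)k.size, sizeof(k));
	return 0;
}
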
   * sys_sched_getattr - similar to sched_getparam, but with sched_attr
   * @pid: the pid in question.
   * @uattr: structure containing the extended parameters.
 - * @size: sizeof(attr) for fwd/bwd comp.
 + * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
   * @flags: for future extension.
   */
  SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 -              unsigned int, size, unsigned int, flags)
 +              unsigned int, usize, unsigned int, flags)
  {
 -      struct sched_attr attr = {
 -              .size = sizeof(struct sched_attr),
 -      };
 +      struct sched_attr kattr = { };
        struct task_struct *p;
        int retval;
  
 -      if (!uattr || pid < 0 || size > PAGE_SIZE ||
 -          size < SCHED_ATTR_SIZE_VER0 || flags)
 +      if (!uattr || pid < 0 || usize > PAGE_SIZE ||
 +          usize < SCHED_ATTR_SIZE_VER0 || flags)
                return -EINVAL;
  
        rcu_read_lock();
        if (retval)
                goto out_unlock;
  
 -      attr.sched_policy = p->policy;
 +      kattr.sched_policy = p->policy;
        if (p->sched_reset_on_fork)
 -              attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 +              kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
        if (task_has_dl_policy(p))
 -              __getparam_dl(p, &attr);
 +              __getparam_dl(p, &kattr);
        else if (task_has_rt_policy(p))
 -              attr.sched_priority = p->rt_priority;
 +              kattr.sched_priority = p->rt_priority;
        else
 -              attr.sched_nice = task_nice(p);
 +              kattr.sched_nice = task_nice(p);
  
  #ifdef CONFIG_UCLAMP_TASK
 -      attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 -      attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 +      kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 +      kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
  #endif
  
        rcu_read_unlock();
  
 -      retval = sched_read_attr(uattr, &attr, size);
 -      return retval;
 +      return sched_attr_copy_to_user(uattr, &kattr, usize);
  
  out_unlock:
        rcu_read_unlock();
@@@ -5613,7 -5416,7 +5613,7 @@@ SYSCALL_DEFINE0(sched_yield
        return 0;
  }
  
 -#ifndef CONFIG_PREEMPT
 +#ifndef CONFIG_PREEMPTION
  int __sched _cond_resched(void)
  {
        if (should_resched(0)) {
@@@ -5630,7 -5433,7 +5630,7 @@@ EXPORT_SYMBOL(_cond_resched)
   * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   * call schedule, and on return reacquire the lock.
   *
 - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
   * operations here to prevent schedule() from being called twice (once via
   * spin_unlock(), once by hand).
   */
@@@ -6169,7 -5972,7 +6169,7 @@@ void sched_setnuma(struct task_struct *
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
 -              set_curr_task(rq, p);
 +              set_next_task(rq, p);
        task_rq_unlock(rq, p, &rf);
  }
  #endif /* CONFIG_NUMA_BALANCING */
@@@ -6209,22 -6012,21 +6209,22 @@@ static void calc_load_migrate(struct r
                atomic_long_add(delta, &calc_load_tasks);
  }
  
 -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
 +static struct task_struct *__pick_migrate_task(struct rq *rq)
  {
 -}
 +      const struct sched_class *class;
 +      struct task_struct *next;
  
 -static const struct sched_class fake_sched_class = {
 -      .put_prev_task = put_prev_task_fake,
 -};
 +      for_each_class(class) {
 +              next = class->pick_next_task(rq, NULL, NULL);
 +              if (next) {
 +                      next->sched_class->put_prev_task(rq, next, NULL);
 +                      return next;
 +              }
 +      }
  
 -static struct task_struct fake_task = {
 -      /*
 -       * Avoid pull_{rt,dl}_task()
 -       */
 -      .prio = MAX_PRIO + 1,
 -      .sched_class = &fake_sched_class,
 -};
 +      /* The idle class should always have a runnable task */
 +      BUG();
 +}
  
  /*
   * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@@ -6267,7 -6069,12 +6267,7 @@@ static void migrate_tasks(struct rq *de
                if (rq->nr_running == 1)
                        break;
  
 -              /*
 -               * pick_next_task() assumes pinned rq->lock:
 -               */
 -              next = pick_next_task(rq, &fake_task, rf);
 -              BUG_ON(!next);
 -              put_prev_task(rq, next);
 +              next = __pick_migrate_task(rq);
  
                /*
                 * Rules for changing task_struct::cpus_mask are holding
@@@ -6564,19 -6371,19 +6564,19 @@@ DECLARE_PER_CPU(cpumask_var_t, select_i
  
  void __init sched_init(void)
  {
 -      unsigned long alloc_size = 0, ptr;
 +      unsigned long ptr = 0;
        int i;
  
        wait_bit_init();
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 +      ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
 -      alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 +      ptr += 2 * nr_cpu_ids * sizeof(void **);
  #endif
 -      if (alloc_size) {
 -              ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 +      if (ptr) {
 +              ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
                root_task_group.se = (struct sched_entity **)ptr;
@@@ -6895,7 -6702,7 +6895,7 @@@ struct task_struct *curr_task(int cpu
  
  #ifdef CONFIG_IA64
  /**
 - * set_curr_task - set the current task for a given CPU.
 + * ia64_set_curr_task - set the current task for a given CPU.
   * @cpu: the processor in question.
   * @p: the task pointer to set.
   *
@@@ -6920,20 -6727,6 +6920,20 @@@ void ia64_set_curr_task(int cpu, struc
  /* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
 +static inline void alloc_uclamp_sched_group(struct task_group *tg,
 +                                          struct task_group *parent)
 +{
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      enum uclamp_id clamp_id;
 +
 +      for_each_clamp_id(clamp_id) {
 +              uclamp_se_set(&tg->uclamp_req[clamp_id],
 +                            uclamp_none(clamp_id), false);
 +              tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
 +      }
 +#endif
 +}
 +
  static void sched_free_group(struct task_group *tg)
  {
        free_fair_sched_group(tg);
@@@ -6957,8 -6750,6 +6957,8 @@@ struct task_group *sched_create_group(s
        if (!alloc_rt_sched_group(tg, parent))
                goto err;
  
 +      alloc_uclamp_sched_group(tg, parent);
 +
        return tg;
  
  err:
@@@ -7062,7 -6853,7 +7062,7 @@@ void sched_move_task(struct task_struc
        if (queued)
                enqueue_task(rq, tsk, queue_flags);
        if (running)
 -              set_curr_task(rq, tsk);
 +              set_next_task(rq, tsk);
  
        task_rq_unlock(rq, tsk, &rf);
  }
@@@ -7145,6 -6936,10 +7145,6 @@@ static int cpu_cgroup_can_attach(struc
  #ifdef CONFIG_RT_GROUP_SCHED
                if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
 -#else
 -              /* We don't support RT-tasks being in separate groups */
 -              if (task->sched_class != &fair_sched_class)
 -                      return -EINVAL;
  #endif
                /*
                 * Serialize against wake_up_new_task() such that if its
@@@ -7175,178 -6970,6 +7175,178 @@@ static void cpu_cgroup_attach(struct cg
                sched_move_task(task);
  }
  
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +static void cpu_util_update_eff(struct cgroup_subsys_state *css)
 +{
 +      struct cgroup_subsys_state *top_css = css;
 +      struct uclamp_se *uc_parent = NULL;
 +      struct uclamp_se *uc_se = NULL;
 +      unsigned int eff[UCLAMP_CNT];
 +      enum uclamp_id clamp_id;
 +      unsigned int clamps;
 +
 +      css_for_each_descendant_pre(css, top_css) {
 +              uc_parent = css_tg(css)->parent
 +                      ? css_tg(css)->parent->uclamp : NULL;
 +
 +              for_each_clamp_id(clamp_id) {
 +                      /* Assume effective clamps matches requested clamps */
 +                      eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
 +                      /* Cap effective clamps with parent's effective clamps */
 +                      if (uc_parent &&
 +                          eff[clamp_id] > uc_parent[clamp_id].value) {
 +                              eff[clamp_id] = uc_parent[clamp_id].value;
 +                      }
 +              }
 +              /* Ensure protection is always capped by limit */
 +              eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
 +
 +              /* Propagate most restrictive effective clamps */
 +              clamps = 0x0;
 +              uc_se = css_tg(css)->uclamp;
 +              for_each_clamp_id(clamp_id) {
 +                      if (eff[clamp_id] == uc_se[clamp_id].value)
 +                              continue;
 +                      uc_se[clamp_id].value = eff[clamp_id];
 +                      uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
 +                      clamps |= (0x1 << clamp_id);
 +              }
 +              if (!clamps) {
 +                      css = css_rightmost_descendant(css);
 +                      continue;
 +              }
 +
 +              /* Immediately update descendants RUNNABLE tasks */
 +              uclamp_update_active_tasks(css, clamps);
 +      }
 +}
 +
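
cpu_util_update_eff() walks the cgroup subtree top-down and caps each group's requested clamp by its parent's already-computed effective clamp, finally making sure the MIN clamp never exceeds the MAX clamp. The core of that propagation, reduced to one parent/child pair with plain integers (not the kernel data structures):

#include <stdio.h>

enum { UC_MIN, UC_MAX, UC_CNT };

struct group {
	unsigned int req[UC_CNT];	/* requested clamps              */
	unsigned int eff[UC_CNT];	/* effective (propagated) clamps */
};

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* Effective clamp = request capped by the parent's effective clamp. */
static void update_eff(struct group *g, const struct group *parent)
{
	for (int id = 0; id < UC_CNT; id++) {
		g->eff[id] = g->req[id];
		if (parent)
			g->eff[id] = min_u(g->eff[id], parent->eff[id]);
	}
	/* protection (MIN) can never exceed the limit (MAX) */
	g->eff[UC_MIN] = min_u(g->eff[UC_MIN], g->eff[UC_MAX]);
}

int main(void)
{
	struct group root  = { .req = { 1024, 1024 }, .eff = { 1024, 1024 } };
	struct group child = { .req = {  800,  512 } };

	update_eff(&child, &root);
	printf("child eff: min=%u max=%u\n", child.eff[UC_MIN], child.eff[UC_MAX]);
	return 0;	/* prints min=512 max=512 */
}
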
 +/*
 + * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
 + * C expression. Since there is no way to convert a macro argument (N) into a
 + * character constant, use two levels of macros.
 + */
 +#define _POW10(exp) ((unsigned int)1e##exp)
 +#define POW10(exp) _POW10(exp)
 +
 +struct uclamp_request {
 +#define UCLAMP_PERCENT_SHIFT  2
 +#define UCLAMP_PERCENT_SCALE  (100 * POW10(UCLAMP_PERCENT_SHIFT))
 +      s64 percent;
 +      u64 util;
 +      int ret;
 +};
 +
 +static inline struct uclamp_request
 +capacity_from_percent(char *buf)
 +{
 +      struct uclamp_request req = {
 +              .percent = UCLAMP_PERCENT_SCALE,
 +              .util = SCHED_CAPACITY_SCALE,
 +              .ret = 0,
 +      };
 +
 +      buf = strim(buf);
 +      if (strcmp(buf, "max")) {
 +              req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
 +                                           &req.percent);
 +              if (req.ret)
 +                      return req;
 +              if (req.percent > UCLAMP_PERCENT_SCALE) {
 +                      req.ret = -ERANGE;
 +                      return req;
 +              }
 +
 +              req.util = req.percent << SCHED_CAPACITY_SHIFT;
 +              req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
 +      }
 +
 +      return req;
 +}
 +
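
capacity_from_percent() parses a percentage with two fractional digits (so "max" corresponds to 10000 in that fixed-point scale) and converts it into a utilization value out of SCHED_CAPACITY_SCALE (1024) with a rounded division. The arithmetic, detached from the cgroup plumbing:

#include <stdint.h>
#include <stdio.h>

#define PERCENT_SCALE	10000U			/* 100.00% with two fractional digits */
#define CAPACITY_SHIFT	10
#define CAPACITY_SCALE	(1U << CAPACITY_SHIFT)	/* 1024 */

/* percent is in hundredths of a percent, e.g. 5000 == "50.00" */
static unsigned int percent_to_util(uint64_t percent)
{
	uint64_t util = percent << CAPACITY_SHIFT;

	/* rounded division, like DIV_ROUND_CLOSEST_ULL() */
	return (unsigned int)((util + PERCENT_SCALE / 2) / PERCENT_SCALE);
}

int main(void)
{
	printf("50.00%%  -> %u\n", percent_to_util(5000));	/* 512  */
	printf("12.50%%  -> %u\n", percent_to_util(1250));	/* 128  */
	printf("100.00%% -> %u\n", percent_to_util(10000));	/* 1024 */
	return 0;
}
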
 +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
 +                              size_t nbytes, loff_t off,
 +                              enum uclamp_id clamp_id)
 +{
 +      struct uclamp_request req;
 +      struct task_group *tg;
 +
 +      req = capacity_from_percent(buf);
 +      if (req.ret)
 +              return req.ret;
 +
 +      mutex_lock(&uclamp_mutex);
 +      rcu_read_lock();
 +
 +      tg = css_tg(of_css(of));
 +      if (tg->uclamp_req[clamp_id].value != req.util)
 +              uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
 +
 +      /*
 +       * Because the conversion rounding is not recoverable, we keep track
 +       * of the exact requested value.
 +       */
 +      tg->uclamp_pct[clamp_id] = req.percent;
 +
 +      /* Update effective clamps to track the most restrictive value */
 +      cpu_util_update_eff(of_css(of));
 +
 +      rcu_read_unlock();
 +      mutex_unlock(&uclamp_mutex);
 +
 +      return nbytes;
 +}
 +
 +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
 +                                  char *buf, size_t nbytes,
 +                                  loff_t off)
 +{
 +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
 +}
 +
 +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
 +                                  char *buf, size_t nbytes,
 +                                  loff_t off)
 +{
 +      return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
 +}
 +
 +static inline void cpu_uclamp_print(struct seq_file *sf,
 +                                  enum uclamp_id clamp_id)
 +{
 +      struct task_group *tg;
 +      u64 util_clamp;
 +      u64 percent;
 +      u32 rem;
 +
 +      rcu_read_lock();
 +      tg = css_tg(seq_css(sf));
 +      util_clamp = tg->uclamp_req[clamp_id].value;
 +      rcu_read_unlock();
 +
 +      if (util_clamp == SCHED_CAPACITY_SCALE) {
 +              seq_puts(sf, "max\n");
 +              return;
 +      }
 +
 +      percent = tg->uclamp_pct[clamp_id];
 +      percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
 +      seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
 +}
 +
 +static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
 +{
 +      cpu_uclamp_print(sf, UCLAMP_MIN);
 +      return 0;
 +}
 +
 +static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
 +{
 +      cpu_uclamp_print(sf, UCLAMP_MAX);
 +      return 0;
 +}
 +#endif /* CONFIG_UCLAMP_TASK_GROUP */
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
@@@ -7691,20 -7314,6 +7691,20 @@@ static struct cftype cpu_legacy_files[
                .read_u64 = cpu_rt_period_read_uint,
                .write_u64 = cpu_rt_period_write_uint,
        },
 +#endif
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      {
 +              .name = "uclamp.min",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_min_show,
 +              .write = cpu_uclamp_min_write,
 +      },
 +      {
 +              .name = "uclamp.max",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_max_show,
 +              .write = cpu_uclamp_max_write,
 +      },
  #endif
        { }     /* Terminate */
  };
@@@ -7872,20 -7481,6 +7872,20 @@@ static struct cftype cpu_files[] = 
                .seq_show = cpu_max_show,
                .write = cpu_max_write,
        },
 +#endif
 +#ifdef CONFIG_UCLAMP_TASK_GROUP
 +      {
 +              .name = "uclamp.min",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_min_show,
 +              .write = cpu_uclamp_min_write,
 +      },
 +      {
 +              .name = "uclamp.max",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .seq_show = cpu_uclamp_max_show,
 +              .write = cpu_uclamp_max_write,
 +      },
  #endif
        { }     /* terminate */
  };
diff --combined kernel/sched/deadline.c
index 39dc9f74f2898f13b56837f8073f49043275a5d2,83a663a34196b9a8288414223c3b7c241459be82..2dc48720f18914a9f828f5cd653b751a045c4556
@@@ -287,7 -287,7 +287,7 @@@ static void task_non_contending(struct 
  
        dl_se->dl_non_contending = 1;
        get_task_struct(p);
-       hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+       hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
  }
  
  static void task_contending(struct sched_dl_entity *dl_se, int flags)
@@@ -529,7 -529,6 +529,7 @@@ static struct rq *find_lock_later_rq(st
  static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
  {
        struct rq *later_rq = NULL;
 +      struct dl_bw *dl_b;
  
        later_rq = find_lock_later_rq(p, rq);
        if (!later_rq) {
                double_lock_balance(rq, later_rq);
        }
  
 +      if (p->dl.dl_non_contending || p->dl.dl_throttled) {
 +              /*
 +               * Inactive timer is armed (or callback is running, but
 +               * waiting for us to release rq locks). In any case, when it
 +               * will fire (or continue), it will see running_bw of this
 +               * task migrated to later_rq (and correctly handle it).
 +               */
 +              sub_running_bw(&p->dl, &rq->dl);
 +              sub_rq_bw(&p->dl, &rq->dl);
 +
 +              add_rq_bw(&p->dl, &later_rq->dl);
 +              add_running_bw(&p->dl, &later_rq->dl);
 +      } else {
 +              sub_rq_bw(&p->dl, &rq->dl);
 +              add_rq_bw(&p->dl, &later_rq->dl);
 +      }
 +
 +      /*
 +       * And we finally need to fixup root_domain(s) bandwidth accounting,
 +       * since p is still hanging out in the old (now moved to default) root
 +       * domain.
 +       */
 +      dl_b = &rq->rd->dl_bw;
 +      raw_spin_lock(&dl_b->lock);
 +      __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
 +      raw_spin_unlock(&dl_b->lock);
 +
 +      dl_b = &later_rq->rd->dl_bw;
 +      raw_spin_lock(&dl_b->lock);
 +      __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
 +      raw_spin_unlock(&dl_b->lock);
 +
        set_task_cpu(p, later_rq->cpu);
        double_unlock_balance(later_rq, rq);
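
When a deadline task is migrated off a dying CPU, its admitted bandwidth has to be removed from the old root domain's dl_bw and added to the new one, each side under its own lock. That bookkeeping, boiled down to two locked accumulators with invented names rather than the kernel structures:

#include <pthread.h>
#include <stdio.h>

struct dl_pool {
	pthread_mutex_t lock;
	long long total_bw;		/* sum of admitted DL bandwidth */
};

static void pool_sub(struct dl_pool *p, long long bw)
{
	pthread_mutex_lock(&p->lock);
	p->total_bw -= bw;
	pthread_mutex_unlock(&p->lock);
}

static void pool_add(struct dl_pool *p, long long bw)
{
	pthread_mutex_lock(&p->lock);
	p->total_bw += bw;
	pthread_mutex_unlock(&p->lock);
}

/* Move one task's bandwidth from the old domain to the new one. */
static void migrate_bw(struct dl_pool *from, struct dl_pool *to, long long bw)
{
	pool_sub(from, bw);
	pool_add(to, bw);
}

int main(void)
{
	struct dl_pool old_rd = { PTHREAD_MUTEX_INITIALIZER, 300 };
	struct dl_pool new_rd = { PTHREAD_MUTEX_INITIALIZER, 100 };

	migrate_bw(&old_rd, &new_rd, 50);
	printf("old=%lld new=%lld\n", old_rd.total_bw, new_rd.total_bw);
	return 0;
}
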
  
@@@ -956,7 -923,7 +956,7 @@@ static int start_dl_timer(struct task_s
         */
        if (!hrtimer_is_queued(timer)) {
                get_task_struct(p);
-               hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+               hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
        }
  
        return 1;
@@@ -1086,7 -1053,7 +1086,7 @@@ void init_dl_task_timer(struct sched_dl
  {
        struct hrtimer *timer = &dl_se->dl_timer;
  
-       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        timer->function = dl_task_timer;
  }
  
@@@ -1325,7 -1292,7 +1325,7 @@@ void init_dl_inactive_task_timer(struc
  {
        struct hrtimer *timer = &dl_se->inactive_timer;
  
-       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        timer->function = inactive_task_timer;
  }
  
@@@ -1727,20 -1694,12 +1727,20 @@@ static void start_hrtick_dl(struct rq *
  }
  #endif
  
 -static inline void set_next_task(struct rq *rq, struct task_struct *p)
 +static void set_next_task_dl(struct rq *rq, struct task_struct *p)
  {
        p->se.exec_start = rq_clock_task(rq);
  
        /* You can't push away the running task */
        dequeue_pushable_dl_task(rq, p);
 +
 +      if (hrtick_enabled(rq))
 +              start_hrtick_dl(rq, p);
 +
 +      if (rq->curr->sched_class != &dl_sched_class)
 +              update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 +
 +      deadline_queue_push_tasks(rq);
  }
  
  static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@@ -1761,42 -1720,64 +1761,42 @@@ pick_next_task_dl(struct rq *rq, struc
        struct task_struct *p;
        struct dl_rq *dl_rq;
  
 -      dl_rq = &rq->dl;
 -
 -      if (need_pull_dl_task(rq, prev)) {
 -              /*
 -               * This is OK, because current is on_cpu, which avoids it being
 -               * picked for load-balance and preemption/IRQs are still
 -               * disabled avoiding further scheduler activity on it and we're
 -               * being very careful to re-start the picking loop.
 -               */
 -              rq_unpin_lock(rq, rf);
 -              pull_dl_task(rq);
 -              rq_repin_lock(rq, rf);
 -              /*
 -               * pull_dl_task() can drop (and re-acquire) rq->lock; this
 -               * means a stop task can slip in, in which case we need to
 -               * re-start task selection.
 -               */
 -              if (rq->stop && task_on_rq_queued(rq->stop))
 -                      return RETRY_TASK;
 -      }
 +      WARN_ON_ONCE(prev || rf);
  
 -      /*
 -       * When prev is DL, we may throttle it in put_prev_task().
 -       * So, we update time before we check for dl_nr_running.
 -       */
 -      if (prev->sched_class == &dl_sched_class)
 -              update_curr_dl(rq);
 +      dl_rq = &rq->dl;
  
        if (unlikely(!dl_rq->dl_nr_running))
                return NULL;
  
 -      put_prev_task(rq, prev);
 -
        dl_se = pick_next_dl_entity(rq, dl_rq);
        BUG_ON(!dl_se);
  
        p = dl_task_of(dl_se);
  
 -      set_next_task(rq, p);
 -
 -      if (hrtick_enabled(rq))
 -              start_hrtick_dl(rq, p);
 -
 -      deadline_queue_push_tasks(rq);
 -
 -      if (rq->curr->sched_class != &dl_sched_class)
 -              update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 +      set_next_task_dl(rq, p);
  
        return p;
  }
  
 -static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  {
        update_curr_dl(rq);
  
        update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
        if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
                enqueue_pushable_dl_task(rq, p);
 +
 +      if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
 +              /*
 +               * This is OK, because current is on_cpu, which avoids it being
 +               * picked for load-balance and preemption/IRQs are still
 +               * disabled avoiding further scheduler activity on it and we've
 +               * not yet started the picking loop.
 +               */
 +              rq_unpin_lock(rq, rf);
 +              pull_dl_task(rq);
 +              rq_repin_lock(rq, rf);
 +      }
  }
  
  /*
@@@ -1830,6 -1811,11 +1830,6 @@@ static void task_fork_dl(struct task_st
         */
  }
  
 -static void set_curr_task_dl(struct rq *rq)
 -{
 -      set_next_task(rq, rq->curr);
 -}
 -
  #ifdef CONFIG_SMP
  
  /* Only try algorithms three times */
@@@ -2102,13 -2088,17 +2102,13 @@@ retry
        }
  
        deactivate_task(rq, next_task, 0);
 -      sub_running_bw(&next_task->dl, &rq->dl);
 -      sub_rq_bw(&next_task->dl, &rq->dl);
        set_task_cpu(next_task, later_rq->cpu);
 -      add_rq_bw(&next_task->dl, &later_rq->dl);
  
        /*
         * Update the later_rq clock here, because the clock is used
         * by the cpufreq_update_util() inside __add_running_bw().
         */
        update_rq_clock(later_rq);
 -      add_running_bw(&next_task->dl, &later_rq->dl);
        activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
        ret = 1;
  
@@@ -2196,7 -2186,11 +2196,7 @@@ static void pull_dl_task(struct rq *thi
                        resched = true;
  
                        deactivate_task(src_rq, p, 0);
 -                      sub_running_bw(&p->dl, &src_rq->dl);
 -                      sub_rq_bw(&p->dl, &src_rq->dl);
                        set_task_cpu(p, this_cpu);
 -                      add_rq_bw(&p->dl, &this_rq->dl);
 -                      add_running_bw(&p->dl, &this_rq->dl);
                        activate_task(this_rq, p, 0);
                        dmin = p->dl.deadline;
  
@@@ -2289,36 -2283,6 +2289,36 @@@ void __init init_sched_dl_class(void
                                        GFP_KERNEL, cpu_to_node(i));
  }
  
 +void dl_add_task_root_domain(struct task_struct *p)
 +{
 +      struct rq_flags rf;
 +      struct rq *rq;
 +      struct dl_bw *dl_b;
 +
 +      rq = task_rq_lock(p, &rf);
 +      if (!dl_task(p))
 +              goto unlock;
 +
 +      dl_b = &rq->rd->dl_bw;
 +      raw_spin_lock(&dl_b->lock);
 +
 +      __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
 +
 +      raw_spin_unlock(&dl_b->lock);
 +
 +unlock:
 +      task_rq_unlock(rq, p, &rf);
 +}
 +
 +void dl_clear_root_domain(struct root_domain *rd)
 +{
 +      unsigned long flags;
 +
 +      raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
 +      rd->dl_bw.total_bw = 0;
 +      raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
 +}
 +
  #endif /* CONFIG_SMP */
  
  static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@@ -2439,7 -2403,6 +2439,7 @@@ const struct sched_class dl_sched_clas
  
        .pick_next_task         = pick_next_task_dl,
        .put_prev_task          = put_prev_task_dl,
 +      .set_next_task          = set_next_task_dl,
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_dl,
        .task_woken             = task_woken_dl,
  #endif
  
 -      .set_curr_task          = set_curr_task_dl,
        .task_tick              = task_tick_dl,
        .task_fork              = task_fork_dl,
  
diff --combined kernel/sched/rt.c
index 858c4cc6f99bccd888b4388c87c217052b33560a,d6678f773c966fe28bd4de1ba166fc091e21e3aa..ebaa4e619684112cc6c19bc6ba69fe15c3f2b52a
@@@ -45,8 -45,8 +45,8 @@@ void init_rt_bandwidth(struct rt_bandwi
  
        raw_spin_lock_init(&rt_b->rt_runtime_lock);
  
-       hrtimer_init(&rt_b->rt_period_timer,
-                       CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+                    HRTIMER_MODE_REL_HARD);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
  }
  
@@@ -67,7 -67,8 +67,8 @@@ static void start_rt_bandwidth(struct r
                 * to update the period.
                 */
                hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
-               hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+               hrtimer_start_expires(&rt_b->rt_period_timer,
+                                     HRTIMER_MODE_ABS_PINNED_HARD);
        }
        raw_spin_unlock(&rt_b->rt_runtime_lock);
  }
@@@ -1498,22 -1499,12 +1499,22 @@@ static void check_preempt_curr_rt(struc
  #endif
  }
  
 -static inline void set_next_task(struct rq *rq, struct task_struct *p)
 +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
  {
        p->se.exec_start = rq_clock_task(rq);
  
        /* The running task is never eligible for pushing */
        dequeue_pushable_task(rq, p);
 +
 +      /*
 +       * If prev task was rt, put_prev_task() has already updated the
 +       * utilization. We only care of the case where we start to schedule a
 +       * rt task
 +       */
 +      if (rq->curr->sched_class != &rt_sched_class)
 +              update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 +
 +      rt_queue_push_tasks(rq);
  }
  
  static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@@ -1553,19 -1544,56 +1554,19 @@@ pick_next_task_rt(struct rq *rq, struc
        struct task_struct *p;
        struct rt_rq *rt_rq = &rq->rt;
  
 -      if (need_pull_rt_task(rq, prev)) {
 -              /*
 -               * This is OK, because current is on_cpu, which avoids it being
 -               * picked for load-balance and preemption/IRQs are still
 -               * disabled avoiding further scheduler activity on it and we're
 -               * being very careful to re-start the picking loop.
 -               */
 -              rq_unpin_lock(rq, rf);
 -              pull_rt_task(rq);
 -              rq_repin_lock(rq, rf);
 -              /*
 -               * pull_rt_task() can drop (and re-acquire) rq->lock; this
 -               * means a dl or stop task can slip in, in which case we need
 -               * to re-start task selection.
 -               */
 -              if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
 -                           rq->dl.dl_nr_running))
 -                      return RETRY_TASK;
 -      }
 -
 -      /*
 -       * We may dequeue prev's rt_rq in put_prev_task().
 -       * So, we update time before rt_queued check.
 -       */
 -      if (prev->sched_class == &rt_sched_class)
 -              update_curr_rt(rq);
 +      WARN_ON_ONCE(prev || rf);
  
        if (!rt_rq->rt_queued)
                return NULL;
  
 -      put_prev_task(rq, prev);
 -
        p = _pick_next_task_rt(rq);
  
 -      set_next_task(rq, p);
 -
 -      rt_queue_push_tasks(rq);
 -
 -      /*
 -       * If prev task was rt, put_prev_task() has already updated the
 -       * utilization. We only care of the case where we start to schedule a
 -       * rt task
 -       */
 -      if (rq->curr->sched_class != &rt_sched_class)
 -              update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 +      set_next_task_rt(rq, p);
  
        return p;
  }
  
 -static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  {
        update_curr_rt(rq);
  
         */
        if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
 +
 +      if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
 +              /*
 +               * This is OK, because current is on_cpu, which avoids it being
 +               * picked for load-balance and preemption/IRQs are still
 +               * disabled avoiding further scheduler activity on it and we've
 +               * not yet started the picking loop.
 +               */
 +              rq_unpin_lock(rq, rf);
 +              pull_rt_task(rq);
 +              rq_repin_lock(rq, rf);
 +      }
  }
  
  #ifdef CONFIG_SMP
@@@ -2289,8 -2305,10 +2290,10 @@@ static void watchdog(struct rq *rq, str
                }
  
                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
-               if (p->rt.timeout > next)
-                       p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+               if (p->rt.timeout > next) {
+                       posix_cputimers_rt_watchdog(&p->posix_cputimers,
+                                                   p->se.sum_exec_runtime);
+               }
        }
  }
  #else
@@@ -2339,6 -2357,11 +2342,6 @@@ static void task_tick_rt(struct rq *rq
        }
  }
  
 -static void set_curr_task_rt(struct rq *rq)
 -{
 -      set_next_task(rq, rq->curr);
 -}
 -
  static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
  {
        /*
@@@ -2360,7 -2383,6 +2363,7 @@@ const struct sched_class rt_sched_clas
  
        .pick_next_task         = pick_next_task_rt,
        .put_prev_task          = put_prev_task_rt,
 +      .set_next_task          = set_next_task_rt,
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_rt,
        .switched_from          = switched_from_rt,
  #endif
  
 -      .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,
  
        .get_rr_interval        = get_rr_interval_rt,
diff --combined kernel/sys.c
index d605fe5e58a5805410ec74d1e9bffaddff0dd169,2462aa84247fc9afea9b56c25331d84c1953bc1e..a611d1d58c7d00525247edff32211cc9586f2c9e
  #ifndef SET_TSC_CTL
  # define SET_TSC_CTL(a)               (-EINVAL)
  #endif
 -#ifndef MPX_ENABLE_MANAGEMENT
 -# define MPX_ENABLE_MANAGEMENT()      (-EINVAL)
 -#endif
 -#ifndef MPX_DISABLE_MANAGEMENT
 -# define MPX_DISABLE_MANAGEMENT()     (-EINVAL)
 -#endif
  #ifndef GET_FP_MODE
  # define GET_FP_MODE(a)               (-EINVAL)
  #endif
  #ifndef PAC_RESET_KEYS
  # define PAC_RESET_KEYS(a, b) (-EINVAL)
  #endif
 +#ifndef SET_TAGGED_ADDR_CTRL
 +# define SET_TAGGED_ADDR_CTRL(a)      (-EINVAL)
 +#endif
 +#ifndef GET_TAGGED_ADDR_CTRL
 +# define GET_TAGGED_ADDR_CTRL()               (-EINVAL)
 +#endif
  
  /*
   * this is where the system-wide overflow UID and GID are defined, for
@@@ -1557,15 -1557,6 +1557,6 @@@ int do_prlimit(struct task_struct *tsk
                        retval = -EPERM;
                if (!retval)
                        retval = security_task_setrlimit(tsk, resource, new_rlim);
-               if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
-                       /*
-                        * The caller is asking for an immediate RLIMIT_CPU
-                        * expiry.  But we use the zero value to mean "it was
-                        * never set".  So let's cheat and make it one second
-                        * instead
-                        */
-                       new_rlim->rlim_cur = 1;
-               }
        }
        if (!retval) {
                if (old_rlim)
        task_unlock(tsk->group_leader);
  
        /*
-        * RLIMIT_CPU handling.   Note that the kernel fails to return an error
-        * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a
-        * very long-standing error, and fixing it now risks breakage of
-        * applications, so we live with it
+        * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
 +        * infinite. In case of RLIM_INFINITY the posix CPU timer code
+        * ignores the rlimit.
         */
         if (!retval && new_rlim && resource == RLIMIT_CPU &&
             new_rlim->rlim_cur != RLIM_INFINITY &&
@@@ -2456,9 -2446,15 +2446,9 @@@ SYSCALL_DEFINE5(prctl, int, option, uns
                up_write(&me->mm->mmap_sem);
                break;
        case PR_MPX_ENABLE_MANAGEMENT:
 -              if (arg2 || arg3 || arg4 || arg5)
 -                      return -EINVAL;
 -              error = MPX_ENABLE_MANAGEMENT();
 -              break;
        case PR_MPX_DISABLE_MANAGEMENT:
 -              if (arg2 || arg3 || arg4 || arg5)
 -                      return -EINVAL;
 -              error = MPX_DISABLE_MANAGEMENT();
 -              break;
 +              /* No longer implemented: */
 +              return -EINVAL;
        case PR_SET_FP_MODE:
                error = SET_FP_MODE(me, arg2);
                break;
                        return -EINVAL;
                error = PAC_RESET_KEYS(me, arg2);
                break;
 +      case PR_SET_TAGGED_ADDR_CTRL:
 +              if (arg3 || arg4 || arg5)
 +                      return -EINVAL;
 +              error = SET_TAGGED_ADDR_CTRL(arg2);
 +              break;
 +      case PR_GET_TAGGED_ADDR_CTRL:
 +              if (arg2 || arg3 || arg4 || arg5)
 +                      return -EINVAL;
 +              error = GET_TAGGED_ADDR_CTRL();
 +              break;
        default:
                error = -EINVAL;
                break;
diff --combined kernel/time/alarmtimer.c
index b7d75a9e8ccf17c7b616b649e8d76bd5522561b4,ec32876e284daf300f18f32d0d6bf5ab276b2d71..271ce6c12907860bc2db9ae94fab4e70dbedbbfb
@@@ -432,7 -432,7 +432,7 @@@ int alarm_cancel(struct alarm *alarm
                int ret = alarm_try_to_cancel(alarm);
                if (ret >= 0)
                        return ret;
-               cpu_relax();
+               hrtimer_cancel_wait_running(&alarm->timer);
        }
  }
  EXPORT_SYMBOL_GPL(alarm_cancel);
@@@ -605,6 -605,19 +605,19 @@@ static int alarm_timer_try_to_cancel(st
        return alarm_try_to_cancel(&timr->it.alarm.alarmtimer);
  }
  
+ /**
+  * alarm_timer_wait_running - Posix timer callback to wait for a timer
+  * @timr:     Pointer to the posixtimer data struct
+  *
+  * Called from the core code when timer cancel detected that the callback
+  * is running. @timr is unlocked and rcu read lock is held to prevent it
+  * from being freed.
+  */
+ static void alarm_timer_wait_running(struct k_itimer *timr)
+ {
+       hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer);
+ }
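
alarm_cancel() used to spin with cpu_relax() while the timer callback was running; on PREEMPT_RT that can live-lock against a lower-priority softirq thread, so it now blocks in hrtimer_cancel_wait_running() until the callback finishes. The retry shape, modelled independently of any timer API (try_cancel() and wait_until_idle() are placeholders, not kernel functions):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static bool callback_running;		/* set/cleared by the expiry side */

/* Returns true when the timer can be removed, false while its callback runs. */
static bool try_cancel(void)
{
	pthread_mutex_lock(&lock);
	bool ok = !callback_running;
	pthread_mutex_unlock(&lock);
	return ok;
}

/* Block until the expiry side signals completion, instead of spinning. */
static void wait_until_idle(void)
{
	pthread_mutex_lock(&lock);
	while (callback_running)
		pthread_cond_wait(&idle, &lock);
	pthread_mutex_unlock(&lock);
}

static void cancel_sync(void)
{
	while (!try_cancel())
		wait_until_idle();	/* was: cpu_relax() busy loop */
}

int main(void)
{
	cancel_sync();			/* nothing running: returns at once */
	puts("cancelled");
	return 0;
}
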
  /**
   * alarm_timer_arm - Posix timer callback to arm a timer
   * @timr:     Pointer to the posixtimer data struct
@@@ -672,7 -685,7 +685,7 @@@ static int alarm_timer_create(struct k_
        enum  alarmtimer_type type;
  
        if (!alarmtimer_get_rtcdev())
 -              return -ENOTSUPP;
 +              return -EOPNOTSUPP;
  
        if (!capable(CAP_WAKE_ALARM))
                return -EPERM;
@@@ -790,7 -803,7 +803,7 @@@ static int alarm_timer_nsleep(const clo
        int ret = 0;
  
        if (!alarmtimer_get_rtcdev())
 -              return -ENOTSUPP;
 +              return -EOPNOTSUPP;
  
        if (flags & ~TIMER_ABSTIME)
                return -EINVAL;
@@@ -834,6 -847,7 +847,7 @@@ const struct k_clock alarm_clock = 
        .timer_forward          = alarm_timer_forward,
        .timer_remaining        = alarm_timer_remaining,
        .timer_try_to_cancel    = alarm_timer_try_to_cancel,
+       .timer_wait_running     = alarm_timer_wait_running,
        .nsleep                 = alarm_timer_nsleep,
  };
  #endif /* CONFIG_POSIX_TIMERS */