Merge tag 'kthread-cleanups-for-v5.19' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <[email protected]>

Fri, 3 Jun 2022 23:03:05 +0000 (16:03 -0700)

committer Linus Torvalds <[email protected]>

Fri, 3 Jun 2022 23:03:05 +0000 (16:03 -0700)
author Linus Torvalds <[email protected]>
Fri, 3 Jun 2022 23:03:05 +0000 (16:03 -0700)
committer Linus Torvalds <[email protected]>
Fri, 3 Jun 2022 23:03:05 +0000 (16:03 -0700)
diff --combined arch/arm64/kernel/process.c

index 2f42123e059ffc179e745beda02e9c42dba58ed7,d0ef05c661b03038f6a6e72c893f958408dc8d01..92bcc1768f0b997b761771ba2dad659a0fe16953
--- 1/arch/arm64/kernel/process.c
--- 2/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@@ -111,7 -111,8 +111,7 @@@ void machine_power_off(void
   {
         local_irq_disable();
         smp_send_stop();
- -      if (pm_power_off)
- -              pm_power_off();
+ +      do_kernel_power_off();
   }
   
   /*
@@@ -249,8 -250,6 +249,8 @@@ void show_regs(struct pt_regs *regs
   static void tls_thread_flush(void)
   {
         write_sysreg(0, tpidr_el0);
+ +      if (system_supports_tpidr2())
+ +              write_sysreg_s(0, SYS_TPIDR2_EL0);
   
         if (is_compat_task()) {
                 current->thread.uw.tp_value = 0;
@@@ -299,42 -298,16 +299,42 @@@ int arch_dup_task_struct(struct task_st
   
         /*
          * Detach src's sve_state (if any) from dst so that it does not
- -       * get erroneously used or freed prematurely.  dst's sve_state
+ +       * get erroneously used or freed prematurely.  dst's copies
          * will be allocated on demand later on if dst uses SVE.
          * For consistency, also clear TIF_SVE here: this could be done
          * later in copy_process(), but to avoid tripping up future
- -       * maintainers it is best not to leave TIF_SVE and sve_state in
+ +       * maintainers it is best not to leave TIF flags and buffers in
          * an inconsistent state, even temporarily.
          */
         dst->thread.sve_state = NULL;
         clear_tsk_thread_flag(dst, TIF_SVE);
   
+ +      /*
+ +       * In the unlikely event that we create a new thread with ZA
+ +       * enabled we should retain the ZA state so duplicate it here.
+ +       * This may be shortly freed if we exec() or if CLONE_SETTLS
+ +       * but it's simpler to do it here. To avoid confusing the rest
+ +       * of the code ensure that we have a sve_state allocated
+ +       * whenever za_state is allocated.
+ +       */
+ +      if (thread_za_enabled(&src->thread)) {
+ +              dst->thread.sve_state = kzalloc(sve_state_size(src),
+ +                                              GFP_KERNEL);
+ +              if (!dst->thread.sve_state)
+ +                      return -ENOMEM;
+ +              dst->thread.za_state = kmemdup(src->thread.za_state,
+ +                                             za_state_size(src),
+ +                                             GFP_KERNEL);
+ +              if (!dst->thread.za_state) {
+ +                      kfree(dst->thread.sve_state);
+ +                      dst->thread.sve_state = NULL;
+ +                      return -ENOMEM;
+ +              }
+ +      } else {
+ +              dst->thread.za_state = NULL;
+ +              clear_tsk_thread_flag(dst, TIF_SME);
+ +      }
+ +
         /* clear any pending asynchronous tag fault raised by the parent */
         clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT);
   
@@@ -343,9 -316,11 +343,11 @@@
   
   asmlinkage void ret_from_fork(void) asm("ret_from_fork");
   
- int copy_thread(unsigned long clone_flags, unsigned long stack_start,
-               unsigned long stk_sz, struct task_struct *p, unsigned long tls)
+ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long stack_start = args->stack;
+       unsigned long tls = args->tls;
         struct pt_regs *childregs = task_pt_regs(p);
   
         memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
@@@ -361,7 -336,7 +363,7 @@@
   
         ptrauth_thread_init_kernel(p);
   
-       if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) {
+       if (likely(!args->fn)) {
                 *childregs = *current_pt_regs();
                 childregs->regs[0] = 0;
   
@@@ -370,8 -345,6 +372,8 @@@
                  * out-of-sync with the saved value.
                  */
                 *task_user_tls(p) = read_sysreg(tpidr_el0);
+ +              if (system_supports_tpidr2())
+ +                      p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);
   
                 if (stack_start) {
                         if (is_compat_thread(task_thread_info(p)))
@@@ -382,12 -355,10 +384,12 @@@
   
                 /*
                  * If a TLS pointer was passed to clone, use it for the new
- -               * thread.
+ +               * thread.  We also reset TPIDR2 if it's in use.
                  */
- -              if (clone_flags & CLONE_SETTLS)
+ +              if (clone_flags & CLONE_SETTLS) {
                         p->thread.uw.tp_value = tls;
+ +                      p->thread.tpidr2_el0 = 0;
+ +              }
         } else {
                 /*
                  * A kthread has no context to ERET to, so ensure any buggy
@@@ -399,8 -370,8 +401,8 @@@
                 memset(childregs, 0, sizeof(struct pt_regs));
                 childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT;
   
-               p->thread.cpu_context.x19 = stack_start;
-               p->thread.cpu_context.x20 = stk_sz;
+               p->thread.cpu_context.x19 = (unsigned long)args->fn;
+               p->thread.cpu_context.x20 = (unsigned long)args->fn_arg;
         }
         p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
         p->thread.cpu_context.sp = (unsigned long)childregs;
@@@ -418,8 -389,6 +420,8 @@@
   void tls_preserve_current_state(void)
   {
         *task_user_tls(current) = read_sysreg(tpidr_el0);
+ +      if (system_supports_tpidr2() && !is_compat_task())
+ +              current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);
   }
   
   static void tls_thread_switch(struct task_struct *next)
@@@ -432,8 -401,6 +434,8 @@@
                 write_sysreg(0, tpidrro_el0);
   
         write_sysreg(*task_user_tls(next), tpidr_el0);
+ +      if (system_supports_tpidr2())
+ +              write_sysreg_s(next->thread.tpidr2_el0, SYS_TPIDR2_EL0);
   }
   
   /*
diff --combined arch/csky/kernel/process.c

index 5de04707aa07bc4ddf9ee17e59bf679dc5721bae,9af49aea1c3bbbc4b81a309082890badc3099088..eedddb1556696f04ca2cc0bb3f4bed27d0155f84
--- 1/arch/csky/kernel/process.c
--- 2/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@@ -2,6 -2,7 +2,6 @@@
   // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
   
   #include <linux/module.h>
- -#include <linux/version.h>
   #include <linux/sched.h>
   #include <linux/sched/task_stack.h>
   #include <linux/sched/debug.h>
@@@ -29,12 -30,11 +29,11 @@@ asmlinkage void ret_from_kernel_thread(
    */
   void flush_thread(void){}
   
- int copy_thread(unsigned long clone_flags,
-               unsigned long usp,
-               unsigned long kthread_arg,
-               struct task_struct *p,
-               unsigned long tls)
+ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long usp = args->stack;
+       unsigned long tls = args->tls;
         struct switch_stack *childstack;
         struct pt_regs *childregs = task_pt_regs(p);
   
@@@ -48,11 -48,11 +47,11 @@@
         /* setup thread.sp for switch_to !!! */
         p->thread.sp = (unsigned long)childstack;
   
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (unlikely(args->fn)) {
                 memset(childregs, 0, sizeof(struct pt_regs));
                 childstack->r15 = (unsigned long) ret_from_kernel_thread;
-               childstack->r10 = kthread_arg;
-               childstack->r9 = usp;
+               childstack->r10 = (unsigned long) args->fn_arg;
+               childstack->r9 = (unsigned long) args->fn;
                 childregs->sr = mfcr("psr");
         } else {
                 *childregs = *(current_pt_regs());
diff --combined arch/ia64/kernel/process.c

index 89025e3b3f61cf0594cd1249dd25108d92c16ffd,167b1765bea15e501a8fe4a545b2aa087360de87..416305e550e281831400caddb7db289b880d97e4
--- 1/arch/ia64/kernel/process.c
--- 2/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@@ -19,7 -19,6 +19,7 @@@
   #include <linux/module.h>
   #include <linux/notifier.h>
   #include <linux/personality.h>
+ +#include <linux/reboot.h>
   #include <linux/sched.h>
   #include <linux/sched/debug.h>
   #include <linux/sched/hotplug.h>
@@@ -296,9 -295,12 +296,12 @@@ ia64_load_extra (struct task_struct *ta
    * so there is nothing to worry about.
    */
   int
- copy_thread(unsigned long clone_flags, unsigned long user_stack_base,
-           unsigned long user_stack_size, struct task_struct *p, unsigned long tls)
+ copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long user_stack_base = args->stack;
+       unsigned long user_stack_size = args->stack_size;
+       unsigned long tls = args->tls;
         extern char ia64_ret_from_clone;
         struct switch_stack *child_stack, *stack;
         unsigned long rbs, child_rbs, rbs_size;
@@@ -339,14 -341,14 +342,14 @@@
   
         ia64_drop_fpu(p);       /* don't pick up stale state from a CPU's fph */
   
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
-               if (unlikely(!user_stack_base)) {
+       if (unlikely(args->fn)) {
+               if (unlikely(args->idle)) {
                         /* fork_idle() called us */
                         return 0;
                 }
                 memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack));
-               child_stack->r4 = user_stack_base;      /* payload */
-               child_stack->r5 = user_stack_size;      /* argument */
+               child_stack->r4 = (unsigned long) args->fn;
+               child_stack->r5 = (unsigned long) args->fn_arg;
                 /*
                  * Preserve PSR bits, except for bits 32-34 and 37-45,
                  * which we can't read.
@@@ -600,7 -602,8 +603,7 @@@ machine_halt (void
   void
   machine_power_off (void)
   {
- -      if (pm_power_off)
- -              pm_power_off();
+ +      do_kernel_power_off();
         machine_halt();
   }
   
diff --combined arch/m68k/kernel/process.c

index e160a7c57bd3c4db663d9533d584f2c9f3a48e32,221feb0269f130c1848b38f4df553cbcddae3acf..2cb4a61bcfacbdc94f868128cc6ba695773aa681
--- 1/arch/m68k/kernel/process.c
--- 2/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@@ -67,11 -67,12 +67,11 @@@ void machine_halt(void
   
   void machine_power_off(void)
   {
- -      if (mach_power_off)
- -              mach_power_off();
+ +      do_kernel_power_off();
         for (;;);
   }
   
- -void (*pm_power_off)(void) = machine_power_off;
+ +void (*pm_power_off)(void);
   EXPORT_SYMBOL(pm_power_off);
   
   void show_regs(struct pt_regs * regs)
@@@ -137,9 -138,11 +137,11 @@@ asmlinkage int m68k_clone3(struct pt_re
         return sys_clone3((struct clone_args __user *)regs->d1, regs->d2);
   }
   
- int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
-               struct task_struct *p, unsigned long tls)
+ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long usp = args->stack;
+       unsigned long tls = args->tls;
         struct fork_frame {
                 struct switch_stack sw;
                 struct pt_regs regs;
@@@ -156,12 -159,12 +158,12 @@@
          */
         p->thread.fc = USER_DATA;
   
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (unlikely(args->fn)) {
                 /* kernel thread */
                 memset(frame, 0, sizeof(struct fork_frame));
                 frame->regs.sr = PS_S;
-               frame->sw.a3 = usp; /* function */
-               frame->sw.d7 = arg;
+               frame->sw.a3 = (unsigned long)args->fn;
+               frame->sw.d7 = (unsigned long)args->fn_arg;
                 frame->sw.retpc = (unsigned long)ret_from_kernel_thread;
                 p->thread.usp = 0;
                 return 0;
diff --combined arch/openrisc/kernel/process.c

index 1d4c0921aafa6fb14a433a079458705c9b6ec7a9,d9697cc9bc4ddb4ab3956013e7ec96c69d50af69..52dc983ddeba30c6e978aa683e256be746da7139
--- 1/arch/openrisc/kernel/process.c
--- 2/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@@ -52,8 -52,6 +52,8 @@@ void machine_restart(char *cmd
   {
         do_kernel_restart(cmd);
   
+ +      __asm__("l.nop 13");
+ +
         /* Give a grace period for failure to restart of 1s */
         mdelay(1000);
   
@@@ -62,16 -60,6 +62,16 @@@
         while (1);
   }
   
+ +/*
+ + * This is used if pm_power_off has not been set by a power management
+ + * driver, in this case we can assume we are on a simulator.  On
+ + * OpenRISC simulators l.nop 1 will trigger the simulator exit.
+ + */
+ +static void default_power_off(void)
+ +{
+ +      __asm__("l.nop 1");
+ +}
+ +
   /*
    * Similar to machine_power_off, but don't shut off power.  Add code
    * here to freeze the system for e.g. post-mortem debug purpose when
@@@ -87,10 -75,7 +87,10 @@@ void machine_halt(void
   void machine_power_off(void)
   {
         printk(KERN_INFO "*** MACHINE POWER OFF ***\n");
- -      __asm__("l.nop 1");
+ +      if (pm_power_off != NULL)
+ +              pm_power_off();
+ +      else
+ +              default_power_off();
   }
   
   /*
@@@ -104,7 -89,7 +104,7 @@@ void arch_cpu_idle(void
                 mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME);
   }
   
- -void (*pm_power_off) (void) = machine_power_off;
+ +void (*pm_power_off)(void) = NULL;
   EXPORT_SYMBOL(pm_power_off);
   
   /*
@@@ -167,9 -152,11 +167,11 @@@ extern asmlinkage void ret_from_fork(vo
    */
   
   int
- copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
-           struct task_struct *p, unsigned long tls)
+ copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long usp = args->stack;
+       unsigned long tls = args->tls;
         struct pt_regs *userregs;
         struct pt_regs *kregs;
         unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
@@@ -187,10 -174,10 +189,10 @@@
         sp -= sizeof(struct pt_regs);
         kregs = (struct pt_regs *)sp;
   
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (unlikely(args->fn)) {
                 memset(kregs, 0, sizeof(struct pt_regs));
-               kregs->gpr[20] = usp; /* fn, kernel thread */
-               kregs->gpr[22] = arg;
+               kregs->gpr[20] = (unsigned long)args->fn;
+               kregs->gpr[22] = (unsigned long)args->fn_arg;
         } else {
                 *userregs = *current_pt_regs();
   
diff --combined arch/parisc/kernel/process.c

index d145184696ea5a2b5c87dfdd4e612d261c3098c8,a6a2a558fc5bb986ee2c7992d591328b8b7fdf34..7c37e09c92da6fe0e6e5cbd3ced4604d55a1024d
--- 1/arch/parisc/kernel/process.c
--- 2/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@@ -26,7 -26,6 +26,7 @@@
   #include <linux/module.h>
   #include <linux/personality.h>
   #include <linux/ptrace.h>
+ +#include <linux/reboot.h>
   #include <linux/sched.h>
   #include <linux/sched/debug.h>
   #include <linux/sched/task.h>
@@@ -117,7 -116,8 +117,7 @@@ void machine_power_off(void
         pdc_chassis_send_status(PDC_CHASSIS_DIRECT_SHUTDOWN);
   
         /* ipmi_poweroff may have been installed. */
- -      if (pm_power_off)
- -              pm_power_off();
+ +      do_kernel_power_off();
                 
         /* It seems we have no way to power the system off via
          * software. The user has to press the button himself. */
@@@ -206,9 -206,11 +206,11 @@@ arch_initcall(parisc_idle_init)
    * Copy architecture-specific thread state
    */
   int
- copy_thread(unsigned long clone_flags, unsigned long usp,
-           unsigned long kthread_arg, struct task_struct *p, unsigned long tls)
+ copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long usp = args->stack;
+       unsigned long tls = args->tls;
         struct pt_regs *cregs = &(p->thread.regs);
         void *stack = task_stack_page(p);
         
@@@ -218,10 -220,10 +220,10 @@@
         extern void * const ret_from_kernel_thread;
         extern void * const child_return;
   
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (unlikely(args->fn)) {
                 /* kernel thread */
                 memset(cregs, 0, sizeof(struct pt_regs));
-               if (!usp) /* idle thread */
+               if (args->idle) /* idle thread */
                         return 0;
                 /* Must exit via ret_from_kernel_thread in order
                  * to call schedule_tail()
@@@ -233,12 -235,12 +235,12 @@@
                  * ret_from_kernel_thread.
                  */
   #ifdef CONFIG_64BIT
-               cregs->gr[27] = ((unsigned long *)usp)[3];
-               cregs->gr[26] = ((unsigned long *)usp)[2];
+               cregs->gr[27] = ((unsigned long *)args->fn)[3];
+               cregs->gr[26] = ((unsigned long *)args->fn)[2];
   #else
-               cregs->gr[26] = usp;
+               cregs->gr[26] = (unsigned long) args->fn;
   #endif
-               cregs->gr[25] = kthread_arg;
+               cregs->gr[25] = (unsigned long) args->fn_arg;
         } else {
                 /* user thread */
                 /* usp must be word aligned.  This also prevents users from
diff --combined arch/powerpc/kernel/process.c

index d00b20c6596671f53db148f184a04ddb7fc6650d,4f367bb689061cfb1684ee805d9d7755e3588e1e..b62046bf3bb88035615f5829c35d531c9579dc37
--- 1/arch/powerpc/kernel/process.c
--- 2/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@@ -34,8 -34,10 +34,8 @@@
   #include <linux/ftrace.h>
   #include <linux/kernel_stat.h>
   #include <linux/personality.h>
- -#include <linux/random.h>
   #include <linux/hw_breakpoint.h>
   #include <linux/uaccess.h>
- -#include <linux/elf-randomize.h>
   #include <linux/pkeys.h>
   #include <linux/seq_buf.h>
   
@@@ -43,6 -45,7 +43,6 @@@
   #include <asm/io.h>
   #include <asm/processor.h>
   #include <asm/mmu.h>
- -#include <asm/prom.h>
   #include <asm/machdep.h>
   #include <asm/time.h>
   #include <asm/runlatch.h>
@@@ -304,7 -307,7 +304,7 @@@ static void __giveup_vsx(struct task_st
         unsigned long msr = tsk->thread.regs->msr;
   
         /*
- -       * We should never be ssetting MSR_VSX without also setting
+ +       * We should never be setting MSR_VSX without also setting
          * MSR_FP and MSR_VEC
          */
         WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC)));
@@@ -642,7 -645,7 +642,7 @@@ static void do_break_handler(struct pt_
                 return;
         }
   
- -      /* Otherwise findout which DAWR caused exception and disable it. */
+ +      /* Otherwise find out which DAWR caused exception and disable it. */
         wp_get_instr_detail(regs, &instr, &type, &size, &ea);
   
         for (i = 0; i < nr_wp_slots(); i++) {
@@@ -1713,10 -1716,11 +1713,11 @@@ static void setup_ksp_vsid(struct task_
   /*
    * Copy architecture-specific thread state
    */
- int copy_thread(unsigned long clone_flags, unsigned long usp,
-               unsigned long kthread_arg, struct task_struct *p,
-               unsigned long tls)
+ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long usp = args->stack;
+       unsigned long tls = args->tls;
         struct pt_regs *childregs, *kregs;
         extern void ret_from_fork(void);
         extern void ret_from_fork_scv(void);
@@@ -1733,18 -1737,18 +1734,18 @@@
         /* Copy registers */
         sp -= sizeof(struct pt_regs);
         childregs = (struct pt_regs *) sp;
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (unlikely(args->fn)) {
                 /* kernel thread */
                 memset(childregs, 0, sizeof(struct pt_regs));
                 childregs->gpr[1] = sp + sizeof(struct pt_regs);
                 /* function */
-               if (usp)
-                       childregs->gpr[14] = ppc_function_entry((void *)usp);
+               if (args->fn)
+                       childregs->gpr[14] = ppc_function_entry((void *)args->fn);
   #ifdef CONFIG_PPC64
                 clear_tsk_thread_flag(p, TIF_32BIT);
                 childregs->softe = IRQS_ENABLED;
   #endif
-               childregs->gpr[15] = kthread_arg;
+               childregs->gpr[15] = (unsigned long)args->fn_arg;
                 p->thread.regs = NULL;  /* no user register state */
                 ti->flags |= _TIF_RESTOREALL;
                 f = ret_from_kernel_thread;
@@@ -2310,3 -2314,42 +2311,3 @@@ unsigned long arch_align_stack(unsigne
                 sp -= get_random_int() & ~PAGE_MASK;
         return sp & ~0xf;
   }
- -
- -static inline unsigned long brk_rnd(void)
- -{
- -        unsigned long rnd = 0;
- -
- -      /* 8MB for 32bit, 1GB for 64bit */
- -      if (is_32bit_task())
- -              rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT)));
- -      else
- -              rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT)));
- -
- -      return rnd << PAGE_SHIFT;
- -}
- -
- -unsigned long arch_randomize_brk(struct mm_struct *mm)
- -{
- -      unsigned long base = mm->brk;
- -      unsigned long ret;
- -
- -#ifdef CONFIG_PPC_BOOK3S_64
- -      /*
- -       * If we are using 1TB segments and we are allowed to randomise
- -       * the heap, we can put it above 1TB so it is backed by a 1TB
- -       * segment. Otherwise the heap will be in the bottom 1TB
- -       * which always uses 256MB segments and this may result in a
- -       * performance penalty.
- -       */
- -      if (!radix_enabled() && !is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T))
- -              base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T);
- -#endif
- -
- -      ret = PAGE_ALIGN(base + brk_rnd());
- -
- -      if (ret < mm->brk)
- -              return mm->brk;
- -
- -      return ret;
- -}
- -
diff --combined arch/riscv/kernel/process.c

index 1c7be865ab317676c497c0fcda5085f921fab1d8,24efabdbc551153fe456a114c170f0e0c3d4406f..ceb9ebab6558cba7c64dae26ea911abec09df6b1
--- 1/arch/riscv/kernel/process.c
--- 2/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@@ -84,34 -84,6 +84,34 @@@ void show_regs(struct pt_regs *regs
                 dump_backtrace(regs, NULL, KERN_DEFAULT);
   }
   
+ +#ifdef CONFIG_COMPAT
+ +static bool compat_mode_supported __read_mostly;
+ +
+ +bool compat_elf_check_arch(Elf32_Ehdr *hdr)
+ +{
+ +      return compat_mode_supported &&
+ +             hdr->e_machine == EM_RISCV &&
+ +             hdr->e_ident[EI_CLASS] == ELFCLASS32;
+ +}
+ +
+ +static int __init compat_mode_detect(void)
+ +{
+ +      unsigned long tmp = csr_read(CSR_STATUS);
+ +
+ +      csr_write(CSR_STATUS, (tmp & ~SR_UXL) | SR_UXL_32);
+ +      compat_mode_supported =
+ +                      (csr_read(CSR_STATUS) & SR_UXL) == SR_UXL_32;
+ +
+ +      csr_write(CSR_STATUS, tmp);
+ +
+ +      pr_info("riscv: ELF compat mode %s",
+ +                      compat_mode_supported ? "supported" : "failed");
+ +
+ +      return 0;
+ +}
+ +early_initcall(compat_mode_detect);
+ +#endif
+ +
   void start_thread(struct pt_regs *regs, unsigned long pc,
         unsigned long sp)
   {
@@@ -126,15 -98,6 +126,15 @@@
         }
         regs->epc = pc;
         regs->sp = sp;
+ +
+ +#ifdef CONFIG_64BIT
+ +      regs->status &= ~SR_UXL;
+ +
+ +      if (is_compat_task())
+ +              regs->status |= SR_UXL_32;
+ +      else
+ +              regs->status |= SR_UXL_64;
+ +#endif
   }
   
   void flush_thread(void)
@@@ -157,13 -120,15 +157,15 @@@ int arch_dup_task_struct(struct task_st
         return 0;
   }
   
- int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
-               struct task_struct *p, unsigned long tls)
+ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long usp = args->stack;
+       unsigned long tls = args->tls;
         struct pt_regs *childregs = task_pt_regs(p);
   
         /* p->thread holds context to be restored by __switch_to() */
-       if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (unlikely(args->fn)) {
                 /* Kernel thread */
                 memset(childregs, 0, sizeof(struct pt_regs));
                 childregs->gp = gp_in_global;
@@@ -171,8 -136,8 +173,8 @@@
                 childregs->status = SR_PP | SR_PIE;
   
                 p->thread.ra = (unsigned long)ret_from_kernel_thread;
-               p->thread.s[0] = usp; /* fn */
-               p->thread.s[1] = arg;
+               p->thread.s[0] = (unsigned long)args->fn;
+               p->thread.s[1] = (unsigned long)args->fn_arg;
         } else {
                 *childregs = *(current_pt_regs());
                 if (usp) /* User fork */
diff --combined arch/x86/kernel/fpu/core.c

index 0fdc807ae13f8e8abb535eadbcaafba4b4d8b747,fbade5a3975bf43cada47720661b499977e2f3cc..0531d6a06df5fc57bd96ddcaf27d9371fed996fb
--- 1/arch/x86/kernel/fpu/core.c
--- 2/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@@ -14,8 -14,6 +14,8 @@@
   #include <asm/traps.h>
   #include <asm/irq_regs.h>
   
+ +#include <uapi/asm/kvm.h>
+ +
   #include <linux/hardirq.h>
   #include <linux/pkeys.h>
   #include <linux/vmalloc.h>
@@@ -43,7 -41,17 +43,7 @@@ struct fpu_state_config fpu_user_cfg __
    */
   struct fpstate init_fpstate __ro_after_init;
   
- -/*
- - * Track whether the kernel is using the FPU state
- - * currently.
- - *
- - * This flag is used:
- - *
- - *   - by IRQ context code to potentially use the FPU
- - *     if it's unused.
- - *
- - *   - to debug kernel_fpu_begin()/end() correctness
- - */
+ +/* Track in-kernel FPU usage */
   static DEFINE_PER_CPU(bool, in_kernel_fpu);
   
   /*
@@@ -51,37 -59,42 +51,37 @@@
    */
   DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
   
- -static bool kernel_fpu_disabled(void)
- -{
- -      return this_cpu_read(in_kernel_fpu);
- -}
- -
- -static bool interrupted_kernel_fpu_idle(void)
- -{
- -      return !kernel_fpu_disabled();
- -}
- -
- -/*
- - * Were we in user mode (or vm86 mode) when we were
- - * interrupted?
- - *
- - * Doing kernel_fpu_begin/end() is ok if we are running
- - * in an interrupt context from user mode - we'll just
- - * save the FPU state as required.
- - */
- -static bool interrupted_user_mode(void)
- -{
- -      struct pt_regs *regs = get_irq_regs();
- -      return regs && user_mode(regs);
- -}
- -
   /*
    * Can we use the FPU in kernel mode with the
    * whole "kernel_fpu_begin/end()" sequence?
- - *
- - * It's always ok in process context (ie "not interrupt")
- - * but it is sometimes ok even from an irq.
    */
   bool irq_fpu_usable(void)
   {
- -      return !in_interrupt() ||
- -              interrupted_user_mode() ||
- -              interrupted_kernel_fpu_idle();
+ +      if (WARN_ON_ONCE(in_nmi()))
+ +              return false;
+ +
+ +      /* In kernel FPU usage already active? */
+ +      if (this_cpu_read(in_kernel_fpu))
+ +              return false;
+ +
+ +      /*
+ +       * When not in NMI or hard interrupt context, FPU can be used in:
+ +       *
+ +       * - Task context except from within fpregs_lock()'ed critical
+ +       *   regions.
+ +       *
+ +       * - Soft interrupt processing context which cannot happen
+ +       *   while in a fpregs_lock()'ed critical region.
+ +       */
+ +      if (!in_hardirq())
+ +              return true;
+ +
+ +      /*
+ +       * In hard interrupt context it's safe when soft interrupts
+ +       * are enabled, which means the interrupt did not hit in
+ +       * a fpregs_lock()'ed critical region.
+ +       */
+ +      return !softirq_count();
   }
   EXPORT_SYMBOL(irq_fpu_usable);
   
@@@ -234,20 -247,7 +234,20 @@@ bool fpu_alloc_guest_fpstate(struct fpu
         gfpu->fpstate           = fpstate;
         gfpu->xfeatures         = fpu_user_cfg.default_features;
         gfpu->perm              = fpu_user_cfg.default_features;
- -      gfpu->uabi_size         = fpu_user_cfg.default_size;
+ +
+ +      /*
+ +       * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
+ +       * to userspace, even when XSAVE is unsupported, so that restoring FPU
+ +       * state on a different CPU that does support XSAVE can cleanly load
+ +       * the incoming state using its natural XSAVE.  In other words, KVM's
+ +       * uABI size may be larger than this host's default size.  Conversely,
+ +       * the default size should never be larger than KVM's base uABI size;
+ +       * all features that can expand the uABI size must be opt-in.
+ +       */
+ +      gfpu->uabi_size         = sizeof(struct kvm_xsave);
+ +      if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
+ +              gfpu->uabi_size = fpu_user_cfg.default_size;
+ +
         fpu_init_guest_permissions(gfpu);
   
         return true;
@@@ -556,7 -556,7 +556,7 @@@ static inline void fpu_inherit_perms(st
   }
   
   /* Clone current's FPU state on fork */
- int fpu_clone(struct task_struct *dst, unsigned long clone_flags)
+ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
   {
         struct fpu *src_fpu = &current->thread.fpu;
         struct fpu *dst_fpu = &dst->thread.fpu;
@@@ -579,7 -579,7 +579,7 @@@
          * No FPU state inheritance for kernel threads and IO
          * worker threads.
          */
-       if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) {
+       if (minimal) {
                 /* Clear out the minimal state */
                 memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
                        init_fpstate_copy_size());
diff --combined arch/x86/kernel/process.c

index 58fb48d3004fe2ef766c61d4b8ee13a175823583,d20eaad52a859485685a4b00fb5b9854a030d8e4..9b2772b7e1f3990eee3760100880054cf7942a31
--- 1/arch/x86/kernel/process.c
--- 2/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@@ -46,7 -46,6 +46,7 @@@
   #include <asm/proto.h>
   #include <asm/frame.h>
   #include <asm/unwind.h>
+ +#include <asm/tdx.h>
   
   #include "process.h"
   
@@@ -131,9 -130,11 +131,11 @@@ static int set_new_tls(struct task_stru
                 return do_set_thread_area_64(p, ARCH_SET_FS, tls);
   }
   
- int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
-               struct task_struct *p, unsigned long tls)
+ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long sp = args->stack;
+       unsigned long tls = args->tls;
         struct inactive_task_frame *frame;
         struct fork_frame *fork_frame;
         struct pt_regs *childregs;
@@@ -161,7 -162,6 +163,7 @@@
         savesegment(ds, p->thread.ds);
   #else
         p->thread.sp0 = (unsigned long) (childregs + 1);
+ +      savesegment(gs, p->thread.gs);
         /*
          * Clear all status flags including IF and set fixed bit. 64bit
          * does not have this initialization as the frame does not contain
@@@ -171,13 -171,13 +173,13 @@@
         frame->flags = X86_EFLAGS_FIXED;
   #endif
   
-       fpu_clone(p, clone_flags);
+       fpu_clone(p, clone_flags, args->fn);
   
         /* Kernel thread ? */
         if (unlikely(p->flags & PF_KTHREAD)) {
                 p->thread.pkru = pkru_get_init_value();
                 memset(childregs, 0, sizeof(struct pt_regs));
-               kthread_frame_init(frame, sp, arg);
+               kthread_frame_init(frame, args->fn, args->fn_arg);
                 return 0;
         }
   
@@@ -193,10 -193,14 +195,10 @@@
         if (sp)
                 childregs->sp = sp;
   
-       if (unlikely(p->flags & PF_IO_WORKER)) {
- -#ifdef CONFIG_X86_32
- -      task_user_gs(p) = get_user_gs(current_pt_regs());
- -#endif
- -
+       if (unlikely(args->fn)) {
                 /*
-                * An IO thread is a user space thread, but it doesn't
-                * return to ret_after_fork().
+                * A user space thread, but it doesn't return to
+                * ret_after_fork().
                  *
                  * In order to indicate that to tools like gdb,
                  * we reset the stack and instruction pointers.
@@@ -206,7 -210,7 +208,7 @@@
                  */
                 childregs->sp = 0;
                 childregs->ip = 0;
-               kthread_frame_init(frame, sp, arg);
+               kthread_frame_init(frame, args->fn, args->fn_arg);
                 return 0;
         }
   
@@@ -332,7 -336,7 +334,7 @@@ static int get_cpuid_mode(void
         return !test_thread_flag(TIF_NOCPUID);
   }
   
- -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+ +static int set_cpuid_mode(unsigned long cpuid_enabled)
   {
         if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
                 return -ENODEV;
@@@ -403,7 -407,7 +405,7 @@@ static void tss_copy_io_bitmap(struct t
   }
   
   /**
- - * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
+ + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode
    */
   void native_tss_update_io_bitmap(void)
   {
@@@ -684,6 -688,9 +686,6 @@@ void __switch_to_xtra(struct task_struc
                 /* Enforce MSR update to ensure consistent state */
                 __speculation_ctrl_update(~tifn, tifn);
         }
- -
- -      if ((tifp ^ tifn) & _TIF_SLD)
- -              switch_to_sld(tifn);
   }
   
   /*
@@@ -868,9 -875,6 +870,9 @@@ void select_idle_routine(const struct c
         } else if (prefer_mwait_c1_over_halt(c)) {
                 pr_info("using mwait in idle threads\n");
                 x86_idle = mwait_idle;
+ +      } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ +              pr_info("using TDX aware idle routine\n");
+ +              x86_idle = tdx_safe_halt;
         } else
                 x86_idle = default_idle;
   }
@@@ -983,19 -987,20 +985,19 @@@ unsigned long __get_wchan(struct task_s
         return addr;
   }
   
- -long do_arch_prctl_common(struct task_struct *task, int option,
- -                        unsigned long arg2)
+ +long do_arch_prctl_common(int option, unsigned long arg2)
   {
         switch (option) {
         case ARCH_GET_CPUID:
                 return get_cpuid_mode();
         case ARCH_SET_CPUID:
- -              return set_cpuid_mode(task, arg2);
+ +              return set_cpuid_mode(arg2);
         case ARCH_GET_XCOMP_SUPP:
         case ARCH_GET_XCOMP_PERM:
         case ARCH_REQ_XCOMP_PERM:
         case ARCH_GET_XCOMP_GUEST_PERM:
         case ARCH_REQ_XCOMP_GUEST_PERM:
- -              return fpu_xstate_prctl(task, option, arg2);
+ +              return fpu_xstate_prctl(option, arg2);
         }
   
         return -EINVAL;
diff --combined arch/xtensa/kernel/process.c

index 7e38292dd07abff7d016c7249715c73f827e15c0,c3751cc88e5dc94378bd58d639331152390b9395..68e0e2f06d660a9b411ca81a68554c73a33d5f5a
--- 1/arch/xtensa/kernel/process.c
--- 2/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@@ -47,7 -47,6 +47,7 @@@
   #include <asm/asm-offsets.h>
   #include <asm/regs.h>
   #include <asm/hw_breakpoint.h>
+ +#include <asm/traps.h>
   
   extern void ret_from_fork(void);
   extern void ret_from_kernel_thread(void);
@@@ -64,114 -63,52 +64,114 @@@ EXPORT_SYMBOL(__stack_chk_guard)
   
   #if XTENSA_HAVE_COPROCESSORS
   
- -void coprocessor_release_all(struct thread_info *ti)
+ +void local_coprocessors_flush_release_all(void)
   {
- -      unsigned long cpenable;
- -      int i;
+ +      struct thread_info **coprocessor_owner;
+ +      struct thread_info *unique_owner[XCHAL_CP_MAX];
+ +      int n = 0;
+ +      int i, j;
   
- -      /* Make sure we don't switch tasks during this operation. */
+ +      coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner;
+ +      xtensa_set_sr(XCHAL_CP_MASK, cpenable);
   
- -      preempt_disable();
+ +      for (i = 0; i < XCHAL_CP_MAX; i++) {
+ +              struct thread_info *ti = coprocessor_owner[i];
   
- -      /* Walk through all cp owners and release it for the requested one. */
+ +              if (ti) {
+ +                      coprocessor_flush(ti, i);
   
- -      cpenable = ti->cpenable;
+ +                      for (j = 0; j < n; j++)
+ +                              if (unique_owner[j] == ti)
+ +                                      break;
+ +                      if (j == n)
+ +                              unique_owner[n++] = ti;
   
- -      for (i = 0; i < XCHAL_CP_MAX; i++) {
- -              if (coprocessor_owner[i] == ti) {
- -                      coprocessor_owner[i] = 0;
- -                      cpenable &= ~(1 << i);
+ +                      coprocessor_owner[i] = NULL;
                 }
         }
+ +      for (i = 0; i < n; i++) {
+ +              /* pairs with memw (1) in fast_coprocessor and memw in switch_to */
+ +              smp_wmb();
+ +              unique_owner[i]->cpenable = 0;
+ +      }
+ +      xtensa_set_sr(0, cpenable);
+ +}
   
- -      ti->cpenable = cpenable;
+ +static void local_coprocessor_release_all(void *info)
+ +{
+ +      struct thread_info *ti = info;
+ +      struct thread_info **coprocessor_owner;
+ +      int i;
+ +
+ +      coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner;
+ +
+ +      /* Walk through all cp owners and release those held by the requested thread. */
+ +
+ +      for (i = 0; i < XCHAL_CP_MAX; i++) {
+ +              if (coprocessor_owner[i] == ti)
+ +                      coprocessor_owner[i] = NULL;
+ +      }
+ +      /* pairs with memw (1) in fast_coprocessor and memw in switch_to */
+ +      smp_wmb();
+ +      ti->cpenable = 0;
         if (ti == current_thread_info())
                 xtensa_set_sr(0, cpenable);
+ +}
   
- -      preempt_enable();
+ +void coprocessor_release_all(struct thread_info *ti)
+ +{
+ +      if (ti->cpenable) {
+ +              /* pairs with memw (2) in fast_coprocessor */
+ +              smp_rmb();
+ +              smp_call_function_single(ti->cp_owner_cpu,
+ +                                       local_coprocessor_release_all,
+ +                                       ti, true);
+ +      }
   }
   
- -void coprocessor_flush_all(struct thread_info *ti)
+ +static void local_coprocessor_flush_all(void *info)
   {
- -      unsigned long cpenable, old_cpenable;
+ +      struct thread_info *ti = info;
+ +      struct thread_info **coprocessor_owner;
+ +      unsigned long old_cpenable;
         int i;
   
- -      preempt_disable();
- -
- -      old_cpenable = xtensa_get_sr(cpenable);
- -      cpenable = ti->cpenable;
- -      xtensa_set_sr(cpenable, cpenable);
+ +      coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner;
+ +      old_cpenable = xtensa_xsr(ti->cpenable, cpenable);
   
         for (i = 0; i < XCHAL_CP_MAX; i++) {
- -              if ((cpenable & 1) != 0 && coprocessor_owner[i] == ti)
+ +              if (coprocessor_owner[i] == ti)
                         coprocessor_flush(ti, i);
- -              cpenable >>= 1;
         }
         xtensa_set_sr(old_cpenable, cpenable);
+ +}
+ +
+ +void coprocessor_flush_all(struct thread_info *ti)
+ +{
+ +      if (ti->cpenable) {
+ +              /* pairs with memw (2) in fast_coprocessor */
+ +              smp_rmb();
+ +              smp_call_function_single(ti->cp_owner_cpu,
+ +                                       local_coprocessor_flush_all,
+ +                                       ti, true);
+ +      }
+ +}
+ +
+ +static void local_coprocessor_flush_release_all(void *info)
+ +{
+ +      local_coprocessor_flush_all(info);
+ +      local_coprocessor_release_all(info);
+ +}
   
- -      preempt_enable();
+ +void coprocessor_flush_release_all(struct thread_info *ti)
+ +{
+ +      if (ti->cpenable) {
+ +              /* pairs with memw (2) in fast_coprocessor */
+ +              smp_rmb();
+ +              smp_call_function_single(ti->cp_owner_cpu,
+ +                                       local_coprocessor_flush_release_all,
+ +                                       ti, true);
+ +      }
   }
   
   #endif
@@@ -203,7 -140,8 +203,7 @@@ void flush_thread(void
   {
   #if XTENSA_HAVE_COPROCESSORS
         struct thread_info *ti = current_thread_info();
- -      coprocessor_flush_all(ti);
- -      coprocessor_release_all(ti);
+ +      coprocessor_flush_release_all(ti);
   #endif
         flush_ptrace_hw_breakpoint(current);
   }
@@@ -263,10 -201,11 +263,11 @@@ int arch_dup_task_struct(struct task_st
    * involved.  Much simpler to just not copy those live frames across.
    */
   
- int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn,
-               unsigned long thread_fn_arg, struct task_struct *p,
-               unsigned long tls)
+ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
   {
+       unsigned long clone_flags = args->flags;
+       unsigned long usp_thread_fn = args->stack;
+       unsigned long tls = args->tls;
         struct pt_regs *childregs = task_pt_regs(p);
   
   #if (XTENSA_HAVE_COPROCESSORS || XTENSA_HAVE_IO_PORTS)
@@@ -286,7 -225,7 +287,7 @@@
   #error Unsupported Xtensa ABI
   #endif
   
-       if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+       if (!args->fn) {
                 struct pt_regs *regs = current_pt_regs();
                 unsigned long usp = usp_thread_fn ?
                         usp_thread_fn : regs->areg[1];
@@@ -338,15 -277,15 +339,15 @@@
                  * Window underflow will load registers from the
                  * spill slots on the stack on return from _switch_to.
                  */
-               SPILL_SLOT(childregs, 2) = usp_thread_fn;
-               SPILL_SLOT(childregs, 3) = thread_fn_arg;
+               SPILL_SLOT(childregs, 2) = (unsigned long)args->fn;
+               SPILL_SLOT(childregs, 3) = (unsigned long)args->fn_arg;
   #elif defined(__XTENSA_CALL0_ABI__)
                 /*
                  * a12 = thread_fn, a13 = thread_fn arg.
                  * _switch_to epilogue will load registers from the stack.
                  */
-               ((unsigned long *)p->thread.sp)[0] = usp_thread_fn;
-               ((unsigned long *)p->thread.sp)[1] = thread_fn_arg;
+               ((unsigned long *)p->thread.sp)[0] = (unsigned long)args->fn;
+               ((unsigned long *)p->thread.sp)[1] = (unsigned long)args->fn_arg;
   #else
   #error Unsupported Xtensa ABI
   #endif
diff --combined fs/exec.c

index 14b4b3755580ca85d019848b3cee9f34f13c862c,9c5260e74517bc67ffab4d782ea3dfade549d750..0989fb8472a18fa65214769692f3d30cc979f1a8
--- 1/fs/exec.c
--- 2/fs/exec.c
+++ b/fs/exec.c
@@@ -758,7 -758,6 +758,7 @@@ int setup_arg_pages(struct linux_binpr
         unsigned long stack_size;
         unsigned long stack_expand;
         unsigned long rlim_stack;
+ +      struct mmu_gather tlb;
   
   #ifdef CONFIG_STACK_GROWSUP
         /* Limit stack size */
@@@ -813,11 -812,8 +813,11 @@@
         vm_flags |= mm->def_flags;
         vm_flags |= VM_STACK_INCOMPLETE_SETUP;
   
- -      ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
+ +      tlb_gather_mmu(&tlb, mm);
+ +      ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end,
                         vm_flags);
+ +      tlb_finish_mmu(&tlb);
+ +
         if (ret)
                 goto out_unlock;
         BUG_ON(prev != vma);
@@@ -1312,9 -1308,7 +1312,7 @@@ int begin_new_exec(struct linux_binprm 
         if (retval)
                 goto out_unlock;
   
-       if (me->flags & PF_KTHREAD)
-               free_kthread_struct(me);
-       me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+       me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
                                         PF_NOFREEZE | PF_NO_SETAFFINITY);
         flush_thread();
         me->personality &= ~bprm->per_clear;
@@@ -1959,6 -1953,10 +1957,10 @@@ int kernel_execve(const char *kernel_fi
         int fd = AT_FDCWD;
         int retval;
   
+       /* It is nonsense for kernel threads to call execve */
+       if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
+               return -EINVAL;
+ 
         filename = getname_kernel(kernel_filename);
         if (IS_ERR(filename))
                 return PTR_ERR(filename);
diff --combined init/initramfs.c

index dc84cf756cea1a50e060e1663b50ec49fa49b6b9,41e7857d510d68d995433b1239acbebd0570249f..18229cfe8906b7632d7f3f94574c870145f565a6
--- 1/init/initramfs.c
--- 2/init/initramfs.c
+++ b/init/initramfs.c
@@@ -15,13 -15,11 +15,14 @@@
   #include <linux/mm.h>
   #include <linux/namei.h>
   #include <linux/init_syscalls.h>
+ #include <linux/task_work.h>
   #include <linux/umh.h>
   
- -static ssize_t __init xwrite(struct file *file, const char *p, size_t count,
- -              loff_t *pos)
+ +static __initdata bool csum_present;
+ +static __initdata u32 io_csum;
+ +
+ +static ssize_t __init xwrite(struct file *file, const unsigned char *p,
+ +              size_t count, loff_t *pos)
   {
         ssize_t out = 0;
   
@@@ -36,13 -34,6 +37,13 @@@
                 } else if (rv == 0)
                         break;
   
+ +              if (csum_present) {
+ +                      ssize_t i;
+ +
+ +                      for (i = 0; i < rv; i++)
+ +                              io_csum += p[i];
+ +              }
+ +
                 p += rv;
                 out += rv;
                 count -= rv;
@@@ -126,36 -117,31 +127,36 @@@ static void __init free_hash(void
         }
   }
   
- -static long __init do_utime(char *filename, time64_t mtime)
+ +#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME
+ +static void __init do_utime(char *filename, time64_t mtime)
   {
- -      struct timespec64 t[2];
+ +      struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } };
+ +      init_utimes(filename, t);
+ +}
   
- -      t[0].tv_sec = mtime;
- -      t[0].tv_nsec = 0;
- -      t[1].tv_sec = mtime;
- -      t[1].tv_nsec = 0;
- -      return init_utimes(filename, t);
+ +static void __init do_utime_path(const struct path *path, time64_t mtime)
+ +{
+ +      struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } };
+ +      vfs_utimes(path, t);
   }
   
   static __initdata LIST_HEAD(dir_list);
   struct dir_entry {
         struct list_head list;
- -      char *name;
         time64_t mtime;
+ +      char name[];
   };
   
   static void __init dir_add(const char *name, time64_t mtime)
   {
- -      struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL);
+ +      size_t nlen = strlen(name) + 1;
+ +      struct dir_entry *de;
+ +
+ +      de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL);
         if (!de)
                 panic_show_mem("can't allocate dir_entry buffer");
         INIT_LIST_HEAD(&de->list);
- -      de->name = kstrdup(name, GFP_KERNEL);
+ +      strscpy(de->name, name, nlen);
         de->mtime = mtime;
         list_add(&de->list, &dir_list);
   }
@@@ -166,15 -152,10 +167,15 @@@ static void __init dir_utime(void
         list_for_each_entry_safe(de, tmp, &dir_list, list) {
                 list_del(&de->list);
                 do_utime(de->name, de->mtime);
- -              kfree(de->name);
                 kfree(de);
         }
   }
+ +#else
+ +static void __init do_utime(char *filename, time64_t mtime) {}
+ +static void __init do_utime_path(const struct path *path, time64_t mtime) {}
+ +static void __init dir_add(const char *name, time64_t mtime) {}
+ +static void __init dir_utime(void) {}
+ +#endif
   
   static __initdata time64_t mtime;
   
@@@ -186,16 -167,15 +187,16 @@@ static __initdata unsigned long body_le
   static __initdata uid_t uid;
   static __initdata gid_t gid;
   static __initdata unsigned rdev;
+ +static __initdata u32 hdr_csum;
   
   static void __init parse_header(char *s)
   {
- -      unsigned long parsed[12];
+ +      unsigned long parsed[13];
         char buf[9];
         int i;
   
         buf[8] = '\0';
- -      for (i = 0, s += 6; i < 12; i++, s += 8) {
+ +      for (i = 0, s += 6; i < 13; i++, s += 8) {
                 memcpy(buf, s, 8);
                 parsed[i] = simple_strtoul(buf, NULL, 16);
         }
@@@ -210,7 -190,6 +211,7 @@@
         minor = parsed[8];
         rdev = new_encode_dev(MKDEV(parsed[9], parsed[10]));
         name_len = parsed[11];
+ +      hdr_csum = parsed[12];
   }
   
   /* FSM */
@@@ -279,15 -258,12 +280,15 @@@ static int __init do_collect(void
   
   static int __init do_header(void)
   {
- -      if (memcmp(collected, "070707", 6)==0) {
- -              error("incorrect cpio method used: use -H newc option");
- -              return 1;
- -      }
- -      if (memcmp(collected, "070701", 6)) {
- -              error("no cpio magic");
+ +      if (!memcmp(collected, "070701", 6)) {
+ +              csum_present = false;
+ +      } else if (!memcmp(collected, "070702", 6)) {
+ +              csum_present = true;
+ +      } else {
+ +              if (memcmp(collected, "070707", 6) == 0)
+ +                      error("incorrect cpio method used: use -H newc option");
+ +              else
+ +                      error("no cpio magic");
                 return 1;
         }
         parse_header(collected);
@@@ -378,7 -354,6 +379,7 @@@ static int __init do_name(void
                         if (IS_ERR(wfile))
                                 return 0;
                         wfile_pos = 0;
+ +                      io_csum = 0;
   
                         vfs_fchown(wfile, uid, gid);
                         vfs_fchmod(wfile, mode);
@@@ -406,13 -381,15 +407,13 @@@
   static int __init do_copy(void)
   {
         if (byte_count >= body_len) {
- -              struct timespec64 t[2] = { };
                 if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len)
                         error("write error");
   
- -              t[0].tv_sec = mtime;
- -              t[1].tv_sec = mtime;
- -              vfs_utimes(&wfile->f_path, t);
- -
+ +              do_utime_path(&wfile->f_path, mtime);
                 fput(wfile);
+ +              if (csum_present && io_csum != hdr_csum)
+ +                      error("bad data checksum");
                 eat(body_len);
                 state = SkipIt;
                 return 0;
@@@ -727,6 -704,7 +728,7 @@@ done
         initrd_end = 0;
   
         flush_delayed_fput();
+       task_work_run();
   }
   
   static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain);
diff --combined init/main.c

index 02eb533018f6fed4b8b5dbf3c12e654b3470252b,39baac0211c6da3b40335b252c59fbb45bd14c0a..0ee39cdcfcac97614b9da06d7ff17564a7ef2306
--- 1/init/main.c
--- 2/init/main.c
+++ b/init/main.c
@@@ -266,7 -266,7 +266,7 @@@ static int __init loglevel(char *str
   early_param("loglevel", loglevel);
   
   #ifdef CONFIG_BLK_DEV_INITRD
- -static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum)
+ +static void * __init get_boot_config_from_initrd(size_t *_size)
   {
         u32 size, csum;
         char *data;
@@@ -300,20 -300,17 +300,20 @@@ found
                 return NULL;
         }
   
+ +      if (xbc_calc_checksum(data, size) != csum) {
+ +              pr_err("bootconfig checksum failed\n");
+ +              return NULL;
+ +      }
+ +
         /* Remove bootconfig from initramfs/initrd */
         initrd_end = (unsigned long)data;
         if (_size)
                 *_size = size;
- -      if (_csum)
- -              *_csum = csum;
   
         return data;
   }
   #else
- -static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum)
+ +static void * __init get_boot_config_from_initrd(size_t *_size)
   {
         return NULL;
   }
@@@ -410,16 -407,14 +410,16 @@@ static int __init warn_bootconfig(char 
   static void __init setup_boot_config(void)
   {
         static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
- -      const char *msg;
- -      int pos;
- -      u32 size, csum;
- -      char *data, *err;
- -      int ret;
+ +      const char *msg, *data;
+ +      int pos, ret;
+ +      size_t size;
+ +      char *err;
   
         /* Cut out the bootconfig data even if we have no bootconfig option */
- -      data = get_boot_config_from_initrd(&size, &csum);
+ +      data = get_boot_config_from_initrd(&size);
+ +      /* If there is no bootconfig in initrd, try the embedded one. */
+ +      if (!data)
+ +              data = xbc_get_embedded_bootconfig(&size);
   
         strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
         err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
@@@ -438,8 -433,13 +438,8 @@@
         }
   
         if (size >= XBC_DATA_MAX) {
- -              pr_err("bootconfig size %d greater than max size %d\n",
- -                      size, XBC_DATA_MAX);
- -              return;
- -      }
- -
- -      if (xbc_calc_checksum(data, size) != csum) {
- -              pr_err("bootconfig checksum failed\n");
+ +              pr_err("bootconfig size %ld greater than max size %d\n",
+ +                      (long)size, XBC_DATA_MAX);
                 return;
         }
   
@@@ -452,7 -452,7 +452,7 @@@
                                 msg, pos);
         } else {
                 xbc_get_info(&ret, NULL);
- -              pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret);
+ +              pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret);
                 /* keys starting with "kernel." are passed via cmdline */
                 extra_command_line = xbc_make_cmdline("kernel");
                 /* Also, "init." keys are init arguments */
@@@ -471,7 -471,7 +471,7 @@@ static void __init exit_boot_config(voi
   static void __init setup_boot_config(void)
   {
         /* Remove bootconfig data from initrd */
- -      get_boot_config_from_initrd(NULL, NULL);
+ +      get_boot_config_from_initrd(NULL);
   }
   
   static int __init warn_bootconfig(char *str)
@@@ -688,7 -688,7 +688,7 @@@ noinline void __ref rest_init(void
          * the init task will end up wanting to create kthreads, which, if
          * we schedule it before we create kthreadd, will OOPS.
          */
-       pid = kernel_thread(kernel_init, NULL, CLONE_FS);
+       pid = user_mode_thread(kernel_init, NULL, CLONE_FS);
         /*
          * Pin init on the boot CPU. Task migration is not properly working
          * until sched_init_smp() has been run. It will set the allowed
@@@ -1035,18 -1035,21 +1035,18 @@@ asmlinkage __visible void __init __no_s
         softirq_init();
         timekeeping_init();
         kfence_init();
+ +      time_init();
   
         /*
          * For best initial stack canary entropy, prepare it after:
          * - setup_arch() for any UEFI RNG entropy and boot cmdline access
- -       * - timekeeping_init() for ktime entropy used in rand_initialize()
- -       * - rand_initialize() to get any arch-specific entropy like RDRAND
- -       * - add_latent_entropy() to get any latent entropy
- -       * - adding command line entropy
+ +       * - timekeeping_init() for ktime entropy used in random_init()
+ +       * - time_init() for making random_get_entropy() work on some platforms
+ +       * - random_init() to initialize the RNG from early entropy sources
          */
- -      rand_initialize();
- -      add_latent_entropy();
- -      add_device_randomness(command_line, strlen(command_line));
+ +      random_init(command_line);
         boot_init_stack_canary();
   
- -      time_init();
         perf_event_init();
         profile_init();
         call_function_init();
diff --combined kernel/fork.c

index 124829ed01632303b1b2f9614f1275809fb506ba,35645f57bd2f37f447132e04a22e7c1964360aac..9d44f2d46c6964d5cf7e29e06ad377b03fe25dc2
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -612,7 -612,9 +612,7 @@@ static __latent_entropy int dup_mmap(st
         retval = ksm_fork(mm, oldmm);
         if (retval)
                 goto out;
- -      retval = khugepaged_fork(mm, oldmm);
- -      if (retval)
- -              goto out;
+ +      khugepaged_fork(mm, oldmm);
   
         prev = NULL;
         for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@@ -790,7 -792,6 +790,7 @@@ void __mmdrop(struct mm_struct *mm
         mmu_notifier_subscriptions_destroy(mm);
         check_mm(mm);
         put_user_ns(mm->user_ns);
+ +      mm_pasid_drop(mm);
         free_mm(mm);
   }
   EXPORT_SYMBOL_GPL(__mmdrop);
@@@ -1044,11 -1045,6 +1044,11 @@@ static struct task_struct *dup_task_str
   #ifdef CONFIG_MEMCG
         tsk->active_memcg = NULL;
   #endif
+ +
+ +#ifdef CONFIG_CPU_SUP_INTEL
+ +      tsk->reported_split_lock = 0;
+ +#endif
+ +
         return tsk;
   
   free_stack:
@@@ -1194,6 -1190,7 +1194,6 @@@ static inline void __mmput(struct mm_st
         }
         if (mm->binfmt)
                 module_put(mm->binfmt->module);
- -      mm_pasid_drop(mm);
         mmdrop(mm);
   }
   
@@@ -1982,7 -1979,7 +1982,7 @@@ static __latent_entropy struct task_str
         struct task_struct *p;
         struct multiprocess_signals delayed;
         struct file *pidfile = NULL;
-       u64 clone_flags = args->flags;
+       const u64 clone_flags = args->flags;
         struct nsproxy *nsp = current->nsproxy;
   
         /*
@@@ -2071,6 -2068,9 +2071,9 @@@
         p = dup_task_struct(current, node);
         if (!p)
                 goto fork_out;
+       p->flags &= ~PF_KTHREAD;
+       if (args->kthread)
+               p->flags |= PF_KTHREAD;
         if (args->io_thread) {
                 /*
                  * Mark us an IO worker, and block any signal that isn't
@@@ -2160,7 -2160,7 +2163,7 @@@
         p->io_context = NULL;
         audit_set_context(p, NULL);
         cgroup_fork(p);
-       if (p->flags & PF_KTHREAD) {
+       if (args->kthread) {
                 if (!set_kthread_struct(p))
                         goto bad_fork_cleanup_delayacct;
         }
@@@ -2243,7 -2243,7 +2246,7 @@@
         retval = copy_io(clone_flags, p);
         if (retval)
                 goto bad_fork_cleanup_namespaces;
-       retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
+       retval = copy_thread(p, args);
         if (retval)
                 goto bad_fork_cleanup_io;
   
@@@ -2547,11 -2547,21 +2550,21 @@@ static inline void init_idle_pids(struc
         }
   }
   
+ static int idle_dummy(void *dummy)
+ {
+       /* This function is never called */
+       return 0;
+ }
+ 
   struct task_struct * __init fork_idle(int cpu)
   {
         struct task_struct *task;
         struct kernel_clone_args args = {
-               .flags = CLONE_VM,
+               .flags          = CLONE_VM,
+               .fn             = &idle_dummy,
+               .fn_arg         = NULL,
+               .kthread        = 1,
+               .idle           = 1,
         };
   
         task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
@@@ -2582,8 -2592,8 +2595,8 @@@ struct task_struct *create_io_thread(in
                 .flags          = ((lower_32_bits(flags) | CLONE_VM |
                                     CLONE_UNTRACED) & ~CSIGNAL),
                 .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
-               .stack          = (unsigned long)fn,
-               .stack_size     = (unsigned long)arg,
+               .fn             = fn,
+               .fn_arg         = arg,
                 .io_thread      = 1,
         };
   
@@@ -2687,8 -2697,25 +2700,25 @@@ pid_t kernel_thread(int (*fn)(void *), 
                 .flags          = ((lower_32_bits(flags) | CLONE_VM |
                                     CLONE_UNTRACED) & ~CSIGNAL),
                 .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
-               .stack          = (unsigned long)fn,
-               .stack_size     = (unsigned long)arg,
+               .fn             = fn,
+               .fn_arg         = arg,
+               .kthread        = 1,
+       };
+ 
+       return kernel_clone(&args);
+ }
+ 
+ /*
+  * Create a user mode thread.
+  */
+ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
+ {
+       struct kernel_clone_args args = {
+               .flags          = ((lower_32_bits(flags) | CLONE_VM |
+                                   CLONE_UNTRACED) & ~CSIGNAL),
+               .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
+               .fn             = fn,
+               .fn_arg         = arg,
         };
   
         return kernel_clone(&args);
diff --combined kernel/sched/fair.c

index 8c5b74f66bd3b60490bb4fe97787ece910e8f7a7,db6f0df9d43e3ff0e5e9424fe455f9eb49b5e425..77b2048a932622a0188f8f93680d9215559a941c
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -36,7 -36,6 +36,7 @@@
   #include <linux/sched/cond_resched.h>
   #include <linux/sched/cputime.h>
   #include <linux/sched/isolation.h>
+ +#include <linux/sched/nohz.h>
   
   #include <linux/cpuidle.h>
   #include <linux/interrupt.h>
@@@ -174,37 -173,7 +174,37 @@@ int __weak arch_asym_cpu_priority(int c
    *
    * (default: 5 msec, units: microseconds)
    */
- -unsigned int sysctl_sched_cfs_bandwidth_slice         = 5000UL;
+ +static unsigned int sysctl_sched_cfs_bandwidth_slice          = 5000UL;
+ +#endif
+ +
+ +#ifdef CONFIG_SYSCTL
+ +static struct ctl_table sched_fair_sysctls[] = {
+ +      {
+ +              .procname       = "sched_child_runs_first",
+ +              .data           = &sysctl_sched_child_runs_first,
+ +              .maxlen         = sizeof(unsigned int),
+ +              .mode           = 0644,
+ +              .proc_handler   = proc_dointvec,
+ +      },
+ +#ifdef CONFIG_CFS_BANDWIDTH
+ +      {
+ +              .procname       = "sched_cfs_bandwidth_slice_us",
+ +              .data           = &sysctl_sched_cfs_bandwidth_slice,
+ +              .maxlen         = sizeof(unsigned int),
+ +              .mode           = 0644,
+ +              .proc_handler   = proc_dointvec_minmax,
+ +              .extra1         = SYSCTL_ONE,
+ +      },
+ +#endif
+ +      {}
+ +};
+ +
+ +static int __init sched_fair_sysctl_init(void)
+ +{
+ +      register_sysctl_init("kernel", sched_fair_sysctls);
+ +      return 0;
+ +}
+ +late_initcall(sched_fair_sysctl_init);
   #endif
   
   static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@@ -344,6 -313,19 +344,6 @@@ const struct sched_class fair_sched_cla
   #define for_each_sched_entity(se) \
                 for (; se; se = se->parent)
   
- -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
- -{
- -      if (!path)
- -              return;
- -
- -      if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
- -              autogroup_path(cfs_rq->tg, path, len);
- -      else if (cfs_rq && cfs_rq->tg->css.cgroup)
- -              cgroup_path(cfs_rq->tg->css.cgroup, path, len);
- -      else
- -              strlcpy(path, "(null)", len);
- -}
- -
   static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
   {
         struct rq *rq = rq_of(cfs_rq);
@@@ -511,6 -493,12 +511,6 @@@ static int se_is_idle(struct sched_enti
   #define for_each_sched_entity(se) \
                 for (; se; se = NULL)
   
- -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
- -{
- -      if (path)
- -              strlcpy(path, "(null)", len);
- -}
- -
   static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
   {
         return true;
@@@ -2927,7 -2915,7 +2927,7 @@@ static void task_tick_numa(struct rq *r
         /*
          * We don't care about NUMA placement if we don't have memory.
          */
-       if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
+       if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
                 return;
   
         /*
@@@ -3841,11 -3829,11 +3841,11 @@@ static void attach_entity_load_avg(stru
   
         se->avg.runnable_sum = se->avg.runnable_avg * divider;
   
- -      se->avg.load_sum = divider;
- -      if (se_weight(se)) {
- -              se->avg.load_sum =
- -                      div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
- -      }
+ +      se->avg.load_sum = se->avg.load_avg * divider;
+ +      if (se_weight(se) < se->avg.load_sum)
+ +              se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+ +      else
+ +              se->avg.load_sum = 1;
   
         enqueue_load_avg(cfs_rq, se);
         cfs_rq->avg.util_avg += se->avg.util_avg;
@@@ -4858,11 -4846,11 +4858,11 @@@ static int tg_unthrottle_up(struct task
   
         cfs_rq->throttle_count--;
         if (!cfs_rq->throttle_count) {
- -              cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- -                                           cfs_rq->throttled_clock_task;
+ +              cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ +                                           cfs_rq->throttled_clock_pelt;
   
                 /* Add cfs_rq with load or one or more already running entities to the list */
- -              if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
+ +              if (!cfs_rq_is_decayed(cfs_rq))
                         list_add_leaf_cfs_rq(cfs_rq);
         }
   
@@@ -4876,7 -4864,7 +4876,7 @@@ static int tg_throttle_down(struct task
   
         /* group is entering throttled state, stop time */
         if (!cfs_rq->throttle_count) {
- -              cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ +              cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
                 list_del_leaf_cfs_rq(cfs_rq);
         }
         cfs_rq->throttle_count++;
@@@ -5320,7 -5308,7 +5320,7 @@@ static void sync_throttle(struct task_g
         pcfs_rq = tg->parent->cfs_rq[cpu];
   
         cfs_rq->throttle_count = pcfs_rq->throttle_count;
- -      cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ +      cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
   }
   
   /* conditionally throttle active cfs_rq's from put_prev_entity() */
@@@ -6556,19 -6544,108 +6556,19 @@@ static int select_idle_sibling(struct t
   }
   
   /*
- - * cpu_util_without: compute cpu utilization without any contributions from *p
- - * @cpu: the CPU which utilization is requested
- - * @p: the task which utilization should be discounted
- - *
- - * The utilization of a CPU is defined by the utilization of tasks currently
- - * enqueued on that CPU as well as tasks which are currently sleeping after an
- - * execution on that CPU.
- - *
- - * This method returns the utilization of the specified CPU by discounting the
- - * utilization of the specified task, whenever the task is currently
- - * contributing to the CPU utilization.
- - */
- -static unsigned long cpu_util_without(int cpu, struct task_struct *p)
- -{
- -      struct cfs_rq *cfs_rq;
- -      unsigned int util;
- -
- -      /* Task has no contribution or is new */
- -      if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- -              return cpu_util_cfs(cpu);
- -
- -      cfs_rq = &cpu_rq(cpu)->cfs;
- -      util = READ_ONCE(cfs_rq->avg.util_avg);
- -
- -      /* Discount task's util from CPU's util */
- -      lsub_positive(&util, task_util(p));
- -
- -      /*
- -       * Covered cases:
- -       *
- -       * a) if *p is the only task sleeping on this CPU, then:
- -       *      cpu_util (== task_util) > util_est (== 0)
- -       *    and thus we return:
- -       *      cpu_util_without = (cpu_util - task_util) = 0
- -       *
- -       * b) if other tasks are SLEEPING on this CPU, which is now exiting
- -       *    IDLE, then:
- -       *      cpu_util >= task_util
- -       *      cpu_util > util_est (== 0)
- -       *    and thus we discount *p's blocked utilization to return:
- -       *      cpu_util_without = (cpu_util - task_util) >= 0
- -       *
- -       * c) if other tasks are RUNNABLE on that CPU and
- -       *      util_est > cpu_util
- -       *    then we use util_est since it returns a more restrictive
- -       *    estimation of the spare capacity on that CPU, by just
- -       *    considering the expected utilization of tasks already
- -       *    runnable on that CPU.
- -       *
- -       * Cases a) and b) are covered by the above code, while case c) is
- -       * covered by the following code when estimated utilization is
- -       * enabled.
- -       */
- -      if (sched_feat(UTIL_EST)) {
- -              unsigned int estimated =
- -                      READ_ONCE(cfs_rq->avg.util_est.enqueued);
- -
- -              /*
- -               * Despite the following checks we still have a small window
- -               * for a possible race, when an execl's select_task_rq_fair()
- -               * races with LB's detach_task():
- -               *
- -               *   detach_task()
- -               *     p->on_rq = TASK_ON_RQ_MIGRATING;
- -               *     ---------------------------------- A
- -               *     deactivate_task()                   \
- -               *       dequeue_task()                     + RaceTime
- -               *         util_est_dequeue()              /
- -               *     ---------------------------------- B
- -               *
- -               * The additional check on "current == p" it's required to
- -               * properly fix the execl regression and it helps in further
- -               * reducing the chances for the above race.
- -               */
- -              if (unlikely(task_on_rq_queued(p) || current == p))
- -                      lsub_positive(&estimated, _task_util_est(p));
- -
- -              util = max(util, estimated);
- -      }
- -
- -      /*
- -       * Utilization (estimated) can exceed the CPU capacity, thus let's
- -       * clamp to the maximum CPU capacity to ensure consistency with
- -       * cpu_util.
- -       */
- -      return min_t(unsigned long, util, capacity_orig_of(cpu));
- -}
- -
- -/*
- - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- - * to @dst_cpu.
+ + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
+ + * (@dst_cpu = -1) or migrated to @dst_cpu.
    */
   static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
   {
         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
- -      unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+ +      unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
   
         /*
- -       * If @p migrates from @cpu to another, remove its contribution. Or,
- -       * if @p migrates from another CPU to @cpu, add its contribution. In
- -       * the other cases, @cpu is not impacted by the migration, so the
- -       * util_avg should already be correct.
+ +       * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+ +       * contribution. If @p migrates from another CPU to @cpu add its
+ +       * contribution. In all the other cases @cpu is not impacted by the
+ +       * migration so its util_avg is already correct.
          */
         if (task_cpu(p) == cpu && dst_cpu != cpu)
                 lsub_positive(&util, task_util(p));
@@@ -6576,40 -6653,16 +6576,40 @@@
                 util += task_util(p);
   
         if (sched_feat(UTIL_EST)) {
+ +              unsigned long util_est;
+ +
                 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
   
                 /*
- -               * During wake-up, the task isn't enqueued yet and doesn't
- -               * appear in the cfs_rq->avg.util_est.enqueued of any rq,
- -               * so just add it (if needed) to "simulate" what will be
- -               * cpu_util after the task has been enqueued.
+ +               * During wake-up @p isn't enqueued yet and doesn't contribute
+ +               * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+ +               * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+ +               * has been enqueued.
+ +               *
+ +               * During exec (@dst_cpu = -1) @p is enqueued and does
+ +               * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+ +               * Remove it to "simulate" cpu_util without @p's contribution.
+ +               *
+ +               * Despite the task_on_rq_queued(@p) check there is still a
+ +               * small window for a possible race when an exec
+ +               * select_task_rq_fair() races with LB's detach_task().
+ +               *
+ +               *   detach_task()
+ +               *     deactivate_task()
+ +               *       p->on_rq = TASK_ON_RQ_MIGRATING;
+ +               *       -------------------------------- A
+ +               *       dequeue_task()                    \
+ +               *         dequeue_task_fair()              + Race Time
+ +               *           util_est_dequeue()            /
+ +               *       -------------------------------- B
+ +               *
+ +               * The additional check "current == p" is required to further
+ +               * reduce the race window.
                  */
                 if (dst_cpu == cpu)
                         util_est += _task_util_est(p);
+ +              else if (unlikely(task_on_rq_queued(p) || current == p))
+ +                      lsub_positive(&util_est, _task_util_est(p));
   
                 util = max(util, util_est);
         }
@@@ -6617,28 -6670,6 +6617,28 @@@
         return min(util, capacity_orig_of(cpu));
   }
   
+ +/*
+ + * cpu_util_without: compute cpu utilization without any contributions from *p
+ + * @cpu: the CPU which utilization is requested
+ + * @p: the task which utilization should be discounted
+ + *
+ + * The utilization of a CPU is defined by the utilization of tasks currently
+ + * enqueued on that CPU as well as tasks which are currently sleeping after an
+ + * execution on that CPU.
+ + *
+ + * This method returns the utilization of the specified CPU by discounting the
+ + * utilization of the specified task, whenever the task is currently
+ + * contributing to the CPU utilization.
+ + */
+ +static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+ +{
+ +      /* Task has no contribution or is new */
+ +      if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ +              return cpu_util_cfs(cpu);
+ +
+ +      return cpu_util_next(cpu, p, -1);
+ +}
+ +
   /*
    * compute_energy(): Estimates the energy that @pd would consume if @p was
    * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
@@@ -9429,6 -9460,8 +9429,6 @@@ static inline void calculate_imbalance(
                 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
                                   local->group_capacity;
   
- -              sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
- -                              sds->total_capacity;
                 /*
                  * If the local group is more loaded than the selected
                  * busiest group don't try to pull any tasks.
@@@ -9437,9 -9470,6 +9437,9 @@@
                         env->imbalance = 0;
                         return;
                 }
+ +
+ +              sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ +                              sds->total_capacity;
         }
   
         /*
@@@ -9465,7 -9495,7 +9465,7 @@@
    * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
    * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
    * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
- - * misfit_task      force     N/A        N/A    N/A  force      force
+ + * misfit_task      force     N/A        N/A    N/A  N/A        N/A
    * asym_packing     force     force      N/A    N/A  force      force
    * imbalanced       force     force      N/A    N/A  force      force
    * overloaded       force     force      N/A    N/A  force      avg_load
@@@ -11851,3 -11881,101 +11851,3 @@@ __init void init_sched_fair_class(void
   #endif /* SMP */
   
   }
- -
- -/*
- - * Helper functions to facilitate extracting info from tracepoints.
- - */
- -
- -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
- -{
- -#ifdef CONFIG_SMP
- -      return cfs_rq ? &cfs_rq->avg : NULL;
- -#else
- -      return NULL;
- -#endif
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
- -
- -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
- -{
- -      if (!cfs_rq) {
- -              if (str)
- -                      strlcpy(str, "(null)", len);
- -              else
- -                      return NULL;
- -      }
- -
- -      cfs_rq_tg_path(cfs_rq, str, len);
- -      return str;
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
- -
- -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
- -{
- -      return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
- -
- -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
- -{
- -#ifdef CONFIG_SMP
- -      return rq ? &rq->avg_rt : NULL;
- -#else
- -      return NULL;
- -#endif
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
- -
- -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
- -{
- -#ifdef CONFIG_SMP
- -      return rq ? &rq->avg_dl : NULL;
- -#else
- -      return NULL;
- -#endif
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
- -
- -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
- -{
- -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
- -      return rq ? &rq->avg_irq : NULL;
- -#else
- -      return NULL;
- -#endif
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
- -
- -int sched_trace_rq_cpu(struct rq *rq)
- -{
- -      return rq ? cpu_of(rq) : -1;
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
- -
- -int sched_trace_rq_cpu_capacity(struct rq *rq)
- -{
- -      return rq ?
- -#ifdef CONFIG_SMP
- -              rq->cpu_capacity
- -#else
- -              SCHED_CAPACITY_SCALE
- -#endif
- -              : -1;
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
- -
- -const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
- -{
- -#ifdef CONFIG_SMP
- -      return rd ? rd->span : NULL;
- -#else
- -      return NULL;
- -#endif
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_rd_span);
- -
- -int sched_trace_rq_nr_running(struct rq *rq)
- -{
- -        return rq ? rq->nr_running : -1;
- -}
- -EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
author	Linus Torvalds <[email protected]>
	Fri, 3 Jun 2022 23:03:05 +0000 (16:03 -0700)
committer	Linus Torvalds <[email protected]>
	Fri, 3 Jun 2022 23:03:05 +0000 (16:03 -0700)
		1	2
arch/arm64/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/csky/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/ia64/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/m68k/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/openrisc/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/parisc/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/riscv/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/fpu/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/xtensa/kernel/process.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
init/initramfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
init/main.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history