From: Linus Torvalds
Date: Fri, 3 Jun 2022 23:03:05 +0000 (-0700)
Subject: Merge tag 'kthread-cleanups-for-v5.19' of git://git.kernel.org/pub/scm/linux/kernel...
X-Git-Tag: v5.19-rc1~30
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/1ec6574a3c0a22c130c08e8c36c825cb87d68f8e?hp=-c

Merge tag 'kthread-cleanups-for-v5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull kthread updates from Eric Biederman:
 "This updates init and user mode helper tasks to be ordinary user mode
  tasks.

  Commit 40966e316f86 ("kthread: Ensure struct kthread is present for
  all kthreads") caused init and the user mode helper threads that call
  kernel_execve to have struct kthread allocated for them. This struct
  kthread going away during execve in turn made a use after free of
  struct kthread possible.

  Here, commit 343f4c49f243 ("kthread: Don't allocate kthread_struct
  for init and umh") is enough to fix the use after free and is simple
  enough to be backportable.

  The rest of the changes pass struct kernel_clone_args to clean things
  up and cause the code to make sense.

  In making init and the user mode helper tasks purely user mode tasks
  I ran into two complications. The function task_tick_numa was
  detecting tasks without an mm by testing for the presence of
  PF_KTHREAD. The initramfs code in populate_initrd_image was using
  flush_delayed_fput to ensure the closing of all its file descriptors
  was complete, and flush_delayed_fput does not work in a userspace
  thread.

  I have looked and looked for more complications and in my code review
  I have not found any, and neither has anyone else with the code
  sitting in linux-next"

* tag 'kthread-cleanups-for-v5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  sched: Update task_tick_numa to ignore tasks without an mm
  fork: Stop allowing kthreads to call execve
  fork: Explicitly set PF_KTHREAD
  init: Deal with the init process being a user mode process
  fork: Generalize PF_IO_WORKER handling
  fork: Explicity test for idle tasks in copy_thread
  fork: Pass struct kernel_clone_args into copy_thread
  kthread: Don't allocate kthread_struct for init and umh
--- 1ec6574a3c0a22c130c08e8c36c825cb87d68f8e
diff --combined arch/arm64/kernel/process.c
index 2f42123e059f,d0ef05c661b0..92bcc1768f0b
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@@ -111,7 -111,8 +111,7 @@@ void machine_power_off(void { local_irq_disable(); smp_send_stop(); - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); } /*
@@@ -249,8 -250,6 +249,8 @@@ void show_regs(struct pt_regs *regs static void tls_thread_flush(void) { write_sysreg(0, tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(0, SYS_TPIDR2_EL0); if (is_compat_task()) { current->thread.uw.tp_value = 0;
@@@ -299,42 -298,16 +299,42 @@@ int arch_dup_task_struct(struct task_st /* * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's sve_state + * get erroneously used or freed prematurely. dst's copies * will be allocated on demand later on if dst uses SVE. * For consistency, also clear TIF_SVE here: this could be done * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF_SVE and sve_state in + * maintainers it is best not to leave TIF flags and buffers in + * an inconsistent state, even temporarily. 
*/ dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + /* + * In the unlikely event that we create a new thread with ZA + * enabled we should retain the ZA state so duplicate it here. + * This may be shortly freed if we exec() or if CLONE_SETTLS + * but it's simpler to do it here. To avoid confusing the rest + * of the code ensure that we have a sve_state allocated + * whenever za_state is allocated. + */ + if (thread_za_enabled(&src->thread)) { + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + dst->thread.za_state = kmemdup(src->thread.za_state, + za_state_size(src), + GFP_KERNEL); + if (!dst->thread.za_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + } else { + dst->thread.za_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + } + /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@@ -343,9 -316,11 +343,11 @@@ asmlinkage void ret_from_fork(void) asm("ret_from_fork"); - int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long stack_start = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@@ -361,7 -336,7 +363,7 @@@ ptrauth_thread_init_kernel(p); - if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) { + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->regs[0] = 0; @@@ -370,8 -345,6 +372,8 @@@ * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2()) + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (stack_start) { if (is_compat_thread(task_thread_info(p))) @@@ -382,12 -355,10 +384,12 @@@ /* * If a TLS pointer was passed to clone, use it for the new - * thread. + * thread. We also reset TPIDR2 if it's in use. */ - if (clone_flags & CLONE_SETTLS) + if (clone_flags & CLONE_SETTLS) { p->thread.uw.tp_value = tls; + p->thread.tpidr2_el0 = 0; + } } else { /* * A kthread has no context to ERET to, so ensure any buggy @@@ -399,8 -370,8 +401,8 @@@ memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; - p->thread.cpu_context.x19 = stack_start; - p->thread.cpu_context.x20 = stk_sz; + p->thread.cpu_context.x19 = (unsigned long)args->fn; + p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; @@@ -418,8 -389,6 +420,8 @@@ void tls_preserve_current_state(void) { *task_user_tls(current) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2() && !is_compat_task()) + current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); } static void tls_thread_switch(struct task_struct *next) @@@ -432,8 -401,6 +434,8 @@@ write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(next->thread.tpidr2_el0, SYS_TPIDR2_EL0); } /* diff --combined arch/csky/kernel/process.c index 5de04707aa07,9af49aea1c3b..eedddb155669 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@@ -2,6 -2,7 +2,6 @@@ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
#include -#include #include #include #include @@@ -29,12 -30,11 +29,11 @@@ asmlinkage void ret_from_kernel_thread( */ void flush_thread(void){} - int copy_thread(unsigned long clone_flags, - unsigned long usp, - unsigned long kthread_arg, - struct task_struct *p, - unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct switch_stack *childstack; struct pt_regs *childregs = task_pt_regs(p); @@@ -48,11 -48,11 +47,11 @@@ /* setup thread.sp for switch_to !!! */ p->thread.sp = (unsigned long)childstack; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); childstack->r15 = (unsigned long) ret_from_kernel_thread; - childstack->r10 = kthread_arg; - childstack->r9 = usp; + childstack->r10 = (unsigned long) args->fn_arg; + childstack->r9 = (unsigned long) args->fn; childregs->sr = mfcr("psr"); } else { *childregs = *(current_pt_regs()); diff --combined arch/ia64/kernel/process.c index 89025e3b3f61,167b1765bea1..416305e550e2 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@@ -19,7 -19,6 +19,7 @@@ #include #include #include +#include #include #include #include @@@ -296,9 -295,12 +296,12 @@@ ia64_load_extra (struct task_struct *ta * so there is nothing to worry about. */ int - copy_thread(unsigned long clone_flags, unsigned long user_stack_base, - unsigned long user_stack_size, struct task_struct *p, unsigned long tls) + copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long user_stack_base = args->stack; + unsigned long user_stack_size = args->stack_size; + unsigned long tls = args->tls; extern char ia64_ret_from_clone; struct switch_stack *child_stack, *stack; unsigned long rbs, child_rbs, rbs_size; @@@ -339,14 -341,14 +342,14 @@@ ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { - if (unlikely(!user_stack_base)) { + if (unlikely(args->fn)) { + if (unlikely(args->idle)) { /* fork_idle() called us */ return 0; } memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack)); - child_stack->r4 = user_stack_base; /* payload */ - child_stack->r5 = user_stack_size; /* argument */ + child_stack->r4 = (unsigned long) args->fn; + child_stack->r5 = (unsigned long) args->fn_arg; /* * Preserve PSR bits, except for bits 32-34 and 37-45, * which we can't read. 
@@@ -600,7 -602,8 +603,7 @@@ machine_halt (void void machine_power_off (void) { - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); machine_halt(); } diff --combined arch/m68k/kernel/process.c index e160a7c57bd3,221feb0269f1..2cb4a61bcfac --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@@ -67,11 -67,12 +67,11 @@@ void machine_halt(void void machine_power_off(void) { - if (mach_power_off) - mach_power_off(); + do_kernel_power_off(); for (;;); } -void (*pm_power_off)(void) = machine_power_off; +void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); void show_regs(struct pt_regs * regs) @@@ -137,9 -138,11 +137,11 @@@ asmlinkage int m68k_clone3(struct pt_re return sys_clone3((struct clone_args __user *)regs->d1, regs->d2); } - int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct fork_frame { struct switch_stack sw; struct pt_regs regs; @@@ -156,12 -159,12 +158,12 @@@ */ p->thread.fc = USER_DATA; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(frame, 0, sizeof(struct fork_frame)); frame->regs.sr = PS_S; - frame->sw.a3 = usp; /* function */ - frame->sw.d7 = arg; + frame->sw.a3 = (unsigned long)args->fn; + frame->sw.d7 = (unsigned long)args->fn_arg; frame->sw.retpc = (unsigned long)ret_from_kernel_thread; p->thread.usp = 0; return 0; diff --combined arch/openrisc/kernel/process.c index 1d4c0921aafa,d9697cc9bc4d..52dc983ddeba --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@@ -52,8 -52,6 +52,8 @@@ void machine_restart(char *cmd { do_kernel_restart(cmd); + __asm__("l.nop 13"); + /* Give a grace period for failure to restart of 1s */ mdelay(1000); @@@ -62,16 -60,6 +62,16 @@@ while (1); } +/* + * This is used if pm_power_off has not been set by a power management + * driver, in this case we can assume we are on a simulator. On + * OpenRISC simulators l.nop 1 will trigger the simulator exit. + */ +static void default_power_off(void) +{ + __asm__("l.nop 1"); +} + /* * Similar to machine_power_off, but don't shut off power. Add code * here to freeze the system for e.g. 
post-mortem debug purpose when @@@ -87,10 -75,7 +87,10 @@@ void machine_halt(void void machine_power_off(void) { printk(KERN_INFO "*** MACHINE POWER OFF ***\n"); - __asm__("l.nop 1"); + if (pm_power_off != NULL) + pm_power_off(); + else + default_power_off(); } /* @@@ -104,7 -89,7 +104,7 @@@ void arch_cpu_idle(void mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME); } -void (*pm_power_off) (void) = machine_power_off; +void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); /* @@@ -167,9 -152,11 +167,11 @@@ extern asmlinkage void ret_from_fork(vo */ int - copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) + copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct pt_regs *userregs; struct pt_regs *kregs; unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; @@@ -187,10 -174,10 +189,10 @@@ sp -= sizeof(struct pt_regs); kregs = (struct pt_regs *)sp; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(kregs, 0, sizeof(struct pt_regs)); - kregs->gpr[20] = usp; /* fn, kernel thread */ - kregs->gpr[22] = arg; + kregs->gpr[20] = (unsigned long)args->fn; + kregs->gpr[22] = (unsigned long)args->fn_arg; } else { *userregs = *current_pt_regs(); diff --combined arch/parisc/kernel/process.c index d145184696ea,a6a2a558fc5b..7c37e09c92da --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@@ -26,7 -26,6 +26,7 @@@ #include #include #include +#include #include #include #include @@@ -117,7 -116,8 +117,7 @@@ void machine_power_off(void pdc_chassis_send_status(PDC_CHASSIS_DIRECT_SHUTDOWN); /* ipmi_poweroff may have been installed. */ - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); /* It seems we have no way to power the system off via * software. The user has to press the button himself. */ @@@ -206,9 -206,11 +206,11 @@@ arch_initcall(parisc_idle_init) * Copy architecture-specific thread state */ int - copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, unsigned long tls) + copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct pt_regs *cregs = &(p->thread.regs); void *stack = task_stack_page(p); @@@ -218,10 -220,10 +220,10 @@@ extern void * const ret_from_kernel_thread; extern void * const child_return; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(cregs, 0, sizeof(struct pt_regs)); - if (!usp) /* idle thread */ + if (args->idle) /* idle thread */ return 0; /* Must exit via ret_from_kernel_thread in order * to call schedule_tail() @@@ -233,12 -235,12 +235,12 @@@ * ret_from_kernel_thread. */ #ifdef CONFIG_64BIT - cregs->gr[27] = ((unsigned long *)usp)[3]; - cregs->gr[26] = ((unsigned long *)usp)[2]; + cregs->gr[27] = ((unsigned long *)args->fn)[3]; + cregs->gr[26] = ((unsigned long *)args->fn)[2]; #else - cregs->gr[26] = usp; + cregs->gr[26] = (unsigned long) args->fn; #endif - cregs->gr[25] = kthread_arg; + cregs->gr[25] = (unsigned long) args->fn_arg; } else { /* user thread */ /* usp must be word aligned. 
This also prevents users from diff --combined arch/powerpc/kernel/process.c index d00b20c65966,4f367bb68906..b62046bf3bb8 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@@ -34,8 -34,10 +34,8 @@@ #include #include #include -#include #include #include -#include #include #include @@@ -43,6 -45,7 +43,6 @@@ #include #include #include -#include #include #include #include @@@ -304,7 -307,7 +304,7 @@@ static void __giveup_vsx(struct task_st unsigned long msr = tsk->thread.regs->msr; /* - * We should never be ssetting MSR_VSX without also setting + * We should never be setting MSR_VSX without also setting * MSR_FP and MSR_VEC */ WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC))); @@@ -642,7 -645,7 +642,7 @@@ static void do_break_handler(struct pt_ return; } - /* Otherwise findout which DAWR caused exception and disable it. */ + /* Otherwise find out which DAWR caused exception and disable it. */ wp_get_instr_detail(regs, &instr, &type, &size, &ea); for (i = 0; i < nr_wp_slots(); i++) { @@@ -1713,10 -1716,11 +1713,11 @@@ static void setup_ksp_vsid(struct task_ /* * Copy architecture-specific thread state */ - int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, - unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); extern void ret_from_fork_scv(void); @@@ -1733,18 -1737,18 +1734,18 @@@ /* Copy registers */ sp -= sizeof(struct pt_regs); childregs = (struct pt_regs *) sp; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + sizeof(struct pt_regs); /* function */ - if (usp) - childregs->gpr[14] = ppc_function_entry((void *)usp); + if (args->fn) + childregs->gpr[14] = ppc_function_entry((void *)args->fn); #ifdef CONFIG_PPC64 clear_tsk_thread_flag(p, TIF_32BIT); childregs->softe = IRQS_ENABLED; #endif - childregs->gpr[15] = kthread_arg; + childregs->gpr[15] = (unsigned long)args->fn_arg; p->thread.regs = NULL; /* no user register state */ ti->flags |= _TIF_RESTOREALL; f = ret_from_kernel_thread; @@@ -2310,3 -2314,42 +2311,3 @@@ unsigned long arch_align_stack(unsigne sp -= get_random_int() & ~PAGE_MASK; return sp & ~0xf; } - -static inline unsigned long brk_rnd(void) -{ - unsigned long rnd = 0; - - /* 8MB for 32bit, 1GB for 64bit */ - if (is_32bit_task()) - rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT))); - else - rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT))); - - return rnd << PAGE_SHIFT; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long base = mm->brk; - unsigned long ret; - -#ifdef CONFIG_PPC_BOOK3S_64 - /* - * If we are using 1TB segments and we are allowed to randomise - * the heap, we can put it above 1TB so it is backed by a 1TB - * segment. Otherwise the heap will be in the bottom 1TB - * which always uses 256MB segments and this may result in a - * performance penalty. 
- */ - if (!radix_enabled() && !is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) - base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); -#endif - - ret = PAGE_ALIGN(base + brk_rnd()); - - if (ret < mm->brk) - return mm->brk; - - return ret; -} - diff --combined arch/riscv/kernel/process.c index 1c7be865ab31,24efabdbc551..ceb9ebab6558 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@@ -84,34 -84,6 +84,34 @@@ void show_regs(struct pt_regs *regs dump_backtrace(regs, NULL, KERN_DEFAULT); } +#ifdef CONFIG_COMPAT +static bool compat_mode_supported __read_mostly; + +bool compat_elf_check_arch(Elf32_Ehdr *hdr) +{ + return compat_mode_supported && + hdr->e_machine == EM_RISCV && + hdr->e_ident[EI_CLASS] == ELFCLASS32; +} + +static int __init compat_mode_detect(void) +{ + unsigned long tmp = csr_read(CSR_STATUS); + + csr_write(CSR_STATUS, (tmp & ~SR_UXL) | SR_UXL_32); + compat_mode_supported = + (csr_read(CSR_STATUS) & SR_UXL) == SR_UXL_32; + + csr_write(CSR_STATUS, tmp); + + pr_info("riscv: ELF compat mode %s", + compat_mode_supported ? "supported" : "failed"); + + return 0; +} +early_initcall(compat_mode_detect); +#endif + void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { @@@ -126,15 -98,6 +126,15 @@@ } regs->epc = pc; regs->sp = sp; + +#ifdef CONFIG_64BIT + regs->status &= ~SR_UXL; + + if (is_compat_task()) + regs->status |= SR_UXL_32; + else + regs->status |= SR_UXL_64; +#endif } void flush_thread(void) @@@ -157,13 -120,15 +157,15 @@@ int arch_dup_task_struct(struct task_st return 0; } - int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); /* p->thread holds context to be restored by __switch_to() */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp_in_global; @@@ -171,8 -136,8 +173,8 @@@ childregs->status = SR_PP | SR_PIE; p->thread.ra = (unsigned long)ret_from_kernel_thread; - p->thread.s[0] = usp; /* fn */ - p->thread.s[1] = arg; + p->thread.s[0] = (unsigned long)args->fn; + p->thread.s[1] = (unsigned long)args->fn_arg; } else { *childregs = *(current_pt_regs()); if (usp) /* User fork */ diff --combined arch/x86/kernel/fpu/core.c index 0fdc807ae13f,fbade5a3975b..0531d6a06df5 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@@ -14,8 -14,6 +14,8 @@@ #include #include +#include + #include #include #include @@@ -43,7 -41,17 +43,7 @@@ struct fpu_state_config fpu_user_cfg __ */ struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. - * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@@ -51,37 -59,42 +51,37 @@@ */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) -{ - return this_cpu_read(in_kernel_fpu); -} - -static bool interrupted_kernel_fpu_idle(void) -{ - return !kernel_fpu_disabled(); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? 
- * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. */ bool irq_fpu_usable(void) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); @@@ -234,20 -247,7 +234,20 @@@ bool fpu_alloc_guest_fpstate(struct fpu gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_user_cfg.default_features; gfpu->perm = fpu_user_cfg.default_features; - gfpu->uabi_size = fpu_user_cfg.default_size; + + /* + * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state + * to userspace, even when XSAVE is unsupported, so that restoring FPU + * state on a different CPU that does support XSAVE can cleanly load + * the incoming state using its natural XSAVE. In other words, KVM's + * uABI size may be larger than this host's default size. Conversely, + * the default size should never be larger than KVM's base uABI size; + * all features that can expand the uABI size must be opt-in. + */ + gfpu->uabi_size = sizeof(struct kvm_xsave); + if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) + gfpu->uabi_size = fpu_user_cfg.default_size; + fpu_init_guest_permissions(gfpu); return true; @@@ -556,7 -556,7 +556,7 @@@ static inline void fpu_inherit_perms(st } /* Clone current's FPU state on fork */ - int fpu_clone(struct task_struct *dst, unsigned long clone_flags) + int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@@ -579,7 -579,7 +579,7 @@@ * No FPU state inheritance for kernel threads and IO * worker threads. 
*/ - if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); diff --combined arch/x86/kernel/process.c index 58fb48d3004f,d20eaad52a85..9b2772b7e1f3 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@@ -46,7 -46,6 +46,7 @@@ #include #include #include +#include #include "process.h" @@@ -131,9 -130,11 +131,11 @@@ static int set_new_tls(struct task_stru return do_set_thread_area_64(p, ARCH_SET_FS, tls); } - int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, - struct task_struct *p, unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; @@@ -161,7 -162,6 +163,7 @@@ savesegment(ds, p->thread.ds); #else p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain @@@ -171,13 -171,13 +173,13 @@@ frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags); + fpu_clone(p, clone_flags, args->fn); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@@ -193,10 -193,14 +195,10 @@@ if (sp) childregs->sp = sp; - if (unlikely(p->flags & PF_IO_WORKER)) { -#ifdef CONFIG_X86_32 - task_user_gs(p) = get_user_gs(current_pt_regs()); -#endif - + if (unlikely(args->fn)) { /* - * An IO thread is a user space thread, but it doesn't - * return to ret_after_fork(). + * A user space thread, but it doesn't return to + * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. 
@@@ -206,7 -210,7 +208,7 @@@ */ childregs->sp = 0; childregs->ip = 0; - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@@ -332,7 -336,7 +334,7 @@@ static int get_cpuid_mode(void return !test_thread_flag(TIF_NOCPUID); } -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; @@@ -403,7 -407,7 +405,7 @@@ static void tss_copy_io_bitmap(struct t } /** - * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { @@@ -684,6 -688,9 +686,6 @@@ void __switch_to_xtra(struct task_struc /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } - - if ((tifp ^ tifn) & _TIF_SLD) - switch_to_sld(tifn); } /* @@@ -868,9 -875,6 +870,9 @@@ void select_idle_routine(const struct c } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } @@@ -983,19 -987,20 +985,19 @@@ unsigned long __get_wchan(struct task_s return addr; } -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2) +long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, arg2); + return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: - return fpu_xstate_prctl(task, option, arg2); + return fpu_xstate_prctl(option, arg2); } return -EINVAL; diff --combined arch/xtensa/kernel/process.c index 7e38292dd07a,c3751cc88e5d..68e0e2f06d66 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@@ -47,7 -47,6 +47,7 @@@ #include #include #include +#include extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); @@@ -64,114 -63,52 +64,114 @@@ EXPORT_SYMBOL(__stack_chk_guard) #if XTENSA_HAVE_COPROCESSORS -void coprocessor_release_all(struct thread_info *ti) +void local_coprocessors_flush_release_all(void) { - unsigned long cpenable; - int i; + struct thread_info **coprocessor_owner; + struct thread_info *unique_owner[XCHAL_CP_MAX]; + int n = 0; + int i, j; - /* Make sure we don't switch tasks during this operation. */ + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + xtensa_set_sr(XCHAL_CP_MASK, cpenable); - preempt_disable(); + for (i = 0; i < XCHAL_CP_MAX; i++) { + struct thread_info *ti = coprocessor_owner[i]; - /* Walk through all cp owners and release it for the requested one. 
*/ + if (ti) { + coprocessor_flush(ti, i); - cpenable = ti->cpenable; + for (j = 0; j < n; j++) + if (unique_owner[j] == ti) + break; + if (j == n) + unique_owner[n++] = ti; - for (i = 0; i < XCHAL_CP_MAX; i++) { - if (coprocessor_owner[i] == ti) { - coprocessor_owner[i] = 0; - cpenable &= ~(1 << i); + coprocessor_owner[i] = NULL; } } + for (i = 0; i < n; i++) { + /* pairs with memw (1) in fast_coprocessor and memw in switch_to */ + smp_wmb(); + unique_owner[i]->cpenable = 0; + } + xtensa_set_sr(0, cpenable); +} - ti->cpenable = cpenable; +static void local_coprocessor_release_all(void *info) +{ + struct thread_info *ti = info; + struct thread_info **coprocessor_owner; + int i; + + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + + /* Walk through all cp owners and release it for the requested one. */ + + for (i = 0; i < XCHAL_CP_MAX; i++) { + if (coprocessor_owner[i] == ti) + coprocessor_owner[i] = NULL; + } + /* pairs with memw (1) in fast_coprocessor and memw in switch_to */ + smp_wmb(); + ti->cpenable = 0; if (ti == current_thread_info()) xtensa_set_sr(0, cpenable); +} - preempt_enable(); +void coprocessor_release_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_release_all, + ti, true); + } } -void coprocessor_flush_all(struct thread_info *ti) +static void local_coprocessor_flush_all(void *info) { - unsigned long cpenable, old_cpenable; + struct thread_info *ti = info; + struct thread_info **coprocessor_owner; + unsigned long old_cpenable; int i; - preempt_disable(); - - old_cpenable = xtensa_get_sr(cpenable); - cpenable = ti->cpenable; - xtensa_set_sr(cpenable, cpenable); + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + old_cpenable = xtensa_xsr(ti->cpenable, cpenable); for (i = 0; i < XCHAL_CP_MAX; i++) { - if ((cpenable & 1) != 0 && coprocessor_owner[i] == ti) + if (coprocessor_owner[i] == ti) coprocessor_flush(ti, i); - cpenable >>= 1; } xtensa_set_sr(old_cpenable, cpenable); +} + +void coprocessor_flush_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_flush_all, + ti, true); + } +} + +static void local_coprocessor_flush_release_all(void *info) +{ + local_coprocessor_flush_all(info); + local_coprocessor_release_all(info); +} - preempt_enable(); +void coprocessor_flush_release_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_flush_release_all, + ti, true); + } } #endif @@@ -203,7 -140,8 +203,7 @@@ void flush_thread(void { #if XTENSA_HAVE_COPROCESSORS struct thread_info *ti = current_thread_info(); - coprocessor_flush_all(ti); - coprocessor_release_all(ti); + coprocessor_flush_release_all(ti); #endif flush_ptrace_hw_breakpoint(current); } @@@ -263,10 -201,11 +263,11 @@@ int arch_dup_task_struct(struct task_st * involved. Much simpler to just not copy those live frames across. 
*/ - int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, - unsigned long thread_fn_arg, struct task_struct *p, - unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp_thread_fn = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); #if (XTENSA_HAVE_COPROCESSORS || XTENSA_HAVE_IO_PORTS) @@@ -286,7 -225,7 +287,7 @@@ #error Unsupported Xtensa ABI #endif - if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (!args->fn) { struct pt_regs *regs = current_pt_regs(); unsigned long usp = usp_thread_fn ? usp_thread_fn : regs->areg[1]; @@@ -338,15 -277,15 +339,15 @@@ * Window underflow will load registers from the * spill slots on the stack on return from _switch_to. */ - SPILL_SLOT(childregs, 2) = usp_thread_fn; - SPILL_SLOT(childregs, 3) = thread_fn_arg; + SPILL_SLOT(childregs, 2) = (unsigned long)args->fn; + SPILL_SLOT(childregs, 3) = (unsigned long)args->fn_arg; #elif defined(__XTENSA_CALL0_ABI__) /* * a12 = thread_fn, a13 = thread_fn arg. * _switch_to epilogue will load registers from the stack. */ - ((unsigned long *)p->thread.sp)[0] = usp_thread_fn; - ((unsigned long *)p->thread.sp)[1] = thread_fn_arg; + ((unsigned long *)p->thread.sp)[0] = (unsigned long)args->fn; + ((unsigned long *)p->thread.sp)[1] = (unsigned long)args->fn_arg; #else #error Unsupported Xtensa ABI #endif diff --combined fs/exec.c index 14b4b3755580,9c5260e74517..0989fb8472a1 --- a/fs/exec.c +++ b/fs/exec.c @@@ -758,7 -758,6 +758,7 @@@ int setup_arg_pages(struct linux_binpr unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; + struct mmu_gather tlb; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@@ -813,11 -812,8 +813,11 @@@ vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; - ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + tlb_gather_mmu(&tlb, mm); + ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); + tlb_finish_mmu(&tlb); + if (ret) goto out_unlock; BUG_ON(prev != vma); @@@ -1312,9 -1308,7 +1312,7 @@@ int begin_new_exec(struct linux_binprm if (retval) goto out_unlock; - if (me->flags & PF_KTHREAD) - free_kthread_struct(me); - me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; @@@ -1959,6 -1953,10 +1957,10 @@@ int kernel_execve(const char *kernel_fi int fd = AT_FDCWD; int retval; + /* It is non-sense for kernel threads to call execve */ + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return -EINVAL; + filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); diff --combined init/initramfs.c index dc84cf756cea,41e7857d510d..18229cfe8906 --- a/init/initramfs.c +++ b/init/initramfs.c @@@ -15,13 -15,11 +15,14 @@@ #include #include #include + #include #include -static ssize_t __init xwrite(struct file *file, const char *p, size_t count, - loff_t *pos) +static __initdata bool csum_present; +static __initdata u32 io_csum; + +static ssize_t __init xwrite(struct file *file, const unsigned char *p, + size_t count, loff_t *pos) { ssize_t out = 0; @@@ -36,13 -34,6 +37,13 @@@ } else if (rv == 0) break; + if (csum_present) { + ssize_t i; + + for (i = 0; i < rv; i++) + io_csum += p[i]; + } + p += rv; out += rv; count -= rv; @@@ -126,36 -117,31 +127,36 @@@ static void __init free_hash(void } } -static long 
__init do_utime(char *filename, time64_t mtime) +#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME +static void __init do_utime(char *filename, time64_t mtime) { - struct timespec64 t[2]; + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + init_utimes(filename, t); +} - t[0].tv_sec = mtime; - t[0].tv_nsec = 0; - t[1].tv_sec = mtime; - t[1].tv_nsec = 0; - return init_utimes(filename, t); +static void __init do_utime_path(const struct path *path, time64_t mtime) +{ + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + vfs_utimes(path, t); } static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; - char *name; time64_t mtime; + char name[]; }; static void __init dir_add(const char *name, time64_t mtime) { - struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + size_t nlen = strlen(name) + 1; + struct dir_entry *de; + + de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); - de->name = kstrdup(name, GFP_KERNEL); + strscpy(de->name, name, nlen); de->mtime = mtime; list_add(&de->list, &dir_list); } @@@ -166,15 -152,10 +167,15 @@@ static void __init dir_utime(void list_for_each_entry_safe(de, tmp, &dir_list, list) { list_del(&de->list); do_utime(de->name, de->mtime); - kfree(de->name); kfree(de); } } +#else +static void __init do_utime(char *filename, time64_t mtime) {} +static void __init do_utime_path(const struct path *path, time64_t mtime) {} +static void __init dir_add(const char *name, time64_t mtime) {} +static void __init dir_utime(void) {} +#endif static __initdata time64_t mtime; @@@ -186,16 -167,15 +187,16 @@@ static __initdata unsigned long body_le static __initdata uid_t uid; static __initdata gid_t gid; static __initdata unsigned rdev; +static __initdata u32 hdr_csum; static void __init parse_header(char *s) { - unsigned long parsed[12]; + unsigned long parsed[13]; char buf[9]; int i; buf[8] = '\0'; - for (i = 0, s += 6; i < 12; i++, s += 8) { + for (i = 0, s += 6; i < 13; i++, s += 8) { memcpy(buf, s, 8); parsed[i] = simple_strtoul(buf, NULL, 16); } @@@ -210,7 -190,6 +211,7 @@@ minor = parsed[8]; rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); name_len = parsed[11]; + hdr_csum = parsed[12]; } /* FSM */ @@@ -279,15 -258,12 +280,15 @@@ static int __init do_collect(void static int __init do_header(void) { - if (memcmp(collected, "070707", 6)==0) { - error("incorrect cpio method used: use -H newc option"); - return 1; - } - if (memcmp(collected, "070701", 6)) { - error("no cpio magic"); + if (!memcmp(collected, "070701", 6)) { + csum_present = false; + } else if (!memcmp(collected, "070702", 6)) { + csum_present = true; + } else { + if (memcmp(collected, "070707", 6) == 0) + error("incorrect cpio method used: use -H newc option"); + else + error("no cpio magic"); return 1; } parse_header(collected); @@@ -378,7 -354,6 +379,7 @@@ static int __init do_name(void if (IS_ERR(wfile)) return 0; wfile_pos = 0; + io_csum = 0; vfs_fchown(wfile, uid, gid); vfs_fchmod(wfile, mode); @@@ -406,13 -381,15 +407,13 @@@ static int __init do_copy(void) { if (byte_count >= body_len) { - struct timespec64 t[2] = { }; if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) error("write error"); - t[0].tv_sec = mtime; - t[1].tv_sec = mtime; - vfs_utimes(&wfile->f_path, t); - + do_utime_path(&wfile->f_path, mtime); fput(wfile); + if (csum_present && io_csum != hdr_csum) + error("bad data checksum"); eat(body_len); state = SkipIt; return 
0; @@@ -727,6 -704,7 +728,7 @@@ done initrd_end = 0; flush_delayed_fput(); + task_work_run(); } static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain); diff --combined init/main.c index 02eb533018f6,39baac0211c6..0ee39cdcfcac --- a/init/main.c +++ b/init/main.c @@@ -266,7 -266,7 +266,7 @@@ static int __init loglevel(char *str early_param("loglevel", loglevel); #ifdef CONFIG_BLK_DEV_INITRD -static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) +static void * __init get_boot_config_from_initrd(size_t *_size) { u32 size, csum; char *data; @@@ -300,20 -300,17 +300,20 @@@ found return NULL; } + if (xbc_calc_checksum(data, size) != csum) { + pr_err("bootconfig checksum failed\n"); + return NULL; + } + /* Remove bootconfig from initramfs/initrd */ initrd_end = (unsigned long)data; if (_size) *_size = size; - if (_csum) - *_csum = csum; return data; } #else -static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) +static void * __init get_boot_config_from_initrd(size_t *_size) { return NULL; } @@@ -410,16 -407,14 +410,16 @@@ static int __init warn_bootconfig(char static void __init setup_boot_config(void) { static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; - const char *msg; - int pos; - u32 size, csum; - char *data, *err; - int ret; + const char *msg, *data; + int pos, ret; + size_t size; + char *err; /* Cut out the bootconfig data even if we have no bootconfig option */ - data = get_boot_config_from_initrd(&size, &csum); + data = get_boot_config_from_initrd(&size); + /* If there is no bootconfig in initrd, try embedded one. */ + if (!data) + data = xbc_get_embedded_bootconfig(&size); strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, @@@ -438,8 -433,13 +438,8 @@@ } if (size >= XBC_DATA_MAX) { - pr_err("bootconfig size %d greater than max size %d\n", - size, XBC_DATA_MAX); - return; - } - - if (xbc_calc_checksum(data, size) != csum) { - pr_err("bootconfig checksum failed\n"); + pr_err("bootconfig size %ld greater than max size %d\n", + (long)size, XBC_DATA_MAX); return; } @@@ -452,7 -452,7 +452,7 @@@ msg, pos); } else { xbc_get_info(&ret, NULL); - pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret); + pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret); /* keys starting with "kernel." are passed via cmdline */ extra_command_line = xbc_make_cmdline("kernel"); /* Also, "init." keys are init arguments */ @@@ -471,7 -471,7 +471,7 @@@ static void __init exit_boot_config(voi static void __init setup_boot_config(void) { /* Remove bootconfig data from initrd */ - get_boot_config_from_initrd(NULL, NULL); + get_boot_config_from_initrd(NULL); } static int __init warn_bootconfig(char *str) @@@ -688,7 -688,7 +688,7 @@@ noinline void __ref rest_init(void * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - pid = kernel_thread(kernel_init, NULL, CLONE_FS); + pid = user_mode_thread(kernel_init, NULL, CLONE_FS); /* * Pin init on the boot CPU. Task migration is not properly working * until sched_init_smp() has been run. 
It will set the allowed @@@ -1035,18 -1035,21 +1035,18 @@@ asmlinkage __visible void __init __no_s softirq_init(); timekeeping_init(); kfence_init(); + time_init(); /* * For best initial stack canary entropy, prepare it after: * - setup_arch() for any UEFI RNG entropy and boot cmdline access - * - timekeeping_init() for ktime entropy used in rand_initialize() - * - rand_initialize() to get any arch-specific entropy like RDRAND - * - add_latent_entropy() to get any latent entropy - * - adding command line entropy + * - timekeeping_init() for ktime entropy used in random_init() + * - time_init() for making random_get_entropy() work on some platforms + * - random_init() to initialize the RNG from from early entropy sources */ - rand_initialize(); - add_latent_entropy(); - add_device_randomness(command_line, strlen(command_line)); + random_init(command_line); boot_init_stack_canary(); - time_init(); perf_event_init(); profile_init(); call_function_init(); diff --combined kernel/fork.c index 124829ed0163,35645f57bd2f..9d44f2d46c69 --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -612,7 -612,9 +612,7 @@@ static __latent_entropy int dup_mmap(st retval = ksm_fork(mm, oldmm); if (retval) goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; + khugepaged_fork(mm, oldmm); prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@@ -790,7 -792,6 +790,7 @@@ void __mmdrop(struct mm_struct *mm mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); + mm_pasid_drop(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@@ -1044,11 -1045,6 +1044,11 @@@ static struct task_struct *dup_task_str #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + +#ifdef CONFIG_CPU_SUP_INTEL + tsk->reported_split_lock = 0; +#endif + return tsk; free_stack: @@@ -1194,6 -1190,7 +1194,6 @@@ static inline void __mmput(struct mm_st } if (mm->binfmt) module_put(mm->binfmt->module); - mm_pasid_drop(mm); mmdrop(mm); } @@@ -1982,7 -1979,7 +1982,7 @@@ static __latent_entropy struct task_str struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; - u64 clone_flags = args->flags; + const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* @@@ -2071,6 -2068,9 +2071,9 @@@ p = dup_task_struct(current, node); if (!p) goto fork_out; + p->flags &= ~PF_KTHREAD; + if (args->kthread) + p->flags |= PF_KTHREAD; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@@ -2160,7 -2160,7 +2163,7 @@@ p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); - if (p->flags & PF_KTHREAD) { + if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } @@@ -2243,7 -2243,7 +2246,7 @@@ retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); + retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; @@@ -2547,11 -2547,21 +2550,21 @@@ static inline void init_idle_pids(struc } } + static int idle_dummy(void *dummy) + { + /* This function is never called */ + return 0; + } + struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { - .flags = CLONE_VM, + .flags = CLONE_VM, + .fn = &idle_dummy, + .fn_arg = NULL, + .kthread = 1, + .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); @@@ -2582,8 -2592,8 +2595,8 @@@ struct task_struct *create_io_thread(in .flags = ((lower_32_bits(flags) | CLONE_VM | 
CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, .io_thread = 1, }; @@@ -2687,8 -2697,25 +2700,25 @@@ pid_t kernel_thread(int (*fn)(void *), .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, + .kthread = 1, + }; + + return kernel_clone(&args); + } + + /* + * Create a user mode thread. + */ + pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) + { + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .fn = fn, + .fn_arg = arg, }; return kernel_clone(&args); diff --combined kernel/sched/fair.c index 8c5b74f66bd3,db6f0df9d43e..77b2048a9326 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@@ -36,7 -36,6 +36,7 @@@ #include #include #include +#include #include #include @@@ -174,37 -173,7 +174,37 @@@ int __weak arch_asym_cpu_priority(int c * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_fair_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif + {} +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) @@@ -344,6 -313,19 +344,6 @@@ const struct sched_class fair_sched_cla #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@@ -511,6 -493,12 +511,6 @@@ static int se_is_idle(struct sched_enti #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; @@@ -2927,7 -2915,7 +2927,7 @@@ static void task_tick_numa(struct rq *r /* * We don't care about NUMA placement if we don't have memory. 
*/ - if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@@ -3841,11 -3829,11 +3841,11 @@@ static void attach_entity_load_avg(stru se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; @@@ -4858,11 -4846,11 +4858,11 @@@ static int tg_unthrottle_up(struct task cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } @@@ -4876,7 -4864,7 +4876,7 @@@ static int tg_throttle_down(struct task /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@@ -5320,7 -5308,7 +5320,7 @@@ static void sync_throttle(struct task_g pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@@ -6556,19 -6544,108 +6556,19 @@@ static int select_idle_sibling(struct t } /* - * cpu_util_without: compute cpu utilization without any contributions from *p - * @cpu: the CPU which utilization is requested - * @p: the task which utilization should be discounted - * - * The utilization of a CPU is defined by the utilization of tasks currently - * enqueued on that CPU as well as tasks which are currently sleeping after an - * execution on that CPU. - * - * This method returns the utilization of the specified CPU by discounting the - * utilization of the specified task, whenever the task is currently - * contributing to the CPU utilization. 
- */ -static unsigned long cpu_util_without(int cpu, struct task_struct *p) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util_cfs(cpu); - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); - - /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. - */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); - - /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. - */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); - - util = max(util, estimated); - } - - /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * cpu_util. - */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - -/* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu + * (@dst_cpu = -1) or migrated to @dst_cpu. */ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. 
*/ if (task_cpu(p) == cpu && dst_cpu != cpu) lsub_positive(&util, task_util(p)); @@@ -6576,40 -6653,16 +6576,40 @@@ util += task_util(p); if (sched_feat(UTIL_EST)) { + unsigned long util_est; + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util after the task has been enqueued. + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. */ if (dst_cpu == cpu) util_est += _task_util_est(p); + else if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); util = max(util, util_est); } @@@ -6617,28 -6670,6 +6617,28 @@@ return min(util, capacity_orig_of(cpu)); } +/* + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. + * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. + */ +static unsigned long cpu_util_without(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_util_cfs(cpu); + + return cpu_util_next(cpu, p, -1); +} + /* * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization @@@ -9429,6 -9460,8 +9429,6 @@@ static inline void calculate_imbalance( local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / - sds->total_capacity; /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks. 
@@@ -9437,9 -9470,6 +9437,9 @@@ env->imbalance = 0; return; } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* @@@ -9465,7 -9495,7 +9465,7 @@@ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load @@@ -11851,3 -11881,101 +11851,3 @@@ __init void init_sched_fair_class(void #endif /* SMP */ } - -/* - * Helper functions to facilitate extracting info from tracepoints. - */ - -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; - } - - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); - -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); - -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); - -int sched_trace_rq_cpu_capacity(struct rq *rq) -{ - return rq ? -#ifdef CONFIG_SMP - rq->cpu_capacity -#else - SCHED_CAPACITY_SCALE -#endif - : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rd_span); - -int sched_trace_rq_nr_running(struct rq *rq) -{ - return rq ? rq->nr_running : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
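
The series above hinges on one interface change that the pull message only sketches: copy_thread() now receives a struct kernel_clone_args, each architecture selects the kernel-thread path by testing args->fn rather than PF_KTHREAD/PF_IO_WORKER on the child, and kernel_thread() differs from the new user_mode_thread() only in setting .kthread. The stand-alone C program below is a minimal sketch of that dispatch using invented, simplified types; clone_args_sketch, copy_thread_sketch and demo_fn are illustrative names, not kernel APIs.

#include <stdio.h>

/* Simplified stand-in for the kernel's struct kernel_clone_args. */
struct clone_args_sketch {
	unsigned long flags;
	int (*fn)(void *);   /* in-kernel entry point; NULL for a user fork */
	void *fn_arg;        /* argument passed to fn */
	unsigned long stack; /* user stack pointer when fn is NULL */
	unsigned long tls;
	int kthread;         /* set by kernel_thread(), not by user_mode_thread() */
	int idle;            /* set only when forking the idle task */
};

/* Sketch of the dispatch each architecture's copy_thread() now performs. */
static void copy_thread_sketch(const struct clone_args_sketch *args)
{
	if (args->fn) {
		/* Kernel-managed thread: wire fn/fn_arg into the switch frame. */
		printf("kernel-style thread: kthread=%d idle=%d\n",
		       args->kthread, args->idle);
	} else {
		/* Ordinary user fork/clone: copy registers, honour stack and TLS. */
		printf("user fork: stack=%#lx tls=%#lx\n", args->stack, args->tls);
	}
}

static int demo_fn(void *arg)
{
	(void)arg;
	return 0;
}

int main(void)
{
	/* kernel_thread(): fn set and .kthread set, so the child keeps PF_KTHREAD. */
	struct clone_args_sketch kth  = { .fn = demo_fn, .kthread = 1 };
	/* user_mode_thread() (init, umh): fn set but no .kthread -> ordinary user task. */
	struct clone_args_sketch umh  = { .fn = demo_fn };
	/* fork()/clone() from user space: no fn at all. */
	struct clone_args_sketch user = { .stack = 0x7ffc0000, .tls = 0 };

	copy_thread_sketch(&kth);
	copy_thread_sketch(&umh);
	copy_thread_sketch(&user);
	return 0;
}

In the real patches the same args->fn test also feeds fpu_clone() on x86 (the new "minimal" argument), and fork_idle() marks the idle task explicitly via .idle instead of relying on a NULL stack pointer, as the ia64 and parisc hunks show.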