From: Linus Torvalds
Date: Fri, 3 Jun 2022 23:03:05 +0000 (-0700)
Subject: Merge tag 'kthread-cleanups-for-v5.19' of git://git.kernel.org/pub/scm/linux/kernel...
X-Git-Tag: v5.19-rc1~30
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/1ec6574a3c0a22c130c08e8c36c825cb87d68f8e?hp=-c

Merge tag 'kthread-cleanups-for-v5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull kthread updates from Eric Biederman:
 "This updates init and user mode helper tasks to be ordinary user mode
  tasks.

  Commit 40966e316f86 ("kthread: Ensure struct kthread is present for
  all kthreads") caused init and the user mode helper threads that call
  kernel_execve to have struct kthread allocated for them. This struct
  kthread going away during execve in turn made a use after free of
  struct kthread possible.

  Here, commit 343f4c49f243 ("kthread: Don't allocate kthread_struct
  for init and umh") is enough to fix the use after free and is simple
  enough to be backportable.

  The rest of the changes pass struct kernel_clone_args to clean things
  up and cause the code to make sense.

  In making init and the user mode helper tasks purely user mode tasks
  I ran into two complications. The function task_tick_numa was
  detecting tasks without an mm by testing for the presence of
  PF_KTHREAD. The initramfs code in populate_initrd_image was using
  flush_delayed_fput to ensure the closing of all its file descriptors
  was complete, and flush_delayed_fput does not work in a userspace
  thread.

  I have looked and looked for more complications and in my code review
  I have not found any, and neither has anyone else with the code
  sitting in linux-next"

* tag 'kthread-cleanups-for-v5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  sched: Update task_tick_numa to ignore tasks without an mm
  fork: Stop allowing kthreads to call execve
  fork: Explicitly set PF_KTHREAD
  init: Deal with the init process being a user mode process
  fork: Generalize PF_IO_WORKER handling
  fork: Explicity test for idle tasks in copy_thread
  fork: Pass struct kernel_clone_args into copy_thread
  kthread: Don't allocate kthread_struct for init and umh
--- 1ec6574a3c0a22c130c08e8c36c825cb87d68f8e
diff --combined arch/arm64/kernel/process.c
index 2f42123e059f,d0ef05c661b0..92bcc1768f0b
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@@ -111,7 -111,8 +111,7 @@@ void machine_power_off(void { local_irq_disable(); smp_send_stop(); - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); } /*
@@@ -249,8 -250,6 +249,8 @@@ void show_regs(struct pt_regs *regs static void tls_thread_flush(void) { write_sysreg(0, tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(0, SYS_TPIDR2_EL0); if (is_compat_task()) { current->thread.uw.tp_value = 0;
@@@ -299,42 -298,16 +299,42 @@@ int arch_dup_task_struct(struct task_st /* * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's sve_state + * get erroneously used or freed prematurely. dst's copies * will be allocated on demand later on if dst uses SVE. * For consistency, also clear TIF_SVE here: this could be done * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF_SVE and sve_state in + * maintainers it is best not to leave TIF flags and buffers in + * an inconsistent state, even temporarily. 
*/ dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + /* + * In the unlikely event that we create a new thread with ZA + * enabled we should retain the ZA state so duplicate it here. + * This may be shortly freed if we exec() or if CLONE_SETTLS + * but it's simpler to do it here. To avoid confusing the rest + * of the code ensure that we have a sve_state allocated + * whenever za_state is allocated. + */ + if (thread_za_enabled(&src->thread)) { + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + dst->thread.za_state = kmemdup(src->thread.za_state, + za_state_size(src), + GFP_KERNEL); + if (!dst->thread.za_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + } else { + dst->thread.za_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + } + /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@@ -343,9 -316,11 +343,11 @@@ asmlinkage void ret_from_fork(void) asm("ret_from_fork"); - int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long stack_start = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@@ -361,7 -336,7 +363,7 @@@ ptrauth_thread_init_kernel(p); - if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) { + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->regs[0] = 0; @@@ -370,8 -345,6 +372,8 @@@ * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2()) + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (stack_start) { if (is_compat_thread(task_thread_info(p))) @@@ -382,12 -355,10 +384,12 @@@ /* * If a TLS pointer was passed to clone, use it for the new - * thread. + * thread. We also reset TPIDR2 if it's in use. */ - if (clone_flags & CLONE_SETTLS) + if (clone_flags & CLONE_SETTLS) { p->thread.uw.tp_value = tls; + p->thread.tpidr2_el0 = 0; + } } else { /* * A kthread has no context to ERET to, so ensure any buggy @@@ -399,8 -370,8 +401,8 @@@ memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; - p->thread.cpu_context.x19 = stack_start; - p->thread.cpu_context.x20 = stk_sz; + p->thread.cpu_context.x19 = (unsigned long)args->fn; + p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; @@@ -418,8 -389,6 +420,8 @@@ void tls_preserve_current_state(void) { *task_user_tls(current) = read_sysreg(tpidr_el0); + if (system_supports_tpidr2() && !is_compat_task()) + current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); } static void tls_thread_switch(struct task_struct *next) @@@ -432,8 -401,6 +434,8 @@@ write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0); + if (system_supports_tpidr2()) + write_sysreg_s(next->thread.tpidr2_el0, SYS_TPIDR2_EL0); } /* diff --combined arch/csky/kernel/process.c index 5de04707aa07,9af49aea1c3b..eedddb155669 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@@ -2,6 -2,7 +2,6 @@@ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. 
#include -#include #include #include #include @@@ -29,12 -30,11 +29,11 @@@ asmlinkage void ret_from_kernel_thread( */ void flush_thread(void){} - int copy_thread(unsigned long clone_flags, - unsigned long usp, - unsigned long kthread_arg, - struct task_struct *p, - unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct switch_stack *childstack; struct pt_regs *childregs = task_pt_regs(p); @@@ -48,11 -48,11 +47,11 @@@ /* setup thread.sp for switch_to !!! */ p->thread.sp = (unsigned long)childstack; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); childstack->r15 = (unsigned long) ret_from_kernel_thread; - childstack->r10 = kthread_arg; - childstack->r9 = usp; + childstack->r10 = (unsigned long) args->fn_arg; + childstack->r9 = (unsigned long) args->fn; childregs->sr = mfcr("psr"); } else { *childregs = *(current_pt_regs()); diff --combined arch/ia64/kernel/process.c index 89025e3b3f61,167b1765bea1..416305e550e2 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@@ -19,7 -19,6 +19,7 @@@ #include #include #include +#include #include #include #include @@@ -296,9 -295,12 +296,12 @@@ ia64_load_extra (struct task_struct *ta * so there is nothing to worry about. */ int - copy_thread(unsigned long clone_flags, unsigned long user_stack_base, - unsigned long user_stack_size, struct task_struct *p, unsigned long tls) + copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long user_stack_base = args->stack; + unsigned long user_stack_size = args->stack_size; + unsigned long tls = args->tls; extern char ia64_ret_from_clone; struct switch_stack *child_stack, *stack; unsigned long rbs, child_rbs, rbs_size; @@@ -339,14 -341,14 +342,14 @@@ ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { - if (unlikely(!user_stack_base)) { + if (unlikely(args->fn)) { + if (unlikely(args->idle)) { /* fork_idle() called us */ return 0; } memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack)); - child_stack->r4 = user_stack_base; /* payload */ - child_stack->r5 = user_stack_size; /* argument */ + child_stack->r4 = (unsigned long) args->fn; + child_stack->r5 = (unsigned long) args->fn_arg; /* * Preserve PSR bits, except for bits 32-34 and 37-45, * which we can't read. 
@@@ -600,7 -602,8 +603,7 @@@ machine_halt (void void machine_power_off (void) { - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); machine_halt(); } diff --combined arch/m68k/kernel/process.c index e160a7c57bd3,221feb0269f1..2cb4a61bcfac --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@@ -67,11 -67,12 +67,11 @@@ void machine_halt(void void machine_power_off(void) { - if (mach_power_off) - mach_power_off(); + do_kernel_power_off(); for (;;); } -void (*pm_power_off)(void) = machine_power_off; +void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); void show_regs(struct pt_regs * regs) @@@ -137,9 -138,11 +137,11 @@@ asmlinkage int m68k_clone3(struct pt_re return sys_clone3((struct clone_args __user *)regs->d1, regs->d2); } - int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct fork_frame { struct switch_stack sw; struct pt_regs regs; @@@ -156,12 -159,12 +158,12 @@@ */ p->thread.fc = USER_DATA; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(frame, 0, sizeof(struct fork_frame)); frame->regs.sr = PS_S; - frame->sw.a3 = usp; /* function */ - frame->sw.d7 = arg; + frame->sw.a3 = (unsigned long)args->fn; + frame->sw.d7 = (unsigned long)args->fn_arg; frame->sw.retpc = (unsigned long)ret_from_kernel_thread; p->thread.usp = 0; return 0; diff --combined arch/openrisc/kernel/process.c index 1d4c0921aafa,d9697cc9bc4d..52dc983ddeba --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@@ -52,8 -52,6 +52,8 @@@ void machine_restart(char *cmd { do_kernel_restart(cmd); + __asm__("l.nop 13"); + /* Give a grace period for failure to restart of 1s */ mdelay(1000); @@@ -62,16 -60,6 +62,16 @@@ while (1); } +/* + * This is used if pm_power_off has not been set by a power management + * driver, in this case we can assume we are on a simulator. On + * OpenRISC simulators l.nop 1 will trigger the simulator exit. + */ +static void default_power_off(void) +{ + __asm__("l.nop 1"); +} + /* * Similar to machine_power_off, but don't shut off power. Add code * here to freeze the system for e.g. 
post-mortem debug purpose when @@@ -87,10 -75,7 +87,10 @@@ void machine_halt(void void machine_power_off(void) { printk(KERN_INFO "*** MACHINE POWER OFF ***\n"); - __asm__("l.nop 1"); + if (pm_power_off != NULL) + pm_power_off(); + else + default_power_off(); } /* @@@ -104,7 -89,7 +104,7 @@@ void arch_cpu_idle(void mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME); } -void (*pm_power_off) (void) = machine_power_off; +void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); /* @@@ -167,9 -152,11 +167,11 @@@ extern asmlinkage void ret_from_fork(vo */ int - copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) + copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct pt_regs *userregs; struct pt_regs *kregs; unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; @@@ -187,10 -174,10 +189,10 @@@ sp -= sizeof(struct pt_regs); kregs = (struct pt_regs *)sp; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(kregs, 0, sizeof(struct pt_regs)); - kregs->gpr[20] = usp; /* fn, kernel thread */ - kregs->gpr[22] = arg; + kregs->gpr[20] = (unsigned long)args->fn; + kregs->gpr[22] = (unsigned long)args->fn_arg; } else { *userregs = *current_pt_regs(); diff --combined arch/parisc/kernel/process.c index d145184696ea,a6a2a558fc5b..7c37e09c92da --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@@ -26,7 -26,6 +26,7 @@@ #include #include #include +#include #include #include #include @@@ -117,7 -116,8 +117,7 @@@ void machine_power_off(void pdc_chassis_send_status(PDC_CHASSIS_DIRECT_SHUTDOWN); /* ipmi_poweroff may have been installed. */ - if (pm_power_off) - pm_power_off(); + do_kernel_power_off(); /* It seems we have no way to power the system off via * software. The user has to press the button himself. */ @@@ -206,9 -206,11 +206,11 @@@ arch_initcall(parisc_idle_init) * Copy architecture-specific thread state */ int - copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, unsigned long tls) + copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct pt_regs *cregs = &(p->thread.regs); void *stack = task_stack_page(p); @@@ -218,10 -220,10 +220,10 @@@ extern void * const ret_from_kernel_thread; extern void * const child_return; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(cregs, 0, sizeof(struct pt_regs)); - if (!usp) /* idle thread */ + if (args->idle) /* idle thread */ return 0; /* Must exit via ret_from_kernel_thread in order * to call schedule_tail() @@@ -233,12 -235,12 +235,12 @@@ * ret_from_kernel_thread. */ #ifdef CONFIG_64BIT - cregs->gr[27] = ((unsigned long *)usp)[3]; - cregs->gr[26] = ((unsigned long *)usp)[2]; + cregs->gr[27] = ((unsigned long *)args->fn)[3]; + cregs->gr[26] = ((unsigned long *)args->fn)[2]; #else - cregs->gr[26] = usp; + cregs->gr[26] = (unsigned long) args->fn; #endif - cregs->gr[25] = kthread_arg; + cregs->gr[25] = (unsigned long) args->fn_arg; } else { /* user thread */ /* usp must be word aligned. 
This also prevents users from diff --combined arch/powerpc/kernel/process.c index d00b20c65966,4f367bb68906..b62046bf3bb8 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@@ -34,8 -34,10 +34,8 @@@ #include #include #include -#include #include #include -#include #include #include @@@ -43,6 -45,7 +43,6 @@@ #include #include #include -#include #include #include #include @@@ -304,7 -307,7 +304,7 @@@ static void __giveup_vsx(struct task_st unsigned long msr = tsk->thread.regs->msr; /* - * We should never be ssetting MSR_VSX without also setting + * We should never be setting MSR_VSX without also setting * MSR_FP and MSR_VEC */ WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC))); @@@ -642,7 -645,7 +642,7 @@@ static void do_break_handler(struct pt_ return; } - /* Otherwise findout which DAWR caused exception and disable it. */ + /* Otherwise find out which DAWR caused exception and disable it. */ wp_get_instr_detail(regs, &instr, &type, &size, &ea); for (i = 0; i < nr_wp_slots(); i++) { @@@ -1713,10 -1716,11 +1713,11 @@@ static void setup_ksp_vsid(struct task_ /* * Copy architecture-specific thread state */ - int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p, - unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); extern void ret_from_fork_scv(void); @@@ -1733,18 -1737,18 +1734,18 @@@ /* Copy registers */ sp -= sizeof(struct pt_regs); childregs = (struct pt_regs *) sp; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + sizeof(struct pt_regs); /* function */ - if (usp) - childregs->gpr[14] = ppc_function_entry((void *)usp); + if (args->fn) + childregs->gpr[14] = ppc_function_entry((void *)args->fn); #ifdef CONFIG_PPC64 clear_tsk_thread_flag(p, TIF_32BIT); childregs->softe = IRQS_ENABLED; #endif - childregs->gpr[15] = kthread_arg; + childregs->gpr[15] = (unsigned long)args->fn_arg; p->thread.regs = NULL; /* no user register state */ ti->flags |= _TIF_RESTOREALL; f = ret_from_kernel_thread; @@@ -2310,3 -2314,42 +2311,3 @@@ unsigned long arch_align_stack(unsigne sp -= get_random_int() & ~PAGE_MASK; return sp & ~0xf; } - -static inline unsigned long brk_rnd(void) -{ - unsigned long rnd = 0; - - /* 8MB for 32bit, 1GB for 64bit */ - if (is_32bit_task()) - rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT))); - else - rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT))); - - return rnd << PAGE_SHIFT; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long base = mm->brk; - unsigned long ret; - -#ifdef CONFIG_PPC_BOOK3S_64 - /* - * If we are using 1TB segments and we are allowed to randomise - * the heap, we can put it above 1TB so it is backed by a 1TB - * segment. Otherwise the heap will be in the bottom 1TB - * which always uses 256MB segments and this may result in a - * performance penalty. 
- */ - if (!radix_enabled() && !is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) - base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); -#endif - - ret = PAGE_ALIGN(base + brk_rnd()); - - if (ret < mm->brk) - return mm->brk; - - return ret; -} - diff --combined arch/riscv/kernel/process.c index 1c7be865ab31,24efabdbc551..ceb9ebab6558 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@@ -84,34 -84,6 +84,34 @@@ void show_regs(struct pt_regs *regs dump_backtrace(regs, NULL, KERN_DEFAULT); } +#ifdef CONFIG_COMPAT +static bool compat_mode_supported __read_mostly; + +bool compat_elf_check_arch(Elf32_Ehdr *hdr) +{ + return compat_mode_supported && + hdr->e_machine == EM_RISCV && + hdr->e_ident[EI_CLASS] == ELFCLASS32; +} + +static int __init compat_mode_detect(void) +{ + unsigned long tmp = csr_read(CSR_STATUS); + + csr_write(CSR_STATUS, (tmp & ~SR_UXL) | SR_UXL_32); + compat_mode_supported = + (csr_read(CSR_STATUS) & SR_UXL) == SR_UXL_32; + + csr_write(CSR_STATUS, tmp); + + pr_info("riscv: ELF compat mode %s", + compat_mode_supported ? "supported" : "failed"); + + return 0; +} +early_initcall(compat_mode_detect); +#endif + void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { @@@ -126,15 -98,6 +126,15 @@@ } regs->epc = pc; regs->sp = sp; + +#ifdef CONFIG_64BIT + regs->status &= ~SR_UXL; + + if (is_compat_task()) + regs->status |= SR_UXL_32; + else + regs->status |= SR_UXL_64; +#endif } void flush_thread(void) @@@ -157,13 -120,15 +157,15 @@@ int arch_dup_task_struct(struct task_st return 0; } - int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); /* p->thread holds context to be restored by __switch_to() */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp_in_global; @@@ -171,8 -136,8 +173,8 @@@ childregs->status = SR_PP | SR_PIE; p->thread.ra = (unsigned long)ret_from_kernel_thread; - p->thread.s[0] = usp; /* fn */ - p->thread.s[1] = arg; + p->thread.s[0] = (unsigned long)args->fn; + p->thread.s[1] = (unsigned long)args->fn_arg; } else { *childregs = *(current_pt_regs()); if (usp) /* User fork */ diff --combined arch/x86/kernel/fpu/core.c index 0fdc807ae13f,fbade5a3975b..0531d6a06df5 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@@ -14,8 -14,6 +14,8 @@@ #include #include +#include + #include #include #include @@@ -43,7 -41,17 +43,7 @@@ struct fpu_state_config fpu_user_cfg __ */ struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. - * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@@ -51,37 -59,42 +51,37 @@@ */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) -{ - return this_cpu_read(in_kernel_fpu); -} - -static bool interrupted_kernel_fpu_idle(void) -{ - return !kernel_fpu_disabled(); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? 
- * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. */ bool irq_fpu_usable(void) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); @@@ -234,20 -247,7 +234,20 @@@ bool fpu_alloc_guest_fpstate(struct fpu gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_user_cfg.default_features; gfpu->perm = fpu_user_cfg.default_features; - gfpu->uabi_size = fpu_user_cfg.default_size; + + /* + * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state + * to userspace, even when XSAVE is unsupported, so that restoring FPU + * state on a different CPU that does support XSAVE can cleanly load + * the incoming state using its natural XSAVE. In other words, KVM's + * uABI size may be larger than this host's default size. Conversely, + * the default size should never be larger than KVM's base uABI size; + * all features that can expand the uABI size must be opt-in. + */ + gfpu->uabi_size = sizeof(struct kvm_xsave); + if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) + gfpu->uabi_size = fpu_user_cfg.default_size; + fpu_init_guest_permissions(gfpu); return true; @@@ -556,7 -556,7 +556,7 @@@ static inline void fpu_inherit_perms(st } /* Clone current's FPU state on fork */ - int fpu_clone(struct task_struct *dst, unsigned long clone_flags) + int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@@ -579,7 -579,7 +579,7 @@@ * No FPU state inheritance for kernel threads and IO * worker threads. 
*/ - if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); diff --combined arch/x86/kernel/process.c index 58fb48d3004f,d20eaad52a85..9b2772b7e1f3 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@@ -46,7 -46,6 +46,7 @@@ #include #include #include +#include #include "process.h" @@@ -131,9 -130,11 +131,11 @@@ static int set_new_tls(struct task_stru return do_set_thread_area_64(p, ARCH_SET_FS, tls); } - int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, - struct task_struct *p, unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; @@@ -161,7 -162,6 +163,7 @@@ savesegment(ds, p->thread.ds); #else p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain @@@ -171,13 -171,13 +173,13 @@@ frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags); + fpu_clone(p, clone_flags, args->fn); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@@ -193,10 -193,14 +195,10 @@@ if (sp) childregs->sp = sp; - if (unlikely(p->flags & PF_IO_WORKER)) { -#ifdef CONFIG_X86_32 - task_user_gs(p) = get_user_gs(current_pt_regs()); -#endif - + if (unlikely(args->fn)) { /* - * An IO thread is a user space thread, but it doesn't - * return to ret_after_fork(). + * A user space thread, but it doesn't return to + * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. 
@@@ -206,7 -210,7 +208,7 @@@ */ childregs->sp = 0; childregs->ip = 0; - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@@ -332,7 -336,7 +334,7 @@@ static int get_cpuid_mode(void return !test_thread_flag(TIF_NOCPUID); } -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; @@@ -403,7 -407,7 +405,7 @@@ static void tss_copy_io_bitmap(struct t } /** - * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { @@@ -684,6 -688,9 +686,6 @@@ void __switch_to_xtra(struct task_struc /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } - - if ((tifp ^ tifn) & _TIF_SLD) - switch_to_sld(tifn); } /* @@@ -868,9 -875,6 +870,9 @@@ void select_idle_routine(const struct c } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } @@@ -983,19 -987,20 +985,19 @@@ unsigned long __get_wchan(struct task_s return addr; } -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2) +long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, arg2); + return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: - return fpu_xstate_prctl(task, option, arg2); + return fpu_xstate_prctl(option, arg2); } return -EINVAL; diff --combined arch/xtensa/kernel/process.c index 7e38292dd07a,c3751cc88e5d..68e0e2f06d66 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@@ -47,7 -47,6 +47,7 @@@ #include #include #include +#include extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); @@@ -64,114 -63,52 +64,114 @@@ EXPORT_SYMBOL(__stack_chk_guard) #if XTENSA_HAVE_COPROCESSORS -void coprocessor_release_all(struct thread_info *ti) +void local_coprocessors_flush_release_all(void) { - unsigned long cpenable; - int i; + struct thread_info **coprocessor_owner; + struct thread_info *unique_owner[XCHAL_CP_MAX]; + int n = 0; + int i, j; - /* Make sure we don't switch tasks during this operation. */ + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + xtensa_set_sr(XCHAL_CP_MASK, cpenable); - preempt_disable(); + for (i = 0; i < XCHAL_CP_MAX; i++) { + struct thread_info *ti = coprocessor_owner[i]; - /* Walk through all cp owners and release it for the requested one. 
*/ + if (ti) { + coprocessor_flush(ti, i); - cpenable = ti->cpenable; + for (j = 0; j < n; j++) + if (unique_owner[j] == ti) + break; + if (j == n) + unique_owner[n++] = ti; - for (i = 0; i < XCHAL_CP_MAX; i++) { - if (coprocessor_owner[i] == ti) { - coprocessor_owner[i] = 0; - cpenable &= ~(1 << i); + coprocessor_owner[i] = NULL; } } + for (i = 0; i < n; i++) { + /* pairs with memw (1) in fast_coprocessor and memw in switch_to */ + smp_wmb(); + unique_owner[i]->cpenable = 0; + } + xtensa_set_sr(0, cpenable); +} - ti->cpenable = cpenable; +static void local_coprocessor_release_all(void *info) +{ + struct thread_info *ti = info; + struct thread_info **coprocessor_owner; + int i; + + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + + /* Walk through all cp owners and release it for the requested one. */ + + for (i = 0; i < XCHAL_CP_MAX; i++) { + if (coprocessor_owner[i] == ti) + coprocessor_owner[i] = NULL; + } + /* pairs with memw (1) in fast_coprocessor and memw in switch_to */ + smp_wmb(); + ti->cpenable = 0; if (ti == current_thread_info()) xtensa_set_sr(0, cpenable); +} - preempt_enable(); +void coprocessor_release_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_release_all, + ti, true); + } } -void coprocessor_flush_all(struct thread_info *ti) +static void local_coprocessor_flush_all(void *info) { - unsigned long cpenable, old_cpenable; + struct thread_info *ti = info; + struct thread_info **coprocessor_owner; + unsigned long old_cpenable; int i; - preempt_disable(); - - old_cpenable = xtensa_get_sr(cpenable); - cpenable = ti->cpenable; - xtensa_set_sr(cpenable, cpenable); + coprocessor_owner = this_cpu_ptr(&exc_table)->coprocessor_owner; + old_cpenable = xtensa_xsr(ti->cpenable, cpenable); for (i = 0; i < XCHAL_CP_MAX; i++) { - if ((cpenable & 1) != 0 && coprocessor_owner[i] == ti) + if (coprocessor_owner[i] == ti) coprocessor_flush(ti, i); - cpenable >>= 1; } xtensa_set_sr(old_cpenable, cpenable); +} + +void coprocessor_flush_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_flush_all, + ti, true); + } +} + +static void local_coprocessor_flush_release_all(void *info) +{ + local_coprocessor_flush_all(info); + local_coprocessor_release_all(info); +} - preempt_enable(); +void coprocessor_flush_release_all(struct thread_info *ti) +{ + if (ti->cpenable) { + /* pairs with memw (2) in fast_coprocessor */ + smp_rmb(); + smp_call_function_single(ti->cp_owner_cpu, + local_coprocessor_flush_release_all, + ti, true); + } } #endif @@@ -203,7 -140,8 +203,7 @@@ void flush_thread(void { #if XTENSA_HAVE_COPROCESSORS struct thread_info *ti = current_thread_info(); - coprocessor_flush_all(ti); - coprocessor_release_all(ti); + coprocessor_flush_release_all(ti); #endif flush_ptrace_hw_breakpoint(current); } @@@ -263,10 -201,11 +263,11 @@@ int arch_dup_task_struct(struct task_st * involved. Much simpler to just not copy those live frames across. 
*/ - int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, - unsigned long thread_fn_arg, struct task_struct *p, - unsigned long tls) + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long usp_thread_fn = args->stack; + unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); #if (XTENSA_HAVE_COPROCESSORS || XTENSA_HAVE_IO_PORTS) @@@ -286,7 -225,7 +287,7 @@@ #error Unsupported Xtensa ABI #endif - if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (!args->fn) { struct pt_regs *regs = current_pt_regs(); unsigned long usp = usp_thread_fn ? usp_thread_fn : regs->areg[1]; @@@ -338,15 -277,15 +339,15 @@@ * Window underflow will load registers from the * spill slots on the stack on return from _switch_to. */ - SPILL_SLOT(childregs, 2) = usp_thread_fn; - SPILL_SLOT(childregs, 3) = thread_fn_arg; + SPILL_SLOT(childregs, 2) = (unsigned long)args->fn; + SPILL_SLOT(childregs, 3) = (unsigned long)args->fn_arg; #elif defined(__XTENSA_CALL0_ABI__) /* * a12 = thread_fn, a13 = thread_fn arg. * _switch_to epilogue will load registers from the stack. */ - ((unsigned long *)p->thread.sp)[0] = usp_thread_fn; - ((unsigned long *)p->thread.sp)[1] = thread_fn_arg; + ((unsigned long *)p->thread.sp)[0] = (unsigned long)args->fn; + ((unsigned long *)p->thread.sp)[1] = (unsigned long)args->fn_arg; #else #error Unsupported Xtensa ABI #endif diff --combined fs/exec.c index 14b4b3755580,9c5260e74517..0989fb8472a1 --- a/fs/exec.c +++ b/fs/exec.c @@@ -758,7 -758,6 +758,7 @@@ int setup_arg_pages(struct linux_binpr unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; + struct mmu_gather tlb; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@@ -813,11 -812,8 +813,11 @@@ vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; - ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + tlb_gather_mmu(&tlb, mm); + ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); + tlb_finish_mmu(&tlb); + if (ret) goto out_unlock; BUG_ON(prev != vma); @@@ -1312,9 -1308,7 +1312,7 @@@ int begin_new_exec(struct linux_binprm if (retval) goto out_unlock; - if (me->flags & PF_KTHREAD) - free_kthread_struct(me); - me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; @@@ -1959,6 -1953,10 +1957,10 @@@ int kernel_execve(const char *kernel_fi int fd = AT_FDCWD; int retval; + /* It is non-sense for kernel threads to call execve */ + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return -EINVAL; + filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); diff --combined init/initramfs.c index dc84cf756cea,41e7857d510d..18229cfe8906 --- a/init/initramfs.c +++ b/init/initramfs.c @@@ -15,13 -15,11 +15,14 @@@ #include #include #include + #include #include -static ssize_t __init xwrite(struct file *file, const char *p, size_t count, - loff_t *pos) +static __initdata bool csum_present; +static __initdata u32 io_csum; + +static ssize_t __init xwrite(struct file *file, const unsigned char *p, + size_t count, loff_t *pos) { ssize_t out = 0; @@@ -36,13 -34,6 +37,13 @@@ } else if (rv == 0) break; + if (csum_present) { + ssize_t i; + + for (i = 0; i < rv; i++) + io_csum += p[i]; + } + p += rv; out += rv; count -= rv; @@@ -126,36 -117,31 +127,36 @@@ static void __init free_hash(void } } -static long 
__init do_utime(char *filename, time64_t mtime) +#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME +static void __init do_utime(char *filename, time64_t mtime) { - struct timespec64 t[2]; + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + init_utimes(filename, t); +} - t[0].tv_sec = mtime; - t[0].tv_nsec = 0; - t[1].tv_sec = mtime; - t[1].tv_nsec = 0; - return init_utimes(filename, t); +static void __init do_utime_path(const struct path *path, time64_t mtime) +{ + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + vfs_utimes(path, t); } static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; - char *name; time64_t mtime; + char name[]; }; static void __init dir_add(const char *name, time64_t mtime) { - struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + size_t nlen = strlen(name) + 1; + struct dir_entry *de; + + de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); - de->name = kstrdup(name, GFP_KERNEL); + strscpy(de->name, name, nlen); de->mtime = mtime; list_add(&de->list, &dir_list); } @@@ -166,15 -152,10 +167,15 @@@ static void __init dir_utime(void list_for_each_entry_safe(de, tmp, &dir_list, list) { list_del(&de->list); do_utime(de->name, de->mtime); - kfree(de->name); kfree(de); } } +#else +static void __init do_utime(char *filename, time64_t mtime) {} +static void __init do_utime_path(const struct path *path, time64_t mtime) {} +static void __init dir_add(const char *name, time64_t mtime) {} +static void __init dir_utime(void) {} +#endif static __initdata time64_t mtime; @@@ -186,16 -167,15 +187,16 @@@ static __initdata unsigned long body_le static __initdata uid_t uid; static __initdata gid_t gid; static __initdata unsigned rdev; +static __initdata u32 hdr_csum; static void __init parse_header(char *s) { - unsigned long parsed[12]; + unsigned long parsed[13]; char buf[9]; int i; buf[8] = '\0'; - for (i = 0, s += 6; i < 12; i++, s += 8) { + for (i = 0, s += 6; i < 13; i++, s += 8) { memcpy(buf, s, 8); parsed[i] = simple_strtoul(buf, NULL, 16); } @@@ -210,7 -190,6 +211,7 @@@ minor = parsed[8]; rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); name_len = parsed[11]; + hdr_csum = parsed[12]; } /* FSM */ @@@ -279,15 -258,12 +280,15 @@@ static int __init do_collect(void static int __init do_header(void) { - if (memcmp(collected, "070707", 6)==0) { - error("incorrect cpio method used: use -H newc option"); - return 1; - } - if (memcmp(collected, "070701", 6)) { - error("no cpio magic"); + if (!memcmp(collected, "070701", 6)) { + csum_present = false; + } else if (!memcmp(collected, "070702", 6)) { + csum_present = true; + } else { + if (memcmp(collected, "070707", 6) == 0) + error("incorrect cpio method used: use -H newc option"); + else + error("no cpio magic"); return 1; } parse_header(collected); @@@ -378,7 -354,6 +379,7 @@@ static int __init do_name(void if (IS_ERR(wfile)) return 0; wfile_pos = 0; + io_csum = 0; vfs_fchown(wfile, uid, gid); vfs_fchmod(wfile, mode); @@@ -406,13 -381,15 +407,13 @@@ static int __init do_copy(void) { if (byte_count >= body_len) { - struct timespec64 t[2] = { }; if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) error("write error"); - t[0].tv_sec = mtime; - t[1].tv_sec = mtime; - vfs_utimes(&wfile->f_path, t); - + do_utime_path(&wfile->f_path, mtime); fput(wfile); + if (csum_present && io_csum != hdr_csum) + error("bad data checksum"); eat(body_len); state = SkipIt; return 
0; @@@ -727,6 -704,7 +728,7 @@@ done initrd_end = 0; flush_delayed_fput(); + task_work_run(); } static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain); diff --combined init/main.c index 02eb533018f6,39baac0211c6..0ee39cdcfcac --- a/init/main.c +++ b/init/main.c @@@ -266,7 -266,7 +266,7 @@@ static int __init loglevel(char *str early_param("loglevel", loglevel); #ifdef CONFIG_BLK_DEV_INITRD -static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) +static void * __init get_boot_config_from_initrd(size_t *_size) { u32 size, csum; char *data; @@@ -300,20 -300,17 +300,20 @@@ found return NULL; } + if (xbc_calc_checksum(data, size) != csum) { + pr_err("bootconfig checksum failed\n"); + return NULL; + } + /* Remove bootconfig from initramfs/initrd */ initrd_end = (unsigned long)data; if (_size) *_size = size; - if (_csum) - *_csum = csum; return data; } #else -static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) +static void * __init get_boot_config_from_initrd(size_t *_size) { return NULL; } @@@ -410,16 -407,14 +410,16 @@@ static int __init warn_bootconfig(char static void __init setup_boot_config(void) { static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; - const char *msg; - int pos; - u32 size, csum; - char *data, *err; - int ret; + const char *msg, *data; + int pos, ret; + size_t size; + char *err; /* Cut out the bootconfig data even if we have no bootconfig option */ - data = get_boot_config_from_initrd(&size, &csum); + data = get_boot_config_from_initrd(&size); + /* If there is no bootconfig in initrd, try embedded one. */ + if (!data) + data = xbc_get_embedded_bootconfig(&size); strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, @@@ -438,8 -433,13 +438,8 @@@ } if (size >= XBC_DATA_MAX) { - pr_err("bootconfig size %d greater than max size %d\n", - size, XBC_DATA_MAX); - return; - } - - if (xbc_calc_checksum(data, size) != csum) { - pr_err("bootconfig checksum failed\n"); + pr_err("bootconfig size %ld greater than max size %d\n", + (long)size, XBC_DATA_MAX); return; } @@@ -452,7 -452,7 +452,7 @@@ msg, pos); } else { xbc_get_info(&ret, NULL); - pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret); + pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret); /* keys starting with "kernel." are passed via cmdline */ extra_command_line = xbc_make_cmdline("kernel"); /* Also, "init." keys are init arguments */ @@@ -471,7 -471,7 +471,7 @@@ static void __init exit_boot_config(voi static void __init setup_boot_config(void) { /* Remove bootconfig data from initrd */ - get_boot_config_from_initrd(NULL, NULL); + get_boot_config_from_initrd(NULL); } static int __init warn_bootconfig(char *str) @@@ -688,7 -688,7 +688,7 @@@ noinline void __ref rest_init(void * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - pid = kernel_thread(kernel_init, NULL, CLONE_FS); + pid = user_mode_thread(kernel_init, NULL, CLONE_FS); /* * Pin init on the boot CPU. Task migration is not properly working * until sched_init_smp() has been run. 
It will set the allowed @@@ -1035,18 -1035,21 +1035,18 @@@ asmlinkage __visible void __init __no_s softirq_init(); timekeeping_init(); kfence_init(); + time_init(); /* * For best initial stack canary entropy, prepare it after: * - setup_arch() for any UEFI RNG entropy and boot cmdline access - * - timekeeping_init() for ktime entropy used in rand_initialize() - * - rand_initialize() to get any arch-specific entropy like RDRAND - * - add_latent_entropy() to get any latent entropy - * - adding command line entropy + * - timekeeping_init() for ktime entropy used in random_init() + * - time_init() for making random_get_entropy() work on some platforms + * - random_init() to initialize the RNG from from early entropy sources */ - rand_initialize(); - add_latent_entropy(); - add_device_randomness(command_line, strlen(command_line)); + random_init(command_line); boot_init_stack_canary(); - time_init(); perf_event_init(); profile_init(); call_function_init(); diff --combined kernel/fork.c index 124829ed0163,35645f57bd2f..9d44f2d46c69 --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -612,7 -612,9 +612,7 @@@ static __latent_entropy int dup_mmap(st retval = ksm_fork(mm, oldmm); if (retval) goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; + khugepaged_fork(mm, oldmm); prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { @@@ -790,7 -792,6 +790,7 @@@ void __mmdrop(struct mm_struct *mm mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); + mm_pasid_drop(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@@ -1044,11 -1045,6 +1044,11 @@@ static struct task_struct *dup_task_str #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif + +#ifdef CONFIG_CPU_SUP_INTEL + tsk->reported_split_lock = 0; +#endif + return tsk; free_stack: @@@ -1194,6 -1190,7 +1194,6 @@@ static inline void __mmput(struct mm_st } if (mm->binfmt) module_put(mm->binfmt->module); - mm_pasid_drop(mm); mmdrop(mm); } @@@ -1982,7 -1979,7 +1982,7 @@@ static __latent_entropy struct task_str struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; - u64 clone_flags = args->flags; + const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* @@@ -2071,6 -2068,9 +2071,9 @@@ p = dup_task_struct(current, node); if (!p) goto fork_out; + p->flags &= ~PF_KTHREAD; + if (args->kthread) + p->flags |= PF_KTHREAD; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@@ -2160,7 -2160,7 +2163,7 @@@ p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); - if (p->flags & PF_KTHREAD) { + if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } @@@ -2243,7 -2243,7 +2246,7 @@@ retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls); + retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; @@@ -2547,11 -2547,21 +2550,21 @@@ static inline void init_idle_pids(struc } } + static int idle_dummy(void *dummy) + { + /* This function is never called */ + return 0; + } + struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { - .flags = CLONE_VM, + .flags = CLONE_VM, + .fn = &idle_dummy, + .fn_arg = NULL, + .kthread = 1, + .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); @@@ -2582,8 -2592,8 +2595,8 @@@ struct task_struct *create_io_thread(in .flags = ((lower_32_bits(flags) | CLONE_VM | 
CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, .io_thread = 1, }; @@@ -2687,8 -2697,25 +2700,25 @@@ pid_t kernel_thread(int (*fn)(void *), .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, + .kthread = 1, + }; + + return kernel_clone(&args); + } + + /* + * Create a user mode thread. + */ + pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) + { + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .fn = fn, + .fn_arg = arg, }; return kernel_clone(&args); diff --combined kernel/sched/fair.c index 8c5b74f66bd3,db6f0df9d43e..77b2048a9326 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@@ -36,7 -36,6 +36,7 @@@ #include #include #include +#include #include #include @@@ -174,37 -173,7 +174,37 @@@ int __weak arch_asym_cpu_priority(int c * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_fair_sysctls[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif + {} +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) @@@ -344,6 -313,19 +344,6 @@@ const struct sched_class fair_sched_cla #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@@ -511,6 -493,12 +511,6 @@@ static int se_is_idle(struct sched_enti #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; @@@ -2927,7 -2915,7 +2927,7 @@@ static void task_tick_numa(struct rq *r /* * We don't care about NUMA placement if we don't have memory. 
*/ - if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@@ -3841,11 -3829,11 +3841,11 @@@ static void attach_entity_load_avg(stru se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; @@@ -4858,11 -4846,11 +4858,11 @@@ static int tg_unthrottle_up(struct task cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } @@@ -4876,7 -4864,7 +4876,7 @@@ static int tg_throttle_down(struct task /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@@ -5320,7 -5308,7 +5320,7 @@@ static void sync_throttle(struct task_g pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@@ -6556,19 -6544,108 +6556,19 @@@ static int select_idle_sibling(struct t } /* - * cpu_util_without: compute cpu utilization without any contributions from *p - * @cpu: the CPU which utilization is requested - * @p: the task which utilization should be discounted - * - * The utilization of a CPU is defined by the utilization of tasks currently - * enqueued on that CPU as well as tasks which are currently sleeping after an - * execution on that CPU. - * - * This method returns the utilization of the specified CPU by discounting the - * utilization of the specified task, whenever the task is currently - * contributing to the CPU utilization. 
- */ -static unsigned long cpu_util_without(int cpu, struct task_struct *p) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util_cfs(cpu); - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); - - /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. - */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); - - /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. - */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); - - util = max(util, estimated); - } - - /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * cpu_util. - */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - -/* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu + * (@dst_cpu = -1) or migrated to @dst_cpu. */ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. 
*/ if (task_cpu(p) == cpu && dst_cpu != cpu) lsub_positive(&util, task_util(p)); @@@ -6576,40 -6653,16 +6576,40 @@@ util += task_util(p); if (sched_feat(UTIL_EST)) { + unsigned long util_est; + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util after the task has been enqueued. + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. */ if (dst_cpu == cpu) util_est += _task_util_est(p); + else if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); util = max(util, util_est); } @@@ -6617,28 -6670,6 +6617,28 @@@ return min(util, capacity_orig_of(cpu)); } +/* + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. + * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. + */ +static unsigned long cpu_util_without(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_util_cfs(cpu); + + return cpu_util_next(cpu, p, -1); +} + /* * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization @@@ -9429,6 -9460,8 +9429,6 @@@ static inline void calculate_imbalance( local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / - sds->total_capacity; /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks. 
@@@ -9437,9 -9470,6 +9437,9 @@@ env->imbalance = 0; return; } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* @@@ -9465,7 -9495,7 +9465,7 @@@ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load @@@ -11851,3 -11881,101 +11851,3 @@@ __init void init_sched_fair_class(void #endif /* SMP */ } - -/* - * Helper functions to facilitate extracting info from tracepoints. - */ - -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; - } - - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); - -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); - -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); - -int sched_trace_rq_cpu_capacity(struct rq *rq) -{ - return rq ? -#ifdef CONFIG_SMP - rq->cpu_capacity -#else - SCHED_CAPACITY_SCALE -#endif - : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rd_span); - -int sched_trace_rq_nr_running(struct rq *rq) -{ - return rq ? rq->nr_running : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
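
The series above hinges on one interface change that the pull message only sketches: copy_thread() now receives a struct kernel_clone_args, each architecture selects the kernel-thread path by testing args->fn rather than PF_KTHREAD/PF_IO_WORKER on the child, and kernel_thread() differs from the new user_mode_thread() only in setting .kthread. The stand-alone C program below is a minimal sketch of that dispatch using invented, simplified types; clone_args_sketch, copy_thread_sketch and demo_fn are illustrative names, not kernel APIs.

#include <stdio.h>

/* Simplified stand-in for the kernel's struct kernel_clone_args. */
struct clone_args_sketch {
	unsigned long flags;
	int (*fn)(void *);   /* in-kernel entry point; NULL for a user fork */
	void *fn_arg;        /* argument passed to fn */
	unsigned long stack; /* user stack pointer when fn is NULL */
	unsigned long tls;
	int kthread;         /* set by kernel_thread(), not by user_mode_thread() */
	int idle;            /* set only when forking the idle task */
};

/* Sketch of the dispatch each architecture's copy_thread() now performs. */
static void copy_thread_sketch(const struct clone_args_sketch *args)
{
	if (args->fn) {
		/* Kernel-managed thread: wire fn/fn_arg into the switch frame. */
		printf("kernel-style thread: kthread=%d idle=%d\n",
		       args->kthread, args->idle);
	} else {
		/* Ordinary user fork/clone: copy registers, honour stack and TLS. */
		printf("user fork: stack=%#lx tls=%#lx\n", args->stack, args->tls);
	}
}

static int demo_fn(void *arg)
{
	(void)arg;
	return 0;
}

int main(void)
{
	/* kernel_thread(): fn set and .kthread set, so the child keeps PF_KTHREAD. */
	struct clone_args_sketch kth  = { .fn = demo_fn, .kthread = 1 };
	/* user_mode_thread() (init, umh): fn set but no .kthread -> ordinary user task. */
	struct clone_args_sketch umh  = { .fn = demo_fn };
	/* fork()/clone() from user space: no fn at all. */
	struct clone_args_sketch user = { .stack = 0x7ffc0000, .tls = 0 };

	copy_thread_sketch(&kth);
	copy_thread_sketch(&umh);
	copy_thread_sketch(&user);
	return 0;
}

In the real patches the same args->fn test also feeds fpu_clone() on x86 (the new "minimal" argument), and fork_idle() marks the idle task explicitly via .idle instead of relying on a NULL stack pointer, as the ia64 and parisc hunks show.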