From: Linus Torvalds Date: Tue, 23 Oct 2018 17:43:04 +0000 (+0100) Subject: Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git... X-Git-Tag: v4.20-rc1~156 X-Git-Url: https://repo.jachan.dev/J-linux.git/commitdiff_plain/d82924c3b8d0607094b94fab290a33c5ad7d586c?hp=-c Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull x86 pti updates from Ingo Molnar: "The main changes: - Make the IBPB barrier more strict and add STIBP support (Jiri Kosina) - Micro-optimize and clean up the entry code (Andy Lutomirski) - ... plus misc other fixes" * 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/speculation: Propagate information about RSB filling mitigation to sysfs x86/speculation: Enable cross-hyperthread spectre v2 STIBP mitigation x86/speculation: Apply IBPB more strictly to avoid cross-process data leak x86/speculation: Add RETPOLINE_AMD support to the inline asm CALL_NOSPEC variant x86/CPU: Fix unused variable warning when !CONFIG_IA32_EMULATION x86/pti/64: Remove the SYSCALL64 entry trampoline x86/entry/64: Use the TSS sp2 slot for SYSCALL/SYSRET scratch space x86/entry/64: Document idtentry --- d82924c3b8d0607094b94fab290a33c5ad7d586c diff --combined arch/x86/entry/entry_64.S index 7c5ce0a6c4d2,0d728142467f..4d7a2d9d44cf --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@@ -142,67 -142,6 +142,6 @@@ END(native_usergs_sysret64 * with them due to bugs in both AMD and Intel CPUs. */ - .pushsection .entry_trampoline, "ax" - - /* - * The code in here gets remapped into cpu_entry_area's trampoline. This means - * that the assembler and linker have the wrong idea as to where this code - * lives (and, in fact, it's mapped more than once, so it's not even at a - * fixed address). So we can't reference any symbols outside the entry - * trampoline and expect it to work. - * - * Instead, we carefully abuse %rip-relative addressing. - * _entry_trampoline(%rip) refers to the start of the remapped) entry - * trampoline. We can thus find cpu_entry_area with this macro: - */ - - #define CPU_ENTRY_AREA \ - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - - /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ - #define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ - SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA - - ENTRY(entry_SYSCALL_64_trampoline) - UNWIND_HINT_EMPTY - swapgs - - /* Stash the user RSP. */ - movq %rsp, RSP_SCRATCH - - /* Note: using %rsp as a scratch reg. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - - /* Load the top of the task stack into RSP */ - movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp - - /* Start building the simulated IRET frame. */ - pushq $__USER_DS /* pt_regs->ss */ - pushq RSP_SCRATCH /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - - /* - * x86 lacks a near absolute jump, and we can't jump to the real - * entry text with a relative jump. We could push the target - * address and then use retq, but this destroys the pipeline on - * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, - * spill RDI and restore it in a second-stage trampoline. - */ - pushq %rdi - movq $entry_SYSCALL_64_stage2, %rdi - JMP_NOSPEC %rdi - END(entry_SYSCALL_64_trampoline) - - .popsection - - ENTRY(entry_SYSCALL_64_stage2) - UNWIND_HINT_EMPTY - popq %rdi - jmp entry_SYSCALL_64_after_hwframe - END(entry_SYSCALL_64_stage2) - ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* @@@ -212,21 -151,19 +151,19 @@@ */ swapgs - /* - * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it - * is not required to switch CR3. - */ - movq %rsp, PER_CPU_VAR(rsp_scratch) + /* tss.sp2 is scratch space. */ + movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ GLOBAL(entry_SYSCALL_64_after_hwframe) - pushq %rax /* pt_regs->orig_ax */ + pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS @@@ -900,6 -837,42 +837,42 @@@ apicinterrupt IRQ_WORK_VECTOR irq_wor */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) + /** + * idtentry - Generate an IDT entry stub + * @sym: Name of the generated entry point + * @do_sym: C function to be called + * @has_error_code: True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from + * kernel mode with user GSBASE and/or user CR3. + * 2 is special -- see below. + * @shift_ist: Set to an IST index if entries from kernel mode should + * decrement the IST stack so that nested entries get a + * fresh stack. (This is for #DB, which has a nasty habit + * of recursing.) + * + * idtentry generates an IDT stub that sets up a usable kernel context, + * creates struct pt_regs, and calls @do_sym. The stub has the following + * special behaviors: + * + * On an entry from user mode, the stub switches from the trampoline or + * IST stack to the normal thread stack. On an exit to user mode, the + * normal exit-to-usermode path is invoked. + * + * On an exit to kernel mode, if @paranoid == 0, we check for preemption, + * whereas we omit the preemption check if @paranoid != 0. This is purely + * because the implementation is simpler this way. The kernel only needs + * to check for asynchronous kernel preemption when IRQ handlers return. + * + * If @paranoid == 0, then the stub will handle IRET faults by pretending + * that the fault came from user mode. It will handle gs_change faults by + * pretending that the fault happened with kernel GSBASE. Since this handling + * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have + * @paranoid == 0. This special handling will do the wrong thing for + * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0. + * + * @paranoid == 2 is special: the stub will never switch stacks. This is for + * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. + */ .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@@ -1050,7 -1023,7 +1023,7 @@@ ENTRY(do_softirq_own_stack ret ENDPROC(do_softirq_own_stack) -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* @@@ -1130,13 -1103,11 +1103,13 @@@ ENTRY(xen_failsafe_callback ENCODE_FRAME_POINTER jmp error_exit END(xen_failsafe_callback) +#endif /* CONFIG_XEN_PV */ +#ifdef CONFIG_XEN_PVHVM apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ xen_hvm_callback_vector xen_evtchn_do_upcall +#endif -#endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@@ -1153,7 -1124,7 +1126,7 @@@ idtentry debug do_debug has_error_co idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PV idtentry xennmi do_nmi has_error_code=0 idtentry xendebug do_debug has_error_code=0 idtentry xenint3 do_int3 has_error_code=0 @@@ -1189,16 -1160,6 +1162,16 @@@ ENTRY(paranoid_entry xorl %ebx, %ebx 1: + /* + * Always stash CR3 in %r14. This value will be restored, + * verbatim, at exit. Needed if paranoid_entry interrupted + * another entry that already switched to the user CR3 value + * but has not yet returned to userspace. + * + * This is also why CS (stashed in the "iret frame" by the + * hardware at entry) can not be used: this may be a return + * to kernel code, but with a user CR3 value. + */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 ret @@@ -1223,13 -1184,11 +1196,13 @@@ ENTRY(paranoid_exit testl %ebx, %ebx /* swapgs needed? */ jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK jmp .Lparanoid_exit_restore .Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 .Lparanoid_exit_restore: jmp restore_regs_and_return_to_kernel @@@ -1640,7 -1599,6 +1613,7 @@@ end_repeat_nmi movq $-1, %rsi call do_nmi + /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 testl %ebx, %ebx /* swapgs needed? */ diff --combined arch/x86/include/asm/processor.h index c7a4e2a174b9,b2bb1d691efc..617805981cce --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@@ -155,8 -155,7 +155,8 @@@ enum cpuid_regs_idx #define X86_VENDOR_CENTAUR 5 #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 -#define X86_VENDOR_NUM 9 +#define X86_VENDOR_HYGON 9 +#define X86_VENDOR_NUM 10 #define X86_VENDOR_UNKNOWN 0xff @@@ -316,7 -315,13 +316,13 @@@ struct x86_hw_tss */ u64 sp1; + /* + * Since Linux does not use ring 2, the 'sp2' slot is unused by + * hardware. entry_SYSCALL_64 uses it as scratch space to stash + * the user RSP value. + */ u64 sp2; + u64 reserved2; u64 ist[7]; u32 reserved3; @@@ -579,7 -584,7 +585,7 @@@ static inline bool on_thread_stack(void current_stack_pointer) < THREAD_SIZE; } -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL #include #else #define __cpuid native_cpuid @@@ -590,7 -595,7 +596,7 @@@ static inline void load_sp0(unsigned lo } #define set_iopl_mask native_set_iopl_mask -#endif /* CONFIG_PARAVIRT */ +#endif /* CONFIG_PARAVIRT_XXL */ /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); diff --combined arch/x86/kernel/asm-offsets.c index fc02c3cf238f,083c01309027..72adf6c335dc --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@@ -64,12 -64,15 +64,12 @@@ void common(void) OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -#ifdef CONFIG_PARAVIRT +#ifdef CONFIG_PARAVIRT_XXL BLANK(); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); - OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); + OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); + OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); + OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); + OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2); #endif #ifdef CONFIG_XEN @@@ -96,13 -99,12 +96,12 @@@ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); /* Layout info for cpu_entry_area */ - OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); - OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); - /* Offset for sp0 and sp1 into the tss_struct */ + /* Offset for fields in tss_struct */ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); + OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); } diff --combined arch/x86/kernel/cpu/bugs.c index b810cc239375,fe32103fcdc7..c37e66e493bf --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@@ -35,12 -35,10 +35,10 @@@ static void __init spectre_v2_select_mi static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); - /* - * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any - * writes to SPEC_CTRL contain whatever reserved bits have been set. - */ - u64 __ro_after_init x86_spec_ctrl_base; + /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ + u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + static DEFINE_MUTEX(spec_ctrl_mutex); /* * The vendor and possibly platform specific bits which can be modified in @@@ -312,7 -310,6 +310,7 @@@ static enum spectre_v2_mitigation_cmd _ } if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); return SPECTRE_V2_CMD_AUTO; @@@ -326,6 -323,46 +324,46 @@@ return cmd; } + static bool stibp_needed(void) + { + if (spectre_v2_enabled == SPECTRE_V2_NONE) + return false; + + if (!boot_cpu_has(X86_FEATURE_STIBP)) + return false; + + return true; + } + + static void update_stibp_msr(void *info) + { + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + } + + void arch_smt_update(void) + { + u64 mask; + + if (!stibp_needed()) + return; + + mutex_lock(&spec_ctrl_mutex); + mask = x86_spec_ctrl_base; + if (cpu_smt_control == CPU_SMT_ENABLED) + mask |= SPEC_CTRL_STIBP; + else + mask &= ~SPEC_CTRL_STIBP; + + if (mask != x86_spec_ctrl_base) { + pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", + cpu_smt_control == CPU_SMT_ENABLED ? + "Enabling" : "Disabling"); + x86_spec_ctrl_base = mask; + on_each_cpu(update_stibp_msr, NULL, 1); + } + mutex_unlock(&spec_ctrl_mutex); + } + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@@ -372,8 -409,7 +410,8 @@@ return; retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { retpoline_amd: if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); @@@ -426,6 -462,9 +464,9 @@@ specv2_set_mode setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } + + /* Enable STIBP if appropriate */ + arch_smt_update(); } #undef pr_fmt @@@ -816,6 -855,8 +857,8 @@@ static ssize_t l1tf_show_state(char *bu static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { + int ret; + if (!boot_cpu_has_bug(bug)) return sprintf(buf, "Not affected\n"); @@@ -833,10 -874,13 +876,13 @@@ return sprintf(buf, "Mitigation: __user pointer sanitization\n"); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", spectre_v2_module_string()); + return ret; case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --combined arch/x86/kernel/cpu/common.c index 9315a1660668,8bffeae9bac2..660d0b22e962 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@@ -949,11 -949,11 +949,11 @@@ static void identify_cpu_without_cpuid( } static const __initconst struct x86_cpu_id cpu_no_speculation[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_TABLET, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL_MID, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SALTWELL_MID, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_BONNELL, X86_FEATURE_ANY }, { X86_VENDOR_CENTAUR, 5 }, { X86_VENDOR_INTEL, 5 }, { X86_VENDOR_NSC, 5 }, @@@ -963,16 -963,15 +963,16 @@@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { { X86_VENDOR_AMD }, + { X86_VENDOR_HYGON }, {} }; /* Only list CPUs which speculate but are non susceptible to SSB */ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, @@@ -985,14 -984,14 +985,14 @@@ static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { /* in addition to cpu_no_speculation */ - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_X }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT_MID }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT_MID }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_X }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT_PLUS }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, {} @@@ -1077,9 -1076,6 +1077,9 @@@ static void __init early_identify_cpu(s memset(&c->x86_capability, 0, sizeof c->x86_capability); c->extended_cpuid_level = 0; + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + /* cyrix could have cpuid enabled via c_identify()*/ if (have_cpuid_p()) { cpu_detect(c); @@@ -1097,6 -1093,7 +1097,6 @@@ if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); } else { - identify_cpu_without_cpuid(c); setup_clear_cpu_cap(X86_FEATURE_CPUID); } @@@ -1243,10 -1240,10 +1243,10 @@@ static void generic_identify(struct cpu * ESPFIX issue, we can change this. */ #ifdef CONFIG_X86_32 -# ifdef CONFIG_PARAVIRT +# ifdef CONFIG_PARAVIRT_XXL do { extern void native_iret(void); - if (pv_cpu_ops.iret == native_iret) + if (pv_ops.cpu.iret == native_iret) set_cpu_bug(c, X86_BUG_ESPFIX); } while (0); # else @@@ -1534,19 -1531,8 +1534,8 @@@ EXPORT_PER_CPU_SYMBOL(__preempt_count) /* May not be marked __init: used by software suspend */ void syscall_init(void) { - extern char _entry_trampoline[]; - extern char entry_SYSCALL_64_trampoline[]; - - int cpu = smp_processor_id(); - unsigned long SYSCALL64_entry_trampoline = - (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + - (entry_SYSCALL_64_trampoline - _entry_trampoline); - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - if (static_cpu_has(X86_FEATURE_PTI)) - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); - else - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); @@@ -1557,7 -1543,8 +1546,8 @@@ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@@ -1672,29 -1659,6 +1662,29 @@@ static void wait_for_master_cpu(int cpu #endif } +#ifdef CONFIG_X86_64 +static void setup_getcpu(int cpu) +{ + unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); + struct desc_struct d = { }; + + if (static_cpu_has(X86_FEATURE_RDTSCP)) + write_rdtscp_aux(cpudata); + + /* Store CPU and node number in limit. */ + d.limit0 = cpudata; + d.limit1 = cpudata >> 16; + + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); +} +#endif + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@@ -1732,7 -1696,6 +1722,7 @@@ void cpu_init(void early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif + setup_getcpu(cpu); me = current; diff --combined arch/x86/kernel/kprobes/core.c index f72a47b602e2,f802cf5b4478..c33b06f5faa4 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@@ -1020,26 -1020,56 +1020,18 @@@ int kprobe_fault_handler(struct pt_reg */ if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - if (fixup_exception(regs, trapnr)) - return 1; - - /* - * fixup routine could not handle it, - * Let do_page_fault() fix it. - */ } return 0; } NOKPROBE_SYMBOL(kprobe_fault_handler); -/* - * Wrapper routine for handling exceptions. - */ -int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, - void *data) -{ - struct die_args *args = data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode(args->regs)) - return ret; - - if (val == DIE_GPF) { - /* - * To be potentially processing a kprobe fault and to - * trust the result from kprobe_running(), we have - * be non-preemptible. - */ - if (!preemptible() && kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - } - return ret; -} -NOKPROBE_SYMBOL(kprobe_exceptions_notify); - bool arch_within_kprobe_blacklist(unsigned long addr) { - bool is_in_entry_trampoline_section = false; - - #ifdef CONFIG_X86_64 - is_in_entry_trampoline_section = - (addr >= (unsigned long)__entry_trampoline_start && - addr < (unsigned long)__entry_trampoline_end); - #endif return (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) || (addr >= (unsigned long)__entry_text_start && - addr < (unsigned long)__entry_text_end) || - is_in_entry_trampoline_section; + addr < (unsigned long)__entry_text_end); } int __init arch_init_kprobes(void) diff --combined arch/x86/kernel/process_64.c index d6674a425714,0fa7aa19f09e..31b4755369f0 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@@ -54,16 -54,13 +54,14 @@@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include #endif - __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); - /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; @@@ -88,17 -85,9 +86,17 @@@ printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); - if (!all) + if (mode == SHOW_REGS_SHORT) return; + if (mode == SHOW_REGS_USER) { + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n", + fs, shadowgs); + return; + } + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); @@@ -287,138 -276,6 +285,138 @@@ static __always_inline void load_seg_le } } +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, + struct thread_struct *next) +{ + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); +} + +static unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* + * There are no user segments in the GDT with nonzero bases + * other than the TLS segments. + */ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(&task->thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* + * If performance here mattered, we could protect the LDT + * with RCU. This is a slow path, though, so we can just + * take the mutex. + */ + mutex_lock(&task->mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(&task->mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + +void x86_fsbase_write_cpu(unsigned long fsbase) +{ + /* + * Set the selector to 0 as a notion, that the segment base is + * overwritten, which will be checked for skipping the segment load + * during context switch. + */ + loadseg(FS, 0); + wrmsrl(MSR_FS_BASE, fsbase); +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + /* Set the selector to 0 for the same reason as %fs above. */ + loadseg(GS, 0); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); +} + +unsigned long x86_fsbase_read_task(struct task_struct *task) +{ + unsigned long fsbase; + + if (task == current) + fsbase = x86_fsbase_read_cpu(); + else if (task->thread.fsindex == 0) + fsbase = task->thread.fsbase; + else + fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); + + return fsbase; +} + +unsigned long x86_gsbase_read_task(struct task_struct *task) +{ + unsigned long gsbase; + + if (task == current) + gsbase = x86_gsbase_read_cpu_inactive(); + else if (task->thread.gsindex == 0) + gsbase = task->thread.gsbase; + else + gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); + + return gsbase; +} + +int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) +{ + /* + * Not strictly needed for %fs, but do it for symmetry + * with %gs + */ + if (unlikely(fsbase >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + task->thread.fsbase = fsbase; + if (task == current) + x86_fsbase_write_cpu(fsbase); + task->thread.fsindex = 0; + preempt_enable(); + + return 0; +} + +int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) +{ + if (unlikely(gsbase >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + task->thread.gsbase = gsbase; + if (task == current) + x86_gsbase_write_cpu_inactive(gsbase); + task->thread.gsindex = 0; + preempt_enable(); + + return 0; +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@@ -606,7 -463,10 +604,7 @@@ __switch_to(struct task_struct *prev_p if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + x86_fsgsbase_load(prev, next); switch_fpu_finish(next_fpu, cpu); @@@ -757,25 -617,54 +755,25 @@@ static long prctl_map_vdso(const struc long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; - int doit = task == current; - int cpu; switch (option) { - case ARCH_SET_GS: - if (arg2 >= TASK_SIZE_MAX) - return -EPERM; - cpu = get_cpu(); - task->thread.gsindex = 0; - task->thread.gsbase = arg2; - if (doit) { - load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); - } - put_cpu(); + case ARCH_SET_GS: { + ret = x86_gsbase_write_task(task, arg2); break; - case ARCH_SET_FS: - /* Not strictly needed for fs, but do it for symmetry - with gs */ - if (arg2 >= TASK_SIZE_MAX) - return -EPERM; - cpu = get_cpu(); - task->thread.fsindex = 0; - task->thread.fsbase = arg2; - if (doit) { - /* set the selector to 0 to not confuse __switch_to */ - loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, arg2); - } - put_cpu(); + } + case ARCH_SET_FS: { + ret = x86_fsbase_write_task(task, arg2); break; + } case ARCH_GET_FS: { - unsigned long base; + unsigned long base = x86_fsbase_read_task(task); - if (doit) - rdmsrl(MSR_FS_BASE, base); - else - base = task->thread.fsbase; ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { - unsigned long base; + unsigned long base = x86_gsbase_read_task(task); - if (doit) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gsbase; ret = put_user(base, (unsigned long __user *)arg2); break; } diff --combined arch/x86/kernel/traps.c index 16c95cb90496,1a90821c0b74..5bd0a997d81e --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@@ -206,7 -206,7 +206,7 @@@ do_trap_no_signal(struct task_struct *t } if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr)) + if (fixup_exception(regs, trapnr, error_code, 0)) return 0; tsk->thread.error_code = error_code; @@@ -383,6 -383,10 +383,10 @@@ dotraplinkage void do_double_fault(stru * we won't enable interupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. + * + * We will enter general_protection with kernel GSBASE, + * which is what the stub expects, given that the faulting + * RIP will be the IRET instruction. */ regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; @@@ -551,21 -555,11 +555,21 @@@ do_general_protection(struct pt_regs *r tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs, X86_TRAP_GP)) + if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) return; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; + + /* + * To be potentially processing a kprobe fault and to + * trust the result from kprobe_running(), we have to + * be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, X86_TRAP_GP)) + return; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) die("general protection fault", regs, error_code); @@@ -848,7 -842,7 +852,7 @@@ static void math_error(struct pt_regs * cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr)) + if (fixup_exception(regs, trapnr, error_code, 0)) return; task->thread.error_code = error_code; diff --combined arch/x86/kernel/vmlinux.lds.S index 5dd3317d761f,9c77d2df9c27..0d618ee634ac --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@@ -65,23 -65,6 +65,23 @@@ jiffies_64 = jiffies #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid splitting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define BSS_DECRYPTED \ + . = ALIGN(PMD_SIZE); \ + __start_bss_decrypted = .; \ + *(.bss..decrypted); \ + . = ALIGN(PAGE_SIZE); \ + __start_bss_decrypted_unused = .; \ + . = ALIGN(PMD_SIZE); \ + __end_bss_decrypted = .; \ + #else #define X86_ALIGN_RODATA_BEGIN @@@ -91,7 -74,6 +91,7 @@@ #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END +#define BSS_DECRYPTED #endif @@@ -136,16 -118,6 +136,6 @@@ SECTION *(.fixup) *(.gnu.warning) - #ifdef CONFIG_X86_64 - . = ALIGN(PAGE_SIZE); - __entry_trampoline_start = .; - _entry_trampoline = .; - *(.entry_trampoline) - . = ALIGN(PAGE_SIZE); - __entry_trampoline_end = .; - ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); - #endif - #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; *(.text.__x86.indirect_thunk) @@@ -373,7 -345,6 +363,7 @@@ __bss_start = .; *(.bss..page_aligned) *(.bss) + BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --combined arch/x86/mm/tlb.c index 7d68489cfdb1,073b8df349a0..bddd6b3cee1d --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@@ -7,6 -7,7 +7,7 @@@ #include #include #include + #include #include #include @@@ -180,16 -181,26 +181,29 @@@ static void sync_current_stack_to_mm(st } } + static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id) + { + /* + * Check if the current (previous) task has access to the memory + * of the @tsk (next) task. If access is denied, make sure to + * issue a IBPB to stop user->user Spectre-v2 attacks. + * + * Note: __ptrace_may_access() returns 0 or -ERRNO. + */ + return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id && + ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB)); + } + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; + bool need_flush; + u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@@ -243,41 -254,20 +257,41 @@@ next->context.ctx_id); /* - * We don't currently support having a real mm loaded without - * our cpu set in mm_cpumask(). We have all the bookkeeping - * in place to figure out whether we would need to flush - * if our cpu were cleared in mm_cpumask(), but we don't - * currently use it. + * Even in lazy TLB mode, the CPU should stay set in the + * mm_cpumask. The TLB shootdown code can figure out from + * from cpu_tlbstate.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - return; + /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same + * process. No TLB flush required. + */ + if (!was_lazy) + return; + + /* + * Read the tlb_gen to check whether a flush is needed. + * If the TLB is up to date, just use it. + * The barrier synchronizes with the tlb_gen increment in + * the TLB shootdown code. + */ + smp_mb(); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == + next_tlb_gen) + return; + + /* + * TLB contents went out of date while we were in lazy + * mode. Fall through to the TLB switching code below. + */ + new_asid = prev_asid; + need_flush = true; } else { - u16 new_asid; - bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@@ -286,18 -276,13 +300,13 @@@ * one process from doing Spectre-v2 attacks on another. * * As an optimization, flush indirect branches only when - * switching into processes that disable dumping. This - * protects high value processes like gpg, without having - * too high performance overhead. IBPB is *expensive*! - * - * This will not flush branches when switching into kernel - * threads. It will also not flush if we switch to idle - * thread and back to the same process. It will flush if we - * switch to a different non-dumpable process. + * switching into a processes that can't be ptrace by the + * current one (as in such case, attacker has much more + * convenient way how to tamper with the next process than + * branch buffer poisoning). */ - if (tsk && tsk->mm && - tsk->mm->context.ctx_id != last_ctx_id && - get_dumpable(tsk->mm) != SUID_DUMP_USER) + if (static_cpu_has(X86_FEATURE_USE_IBPB) && + ibpb_needed(tsk, last_ctx_id)) indirect_branch_prediction_barrier(); if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@@ -332,48 -317,46 +341,48 @@@ /* Let nmi_uaccess_okay() know that we're changing CR3. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); + } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); - - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); - - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); - } + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - - /* Make sure we write CR3 before loaded_mm. */ - barrier(); + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } - load_mm_cr4(next); - switch_ldt(real_prev, next); + /* + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. + */ + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + + /* Make sure we write CR3 before loaded_mm. */ + barrier(); + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + + if (next != real_prev) { + load_mm_cr4(next); + switch_ldt(real_prev, next); + } } /* @@@ -394,7 -377,20 +403,7 @@@ void enter_lazy_tlb(struct mm_struct *m if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - if (tlb_defer_switch_to_init_mm()) { - /* - * There's a significant optimization that may be possible - * here. We have accurate enough TLB flush tracking that we - * don't need to maintain coherence of TLB per se when we're - * lazy. We do, however, need to maintain coherence of - * paging-structure caches. We could, in principle, leave our - * old mm loaded and only switch to init_mm when - * tlb_remove_page() happens. - */ - this_cpu_write(cpu_tlbstate.is_lazy, true); - } else { - switch_mm(NULL, &init_mm, NULL); - } + this_cpu_write(cpu_tlbstate.is_lazy, true); } /* @@@ -481,9 -477,6 +490,9 @@@ static void flush_tlb_func_common(cons * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. + * + * This should be rare, with native_flush_tlb_others skipping + * IPIs to lazy TLB mode CPUs. */ switch_mm_irqs_off(NULL, &init_mm, NULL); return; @@@ -544,16 -537,17 +553,16 @@@ f->new_tlb_gen == local_tlb_gen + 1 && f->new_tlb_gen == mm_tlb_gen) { /* Partial flush */ - unsigned long addr; - unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; + unsigned long addr = f->start; - addr = f->start; while (addr < f->end) { __flush_tlb_one_user(addr); - addr += PAGE_SIZE; + addr += 1UL << f->stride_shift; } if (local) - count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); - trace_tlb_flush(reason, nr_pages); + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); + trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. */ local_flush_tlb(); @@@ -586,11 -580,6 +595,11 @@@ static void flush_tlb_func_remote(void flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); } +static bool tlb_is_not_lazy(int cpu, void *data) +{ + return !per_cpu(cpu_tlbstate.is_lazy, cpu); +} + void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { @@@ -626,23 -615,8 +635,23 @@@ (void *)info, 1); return; } - smp_call_function_many(cpumask, flush_tlb_func_remote, + + /* + * If no page tables were freed, we can skip sending IPIs to + * CPUs in lazy TLB mode. They will flush the CPU themselves + * at the next context switch. + * + * However, if page tables are getting freed, we need to send the + * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ + if (info->freed_tables) + smp_call_function_many(cpumask, flush_tlb_func_remote, (void *)info, 1); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, + (void *)info, 1, GFP_ATOMIC, cpumask); } /* @@@ -658,15 -632,12 +667,15 @@@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long vmflag) + unsigned long end, unsigned int stride_shift, + bool freed_tables) { int cpu; struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { .mm = mm, + .stride_shift = stride_shift, + .freed_tables = freed_tables, }; cpu = get_cpu(); @@@ -676,7 -647,8 +685,7 @@@ /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && - !(vmflag & VM_HUGETLB) && - ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { + ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { info.start = start; info.end = end; } else { diff --combined kernel/cpu.c index e82920b8bee1,2fb49916ea56..3c7f3b4c453c --- a/kernel/cpu.c +++ b/kernel/cpu.c @@@ -315,16 -315,6 +315,16 @@@ void lockdep_assert_cpus_held(void percpu_rwsem_assert_held(&cpu_hotplug_lock); } +static void lockdep_acquire_cpus_lock(void) +{ + rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); +} + +static void lockdep_release_cpus_lock(void) +{ + rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); +} + /* * Wait for currently running CPU hotplug operations to complete (if any) and * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects @@@ -354,17 -344,6 +354,17 @@@ void cpu_hotplug_enable(void cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); + +#else + +static void lockdep_acquire_cpus_lock(void) +{ +} + +static void lockdep_release_cpus_lock(void) +{ +} + #endif /* CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_HOTPLUG_SMT @@@ -383,7 -362,6 +383,7 @@@ void __init cpu_smt_disable(bool force pr_info("SMT: Force disabled\n"); cpu_smt_control = CPU_SMT_FORCE_DISABLED; } else { + pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } } @@@ -629,21 -607,15 +629,21 @@@ static void cpuhp_thread_fun(unsigned i bool bringup = st->bringup; enum cpuhp_state state; + if (WARN_ON_ONCE(!st->should_run)) + return; + /* * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures * that if we see ->should_run we also see the rest of the state. */ smp_mb(); - if (WARN_ON_ONCE(!st->should_run)) - return; - + /* + * The BP holds the hotplug lock, but we're now running on the AP, + * ensure that anybody asserting the lock is held, will actually find + * it so. + */ + lockdep_acquire_cpus_lock(); cpuhp_lock_acquire(bringup); if (st->single) { @@@ -689,7 -661,6 +689,7 @@@ } cpuhp_lock_release(bringup); + lockdep_release_cpus_lock(); if (!st->should_run) complete_ap_thread(st, bringup); @@@ -945,8 -916,7 +945,8 @@@ static int cpuhp_down_callbacks(unsigne ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); if (ret) { st->target = prev_state; - undo_cpu_down(cpu, st); + if (st->state < prev_state) + undo_cpu_down(cpu, st); break; } } @@@ -999,7 -969,7 +999,7 @@@ static int __ref _cpu_down(unsigned in * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, target); - if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { + if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) { cpuhp_reset_state(st, prev_state); __cpuhp_kick_ap(st); } @@@ -2055,6 -2025,12 +2055,12 @@@ static void cpuhp_online_cpu_device(uns kobject_uevent(&dev->kobj, KOBJ_ONLINE); } + /* + * Architectures that need SMT-specific errata handling during SMT hotplug + * should override this. + */ + void __weak arch_smt_update(void) { }; + static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@@ -2081,8 -2057,10 +2087,10 @@@ */ cpuhp_offline_cpu_device(cpu); } - if (!ret) + if (!ret) { cpu_smt_control = ctrlval; + arch_smt_update(); + } cpu_maps_update_done(); return ret; } @@@ -2093,6 -2071,7 +2101,7 @@@ static int cpuhp_smt_enable(void cpu_maps_update_begin(); cpu_smt_control = CPU_SMT_ENABLED; + arch_smt_update(); for_each_present_cpu(cpu) { /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))