From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 23 Oct 2018 17:43:04 +0000 (+0100)
Subject: Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
X-Git-Tag: v4.20-rc1~156
X-Git-Url: https://repo.jachan.dev/J-linux.git/commitdiff_plain/d82924c3b8d0607094b94fab290a33c5ad7d586c?hp=-c

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 pti updates from Ingo Molnar:
 "The main changes:

   - Make the IBPB barrier more strict and add STIBP support (Jiri
     Kosina)

   - Micro-optimize and clean up the entry code (Andy Lutomirski)

   - ... plus misc other fixes"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/speculation: Propagate information about RSB filling mitigation to sysfs
  x86/speculation: Enable cross-hyperthread spectre v2 STIBP mitigation
  x86/speculation: Apply IBPB more strictly to avoid cross-process data leak
  x86/speculation: Add RETPOLINE_AMD support to the inline asm CALL_NOSPEC variant
  x86/CPU: Fix unused variable warning when !CONFIG_IA32_EMULATION
  x86/pti/64: Remove the SYSCALL64 entry trampoline
  x86/entry/64: Use the TSS sp2 slot for SYSCALL/SYSRET scratch space
  x86/entry/64: Document idtentry
---

d82924c3b8d0607094b94fab290a33c5ad7d586c
diff --combined arch/x86/entry/entry_64.S
index 7c5ce0a6c4d2,0d728142467f..4d7a2d9d44cf
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@@ -142,67 -142,6 +142,6 @@@ END(native_usergs_sysret64
   * with them due to bugs in both AMD and Intel CPUs.
   */
  
- 	.pushsection .entry_trampoline, "ax"
- 
- /*
-  * The code in here gets remapped into cpu_entry_area's trampoline.  This means
-  * that the assembler and linker have the wrong idea as to where this code
-  * lives (and, in fact, it's mapped more than once, so it's not even at a
-  * fixed address).  So we can't reference any symbols outside the entry
-  * trampoline and expect it to work.
-  *
-  * Instead, we carefully abuse %rip-relative addressing.
-  * _entry_trampoline(%rip) refers to the start of the remapped) entry
-  * trampoline.  We can thus find cpu_entry_area with this macro:
-  */
- 
- #define CPU_ENTRY_AREA \
- 	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
- 
- /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
- #define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \
- 			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
- 
- ENTRY(entry_SYSCALL_64_trampoline)
- 	UNWIND_HINT_EMPTY
- 	swapgs
- 
- 	/* Stash the user RSP. */
- 	movq	%rsp, RSP_SCRATCH
- 
- 	/* Note: using %rsp as a scratch reg. */
- 	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
- 
- 	/* Load the top of the task stack into RSP */
- 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
- 
- 	/* Start building the simulated IRET frame. */
- 	pushq	$__USER_DS			/* pt_regs->ss */
- 	pushq	RSP_SCRATCH			/* pt_regs->sp */
- 	pushq	%r11				/* pt_regs->flags */
- 	pushq	$__USER_CS			/* pt_regs->cs */
- 	pushq	%rcx				/* pt_regs->ip */
- 
- 	/*
- 	 * x86 lacks a near absolute jump, and we can't jump to the real
- 	 * entry text with a relative jump.  We could push the target
- 	 * address and then use retq, but this destroys the pipeline on
- 	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
- 	 * spill RDI and restore it in a second-stage trampoline.
- 	 */
- 	pushq	%rdi
- 	movq	$entry_SYSCALL_64_stage2, %rdi
- 	JMP_NOSPEC %rdi
- END(entry_SYSCALL_64_trampoline)
- 
- 	.popsection
- 
- ENTRY(entry_SYSCALL_64_stage2)
- 	UNWIND_HINT_EMPTY
- 	popq	%rdi
- 	jmp	entry_SYSCALL_64_after_hwframe
- END(entry_SYSCALL_64_stage2)
- 
  ENTRY(entry_SYSCALL_64)
  	UNWIND_HINT_EMPTY
  	/*
@@@ -212,21 -151,19 +151,19 @@@
  	 */
  
  	swapgs
- 	/*
- 	 * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
- 	 * is not required to switch CR3.
- 	 */
- 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
+ 	/* tss.sp2 is scratch space. */
+ 	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ 	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
  	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
  
  	/* Construct struct pt_regs on stack */
- 	pushq	$__USER_DS			/* pt_regs->ss */
- 	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
- 	pushq	%r11				/* pt_regs->flags */
- 	pushq	$__USER_CS			/* pt_regs->cs */
- 	pushq	%rcx				/* pt_regs->ip */
+ 	pushq	$__USER_DS				/* pt_regs->ss */
+ 	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
+ 	pushq	%r11					/* pt_regs->flags */
+ 	pushq	$__USER_CS				/* pt_regs->cs */
+ 	pushq	%rcx					/* pt_regs->ip */
  GLOBAL(entry_SYSCALL_64_after_hwframe)
- 	pushq	%rax				/* pt_regs->orig_ax */
+ 	pushq	%rax					/* pt_regs->orig_ax */
  
  	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
  
@@@ -900,6 -837,42 +837,42 @@@ apicinterrupt IRQ_WORK_VECTOR			irq_wor
   */
  #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
  
+ /**
+  * idtentry - Generate an IDT entry stub
+  * @sym:		Name of the generated entry point
+  * @do_sym: 		C function to be called
+  * @has_error_code: 	True if this IDT vector has an error code on the stack
+  * @paranoid: 		non-zero means that this vector may be invoked from
+  *			kernel mode with user GSBASE and/or user CR3.
+  *			2 is special -- see below.
+  * @shift_ist:		Set to an IST index if entries from kernel mode should
+  *             		decrement the IST stack so that nested entries get a
+  *			fresh stack.  (This is for #DB, which has a nasty habit
+  *             		of recursing.)
+  *
+  * idtentry generates an IDT stub that sets up a usable kernel context,
+  * creates struct pt_regs, and calls @do_sym.  The stub has the following
+  * special behaviors:
+  *
+  * On an entry from user mode, the stub switches from the trampoline or
+  * IST stack to the normal thread stack.  On an exit to user mode, the
+  * normal exit-to-usermode path is invoked.
+  *
+  * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
+  * whereas we omit the preemption check if @paranoid != 0.  This is purely
+  * because the implementation is simpler this way.  The kernel only needs
+  * to check for asynchronous kernel preemption when IRQ handlers return.
+  *
+  * If @paranoid == 0, then the stub will handle IRET faults by pretending
+  * that the fault came from user mode.  It will handle gs_change faults by
+  * pretending that the fault happened with kernel GSBASE.  Since this handling
+  * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
+  * @paranoid == 0.  This special handling will do the wrong thing for
+  * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+  *
+  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
+  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+  */
  .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
  ENTRY(\sym)
  	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@@ -1050,7 -1023,7 +1023,7 @@@ ENTRY(do_softirq_own_stack
  	ret
  ENDPROC(do_softirq_own_stack)
  
 -#ifdef CONFIG_XEN
 +#ifdef CONFIG_XEN_PV
  idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  
  /*
@@@ -1130,13 -1103,11 +1103,13 @@@ ENTRY(xen_failsafe_callback
  	ENCODE_FRAME_POINTER
  	jmp	error_exit
  END(xen_failsafe_callback)
 +#endif /* CONFIG_XEN_PV */
  
 +#ifdef CONFIG_XEN_PVHVM
  apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
  	xen_hvm_callback_vector xen_evtchn_do_upcall
 +#endif
  
 -#endif /* CONFIG_XEN */
  
  #if IS_ENABLED(CONFIG_HYPERV)
  apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@@ -1153,7 -1124,7 +1126,7 @@@ idtentry debug			do_debug		has_error_co
  idtentry int3			do_int3			has_error_code=0
  idtentry stack_segment		do_stack_segment	has_error_code=1
  
 -#ifdef CONFIG_XEN
 +#ifdef CONFIG_XEN_PV
  idtentry xennmi			do_nmi			has_error_code=0
  idtentry xendebug		do_debug		has_error_code=0
  idtentry xenint3		do_int3			has_error_code=0
@@@ -1189,16 -1160,6 +1162,16 @@@ ENTRY(paranoid_entry
  	xorl	%ebx, %ebx
  
  1:
 +	/*
 +	 * Always stash CR3 in %r14.  This value will be restored,
 +	 * verbatim, at exit.  Needed if paranoid_entry interrupted
 +	 * another entry that already switched to the user CR3 value
 +	 * but has not yet returned to userspace.
 +	 *
 +	 * This is also why CS (stashed in the "iret frame" by the
 +	 * hardware at entry) can not be used: this may be a return
 +	 * to kernel code, but with a user CR3 value.
 +	 */
  	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
  
  	ret
@@@ -1223,13 -1184,11 +1196,13 @@@ ENTRY(paranoid_exit
  	testl	%ebx, %ebx			/* swapgs needed? */
  	jnz	.Lparanoid_exit_no_swapgs
  	TRACE_IRQS_IRETQ
 +	/* Always restore stashed CR3 value (see paranoid_entry) */
  	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
  	SWAPGS_UNSAFE_STACK
  	jmp	.Lparanoid_exit_restore
  .Lparanoid_exit_no_swapgs:
  	TRACE_IRQS_IRETQ_DEBUG
 +	/* Always restore stashed CR3 value (see paranoid_entry) */
  	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
  .Lparanoid_exit_restore:
  	jmp restore_regs_and_return_to_kernel
@@@ -1640,7 -1599,6 +1613,7 @@@ end_repeat_nmi
  	movq	$-1, %rsi
  	call	do_nmi
  
 +	/* Always restore stashed CR3 value (see paranoid_entry) */
  	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
  
  	testl	%ebx, %ebx			/* swapgs needed? */
diff --combined arch/x86/include/asm/processor.h
index c7a4e2a174b9,b2bb1d691efc..617805981cce
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@@ -155,8 -155,7 +155,8 @@@ enum cpuid_regs_idx 
  #define X86_VENDOR_CENTAUR	5
  #define X86_VENDOR_TRANSMETA	7
  #define X86_VENDOR_NSC		8
 -#define X86_VENDOR_NUM		9
 +#define X86_VENDOR_HYGON	9
 +#define X86_VENDOR_NUM		10
  
  #define X86_VENDOR_UNKNOWN	0xff
  
@@@ -316,7 -315,13 +316,13 @@@ struct x86_hw_tss 
  	 */
  	u64			sp1;
  
+ 	/*
+ 	 * Since Linux does not use ring 2, the 'sp2' slot is unused by
+ 	 * hardware.  entry_SYSCALL_64 uses it as scratch space to stash
+ 	 * the user RSP value.
+ 	 */
  	u64			sp2;
+ 
  	u64			reserved2;
  	u64			ist[7];
  	u32			reserved3;
@@@ -579,7 -584,7 +585,7 @@@ static inline bool on_thread_stack(void
  			       current_stack_pointer) < THREAD_SIZE;
  }
  
 -#ifdef CONFIG_PARAVIRT
 +#ifdef CONFIG_PARAVIRT_XXL
  #include <asm/paravirt.h>
  #else
  #define __cpuid			native_cpuid
@@@ -590,7 -595,7 +596,7 @@@ static inline void load_sp0(unsigned lo
  }
  
  #define set_iopl_mask native_set_iopl_mask
 -#endif /* CONFIG_PARAVIRT */
 +#endif /* CONFIG_PARAVIRT_XXL */
  
  /* Free all resources held by a thread. */
  extern void release_thread(struct task_struct *);
diff --combined arch/x86/kernel/asm-offsets.c
index fc02c3cf238f,083c01309027..72adf6c335dc
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@@ -64,12 -64,15 +64,12 @@@ void common(void) 
  	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
  #endif
  
 -#ifdef CONFIG_PARAVIRT
 +#ifdef CONFIG_PARAVIRT_XXL
  	BLANK();
 -	OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
 -	OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
 -	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 -	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 -	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
 -	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 -	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 +	OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
 +	OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
 +	OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
 +	OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2);
  #endif
  
  #ifdef CONFIG_XEN
@@@ -96,13 -99,12 +96,12 @@@
  	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
  
  	/* Layout info for cpu_entry_area */
- 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
- 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
  	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
  	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
  	DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
  
- 	/* Offset for sp0 and sp1 into the tss_struct */
+ 	/* Offset for fields in tss_struct */
  	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
  	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+ 	OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
  }
diff --combined arch/x86/kernel/cpu/bugs.c
index b810cc239375,fe32103fcdc7..c37e66e493bf
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@@ -35,12 -35,10 +35,10 @@@ static void __init spectre_v2_select_mi
  static void __init ssb_select_mitigation(void);
  static void __init l1tf_select_mitigation(void);
  
- /*
-  * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
-  * writes to SPEC_CTRL contain whatever reserved bits have been set.
-  */
- u64 __ro_after_init x86_spec_ctrl_base;
+ /* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+ u64 x86_spec_ctrl_base;
  EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+ static DEFINE_MUTEX(spec_ctrl_mutex);
  
  /*
   * The vendor and possibly platform specific bits which can be modified in
@@@ -312,7 -310,6 +310,7 @@@ static enum spectre_v2_mitigation_cmd _
  	}
  
  	if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
 +	    boot_cpu_data.x86_vendor != X86_VENDOR_HYGON &&
  	    boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
  		pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
  		return SPECTRE_V2_CMD_AUTO;
@@@ -326,6 -323,46 +324,46 @@@
  	return cmd;
  }
  
+ static bool stibp_needed(void)
+ {
+ 	if (spectre_v2_enabled == SPECTRE_V2_NONE)
+ 		return false;
+ 
+ 	if (!boot_cpu_has(X86_FEATURE_STIBP))
+ 		return false;
+ 
+ 	return true;
+ }
+ 
+ static void update_stibp_msr(void *info)
+ {
+ 	wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ }
+ 
+ void arch_smt_update(void)
+ {
+ 	u64 mask;
+ 
+ 	if (!stibp_needed())
+ 		return;
+ 
+ 	mutex_lock(&spec_ctrl_mutex);
+ 	mask = x86_spec_ctrl_base;
+ 	if (cpu_smt_control == CPU_SMT_ENABLED)
+ 		mask |= SPEC_CTRL_STIBP;
+ 	else
+ 		mask &= ~SPEC_CTRL_STIBP;
+ 
+ 	if (mask != x86_spec_ctrl_base) {
+ 		pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n",
+ 				cpu_smt_control == CPU_SMT_ENABLED ?
+ 				"Enabling" : "Disabling");
+ 		x86_spec_ctrl_base = mask;
+ 		on_each_cpu(update_stibp_msr, NULL, 1);
+ 	}
+ 	mutex_unlock(&spec_ctrl_mutex);
+ }
+ 
  static void __init spectre_v2_select_mitigation(void)
  {
  	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@@ -372,8 -409,7 +410,8 @@@
  	return;
  
  retpoline_auto:
 -	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 +	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
 +	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
  	retpoline_amd:
  		if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
  			pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
@@@ -426,6 -462,9 +464,9 @@@ specv2_set_mode
  		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
  		pr_info("Enabling Restricted Speculation for firmware calls\n");
  	}
+ 
+ 	/* Enable STIBP if appropriate */
+ 	arch_smt_update();
  }
  
  #undef pr_fmt
@@@ -816,6 -855,8 +857,8 @@@ static ssize_t l1tf_show_state(char *bu
  static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
  			       char *buf, unsigned int bug)
  {
+ 	int ret;
+ 
  	if (!boot_cpu_has_bug(bug))
  		return sprintf(buf, "Not affected\n");
  
@@@ -833,10 -874,13 +876,13 @@@
  		return sprintf(buf, "Mitigation: __user pointer sanitization\n");
  
  	case X86_BUG_SPECTRE_V2:
- 		return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+ 		ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
  			       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
  			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ 			       (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "",
+ 			       boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
  			       spectre_v2_module_string());
+ 		return ret;
  
  	case X86_BUG_SPEC_STORE_BYPASS:
  		return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
diff --combined arch/x86/kernel/cpu/common.c
index 9315a1660668,8bffeae9bac2..660d0b22e962
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@@ -949,11 -949,11 +949,11 @@@ static void identify_cpu_without_cpuid(
  }
  
  static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
 -	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_CEDARVIEW,	X86_FEATURE_ANY },
 -	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_CLOVERVIEW,	X86_FEATURE_ANY },
 -	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_LINCROFT,	X86_FEATURE_ANY },
 -	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_PENWELL,	X86_FEATURE_ANY },
 -	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_PINEVIEW,	X86_FEATURE_ANY },
 +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_SALTWELL,	X86_FEATURE_ANY },
 +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_SALTWELL_TABLET,	X86_FEATURE_ANY },
 +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_BONNELL_MID,	X86_FEATURE_ANY },
 +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_SALTWELL_MID,	X86_FEATURE_ANY },
 +	{ X86_VENDOR_INTEL,	6, INTEL_FAM6_ATOM_BONNELL,	X86_FEATURE_ANY },
  	{ X86_VENDOR_CENTAUR,	5 },
  	{ X86_VENDOR_INTEL,	5 },
  	{ X86_VENDOR_NSC,	5 },
@@@ -963,16 -963,15 +963,16 @@@
  
  static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
  	{ X86_VENDOR_AMD },
 +	{ X86_VENDOR_HYGON },
  	{}
  };
  
  /* Only list CPUs which speculate but are non susceptible to SSB */
  static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT1	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT	},
  	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT2	},
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MERRIFIELD	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT_X	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT_MID	},
  	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_CORE_YONAH		},
  	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
  	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
@@@ -985,14 -984,14 +985,14 @@@
  
  static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
  	/* in addition to cpu_no_speculation */
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT1	},
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT2	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT_X	},
  	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT		},
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MERRIFIELD	},
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_MOOREFIELD	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_SILVERMONT_MID	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_AIRMONT_MID	},
  	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GOLDMONT	},
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_DENVERTON	},
 -	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GEMINI_LAKE	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GOLDMONT_X	},
 +	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_ATOM_GOLDMONT_PLUS	},
  	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNL		},
  	{ X86_VENDOR_INTEL,	6,	INTEL_FAM6_XEON_PHI_KNM		},
  	{}
@@@ -1077,9 -1076,6 +1077,9 @@@ static void __init early_identify_cpu(s
  	memset(&c->x86_capability, 0, sizeof c->x86_capability);
  	c->extended_cpuid_level = 0;
  
 +	if (!have_cpuid_p())
 +		identify_cpu_without_cpuid(c);
 +
  	/* cyrix could have cpuid enabled via c_identify()*/
  	if (have_cpuid_p()) {
  		cpu_detect(c);
@@@ -1097,6 -1093,7 +1097,6 @@@
  		if (this_cpu->c_bsp_init)
  			this_cpu->c_bsp_init(c);
  	} else {
 -		identify_cpu_without_cpuid(c);
  		setup_clear_cpu_cap(X86_FEATURE_CPUID);
  	}
  
@@@ -1243,10 -1240,10 +1243,10 @@@ static void generic_identify(struct cpu
  	 * ESPFIX issue, we can change this.
  	 */
  #ifdef CONFIG_X86_32
 -# ifdef CONFIG_PARAVIRT
 +# ifdef CONFIG_PARAVIRT_XXL
  	do {
  		extern void native_iret(void);
 -		if (pv_cpu_ops.iret == native_iret)
 +		if (pv_ops.cpu.iret == native_iret)
  			set_cpu_bug(c, X86_BUG_ESPFIX);
  	} while (0);
  # else
@@@ -1534,19 -1531,8 +1534,8 @@@ EXPORT_PER_CPU_SYMBOL(__preempt_count)
  /* May not be marked __init: used by software suspend */
  void syscall_init(void)
  {
- 	extern char _entry_trampoline[];
- 	extern char entry_SYSCALL_64_trampoline[];
- 
- 	int cpu = smp_processor_id();
- 	unsigned long SYSCALL64_entry_trampoline =
- 		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
- 		(entry_SYSCALL_64_trampoline - _entry_trampoline);
- 
  	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- 	if (static_cpu_has(X86_FEATURE_PTI))
- 		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
- 	else
- 		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ 	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
  
  #ifdef CONFIG_IA32_EMULATION
  	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@@ -1557,7 -1543,8 +1546,8 @@@
  	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
  	 */
  	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+ 		    (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
  	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
  #else
  	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@@ -1672,29 -1659,6 +1662,29 @@@ static void wait_for_master_cpu(int cpu
  #endif
  }
  
 +#ifdef CONFIG_X86_64
 +static void setup_getcpu(int cpu)
 +{
 +	unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
 +	struct desc_struct d = { };
 +
 +	if (static_cpu_has(X86_FEATURE_RDTSCP))
 +		write_rdtscp_aux(cpudata);
 +
 +	/* Store CPU and node number in limit. */
 +	d.limit0 = cpudata;
 +	d.limit1 = cpudata >> 16;
 +
 +	d.type = 5;		/* RO data, expand down, accessed */
 +	d.dpl = 3;		/* Visible to user code */
 +	d.s = 1;		/* Not a system segment */
 +	d.p = 1;		/* Present */
 +	d.d = 1;		/* 32-bit */
 +
 +	write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
 +}
 +#endif
 +
  /*
   * cpu_init() initializes state that is per-CPU. Some data is already
   * initialized (naturally) in the bootstrap process, such as the GDT
@@@ -1732,7 -1696,6 +1722,7 @@@ void cpu_init(void
  	    early_cpu_to_node(cpu) != NUMA_NO_NODE)
  		set_numa_node(early_cpu_to_node(cpu));
  #endif
 +	setup_getcpu(cpu);
  
  	me = current;
  
diff --combined arch/x86/kernel/kprobes/core.c
index f72a47b602e2,f802cf5b4478..c33b06f5faa4
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@@ -1020,26 -1020,56 +1020,18 @@@ int kprobe_fault_handler(struct pt_reg
  		 */
  		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
  			return 1;
 -
 -		/*
 -		 * In case the user-specified fault handler returned
 -		 * zero, try to fix up.
 -		 */
 -		if (fixup_exception(regs, trapnr))
 -			return 1;
 -
 -		/*
 -		 * fixup routine could not handle it,
 -		 * Let do_page_fault() fix it.
 -		 */
  	}
  
  	return 0;
  }
  NOKPROBE_SYMBOL(kprobe_fault_handler);
  
 -/*
 - * Wrapper routine for handling exceptions.
 - */
 -int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
 -			     void *data)
 -{
 -	struct die_args *args = data;
 -	int ret = NOTIFY_DONE;
 -
 -	if (args->regs && user_mode(args->regs))
 -		return ret;
 -
 -	if (val == DIE_GPF) {
 -		/*
 -		 * To be potentially processing a kprobe fault and to
 -		 * trust the result from kprobe_running(), we have
 -		 * be non-preemptible.
 -		 */
 -		if (!preemptible() && kprobe_running() &&
 -		    kprobe_fault_handler(args->regs, args->trapnr))
 -			ret = NOTIFY_STOP;
 -	}
 -	return ret;
 -}
 -NOKPROBE_SYMBOL(kprobe_exceptions_notify);
 -
  bool arch_within_kprobe_blacklist(unsigned long addr)
  {
- 	bool is_in_entry_trampoline_section = false;
- 
- #ifdef CONFIG_X86_64
- 	is_in_entry_trampoline_section =
- 		(addr >= (unsigned long)__entry_trampoline_start &&
- 		 addr < (unsigned long)__entry_trampoline_end);
- #endif
  	return  (addr >= (unsigned long)__kprobes_text_start &&
  		 addr < (unsigned long)__kprobes_text_end) ||
  		(addr >= (unsigned long)__entry_text_start &&
- 		 addr < (unsigned long)__entry_text_end) ||
- 		is_in_entry_trampoline_section;
+ 		 addr < (unsigned long)__entry_text_end);
  }
  
  int __init arch_init_kprobes(void)
diff --combined arch/x86/kernel/process_64.c
index d6674a425714,0fa7aa19f09e..31b4755369f0
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@@ -54,16 -54,13 +54,14 @@@
  #include <asm/vdso.h>
  #include <asm/intel_rdt_sched.h>
  #include <asm/unistd.h>
 +#include <asm/fsgsbase.h>
  #ifdef CONFIG_IA32_EMULATION
  /* Not included via unistd.h */
  #include <asm/unistd_32_ia32.h>
  #endif
  
- __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
- 
  /* Prints also some state that isn't saved in the pt_regs */
 -void __show_regs(struct pt_regs *regs, int all)
 +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
  {
  	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
  	unsigned long d0, d1, d2, d3, d6, d7;
@@@ -88,17 -85,9 +86,17 @@@
  	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
  	       regs->r13, regs->r14, regs->r15);
  
 -	if (!all)
 +	if (mode == SHOW_REGS_SHORT)
  		return;
  
 +	if (mode == SHOW_REGS_USER) {
 +		rdmsrl(MSR_FS_BASE, fs);
 +		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 +		printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
 +		       fs, shadowgs);
 +		return;
 +	}
 +
  	asm("movl %%ds,%0" : "=r" (ds));
  	asm("movl %%cs,%0" : "=r" (cs));
  	asm("movl %%es,%0" : "=r" (es));
@@@ -287,138 -276,6 +285,138 @@@ static __always_inline void load_seg_le
  	}
  }
  
 +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
 +					      struct thread_struct *next)
 +{
 +	load_seg_legacy(prev->fsindex, prev->fsbase,
 +			next->fsindex, next->fsbase, FS);
 +	load_seg_legacy(prev->gsindex, prev->gsbase,
 +			next->gsindex, next->gsbase, GS);
 +}
 +
 +static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
 +					    unsigned short selector)
 +{
 +	unsigned short idx = selector >> 3;
 +	unsigned long base;
 +
 +	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
 +		if (unlikely(idx >= GDT_ENTRIES))
 +			return 0;
 +
 +		/*
 +		 * There are no user segments in the GDT with nonzero bases
 +		 * other than the TLS segments.
 +		 */
 +		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
 +			return 0;
 +
 +		idx -= GDT_ENTRY_TLS_MIN;
 +		base = get_desc_base(&task->thread.tls_array[idx]);
 +	} else {
 +#ifdef CONFIG_MODIFY_LDT_SYSCALL
 +		struct ldt_struct *ldt;
 +
 +		/*
 +		 * If performance here mattered, we could protect the LDT
 +		 * with RCU.  This is a slow path, though, so we can just
 +		 * take the mutex.
 +		 */
 +		mutex_lock(&task->mm->context.lock);
 +		ldt = task->mm->context.ldt;
 +		if (unlikely(idx >= ldt->nr_entries))
 +			base = 0;
 +		else
 +			base = get_desc_base(ldt->entries + idx);
 +		mutex_unlock(&task->mm->context.lock);
 +#else
 +		base = 0;
 +#endif
 +	}
 +
 +	return base;
 +}
 +
 +void x86_fsbase_write_cpu(unsigned long fsbase)
 +{
 +	/*
 +	 * Set the selector to 0 as a notion, that the segment base is
 +	 * overwritten, which will be checked for skipping the segment load
 +	 * during context switch.
 +	 */
 +	loadseg(FS, 0);
 +	wrmsrl(MSR_FS_BASE, fsbase);
 +}
 +
 +void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
 +{
 +	/* Set the selector to 0 for the same reason as %fs above. */
 +	loadseg(GS, 0);
 +	wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
 +}
 +
 +unsigned long x86_fsbase_read_task(struct task_struct *task)
 +{
 +	unsigned long fsbase;
 +
 +	if (task == current)
 +		fsbase = x86_fsbase_read_cpu();
 +	else if (task->thread.fsindex == 0)
 +		fsbase = task->thread.fsbase;
 +	else
 +		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
 +
 +	return fsbase;
 +}
 +
 +unsigned long x86_gsbase_read_task(struct task_struct *task)
 +{
 +	unsigned long gsbase;
 +
 +	if (task == current)
 +		gsbase = x86_gsbase_read_cpu_inactive();
 +	else if (task->thread.gsindex == 0)
 +		gsbase = task->thread.gsbase;
 +	else
 +		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
 +
 +	return gsbase;
 +}
 +
 +int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 +{
 +	/*
 +	 * Not strictly needed for %fs, but do it for symmetry
 +	 * with %gs
 +	 */
 +	if (unlikely(fsbase >= TASK_SIZE_MAX))
 +		return -EPERM;
 +
 +	preempt_disable();
 +	task->thread.fsbase = fsbase;
 +	if (task == current)
 +		x86_fsbase_write_cpu(fsbase);
 +	task->thread.fsindex = 0;
 +	preempt_enable();
 +
 +	return 0;
 +}
 +
 +int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 +{
 +	if (unlikely(gsbase >= TASK_SIZE_MAX))
 +		return -EPERM;
 +
 +	preempt_disable();
 +	task->thread.gsbase = gsbase;
 +	if (task == current)
 +		x86_gsbase_write_cpu_inactive(gsbase);
 +	task->thread.gsindex = 0;
 +	preempt_enable();
 +
 +	return 0;
 +}
 +
  int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
  		unsigned long arg, struct task_struct *p, unsigned long tls)
  {
@@@ -606,7 -463,10 +604,7 @@@ __switch_to(struct task_struct *prev_p
  	if (unlikely(next->ds | prev->ds))
  		loadsegment(ds, next->ds);
  
 -	load_seg_legacy(prev->fsindex, prev->fsbase,
 -			next->fsindex, next->fsbase, FS);
 -	load_seg_legacy(prev->gsindex, prev->gsbase,
 -			next->gsindex, next->gsbase, GS);
 +	x86_fsgsbase_load(prev, next);
  
  	switch_fpu_finish(next_fpu, cpu);
  
@@@ -757,25 -617,54 +755,25 @@@ static long prctl_map_vdso(const struc
  long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
  {
  	int ret = 0;
 -	int doit = task == current;
 -	int cpu;
  
  	switch (option) {
 -	case ARCH_SET_GS:
 -		if (arg2 >= TASK_SIZE_MAX)
 -			return -EPERM;
 -		cpu = get_cpu();
 -		task->thread.gsindex = 0;
 -		task->thread.gsbase = arg2;
 -		if (doit) {
 -			load_gs_index(0);
 -			ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
 -		}
 -		put_cpu();
 +	case ARCH_SET_GS: {
 +		ret = x86_gsbase_write_task(task, arg2);
  		break;
 -	case ARCH_SET_FS:
 -		/* Not strictly needed for fs, but do it for symmetry
 -		   with gs */
 -		if (arg2 >= TASK_SIZE_MAX)
 -			return -EPERM;
 -		cpu = get_cpu();
 -		task->thread.fsindex = 0;
 -		task->thread.fsbase = arg2;
 -		if (doit) {
 -			/* set the selector to 0 to not confuse __switch_to */
 -			loadsegment(fs, 0);
 -			ret = wrmsrl_safe(MSR_FS_BASE, arg2);
 -		}
 -		put_cpu();
 +	}
 +	case ARCH_SET_FS: {
 +		ret = x86_fsbase_write_task(task, arg2);
  		break;
 +	}
  	case ARCH_GET_FS: {
 -		unsigned long base;
 +		unsigned long base = x86_fsbase_read_task(task);
  
 -		if (doit)
 -			rdmsrl(MSR_FS_BASE, base);
 -		else
 -			base = task->thread.fsbase;
  		ret = put_user(base, (unsigned long __user *)arg2);
  		break;
  	}
  	case ARCH_GET_GS: {
 -		unsigned long base;
 +		unsigned long base = x86_gsbase_read_task(task);
  
 -		if (doit)
 -			rdmsrl(MSR_KERNEL_GS_BASE, base);
 -		else
 -			base = task->thread.gsbase;
  		ret = put_user(base, (unsigned long __user *)arg2);
  		break;
  	}
diff --combined arch/x86/kernel/traps.c
index 16c95cb90496,1a90821c0b74..5bd0a997d81e
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@@ -206,7 -206,7 +206,7 @@@ do_trap_no_signal(struct task_struct *t
  	}
  
  	if (!user_mode(regs)) {
 -		if (fixup_exception(regs, trapnr))
 +		if (fixup_exception(regs, trapnr, error_code, 0))
  			return 0;
  
  		tsk->thread.error_code = error_code;
@@@ -383,6 -383,10 +383,10 @@@ dotraplinkage void do_double_fault(stru
  		 * we won't enable interupts or schedule before we invoke
  		 * general_protection, so nothing will clobber the stack
  		 * frame we just set up.
+ 		 *
+ 		 * We will enter general_protection with kernel GSBASE,
+ 		 * which is what the stub expects, given that the faulting
+ 		 * RIP will be the IRET instruction.
  		 */
  		regs->ip = (unsigned long)general_protection;
  		regs->sp = (unsigned long)&gpregs->orig_ax;
@@@ -551,21 -555,11 +555,21 @@@ do_general_protection(struct pt_regs *r
  
  	tsk = current;
  	if (!user_mode(regs)) {
 -		if (fixup_exception(regs, X86_TRAP_GP))
 +		if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
  			return;
  
  		tsk->thread.error_code = error_code;
  		tsk->thread.trap_nr = X86_TRAP_GP;
 +
 +		/*
 +		 * To be potentially processing a kprobe fault and to
 +		 * trust the result from kprobe_running(), we have to
 +		 * be non-preemptible.
 +		 */
 +		if (!preemptible() && kprobe_running() &&
 +		    kprobe_fault_handler(regs, X86_TRAP_GP))
 +			return;
 +
  		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
  			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
  			die("general protection fault", regs, error_code);
@@@ -848,7 -842,7 +852,7 @@@ static void math_error(struct pt_regs *
  	cond_local_irq_enable(regs);
  
  	if (!user_mode(regs)) {
 -		if (fixup_exception(regs, trapnr))
 +		if (fixup_exception(regs, trapnr, error_code, 0))
  			return;
  
  		task->thread.error_code = error_code;
diff --combined arch/x86/kernel/vmlinux.lds.S
index 5dd3317d761f,9c77d2df9c27..0d618ee634ac
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@@ -65,23 -65,6 +65,23 @@@ jiffies_64 = jiffies
  #define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE);
  #define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE);
  
 +/*
 + * This section contains data which will be mapped as decrypted. Memory
 + * encryption operates on a page basis. Make this section PMD-aligned
 + * to avoid splitting the pages while mapping the section early.
 + *
 + * Note: We use a separate section so that only this section gets
 + * decrypted to avoid exposing more than we wish.
 + */
 +#define BSS_DECRYPTED						\
 +	. = ALIGN(PMD_SIZE);					\
 +	__start_bss_decrypted = .;				\
 +	*(.bss..decrypted);					\
 +	. = ALIGN(PAGE_SIZE);					\
 +	__start_bss_decrypted_unused = .;			\
 +	. = ALIGN(PMD_SIZE);					\
 +	__end_bss_decrypted = .;				\
 +
  #else
  
  #define X86_ALIGN_RODATA_BEGIN
@@@ -91,7 -74,6 +91,7 @@@
  
  #define ALIGN_ENTRY_TEXT_BEGIN
  #define ALIGN_ENTRY_TEXT_END
 +#define BSS_DECRYPTED
  
  #endif
  
@@@ -136,16 -118,6 +136,6 @@@ SECTION
  		*(.fixup)
  		*(.gnu.warning)
  
- #ifdef CONFIG_X86_64
- 		. = ALIGN(PAGE_SIZE);
- 		__entry_trampoline_start = .;
- 		_entry_trampoline = .;
- 		*(.entry_trampoline)
- 		. = ALIGN(PAGE_SIZE);
- 		__entry_trampoline_end = .;
- 		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
- #endif
- 
  #ifdef CONFIG_RETPOLINE
  		__indirect_thunk_start = .;
  		*(.text.__x86.indirect_thunk)
@@@ -373,7 -345,6 +363,7 @@@
  		__bss_start = .;
  		*(.bss..page_aligned)
  		*(.bss)
 +		BSS_DECRYPTED
  		. = ALIGN(PAGE_SIZE);
  		__bss_stop = .;
  	}
diff --combined arch/x86/mm/tlb.c
index 7d68489cfdb1,073b8df349a0..bddd6b3cee1d
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@@ -7,6 -7,7 +7,7 @@@
  #include <linux/export.h>
  #include <linux/cpu.h>
  #include <linux/debugfs.h>
+ #include <linux/ptrace.h>
  
  #include <asm/tlbflush.h>
  #include <asm/mmu_context.h>
@@@ -180,16 -181,26 +181,29 @@@ static void sync_current_stack_to_mm(st
  	}
  }
  
+ static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+ {
+ 	/*
+ 	 * Check if the current (previous) task has access to the memory
+ 	 * of the @tsk (next) task. If access is denied, make sure to
+ 	 * issue an IBPB to stop user->user Spectre-v2 attacks.
+ 	 *
+ 	 * Note: __ptrace_may_access() returns 0 or -ERRNO.
+ 	 */
+ 	return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
+ 		ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+ }
+ 
  void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
  			struct task_struct *tsk)
  {
  	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
  	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 +	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
  	unsigned cpu = smp_processor_id();
  	u64 next_tlb_gen;
 +	bool need_flush;
 +	u16 new_asid;
  
  	/*
  	 * NB: The scheduler will call us with prev == next when switching
@@@ -243,41 -254,20 +257,41 @@@
  			   next->context.ctx_id);
  
  		/*
 -		 * We don't currently support having a real mm loaded without
 -		 * our cpu set in mm_cpumask().  We have all the bookkeeping
 -		 * in place to figure out whether we would need to flush
 -		 * if our cpu were cleared in mm_cpumask(), but we don't
 -		 * currently use it.
 +		 * Even in lazy TLB mode, the CPU should stay set in the
 +		 * mm_cpumask. The TLB shootdown code can figure out from
 +		 * cpu_tlbstate.is_lazy whether or not to send an IPI.
  		 */
  		if (WARN_ON_ONCE(real_prev != &init_mm &&
  				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
  			cpumask_set_cpu(cpu, mm_cpumask(next));
  
 -		return;
 +		/*
 +		 * If the CPU is not in lazy TLB mode, we are just switching
 +		 * from one thread in a process to another thread in the same
 +		 * process. No TLB flush required.
 +		 */
 +		if (!was_lazy)
 +			return;
 +
 +		/*
 +		 * Read the tlb_gen to check whether a flush is needed.
 +		 * If the TLB is up to date, just use it.
 +		 * The barrier synchronizes with the tlb_gen increment in
 +		 * the TLB shootdown code.
 +		 */
 +		smp_mb();
 +		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 +		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
 +				next_tlb_gen)
 +			return;
 +
 +		/*
 +		 * TLB contents went out of date while we were in lazy
 +		 * mode. Fall through to the TLB switching code below.
 +		 */
 +		new_asid = prev_asid;
 +		need_flush = true;
  	} else {
 -		u16 new_asid;
 -		bool need_flush;
  		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
  
  		/*
@@@ -286,18 -276,13 +300,13 @@@
  		 * one process from doing Spectre-v2 attacks on another.
  		 *
  		 * As an optimization, flush indirect branches only when
- 		 * switching into processes that disable dumping. This
- 		 * protects high value processes like gpg, without having
- 		 * too high performance overhead. IBPB is *expensive*!
- 		 *
- 		 * This will not flush branches when switching into kernel
- 		 * threads. It will also not flush if we switch to idle
- 		 * thread and back to the same process. It will flush if we
- 		 * switch to a different non-dumpable process.
+ 		 * switching into a process that can't be ptraced by the
+ 		 * current one (if it can be, the attacker already has a much
+ 		 * more convenient way to tamper with the next process than
+ 		 * branch buffer poisoning).
  		 */
- 		if (tsk && tsk->mm &&
- 		    tsk->mm->context.ctx_id != last_ctx_id &&
- 		    get_dumpable(tsk->mm) != SUID_DUMP_USER)
+ 		if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
+ 				ibpb_needed(tsk, last_ctx_id))
  			indirect_branch_prediction_barrier();
  
  		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@@ -332,48 -317,46 +341,48 @@@
  		/* Let nmi_uaccess_okay() know that we're changing CR3. */
  		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
  		barrier();
 +	}
  
 -		if (need_flush) {
 -			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 -			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 -			load_new_mm_cr3(next->pgd, new_asid, true);
 -
 -			/*
 -			 * NB: This gets called via leave_mm() in the idle path
 -			 * where RCU functions differently.  Tracing normally
 -			 * uses RCU, so we need to use the _rcuidle variant.
 -			 *
 -			 * (There is no good reason for this.  The idle code should
 -			 *  be rearranged to call this before rcu_idle_enter().)
 -			 */
 -			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 -		} else {
 -			/* The new ASID is already up to date. */
 -			load_new_mm_cr3(next->pgd, new_asid, false);
 -
 -			/* See above wrt _rcuidle. */
 -			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 -		}
 +	if (need_flush) {
 +		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 +		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 +		load_new_mm_cr3(next->pgd, new_asid, true);
  
  		/*
 -		 * Record last user mm's context id, so we can avoid
 -		 * flushing branch buffer with IBPB if we switch back
 -		 * to the same user.
 +		 * NB: This gets called via leave_mm() in the idle path
 +		 * where RCU functions differently.  Tracing normally
 +		 * uses RCU, so we need to use the _rcuidle variant.
 +		 *
 +		 * (There is no good reason for this.  The idle code should
 +		 *  be rearranged to call this before rcu_idle_enter().)
  		 */
 -		if (next != &init_mm)
 -			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 -
 -		/* Make sure we write CR3 before loaded_mm. */
 -		barrier();
 +		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 +	} else {
 +		/* The new ASID is already up to date. */
 +		load_new_mm_cr3(next->pgd, new_asid, false);
  
 -		this_cpu_write(cpu_tlbstate.loaded_mm, next);
 -		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 +		/* See above wrt _rcuidle. */
 +		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
  	}
  
 -	load_mm_cr4(next);
 -	switch_ldt(real_prev, next);
 +	/*
 +	 * Record last user mm's context id, so we can avoid
 +	 * flushing branch buffer with IBPB if we switch back
 +	 * to the same user.
 +	 */
 +	if (next != &init_mm)
 +		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 +
 +	/* Make sure we write CR3 before loaded_mm. */
 +	barrier();
 +
 +	this_cpu_write(cpu_tlbstate.loaded_mm, next);
 +	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 +
 +	if (next != real_prev) {
 +		load_mm_cr4(next);
 +		switch_ldt(real_prev, next);
 +	}
  }
  
  /*
@@@ -394,7 -377,20 +403,7 @@@ void enter_lazy_tlb(struct mm_struct *m
  	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
  		return;
  
 -	if (tlb_defer_switch_to_init_mm()) {
 -		/*
 -		 * There's a significant optimization that may be possible
 -		 * here.  We have accurate enough TLB flush tracking that we
 -		 * don't need to maintain coherence of TLB per se when we're
 -		 * lazy.  We do, however, need to maintain coherence of
 -		 * paging-structure caches.  We could, in principle, leave our
 -		 * old mm loaded and only switch to init_mm when
 -		 * tlb_remove_page() happens.
 -		 */
 -		this_cpu_write(cpu_tlbstate.is_lazy, true);
 -	} else {
 -		switch_mm(NULL, &init_mm, NULL);
 -	}
 +	this_cpu_write(cpu_tlbstate.is_lazy, true);
  }
  
  /*
@@@ -481,9 -477,6 +490,9 @@@ static void flush_tlb_func_common(cons
  		 * paging-structure cache to avoid speculatively reading
  		 * garbage into our TLB.  Since switching to init_mm is barely
  		 * slower than a minimal flush, just switch to init_mm.
 +		 *
 +		 * This should be rare, with native_flush_tlb_others skipping
 +		 * IPIs to lazy TLB mode CPUs.
  		 */
  		switch_mm_irqs_off(NULL, &init_mm, NULL);
  		return;
@@@ -544,16 -537,17 +553,16 @@@
  	    f->new_tlb_gen == local_tlb_gen + 1 &&
  	    f->new_tlb_gen == mm_tlb_gen) {
  		/* Partial flush */
 -		unsigned long addr;
 -		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
 +		unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
 +		unsigned long addr = f->start;
  
 -		addr = f->start;
  		while (addr < f->end) {
  			__flush_tlb_one_user(addr);
 -			addr += PAGE_SIZE;
 +			addr += 1UL << f->stride_shift;
  		}
  		if (local)
 -			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
 -		trace_tlb_flush(reason, nr_pages);
 +			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
 +		trace_tlb_flush(reason, nr_invalidate);
  	} else {
  		/* Full flush. */
  		local_flush_tlb();
@@@ -586,11 -580,6 +595,11 @@@ static void flush_tlb_func_remote(void 
  	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
  }
  
 +static bool tlb_is_not_lazy(int cpu, void *data)
 +{
 +	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
 +}
 +
  void native_flush_tlb_others(const struct cpumask *cpumask,
  			     const struct flush_tlb_info *info)
  {
@@@ -626,23 -615,8 +635,23 @@@
  					       (void *)info, 1);
  		return;
  	}
 -	smp_call_function_many(cpumask, flush_tlb_func_remote,
 +
 +	/*
 +	 * If no page tables were freed, we can skip sending IPIs to
 +	 * CPUs in lazy TLB mode. They will flush the CPU themselves
 +	 * at the next context switch.
 +	 *
 +	 * However, if page tables are getting freed, we need to send the
 +	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
 +	 * up on the new contents of what used to be page tables, while
 +	 * doing a speculative memory access.
 +	 */
 +	if (info->freed_tables)
 +		smp_call_function_many(cpumask, flush_tlb_func_remote,
  			       (void *)info, 1);
 +	else
 +		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
 +				(void *)info, 1, GFP_ATOMIC, cpumask);
  }
  
  /*
@@@ -658,15 -632,12 +667,15 @@@
  static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
  
  void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 -				unsigned long end, unsigned long vmflag)
 +				unsigned long end, unsigned int stride_shift,
 +				bool freed_tables)
  {
  	int cpu;
  
  	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
  		.mm = mm,
 +		.stride_shift = stride_shift,
 +		.freed_tables = freed_tables,
  	};
  
  	cpu = get_cpu();
@@@ -676,7 -647,8 +685,7 @@@
  
  	/* Should we flush just the requested range? */
  	if ((end != TLB_FLUSH_ALL) &&
 -	    !(vmflag & VM_HUGETLB) &&
 -	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
 +	    ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
  		info.start = start;
  		info.end = end;
  	} else {
diff --combined kernel/cpu.c
index e82920b8bee1,2fb49916ea56..3c7f3b4c453c
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@@ -315,16 -315,6 +315,16 @@@ void lockdep_assert_cpus_held(void
  	percpu_rwsem_assert_held(&cpu_hotplug_lock);
  }
  
 +static void lockdep_acquire_cpus_lock(void)
 +{
 +	rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
 +}
 +
 +static void lockdep_release_cpus_lock(void)
 +{
 +	rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
 +}
 +
  /*
   * Wait for currently running CPU hotplug operations to complete (if any) and
   * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@@ -354,17 -344,6 +354,17 @@@ void cpu_hotplug_enable(void
  	cpu_maps_update_done();
  }
  EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 +
 +#else
 +
 +static void lockdep_acquire_cpus_lock(void)
 +{
 +}
 +
 +static void lockdep_release_cpus_lock(void)
 +{
 +}
 +
  #endif	/* CONFIG_HOTPLUG_CPU */
  
  #ifdef CONFIG_HOTPLUG_SMT
@@@ -383,7 -362,6 +383,7 @@@ void __init cpu_smt_disable(bool force
  		pr_info("SMT: Force disabled\n");
  		cpu_smt_control = CPU_SMT_FORCE_DISABLED;
  	} else {
 +		pr_info("SMT: disabled\n");
  		cpu_smt_control = CPU_SMT_DISABLED;
  	}
  }
@@@ -629,21 -607,15 +629,21 @@@ static void cpuhp_thread_fun(unsigned i
  	bool bringup = st->bringup;
  	enum cpuhp_state state;
  
 +	if (WARN_ON_ONCE(!st->should_run))
 +		return;
 +
  	/*
  	 * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
  	 * that if we see ->should_run we also see the rest of the state.
  	 */
  	smp_mb();
  
 -	if (WARN_ON_ONCE(!st->should_run))
 -		return;
 -
 +	/*
 +	 * The BP holds the hotplug lock, but we're now running on the AP,
 +	 * ensure that anybody asserting the lock is held, will actually find
 +	 * it so.
 +	 */
 +	lockdep_acquire_cpus_lock();
  	cpuhp_lock_acquire(bringup);
  
  	if (st->single) {
@@@ -689,7 -661,6 +689,7 @@@
  	}
  
  	cpuhp_lock_release(bringup);
 +	lockdep_release_cpus_lock();
  
  	if (!st->should_run)
  		complete_ap_thread(st, bringup);
@@@ -945,8 -916,7 +945,8 @@@ static int cpuhp_down_callbacks(unsigne
  		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
  		if (ret) {
  			st->target = prev_state;
 -			undo_cpu_down(cpu, st);
 +			if (st->state < prev_state)
 +				undo_cpu_down(cpu, st);
  			break;
  		}
  	}
@@@ -999,7 -969,7 +999,7 @@@ static int __ref _cpu_down(unsigned in
  	 * to do the further cleanups.
  	 */
  	ret = cpuhp_down_callbacks(cpu, st, target);
 -	if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
 +	if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
  		cpuhp_reset_state(st, prev_state);
  		__cpuhp_kick_ap(st);
  	}
@@@ -2055,6 -2025,12 +2055,12 @@@ static void cpuhp_online_cpu_device(uns
  	kobject_uevent(&dev->kobj, KOBJ_ONLINE);
  }
  
+ /*
+  * Architectures that need SMT-specific errata handling during SMT hotplug
+  * should override this.
+  */
+ void __weak arch_smt_update(void) { };
+ 
  static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
  {
  	int cpu, ret = 0;
@@@ -2081,8 -2057,10 +2087,10 @@@
  		 */
  		cpuhp_offline_cpu_device(cpu);
  	}
- 	if (!ret)
+ 	if (!ret) {
  		cpu_smt_control = ctrlval;
+ 		arch_smt_update();
+ 	}
  	cpu_maps_update_done();
  	return ret;
  }
@@@ -2093,6 -2071,7 +2101,7 @@@ static int cpuhp_smt_enable(void
  
  	cpu_maps_update_begin();
  	cpu_smt_control = CPU_SMT_ENABLED;
+ 	arch_smt_update();
  	for_each_present_cpu(cpu) {
  		/* Skip online CPUs and CPUs on offline nodes */
  		if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))