+====================================================
+Complete virtual memory map with 4-level page tables
+====================================================
-Virtual memory map with 4 level page tables:
-
-0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [47:63] sign extension
-ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
-ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
-ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
-ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
-ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
-ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
-... unused hole ...
-ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
-... unused hole ...
- vaddr_end for KASLR
-fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
-fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
-ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-... unused hole ...
-ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
-... unused hole ...
-ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
-[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
-ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
-ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
-STACKLEAK_POISON value in this last hole: ffffffffffff4111
-
-Virtual memory map with 5 level page tables:
-
-0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
-hole caused by [56:63] sign extension
-ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
-ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
-ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
-ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
-ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
-... unused hole ...
-ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
-... unused hole ...
- vaddr_end for KASLR
-fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
-... unused hole ...
-ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-... unused hole ...
-ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
-... unused hole ...
-ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
-[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
-ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
-ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
-STACKLEAK_POISON value in this last hole: ffffffffffff4111
+Notes:
+
+ - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
+ from the top of the 64-bit address space. It's easier to understand the layout
+ when seen both in absolute addresses and in distance-from-top notation.
+
+ For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the
+ 64-bit address space (ffffffffffffffff).
+
+ Note that as we get closer to the top of the address space, the notation changes
+ from TB to GB and then MB/KB.
+
+ - "16M TB" might look weird at first sight, but it's an easier to visualize size
+ notation than "16 EB", which few will recognize at first sight as 16 exabytes.
+ It also shows it nicely how incredibly large 64-bit address space is.
+
+========================================================================================================================
+ Start addr | Offset | End addr | Size | VM area description
+========================================================================================================================
+ | | | |
+ 0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm
+__________________|____________|__________________|_________|___________________________________________________________
+ | | | |
+ 0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
+ | | | | virtual memory addresses up to the -128 TB
+ | | | | starting offset of kernel mappings.
+__________________|____________|__________________|_________|___________________________________________________________
+ |
+ | Kernel-space virtual memory, shared between all processes:
+____________________________________________________________|___________________________________________________________
+ | | | |
+ ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor
+ ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base)
+ ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole
+ ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base)
+ ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole
+ ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base)
+ ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole
+ ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory
+ fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
+ | | | | vaddr_end for KASLR
+ fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
+ fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI
+ ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
+__________________|____________|__________________|_________|____________________________________________________________
+ |
+ | Identical layout to the 47-bit one from here on:
+____________________________________________________________|____________________________________________________________
+ | | | |
+ ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
+ ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
+ ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
+ ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 |-2048 MB | | |
+ ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
+ ffffffffff000000 | -16 MB | | |
+ FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
+ ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
+__________________|____________|__________________|_________|___________________________________________________________
+
+
+====================================================
+Complete virtual memory map with 5-level page tables
+====================================================
+
+Notes:
+
+ - With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
+ from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting
+ offset and many of the regions expand to support the much larger physical
+ memory supported.
+
+========================================================================================================================
+ Start addr | Offset | End addr | Size | VM area description
+========================================================================================================================
+ | | | |
+ 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm
+__________________|____________|__________________|_________|___________________________________________________________
+ | | | |
+ 0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
+ | | | | virtual memory addresses up to the -128 TB
+ | | | | starting offset of kernel mappings.
+__________________|____________|__________________|_________|___________________________________________________________
+ |
+ | Kernel-space virtual memory, shared between all processes:
+____________________________________________________________|___________________________________________________________
+ | | | |
+ ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor
+ ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base)
+ ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI
+ ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
+ ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole
+ ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base)
+ ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole
+ ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory
+ fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole
+ | | | | vaddr_end for KASLR
+ fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping
+ fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole
+ ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks
+__________________|____________|__________________|_________|____________________________________________________________
+ |
+ | Identical layout to the 47-bit one from here on:
+____________________________________________________________|____________________________________________________________
+ | | | |
+ ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
+ ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
+ ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
+ ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 |-2048 MB | | |
+ ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
+ ffffffffff000000 | -16 MB | | |
+ FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI
+ ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole
+__________________|____________|__________________|_________|___________________________________________________________
Architecture defines a 64-bit virtual address. Implementations can support
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
Be very careful vs. KASLR when changing anything here. The KASLR address
range must not overlap with anything except the KASAN shadow area, which is
correct as KASAN disables KASLR.
++
++For both 4- and 5-level layouts, the STACKLEAK_POISON value in the last 2MB
++hole: ffffffffffff4111
This symbol should be selected by an architecture if it
supports an implementation of restartable sequences.
+config HAVE_FUNCTION_ARG_ACCESS_API
+ bool
+ help
+ This symbol should be selected by an architecure if it supports
+ the API needed to access function arguments from pt_regs,
+ declared in asm/ptrace.h
+
config HAVE_CLK
bool
help
config HAVE_ARCH_JUMP_LABEL
bool
+config HAVE_ARCH_JUMP_LABEL_RELATIVE
+ bool
+
config HAVE_RCU_TABLE_FREE
bool
See Documentation/userspace-api/seccomp_filter.rst for details.
+ config HAVE_ARCH_STACKLEAK
+ bool
+ help
+ An architecture should select this if it has the code which
+ fills the used part of the kernel stack with the STACKLEAK_POISON
+ value before returning from system calls.
+
config HAVE_STACKPROTECTOR
bool
help
if (IS_ENABLED(CONFIG_ARM64_UAO) &&
cpus_have_const_cap(ARM64_HAS_UAO))
childregs->pstate |= PSR_UAO_BIT;
+
+ if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
+ childregs->pstate |= PSR_SSBS_BIT;
+
p->thread.cpu_context.x19 = stack_start;
p->thread.cpu_context.x20 = stk_sz;
}
{
current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
}
-
- #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
- void __used stackleak_check_alloca(unsigned long size)
- {
- unsigned long stack_left;
- unsigned long current_sp = current_stack_pointer;
- struct stack_info info;
-
- BUG_ON(!on_accessible_stack(current, current_sp, &info));
-
- stack_left = current_sp - info.low;
-
- /*
- * There's a good chance we're almost out of stack space if this
- * is true. Using panic() over BUG() is more likely to give
- * reliable debugging output.
- */
- if (size >= stack_left)
- panic("alloca() over the kernel stack boundary\n");
- }
- EXPORT_SYMBOL(stackleak_check_alloca);
- #endif
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ANON_INODES
select ARCH_CLOCKSOURCE_DATA
+ select ARCH_CLOCKSOURCE_INIT
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_DEBUG_VIRTUAL
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_JUMP_LABEL
+ select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if X86_64
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_PREL32_RELOCATIONS
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
+ select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
select HAVE_KRETPROBES
select HAVE_KVM
select HAVE_LIVEPATCH if X86_64
- select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
+ select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
select HAVE_STACK_VALIDATION if X86_64
select HAVE_RSEQ
config INTEL_RDT
bool "Intel Resource Director Technology support"
- default n
depends on X86 && CPU_SUP_INTEL
select KERNFS
help
bool "ScaleMP vSMP"
select HYPERVISOR_GUEST
select PARAVIRT
+ select PARAVIRT_XXL
depends on X86_64 && PCI
depends on X86_EXTENDED_PLATFORM
depends on SMP
select SWIOTLB
select MFD_STA2X11
select GPIOLIB
- default n
---help---
This adds support for boards based on the STA2X11 IO-Hub,
a.k.a. "ConneXt". The chip is used in place of the standard
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
+config PARAVIRT_XXL
+ bool
+
config PARAVIRT_DEBUG
bool "paravirt-ops debugging"
depends on PARAVIRT && DEBUG_KERNEL
config KVM_DEBUG_FS
bool "Enable debug information for KVM Guests in debugfs"
depends on KVM_GUEST && DEBUG_FS
- default n
---help---
This option enables collection of various statistics for KVM guest.
Statistics are displayed in debugfs filesystem. Enabling this option
config PARAVIRT_TIME_ACCOUNTING
bool "Paravirtual steal time accounting"
depends on PARAVIRT
- default n
---help---
Select this option to enable fine granularity task steal time
accounting. Time spent executing other tasks in parallel with
endif #HYPERVISOR_GUEST
-config NO_BOOTMEM
- def_bool y
-
source "arch/x86/Kconfig.cpu"
config HPET_TIMER
config X86_LEGACY_VM86
bool "Legacy VM86 support"
- default n
depends on X86_32
---help---
This option allows user programs to put the CPU into V8086
supports them), so don't confuse the user by printing
that we have them enabled.
+config X86_CPA_STATISTICS
+ bool "Enable statistic for Change Page Attribute"
+ depends on DEBUG_FS
+ ---help---
+ Expose statistics about the Change Page Attribute mechanims, which
+ helps to determine the effectivness of preserving large and huge
+ page mappings when mapping protections are changed.
+
config ARCH_HAS_MEM_ENCRYPT
def_bool y
config BOOTPARAM_HOTPLUG_CPU0
bool "Set default setting of cpu0_hotpluggable"
- default n
depends on HOTPLUG_CPU
---help---
Set whether default state of cpu0_hotpluggable is on or off.
config ARCH_HIBERNATION_HEADER
def_bool y
- depends on X86_64 && HIBERNATION
+ depends on HIBERNATION
source "kernel/power/Kconfig"
config OLPC_XO1_PM
bool "OLPC XO-1 Power Management"
- depends on OLPC && MFD_CS5535 && PM_SLEEP
- select MFD_CORE
+ depends on OLPC && MFD_CS5535=y && PM_SLEEP
---help---
Add support for poweroff and suspend of the OLPC XO-1 laptop.
config RAPIDIO
tristate "RapidIO support"
depends on PCI
- default n
help
If enabled this option will include drivers and the core
infrastructure code to support RapidIO interconnect devices.
#endif
+ .macro STACKLEAK_ERASE_NOCLOBBER
+ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ PUSH_AND_CLEAR_REGS
+ call stackleak_erase
+ POP_REGS
+ #endif
+ .endm
+
#endif /* CONFIG_X86_64 */
+ .macro STACKLEAK_ERASE
+ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ call stackleak_erase
+ #endif
+ .endm
+
/*
* This does 'call enter_from_user_mode' unless we can avoid it based on
* kernel config or using the static jump infrastructure.
.macro CALL_enter_from_user_mode
#ifdef CONFIG_CONTEXT_TRACKING
#ifdef HAVE_JUMP_LABEL
- STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+ STATIC_BRANCH_JMP l_yes=.Lafter_call_\@, key=context_tracking_enabled, branch=1
#endif
call enter_from_user_mode
.Lafter_call_\@:
#include <asm/frame.h>
#include <asm/nospec-branch.h>
+ #include "calling.h"
+
.section .entry.text, "ax"
/*
* that register for the time this macro runs
*/
+ /*
+ * The high bits of the CS dword (__csh) are used for
+ * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
+ * hardware didn't do this for us.
+ */
+ andl $(0x0000ffff), PT_CS(%esp)
+
/* Are we on the entry stack? Bail out if not! */
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
/* Load top of task-stack into %edi */
movl TSS_entry2task_stack(%edi), %edi
- /*
- * Clear unused upper bits of the dword containing the word-sized CS
- * slot in pt_regs in case hardware didn't clear it for us.
- */
- andl $(0x0000ffff), PT_CS(%esp)
-
/* Special case - entry from kernel mode via entry stack */
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
/* When we fork, we trace the syscall return in the child, too. */
movl %esp, %eax
call syscall_return_slowpath
+ STACKLEAK_ERASE
jmp restore_all
/* kernel thread */
* will ignore all of the single-step traps generated in this range.
*/
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
/*
* Xen doesn't set %esp to be precisely what the normal SYSENTER
* entry point expects, so fix it up before using the normal path.
ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
"jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ STACKLEAK_ERASE
+
/* Opportunistic SYSEXIT */
TRACE_IRQS_ON /* User mode traces as IRQs on. */
call do_int80_syscall_32
.Lsyscall_32_done:
+ STACKLEAK_ERASE
+
restore_all:
TRACE_IRQS_IRET
SWITCH_TO_ENTRY_STACK
jmp common_exception
END(spurious_interrupt_bug)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
ENTRY(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
_ASM_EXTABLE(3b, 8b)
_ASM_EXTABLE(4b, 9b)
ENDPROC(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+#ifdef CONFIG_XEN_PVHVM
BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
xen_evtchn_do_upcall)
+#endif
-#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
* with them due to bugs in both AMD and Intel CPUs.
*/
- .pushsection .entry_trampoline, "ax"
-
-/*
- * The code in here gets remapped into cpu_entry_area's trampoline. This means
- * that the assembler and linker have the wrong idea as to where this code
- * lives (and, in fact, it's mapped more than once, so it's not even at a
- * fixed address). So we can't reference any symbols outside the entry
- * trampoline and expect it to work.
- *
- * Instead, we carefully abuse %rip-relative addressing.
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
- * trampoline. We can thus find cpu_entry_area with this macro:
- */
-
-#define CPU_ENTRY_AREA \
- _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
-
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
- SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
-
-ENTRY(entry_SYSCALL_64_trampoline)
- UNWIND_HINT_EMPTY
- swapgs
-
- /* Stash the user RSP. */
- movq %rsp, RSP_SCRATCH
-
- /* Note: using %rsp as a scratch reg. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
- /* Load the top of the task stack into RSP */
- movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-
- /* Start building the simulated IRET frame. */
- pushq $__USER_DS /* pt_regs->ss */
- pushq RSP_SCRATCH /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
-
- /*
- * x86 lacks a near absolute jump, and we can't jump to the real
- * entry text with a relative jump. We could push the target
- * address and then use retq, but this destroys the pipeline on
- * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
- * spill RDI and restore it in a second-stage trampoline.
- */
- pushq %rdi
- movq $entry_SYSCALL_64_stage2, %rdi
- JMP_NOSPEC %rdi
-END(entry_SYSCALL_64_trampoline)
-
- .popsection
-
-ENTRY(entry_SYSCALL_64_stage2)
- UNWIND_HINT_EMPTY
- popq %rdi
- jmp entry_SYSCALL_64_after_hwframe
-END(entry_SYSCALL_64_stage2)
-
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
*/
swapgs
- /*
- * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
- * is not required to switch CR3.
- */
- movq %rsp, PER_CPU_VAR(rsp_scratch)
+ /* tss.sp2 is scratch space. */
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
- pushq $__USER_DS /* pt_regs->ss */
- pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
- pushq %rax /* pt_regs->orig_ax */
+ pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
+ STACKLEAK_ERASE_NOCLOBBER
+
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
+ STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
*/
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+/**
+ * idtentry - Generate an IDT entry stub
+ * @sym: Name of the generated entry point
+ * @do_sym: C function to be called
+ * @has_error_code: True if this IDT vector has an error code on the stack
+ * @paranoid: non-zero means that this vector may be invoked from
+ * kernel mode with user GSBASE and/or user CR3.
+ * 2 is special -- see below.
+ * @shift_ist: Set to an IST index if entries from kernel mode should
+ * decrement the IST stack so that nested entries get a
+ * fresh stack. (This is for #DB, which has a nasty habit
+ * of recursing.)
+ *
+ * idtentry generates an IDT stub that sets up a usable kernel context,
+ * creates struct pt_regs, and calls @do_sym. The stub has the following
+ * special behaviors:
+ *
+ * On an entry from user mode, the stub switches from the trampoline or
+ * IST stack to the normal thread stack. On an exit to user mode, the
+ * normal exit-to-usermode path is invoked.
+ *
+ * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
+ * whereas we omit the preemption check if @paranoid != 0. This is purely
+ * because the implementation is simpler this way. The kernel only needs
+ * to check for asynchronous kernel preemption when IRQ handlers return.
+ *
+ * If @paranoid == 0, then the stub will handle IRET faults by pretending
+ * that the fault came from user mode. It will handle gs_change faults by
+ * pretending that the fault happened with kernel GSBASE. Since this handling
+ * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
+ * @paranoid == 0. This special handling will do the wrong thing for
+ * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+ *
+ * @paranoid == 2 is special: the stub will never switch stacks. This is for
+ * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+ */
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
ret
ENDPROC(do_softirq_own_stack)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+#ifdef CONFIG_XEN_PVHVM
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
xen_hvm_callback_vector xen_evtchn_do_upcall
+#endif
-#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
idtentry int3 do_int3 has_error_code=0
idtentry stack_segment do_stack_segment has_error_code=1
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
idtentry xennmi do_nmi has_error_code=0
idtentry xendebug do_debug has_error_code=0
idtentry xenint3 do_int3 has_error_code=0
xorl %ebx, %ebx
1:
+ /*
+ * Always stash CR3 in %r14. This value will be restored,
+ * verbatim, at exit. Needed if paranoid_entry interrupted
+ * another entry that already switched to the user CR3 value
+ * but has not yet returned to userspace.
+ *
+ * This is also why CS (stashed in the "iret frame" by the
+ * hardware at entry) can not be used: this may be a return
+ * to kernel code, but with a user CR3 value.
+ */
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
ret
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
.Lparanoid_exit_restore:
jmp restore_regs_and_return_to_kernel
movq $-1, %rsi
call do_nmi
+ /* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
testl %ebx, %ebx /* swapgs needed? */
CRASHTYPE(USERCOPY_STACK_FRAME_FROM),
CRASHTYPE(USERCOPY_STACK_BEYOND),
CRASHTYPE(USERCOPY_KERNEL),
+ CRASHTYPE(USERCOPY_KERNEL_DS),
+ CRASHTYPE(STACKLEAK_ERASING),
};
void lkdtm_USERCOPY_STACK_FRAME_FROM(void);
void lkdtm_USERCOPY_STACK_BEYOND(void);
void lkdtm_USERCOPY_KERNEL(void);
+void lkdtm_USERCOPY_KERNEL_DS(void);
+ /* lkdtm_stackleak.c */
+ void lkdtm_STACKLEAK_ERASING(void);
+
#endif
unsigned long *entries;
int err;
+ /*
+ * The ability to racily run the kernel stack unwinder on a running task
+ * and then observe the unwinder output is scary; while it is useful for
+ * debugging kernel issues, it can also allow an attacker to leak kernel
+ * stack contents.
+ * Doing this in a manner that is at least safe from races would require
+ * some work to ensure that the remote task can not be scheduled; and
+ * even then, this would still expose the unwinder as local attack
+ * surface.
+ * Therefore, this interface is restricted to root.
+ */
+ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EACCES;
+
entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
GFP_KERNEL);
if (!entries)
}
#endif /* CONFIG_LIVEPATCH */
+ #ifdef CONFIG_STACKLEAK_METRICS
+ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+ {
+ unsigned long prev_depth = THREAD_SIZE -
+ (task->prev_lowest_stack & (THREAD_SIZE - 1));
+ unsigned long depth = THREAD_SIZE -
+ (task->lowest_stack & (THREAD_SIZE - 1));
+
+ seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
+ prev_depth, depth);
+ return 0;
+ }
+ #endif /* CONFIG_STACKLEAK_METRICS */
+
/*
* Thread groups
*/
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
+ #ifdef CONFIG_STACKLEAK_METRICS
+ ONE("stack_depth", S_IRUGO, proc_stack_depth),
+ #endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/signal_types.h>
+#include <linux/psi_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/rseq.h>
struct {
u8 blocked;
u8 need_qs;
- u8 exp_need_qs;
-
- /* Otherwise the compiler can store garbage here: */
- u8 pad;
} b; /* Bits. */
- u32 s; /* Set of bits. */
+ u16 s; /* Set of bits. */
};
enum perf_event_task_context {
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_remote_wakeup:1;
+#ifdef CONFIG_PSI
+ unsigned sched_psi_wake_requeue:1;
+#endif
+
/* Force alignment to the next boundary: */
unsigned :0;
#endif
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
-#ifdef CONFIG_MEMCG_KMEM
- unsigned memcg_kmem_skip_account:1;
-#endif
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
unsigned use_memdelay:1;
#endif
+ /*
+ * May usercopy functions fault on kernel addresses?
+ * This is not just a single bit because this can potentially nest.
+ */
+ unsigned int kernel_uaccess_faults_ok;
+
unsigned long atomic_flags; /* Flags requiring atomic access. */
struct restart_block restart_block;
/* Ptrace state: */
unsigned long ptrace_message;
- siginfo_t *last_siginfo;
+ kernel_siginfo_t *last_siginfo;
struct task_io_accounting ioac;
+#ifdef CONFIG_PSI
+ /* Pressure stall state */
+ unsigned int psi_flags;
+#endif
#ifdef CONFIG_TASK_XACCT
/* Accumulated RSS usage: */
u64 acct_rss_mem1;
void *security;
#endif
+ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ unsigned long lowest_stack;
+ unsigned long prev_lowest_stack;
+ #endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
+#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
+ #include <linux/stackleak.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
return s->addr;
}
+ /*
+ * Allocated stacks are cached and later reused by new threads,
+ * so memcg accounting is performed manually on assigning/releasing
+ * stacks to tasks. Drop __GFP_ACCOUNT.
+ */
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP,
+ THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
- if (task_stack_vm_area(tsk)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+
+ if (vm) {
int i;
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ mod_memcg_page_state(vm->pages[i],
+ MEMCG_KERNEL_STACK_KB,
+ -(int)(PAGE_SIZE / 1024));
+
+ memcg_kmem_uncharge(vm->pages[i], 0);
+ }
+
for (i = 0; i < NR_CACHED_STACKS; i++) {
if (this_cpu_cmpxchg(cached_stacks[i],
NULL, tsk->stack_vm_area) != NULL)
NR_KERNEL_STACK_KB,
PAGE_SIZE / 1024 * account);
}
-
- /* All stack pages belong to the same memcg. */
- mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
} else {
/*
* All stack pages are in the same zone and belong to the
}
}
+static int memcg_charge_kernel_stack(struct task_struct *tsk)
+{
+#ifdef CONFIG_VMAP_STACK
+ struct vm_struct *vm = task_stack_vm_area(tsk);
+ int ret;
+
+ if (vm) {
+ int i;
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ /*
+ * If memcg_kmem_charge() fails, page->mem_cgroup
+ * pointer is NULL, and both memcg_kmem_uncharge()
+ * and mod_memcg_page_state() in free_thread_stack()
+ * will ignore this page. So it's safe.
+ */
+ ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ return ret;
+
+ mod_memcg_page_state(vm->pages[i],
+ MEMCG_KERNEL_STACK_KB,
+ PAGE_SIZE / 1024);
+ }
+ }
+#endif
+ return 0;
+}
+
static void release_task_stack(struct task_struct *tsk)
{
if (WARN_ON(tsk->state != TASK_DEAD))
goto out;
}
/* a new mm has just been created */
- arch_dup_mmap(oldmm, mm);
- retval = 0;
+ retval = arch_dup_mmap(oldmm, mm);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
if (!stack)
goto free_tsk;
+ if (memcg_charge_kernel_stack(tsk))
+ goto free_stack;
+
stack_vm_area = task_stack_vm_area(tsk);
err = arch_dup_task_struct(tsk, orig);
p->default_timer_slack_ns = current->timer_slack_ns;
+#ifdef CONFIG_PSI
+ p->psi_flags = 0;
+#endif
+
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
if (retval)
goto bad_fork_cleanup_io;
+ stackleak_task_init(p);
+
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (IS_ERR(pid)) {