Merge tag 'stackleak-v4.20-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <[email protected]>

Thu, 1 Nov 2018 18:46:27 +0000 (11:46 -0700)

committer Linus Torvalds <[email protected]>

Thu, 1 Nov 2018 18:46:27 +0000 (11:46 -0700)
author Linus Torvalds <[email protected]>
Thu, 1 Nov 2018 18:46:27 +0000 (11:46 -0700)
committer Linus Torvalds <[email protected]>
Thu, 1 Nov 2018 18:46:27 +0000 (11:46 -0700)
diff --combined Documentation/x86/x86_64/mm.txt

index 702898633b0007a1e50670fd05c7c24d58123c6c,600bc2afa27d6d80b8f15f7277ebe7045bec82ce..73aaaa3da4369e39b41bf360b24b820061a15df9
--- 1/Documentation/x86/x86_64/mm.txt
--- 2/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@@ -1,124 -1,57 +1,124 @@@
+ +====================================================
+ +Complete virtual memory map with 4-level page tables
+ +====================================================
   
- -Virtual memory map with 4 level page tables:
- -
- -0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
- -hole caused by [47:63] sign extension
- -ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
- -ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
- -ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
- -ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
- -ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
- -ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
- -... unused hole ...
- -ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
- -... unused hole ...
- -                                  vaddr_end for KASLR
- -fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
- -fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
- -ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
- -... unused hole ...
- -ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
- -... unused hole ...
- -ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
- -ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
- -[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
- -ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
- -ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
- -STACKLEAK_POISON value in this last hole: ffffffffffff4111
- -
- -Virtual memory map with 5 level page tables:
- -
- -0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
- -hole caused by [56:63] sign extension
- -ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
- -ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
- -ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
- -ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
- -ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
- -ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
- -... unused hole ...
- -ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
- -... unused hole ...
- -                                  vaddr_end for KASLR
- -fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
- -... unused hole ...
- -ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
- -... unused hole ...
- -ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
- -... unused hole ...
- -ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
- -ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
- -[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
- -ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
- -ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
- -STACKLEAK_POISON value in this last hole: ffffffffffff4111
+ +Notes:
+ +
+ + - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
+ +   from the top of the 64-bit address space. It's easier to understand the layout
+ +   when seen both in absolute addresses and in distance-from-top notation.
+ +
+ +   For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the
+ +   64-bit address space (ffffffffffffffff).
+ +
+ +   Note that as we get closer to the top of the address space, the notation changes
+ +   from TB to GB and then MB/KB.
+ +
+ + - "16M TB" might look weird at first sight, but it's an easier to visualize size
+ +   notation than "16 EB", which few will recognize at first sight as 16 exabytes.
+ +   It also shows nicely how incredibly large the 64-bit address space is.
+ +
+ +========================================================================================================================
+ +    Start addr    |   Offset   |     End addr     |  Size   | VM area description
+ +========================================================================================================================
+ +                  |            |                  |         |
+ + 0000000000000000 |    0       | 00007fffffffffff |  128 TB | user-space virtual memory, different per mm
+ +__________________|____________|__________________|_________|___________________________________________________________
+ +                  |            |                  |         |
+ + 0000800000000000 | +128    TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
+ +                  |            |                  |         |     virtual memory addresses up to the -128 TB
+ +                  |            |                  |         |     starting offset of kernel mappings.
+ +__________________|____________|__________________|_________|___________________________________________________________
+ +                                                            |
+ +                                                            | Kernel-space virtual memory, shared between all processes:
+ +____________________________________________________________|___________________________________________________________
+ +                  |            |                  |         |
+ + ffff800000000000 | -128    TB | ffff87ffffffffff |    8 TB | ... guard hole, also reserved for hypervisor
+ + ffff880000000000 | -120    TB | ffffc7ffffffffff |   64 TB | direct mapping of all physical memory (page_offset_base)
+ + ffffc80000000000 |  -56    TB | ffffc8ffffffffff |    1 TB | ... unused hole
+ + ffffc90000000000 |  -55    TB | ffffe8ffffffffff |   32 TB | vmalloc/ioremap space (vmalloc_base)
+ + ffffe90000000000 |  -23    TB | ffffe9ffffffffff |    1 TB | ... unused hole
+ + ffffea0000000000 |  -22    TB | ffffeaffffffffff |    1 TB | virtual memory map (vmemmap_base)
+ + ffffeb0000000000 |  -21    TB | ffffebffffffffff |    1 TB | ... unused hole
+ + ffffec0000000000 |  -20    TB | fffffbffffffffff |   16 TB | KASAN shadow memory
+ + fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
+ +                  |            |                  |         | vaddr_end for KASLR
+ + fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
+ + fffffe8000000000 |   -1.5  TB | fffffeffffffffff |  0.5 TB | LDT remap for PTI
+ + ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
+ +__________________|____________|__________________|_________|____________________________________________________________
+ +                                                            |
+ +                                                            | Identical layout to the 56-bit one from here on:
+ +____________________________________________________________|____________________________________________________________
+ +                  |            |                  |         |
+ + ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
+ + ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
+ + ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | ... unused hole
+ + ffffffff80000000 |   -2    GB | ffffffff9fffffff |  512 MB | kernel text mapping, mapped to physical address 0
+ + ffffffff80000000 |-2048    MB |                  |         |
+ + ffffffffa0000000 |-1536    MB | fffffffffeffffff | 1520 MB | module mapping space
+ + ffffffffff000000 |  -16    MB |                  |         |
+ +    FIXADDR_START | ~-11    MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ + ffffffffff600000 |  -10    MB | ffffffffff600fff |    4 kB | legacy vsyscall ABI
+ + ffffffffffe00000 |   -2    MB | ffffffffffffffff |    2 MB | ... unused hole
+ +__________________|____________|__________________|_________|___________________________________________________________
+ +
+ +
+ +====================================================
+ +Complete virtual memory map with 5-level page tables
+ +====================================================
+ +
+ +Notes:
+ +
+ + - With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
+ +   from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
+ +   offset and many of the regions expand to support the much larger physical
+ +   memory supported.
+ +
+ +========================================================================================================================
+ +    Start addr    |   Offset   |     End addr     |  Size   | VM area description
+ +========================================================================================================================
+ +                  |            |                  |         |
+ + 0000000000000000 |    0       | 00ffffffffffffff |   64 PB | user-space virtual memory, different per mm
+ +__________________|____________|__________________|_________|___________________________________________________________
+ +                  |            |                  |         |
+ + 0100000000000000 |  +64    PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
+ +                  |            |                  |         |     virtual memory addresses up to the -64 PB
+ +                  |            |                  |         |     starting offset of kernel mappings.
+ +__________________|____________|__________________|_________|___________________________________________________________
+ +                                                            |
+ +                                                            | Kernel-space virtual memory, shared between all processes:
+ +____________________________________________________________|___________________________________________________________
+ +                  |            |                  |         |
+ + ff00000000000000 |  -64    PB | ff0fffffffffffff |    4 PB | ... guard hole, also reserved for hypervisor
+ + ff10000000000000 |  -60    PB | ff8fffffffffffff |   32 PB | direct mapping of all physical memory (page_offset_base)
+ + ff90000000000000 |  -28    PB | ff9fffffffffffff |    4 PB | LDT remap for PTI
+ + ffa0000000000000 |  -24    PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
+ + ffd2000000000000 |  -11.5  PB | ffd3ffffffffffff |  0.5 PB | ... unused hole
+ + ffd4000000000000 |  -11    PB | ffd5ffffffffffff |  0.5 PB | virtual memory map (vmemmap_base)
+ + ffd6000000000000 |  -10.5  PB | ffdeffffffffffff | 2.25 PB | ... unused hole
+ + ffdf000000000000 |   -8.25 PB | fffffbffffffffff |   ~8 PB | KASAN shadow memory
+ + fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
+ +                  |            |                  |         | vaddr_end for KASLR
+ + fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
+ + fffffe8000000000 |   -1.5  TB | fffffeffffffffff |  0.5 TB | ... unused hole
+ + ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
+ +__________________|____________|__________________|_________|____________________________________________________________
+ +                                                            |
+ +                                                            | Identical layout to the 47-bit one from here on:
+ +____________________________________________________________|____________________________________________________________
+ +                  |            |                  |         |
+ + ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
+ + ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
+ + ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | ... unused hole
+ + ffffffff80000000 |   -2    GB | ffffffff9fffffff |  512 MB | kernel text mapping, mapped to physical address 0
+ + ffffffff80000000 |-2048    MB |                  |         |
+ + ffffffffa0000000 |-1536    MB | fffffffffeffffff | 1520 MB | module mapping space
+ + ffffffffff000000 |  -16    MB |                  |         |
+ +    FIXADDR_START | ~-11    MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ + ffffffffff600000 |  -10    MB | ffffffffff600fff |    4 kB | legacy vsyscall ABI
+ + ffffffffffe00000 |   -2    MB | ffffffffffffffff |    2 MB | ... unused hole
+ +__________________|____________|__________________|_________|___________________________________________________________
   
   Architecture defines a 64-bit virtual address. Implementations can support
   less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
@@@ -146,3 -79,3 +146,6 @@@ Their order is preserved but their bas
   Be very careful vs. KASLR when changing anything here. The KASLR address
   range must not overlap with anything except the KASAN shadow area, which is
   correct as KASAN disables KASLR.
++
++For both 4- and 5-level layouts, the STACKLEAK_POISON value in the last 2MB
++hole: ffffffffffff4111
diff --combined arch/Kconfig

index ed27fd26262764fc44092d639030aa7b19f53ea8,ee79ff56faab9e8505bf6647f35ce868af754f0f..e1e540ffa9793d5279c68d9bca412e8a3ef115ae
--- 1/arch/Kconfig
--- 2/arch/Kconfig
+++ b/arch/Kconfig
@@@ -290,13 -290,6 +290,13 @@@ config HAVE_RSE
           This symbol should be selected by an architecture if it
           supports an implementation of restartable sequences.
   
+ +config HAVE_FUNCTION_ARG_ACCESS_API
+ +      bool
+ +      help
+ +        This symbol should be selected by an architecture if it supports
+ +        the API needed to access function arguments from pt_regs,
+ +        declared in asm/ptrace.h
+ +
   config HAVE_CLK
         bool
         help
@@@ -366,9 -359,6 +366,9 @@@ config HAVE_PERF_USER_STACK_DUM
   config HAVE_ARCH_JUMP_LABEL
         bool
   
+ +config HAVE_ARCH_JUMP_LABEL_RELATIVE
+ +      bool
+ +
   config HAVE_RCU_TABLE_FREE
         bool
   
@@@ -429,6 -419,13 +429,13 @@@ config SECCOMP_FILTE
   
           See Documentation/userspace-api/seccomp_filter.rst for details.
   
+ config HAVE_ARCH_STACKLEAK
+       bool
+       help
+         An architecture should select this if it has the code which
+         fills the used part of the kernel stack with the STACKLEAK_POISON
+         value before returning from system calls.
+ 
   config HAVE_STACKPROTECTOR
         bool
         help
diff --combined arch/arm64/kernel/process.c

index ce99c58cd1f1d2081355a7f4420072a31b43ca71,740b31f77adeeaa9c6183714b935b05577a57cbb..d9a4c2d6dd8b8b8031e6b552067690797eed6b6e
--- 1/arch/arm64/kernel/process.c
--- 2/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@@ -358,10 -358,6 +358,10 @@@ int copy_thread(unsigned long clone_fla
                 if (IS_ENABLED(CONFIG_ARM64_UAO) &&
                     cpus_have_const_cap(ARM64_HAS_UAO))
                         childregs->pstate |= PSR_UAO_BIT;
+ +
+ +              if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
+ +                      childregs->pstate |= PSR_SSBS_BIT;
+ +
                 p->thread.cpu_context.x19 = stack_start;
                 p->thread.cpu_context.x20 = stk_sz;
         }
@@@ -497,25 -493,3 +497,3 @@@ void arch_setup_new_exec(void
   {
         current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
   }
- 
- #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
- void __used stackleak_check_alloca(unsigned long size)
- {
-       unsigned long stack_left;
-       unsigned long current_sp = current_stack_pointer;
-       struct stack_info info;
- 
-       BUG_ON(!on_accessible_stack(current, current_sp, &info));
- 
-       stack_left = current_sp - info.low;
- 
-       /*
-        * There's a good chance we're almost out of stack space if this
-        * is true. Using panic() over BUG() is more likely to give
-        * reliable debugging output.
-        */
-       if (size >= stack_left)
-               panic("alloca() over the kernel stack boundary\n");
- }
- EXPORT_SYMBOL(stackleak_check_alloca);
- #endif
diff --combined arch/x86/Kconfig

index c51c989c19c08da99155d354cc11558c1cdb36d4,662cb2cc9630cb65ace570f92804ef1978c98ac0..ba7e3464ee9235fe43f0edd66034d670b2fc4ffd
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -48,7 -48,6 +48,7 @@@ config X8
         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
         select ANON_INODES
         select ARCH_CLOCKSOURCE_DATA
+ +      select ARCH_CLOCKSOURCE_INIT
         select ARCH_DISCARD_MEMBLOCK
         select ARCH_HAS_ACPI_TABLE_UPGRADE      if ACPI
         select ARCH_HAS_DEBUG_VIRTUAL
@@@ -120,7 -119,6 +120,7 @@@
         select HAVE_ARCH_AUDITSYSCALL
         select HAVE_ARCH_HUGE_VMAP              if X86_64 || X86_PAE
         select HAVE_ARCH_JUMP_LABEL
+ +      select HAVE_ARCH_JUMP_LABEL_RELATIVE
         select HAVE_ARCH_KASAN                  if X86_64
         select HAVE_ARCH_KGDB
         select HAVE_ARCH_MMAP_RND_BITS          if MMU
@@@ -129,6 -127,7 +129,7 @@@
         select HAVE_ARCH_PREL32_RELOCATIONS
         select HAVE_ARCH_SECCOMP_FILTER
         select HAVE_ARCH_THREAD_STRUCT_WHITELIST
+       select HAVE_ARCH_STACKLEAK
         select HAVE_ARCH_TRACEHOOK
         select HAVE_ARCH_TRANSPARENT_HUGEPAGE
         select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
@@@ -169,6 -168,7 +170,6 @@@
         select HAVE_KRETPROBES
         select HAVE_KVM
         select HAVE_LIVEPATCH                   if X86_64
- -      select HAVE_MEMBLOCK
         select HAVE_MEMBLOCK_NODE_MAP
         select HAVE_MIXED_BREAKPOINTS_REGS
         select HAVE_MOD_ARCH_SPECIFIC
@@@ -185,7 -185,6 +186,7 @@@
         select HAVE_RCU_TABLE_INVALIDATE        if HAVE_RCU_TABLE_FREE
         select HAVE_REGS_AND_STACK_ACCESS_API
         select HAVE_RELIABLE_STACKTRACE         if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
+ +      select HAVE_FUNCTION_ARG_ACCESS_API
         select HAVE_STACKPROTECTOR              if CC_HAS_SANE_STACKPROTECTOR
         select HAVE_STACK_VALIDATION            if X86_64
         select HAVE_RSEQ
@@@ -449,6 -448,7 +450,6 @@@ config RETPOLIN
   
   config INTEL_RDT
         bool "Intel Resource Director Technology support"
- -      default n
         depends on X86 && CPU_SUP_INTEL
         select KERNFS
         help
@@@ -524,7 -524,6 +525,7 @@@ config X86_VSM
         bool "ScaleMP vSMP"
         select HYPERVISOR_GUEST
         select PARAVIRT
+ +      select PARAVIRT_XXL
         depends on X86_64 && PCI
         depends on X86_EXTENDED_PLATFORM
         depends on SMP
@@@ -703,6 -702,7 +704,6 @@@ config STA2X1
         select SWIOTLB
         select MFD_STA2X11
         select GPIOLIB
- -      default n
         ---help---
           This adds support for boards based on the STA2X11 IO-Hub,
           a.k.a. "ConneXt". The chip is used in place of the standard
@@@ -755,9 -755,6 +756,9 @@@ config PARAVIR
           over full virtualization.  However, when run without a hypervisor
           the kernel is theoretically slower and slightly larger.
   
+ +config PARAVIRT_XXL
+ +      bool
+ +
   config PARAVIRT_DEBUG
         bool "paravirt-ops debugging"
         depends on PARAVIRT && DEBUG_KERNEL
@@@ -803,6 -800,7 +804,6 @@@ config KVM_GUES
   config KVM_DEBUG_FS
         bool "Enable debug information for KVM Guests in debugfs"
         depends on KVM_GUEST && DEBUG_FS
- -      default n
         ---help---
           This option enables collection of various statistics for KVM guest.
           Statistics are displayed in debugfs filesystem. Enabling this option
@@@ -811,6 -809,7 +812,6 @@@
   config PARAVIRT_TIME_ACCOUNTING
         bool "Paravirtual steal time accounting"
         depends on PARAVIRT
- -      default n
         ---help---
           Select this option to enable fine granularity task steal time
           accounting. Time spent executing other tasks in parallel with
@@@ -833,6 -832,9 +834,6 @@@ config JAILHOUSE_GUES
   
   endif #HYPERVISOR_GUEST
   
- -config NO_BOOTMEM
- -      def_bool y
- -
   source "arch/x86/Kconfig.cpu"
   
   config HPET_TIMER
@@@ -1167,6 -1169,7 +1168,6 @@@ source "arch/x86/events/Kconfig
   
   config X86_LEGACY_VM86
         bool "Legacy VM86 support"
- -      default n
         depends on X86_32
         ---help---
           This option allows user programs to put the CPU into V8086
@@@ -1489,14 -1492,6 +1490,14 @@@ config X86_DIRECT_GBPAGE
           supports them), so don't confuse the user by printing
           that we have them enabled.
   
+ +config X86_CPA_STATISTICS
+ +      bool "Enable statistic for Change Page Attribute"
+ +      depends on DEBUG_FS
+ +      ---help---
+ +        Expose statistics about the Change Page Attribute mechanism, which
+ +        helps to determine the effectiveness of preserving large and huge
+ +        page mappings when mapping protections are changed.
+ +
   config ARCH_HAS_MEM_ENCRYPT
         def_bool y
   
@@@ -2226,6 -2221,7 +2227,6 @@@ config HOTPLUG_CP
   
   config BOOTPARAM_HOTPLUG_CPU0
         bool "Set default setting of cpu0_hotpluggable"
- -      default n
         depends on HOTPLUG_CPU
         ---help---
           Set whether default state of cpu0_hotpluggable is on or off.
@@@ -2427,7 -2423,7 +2428,7 @@@ menu "Power management and ACPI options
   
   config ARCH_HIBERNATION_HEADER
         def_bool y
- -      depends on X86_64 && HIBERNATION
+ +      depends on HIBERNATION
   
   source "kernel/power/Kconfig"
   
@@@ -2747,7 -2743,8 +2748,7 @@@ config OLP
   
   config OLPC_XO1_PM
         bool "OLPC XO-1 Power Management"
- -      depends on OLPC && MFD_CS5535 && PM_SLEEP
- -      select MFD_CORE
+ +      depends on OLPC && MFD_CS5535=y && PM_SLEEP
         ---help---
           Add support for poweroff and suspend of the OLPC XO-1 laptop.
   
@@@ -2829,6 -2826,7 +2830,6 @@@ source "drivers/pcmcia/Kconfig
   config RAPIDIO
         tristate "RapidIO support"
         depends on PCI
- -      default n
         help
           If enabled this option will include drivers and the core
           infrastructure code to support RapidIO interconnect devices.
diff --combined arch/x86/entry/calling.h

index 708b46a54578d8722fc1c9fa07e58d74f7ff49d8,20d0885b00fbec4c77dfee23c701ba0c3612890b..25e5a6bda8c3a971609dff93919ccab27d6a3aa9
--- 1/arch/x86/entry/calling.h
--- 2/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@@ -329,8 -329,22 +329,22 @@@ For 32-bit we have the following conven
   
   #endif
   
+ .macro STACKLEAK_ERASE_NOCLOBBER
+ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+       PUSH_AND_CLEAR_REGS
+       call stackleak_erase
+       POP_REGS
+ #endif
+ .endm
+ 
   #endif /* CONFIG_X86_64 */
   
+ .macro STACKLEAK_ERASE
+ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+       call stackleak_erase
+ #endif
+ .endm
+ 
   /*
    * This does 'call enter_from_user_mode' unless we can avoid it based on
    * kernel config or using the static jump infrastructure.
@@@ -338,7 -352,7 +352,7 @@@
   .macro CALL_enter_from_user_mode
   #ifdef CONFIG_CONTEXT_TRACKING
   #ifdef HAVE_JUMP_LABEL
- -      STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+ +      STATIC_BRANCH_JMP l_yes=.Lafter_call_\@, key=context_tracking_enabled, branch=1
   #endif
         call enter_from_user_mode
   .Lafter_call_\@:
diff --combined arch/x86/entry/entry_32.S

index 687e47f8a796621d4effcac9a055965969a81dc2,dfb975b4c981f85724e304c8af6559490fbaac76..d309f30cf7af84e67ac38910eff4256da9c25a11
--- 1/arch/x86/entry/entry_32.S
--- 2/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@@ -46,6 -46,8 +46,8 @@@
   #include <asm/frame.h>
   #include <asm/nospec-branch.h>
   
+ #include "calling.h"
+ 
         .section .entry.text, "ax"
   
   /*
@@@ -389,13 -391,6 +391,13 @@@
          * that register for the time this macro runs
          */
   
+ +      /*
+ +       * The high bits of the CS dword (__csh) are used for
+ +       * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
+ +       * hardware didn't do this for us.
+ +       */
+ +      andl    $(0x0000ffff), PT_CS(%esp)
+ +
         /* Are we on the entry stack? Bail out if not! */
         movl    PER_CPU_VAR(cpu_entry_area), %ecx
         addl    $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
@@@ -414,6 -409,12 +416,6 @@@
         /* Load top of task-stack into %edi */
         movl    TSS_entry2task_stack(%edi), %edi
   
- -      /*
- -       * Clear unused upper bits of the dword containing the word-sized CS
- -       * slot in pt_regs in case hardware didn't clear it for us.
- -       */
- -      andl    $(0x0000ffff), PT_CS(%esp)
- -
         /* Special case - entry from kernel mode via entry stack */
   #ifdef CONFIG_VM86
         movl    PT_EFLAGS(%esp), %ecx           # mix EFLAGS and CS
@@@ -712,6 -713,7 +714,7 @@@ ENTRY(ret_from_fork
         /* When we fork, we trace the syscall return in the child, too. */
         movl    %esp, %eax
         call    syscall_return_slowpath
+       STACKLEAK_ERASE
         jmp     restore_all
   
         /* kernel thread */
@@@ -783,7 -785,7 +786,7 @@@ GLOBAL(__begin_SYSENTER_singlestep_regi
    * will ignore all of the single-step traps generated in this range.
    */
   
- -#ifdef CONFIG_XEN
+ +#ifdef CONFIG_XEN_PV
   /*
    * Xen doesn't set %esp to be precisely what the normal SYSENTER
    * entry point expects, so fix it up before using the normal path.
@@@ -886,6 -888,8 +889,8 @@@ ENTRY(entry_SYSENTER_32
         ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
                     "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
   
+       STACKLEAK_ERASE
+ 
   /* Opportunistic SYSEXIT */
         TRACE_IRQS_ON                   /* User mode traces as IRQs on. */
   
@@@ -997,6 -1001,8 +1002,8 @@@ ENTRY(entry_INT80_32
         call    do_int80_syscall_32
   .Lsyscall_32_done:
   
+       STACKLEAK_ERASE
+ 
   restore_all:
         TRACE_IRQS_IRET
         SWITCH_TO_ENTRY_STACK
@@@ -1241,7 -1247,7 +1248,7 @@@ ENTRY(spurious_interrupt_bug
         jmp     common_exception
   END(spurious_interrupt_bug)
   
- -#ifdef CONFIG_XEN
+ +#ifdef CONFIG_XEN_PV
   ENTRY(xen_hypervisor_callback)
         pushl   $-1                             /* orig_ax = -1 => not a system call */
         SAVE_ALL
@@@ -1322,13 -1328,11 +1329,13 @@@ ENTRY(xen_failsafe_callback
         _ASM_EXTABLE(3b, 8b)
         _ASM_EXTABLE(4b, 9b)
   ENDPROC(xen_failsafe_callback)
+ +#endif /* CONFIG_XEN_PV */
   
+ +#ifdef CONFIG_XEN_PVHVM
   BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
                  xen_evtchn_do_upcall)
+ +#endif
   
- -#endif /* CONFIG_XEN */
   
   #if IS_ENABLED(CONFIG_HYPERV)
   
diff --combined arch/x86/entry/entry_64.S

index 4d7a2d9d44cfec5928b902cef1bca9bca29093a6,a5dd2809302022385888847d6b7efda634bd4582..ce25d84023c021ce25f041cd81497500f20c3a60
--- 1/arch/x86/entry/entry_64.S
--- 2/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@@ -142,6 -142,67 +142,6 @@@ END(native_usergs_sysret64
    * with them due to bugs in both AMD and Intel CPUs.
    */
   
- -      .pushsection .entry_trampoline, "ax"
- -
- -/*
- - * The code in here gets remapped into cpu_entry_area's trampoline.  This means
- - * that the assembler and linker have the wrong idea as to where this code
- - * lives (and, in fact, it's mapped more than once, so it's not even at a
- - * fixed address).  So we can't reference any symbols outside the entry
- - * trampoline and expect it to work.
- - *
- - * Instead, we carefully abuse %rip-relative addressing.
- - * _entry_trampoline(%rip) refers to the start of the remapped) entry
- - * trampoline.  We can thus find cpu_entry_area with this macro:
- - */
- -
- -#define CPU_ENTRY_AREA \
- -      _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
- -
- -/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
- -#define RSP_SCRATCH   CPU_ENTRY_AREA_entry_stack + \
- -                      SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
- -
- -ENTRY(entry_SYSCALL_64_trampoline)
- -      UNWIND_HINT_EMPTY
- -      swapgs
- -
- -      /* Stash the user RSP. */
- -      movq    %rsp, RSP_SCRATCH
- -
- -      /* Note: using %rsp as a scratch reg. */
- -      SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
- -
- -      /* Load the top of the task stack into RSP */
- -      movq    CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
- -
- -      /* Start building the simulated IRET frame. */
- -      pushq   $__USER_DS                      /* pt_regs->ss */
- -      pushq   RSP_SCRATCH                     /* pt_regs->sp */
- -      pushq   %r11                            /* pt_regs->flags */
- -      pushq   $__USER_CS                      /* pt_regs->cs */
- -      pushq   %rcx                            /* pt_regs->ip */
- -
- -      /*
- -       * x86 lacks a near absolute jump, and we can't jump to the real
- -       * entry text with a relative jump.  We could push the target
- -       * address and then use retq, but this destroys the pipeline on
- -       * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
- -       * spill RDI and restore it in a second-stage trampoline.
- -       */
- -      pushq   %rdi
- -      movq    $entry_SYSCALL_64_stage2, %rdi
- -      JMP_NOSPEC %rdi
- -END(entry_SYSCALL_64_trampoline)
- -
- -      .popsection
- -
- -ENTRY(entry_SYSCALL_64_stage2)
- -      UNWIND_HINT_EMPTY
- -      popq    %rdi
- -      jmp     entry_SYSCALL_64_after_hwframe
- -END(entry_SYSCALL_64_stage2)
- -
   ENTRY(entry_SYSCALL_64)
         UNWIND_HINT_EMPTY
         /*
@@@ -151,19 -212,21 +151,19 @@@
          */
   
         swapgs
- -      /*
- -       * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
- -       * is not required to switch CR3.
- -       */
- -      movq    %rsp, PER_CPU_VAR(rsp_scratch)
+ +      /* tss.sp2 is scratch space. */
+ +      movq    %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ +      SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
   
         /* Construct struct pt_regs on stack */
- -      pushq   $__USER_DS                      /* pt_regs->ss */
- -      pushq   PER_CPU_VAR(rsp_scratch)        /* pt_regs->sp */
- -      pushq   %r11                            /* pt_regs->flags */
- -      pushq   $__USER_CS                      /* pt_regs->cs */
- -      pushq   %rcx                            /* pt_regs->ip */
+ +      pushq   $__USER_DS                              /* pt_regs->ss */
+ +      pushq   PER_CPU_VAR(cpu_tss_rw + TSS_sp2)       /* pt_regs->sp */
+ +      pushq   %r11                                    /* pt_regs->flags */
+ +      pushq   $__USER_CS                              /* pt_regs->cs */
+ +      pushq   %rcx                                    /* pt_regs->ip */
   GLOBAL(entry_SYSCALL_64_after_hwframe)
- -      pushq   %rax                            /* pt_regs->orig_ax */
+ +      pushq   %rax                                    /* pt_regs->orig_ax */
   
         PUSH_AND_CLEAR_REGS rax=$-ENOSYS
   
@@@ -266,6 -329,8 +266,8 @@@ syscall_return_via_sysret
          * We are on the trampoline stack.  All regs except RDI are live.
          * We can do future final exit work right here.
          */
+       STACKLEAK_ERASE_NOCLOBBER
+ 
         SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
   
         popq    %rdi
@@@ -625,6 -690,7 +627,7 @@@ GLOBAL(swapgs_restore_regs_and_return_t
          * We are on the trampoline stack.  All regs except RDI are live.
          * We can do future final exit work right here.
          */
+       STACKLEAK_ERASE_NOCLOBBER
   
         SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
   
@@@ -837,42 -903,6 +840,42 @@@ apicinterrupt IRQ_WORK_VECTOR                    irq_wor
    */
   #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
   
+ +/**
+ + * idtentry - Generate an IDT entry stub
+ + * @sym:              Name of the generated entry point
+ + * @do_sym:           C function to be called
+ + * @has_error_code:   True if this IDT vector has an error code on the stack
+ + * @paranoid:         non-zero means that this vector may be invoked from
+ + *                    kernel mode with user GSBASE and/or user CR3.
+ + *                    2 is special -- see below.
+ + * @shift_ist:        Set to an IST index if entries from kernel mode should
+ + *                    decrement the IST stack so that nested entries get a
+ + *                    fresh stack.  (This is for #DB, which has a nasty habit
+ + *                    of recursing.)
+ + *
+ + * idtentry generates an IDT stub that sets up a usable kernel context,
+ + * creates struct pt_regs, and calls @do_sym.  The stub has the following
+ + * special behaviors:
+ + *
+ + * On an entry from user mode, the stub switches from the trampoline or
+ + * IST stack to the normal thread stack.  On an exit to user mode, the
+ + * normal exit-to-usermode path is invoked.
+ + *
+ + * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
+ + * whereas we omit the preemption check if @paranoid != 0.  This is purely
+ + * because the implementation is simpler this way.  The kernel only needs
+ + * to check for asynchronous kernel preemption when IRQ handlers return.
+ + *
+ + * If @paranoid == 0, then the stub will handle IRET faults by pretending
+ + * that the fault came from user mode.  It will handle gs_change faults by
+ + * pretending that the fault happened with kernel GSBASE.  Since this handling
+ + * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
+ + * @paranoid == 0.  This special handling will do the wrong thing for
+ + * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+ + *
+ + * @paranoid == 2 is special: the stub will never switch stacks.  This is for
+ + * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+ + */
   .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
   ENTRY(\sym)
         UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@@ -1023,7 -1053,7 +1026,7 @@@ ENTRY(do_softirq_own_stack
         ret
   ENDPROC(do_softirq_own_stack)
   
- -#ifdef CONFIG_XEN
+ +#ifdef CONFIG_XEN_PV
   idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
   
   /*
@@@ -1103,13 -1133,11 +1106,13 @@@ ENTRY(xen_failsafe_callback
         ENCODE_FRAME_POINTER
         jmp     error_exit
   END(xen_failsafe_callback)
+ +#endif /* CONFIG_XEN_PV */
   
+ +#ifdef CONFIG_XEN_PVHVM
   apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
         xen_hvm_callback_vector xen_evtchn_do_upcall
+ +#endif
   
- -#endif /* CONFIG_XEN */
   
   #if IS_ENABLED(CONFIG_HYPERV)
   apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@@ -1126,7 -1154,7 +1129,7 @@@ idtentry debug                  do_debug                has_error_co
   idtentry int3                 do_int3                 has_error_code=0
   idtentry stack_segment                do_stack_segment        has_error_code=1
   
- -#ifdef CONFIG_XEN
+ +#ifdef CONFIG_XEN_PV
   idtentry xennmi                       do_nmi                  has_error_code=0
   idtentry xendebug             do_debug                has_error_code=0
   idtentry xenint3              do_int3                 has_error_code=0
@@@ -1162,16 -1190,6 +1165,16 @@@ ENTRY(paranoid_entry
         xorl    %ebx, %ebx
   
   1:
+ +      /*
+ +       * Always stash CR3 in %r14.  This value will be restored,
+ +       * verbatim, at exit.  Needed if paranoid_entry interrupted
+ +       * another entry that already switched to the user CR3 value
+ +       * but has not yet returned to userspace.
+ +       *
+ +       * This is also why CS (stashed in the "iret frame" by the
+ +       * hardware at entry) can not be used: this may be a return
+ +       * to kernel code, but with a user CR3 value.
+ +       */
         SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
   
         ret
@@@ -1196,13 -1214,11 +1199,13 @@@ ENTRY(paranoid_exit
         testl   %ebx, %ebx                      /* swapgs needed? */
         jnz     .Lparanoid_exit_no_swapgs
         TRACE_IRQS_IRETQ
+ +      /* Always restore stashed CR3 value (see paranoid_entry) */
         RESTORE_CR3     scratch_reg=%rbx save_reg=%r14
         SWAPGS_UNSAFE_STACK
         jmp     .Lparanoid_exit_restore
   .Lparanoid_exit_no_swapgs:
         TRACE_IRQS_IRETQ_DEBUG
+ +      /* Always restore stashed CR3 value (see paranoid_entry) */
         RESTORE_CR3     scratch_reg=%rbx save_reg=%r14
   .Lparanoid_exit_restore:
         jmp restore_regs_and_return_to_kernel
@@@ -1613,7 -1629,6 +1616,7 @@@ end_repeat_nmi
         movq    $-1, %rsi
         call    do_nmi
   
+ +      /* Always restore stashed CR3 value (see paranoid_entry) */
         RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
   
         testl   %ebx, %ebx                      /* swapgs needed? */
diff --combined drivers/misc/lkdtm/core.c

index 5a755590d3dcefe85b0354c88bacc9dffd2e392d,aca26d81e9b8d3345eb1543406817a31c5f1668b..2837dc77478ed43e9a8561c850c92879f010b07d
--- 1/drivers/misc/lkdtm/core.c
--- 2/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@@ -183,7 -183,7 +183,8 @@@ static const struct crashtype crashtype
         CRASHTYPE(USERCOPY_STACK_FRAME_FROM),
         CRASHTYPE(USERCOPY_STACK_BEYOND),
         CRASHTYPE(USERCOPY_KERNEL),
+ +      CRASHTYPE(USERCOPY_KERNEL_DS),
+       CRASHTYPE(STACKLEAK_ERASING),
   };
   
   
diff --combined drivers/misc/lkdtm/lkdtm.h

index 07db641d71d023bd2eb710873114261170cce079,b611b157c84fae3bb659ed290d1ea35d2f53cf6f..3c6fd327e166a4c83dfef10f8d83209cd4f6f892
--- 1/drivers/misc/lkdtm/lkdtm.h
--- 2/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@@ -82,6 -82,8 +82,9 @@@ void lkdtm_USERCOPY_STACK_FRAME_TO(void
   void lkdtm_USERCOPY_STACK_FRAME_FROM(void);
   void lkdtm_USERCOPY_STACK_BEYOND(void);
   void lkdtm_USERCOPY_KERNEL(void);
+ +void lkdtm_USERCOPY_KERNEL_DS(void);
   
+ /* lkdtm_stackleak.c */
+ void lkdtm_STACKLEAK_ERASING(void);
+ 
   #endif
diff --combined fs/proc/base.c

index 7e9f07bf260d20bb0a0cd4cd6b6b4abe82b23e20,2a238d68610ef486301c0b9ca0795106f39e2a98..ce34654794472d0a7b8c2574340c18cc7d594f7a
--- 1/fs/proc/base.c
--- 2/fs/proc/base.c
+++ b/fs/proc/base.c
@@@ -407,20 -407,6 +407,20 @@@ static int proc_pid_stack(struct seq_fi
         unsigned long *entries;
         int err;
   
+ +      /*
+ +       * The ability to racily run the kernel stack unwinder on a running task
+ +       * and then observe the unwinder output is scary; while it is useful for
+ +       * debugging kernel issues, it can also allow an attacker to leak kernel
+ +       * stack contents.
+ +       * Doing this in a manner that is at least safe from races would require
+ +       * some work to ensure that the remote task can not be scheduled; and
+ +       * even then, this would still expose the unwinder as local attack
+ +       * surface.
+ +       * Therefore, this interface is restricted to root.
+ +       */
+ +      if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ +              return -EACCES;
+ +
         entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
                                 GFP_KERNEL);
         if (!entries)
@@@ -2905,6 -2891,21 +2905,21 @@@ static int proc_pid_patch_state(struct 
   }
   #endif /* CONFIG_LIVEPATCH */
   
+ #ifdef CONFIG_STACKLEAK_METRICS
+ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
+                               struct pid *pid, struct task_struct *task)
+ {
+       unsigned long prev_depth = THREAD_SIZE -
+                               (task->prev_lowest_stack & (THREAD_SIZE - 1));
+       unsigned long depth = THREAD_SIZE -
+                               (task->lowest_stack & (THREAD_SIZE - 1));
+ 
+       seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
+                                                       prev_depth, depth);
+       return 0;
+ }
+ #endif /* CONFIG_STACKLEAK_METRICS */
+ 
   /*
    * Thread groups
    */
@@@ -3006,6 -3007,9 +3021,9 @@@ static const struct pid_entry tgid_base
   #ifdef CONFIG_LIVEPATCH
         ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
   #endif
+ #ifdef CONFIG_STACKLEAK_METRICS
+       ONE("stack_depth", S_IRUGO, proc_stack_depth),
+ #endif
   };
   
   static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --combined include/linux/sched.h

index 8f8a5418b627a2db2377add2da367796e572cd2d,ae9d10e14b82a2a35c978dce7aeed315d3b3804e..a51c13c2b1a0316b00f928bc9e5afaa2ec949d83
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -25,7 -25,6 +25,7 @@@
   #include <linux/latencytop.h>
   #include <linux/sched/prio.h>
   #include <linux/signal_types.h>
+ +#include <linux/psi_types.h>
   #include <linux/mm_types_task.h>
   #include <linux/task_io_accounting.h>
   #include <linux/rseq.h>
@@@ -572,8 -571,12 +572,8 @@@ union rcu_special 
         struct {
                 u8                      blocked;
                 u8                      need_qs;
- -              u8                      exp_need_qs;
- -
- -              /* Otherwise the compiler can store garbage here: */
- -              u8                      pad;
         } b; /* Bits. */
- -      u32 s; /* Set of bits. */
+ +      u16 s; /* Set of bits. */
   };
   
   enum perf_event_task_context {
@@@ -707,10 -710,6 +707,10 @@@ struct task_struct 
         unsigned                        sched_contributes_to_load:1;
         unsigned                        sched_migrated:1;
         unsigned                        sched_remote_wakeup:1;
+ +#ifdef CONFIG_PSI
+ +      unsigned                        sched_psi_wake_requeue:1;
+ +#endif
+ +
         /* Force alignment to the next boundary: */
         unsigned                        :0;
   
@@@ -724,6 -723,9 +724,6 @@@
   #endif
   #ifdef CONFIG_MEMCG
         unsigned                        in_user_fault:1;
- -#ifdef CONFIG_MEMCG_KMEM
- -      unsigned                        memcg_kmem_skip_account:1;
- -#endif
   #endif
   #ifdef CONFIG_COMPAT_BRK
         unsigned                        brk_randomized:1;
@@@ -737,12 -739,6 +737,12 @@@
         unsigned                        use_memdelay:1;
   #endif
   
+ +      /*
+ +       * May usercopy functions fault on kernel addresses?
+ +       * This is not just a single bit because this can potentially nest.
+ +       */
+ +      unsigned int                    kernel_uaccess_faults_ok;
+ +
         unsigned long                   atomic_flags; /* Flags requiring atomic access. */
   
         struct restart_block            restart_block;
@@@ -964,13 -960,9 +964,13 @@@
   
         /* Ptrace state: */
         unsigned long                   ptrace_message;
- -      siginfo_t                       *last_siginfo;
+ +      kernel_siginfo_t                *last_siginfo;
   
         struct task_io_accounting       ioac;
+ +#ifdef CONFIG_PSI
+ +      /* Pressure stall state */
+ +      unsigned int                    psi_flags;
+ +#endif
   #ifdef CONFIG_TASK_XACCT
         /* Accumulated RSS usage: */
         u64                             acct_rss_mem1;
@@@ -1200,6 -1192,11 +1200,11 @@@
         void                            *security;
   #endif
   
+ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+       unsigned long                   lowest_stack;
+       unsigned long                   prev_lowest_stack;
+ #endif
+ 
         /*
          * New fields for task_struct should be added above here, so that
          * they are included in the randomized portion of task_struct.
@@@ -1397,7 -1394,6 +1402,7 @@@ extern struct pid *cad_pid
   #define PF_KTHREAD            0x00200000      /* I am a kernel thread */
   #define PF_RANDOMIZE          0x00400000      /* Randomize virtual address space */
   #define PF_SWAPWRITE          0x00800000      /* Allowed to write to swap */
+ +#define PF_MEMSTALL           0x01000000      /* Stalled due to lack of memory */
   #define PF_NO_SETAFFINITY     0x04000000      /* Userland is not allowed to meddle with cpus_allowed */
   #define PF_MCE_EARLY          0x08000000      /* Early kill for mce process policy */
   #define PF_MUTEX_TESTER               0x20000000      /* Thread belongs to the rt mutex tester */
diff --combined kernel/fork.c

index 8f82a3bdcb8feff10a8ce4c8d608a406890b6673,47911e49c2b1df9ec10ed7edbb8b241a6522fe91..07cddff89c7b6bac3658c8cb41dd32dc64a3cfa4
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -91,6 -91,7 +91,7 @@@
   #include <linux/kcov.h>
   #include <linux/livepatch.h>
   #include <linux/thread_info.h>
+ #include <linux/stackleak.h>
   
   #include <asm/pgtable.h>
   #include <asm/pgalloc.h>
@@@ -223,14 -224,9 +224,14 @@@ static unsigned long *alloc_thread_stac
                 return s->addr;
         }
   
+ +      /*
+ +       * Allocated stacks are cached and later reused by new threads,
+ +       * so memcg accounting is performed manually on assigning/releasing
+ +       * stacks to tasks. Drop __GFP_ACCOUNT.
+ +       */
         stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
                                      VMALLOC_START, VMALLOC_END,
- -                                   THREADINFO_GFP,
+ +                                   THREADINFO_GFP & ~__GFP_ACCOUNT,
                                      PAGE_KERNEL,
                                      0, node, __builtin_return_address(0));
   
@@@ -253,19 -249,9 +254,19 @@@
   static inline void free_thread_stack(struct task_struct *tsk)
   {
   #ifdef CONFIG_VMAP_STACK
- -      if (task_stack_vm_area(tsk)) {
+ +      struct vm_struct *vm = task_stack_vm_area(tsk);
+ +
+ +      if (vm) {
                 int i;
   
+ +              for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ +                      mod_memcg_page_state(vm->pages[i],
+ +                                           MEMCG_KERNEL_STACK_KB,
+ +                                           -(int)(PAGE_SIZE / 1024));
+ +
+ +                      memcg_kmem_uncharge(vm->pages[i], 0);
+ +              }
+ +
                 for (i = 0; i < NR_CACHED_STACKS; i++) {
                         if (this_cpu_cmpxchg(cached_stacks[i],
                                         NULL, tsk->stack_vm_area) != NULL)
@@@ -366,6 -352,10 +367,6 @@@ static void account_kernel_stack(struc
                                             NR_KERNEL_STACK_KB,
                                             PAGE_SIZE / 1024 * account);
                 }
- -
- -              /* All stack pages belong to the same memcg. */
- -              mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
- -                                   account * (THREAD_SIZE / 1024));
         } else {
                 /*
                  * All stack pages are in the same zone and belong to the
@@@ -381,35 -371,6 +382,35 @@@
         }
   }
   
+ +static int memcg_charge_kernel_stack(struct task_struct *tsk)
+ +{
+ +#ifdef CONFIG_VMAP_STACK
+ +      struct vm_struct *vm = task_stack_vm_area(tsk);
+ +      int ret;
+ +
+ +      if (vm) {
+ +              int i;
+ +
+ +              for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ +                      /*
+ +                       * If memcg_kmem_charge() fails, page->mem_cgroup
+ +                       * pointer is NULL, and both memcg_kmem_uncharge()
+ +                       * and mod_memcg_page_state() in free_thread_stack()
+ +                       * will ignore this page. So it's safe.
+ +                       */
+ +                      ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0);
+ +                      if (ret)
+ +                              return ret;
+ +
+ +                      mod_memcg_page_state(vm->pages[i],
+ +                                           MEMCG_KERNEL_STACK_KB,
+ +                                           PAGE_SIZE / 1024);
+ +              }
+ +      }
+ +#endif
+ +      return 0;
+ +}
+ +
   static void release_task_stack(struct task_struct *tsk)
   {
         if (WARN_ON(tsk->state != TASK_DEAD))
@@@ -590,7 -551,8 +591,7 @@@ static __latent_entropy int dup_mmap(st
                         goto out;
         }
         /* a new mm has just been created */
- -      arch_dup_mmap(oldmm, mm);
- -      retval = 0;
+ +      retval = arch_dup_mmap(oldmm, mm);
   out:
         up_write(&mm->mmap_sem);
         flush_tlb_mm(oldmm);
@@@ -847,9 -809,6 +848,9 @@@ static struct task_struct *dup_task_str
         if (!stack)
                 goto free_tsk;
   
+ +      if (memcg_charge_kernel_stack(tsk))
+ +              goto free_stack;
+ +
         stack_vm_area = task_stack_vm_area(tsk);
   
         err = arch_dup_task_struct(tsk, orig);
@@@ -1822,10 -1781,6 +1823,10 @@@ static __latent_entropy struct task_str
   
         p->default_timer_slack_ns = current->timer_slack_ns;
   
+ +#ifdef CONFIG_PSI
+ +      p->psi_flags = 0;
+ +#endif
+ +
         task_io_accounting_init(&p->ioac);
         acct_clear_integrals(p);
   
@@@ -1926,6 -1881,8 +1927,8 @@@
         if (retval)
                 goto bad_fork_cleanup_io;
   
+       stackleak_task_init(p);
+ 
         if (pid != &init_struct_pid) {
                 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
                 if (IS_ERR(pid)) {
author	Linus Torvalds <[email protected]>
	Thu, 1 Nov 2018 18:46:27 +0000 (11:46 -0700)
committer	Linus Torvalds <[email protected]>
	Thu, 1 Nov 2018 18:46:27 +0000 (11:46 -0700)
		1	2
Documentation/x86/x86_64/mm.txt	patch |	diff1 |	diff2 |	blob | history
arch/Kconfig	patch |	diff1 |	diff2 |	blob | history
arch/arm64/kernel/process.c	patch |	diff1 |	diff2 |	blob | history
arch/x86/Kconfig	patch |	diff1 |	diff2 |	blob | history
arch/x86/entry/calling.h	patch |	diff1 |	diff2 |	blob | history
arch/x86/entry/entry_32.S	patch |	diff1 |	diff2 |	blob | history
arch/x86/entry/entry_64.S	patch |	diff1 |	diff2 |	blob | history
drivers/misc/lkdtm/core.c	patch |	diff1 |	diff2 |	blob | history
drivers/misc/lkdtm/lkdtm.h	patch |	diff1 |	diff2 |	blob | history
fs/proc/base.c	patch |	diff1 |	diff2 |	blob | history
include/linux/sched.h	patch |	diff1 |	diff2 |	blob | history
kernel/fork.c	patch |	diff1 |	diff2 |	blob | history