Git Repo - J-linux.git/commitdiff
Merge branch 'x86/bugs' into x86/core, to pick up pending changes before dependent...
author Ingo Molnar <[email protected]>
Wed, 14 Feb 2024 09:48:28 +0000 (10:48 +0100)
committer Ingo Molnar <[email protected]>
Wed, 14 Feb 2024 09:49:37 +0000 (10:49 +0100)
Merge in pending alternatives patching infrastructure changes, before
applying more patches.

Signed-off-by: Ingo Molnar <[email protected]>
27 files changed:
Documentation/admin-guide/kernel-parameters.txt
arch/x86/Kconfig
arch/x86/Makefile
arch/x86/entry/calling.h
arch/x86/entry/entry_64.S
arch/x86/include/asm/current.h
arch/x86/include/asm/nospec-branch.h
arch/x86/kernel/alternative.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/head_64.S
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/vmenter.S
arch/x86/kvm/vmx/vmx.c
arch/x86/net/bpf_jit_comp.c
include/linux/compiler-gcc.h
include/linux/indirect_call_wrapper.h
include/linux/module.h
include/net/tc_wrapper.h
kernel/trace/ring_buffer.c
net/netfilter/nft_ct.c
net/sched/sch_api.c
scripts/Makefile.lib
scripts/generate_rust_target.rs
scripts/mod/modpost.c

index 31b3a25680d08cfac3603d58b3d3783bbf1e34bb,d93f403777f232974178fdd18cf4d2c4e309bd85..f42618b958255155f403256b5bf24bb9c310bd30
@@@ -1,14 -1,3 +1,14 @@@
 +      accept_memory=  [MM]
 +                      Format: { eager | lazy }
 +                      default: lazy
 +                      By default, unaccepted memory is accepted lazily to
 +                      avoid prolonged boot times. The lazy option will add
 +                      some runtime overhead until all memory is eventually
 +                      accepted. In most cases the overhead is negligible.
 +                      For some workloads or for debugging purposes
 +                      accept_memory=eager can be used to accept all memory
 +                      at once during boot.
 +
        acpi=           [HW,ACPI,X86,ARM64,RISCV64]
                        Advanced Configuration and Power Interface
                        Format: { force | on | off | strict | noirq | rsdt |
                        memory region [offset, offset + size] for that kernel
                        image. If '@offset' is omitted, then a suitable offset
                        is selected automatically.
 -                      [KNL, X86-64, ARM64, RISCV] Select a region under 4G first, and
 -                      fall back to reserve region above 4G when '@offset'
 -                      hasn't been specified.
 +                      [KNL, X86-64, ARM64, RISCV, LoongArch] Select a region
 +                      under 4G first, and fall back to reserve region above
 +                      4G when '@offset' hasn't been specified.
                        See Documentation/admin-guide/kdump/kdump.rst for further details.
  
        crashkernel=range1:size1[,range2:size2,...][@offset]
                        Documentation/admin-guide/kdump/kdump.rst for an example.
  
        crashkernel=size[KMG],high
 -                      [KNL, X86-64, ARM64, RISCV] range could be above 4G.
 +                      [KNL, X86-64, ARM64, RISCV, LoongArch] range could be
 +                      above 4G.
                        Allow kernel to allocate physical memory region from top,
                        so could be above 4G if system have more than 4G ram
                        installed. Otherwise memory region will be allocated
                        below 4G, if available.
                        It will be ignored if crashkernel=X is specified.
        crashkernel=size[KMG],low
 -                      [KNL, X86-64, ARM64, RISCV] range under 4G. When crashkernel=X,high
 -                      is passed, kernel could allocate physical memory region
 -                      above 4G, that cause second kernel crash on system
 -                      that require some amount of low memory, e.g. swiotlb
 -                      requires at least 64M+32K low memory, also enough extra
 -                      low memory is needed to make sure DMA buffers for 32-bit
 -                      devices won't run out. Kernel would try to allocate
 +                      [KNL, X86-64, ARM64, RISCV, LoongArch] range under 4G.
 +                      When crashkernel=X,high is passed, kernel could allocate
 +                      physical memory region above 4G, that cause second kernel
 +                      crash on system that require some amount of low memory,
 +                      e.g. swiotlb requires at least 64M+32K low memory, also
 +                      enough extra low memory is needed to make sure DMA buffers
 +                      for 32-bit devices won't run out. Kernel would try to allocate
                        default size of memory below 4G automatically. The default
                        size is platform dependent.
                          --> x86: max(swiotlb_size_or_default() + 8MiB, 256MiB)
                          --> arm64: 128MiB
                          --> riscv: 128MiB
 +                        --> loongarch: 128MiB
                        This one lets the user specify own low range under 4G
                        for second kernel instead.
                        0: to disable low allocation.
                        between unregistering the boot console and initializing
                        the real console.
  
 -      keepinitrd      [HW,ARM]
 +      keepinitrd      [HW,ARM] See retain_initrd.
  
        kernelcore=     [KNL,X86,IA-64,PPC]
                        Format: nn[KMGTPE] | nn% | "mirror"
                        vulnerability. System may allow data leaks with this
                        option.
  
 -      no-steal-acc    [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized
 -                      steal time accounting. steal time is computed, but
 -                      won't influence scheduler behaviour
 +      no-steal-acc    [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable
 +                      paravirtualized steal time accounting. steal time is
 +                      computed, but won't influence scheduler behaviour
  
        nosync          [HW,M68K] Disables sync negotiation for all devices.
  
                        Dump ftrace buffer after reporting RCU CPU
                        stall warning.
  
 +      rcupdate.rcu_cpu_stall_notifiers= [KNL]
 +                      Provide RCU CPU stall notifiers, but see the
 +                      warnings in the RCU_CPU_STALL_NOTIFIER Kconfig
 +                      option's help text.  TL;DR:  You almost certainly
 +                      do not want rcupdate.rcu_cpu_stall_notifiers.
 +
        rcupdate.rcu_cpu_stall_suppress= [KNL]
                        Suppress RCU CPU stall warning messages.
  
                        Useful for devices that are detected asynchronously
                        (e.g. USB and MMC devices).
  
 -      retain_initrd   [RAM] Keep initrd memory after extraction
 +      retain_initrd   [RAM] Keep initrd memory after extraction. After boot, it will
 +                      be accessible via /sys/firmware/initrd.
  
        retbleed=       [X86] Control mitigation of RETBleed (Arbitrary
                        Speculative Code Execution with Return Instructions)
                        Selecting 'on' will, and 'auto' may, choose a
                        mitigation method at run time according to the
                        CPU, the available microcode, the setting of the
-                       CONFIG_RETPOLINE configuration option, and the
-                       compiler with which the kernel was built.
+                       CONFIG_MITIGATION_RETPOLINE configuration option,
+                       and the compiler with which the kernel was built.
  
                        Selecting 'on' will also enable the mitigation
                        against user space to user space task attacks.
                                        pause after every control message);
                                o = USB_QUIRK_HUB_SLOW_RESET (Hub needs extra
                                        delay after resetting its port);
 +                              p = USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT
 +                                      (Reduce timeout of the SET_ADDRESS
 +                                      request from 5000 ms to 500 ms);
                        Example: quirks=0781:5580:bk,0a5c:5834:gij
  
        usbhid.mousepoll=
diff --combined arch/x86/Kconfig
index 0a31b515d1205d0246e72fb796083e74c70049b6,0a9fea390ef30233401d1d2d652edc4741a85336..502986237cb65a9d4a6a301d3182b827254cba6b
@@@ -59,7 -59,6 +59,7 @@@ config X8
        #
        select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
        select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
 +      select ACPI_HOTPLUG_CPU                 if ACPI_PROCESSOR && HOTPLUG_CPU
        select ARCH_32BIT_OFF_T                 if X86_32
        select ARCH_CLOCKSOURCE_INIT
        select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
@@@ -72,7 -71,6 +72,7 @@@
        select ARCH_HAS_CACHE_LINE_SIZE
        select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
        select ARCH_HAS_CPU_FINALIZE_INIT
 +      select ARCH_HAS_CPU_PASID               if IOMMU_SVA
        select ARCH_HAS_CURRENT_STACK_POINTER
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEBUG_VM_PGTABLE        if !X86_PAE
        select GENERIC_CLOCKEVENTS_MIN_ADJUST
        select GENERIC_CMOS_UPDATE
        select GENERIC_CPU_AUTOPROBE
 +      select GENERIC_CPU_DEVICES
        select GENERIC_CPU_VULNERABILITIES
        select GENERIC_EARLY_IOREMAP
        select GENERIC_ENTRY
@@@ -1970,11 -1967,6 +1970,11 @@@ config INTEL_TDX_HOS
        depends on CPU_SUP_INTEL
        depends on X86_64
        depends on KVM_INTEL
 +      depends on X86_X2APIC
 +      select ARCH_KEEP_MEMBLOCK
 +      depends on CONTIG_ALLOC
 +      depends on !KEXEC_CORE
 +      depends on X86_MCE
        help
          Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
          host and certain physical attacks.  This option enables necessary TDX
@@@ -2434,18 -2426,6 +2434,18 @@@ source "kernel/livepatch/Kconfig
  
  endmenu
  
 +config CC_HAS_NAMED_AS
 +      def_bool CC_IS_GCC && GCC_VERSION >= 120100
 +
 +config USE_X86_SEG_SUPPORT
 +      def_bool y
 +      depends on CC_HAS_NAMED_AS
 +      #
 +      # -fsanitize=kernel-address (KASAN) is at the moment incompatible
 +      # with named address spaces - see GCC PR sanitizer/111736.
 +      #
 +      depends on !KASAN
 +
  config CC_HAS_SLS
        def_bool $(cc-option,-mharden-sls=all)
  
@@@ -2477,12 -2457,12 +2477,12 @@@ config CALL_PADDIN
  
  config FINEIBT
        def_bool y
-       depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE
+       depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE
        select CALL_PADDING
  
  config HAVE_CALL_THUNKS
        def_bool y
-       depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL
+       depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL
  
  config CALL_THUNKS
        def_bool n
@@@ -2504,7 -2484,7 +2504,7 @@@ menuconfig SPECULATION_MITIGATION
  
  if SPECULATION_MITIGATIONS
  
- config PAGE_TABLE_ISOLATION
+ config MITIGATION_PAGE_TABLE_ISOLATION
        bool "Remove the kernel mapping in user mode"
        default y
        depends on (X86_64 || X86_PAE)
  
          See Documentation/arch/x86/pti.rst for more details.
  
- config RETPOLINE
+ config MITIGATION_RETPOLINE
        bool "Avoid speculative indirect branches in kernel"
        select OBJTOOL if HAVE_OBJTOOL
        default y
          branches. Requires a compiler with -mindirect-branch=thunk-extern
          support for full protection. The kernel may run slower.
  
- config RETHUNK
+ config MITIGATION_RETHUNK
        bool "Enable return-thunks"
-       depends on RETPOLINE && CC_HAS_RETURN_THUNK
+       depends on MITIGATION_RETPOLINE && CC_HAS_RETURN_THUNK
        select OBJTOOL if HAVE_OBJTOOL
        default y if X86_64
        help
          Requires a compiler with -mfunction-return=thunk-extern
          support for full protection. The kernel may run slower.
  
- config CPU_UNRET_ENTRY
+ config MITIGATION_UNRET_ENTRY
        bool "Enable UNRET on kernel entry"
-       depends on CPU_SUP_AMD && RETHUNK && X86_64
+       depends on CPU_SUP_AMD && MITIGATION_RETHUNK && X86_64
        default y
        help
          Compile the kernel with support for the retbleed=unret mitigation.
  
- config CALL_DEPTH_TRACKING
+ config MITIGATION_CALL_DEPTH_TRACKING
        bool "Mitigate RSB underflow with call depth tracking"
        depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS
        select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
  
  config CALL_THUNKS_DEBUG
        bool "Enable call thunks and call depth tracking debugging"
-       depends on CALL_DEPTH_TRACKING
+       depends on MITIGATION_CALL_DEPTH_TRACKING
        select FUNCTION_ALIGNMENT_32B
        default n
        help
          Only enable this when you are debugging call thunks as this
          creates a noticeable runtime overhead. If unsure say N.
  
- config CPU_IBPB_ENTRY
+ config MITIGATION_IBPB_ENTRY
        bool "Enable IBPB on kernel entry"
        depends on CPU_SUP_AMD && X86_64
        default y
        help
          Compile the kernel with support for the retbleed=ibpb mitigation.
  
- config CPU_IBRS_ENTRY
+ config MITIGATION_IBRS_ENTRY
        bool "Enable IBRS on kernel entry"
        depends on CPU_SUP_INTEL && X86_64
        default y
          This mitigates both spectre_v2 and retbleed at great cost to
          performance.
  
- config CPU_SRSO
+ config MITIGATION_SRSO
        bool "Mitigate speculative RAS overflow on AMD"
-       depends on CPU_SUP_AMD && X86_64 && RETHUNK
+       depends on CPU_SUP_AMD && X86_64 && MITIGATION_RETHUNK
        default y
        help
          Enable the SRSO mitigation needed on AMD Zen1-4 machines.
  
- config SLS
+ config MITIGATION_SLS
        bool "Mitigate Straight-Line-Speculation"
        depends on CC_HAS_SLS && X86_64
        select OBJTOOL if HAVE_OBJTOOL
          against straight line speculation. The kernel image might be slightly
          larger.
  
- config GDS_FORCE_MITIGATION
+ config MITIGATION_GDS_FORCE
        bool "Force GDS Mitigation"
        depends on CPU_SUP_INTEL
        default n
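
Most of the Kconfig, Makefile, and header changes in this merge are mechanical renames of the speculation-mitigation options into a common CONFIG_MITIGATION_* namespace. As a rough sketch of what a consumer looks like after such a rename (illustrative only and not part of this commit; the helper below is invented, real call sites simply swap the symbol name inside their existing #ifdef or IS_ENABLED() tests):

/* Illustrative sketch, not part of this commit: how kernel C code
 * typically keys off the renamed CONFIG_MITIGATION_* symbols. */
#include <linux/kconfig.h>
#include <linux/types.h>

static inline bool example_retpoline_enabled(void)
{
	/* IS_ENABLED() is 1 for =y or =m and 0 otherwise, keeping the
	 * code visible to the compiler in every configuration. */
	return IS_ENABLED(CONFIG_MITIGATION_RETPOLINE);
}
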
diff --combined arch/x86/Makefile
index 2264db14a25d3b034ffa93685d6353564d4a9225,ba046afb850e518ad2fbe707210f551e5d4cde0e..c54f835fc062c0c36755ffe4829e48acc9226e4f
@@@ -22,7 -22,7 +22,7 @@@ RETPOLINE_VDSO_CFLAGS := -mretpolin
  endif
  RETPOLINE_CFLAGS      += $(call cc-option,-mindirect-branch-cs-prefix)
  
- ifdef CONFIG_RETHUNK
+ ifdef CONFIG_MITIGATION_RETHUNK
  RETHUNK_CFLAGS                := -mfunction-return=thunk-extern
  RETPOLINE_CFLAGS      += $(RETHUNK_CFLAGS)
  endif
@@@ -112,13 -112,13 +112,13 @@@ ifeq ($(CONFIG_X86_32),y
          # temporary until string.h is fixed
          KBUILD_CFLAGS += -ffreestanding
  
 -      ifeq ($(CONFIG_STACKPROTECTOR),y)
 -              ifeq ($(CONFIG_SMP),y)
 +        ifeq ($(CONFIG_STACKPROTECTOR),y)
 +                ifeq ($(CONFIG_SMP),y)
                        KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
 -              else
 +                else
                        KBUILD_CFLAGS += -mstack-protector-guard=global
 -              endif
 -      endif
 +                endif
 +        endif
  else
          BITS := 64
          UTS_MACHINE := x86_64
@@@ -192,7 -192,7 +192,7 @@@ KBUILD_CFLAGS += -Wno-sign-compar
  KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
  
  # Avoid indirect branches in kernel to deal with Spectre
- ifdef CONFIG_RETPOLINE
+ ifdef CONFIG_MITIGATION_RETPOLINE
    KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
    # Additionally, avoid generating expensive indirect jumps which
    # are subject to retpolines for small number of switch cases.
    endif
  endif
  
- ifdef CONFIG_SLS
+ ifdef CONFIG_MITIGATION_SLS
    KBUILD_CFLAGS += -mharden-sls=all
  endif
  
@@@ -301,7 -301,7 +301,7 @@@ vdso-install-$(CONFIG_IA32_EMULATION)      +
  
  archprepare: checkbin
  checkbin:
- ifdef CONFIG_RETPOLINE
+ ifdef CONFIG_MITIGATION_RETPOLINE
  ifeq ($(RETPOLINE_CFLAGS),)
        @echo "You are building kernel with non-retpoline compiler." >&2
        @echo "Please update your compiler." >&2
diff --combined arch/x86/entry/calling.h
index e59d3073e7cf2e4daaccf986108c5c8ea21c1504,bd31b253405388854e0db323cf63235af4cc57c8..650c63795ca3039415d6acb88bd5c75c981c097e
@@@ -142,10 -142,10 +142,10 @@@ For 32-bit we have the following conven
        .endif
  .endm
  
- #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
  
  /*
-  * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
+  * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
   * halves:
   */
  #define PTI_USER_PGTABLE_BIT          PAGE_SHIFT
  
  .macro ADJUST_KERNEL_CR3 reg:req
        ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
-       /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+       /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
        andq    $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
  .endm
  
  .endm
  
  #define THIS_CPU_user_pcid_flush_mask   \
 -      PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
 +      PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask)
  
  .macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
        mov     %cr3, \scratch_reg
  .Lend_\@:
  .endm
  
- #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+ #else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */
  
  .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
  .endm
   * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
   */
  .macro IBRS_ENTER save_reg
- #ifdef CONFIG_CPU_IBRS_ENTRY
+ #ifdef CONFIG_MITIGATION_IBRS_ENTRY
        ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
        movl    $MSR_IA32_SPEC_CTRL, %ecx
  
   * regs. Must be called after the last RET.
   */
  .macro IBRS_EXIT save_reg
- #ifdef CONFIG_CPU_IBRS_ENTRY
+ #ifdef CONFIG_MITIGATION_IBRS_ENTRY
        ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
        movl    $MSR_IA32_SPEC_CTRL, %ecx
  
  .endm
  
  #endif /* CONFIG_SMP */
+ #ifdef CONFIG_X86_64
+ /* rdi:       arg1 ... normal C conventions. rax is saved/restored. */
+ .macro THUNK name, func
+ SYM_FUNC_START(\name)
+       pushq %rbp
+       movq %rsp, %rbp
+       pushq %rdi
+       pushq %rsi
+       pushq %rdx
+       pushq %rcx
+       pushq %rax
+       pushq %r8
+       pushq %r9
+       pushq %r10
+       pushq %r11
+       call \func
+       popq %r11
+       popq %r10
+       popq %r9
+       popq %r8
+       popq %rax
+       popq %rcx
+       popq %rdx
+       popq %rsi
+       popq %rdi
+       popq %rbp
+       RET
+ SYM_FUNC_END(\name)
+       _ASM_NOKPROBE(\name)
+ .endm
+ #else /* CONFIG_X86_32 */
+ /* put return address in eax (arg1) */
+ .macro THUNK name, func, put_ret_addr_in_eax=0
+ SYM_CODE_START_NOALIGN(\name)
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       .if \put_ret_addr_in_eax
+       /* Place EIP in the arg1 */
+       movl 3*4(%esp), %eax
+       .endif
+       call \func
+       popl %edx
+       popl %ecx
+       popl %eax
+       RET
+       _ASM_NOKPROBE(\name)
+ SYM_CODE_END(\name)
+       .endm
+ #endif
index 567d973eed0381810ed276a5b8c8561573a13374,d08cb3865c8ad60338b3eaa76fe64172c66d7569..1f09b1e3edebbfc070ee740cd3eb43f65b17d81f
@@@ -190,7 -190,7 +190,7 @@@ SYM_FUNC_START(__switch_to_asm
  
  #ifdef CONFIG_STACKPROTECTOR
        movq    TASK_stack_canary(%rsi), %rbx
 -      movq    %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary
 +      movq    %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary)
  #endif
  
        /*
@@@ -562,7 -562,7 +562,7 @@@ SYM_INNER_LABEL(swapgs_restore_regs_and
  #ifdef CONFIG_XEN_PV
        ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
  #endif
- #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
        ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
  #endif
  
        jnz     .Lnative_iret
        ud2
  
- #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
  .Lpti_restore_regs_and_return_to_usermode:
        POP_REGS pop_rdi=0
  
@@@ -1096,7 -1096,7 +1096,7 @@@ SYM_CODE_END(error_return
   *
   * Registers:
   *    %r14: Used to save/restore the CR3 of the interrupted context
-  *          when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
+  *          when MITIGATION_PAGE_TABLE_ISOLATION is in use.  Do not clobber.
   */
  SYM_CODE_START(asm_exc_nmi)
        UNWIND_HINT_IRET_ENTRY
index fb7702d4170c554dd9f34737d76ac69e7b07fee0,d4ff517cfbd1fbc8bf8a98701f5f918b611e052c..bf5953883ec365377fec5979f6d5c34418ebba32
@@@ -2,7 -2,6 +2,7 @@@
  #ifndef _ASM_X86_CURRENT_H
  #define _ASM_X86_CURRENT_H
  
 +#include <linux/build_bug.h>
  #include <linux/compiler.h>
  
  #ifndef __ASSEMBLY__
@@@ -18,7 -17,7 +18,7 @@@ struct pcpu_hot 
                        struct task_struct      *current_task;
                        int                     preempt_count;
                        int                     cpu_number;
- #ifdef CONFIG_CALL_DEPTH_TRACKING
+ #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
                        u64                     call_depth;
  #endif
                        unsigned long           top_of_stack;
@@@ -37,15 -36,8 +37,15 @@@ static_assert(sizeof(struct pcpu_hot) =
  
  DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
  
 +/* const-qualified alias to pcpu_hot, aliased by linker. */
 +DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
 +                      const_pcpu_hot);
 +
  static __always_inline struct task_struct *get_current(void)
  {
 +      if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
 +              return this_cpu_read_const(const_pcpu_hot.current_task);
 +
        return this_cpu_read_stable(pcpu_hot.current_task);
  }
  
index 691ff1ef701b623b8d671ea89b9b258cd25562be,55754617eaee88406217a0af6bff5b6c3a5cd97c..8bd8ed576f91fc4424a2013dca4c9b23a4681a89
  
  #ifdef CONFIG_CALL_THUNKS_DEBUG
  # define CALL_THUNKS_DEBUG_INC_CALLS                          \
 -      incq    %gs:__x86_call_count;
 +      incq    PER_CPU_VAR(__x86_call_count);
  # define CALL_THUNKS_DEBUG_INC_RETS                           \
 -      incq    %gs:__x86_ret_count;
 +      incq    PER_CPU_VAR(__x86_ret_count);
  # define CALL_THUNKS_DEBUG_INC_STUFFS                         \
 -      incq    %gs:__x86_stuffs_count;
 +      incq    PER_CPU_VAR(__x86_stuffs_count);
  # define CALL_THUNKS_DEBUG_INC_CTXSW                          \
 -      incq    %gs:__x86_ctxsw_count;
 +      incq    PER_CPU_VAR(__x86_ctxsw_count);
  #else
  # define CALL_THUNKS_DEBUG_INC_CALLS
  # define CALL_THUNKS_DEBUG_INC_RETS
  # define CALL_THUNKS_DEBUG_INC_CTXSW
  #endif
  
- #if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
+ #if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
  
  #include <asm/asm-offsets.h>
  
  #define CREDIT_CALL_DEPTH                                     \
        movq    $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
  
 -#define ASM_CREDIT_CALL_DEPTH                                 \
 -      movq    $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
 -
  #define RESET_CALL_DEPTH                                      \
        xor     %eax, %eax;                                     \
        bts     $63, %rax;                                      \
        CALL_THUNKS_DEBUG_INC_CALLS
  
  #define INCREMENT_CALL_DEPTH                                  \
 -      sarq    $5, %gs:pcpu_hot + X86_call_depth;              \
 -      CALL_THUNKS_DEBUG_INC_CALLS
 -
 -#define ASM_INCREMENT_CALL_DEPTH                              \
        sarq    $5, PER_CPU_VAR(pcpu_hot + X86_call_depth);     \
        CALL_THUNKS_DEBUG_INC_CALLS
  
  #else
  #define CREDIT_CALL_DEPTH
 -#define ASM_CREDIT_CALL_DEPTH
  #define RESET_CALL_DEPTH
 -#define INCREMENT_CALL_DEPTH
 -#define ASM_INCREMENT_CALL_DEPTH
  #define RESET_CALL_DEPTH_FROM_CALL
 +#define INCREMENT_CALL_DEPTH
  #endif
  
  /*
        jnz     771b;                                   \
        /* barrier for jnz misprediction */             \
        lfence;                                         \
 -      ASM_CREDIT_CALL_DEPTH                           \
 +      CREDIT_CALL_DEPTH                               \
        CALL_THUNKS_DEBUG_INC_CTXSW
  #else
  /*
   */
  .macro VALIDATE_UNRET_END
  #if defined(CONFIG_NOINSTR_VALIDATION) && \
-       (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
+       (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
        ANNOTATE_RETPOLINE_SAFE
        nop
  #endif
   * instruction irrespective of kCFI.
   */
  .macro JMP_NOSPEC reg:req
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
        __CS_PREFIX \reg
        jmp     __x86_indirect_thunk_\reg
  #else
  .endm
  
  .macro CALL_NOSPEC reg:req
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
        __CS_PREFIX \reg
        call    __x86_indirect_thunk_\reg
  #else
  .Lskip_rsb_\@:
  .endm
  
- #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
+ #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
  #define CALL_UNTRAIN_RET      "call entry_untrain_ret"
  #else
  #define CALL_UNTRAIN_RET      ""
   * where we have a stack but before any RET instruction.
   */
  .macro __UNTRAIN_RET ibpb_feature, call_depth_insns
- #if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY)
+ #if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY)
        VALIDATE_UNRET_END
        ALTERNATIVE_3 "",                                               \
                      CALL_UNTRAIN_RET, X86_FEATURE_UNRET,              \
  
  
  .macro CALL_DEPTH_ACCOUNT
- #ifdef CONFIG_CALL_DEPTH_TRACKING
+ #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
        ALTERNATIVE "",                                                 \
 -                  __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
 +                  __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
  #endif
  .endm
  
@@@ -319,19 -328,19 +319,19 @@@ extern retpoline_thunk_t __x86_indirect
  extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
  extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
  
- #ifdef CONFIG_RETHUNK
+ #ifdef CONFIG_MITIGATION_RETHUNK
  extern void __x86_return_thunk(void);
  #else
  static inline void __x86_return_thunk(void) {}
  #endif
  
- #ifdef CONFIG_CPU_UNRET_ENTRY
+ #ifdef CONFIG_MITIGATION_UNRET_ENTRY
  extern void retbleed_return_thunk(void);
  #else
  static inline void retbleed_return_thunk(void) {}
  #endif
  
- #ifdef CONFIG_CPU_SRSO
+ #ifdef CONFIG_MITIGATION_SRSO
  extern void srso_return_thunk(void);
  extern void srso_alias_return_thunk(void);
  #else
@@@ -348,7 -357,9 +348,9 @@@ extern void entry_ibpb(void)
  
  extern void (*x86_return_thunk)(void);
  
- #ifdef CONFIG_CALL_DEPTH_TRACKING
+ extern void __warn_thunk(void);
+ #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
  extern void call_depth_return_thunk(void);
  
  #define CALL_DEPTH_ACCOUNT                                    \
@@@ -362,14 -373,14 +364,14 @@@ DECLARE_PER_CPU(u64, __x86_ret_count)
  DECLARE_PER_CPU(u64, __x86_stuffs_count);
  DECLARE_PER_CPU(u64, __x86_ctxsw_count);
  #endif
- #else /* !CONFIG_CALL_DEPTH_TRACKING */
+ #else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
  
  static inline void call_depth_return_thunk(void) {}
  #define CALL_DEPTH_ACCOUNT ""
  
- #endif /* CONFIG_CALL_DEPTH_TRACKING */
+ #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
  
  #define GEN(reg) \
        extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
  
  /*
   * Inline asm uses the %V modifier which is only in newer GCC
-  * which is ensured when CONFIG_RETPOLINE is defined.
+  * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
   */
  # define CALL_NOSPEC                                          \
        ALTERNATIVE_2(                                          \
index e7aeae02aacaf6a63deb7ac065fc1bbca5e049a3,df91abea3420f5f97e0a606c9fdea941290e0ee3..ff6e32ec8259c278a743da1482c2329cbf15a846
@@@ -30,7 -30,6 +30,7 @@@
  #include <asm/fixmap.h>
  #include <asm/paravirt.h>
  #include <asm/asm-prototypes.h>
 +#include <asm/cfi.h>
  
  int __read_mostly alternatives_patched;
  
@@@ -45,7 -44,7 +45,7 @@@ EXPORT_SYMBOL_GPL(alternatives_patched)
  #define DA_ENDBR      0x08
  #define DA_SMP                0x10
  
 -static unsigned int __initdata_or_module debug_alternative;
 +static unsigned int debug_alternative;
  
  static int __init debug_alt(char *str)
  {
@@@ -133,7 -132,7 +133,7 @@@ const unsigned char * const x86_nops[AS
   * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
   * *jump* over instead of executing long and daft NOPs.
   */
 -static void __init_or_module add_nop(u8 *instr, unsigned int len)
 +static void add_nop(u8 *instr, unsigned int len)
  {
        u8 *target = instr + len;
  
@@@ -206,7 -205,7 +206,7 @@@ static int skip_nops(u8 *instr, int off
   * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
   * to the end of the NOP sequence into a single NOP.
   */
 -static bool __init_or_module
 +static bool
  __optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
  {
        int i = *next - insn->length;
@@@ -335,7 -334,8 +335,7 @@@ bool need_reloc(unsigned long offset, u
        return (target < src || target > src + src_len);
  }
  
 -static void __init_or_module noinline
 -apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
 +void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
  {
        int prev, target = 0;
  
@@@ -402,7 -402,7 +402,7 @@@ noinstr void BUG_func(void
  {
        BUG();
  }
 -EXPORT_SYMBOL_GPL(BUG_func);
 +EXPORT_SYMBOL(BUG_func);
  
  #define CALL_RIP_REL_OPCODE   0xff
  #define CALL_RIP_REL_MODRM    0x15
@@@ -544,7 -544,7 +544,7 @@@ static inline bool is_jcc32(struct ins
        return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
  }
  
- #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
+ #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
  
  /*
   * CALL/JMP *%\reg
@@@ -708,8 -708,8 +708,8 @@@ static int patch_retpoline(void *addr, 
        /*
         * The compiler is supposed to EMIT an INT3 after every unconditional
         * JMP instruction due to AMD BTC. However, if the compiler is too old
-        * or SLS isn't enabled, we still need an INT3 after indirect JMPs
-        * even on Intel.
+        * or MITIGATION_SLS isn't enabled, we still need an INT3 after
+        * indirect JMPs even on Intel.
         */
        if (op == JMP32_INSN_OPCODE && i < insn->length)
                bytes[i++] = INT3_INSN_OPCODE;
@@@ -769,7 -769,7 +769,7 @@@ void __init_or_module noinline apply_re
        }
  }
  
- #ifdef CONFIG_RETHUNK
+ #ifdef CONFIG_MITIGATION_RETHUNK
  
  /*
   * Rewrite the compiler generated return thunk tail-calls.
@@@ -842,14 -842,14 +842,14 @@@ void __init_or_module noinline apply_re
  }
  #else
  void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
- #endif /* CONFIG_RETHUNK */
+ #endif /* CONFIG_MITIGATION_RETHUNK */
  
- #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
+ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
  
  void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
  void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
  
- #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
+ #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
  
  #ifdef CONFIG_X86_KERNEL_IBT
  
@@@ -903,82 -903,15 +903,82 @@@ void __init_or_module apply_seal_endbr(
  #endif /* CONFIG_X86_KERNEL_IBT */
  
  #ifdef CONFIG_FINEIBT
 +#define __CFI_DEFAULT CFI_DEFAULT
 +#elif defined(CONFIG_CFI_CLANG)
 +#define __CFI_DEFAULT CFI_KCFI
 +#else
 +#define __CFI_DEFAULT CFI_OFF
 +#endif
  
 -enum cfi_mode {
 -      CFI_DEFAULT,
 -      CFI_OFF,
 -      CFI_KCFI,
 -      CFI_FINEIBT,
 -};
 +enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
 +
 +#ifdef CONFIG_CFI_CLANG
 +struct bpf_insn;
 +
 +/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
 +extern unsigned int __bpf_prog_runX(const void *ctx,
 +                                  const struct bpf_insn *insn);
 +
 +/*
 + * Force a reference to the external symbol so the compiler generates
 + * __kcfi_typid.
 + */
 +__ADDRESSABLE(__bpf_prog_runX);
 +
 +/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
 +asm (
 +"     .pushsection    .data..ro_after_init,\"aw\",@progbits   \n"
 +"     .type   cfi_bpf_hash,@object                            \n"
 +"     .globl  cfi_bpf_hash                                    \n"
 +"     .p2align        2, 0x0                                  \n"
 +"cfi_bpf_hash:                                                        \n"
 +"     .long   __kcfi_typeid___bpf_prog_runX                   \n"
 +"     .size   cfi_bpf_hash, 4                                 \n"
 +"     .popsection                                             \n"
 +);
 +
 +/* Must match bpf_callback_t */
 +extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
 +
 +__ADDRESSABLE(__bpf_callback_fn);
 +
 +/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
 +asm (
 +"     .pushsection    .data..ro_after_init,\"aw\",@progbits   \n"
 +"     .type   cfi_bpf_subprog_hash,@object                    \n"
 +"     .globl  cfi_bpf_subprog_hash                            \n"
 +"     .p2align        2, 0x0                                  \n"
 +"cfi_bpf_subprog_hash:                                                \n"
 +"     .long   __kcfi_typeid___bpf_callback_fn                 \n"
 +"     .size   cfi_bpf_subprog_hash, 4                         \n"
 +"     .popsection                                             \n"
 +);
 +
 +u32 cfi_get_func_hash(void *func)
 +{
 +      u32 hash;
 +
 +      func -= cfi_get_offset();
 +      switch (cfi_mode) {
 +      case CFI_FINEIBT:
 +              func += 7;
 +              break;
 +      case CFI_KCFI:
 +              func += 1;
 +              break;
 +      default:
 +              return 0;
 +      }
 +
 +      if (get_kernel_nofault(hash, func))
 +              return 0;
 +
 +      return hash;
 +}
 +#endif
 +
 +#ifdef CONFIG_FINEIBT
  
 -static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
  static bool cfi_rand __ro_after_init = true;
  static u32  cfi_seed __ro_after_init;
  
@@@ -1287,11 -1220,8 +1287,11 @@@ static void __apply_fineibt(s32 *start_
                goto err;
  
        if (cfi_rand) {
 -              if (builtin)
 +              if (builtin) {
                        cfi_seed = get_random_u32();
 +                      cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
 +                      cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
 +              }
  
                ret = cfi_rand_preamble(start_cfi, end_cfi);
                if (ret)
index f3abca334199d8eae235f1560f99448eb9675a27,6c7db2ec2c5e50cee5313b4a16a9b7e45064f278..310fea1af124a09d20f9e4542b76ad026dac5314
@@@ -538,7 -538,7 +538,7 @@@ static void bsp_init_amd(struct cpuinfo
  
        /* Figure out Zen generations: */
        switch (c->x86) {
 -      case 0x17: {
 +      case 0x17:
                switch (c->x86_model) {
                case 0x00 ... 0x2f:
                case 0x50 ... 0x5f:
                        goto warn;
                }
                break;
 -      }
 -      case 0x19: {
 +
 +      case 0x19:
                switch (c->x86_model) {
                case 0x00 ... 0x0f:
                case 0x20 ... 0x5f:
                        goto warn;
                }
                break;
 -      }
 +
 +      case 0x1a:
 +              switch (c->x86_model) {
 +              case 0x00 ... 0x0f:
 +              case 0x20 ... 0x2f:
 +              case 0x40 ... 0x4f:
 +              case 0x70 ... 0x7f:
 +                      setup_force_cpu_cap(X86_FEATURE_ZEN5);
 +                      break;
 +              default:
 +                      goto warn;
 +              }
 +              break;
 +
        default:
                break;
        }
@@@ -941,7 -928,7 +941,7 @@@ static void fix_erratum_1386(struct cpu
  
  void init_spectral_chicken(struct cpuinfo_x86 *c)
  {
- #ifdef CONFIG_CPU_UNRET_ENTRY
+ #ifdef CONFIG_MITIGATION_UNRET_ENTRY
        u64 value;
  
        /*
@@@ -1052,11 -1039,6 +1052,11 @@@ static void init_amd_zen4(struct cpuinf
                msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
  }
  
 +static void init_amd_zen5(struct cpuinfo_x86 *c)
 +{
 +      init_amd_zen_common();
 +}
 +
  static void init_amd(struct cpuinfo_x86 *c)
  {
        u64 vm_cr;
                init_amd_zen3(c);
        else if (boot_cpu_has(X86_FEATURE_ZEN4))
                init_amd_zen4(c);
 +      else if (boot_cpu_has(X86_FEATURE_ZEN5))
 +              init_amd_zen5(c);
  
        /*
         * Enable workaround for FXSAVE leak on CPUs
index bb8ee1ce696836667caa5700b4bcce6cb2ab5488,cd54313706a2882181dddf2108909961202f7a57..cc3a81852e4acbe759e3d156722ac9959a2ad1d7
@@@ -477,7 -477,7 +477,7 @@@ SYM_CODE_START(soft_restart_cpu
        UNWIND_HINT_END_OF_STACK
  
        /* Find the idle task stack */
 -      movq    PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
 +      movq    PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
        movq    TASK_threadsp(%rcx), %rsp
  
        jmp     .Ljump_to_C_code
@@@ -622,7 -622,7 +622,7 @@@ SYM_CODE_END(vc_no_ghcb
  #define SYM_DATA_START_PAGE_ALIGNED(name)                     \
        SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
  
- #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
  /*
   * Each PGD needs to be 8k long and 8k aligned.  We do not
   * ever go out to userspace with these, so we do not
index 9be175c8ac975b46c61ebb75a47160fb6bf1deb1,6716fccd59ce09930f295ac652c648b5376e8c74..56451fd2099e718b6cf89fde074980042aa3096a
@@@ -46,7 -46,6 +46,7 @@@ ENTRY(phys_startup_64
  #endif
  
  jiffies = jiffies_64;
 +const_pcpu_hot = pcpu_hot;
  
  #if defined(CONFIG_X86_64)
  /*
@@@ -133,7 -132,7 +133,7 @@@ SECTION
                LOCK_TEXT
                KPROBES_TEXT
                SOFTIRQENTRY_TEXT
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
                *(.text..__x86.indirect_thunk)
                *(.text..__x86.return_thunk)
  #endif
                *(.text..__x86.rethunk_untrain)
                ENTRY_TEXT
  
- #ifdef CONFIG_CPU_SRSO
+ #ifdef CONFIG_MITIGATION_SRSO
                /*
                 * See the comment above srso_alias_untrain_ret()'s
                 * definition.
        }
  #endif
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
        /*
         * List of instructions that call/jmp/jcc to retpoline thunks
         * __x86_indirect_thunk_*(). These instructions can be patched along
@@@ -505,11 -504,11 +505,11 @@@ INIT_PER_CPU(irq_stack_backing_store)
             "fixed_percpu_data is not at start of per-cpu area");
  #endif
  
- #ifdef CONFIG_CPU_UNRET_ENTRY
+ #ifdef CONFIG_MITIGATION_UNRET_ENTRY
  . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
  #endif
  
- #ifdef CONFIG_CPU_SRSO
+ #ifdef CONFIG_MITIGATION_SRSO
  . = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
  /*
   * GNU ld cannot do XOR until 2.41.
diff --combined arch/x86/kvm/mmu/mmu.c
index 2d6cdeab1f8a3e78306148d44a4665a1d51d8b1e,6fdc1cf5c33d43532522fe8411d8a04c383a4ed1..d59e3ba5d6463ec36199b437f40f78ecfbdf8d83
@@@ -263,7 -263,7 +263,7 @@@ static unsigned long get_guest_cr3(stru
  static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
                                                  struct kvm_mmu *mmu)
  {
-       if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+       if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
                return kvm_read_cr3(vcpu);
  
        return mmu->get_guest_pgd(vcpu);
  
  static inline bool kvm_available_flush_remote_tlbs_range(void)
  {
 +#if IS_ENABLED(CONFIG_HYPERV)
        return kvm_x86_ops.flush_remote_tlbs_range;
 -}
 -
 -int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
 -{
 -      if (!kvm_x86_ops.flush_remote_tlbs_range)
 -              return -EOPNOTSUPP;
 -
 -      return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
 +#else
 +      return false;
 +#endif
  }
  
  static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
@@@ -791,26 -795,16 +791,26 @@@ static struct kvm_lpage_info *lpage_inf
        return &slot->arch.lpage_info[level - 2][idx];
  }
  
 +/*
 + * The most significant bit in disallow_lpage tracks whether or not memory
 + * attributes are mixed, i.e. not identical for all gfns at the current level.
 + * The lower order bits are used to refcount other cases where a hugepage is
 + * disallowed, e.g. if KVM has shadow a page table at the gfn.
 + */
 +#define KVM_LPAGE_MIXED_FLAG  BIT(31)
 +
  static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
                                            gfn_t gfn, int count)
  {
        struct kvm_lpage_info *linfo;
 -      int i;
 +      int old, i;
  
        for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
                linfo = lpage_info_slot(gfn, slot, i);
 +
 +              old = linfo->disallow_lpage;
                linfo->disallow_lpage += count;
 -              WARN_ON_ONCE(linfo->disallow_lpage < 0);
 +              WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
        }
  }
  
@@@ -1388,7 -1382,7 +1388,7 @@@ void kvm_arch_mmu_enable_log_dirty_pt_m
                gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
  
                if (READ_ONCE(eager_page_split))
 -                      kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
 +                      kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);
  
                kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
  
@@@ -2846,9 -2840,9 +2846,9 @@@ int mmu_try_to_unsync_pages(struct kvm 
                        /*
                         * Recheck after taking the spinlock, a different vCPU
                         * may have since marked the page unsync.  A false
 -                       * positive on the unprotected check above is not
 +                       * negative on the unprotected check above is not
                         * possible as clearing sp->unsync _must_ hold mmu_lock
 -                       * for write, i.e. unsync cannot transition from 0->1
 +                       * for write, i.e. unsync cannot transition from 1->0
                         * while this CPU holds mmu_lock for read (or write).
                         */
                        if (READ_ONCE(sp->unsync))
@@@ -3062,7 -3056,7 +3062,7 @@@ static void direct_pte_prefetch(struct 
   *
   * There are several ways to safely use this helper:
   *
 - * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
 + * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
   *   consuming it.  In this case, mmu_lock doesn't need to be held during the
   *   lookup, but it does need to be held while checking the MMU notifier.
   *
@@@ -3143,9 -3137,9 +3143,9 @@@ out
        return level;
  }
  
 -int kvm_mmu_max_mapping_level(struct kvm *kvm,
 -                            const struct kvm_memory_slot *slot, gfn_t gfn,
 -                            int max_level)
 +static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
 +                                     const struct kvm_memory_slot *slot,
 +                                     gfn_t gfn, int max_level, bool is_private)
  {
        struct kvm_lpage_info *linfo;
        int host_level;
                        break;
        }
  
 +      if (is_private)
 +              return max_level;
 +
        if (max_level == PG_LEVEL_4K)
                return PG_LEVEL_4K;
  
        return min(host_level, max_level);
  }
  
 +int kvm_mmu_max_mapping_level(struct kvm *kvm,
 +                            const struct kvm_memory_slot *slot, gfn_t gfn,
 +                            int max_level)
 +{
 +      bool is_private = kvm_slot_can_be_private(slot) &&
 +                        kvm_mem_is_private(kvm, gfn);
 +
 +      return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
 +}
 +
  void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
  {
        struct kvm_memory_slot *slot = fault->slot;
         * Enforce the iTLB multihit workaround after capturing the requested
         * level, which will be used to do precise, accurate accounting.
         */
 -      fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
 -                                                   fault->gfn, fault->max_level);
 +      fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
 +                                                     fault->gfn, fault->max_level,
 +                                                     fault->is_private);
        if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
                return;
  
@@@ -3576,7 -3556,7 +3576,7 @@@ static void mmu_free_root_page(struct k
                return;
  
        if (is_tdp_mmu_page(sp))
 -              kvm_tdp_mmu_put_root(kvm, sp, false);
 +              kvm_tdp_mmu_put_root(kvm, sp);
        else if (!--sp->root_count && sp->role.invalid)
                kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
  
@@@ -3759,7 -3739,7 +3759,7 @@@ static int mmu_first_shadow_root_alloc(
            kvm_page_track_write_tracking_enabled(kvm))
                goto out_success;
  
 -      for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 +      for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                slots = __kvm_memslots(kvm, i);
                kvm_for_each_memslot(slot, bkt, slots) {
                        /*
@@@ -3802,7 -3782,7 +3802,7 @@@ static int mmu_alloc_shadow_roots(struc
        hpa_t root;
  
        root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
 -      root_gfn = root_pgd >> PAGE_SHIFT;
 +      root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
  
        if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
                mmu->root.hpa = kvm_mmu_get_dummy_root();
@@@ -4279,55 -4259,6 +4279,55 @@@ void kvm_arch_async_page_ready(struct k
        kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
  }
  
 +static inline u8 kvm_max_level_for_order(int order)
 +{
 +      BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
 +
 +      KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
 +                      order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
 +                      order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
 +
 +      if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
 +              return PG_LEVEL_1G;
 +
 +      if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
 +              return PG_LEVEL_2M;
 +
 +      return PG_LEVEL_4K;
 +}
 +
 +static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
 +                                            struct kvm_page_fault *fault)
 +{
 +      kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
 +                                    PAGE_SIZE, fault->write, fault->exec,
 +                                    fault->is_private);
 +}
 +
 +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
 +                                 struct kvm_page_fault *fault)
 +{
 +      int max_order, r;
 +
 +      if (!kvm_slot_can_be_private(fault->slot)) {
 +              kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
 +              return -EFAULT;
 +      }
 +
 +      r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
 +                           &max_order);
 +      if (r) {
 +              kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
 +              return r;
 +      }
 +
 +      fault->max_level = min(kvm_max_level_for_order(max_order),
 +                             fault->max_level);
 +      fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
 +
 +      return RET_PF_CONTINUE;
 +}
 +
  static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
  {
        struct kvm_memory_slot *slot = fault->slot;
                        return RET_PF_EMULATE;
        }
  
 +      if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
 +              kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
 +              return -EFAULT;
 +      }
 +
 +      if (fault->is_private)
 +              return kvm_faultin_pfn_private(vcpu, fault);
 +
        async = false;
        fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
                                          fault->write, &fault->map_writable,
@@@ -4443,7 -4366,7 +4443,7 @@@ static bool is_page_fault_stale(struct 
                return true;
  
        return fault->slot &&
 -             mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
 +             mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
  }
  
  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@@ -6305,7 -6228,7 +6305,7 @@@ static bool kvm_rmap_zap_gfn_range(stru
        if (!kvm_memslots_have_rmaps(kvm))
                return flush;
  
 -      for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 +      for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                slots = __kvm_memslots(kvm, i);
  
                kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
@@@ -6337,9 -6260,7 +6337,9 @@@ void kvm_zap_gfn_range(struct kvm *kvm
  
        write_lock(&kvm->mmu_lock);
  
 -      kvm_mmu_invalidate_begin(kvm, 0, -1ul);
 +      kvm_mmu_invalidate_begin(kvm);
 +
 +      kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
  
        flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
  
        if (flush)
                kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
  
 -      kvm_mmu_invalidate_end(kvm, 0, -1ul);
 +      kvm_mmu_invalidate_end(kvm);
  
        write_unlock(&kvm->mmu_lock);
  }
@@@ -6802,7 -6723,7 +6802,7 @@@ void kvm_mmu_invalidate_mmio_sptes(stru
         * modifier prior to checking for a wrap of the MMIO generation so
         * that a wrap in any address space is detected.
         */
 -      gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
 +      gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
  
        /*
         * The very rare case: if the MMIO generation number has wrapped,
@@@ -7255,163 -7176,3 +7255,163 @@@ void kvm_mmu_pre_destroy_vm(struct kvm 
        if (kvm->arch.nx_huge_page_recovery_thread)
                kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
  }
 +
 +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
 +bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
 +                                      struct kvm_gfn_range *range)
 +{
 +      /*
 +       * Zap SPTEs even if the slot can't be mapped PRIVATE.  KVM x86 only
 +       * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
 +       * can simply ignore such slots.  But if userspace is making memory
 +       * PRIVATE, then KVM must prevent the guest from accessing the memory
 +       * as shared.  And if userspace is making memory SHARED and this point
 +       * is reached, then at least one page within the range was previously
 +       * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
 +       * Zapping SPTEs in this case ensures KVM will reassess whether or not
 +       * a hugepage can be used for affected ranges.
 +       */
 +      if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
 +              return false;
 +
 +      return kvm_unmap_gfn_range(kvm, range);
 +}
 +
 +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
 +                              int level)
 +{
 +      return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
 +}
 +
 +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
 +                               int level)
 +{
 +      lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
 +}
 +
 +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
 +                             int level)
 +{
 +      lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
 +}
 +
 +static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
 +                             gfn_t gfn, int level, unsigned long attrs)
 +{
 +      const unsigned long start = gfn;
 +      const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
 +
 +      if (level == PG_LEVEL_2M)
 +              return kvm_range_has_memory_attributes(kvm, start, end, attrs);
 +
 +      for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
 +              if (hugepage_test_mixed(slot, gfn, level - 1) ||
 +                  attrs != kvm_get_memory_attributes(kvm, gfn))
 +                      return false;
 +      }
 +      return true;
 +}
 +
 +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
 +                                       struct kvm_gfn_range *range)
 +{
 +      unsigned long attrs = range->arg.attributes;
 +      struct kvm_memory_slot *slot = range->slot;
 +      int level;
 +
 +      lockdep_assert_held_write(&kvm->mmu_lock);
 +      lockdep_assert_held(&kvm->slots_lock);
 +
 +      /*
 +       * Calculate which ranges can be mapped with hugepages even if the slot
 +       * can't map memory PRIVATE.  KVM mustn't create a SHARED hugepage over
 +       * a range that has PRIVATE GFNs, and conversely converting a range to
 +       * SHARED may now allow hugepages.
 +       */
 +      if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
 +              return false;
 +
 +      /*
 +       * The sequence matters here: upper levels consume the result of lower
 +       * level's scanning.
 +       */
 +      for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
 +              gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
 +              gfn_t gfn = gfn_round_for_level(range->start, level);
 +
 +              /* Process the head page if it straddles the range. */
 +              if (gfn != range->start || gfn + nr_pages > range->end) {
 +                      /*
 +                       * Skip mixed tracking if the aligned gfn isn't covered
 +                       * by the memslot, KVM can't use a hugepage due to the
 +                       * misaligned address regardless of memory attributes.
 +                       */
 +                      if (gfn >= slot->base_gfn) {
 +                              if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
 +                                      hugepage_clear_mixed(slot, gfn, level);
 +                              else
 +                                      hugepage_set_mixed(slot, gfn, level);
 +                      }
 +                      gfn += nr_pages;
 +              }
 +
 +              /*
 +               * Pages entirely covered by the range are guaranteed to have
 +               * only the attributes which were just set.
 +               */
 +              for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
 +                      hugepage_clear_mixed(slot, gfn, level);
 +
 +              /*
 +               * Process the last tail page if it straddles the range and is
 +               * contained by the memslot.  Like the head page, KVM can't
 +               * create a hugepage if the slot size is misaligned.
 +               */
 +              if (gfn < range->end &&
 +                  (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
 +                      if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
 +                              hugepage_clear_mixed(slot, gfn, level);
 +                      else
 +                              hugepage_set_mixed(slot, gfn, level);
 +              }
 +      }
 +      return false;
 +}
 +
 +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
 +                                          struct kvm_memory_slot *slot)
 +{
 +      int level;
 +
 +      if (!kvm_arch_has_private_mem(kvm))
 +              return;
 +
 +      for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
 +              /*
 +               * Don't bother tracking mixed attributes for pages that can't
 +               * be huge due to alignment, i.e. process only pages that are
 +               * entirely contained by the memslot.
 +               */
 +              gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
 +              gfn_t start = gfn_round_for_level(slot->base_gfn, level);
 +              gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
 +              gfn_t gfn;
 +
 +              if (start < slot->base_gfn)
 +                      start += nr_pages;
 +
 +              /*
 +               * Unlike setting attributes, every potential hugepage needs to
 +               * be manually checked as the attributes may already be mixed.
 +               */
 +              for (gfn = start; gfn < end; gfn += nr_pages) {
 +                      unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
 +
 +                      if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
 +                              hugepage_clear_mixed(slot, gfn, level);
 +                      else
 +                              hugepage_set_mixed(slot, gfn, level);
 +              }
 +      }
 +}
 +#endif
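The head/tail handling in kvm_arch_post_set_memory_attributes() above boils down to hugepage-aligned GFN arithmetic. The following is a minimal standalone sketch of that arithmetic (hypothetical helpers, assuming 4 KiB base pages, i.e. 512 GFNs per 2M hugepage), not KVM code:

    /* Standalone sketch, not KVM code: hugepage-aligned GFN math. */
    #define GFNS_PER_2M   512ULL                 /* 4 KiB base pages per 2M page */

    static unsigned long long round_gfn_down(unsigned long long gfn,
                                             unsigned long long nr)
    {
            return gfn & ~(nr - 1);              /* nr is a power of two */
    }

    /*
     * Example range [0x1f3, 0x60a) at the 2M level:
     *   round_gfn_down(0x1f3, 512) == 0x000  -> head block straddles the
     *     range start, so its attributes must be re-checked for mixing;
     *   [0x200, 0x400) and [0x400, 0x600)    -> fully covered, can simply
     *     be marked "not mixed";
     *   the block starting at 0x600          -> straddles the range end,
     *     re-checked like the head block.
     */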
index 0669a8a668cacd4d0be68affbecbb686524c5213,bf73a121c5ef14fdd758607ef8992f2deb01112e..5390a591a5718cce422958210e3eb91f457bd169
@@@ -13,7 -13,6 +13,7 @@@
  #endif
  
  /* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
 +#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12)
  #define __PT_LEVEL_SHIFT(level, bits_per_level)       \
        (PAGE_SHIFT + ((level) - 1) * (bits_per_level))
  #define __PT_INDEX(address, level, bits_per_level) \
@@@ -202,7 -201,6 +202,7 @@@ struct kvm_page_fault 
  
        /* Derived from mmu and global state.  */
        const bool is_tdp;
 +      const bool is_private;
        const bool nx_huge_page_workaround_enabled;
  
        /*
@@@ -298,7 -296,6 +298,7 @@@ static inline int kvm_mmu_do_page_fault
                .max_level = KVM_MAX_HUGEPAGE_LEVEL,
                .req_level = PG_LEVEL_4K,
                .goal_level = PG_LEVEL_4K,
 +              .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
        };
        int r;
  
        if (!prefetch)
                vcpu->stat.pf_taken++;
  
-       if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
+       if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
                r = kvm_tdp_page_fault(vcpu, &fault);
        else
                r = vcpu->arch.mmu->page_fault(vcpu, &fault);
diff --combined arch/x86/kvm/svm/svm.c
index e90b429c84f158bdd8d4348172d56eac1e80763b,b2751b9acf03da401e7034d0207c82c74b45ea41..61f2bdc9f4f8b294c3db3486637f9d1e5c3b428b
@@@ -3455,7 -3455,7 +3455,7 @@@ int svm_invoke_exit_handler(struct kvm_
        if (!svm_check_exit_valid(exit_code))
                return svm_handle_invalid_exit(vcpu, exit_code);
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
                return msr_interception(vcpu);
        else if (exit_code == SVM_EXIT_VINTR)
@@@ -3563,15 -3563,8 +3563,15 @@@ static void svm_inject_nmi(struct kvm_v
        if (svm->nmi_l1_to_l2)
                return;
  
 -      svm->nmi_masked = true;
 -      svm_set_iret_intercept(svm);
 +      /*
 +       * No need to manually track NMI masking when vNMI is enabled, hardware
 +       * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
 +       * case where software directly injects an NMI.
 +       */
 +      if (!is_vnmi_enabled(svm)) {
 +              svm->nmi_masked = true;
 +              svm_set_iret_intercept(svm);
 +      }
        ++vcpu->stat.nmi_injections;
  }
  
@@@ -5086,13 -5079,6 +5086,13 @@@ static __init void svm_set_cpu_caps(voi
                kvm_cpu_cap_set(X86_FEATURE_SVM);
                kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
  
 +              /*
 +               * KVM currently flushes TLBs on *every* nested SVM transition,
 +               * and so for all intents and purposes KVM supports flushing by
 +               * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
 +               */
 +              kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);
 +
                if (nrips)
                        kvm_cpu_cap_set(X86_FEATURE_NRIPS);
  
index 9499f9c6b07711bb1254ce574584ebc166d293fc,b9e08837ab96070c5706ae97ddb9b8bd97bdb02f..187018c424bfb4ba8cadfa71a0f4ec7d4c63d766
@@@ -207,7 -207,7 +207,7 @@@ SYM_FUNC_START(__svm_vcpu_run
  7:    vmload %_ASM_AX
  8:
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
        FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
  #endif
        RESTORE_GUEST_SPEC_CTRL_BODY
        RESTORE_HOST_SPEC_CTRL_BODY
  
 -10:   cmpb $0, kvm_rebooting
 +10:   cmpb $0, _ASM_RIP(kvm_rebooting)
        jne 2b
        ud2
 -30:   cmpb $0, kvm_rebooting
 +30:   cmpb $0, _ASM_RIP(kvm_rebooting)
        jne 4b
        ud2
 -50:   cmpb $0, kvm_rebooting
 +50:   cmpb $0, _ASM_RIP(kvm_rebooting)
        jne 6b
        ud2
 -70:   cmpb $0, kvm_rebooting
 +70:   cmpb $0, _ASM_RIP(kvm_rebooting)
        jne 8b
        ud2
  
@@@ -344,7 -344,7 +344,7 @@@ SYM_FUNC_START(__svm_sev_es_vcpu_run
        /* Pop @svm to RDI, guest registers have been saved already. */
        pop %_ASM_DI
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
        /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
        FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
  #endif
        RESTORE_GUEST_SPEC_CTRL_BODY
        RESTORE_HOST_SPEC_CTRL_BODY
  
 -3:    cmpb $0, kvm_rebooting
 +3:    cmpb $0, _ASM_RIP(kvm_rebooting)
        jne 2b
        ud2
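The kvm_rebooting checks above switch from an absolute reference to a RIP-relative one. Roughly speaking (this is an illustration, not the verbatim macro from arch/x86/include/asm/asm.h):

    /*
     * Illustration only, not the kernel's exact definition: on x86-64,
     * _ASM_RIP(sym) yields a RIP-relative operand, so
     *
     *         cmpb $0, _ASM_RIP(kvm_rebooting)
     *
     * assembles along the lines of
     *
     *         cmpb $0, kvm_rebooting(%rip)
     *
     * keeping the reference position-independent instead of encoding a
     * 32-bit absolute address.
     */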
  
diff --combined arch/x86/kvm/vmx/vmx.c
index 1111d9d089038b2f17b372891a235222b74f87bf,4e1003ba380aec4cfb73943907e345b0e5e7c192..ebf11ef59f03c612b7ae48ebeee97519f9eb59eb
@@@ -66,7 -66,6 +66,7 @@@
  #include "vmx.h"
  #include "x86.h"
  #include "smm.h"
 +#include "vmx_onhyperv.h"
  
  MODULE_AUTHOR("Qumranet");
  MODULE_LICENSE("GPL");
@@@ -524,14 -523,22 +524,14 @@@ module_param(enlightened_vmcs, bool, 04
  static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
  {
        struct hv_enlightened_vmcs *evmcs;
 -      struct hv_partition_assist_pg **p_hv_pa_pg =
 -                      &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
 -      /*
 -       * Synthetic VM-Exit is not enabled in current code and so All
 -       * evmcs in singe VM shares same assist page.
 -       */
 -      if (!*p_hv_pa_pg)
 -              *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
 +      hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
  
 -      if (!*p_hv_pa_pg)
 +      if (partition_assist_page == INVALID_PAGE)
                return -ENOMEM;
  
        evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
  
 -      evmcs->partition_assist_page =
 -              __pa(*p_hv_pa_pg);
 +      evmcs->partition_assist_page = partition_assist_page;
        evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
        evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
  
@@@ -738,7 -745,7 +738,7 @@@ static int vmx_set_guest_uret_msr(struc
   */
  static int kvm_cpu_vmxoff(void)
  {
 -      asm_volatile_goto("1: vmxoff\n\t"
 +      asm goto("1: vmxoff\n\t"
                          _ASM_EXTABLE(1b, %l[fault])
                          ::: "cc", "memory" : fault);
  
@@@ -2048,7 -2055,6 +2048,7 @@@ static int vmx_get_msr(struct kvm_vcpu 
                if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
                                    &msr_info->data))
                        return 1;
 +#ifdef CONFIG_KVM_HYPERV
                /*
                 * Enlightened VMCS v1 doesn't have certain VMCS fields but
                 * instead of just ignoring the features, different Hyper-V
                if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
                        nested_evmcs_filter_control_msr(vcpu, msr_info->index,
                                                        &msr_info->data);
 +#endif
                break;
        case MSR_IA32_RTIT_CTL:
                if (!vmx_pt_mode_is_host_guest())
@@@ -2784,7 -2789,7 +2784,7 @@@ static int kvm_cpu_vmxon(u64 vmxon_poin
  
        cr4_set_bits(X86_CR4_VMXE);
  
 -      asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
 +      asm goto("1: vmxon %[vmxon_pointer]\n\t"
                          _ASM_EXTABLE(1b, %l[fault])
                          : : [vmxon_pointer] "m"(vmxon_pointer)
                          : : fault);
@@@ -3395,8 -3400,7 +3395,8 @@@ static void vmx_load_mmu_pgd(struct kvm
                        update_guest_cr3 = false;
                vmx_ept_load_pdptrs(vcpu);
        } else {
 -              guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
 +              guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
 +                          kvm_get_active_cr3_lam_bits(vcpu);
        }
  
        if (update_guest_cr3)
@@@ -4829,10 -4833,7 +4829,10 @@@ static void __vmx_vcpu_reset(struct kvm
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.vmxon_ptr = INVALID_GPA;
        vmx->nested.current_vmptr = INVALID_GPA;
 +
 +#ifdef CONFIG_KVM_HYPERV
        vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
 +#endif
  
        vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
@@@ -5781,7 -5782,7 +5781,7 @@@ static int handle_ept_violation(struct 
         * would also use advanced VM-exit information for EPT violations to
         * reconstruct the page fault error code.
         */
 -      if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
 +      if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
                return kvm_emulate_instruction(vcpu, 0);
  
        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@@ -6543,7 -6544,7 +6543,7 @@@ static int __vmx_handle_exit(struct kvm
  
        if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
                goto unexpected_vmexit;
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
        if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
                return kvm_emulate_wrmsr(vcpu);
        else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
@@@ -6756,10 -6757,10 +6756,10 @@@ static void vmx_set_apic_access_page_ad
                return;
  
        /*
 -       * Grab the memslot so that the hva lookup for the mmu_notifier retry
 -       * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
 -       * on the pfn lookup's validation of the memslot to ensure a valid hva
 -       * is used for the retry check.
 +       * Explicitly grab the memslot using KVM's internal slot ID to ensure
 +       * KVM doesn't unintentionally grab a userspace memslot.  It _should_
 +       * be impossible for userspace to create a memslot for the APIC when
 +       * APICv is enabled, but paranoia won't hurt in this case.
         */
        slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
                return;
  
        read_lock(&vcpu->kvm->mmu_lock);
 -      if (mmu_invalidate_retry_hva(kvm, mmu_seq,
 -                                   gfn_to_hva_memslot(slot, gfn))) {
 +      if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
                kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
                read_unlock(&vcpu->kvm->mmu_lock);
                goto out;
@@@ -7672,9 -7674,6 +7672,9 @@@ static void nested_vmx_cr_fixed1_bits_u
        cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
        cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));
  
 +      entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
 +      cr4_fixed1_update(X86_CR4_LAM_SUP,    eax, feature_bit(LAM));
 +
  #undef cr4_fixed1_update
  }
  
@@@ -7761,7 -7760,6 +7761,7 @@@ static void vmx_vcpu_after_set_cpuid(st
                kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
  
        kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
 +      kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
  
        vmx_setup_uret_msrs(vmx);
  
@@@ -8208,50 -8206,6 +8208,50 @@@ static void vmx_vm_destroy(struct kvm *
        free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
  }
  
 +/*
 + * Note, the SDM states that the linear address is masked *after* the modified
 + * canonicality check, whereas KVM masks (untags) the address and then performs
 + * a "normal" canonicality check.  Functionally, the two methods are identical,
 + * and when the masking occurs relative to the canonicality check isn't visible
 + * to software, i.e. KVM's behavior doesn't violate the SDM.
 + */
 +gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
 +{
 +      int lam_bit;
 +      unsigned long cr3_bits;
 +
 +      if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
 +              return gva;
 +
 +      if (!is_64_bit_mode(vcpu))
 +              return gva;
 +
 +      /*
 +       * Bit 63 determines if the address should be treated as user address
 +       * or a supervisor address.
 +       */
 +      if (!(gva & BIT_ULL(63))) {
 +              cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
 +              if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
 +                      return gva;
 +
 +              /* LAM_U48 is ignored if LAM_U57 is set. */
 +              lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
 +      } else {
 +              if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
 +                      return gva;
 +
 +              lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
 +      }
 +
 +      /*
 +       * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
 +       * Bit 63 is retained from the raw virtual address so that untagging
 +       * doesn't change a user access to a supervisor access, and vice versa.
 +       */
 +      return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
 +}
 +
  static struct kvm_x86_ops vmx_x86_ops __initdata = {
        .name = KBUILD_MODNAME,
  
        .complete_emulated_msr = kvm_complete_insn_gp,
  
        .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
 +
 +      .get_untagged_addr = vmx_get_untagged_addr,
  };
  
  static unsigned int vmx_handle_intel_pt_intr(void)
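A self-contained worked example of the untagging rule in vmx_get_untagged_addr(): sign-extend from the LAM bit, then splice the original bit 63 back in. This is a userspace sketch, not KVM code; sign_extend64() below mirrors the semantics of the kernel helper of the same name.

    #include <stdint.h>
    #include <stdio.h>

    /* Same semantics as the kernel's sign_extend64(). */
    static int64_t sign_extend64(uint64_t value, int index)
    {
            int shift = 63 - index;
            return (int64_t)(value << shift) >> shift;
    }

    int main(void)
    {
            /* User pointer with a LAM_U57 tag in bits 62:57 (lam_bit = 56). */
            uint64_t gva = 0x7e00001234567890ULL;
            uint64_t untagged = ((uint64_t)sign_extend64(gva, 56) & ~(1ULL << 63)) |
                                (gva & (1ULL << 63));

            /* Prints 0x7e00001234567890 -> 0x1234567890: the tag is stripped,
             * and bit 63 (0 here, a user address) is preserved. */
            printf("%#llx -> %#llx\n",
                   (unsigned long long)gva, (unsigned long long)untagged);
            return 0;
    }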
index 919f647c740fb54f0fca4f4a16c5f614f9cf8521,63b7aa48793e386855530b72fa77cd321e9e1186..f3b4716317c1aebc8efb03fb404c4aa92dc2e9cc
@@@ -17,7 -17,6 +17,7 @@@
  #include <asm/nospec-branch.h>
  #include <asm/text-patching.h>
  #include <asm/unwind.h>
 +#include <asm/cfi.h>
  
  static bool all_callee_regs_used[4] = {true, true, true, true};
  
@@@ -52,11 -51,9 +52,11 @@@ static u8 *emit_code(u8 *ptr, u32 bytes
        do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
  
  #ifdef CONFIG_X86_KERNEL_IBT
 -#define EMIT_ENDBR()  EMIT(gen_endbr(), 4)
 +#define EMIT_ENDBR()          EMIT(gen_endbr(), 4)
 +#define EMIT_ENDBR_POISON()   EMIT(gen_endbr_poison(), 4)
  #else
  #define EMIT_ENDBR()
 +#define EMIT_ENDBR_POISON()
  #endif
  
  static bool is_imm8(int value)
@@@ -307,88 -304,6 +307,88 @@@ static void pop_callee_regs(u8 **pprog
        *pprog = prog;
  }
  
 +static void emit_nops(u8 **pprog, int len)
 +{
 +      u8 *prog = *pprog;
 +      int i, noplen;
 +
 +      while (len > 0) {
 +              noplen = len;
 +
 +              if (noplen > ASM_NOP_MAX)
 +                      noplen = ASM_NOP_MAX;
 +
 +              for (i = 0; i < noplen; i++)
 +                      EMIT1(x86_nops[noplen][i]);
 +              len -= noplen;
 +      }
 +
 +      *pprog = prog;
 +}
 +
 +/*
 + * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
 + * in arch/x86/kernel/alternative.c
 + */
 +
 +static void emit_fineibt(u8 **pprog, u32 hash)
 +{
 +      u8 *prog = *pprog;
 +
 +      EMIT_ENDBR();
 +      EMIT3_off32(0x41, 0x81, 0xea, hash);            /* subl $hash, %r10d    */
 +      EMIT2(0x74, 0x07);                              /* jz.d8 +7             */
 +      EMIT2(0x0f, 0x0b);                              /* ud2                  */
 +      EMIT1(0x90);                                    /* nop                  */
 +      EMIT_ENDBR_POISON();
 +
 +      *pprog = prog;
 +}
 +
 +static void emit_kcfi(u8 **pprog, u32 hash)
 +{
 +      u8 *prog = *pprog;
 +
 +      EMIT1_off32(0xb8, hash);                        /* movl $hash, %eax     */
 +#ifdef CONFIG_CALL_PADDING
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +      EMIT1(0x90);
 +#endif
 +      EMIT_ENDBR();
 +
 +      *pprog = prog;
 +}
 +
 +static void emit_cfi(u8 **pprog, u32 hash)
 +{
 +      u8 *prog = *pprog;
 +
 +      switch (cfi_mode) {
 +      case CFI_FINEIBT:
 +              emit_fineibt(&prog, hash);
 +              break;
 +
 +      case CFI_KCFI:
 +              emit_kcfi(&prog, hash);
 +              break;
 +
 +      default:
 +              EMIT_ENDBR();
 +              break;
 +      }
 +
 +      *pprog = prog;
 +}
 +
  /*
   * Emit x86-64 prologue code for BPF program.
   * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
@@@ -400,11 -315,12 +400,11 @@@ static void emit_prologue(u8 **pprog, u
  {
        u8 *prog = *pprog;
  
 +      emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
        /* BPF trampoline can be made to work without these nops,
         * but let's waste 5 bytes for now and optimize later
         */
 -      EMIT_ENDBR();
 -      memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
 -      prog += X86_PATCH_SIZE;
 +      emit_nops(&prog, X86_PATCH_SIZE);
        if (!ebpf_from_cbpf) {
                if (tail_call_reachable && !is_subprog)
                        /* When it's the entry of the whole tailcall context,
@@@ -553,7 -469,7 +553,7 @@@ static void emit_indirect_jump(u8 **ppr
                        emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
        } else {
                EMIT2(0xFF, 0xE0 + reg);        /* jmp *%\reg */
-               if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))
+               if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_MITIGATION_SLS))
                        EMIT1(0xCC);            /* int3 */
        }
  
@@@ -568,7 -484,7 +568,7 @@@ static void emit_return(u8 **pprog, u8 
                emit_jump(&prog, x86_return_thunk, ip);
        } else {
                EMIT1(0xC3);            /* ret */
-               if (IS_ENABLED(CONFIG_SLS))
+               if (IS_ENABLED(CONFIG_MITIGATION_SLS))
                        EMIT1(0xCC);    /* int3 */
        }
  
@@@ -710,7 -626,8 +710,7 @@@ static void emit_bpf_tail_call_direct(s
        if (stack_depth)
                EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
  
 -      memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
 -      prog += X86_PATCH_SIZE;
 +      emit_nops(&prog, X86_PATCH_SIZE);
  
        /* out: */
        ctx->tail_call_direct_label = prog - start;
@@@ -1072,6 -989,25 +1072,6 @@@ static void detect_reg_usage(struct bpf
        }
  }
  
 -static void emit_nops(u8 **pprog, int len)
 -{
 -      u8 *prog = *pprog;
 -      int i, noplen;
 -
 -      while (len > 0) {
 -              noplen = len;
 -
 -              if (noplen > ASM_NOP_MAX)
 -                      noplen = ASM_NOP_MAX;
 -
 -              for (i = 0; i < noplen; i++)
 -                      EMIT1(x86_nops[noplen][i]);
 -              len -= noplen;
 -      }
 -
 -      *pprog = prog;
 -}
 -
  /* emit the 3-byte VEX prefix
   *
   * r: same as rex.r, extra bit for ModRM reg field
@@@ -2262,8 -2198,7 +2262,8 @@@ static void restore_regs(const struct b
  
  static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
                           struct bpf_tramp_link *l, int stack_size,
 -                         int run_ctx_off, bool save_ret)
 +                         int run_ctx_off, bool save_ret,
 +                         void *image, void *rw_image)
  {
        u8 *prog = *pprog;
        u8 *jmp_insn;
        else
                EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
  
 -      if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
 +      if (emit_rsb_call(&prog, bpf_trampoline_enter(p), image + (prog - (u8 *)rw_image)))
                return -EINVAL;
        /* remember prog start time returned by __bpf_prog_enter */
        emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
                               (long) p->insnsi >> 32,
                               (u32) (long) p->insnsi);
        /* call JITed bpf program or interpreter */
 -      if (emit_rsb_call(&prog, p->bpf_func, prog))
 +      if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image)))
                return -EINVAL;
  
        /*
                EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off);
        else
                EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
 -      if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
 +      if (emit_rsb_call(&prog, bpf_trampoline_exit(p), image + (prog - (u8 *)rw_image)))
                return -EINVAL;
  
        *pprog = prog;
@@@ -2377,15 -2312,14 +2377,15 @@@ static int emit_cond_near_jump(u8 **ppr
  
  static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
                      struct bpf_tramp_links *tl, int stack_size,
 -                    int run_ctx_off, bool save_ret)
 +                    int run_ctx_off, bool save_ret,
 +                    void *image, void *rw_image)
  {
        int i;
        u8 *prog = *pprog;
  
        for (i = 0; i < tl->nr_links; i++) {
                if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
 -                                  run_ctx_off, save_ret))
 +                                  run_ctx_off, save_ret, image, rw_image))
                        return -EINVAL;
        }
        *pprog = prog;
  
  static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
                              struct bpf_tramp_links *tl, int stack_size,
 -                            int run_ctx_off, u8 **branches)
 +                            int run_ctx_off, u8 **branches,
 +                            void *image, void *rw_image)
  {
        u8 *prog = *pprog;
        int i;
        emit_mov_imm32(&prog, false, BPF_REG_0, 0);
        emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
        for (i = 0; i < tl->nr_links; i++) {
 -              if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true))
 +              if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true,
 +                                  image, rw_image))
                        return -EINVAL;
  
                /* mod_ret prog stored return value into [rbp - 8]. Emit:
   * add rsp, 8                      // skip eth_type_trans's frame
   * ret                             // return to its caller
   */
 -int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 -                              const struct btf_func_model *m, u32 flags,
 -                              struct bpf_tramp_links *tlinks,
 -                              void *func_addr)
 +static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
 +                                       void *rw_image_end, void *image,
 +                                       const struct btf_func_model *m, u32 flags,
 +                                       struct bpf_tramp_links *tlinks,
 +                                       void *func_addr)
  {
        int i, ret, nr_regs = m->nr_args, stack_size = 0;
        int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
        u8 *prog;
        bool save_ret;
  
 +      /*
 +       * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is
 +       * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG
 +       * because of @func_addr.
 +       */
 +      WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
 +                   (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
 +
        /* extra registers for struct arguments */
 -      for (i = 0; i < m->nr_args; i++)
 +      for (i = 0; i < m->nr_args; i++) {
                if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
                        nr_regs += (m->arg_size[i] + 7) / 8 - 1;
 +      }
  
        /* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
         * are passed through regs, the remains are through stack.
                orig_call += X86_PATCH_SIZE;
        }
  
 -      prog = image;
 +      prog = rw_image;
  
 -      EMIT_ENDBR();
 -      /*
 -       * This is the direct-call trampoline, as such it needs accounting
 -       * for the __fentry__ call.
 -       */
 -      x86_call_depth_emit_accounting(&prog, NULL);
 +      if (flags & BPF_TRAMP_F_INDIRECT) {
 +              /*
 +               * Indirect call for bpf_struct_ops
 +               */
 +              emit_cfi(&prog, cfi_get_func_hash(func_addr));
 +      } else {
 +              /*
 +               * Direct-call fentry stub, as such it needs accounting for the
 +               * __fentry__ call.
 +               */
 +              x86_call_depth_emit_accounting(&prog, NULL);
 +      }
        EMIT1(0x55);             /* push rbp */
        EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
 -      if (!is_imm8(stack_size))
 +      if (!is_imm8(stack_size)) {
                /* sub rsp, stack_size */
                EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
 -      else
 +      } else {
                /* sub rsp, stack_size */
                EMIT4(0x48, 0x83, 0xEC, stack_size);
 +      }
        if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
                EMIT1(0x50);            /* push rax */
        /* mov QWORD PTR [rbp - rbx_off], rbx */
        if (flags & BPF_TRAMP_F_CALL_ORIG) {
                /* arg1: mov rdi, im */
                emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
 -              if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) {
 +              if (emit_rsb_call(&prog, __bpf_tramp_enter,
 +                                image + (prog - (u8 *)rw_image))) {
                        ret = -EINVAL;
                        goto cleanup;
                }
        }
  
 -      if (fentry->nr_links)
 +      if (fentry->nr_links) {
                if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
 -                             flags & BPF_TRAMP_F_RET_FENTRY_RET))
 +                             flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
                        return -EINVAL;
 +      }
  
        if (fmod_ret->nr_links) {
                branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
                        return -ENOMEM;
  
                if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
 -                                     run_ctx_off, branches)) {
 +                                     run_ctx_off, branches, image, rw_image)) {
                        ret = -EINVAL;
                        goto cleanup;
                }
                restore_regs(m, &prog, regs_off);
                save_args(m, &prog, arg_stack_off, true);
  
 -              if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
 +              if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
                        /* Before calling the original function, restore the
                         * tail_call_cnt from stack to rax.
                         */
                        RESTORE_TAIL_CALL_CNT(stack_size);
 +              }
  
                if (flags & BPF_TRAMP_F_ORIG_STACK) {
                        emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
                        EMIT2(0xff, 0xd3); /* call *rbx */
                } else {
                        /* call original function */
 -                      if (emit_rsb_call(&prog, orig_call, prog)) {
 +                      if (emit_rsb_call(&prog, orig_call, image + (prog - (u8 *)rw_image))) {
                                ret = -EINVAL;
                                goto cleanup;
                        }
                }
                /* remember return value in a stack for bpf prog to access */
                emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
 -              im->ip_after_call = prog;
 -              memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
 -              prog += X86_PATCH_SIZE;
 +              im->ip_after_call = image + (prog - (u8 *)rw_image);
 +              emit_nops(&prog, X86_PATCH_SIZE);
        }
  
        if (fmod_ret->nr_links) {
                /* Update the branches saved in invoke_bpf_mod_ret with the
                 * aligned address of do_fexit.
                 */
 -              for (i = 0; i < fmod_ret->nr_links; i++)
 -                      emit_cond_near_jump(&branches[i], prog, branches[i],
 -                                          X86_JNE);
 +              for (i = 0; i < fmod_ret->nr_links; i++) {
 +                      emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
 +                                          image + (branches[i] - (u8 *)rw_image), X86_JNE);
 +              }
        }
  
 -      if (fexit->nr_links)
 -              if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) {
 +      if (fexit->nr_links) {
 +              if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
 +                             false, image, rw_image)) {
                        ret = -EINVAL;
                        goto cleanup;
                }
 +      }
  
        if (flags & BPF_TRAMP_F_RESTORE_REGS)
                restore_regs(m, &prog, regs_off);
         * restored to R0.
         */
        if (flags & BPF_TRAMP_F_CALL_ORIG) {
 -              im->ip_epilogue = prog;
 +              im->ip_epilogue = image + (prog - (u8 *)rw_image);
                /* arg1: mov rdi, im */
                emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
 -              if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) {
 +              if (emit_rsb_call(&prog, __bpf_tramp_exit, image + (prog - (u8 *)rw_image))) {
                        ret = -EINVAL;
                        goto cleanup;
                }
 -      } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
 +      } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
                /* Before running the original function, restore the
                 * tail_call_cnt from stack to rax.
                 */
                RESTORE_TAIL_CALL_CNT(stack_size);
 +      }
  
        /* restore return value of orig_call or fentry prog back into RAX */
        if (save_ret)
  
        emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
        EMIT1(0xC9); /* leave */
 -      if (flags & BPF_TRAMP_F_SKIP_FRAME)
 +      if (flags & BPF_TRAMP_F_SKIP_FRAME) {
                /* skip our return address and return to parent */
                EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
 -      emit_return(&prog, prog);
 +      }
 +      emit_return(&prog, image + (prog - (u8 *)rw_image));
        /* Make sure the trampoline generation logic doesn't overflow */
 -      if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) {
 +      if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
                ret = -EFAULT;
                goto cleanup;
        }
 -      ret = prog - (u8 *)image;
 +      ret = prog - (u8 *)rw_image + BPF_INSN_SAFETY;
  
  cleanup:
        kfree(branches);
        return ret;
  }
  
 +void *arch_alloc_bpf_trampoline(unsigned int size)
 +{
 +      return bpf_prog_pack_alloc(size, jit_fill_hole);
 +}
 +
 +void arch_free_bpf_trampoline(void *image, unsigned int size)
 +{
 +      bpf_prog_pack_free(image, size);
 +}
 +
 +void arch_protect_bpf_trampoline(void *image, unsigned int size)
 +{
 +}
 +
 +void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
 +{
 +}
 +
 +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 +                              const struct btf_func_model *m, u32 flags,
 +                              struct bpf_tramp_links *tlinks,
 +                              void *func_addr)
 +{
 +      void *rw_image, *tmp;
 +      int ret;
 +      u32 size = image_end - image;
 +
 +      /* rw_image doesn't need to be in module memory range, so we can
 +       * use kvmalloc.
 +       */
 +      rw_image = kvmalloc(size, GFP_KERNEL);
 +      if (!rw_image)
 +              return -ENOMEM;
 +
 +      ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m,
 +                                          flags, tlinks, func_addr);
 +      if (ret < 0)
 +              goto out;
 +
 +      tmp = bpf_arch_text_copy(image, rw_image, size);
 +      if (IS_ERR(tmp))
 +              ret = PTR_ERR(tmp);
 +out:
 +      kvfree(rw_image);
 +      return ret;
 +}
 +
 +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 +                           struct bpf_tramp_links *tlinks, void *func_addr)
 +{
 +      struct bpf_tramp_image im;
 +      void *image;
 +      int ret;
 +
 +      /* Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
 +       * This will NOT cause fragmentation in direct map, as we do not
 +       * call set_memory_*() on this buffer.
 +       *
 +       * We cannot use kvmalloc here, because we need image to be in
 +       * module memory range.
 +       */
 +      image = bpf_jit_alloc_exec(PAGE_SIZE);
 +      if (!image)
 +              return -ENOMEM;
 +
 +      ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
 +                                          m, flags, tlinks, func_addr);
 +      bpf_jit_free_exec(image);
 +      return ret;
 +}
 +
  static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
  {
        u8 *jg_reloc, *prog = *pprog;
@@@ -3098,16 -2935,9 +3098,16 @@@ out_image
                        jit_data->header = header;
                        jit_data->rw_header = rw_header;
                }
 -              prog->bpf_func = (void *)image;
 +              /*
 +               * ctx.prog_offset is used when CFI preambles put code *before*
 +               * the function. See emit_cfi(). For FineIBT specifically this code
 +               * can also be executed and bpf_prog_kallsyms_add() will
 +               * generate an additional symbol to cover this, hence also
 +               * decrement proglen.
 +               */
 +              prog->bpf_func = (void *)image + cfi_get_offset();
                prog->jited = 1;
 -              prog->jited_len = proglen;
 +              prog->jited_len = proglen - cfi_get_offset();
        } else {
                prog = orig_prog;
        }
@@@ -3162,7 -2992,6 +3162,7 @@@ void bpf_jit_free(struct bpf_prog *prog
                        kvfree(jit_data->addrs);
                        kfree(jit_data);
                }
 +              prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
                hdr = bpf_jit_binary_pack_hdr(prog);
                bpf_jit_binary_pack_free(hdr, NULL);
                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
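The recurring image + (prog - (u8 *)rw_image) expression in the trampoline changes above translates the emit cursor in the writable scratch buffer into the address the same byte will have in the final executable image, so relative calls and the recorded ip_after_call/ip_epilogue pointers stay correct after bpf_arch_text_copy(). A tiny sketch with made-up addresses:

    /* Made-up addresses, for illustration only. */
    unsigned char *rw_image = (unsigned char *)0xffffc90000100000UL;  /* kvmalloc'ed scratch   */
    unsigned char *image    = (unsigned char *)0xffffffffa0200000UL;  /* final executable copy */
    unsigned char *prog     = rw_image + 0x48;                        /* current emit cursor   */

    /* Where this instruction will live once the scratch buffer is copied. */
    unsigned char *run_addr = image + (prog - rw_image);              /* image + 0x48 */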
index c1a963be7d289e6edafba98f1d7d0236ee4f69f1,d24f29091f4b0b96db4555290da07b061ec00b12..34ad8bb549a645a7e5682f3d84853e971272763f
@@@ -35,7 -35,7 +35,7 @@@
        (typeof(ptr)) (__ptr + (off));                                  \
  })
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
  #define __noretpoline __attribute__((__indirect_branch__("keep")))
  #endif
  
                __builtin_unreachable();        \
        } while (0)
  
 +/*
 + * GCC 'asm goto' with outputs miscompiles certain code sequences:
 + *
 + *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420
 + *   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422
 + *
 + * Work around it via the same compiler barrier quirk that we used
 + * to use for the old 'asm goto' workaround.
 + *
 + * Also, always mark such 'asm goto' statements as volatile: all
 + * asm goto statements are supposed to be volatile as per the
 + * documentation, but some versions of gcc didn't actually do
 + * that for asms with outputs:
 + *
 + *    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619
 + */
 +#define asm_goto_output(x...) \
 +      do { asm volatile goto(x); asm (""); } while (0)
 +
  #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
  #define __HAVE_BUILTIN_BSWAP32__
  #define __HAVE_BUILTIN_BSWAP64__
  #endif
  
  #define __diag_ignore_all(option, comment) \
 -      __diag_GCC(8, ignore, option)
 +      __diag(__diag_GCC_ignore option)
  
  /*
   * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size"
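asm_goto_output() above is what 'asm goto' statements with output operands are expected to go through, so the trailing empty asm provides the barrier described in the comment. A hedged usage sketch (not taken from the kernel; it only relies on the generic GCC 'asm goto with outputs' syntax):

    /* Sketch only: load *p into *out, branching to was_zero when it is zero. */
    static inline bool read_nonzero(unsigned int *p, unsigned int *out)
    {
            asm_goto_output("movl %1, %0\n\t"
                            "testl %0, %0\n\t"
                            "jz %l[was_zero]"
                            : "=r" (*out)
                            : "m" (*p)
                            : "cc"
                            : was_zero);
            return true;
    was_zero:
            return false;
    }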
index adb83a42a6b90b03cb468ef2787a568bd8a008d0,fe050dab55a3824b988a587a09444f69e0bbd8fa..35227d47cfc98c14f80a17a01d8df6050c56768e
@@@ -2,7 -2,7 +2,7 @@@
  #ifndef _LINUX_INDIRECT_CALL_WRAPPER_H
  #define _LINUX_INDIRECT_CALL_WRAPPER_H
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
  
  /*
   * INDIRECT_CALL_$NR - wrapper for indirect calls with $NR known builtin
@@@ -11,7 -11,7 +11,7 @@@
   *  @__VA_ARGS__: arguments for @f
   *
   * Avoid retpoline overhead for known builtin, checking @f vs each of them and
 - * eventually invoking directly the builtin function. The functions are check
 + * eventually invoking directly the builtin function. The functions are checked
   * in the given order. Fallback to the indirect call.
   */
  #define INDIRECT_CALL_1(f, f1, ...)                                   \
diff --combined include/linux/module.h
index 96bc462872c0ca9fcecfb06a4c68e3f12697d789,087b369e8f17861187e866c4c11498451614645b..1153b0d99a808876f7ba62d137e5095e1e92a954
@@@ -670,7 -670,7 +670,7 @@@ extern void __module_get(struct module 
   * @module: the module we should check for
   *
   * Only try to get a module reference count if the module is not being removed.
 - * This call will fail if the module is already being removed.
 + * This call will fail if the module is in the process of being removed.
   *
   * Care must also be taken to ensure the module exists and is alive prior to
   * usage of this call. This can be guaranteed through two means:
@@@ -885,7 -885,7 +885,7 @@@ static inline void module_bug_finalize(
  static inline void module_bug_cleanup(struct module *mod) {}
  #endif        /* CONFIG_GENERIC_BUG */
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
  extern bool retpoline_module_ok(bool has_retpoline);
  #else
  static inline bool retpoline_module_ok(bool has_retpoline)
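The updated kernel-doc above describes the usual pattern around try_module_get(): pin the providing module before calling into it and drop the reference with module_put() afterwards. A minimal sketch (the ops structure and its callback are hypothetical):

    /* Hypothetical ops with a module owner, as registered interfaces usually have. */
    static int use_provider(struct my_ops *ops)
    {
            int ret;

            if (!try_module_get(ops->owner))
                    return -ENODEV;          /* provider is being removed */

            ret = ops->do_something();       /* safe: the module cannot unload now */

            module_put(ops->owner);
            return ret;
    }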
diff --combined include/net/tc_wrapper.h
index a608546bcefcf81fa9056dfc567f985c8fccb9b9,a13ba0326d5e0f57bd7715849c6d3b67bf768612..ffe58a02537c3ae7979e37ace09373a443d81cc9
@@@ -4,7 -4,7 +4,7 @@@
  
  #include <net/pkt_cls.h>
  
- #if IS_ENABLED(CONFIG_RETPOLINE)
+ #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
  
  #include <linux/cpufeature.h>
  #include <linux/static_key.h>
@@@ -117,6 -117,10 +117,6 @@@ static inline int tc_act(struct sk_buf
        if (a->ops->act == tcf_ife_act)
                return tcf_ife_act(skb, a, res);
  #endif
 -#if IS_BUILTIN(CONFIG_NET_ACT_IPT)
 -      if (a->ops->act == tcf_ipt_act)
 -              return tcf_ipt_act(skb, a, res);
 -#endif
  #if IS_BUILTIN(CONFIG_NET_ACT_SIMP)
        if (a->ops->act == tcf_simp_act)
                return tcf_simp_act(skb, a, res);
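The tc_act() chain above and the INDIRECT_CALL_*() wrappers in indirect_call_wrapper.h rely on the same idea: compare the function pointer against a known builtin and call it directly when it matches, paying the retpoline only on the fallback path. A hedged sketch with hypothetical handler names:

    /* Hypothetical handler; stands in for a frequently used builtin. */
    static int builtin_handler(struct sk_buff *skb);

    static int dispatch(int (*fn)(struct sk_buff *), struct sk_buff *skb)
    {
            /* Direct call when fn is the expected builtin, retpolined call otherwise.
             * Equivalently: return INDIRECT_CALL_1(fn, builtin_handler, skb);
             */
            if (likely(fn == builtin_handler))
                    return builtin_handler(skb);
            return fn(skb);
    }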
index fd4bfe3ecf014f6b3c83f9a7fa043b7df44dac32,9cb69332921d919a7c875b8b3e6c89ecfa260a5e..0a9d5984687cb4443499ae1be67630d9578d332f
@@@ -27,7 -27,6 +27,7 @@@
  #include <linux/cpu.h>
  #include <linux/oom.h>
  
 +#include <asm/local64.h>
  #include <asm/local.h>
  
  /*
@@@ -318,11 -317,6 +318,11 @@@ struct buffer_data_page 
        unsigned char    data[] RB_ALIGN_DATA;  /* data of buffer page */
  };
  
 +struct buffer_data_read_page {
 +      unsigned                order;  /* order of the page */
 +      struct buffer_data_page *data;  /* actual data, stored in this page */
 +};
 +
  /*
   * Note, the buffer_page list must be first. The buffer pages
   * are allocated in cache lines, which means that each buffer
@@@ -337,7 -331,6 +337,7 @@@ struct buffer_page 
        unsigned         read;          /* index for next read */
        local_t          entries;       /* entries on this page */
        unsigned long    real_end;      /* real end of data */
 +      unsigned         order;         /* order of the page */
        struct buffer_data_page *page;  /* Actual data page */
  };
  
@@@ -368,7 -361,7 +368,7 @@@ static __always_inline unsigned int rb_
  
  static void free_buffer_page(struct buffer_page *bpage)
  {
 -      free_page((unsigned long)bpage->page);
 +      free_pages((unsigned long)bpage->page, bpage->order);
        kfree(bpage);
  }
  
@@@ -380,6 -373,41 +380,6 @@@ static inline bool test_time_stamp(u64 
        return !!(delta & TS_DELTA_TEST);
  }
  
 -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 -
 -/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 -#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 -
 -int ring_buffer_print_page_header(struct trace_seq *s)
 -{
 -      struct buffer_data_page field;
 -
 -      trace_seq_printf(s, "\tfield: u64 timestamp;\t"
 -                       "offset:0;\tsize:%u;\tsigned:%u;\n",
 -                       (unsigned int)sizeof(field.time_stamp),
 -                       (unsigned int)is_signed_type(u64));
 -
 -      trace_seq_printf(s, "\tfield: local_t commit;\t"
 -                       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 -                       (unsigned int)offsetof(typeof(field), commit),
 -                       (unsigned int)sizeof(field.commit),
 -                       (unsigned int)is_signed_type(long));
 -
 -      trace_seq_printf(s, "\tfield: int overwrite;\t"
 -                       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 -                       (unsigned int)offsetof(typeof(field), commit),
 -                       1,
 -                       (unsigned int)is_signed_type(long));
 -
 -      trace_seq_printf(s, "\tfield: char data;\t"
 -                       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 -                       (unsigned int)offsetof(typeof(field), data),
 -                       (unsigned int)BUF_PAGE_SIZE,
 -                       (unsigned int)is_signed_type(char));
 -
 -      return !trace_seq_has_overflowed(s);
 -}
 -
  struct rb_irq_work {
        struct irq_work                 work;
        wait_queue_head_t               waiters;
@@@ -435,9 -463,27 +435,9 @@@ enum 
        RB_CTX_MAX
  };
  
 -#if BITS_PER_LONG == 32
 -#define RB_TIME_32
 -#endif
 -
 -/* To test on 64 bit machines */
 -//#define RB_TIME_32
 -
 -#ifdef RB_TIME_32
 -
 -struct rb_time_struct {
 -      local_t         cnt;
 -      local_t         top;
 -      local_t         bottom;
 -      local_t         msb;
 -};
 -#else
 -#include <asm/local64.h>
  struct rb_time_struct {
        local64_t       time;
  };
 -#endif
  typedef struct rb_time_struct rb_time_t;
  
  #define MAX_NEST      5
@@@ -511,10 -557,6 +511,10 @@@ struct trace_buffer 
  
        struct rb_irq_work              irq_work;
        bool                            time_stamp_abs;
 +
 +      unsigned int                    subbuf_size;
 +      unsigned int                    subbuf_order;
 +      unsigned int                    max_data_size;
  };
  
  struct ring_buffer_iter {
        u64                             read_stamp;
        u64                             page_stamp;
        struct ring_buffer_event        *event;
 +      size_t                          event_size;
        int                             missed_events;
  };
  
 -#ifdef RB_TIME_32
 -
 -/*
 - * On 32 bit machines, local64_t is very expensive. As the ring
 - * buffer doesn't need all the features of a true 64 bit atomic,
 - * on 32 bit, it uses these functions (64 still uses local64_t).
 - *
 - * For the ring buffer, 64 bit required operations for the time is
 - * the following:
 - *
 - *  - Reads may fail if it interrupted a modification of the time stamp.
 - *      It will succeed if it did not interrupt another write even if
 - *      the read itself is interrupted by a write.
 - *      It returns whether it was successful or not.
 - *
 - *  - Writes always succeed and will overwrite other writes and writes
 - *      that were done by events interrupting the current write.
 - *
 - *  - A write followed by a read of the same time stamp will always succeed,
 - *      but may not contain the same value.
 - *
 - *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
 - *      Other than that, it acts like a normal cmpxchg.
 - *
 - * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
 - *  (bottom being the least significant 30 bits of the 60 bit time stamp).
 - *
 - * The two most significant bits of each half holds a 2 bit counter (0-3).
 - * Each update will increment this counter by one.
 - * When reading the top and bottom, if the two counter bits match then the
 - *  top and bottom together make a valid 60 bit number.
 - */
 -#define RB_TIME_SHIFT 30
 -#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
 -#define RB_TIME_MSB_SHIFT      60
 -
 -static inline int rb_time_cnt(unsigned long val)
 +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
  {
 -      return (val >> RB_TIME_SHIFT) & 3;
 -}
 -
 -static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
 -{
 -      u64 val;
 -
 -      val = top & RB_TIME_VAL_MASK;
 -      val <<= RB_TIME_SHIFT;
 -      val |= bottom & RB_TIME_VAL_MASK;
 -
 -      return val;
 -}
 -
 -static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 -{
 -      unsigned long top, bottom, msb;
 -      unsigned long c;
 -
 -      /*
 -       * If the read is interrupted by a write, then the cnt will
 -       * be different. Loop until both top and bottom have been read
 -       * without interruption.
 -       */
 -      do {
 -              c = local_read(&t->cnt);
 -              top = local_read(&t->top);
 -              bottom = local_read(&t->bottom);
 -              msb = local_read(&t->msb);
 -      } while (c != local_read(&t->cnt));
 -
 -      *cnt = rb_time_cnt(top);
 -
 -      /* If top, msb or bottom counts don't match, this interrupted a write */
 -      if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
 -              return false;
 -
 -      /* The shift to msb will lose its cnt bits */
 -      *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
 -      return true;
 -}
 -
 -static bool rb_time_read(rb_time_t *t, u64 *ret)
 -{
 -      unsigned long cnt;
 -
 -      return __rb_time_read(t, ret, &cnt);
 -}
 -
 -static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
 -{
 -      return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
 -}
 -
 -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
 -                               unsigned long *msb)
 -{
 -      *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
 -      *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
 -      *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
 -}
 +      struct buffer_data_page field;
  
 -static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
 -{
 -      val = rb_time_val_cnt(val, cnt);
 -      local_set(t, val);
 -}
 +      trace_seq_printf(s, "\tfield: u64 timestamp;\t"
 +                       "offset:0;\tsize:%u;\tsigned:%u;\n",
 +                       (unsigned int)sizeof(field.time_stamp),
 +                       (unsigned int)is_signed_type(u64));
  
 -static void rb_time_set(rb_time_t *t, u64 val)
 -{
 -      unsigned long cnt, top, bottom, msb;
 +      trace_seq_printf(s, "\tfield: local_t commit;\t"
 +                       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 +                       (unsigned int)offsetof(typeof(field), commit),
 +                       (unsigned int)sizeof(field.commit),
 +                       (unsigned int)is_signed_type(long));
  
 -      rb_time_split(val, &top, &bottom, &msb);
 +      trace_seq_printf(s, "\tfield: int overwrite;\t"
 +                       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 +                       (unsigned int)offsetof(typeof(field), commit),
 +                       1,
 +                       (unsigned int)is_signed_type(long));
  
 -      /* Writes always succeed with a valid number even if it gets interrupted. */
 -      do {
 -              cnt = local_inc_return(&t->cnt);
 -              rb_time_val_set(&t->top, top, cnt);
 -              rb_time_val_set(&t->bottom, bottom, cnt);
 -              rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
 -      } while (cnt != local_read(&t->cnt));
 -}
 +      trace_seq_printf(s, "\tfield: char data;\t"
 +                       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 +                       (unsigned int)offsetof(typeof(field), data),
 +                       (unsigned int)buffer->subbuf_size,
 +                       (unsigned int)is_signed_type(char));
  
 -static inline bool
 -rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
 -{
 -      return local_try_cmpxchg(l, &expect, set);
 +      return !trace_seq_has_overflowed(s);
  }
  
 -#else /* 64 bits */
 -
 -/* local64_t always succeeds */
 -
 -static inline bool rb_time_read(rb_time_t *t, u64 *ret)
 +static inline void rb_time_read(rb_time_t *t, u64 *ret)
  {
        *ret = local64_read(&t->time);
 -      return true;
  }
  static void rb_time_set(rb_time_t *t, u64 val)
  {
        local64_set(&t->time, val);
  }
 -#endif
  
  /*
   * Enable this to make sure that the event passed to
@@@ -676,7 -820,10 +676,7 @@@ u64 ring_buffer_event_time_stamp(struc
        WARN_ONCE(1, "nest (%d) greater than max", nest);
  
   fail:
 -      /* Can only fail on 32 bit */
 -      if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
 -              /* Screw it, just read the current time */
 -              ts = rb_time_stamp(cpu_buffer->buffer);
 +      rb_time_read(&cpu_buffer->write_stamp, &ts);
  
        return ts;
  }
@@@ -944,7 -1091,7 +944,7 @@@ __poll_t ring_buffer_poll_wait(struct t
                full = 0;
        } else {
                if (!cpumask_test_cpu(cpu, buffer->cpumask))
 -                      return -EINVAL;
 +                      return EPOLLERR;
  
                cpu_buffer = buffer->buffers[cpu];
                work = &cpu_buffer->irq_work;
@@@ -1009,7 -1156,7 +1009,7 @@@ static inline u64 rb_time_stamp(struct 
        u64 ts;
  
        /* Skip retpolines :-( */
-       if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+       if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local))
                ts = trace_clock_local();
        else
                ts = buffer->clock();
@@@ -1472,12 -1619,10 +1472,12 @@@ static int __rb_allocate_pages(struct r
  
                list_add(&bpage->list, pages);
  
 -              page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
 +              page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
 +                                      cpu_buffer->buffer->subbuf_order);
                if (!page)
                        goto free_pages;
                bpage->page = page_address(page);
 +              bpage->order = cpu_buffer->buffer->subbuf_order;
                rb_init_page(bpage->page);
  
                if (user_thread && fatal_signal_pending(current))
@@@ -1556,8 -1701,7 +1556,8 @@@ rb_allocate_cpu_buffer(struct trace_buf
        rb_check_bpage(cpu_buffer, bpage);
  
        cpu_buffer->reader_page = bpage;
 -      page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
 +
 +      page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
        if (!page)
                goto fail_free_reader;
        bpage->page = page_address(page);
@@@ -1640,14 -1784,7 +1640,14 @@@ struct trace_buffer *__ring_buffer_allo
        if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
                goto fail_free_buffer;
  
 -      nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 +      /* Default buffer page size - one system page */
 +      buffer->subbuf_order = 0;
 +      buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
 +
 +      /* Max payload is buffer page size - header (8bytes) */
 +      buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
 +
 +      nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
        buffer->flags = flags;
        buffer->clock = trace_clock_local;
        buffer->reader_lock_key = key;
@@@ -1966,7 -2103,7 +1966,7 @@@ static void update_pages_handler(struc
   * @size: the new size.
   * @cpu_id: the cpu buffer to resize
   *
 - * Minimum size is 2 * BUF_PAGE_SIZE.
 + * Minimum size is 2 * buffer->subbuf_size.
   *
   * Returns 0 on success and < 0 on failure.
   */
@@@ -1988,7 -2125,7 +1988,7 @@@ int ring_buffer_resize(struct trace_buf
            !cpumask_test_cpu(cpu_id, buffer->cpumask))
                return 0;
  
 -      nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 +      nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
  
        /* we need a minimum of two pages */
        if (nr_pages < 2)
@@@ -2235,7 -2372,7 +2235,7 @@@ rb_iter_head_event(struct ring_buffer_i
         */
        barrier();
  
 -      if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
 +      if ((iter->head + length) > commit || length > iter->event_size)
                /* Writer corrupted the read? */
                goto reset;
  
@@@ -2275,13 -2412,11 +2275,13 @@@ rb_commit_index(struct ring_buffer_per_
  }
  
  static __always_inline unsigned
 -rb_event_index(struct ring_buffer_event *event)
 +rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
  {
        unsigned long addr = (unsigned long)event;
  
 -      return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
 +      addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
 +
 +      return addr - BUF_PAGE_HDR_SIZE;
  }
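With variable sub-buffer sizes the event index can no longer be derived with PAGE_MASK, so rb_event_index() above masks with the actual sub-buffer size. A short worked sketch with hypothetical numbers:

    /* Hypothetical numbers: 4 KiB pages, subbuf_order = 2, i.e. 16 KiB sub-buffers. */
    unsigned long page_size  = 4096;
    unsigned int  order      = 2;
    unsigned long event_addr = 0xffff888012344240UL;            /* made-up event address */

    /* Offset of the event inside its 16 KiB sub-buffer: 0x4240 & 0x3fff == 0x240. */
    unsigned long offset = event_addr & ((page_size << order) - 1);

    /* Subtracting the sub-buffer header gives the index returned above. */
    unsigned long index  = offset - BUF_PAGE_HDR_SIZE;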
  
  static void rb_inc_iter(struct ring_buffer_iter *iter)
@@@ -2470,7 -2605,6 +2470,7 @@@ static inline voi
  rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
              unsigned long tail, struct rb_event_info *info)
  {
 +      unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
        struct buffer_page *tail_page = info->tail_page;
        struct ring_buffer_event *event;
        unsigned long length = info->length;
         * Only the event that crossed the page boundary
         * must fill the old tail_page with padding.
         */
 -      if (tail >= BUF_PAGE_SIZE) {
 +      if (tail >= bsize) {
                /*
                 * If the page was filled, then we still need
                 * to update the real_end. Reset it to zero
                 * and the reader will ignore it.
                 */
 -              if (tail == BUF_PAGE_SIZE)
 +              if (tail == bsize)
                        tail_page->real_end = 0;
  
                local_sub(length, &tail_page->write);
         * If we are less than the minimum size, we don't need to
         * worry about it.
         */
 -      if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
 +      if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
                /* No room for any events */
  
                /* Mark the rest of the page with padding */
        }
  
        /* Put in a discarded event */
 -      event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
 +      event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
        event->type_len = RINGBUF_TYPE_PADDING;
        /* time delta must be non zero */
        event->time_delta = 1;
  
        /* account for padding bytes */
 -      local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
 +      local_add(bsize - tail, &cpu_buffer->entries_bytes);
  
        /* Make sure the padding is visible before the tail_page->write update */
        smp_wmb();
  
        /* Set write to end of buffer */
 -      length = (tail + length) - BUF_PAGE_SIZE;
 +      length = (tail + length) - bsize;
        local_sub(length, &tail_page->write);
  }
  
@@@ -2654,8 -2788,7 +2654,8 @@@ rb_move_tail(struct ring_buffer_per_cp
  
  /* Slow path */
  static struct ring_buffer_event *
 -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
 +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 +                struct ring_buffer_event *event, u64 delta, bool abs)
  {
        if (abs)
                event->type_len = RINGBUF_TYPE_TIME_STAMP;
                event->type_len = RINGBUF_TYPE_TIME_EXTEND;
  
        /* Not the first event on the page, or not delta? */
 -      if (abs || rb_event_index(event)) {
 +      if (abs || rb_event_index(cpu_buffer, event)) {
                event->time_delta = delta & TS_MASK;
                event->array[0] = delta >> TS_SHIFT;
        } else {
@@@ -2693,7 -2826,7 +2693,7 @@@ rb_check_timestamp(struct ring_buffer_p
                  (unsigned long long)info->ts,
                  (unsigned long long)info->before,
                  (unsigned long long)info->after,
 -                (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
 +                (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
                  sched_clock_stable() ? "" :
                  "If you just came from a suspend/resume,\n"
                  "please switch to the trace global clock:\n"
@@@ -2737,7 -2870,7 +2737,7 @@@ static void rb_add_timestamp(struct rin
                if (!abs)
                        info->delta = 0;
        }
 -      *event = rb_add_time_stamp(*event, info->delta, abs);
 +      *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
        *length -= RB_LEN_TIME_EXTEND;
        *delta = 0;
  }
@@@ -2821,10 -2954,10 +2821,10 @@@ rb_try_to_discard(struct ring_buffer_pe
        struct buffer_page *bpage;
        unsigned long addr;
  
 -      new_index = rb_event_index(event);
 +      new_index = rb_event_index(cpu_buffer, event);
        old_index = new_index + rb_event_ts_length(event);
        addr = (unsigned long)event;
 -      addr &= PAGE_MASK;
 +      addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
  
        bpage = READ_ONCE(cpu_buffer->tail_page);
  
@@@ -3211,76 -3344,6 +3211,76 @@@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_co
  #define CHECK_FULL_PAGE               1L
  
  #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
 +
 +static const char *show_irq_str(int bits)
 +{
 +      const char *type[] = {
 +              ".",    // 0
 +              "s",    // 1
 +              "h",    // 2
 +              "Hs",   // 3
 +              "n",    // 4
 +              "Ns",   // 5
 +              "Nh",   // 6
 +              "NHs",  // 7
 +      };
 +
 +      return type[bits];
 +}
 +
 +/* Assume this is a trace event */
 +static const char *show_flags(struct ring_buffer_event *event)
 +{
 +      struct trace_entry *entry;
 +      int bits = 0;
 +
 +      if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
 +              return "X";
 +
 +      entry = ring_buffer_event_data(event);
 +
 +      if (entry->flags & TRACE_FLAG_SOFTIRQ)
 +              bits |= 1;
 +
 +      if (entry->flags & TRACE_FLAG_HARDIRQ)
 +              bits |= 2;
 +
 +      if (entry->flags & TRACE_FLAG_NMI)
 +              bits |= 4;
 +
 +      return show_irq_str(bits);
 +}
 +
 +static const char *show_irq(struct ring_buffer_event *event)
 +{
 +      struct trace_entry *entry;
 +
 +      if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
 +              return "";
 +
 +      entry = ring_buffer_event_data(event);
 +      if (entry->flags & TRACE_FLAG_IRQS_OFF)
 +              return "d";
 +      return "";
 +}
 +
 +static const char *show_interrupt_level(void)
 +{
 +      unsigned long pc = preempt_count();
 +      unsigned char level = 0;
 +
 +      if (pc & SOFTIRQ_OFFSET)
 +              level |= 1;
 +
 +      if (pc & HARDIRQ_MASK)
 +              level |= 2;
 +
 +      if (pc & NMI_MASK)
 +              level |= 4;
 +
 +      return show_irq_str(level);
 +}
 +
  static void dump_buffer_page(struct buffer_data_page *bpage,
                             struct rb_event_info *info,
                             unsigned long tail)
                case RINGBUF_TYPE_TIME_EXTEND:
                        delta = rb_event_time_stamp(event);
                        ts += delta;
 -                      pr_warn("  [%lld] delta:%lld TIME EXTEND\n", ts, delta);
 +                      pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n",
 +                              e, ts, delta);
                        break;
  
                case RINGBUF_TYPE_TIME_STAMP:
                        delta = rb_event_time_stamp(event);
                        ts = rb_fix_abs_ts(delta, ts);
 -                      pr_warn("  [%lld] absolute:%lld TIME STAMP\n", ts, delta);
 +                      pr_warn(" 0x%x:  [%lld] absolute:%lld TIME STAMP\n",
 +                              e, ts, delta);
                        break;
  
                case RINGBUF_TYPE_PADDING:
                        ts += event->time_delta;
 -                      pr_warn("  [%lld] delta:%d PADDING\n", ts, event->time_delta);
 +                      pr_warn(" 0x%x:  [%lld] delta:%d PADDING\n",
 +                              e, ts, event->time_delta);
                        break;
  
                case RINGBUF_TYPE_DATA:
                        ts += event->time_delta;
 -                      pr_warn("  [%lld] delta:%d\n", ts, event->time_delta);
 +                      pr_warn(" 0x%x:  [%lld] delta:%d %s%s\n",
 +                              e, ts, event->time_delta,
 +                              show_flags(event), show_irq(event));
                        break;
  
                default:
                        break;
                }
        }
 +      pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
  }
  
  static DEFINE_PER_CPU(atomic_t, checking);
  static atomic_t ts_dump;
  
 +#define buffer_warn_return(fmt, ...)                                  \
 +      do {                                                            \
 +              /* If another report is happening, ignore this one */   \
 +              if (atomic_inc_return(&ts_dump) != 1) {                 \
 +                      atomic_dec(&ts_dump);                           \
 +                      goto out;                                       \
 +              }                                                       \
 +              atomic_inc(&cpu_buffer->record_disabled);               \
 +              pr_warn(fmt, ##__VA_ARGS__);                            \
 +              dump_buffer_page(bpage, info, tail);                    \
 +              atomic_dec(&ts_dump);                                   \
 +              /* There are some cases in boot up where this can happen */ \
 +              if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING))       \
 +                      /* Do not re-enable checking */                 \
 +                      return;                                         \
 +      } while (0)
 +
  /*
   * Check if the current event time stamp matches the deltas on
   * the buffer page.
@@@ -3405,12 -3445,7 +3405,12 @@@ static void check_buffer(struct ring_bu
  
                case RINGBUF_TYPE_TIME_STAMP:
                        delta = rb_event_time_stamp(event);
 -                      ts = rb_fix_abs_ts(delta, ts);
 +                      delta = rb_fix_abs_ts(delta, ts);
 +                      if (delta < ts) {
 +                              buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
 +                                                 cpu_buffer->cpu, ts, delta);
 +                      }
 +                      ts = delta;
                        break;
  
                case RINGBUF_TYPE_PADDING:
        }
        if ((full && ts > info->ts) ||
            (!full && ts + info->delta != info->ts)) {
 -              /* If another report is happening, ignore this one */
 -              if (atomic_inc_return(&ts_dump) != 1) {
 -                      atomic_dec(&ts_dump);
 -                      goto out;
 -              }
 -              atomic_inc(&cpu_buffer->record_disabled);
 -              /* There's some cases in boot up that this can happen */
 -              WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
 -              pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
 -                      cpu_buffer->cpu,
 -                      ts + info->delta, info->ts, info->delta,
 -                      info->before, info->after,
 -                      full ? " (full)" : "");
 -              dump_buffer_page(bpage, info, tail);
 -              atomic_dec(&ts_dump);
 -              /* Do not re-enable checking */
 -              return;
 +              buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
 +                                 cpu_buffer->cpu,
 +                                 ts + info->delta, info->ts, info->delta,
 +                                 info->before, info->after,
 +                                 full ? " (full)" : "", show_interrupt_level());
        }
  out:
        atomic_dec(this_cpu_ptr(&checking));
@@@ -3451,14 -3498,16 +3451,14 @@@ __rb_reserve_next(struct ring_buffer_pe
        struct ring_buffer_event *event;
        struct buffer_page *tail_page;
        unsigned long tail, write, w;
 -      bool a_ok;
 -      bool b_ok;
  
        /* Don't let the compiler play games with cpu_buffer->tail_page */
        tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
  
   /*A*/        w = local_read(&tail_page->write) & RB_WRITE_MASK;
        barrier();
 -      b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
 -      a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
 +      rb_time_read(&cpu_buffer->before_stamp, &info->before);
 +      rb_time_read(&cpu_buffer->write_stamp, &info->after);
        barrier();
        info->ts = rb_time_stamp(cpu_buffer->buffer);
  
                if (!w) {
                        /* Use the sub-buffer timestamp */
                        info->delta = 0;
 -              } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
 +              } else if (unlikely(info->before != info->after)) {
                        info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
                        info->length += RB_LEN_TIME_EXTEND;
                } else {
        tail = write - info->length;
  
        /* See if we shot pass the end of this buffer page */
 -      if (unlikely(write > BUF_PAGE_SIZE)) {
 +      if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
                check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
                return rb_move_tail(cpu_buffer, tail, info);
        }
                /* SLOW PATH - Interrupted between A and C */
  
                /* Save the old before_stamp */
 -              a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
 -              RB_WARN_ON(cpu_buffer, !a_ok);
 +              rb_time_read(&cpu_buffer->before_stamp, &info->before);
  
                /*
                 * Read a new timestamp and update the before_stamp to make
                rb_time_set(&cpu_buffer->before_stamp, ts);
  
                barrier();
 - /*E*/                a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
 -              /* Was interrupted before here, write_stamp must be valid */
 -              RB_WARN_ON(cpu_buffer, !a_ok);
 + /*E*/                rb_time_read(&cpu_buffer->write_stamp, &info->after);
                barrier();
   /*F*/                if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
                    info->after == info->before && info->after < ts) {
@@@ -3626,7 -3678,7 +3626,7 @@@ rb_reserve_next_event(struct trace_buff
        if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
                add_ts_default = RB_ADD_STAMP_ABSOLUTE;
                info.length += RB_LEN_TIME_EXTEND;
 -              if (info.length > BUF_MAX_DATA_SIZE)
 +              if (info.length > cpu_buffer->buffer->max_data_size)
                        goto out_fail;
        } else {
                add_ts_default = RB_ADD_STAMP_NONE;
@@@ -3701,7 -3753,7 +3701,7 @@@ ring_buffer_lock_reserve(struct trace_b
        if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
                goto out;
  
 -      if (unlikely(length > BUF_MAX_DATA_SIZE))
 +      if (unlikely(length > buffer->max_data_size))
                goto out;
  
        if (unlikely(trace_recursive_lock(cpu_buffer)))
@@@ -3735,7 -3787,7 +3735,7 @@@ rb_decrement_entry(struct ring_buffer_p
        struct buffer_page *bpage = cpu_buffer->commit_page;
        struct buffer_page *start;
  
 -      addr &= PAGE_MASK;
 +      addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
  
        /* Do the likely case first */
        if (likely(bpage->page == (void *)addr)) {
@@@ -3851,7 -3903,7 +3851,7 @@@ int ring_buffer_write(struct trace_buff
        if (atomic_read(&cpu_buffer->record_disabled))
                goto out;
  
 -      if (length > BUF_MAX_DATA_SIZE)
 +      if (length > buffer->max_data_size)
                goto out;
  
        if (unlikely(trace_recursive_lock(cpu_buffer)))
@@@ -4431,7 -4483,6 +4431,7 @@@ static struct buffer_page 
  rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
  {
        struct buffer_page *reader = NULL;
 +      unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
        unsigned long overwrite;
        unsigned long flags;
        int nr_loops = 0;
  #define USECS_WAIT    1000000
          for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
                /* If the write is past the end of page, a writer is still updating it */
 -              if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
 +              if (likely(!reader || rb_page_write(reader) <= bsize))
                        break;
  
                udelay(1);
@@@ -5011,8 -5062,7 +5011,8 @@@ ring_buffer_read_prepare(struct trace_b
                return NULL;
  
        /* Holds the entire event: data and meta data */
 -      iter->event = kmalloc(BUF_PAGE_SIZE, flags);
 +      iter->event_size = buffer->subbuf_size;
 +      iter->event = kmalloc(iter->event_size, flags);
        if (!iter->event) {
                kfree(iter);
                return NULL;
@@@ -5128,28 -5178,19 +5128,28 @@@ EXPORT_SYMBOL_GPL(ring_buffer_iter_adva
   */
  unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
  {
 -      /*
 -       * Earlier, this method returned
 -       *      BUF_PAGE_SIZE * buffer->nr_pages
 -       * Since the nr_pages field is now removed, we have converted this to
 -       * return the per cpu buffer value.
 -       */
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
                return 0;
  
 -      return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
 +      return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
  }
  EXPORT_SYMBOL_GPL(ring_buffer_size);
  
 +/**
 + * ring_buffer_max_event_size - return the max data size of an event
 + * @buffer: The ring buffer.
 + *
 + * Returns the maximum size an event can be.
 + */
 +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
 +{
 +      /* If abs timestamp is requested, events have a timestamp too */
 +      if (ring_buffer_time_stamp_abs(buffer))
 +              return buffer->max_data_size - RB_LEN_TIME_EXTEND;
 +      return buffer->max_data_size;
 +}
 +EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
 +
  static void rb_clear_buffer_page(struct buffer_page *page)
  {
        local_set(&page->write, 0);
@@@ -5420,9 -5461,6 +5420,9 @@@ int ring_buffer_swap_cpu(struct trace_b
        if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
                goto out;
  
 +      if (buffer_a->subbuf_order != buffer_b->subbuf_order)
 +              goto out;
 +
        ret = -EAGAIN;
  
        if (atomic_read(&buffer_a->record_disabled))
@@@ -5494,48 -5532,40 +5494,48 @@@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu)
   * Returns:
   *  The page allocated, or ERR_PTR
   */
 -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 +struct buffer_data_read_page *
 +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
  {
        struct ring_buffer_per_cpu *cpu_buffer;
 -      struct buffer_data_page *bpage = NULL;
 +      struct buffer_data_read_page *bpage = NULL;
        unsigned long flags;
        struct page *page;
  
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
                return ERR_PTR(-ENODEV);
  
 +      bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
 +      if (!bpage)
 +              return ERR_PTR(-ENOMEM);
 +
 +      bpage->order = buffer->subbuf_order;
        cpu_buffer = buffer->buffers[cpu];
        local_irq_save(flags);
        arch_spin_lock(&cpu_buffer->lock);
  
        if (cpu_buffer->free_page) {
 -              bpage = cpu_buffer->free_page;
 +              bpage->data = cpu_buffer->free_page;
                cpu_buffer->free_page = NULL;
        }
  
        arch_spin_unlock(&cpu_buffer->lock);
        local_irq_restore(flags);
  
 -      if (bpage)
 +      if (bpage->data)
                goto out;
  
 -      page = alloc_pages_node(cpu_to_node(cpu),
 -                              GFP_KERNEL | __GFP_NORETRY, 0);
 -      if (!page)
 +      page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
 +                              cpu_buffer->buffer->subbuf_order);
 +      if (!page) {
 +              kfree(bpage);
                return ERR_PTR(-ENOMEM);
 +      }
  
 -      bpage = page_address(page);
 +      bpage->data = page_address(page);
  
   out:
 -      rb_init_page(bpage);
 +      rb_init_page(bpage->data);
  
        return bpage;
  }
@@@ -5545,15 -5575,14 +5545,15 @@@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_rea
   * ring_buffer_free_read_page - free an allocated read page
   * @buffer: the buffer the page was allocated for
   * @cpu: the cpu buffer the page came from
 - * @data: the page to free
 + * @data_page: the page to free
   *
   * Free a page allocated from ring_buffer_alloc_read_page.
   */
 -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
 +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
 +                              struct buffer_data_read_page *data_page)
  {
        struct ring_buffer_per_cpu *cpu_buffer;
 -      struct buffer_data_page *bpage = data;
 +      struct buffer_data_page *bpage = data_page->data;
        struct page *page = virt_to_page(bpage);
        unsigned long flags;
  
  
        cpu_buffer = buffer->buffers[cpu];
  
 -      /* If the page is still in use someplace else, we can't reuse it */
 -      if (page_ref_count(page) > 1)
 +      /*
 +       * If the page is still in use someplace else, or if the order of
 +       * the page differs from the sub-buffer order of the buffer,
 +       * we can't reuse it
 +       */
 +      if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
                goto out;
  
        local_irq_save(flags);
        local_irq_restore(flags);
  
   out:
 -      free_page((unsigned long)bpage);
 +      free_pages((unsigned long)bpage, data_page->order);
 +      kfree(data_page);
  }
  EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
  
   *    rpage = ring_buffer_alloc_read_page(buffer, cpu);
   *    if (IS_ERR(rpage))
   *            return PTR_ERR(rpage);
 - *    ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
 + *    ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
   *    if (ret >= 0)
 - *            process_page(rpage, ret);
 + *            process_page(ring_buffer_read_page_data(rpage), ret);
 + *    ring_buffer_free_read_page(buffer, cpu, rpage);
   *
   * When @full is set, the function will not return true unless
   * the writer is off the reader page.
   *  <0 if no data has been transferred.
   */
  int ring_buffer_read_page(struct trace_buffer *buffer,
 -                        void **data_page, size_t len, int cpu, int full)
 +                        struct buffer_data_read_page *data_page,
 +                        size_t len, int cpu, int full)
  {
        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
        struct ring_buffer_event *event;
  
        len -= BUF_PAGE_HDR_SIZE;
  
 -      if (!data_page)
 +      if (!data_page || !data_page->data)
 +              goto out;
 +      if (data_page->order != buffer->subbuf_order)
                goto out;
  
 -      bpage = *data_page;
 +      bpage = data_page->data;
        if (!bpage)
                goto out;
  
                /* swap the pages */
                rb_init_page(bpage);
                bpage = reader->page;
 -              reader->page = *data_page;
 +              reader->page = data_page->data;
                local_set(&reader->write, 0);
                local_set(&reader->entries, 0);
                reader->read = 0;
 -              *data_page = bpage;
 +              data_page->data = bpage;
  
                /*
                 * Use the real_end for the data size,
                /* If there is room at the end of the page to save the
                 * missed events, then record it there.
                 */
 -              if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
 +              if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
                        memcpy(&bpage->data[commit], &missed_events,
                               sizeof(missed_events));
                        local_add(RB_MISSED_STORED, &bpage->commit);
        /*
         * This page may be off to user land. Zero it out here.
         */
 -      if (commit < BUF_PAGE_SIZE)
 -              memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
 +      if (commit < buffer->subbuf_size)
 +              memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
  
   out_unlock:
        raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
  }
  EXPORT_SYMBOL_GPL(ring_buffer_read_page);
  
 +/**
 + * ring_buffer_read_page_data - get pointer to the data in the page.
 + * @page:  the page to get the data from
 + *
 + * Returns pointer to the actual data in this page.
 + */
 +void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
 +{
 +      return page->data;
 +}
 +EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
 +
 +/**
 + * ring_buffer_subbuf_size_get - get size of the sub buffer.
 + * @buffer: the buffer to get the sub buffer size from
 + *
 + * Returns size of the sub buffer, in bytes.
 + */
 +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
 +{
 +      return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
 +}
 +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
 +
 +/**
 + * ring_buffer_subbuf_order_get - get the order of system pages in one buffer page.
 + * @buffer: The ring_buffer to get the system sub page order from
 + *
 + * By default, one ring buffer sub page equals one system page. This parameter
 + * is configurable, per ring buffer. The size of the ring buffer sub page can be
 + * extended, but must be a power-of-two number of system pages.
 + *
 + * Returns the order of buffer sub page size, in system pages:
 + * 0 means the sub buffer size is 1 system page and so forth.
 + * In case of an error < 0 is returned.
 + */
 +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
 +{
 +      if (!buffer)
 +              return -EINVAL;
 +
 +      return buffer->subbuf_order;
 +}
 +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
 +
 +/**
 + * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
 + * @buffer: The ring_buffer to set the new page size.
 + * @order: Order of the system pages in one sub buffer page
 + *
 + * By default, one ring buffer page equals one system page. This API can be
 + * used to set a new size of the ring buffer page. The size must be a
 + * power-of-two number of system pages, which is why the input parameter
 + * @order is the order of system pages allocated for one ring buffer page:
 + *  0 - 1 system page
 + *  1 - 2 system pages
 + *  2 - 4 system pages
 + *  ...
 + *
 + * Returns 0 on success or < 0 in case of an error.
 + */
 +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 +{
 +      struct ring_buffer_per_cpu *cpu_buffer;
 +      struct buffer_page *bpage, *tmp;
 +      int old_order, old_size;
 +      int nr_pages;
 +      int psize;
 +      int err;
 +      int cpu;
 +
 +      if (!buffer || order < 0)
 +              return -EINVAL;
 +
 +      if (buffer->subbuf_order == order)
 +              return 0;
 +
 +      psize = (1 << order) * PAGE_SIZE;
 +      if (psize <= BUF_PAGE_HDR_SIZE)
 +              return -EINVAL;
 +
 +      old_order = buffer->subbuf_order;
 +      old_size = buffer->subbuf_size;
 +
 +      /* prevent another thread from changing buffer sizes */
 +      mutex_lock(&buffer->mutex);
 +      atomic_inc(&buffer->record_disabled);
 +
 +      /* Make sure all commits have finished */
 +      synchronize_rcu();
 +
 +      buffer->subbuf_order = order;
 +      buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
 +
 +      /* Make sure all new buffers are allocated, before deleting the old ones */
 +      for_each_buffer_cpu(buffer, cpu) {
 +
 +              if (!cpumask_test_cpu(cpu, buffer->cpumask))
 +                      continue;
 +
 +              cpu_buffer = buffer->buffers[cpu];
 +
 +              /* Update the number of pages to match the new size */
 +              nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
 +              nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
 +
 +              /* we need a minimum of two pages */
 +              if (nr_pages < 2)
 +                      nr_pages = 2;
 +
 +              cpu_buffer->nr_pages_to_update = nr_pages;
 +
 +              /* Include the reader page */
 +              nr_pages++;
 +
 +              /* Allocate the new size buffer */
 +              INIT_LIST_HEAD(&cpu_buffer->new_pages);
 +              if (__rb_allocate_pages(cpu_buffer, nr_pages,
 +                                      &cpu_buffer->new_pages)) {
 +                      /* not enough memory for new pages */
 +                      err = -ENOMEM;
 +                      goto error;
 +              }
 +      }
 +
 +      for_each_buffer_cpu(buffer, cpu) {
 +
 +              if (!cpumask_test_cpu(cpu, buffer->cpumask))
 +                      continue;
 +
 +              cpu_buffer = buffer->buffers[cpu];
 +
 +              /* Clear the head bit to make the link list normal to read */
 +              rb_head_page_deactivate(cpu_buffer);
 +
 +              /* Now walk the list and free all the old sub buffers */
 +              list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
 +                      list_del_init(&bpage->list);
 +                      free_buffer_page(bpage);
 +              }
 +              /* The above loop stopped on the last page needing to be freed */
 +              bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
 +              free_buffer_page(bpage);
 +
 +              /* Free the current reader page */
 +              free_buffer_page(cpu_buffer->reader_page);
 +
 +              /* One page was allocated for the reader page */
 +              cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
 +                                                   struct buffer_page, list);
 +              list_del_init(&cpu_buffer->reader_page->list);
 +
 +              /* The cpu_buffer pages are a link list with no head */
 +              cpu_buffer->pages = cpu_buffer->new_pages.next;
 +              cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
 +              cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
 +
 +              /* Clear the new_pages list */
 +              INIT_LIST_HEAD(&cpu_buffer->new_pages);
 +
 +              cpu_buffer->head_page
 +                      = list_entry(cpu_buffer->pages, struct buffer_page, list);
 +              cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
 +
 +              cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
 +              cpu_buffer->nr_pages_to_update = 0;
 +
 +              free_pages((unsigned long)cpu_buffer->free_page, old_order);
 +              cpu_buffer->free_page = NULL;
 +
 +              rb_head_page_activate(cpu_buffer);
 +
 +              rb_check_pages(cpu_buffer);
 +      }
 +
 +      atomic_dec(&buffer->record_disabled);
 +      mutex_unlock(&buffer->mutex);
 +
 +      return 0;
 +
 +error:
 +      buffer->subbuf_order = old_order;
 +      buffer->subbuf_size = old_size;
 +
 +      atomic_dec(&buffer->record_disabled);
 +      mutex_unlock(&buffer->mutex);
 +
 +      for_each_buffer_cpu(buffer, cpu) {
 +              cpu_buffer = buffer->buffers[cpu];
 +
 +              if (!cpu_buffer->nr_pages_to_update)
 +                      continue;
 +
 +              list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
 +                      list_del_init(&bpage->list);
 +                      free_buffer_page(bpage);
 +              }
 +      }
 +
 +      return err;
 +}
 +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
 +
  /*
   * We only allocate new buffers, never free them if the CPU goes down.
   * If we were to free the buffer, then the user would lose any trace that was in
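
The ring_buffer.c changes above replace the fixed BUF_PAGE_SIZE with a per-buffer subbuf_size and subbuf_order, and add accessors around them. Below is a minimal sketch of how an in-kernel caller might drive the new interface; it only uses functions whose signatures appear in the hunks above, while read_one_subbuf() and process_page() are hypothetical names used for illustration.

#include <linux/ring_buffer.h>
#include <linux/err.h>

/* Hypothetical consumer of one sub-buffer worth of data. */
static void process_page(void *data, int len)
{
	/* consume the data */
}

static int read_one_subbuf(struct trace_buffer *buffer, int cpu)
{
	struct buffer_data_read_page *rpage;
	size_t len;
	int ret;

	/* Grow each sub-buffer to two system pages (order 1) if needed. */
	if (ring_buffer_subbuf_order_get(buffer) != 1) {
		ret = ring_buffer_subbuf_order_set(buffer, 1);
		if (ret < 0)
			return ret;
	}

	/* Full sub-buffer size: data area plus BUF_PAGE_HDR_SIZE. */
	len = ring_buffer_subbuf_size_get(buffer);

	rpage = ring_buffer_alloc_read_page(buffer, cpu);
	if (IS_ERR(rpage))
		return PTR_ERR(rpage);

	ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
	if (ret >= 0)
		process_page(ring_buffer_read_page_data(rpage), ret);

	ring_buffer_free_read_page(buffer, cpu, rpage);
	return ret < 0 ? ret : 0;
}

Returning a struct buffer_data_read_page instead of a raw page pointer is what lets ring_buffer_free_read_page() notice a stale page whose order no longer matches the buffer's current sub-buffer order and free it with the right order.
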
diff --combined net/netfilter/nft_ct.c
index bfd3e5a14dab68484469bdba71af37a460822549,d3e66bcb2a910e471edc437f28c9bd6968ecc715..d73d49c7acd66caeec27c39bcf1b39355f4a3744
@@@ -476,9 -476,6 +476,9 @@@ static int nft_ct_get_init(const struc
                break;
  #endif
        case NFT_CT_ID:
 +              if (tb[NFTA_CT_DIRECTION])
 +                      return -EINVAL;
 +
                len = sizeof(u32);
                break;
        default:
@@@ -754,7 -751,7 +754,7 @@@ static bool nft_ct_set_reduce(struct nf
        return false;
  }
  
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
  static const struct nft_expr_ops nft_ct_get_fast_ops = {
        .type           = &nft_ct_type,
        .size           = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
@@@ -799,7 -796,7 +799,7 @@@ nft_ct_select_ops(const struct nft_ctx 
                return ERR_PTR(-EINVAL);
  
        if (tb[NFTA_CT_DREG]) {
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
                u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
  
                switch (k) {
@@@ -1253,31 -1250,7 +1253,31 @@@ static int nft_ct_expect_obj_init(cons
        if (tb[NFTA_CT_EXPECT_L3PROTO])
                priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO]));
  
 +      switch (priv->l3num) {
 +      case NFPROTO_IPV4:
 +      case NFPROTO_IPV6:
 +              if (priv->l3num != ctx->family)
 +                      return -EINVAL;
 +
 +              fallthrough;
 +      case NFPROTO_INET:
 +              break;
 +      default:
 +              return -EOPNOTSUPP;
 +      }
 +
        priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]);
 +      switch (priv->l4proto) {
 +      case IPPROTO_TCP:
 +      case IPPROTO_UDP:
 +      case IPPROTO_UDPLITE:
 +      case IPPROTO_DCCP:
 +      case IPPROTO_SCTP:
 +              break;
 +      default:
 +              return -EOPNOTSUPP;
 +      }
 +
        priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]);
        priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]);
        priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]);
diff --combined net/sched/sch_api.c
index 36b025cc4fd263be35cdf6c6637c7d79e53c5f8f,d577c9e1cb42040e358ffb6d41250103a890dec0..87f6e3c6daa86dd870fe0ac5aefb50e76cd8bdd8
@@@ -1003,32 -1003,6 +1003,32 @@@ static bool tc_qdisc_dump_ignore(struc
        return false;
  }
  
 +static int qdisc_get_notify(struct net *net, struct sk_buff *oskb,
 +                          struct nlmsghdr *n, u32 clid, struct Qdisc *q,
 +                          struct netlink_ext_ack *extack)
 +{
 +      struct sk_buff *skb;
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (!tc_qdisc_dump_ignore(q, false)) {
 +              if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
 +                                RTM_NEWQDISC, extack) < 0)
 +                      goto err_out;
 +      }
 +
 +      if (skb->len)
 +              return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                                    n->nlmsg_flags & NLM_F_ECHO);
 +
 +err_out:
 +      kfree_skb(skb);
 +      return -EINVAL;
 +}
 +
  static int qdisc_notify(struct net *net, struct sk_buff *oskb,
                        struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new,
        struct sk_buff *skb;
        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
  
 +      if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
 +              return 0;
 +
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;
@@@ -1571,7 -1542,7 +1571,7 @@@ static int tc_get_qdisc(struct sk_buff 
                if (err != 0)
                        return err;
        } else {
 -              qdisc_notify(net, skb, n, clid, NULL, q, NULL);
 +              qdisc_get_notify(net, skb, n, clid, q, NULL);
        }
        return 0;
  }
@@@ -1965,9 -1936,6 +1965,9 @@@ static int tclass_notify(struct net *ne
        struct sk_buff *skb;
        u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
  
 +      if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
 +              return 0;
 +
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;
                              n->nlmsg_flags & NLM_F_ECHO);
  }
  
 +static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
 +                           struct nlmsghdr *n, struct Qdisc *q,
 +                           unsigned long cl, struct netlink_ext_ack *extack)
 +{
 +      struct sk_buff *skb;
 +      u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
 +
 +      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +      if (!skb)
 +              return -ENOBUFS;
 +
 +      if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
 +                         extack) < 0) {
 +              kfree_skb(skb);
 +              return -EINVAL;
 +      }
 +
 +      return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 +                            n->nlmsg_flags & NLM_F_ECHO);
 +}
 +
  static int tclass_del_notify(struct net *net,
                             const struct Qdisc_class_ops *cops,
                             struct sk_buff *oskb, struct nlmsghdr *n,
        if (!cops->delete)
                return -EOPNOTSUPP;
  
 -      skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 -      if (!skb)
 -              return -ENOBUFS;
 +      if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
 +              skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 +              if (!skb)
 +                      return -ENOBUFS;
  
 -      if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
 -                         RTM_DELTCLASS, extack) < 0) {
 -              kfree_skb(skb);
 -              return -EINVAL;
 +              if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
 +                                 RTM_DELTCLASS, extack) < 0) {
 +                      kfree_skb(skb);
 +                      return -EINVAL;
 +              }
 +      } else {
 +              skb = NULL;
        }
  
        err = cops->delete(q, cl, extack);
                return err;
        }
  
 -      err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 -                           n->nlmsg_flags & NLM_F_ECHO);
 +      err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
 +                                 n->nlmsg_flags & NLM_F_ECHO);
        return err;
  }
  
@@@ -2231,7 -2174,7 +2231,7 @@@ static int tc_ctl_tclass(struct sk_buf
                        tc_bind_tclass(q, portid, clid, 0);
                        goto out;
                case RTM_GETTCLASS:
 -                      err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
 +                      err = tclass_get_notify(net, skb, n, q, cl, extack);
                        goto out;
                default:
                        err = -EINVAL;
@@@ -2410,7 -2353,7 +2410,7 @@@ static struct pernet_operations psched_
        .exit = psched_net_exit,
  };
  
- #if IS_ENABLED(CONFIG_RETPOLINE)
+ #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
  DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
  #endif
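
The sch_api.c changes above separate replies to explicit RTM_GET* requests from asynchronous event notifications: qdisc_get_notify() and tclass_get_notify() always build and send a reply, while the event paths first ask rtnl_notify_needed() whether any listener exists or NLM_F_ECHO was requested, and otherwise skip building the skb at all. The sketch below shows that pattern for a hypothetical notifier; my_qdisc_event_notify() is not part of the patch and assumes it sits next to the existing helpers in net/sched/sch_api.c so that tc_fill_qdisc() is visible.

static int my_qdisc_event_notify(struct net *net, struct sk_buff *oskb,
				 struct nlmsghdr *n, u32 clid, struct Qdisc *q)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;

	/* Cheap early exit: no multicast listeners and no NLM_F_ECHO. */
	if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
		return 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
			  RTM_NEWQDISC, NULL) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

The delete path keeps one extra subtlety: cops->delete() must still run even when no notification will be sent, so the skb is built conditionally and a possibly NULL skb is handed to rtnetlink_maybe_send().
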
  
diff --combined scripts/Makefile.lib
index cd5b181060f151f2c28186feb5b96db37ee04da9,c3f4cacad7b0984925b4a2350d7e7d8ee8b63635..473f5496ae61e782865df6834cabeae5fd54e2d8
@@@ -83,8 -83,8 +83,8 @@@ dtb-$(CONFIG_OF_ALL_DTBS)       += $(dt
  multi-dtb-y := $(call multi-search, $(dtb-y), .dtb, -dtbs)
  # Primitive DTB compiled from *.dts
  real-dtb-y := $(call real-search, $(dtb-y), .dtb, -dtbs)
 -# Base DTB that overlay is applied onto (each first word of $(*-dtbs) expansion)
 -base-dtb-y := $(foreach m, $(multi-dtb-y), $(firstword $(call suffix-search, $m, .dtb, -dtbs)))
 +# Base DTB that overlay is applied onto
 +base-dtb-y := $(filter %.dtb, $(call real-search, $(multi-dtb-y), .dtb, -dtbs))
  
  always-y                      += $(dtb-y)
  
@@@ -254,7 -254,7 +254,7 @@@ objtool := $(objtree)/tools/objtool/obj
  
  objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK)           += --hacks=jump_label
  objtool-args-$(CONFIG_HAVE_NOINSTR_HACK)              += --hacks=noinstr
- objtool-args-$(CONFIG_CALL_DEPTH_TRACKING)            += --hacks=skylake
+ objtool-args-$(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) += --hacks=skylake
  objtool-args-$(CONFIG_X86_KERNEL_IBT)                 += --ibt
  objtool-args-$(CONFIG_FINEIBT)                                += --cfi
  objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL)      += --mcount
@@@ -262,9 -262,9 +262,9 @@@ ifdef CONFIG_FTRACE_MCOUNT_USE_OBJTOO
  objtool-args-$(CONFIG_HAVE_OBJTOOL_NOP_MCOUNT)                += --mnop
  endif
  objtool-args-$(CONFIG_UNWINDER_ORC)                   += --orc
- objtool-args-$(CONFIG_RETPOLINE)                      += --retpoline
- objtool-args-$(CONFIG_RETHUNK)                                += --rethunk
- objtool-args-$(CONFIG_SLS)                            += --sls
+ objtool-args-$(CONFIG_MITIGATION_RETPOLINE)           += --retpoline
+ objtool-args-$(CONFIG_MITIGATION_RETHUNK)             += --rethunk
+ objtool-args-$(CONFIG_MITIGATION_SLS)                 += --sls
  objtool-args-$(CONFIG_STACK_VALIDATION)                       += --stackval
  objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE)                += --static-call
  objtool-args-$(CONFIG_HAVE_UACCESS_VALIDATION)                += --uaccess
index 0da52b548ba50f5e1333c3c2b2a18da2533a18a1,eaf5246037964c1ac2f7cc5a94264cbeb0c7a6cd..19f72bfdbb82a987f834024b3e36f48e95b58351
@@@ -155,19 -155,12 +155,19 @@@ fn main() 
              "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
          );
          let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string();
-         if cfg.has("RETPOLINE") {
+         if cfg.has("MITIGATION_RETPOLINE") {
              features += ",+retpoline-external-thunk";
          }
          ts.push("features", features);
          ts.push("llvm-target", "x86_64-linux-gnu");
          ts.push("target-pointer-width", "64");
 +    } else if cfg.has("LOONGARCH") {
 +        ts.push("arch", "loongarch64");
 +        ts.push("data-layout", "e-m:e-p:64:64-i64:64-i128:128-n64-S128");
 +        ts.push("features", "-f,-d");
 +        ts.push("llvm-target", "loongarch64-linux-gnusf");
 +        ts.push("llvm-abiname", "lp64s");
 +        ts.push("target-pointer-width", "64");
      } else {
          panic!("Unsupported architecture");
      }
diff --combined scripts/mod/modpost.c
index 267b9a0a3abcd849fe4f0bae4cddd8a287d26184,72fead5f973b7dee903762e307ef357abca11a65..bf7c4b4b5ff45694322af0d6b221b6735ea1a739
@@@ -60,7 -60,8 +60,7 @@@ static unsigned int nr_unresolved
  
  #define MODULE_NAME_LEN (64 - sizeof(Elf_Addr))
  
 -void __attribute__((format(printf, 2, 3)))
 -modpost_log(enum loglevel loglevel, const char *fmt, ...)
 +void modpost_log(enum loglevel loglevel, const char *fmt, ...)
  {
        va_list arglist;
  
@@@ -70,7 -71,9 +70,7 @@@
                break;
        case LOG_ERROR:
                fprintf(stderr, "ERROR: ");
 -              break;
 -      case LOG_FATAL:
 -              fprintf(stderr, "FATAL: ");
 +              error_occurred = true;
                break;
        default: /* invalid loglevel, ignore */
                break;
        va_start(arglist, fmt);
        vfprintf(stderr, fmt, arglist);
        va_end(arglist);
 -
 -      if (loglevel == LOG_FATAL)
 -              exit(1);
 -      if (loglevel == LOG_ERROR)
 -              error_occurred = true;
  }
  
  static inline bool strends(const char *str, const char *postfix)
@@@ -466,9 -474,11 +466,9 @@@ static int parse_elf(struct elf_info *i
                fatal("%s: not relocatable object.", filename);
  
        /* Check if file offset is correct */
 -      if (hdr->e_shoff > info->size) {
 +      if (hdr->e_shoff > info->size)
                fatal("section header offset=%lu in file '%s' is bigger than filesize=%zu\n",
                      (unsigned long)hdr->e_shoff, filename, info->size);
 -              return 0;
 -      }
  
        if (hdr->e_shnum == SHN_UNDEF) {
                /*
                const char *secname;
                int nobits = sechdrs[i].sh_type == SHT_NOBITS;
  
 -              if (!nobits && sechdrs[i].sh_offset > info->size) {
 +              if (!nobits && sechdrs[i].sh_offset > info->size)
                        fatal("%s is truncated. sechdrs[i].sh_offset=%lu > sizeof(*hrd)=%zu\n",
                              filename, (unsigned long)sechdrs[i].sh_offset,
                              sizeof(*hdr));
 -                      return 0;
 -              }
 +
                secname = secstrings + sechdrs[i].sh_name;
                if (strcmp(secname, ".modinfo") == 0) {
                        if (nobits)
@@@ -796,8 -807,7 +796,8 @@@ static void check_section(const char *m
  
  #define DATA_SECTIONS ".data", ".data.rel"
  #define TEXT_SECTIONS ".text", ".text.*", ".sched.text", \
 -              ".kprobes.text", ".cpuidle.text", ".noinstr.text"
 +              ".kprobes.text", ".cpuidle.text", ".noinstr.text", \
 +              ".ltext", ".ltext.*"
  #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \
                ".fixup", ".entry.text", ".exception.text", \
                ".coldtext", ".softirqentry.text"
@@@ -1336,14 -1346,6 +1336,14 @@@ static Elf_Addr addend_mips_rel(uint32_
  #define R_LARCH_SUB32         55
  #endif
  
 +#ifndef R_LARCH_RELAX
 +#define R_LARCH_RELAX         100
 +#endif
 +
 +#ifndef R_LARCH_ALIGN
 +#define R_LARCH_ALIGN         102
 +#endif
 +
  static void get_rel_type_and_sym(struct elf_info *elf, uint64_t r_info,
                                 unsigned int *r_type, unsigned int *r_sym)
  {
@@@ -1398,16 -1400,9 +1398,16 @@@ static void section_rela(struct module 
                                continue;
                        break;
                case EM_LOONGARCH:
 -                      if (!strcmp("__ex_table", fromsec) &&
 -                          r_type == R_LARCH_SUB32)
 +                      switch (r_type) {
 +                      case R_LARCH_SUB32:
 +                              if (!strcmp("__ex_table", fromsec))
 +                                      continue;
 +                              break;
 +                      case R_LARCH_RELAX:
 +                      case R_LARCH_ALIGN:
 +                              /* These relocs do not refer to symbols */
                                continue;
 +                      }
                        break;
                }
  
@@@ -1424,7 -1419,7 +1424,7 @@@ static void section_rel(struct module *
  
        for (rel = start; rel < stop; rel++) {
                Elf_Sym *tsym;
 -              Elf_Addr taddr = 0, r_offset;
 +              Elf_Addr taddr, r_offset;
                unsigned int r_type, r_sym;
                void *loc;
  
@@@ -1848,7 -1843,7 +1848,7 @@@ static void add_header(struct buffer *b
  
        buf_printf(b,
                   "\n"
-                  "#ifdef CONFIG_RETPOLINE\n"
+                  "#ifdef CONFIG_MITIGATION_RETPOLINE\n"
                   "MODULE_INFO(retpoline, \"Y\");\n"
                   "#endif\n");
  