+ accept_memory= [MM]
+ Format: { eager | lazy }
+ default: lazy
+ By default, unaccepted memory is accepted lazily to
+ avoid prolonged boot times. The lazy option will add
+ some runtime overhead until all memory is eventually
+ accepted. In most cases the overhead is negligible.
+ For some workloads or for debugging purposes
+ accept_memory=eager can be used to accept all memory
+ at once during boot.
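+ A minimal usage sketch: adding accept_memory=eager to the
+ kernel command line accepts all memory up front at boot,
+ avoiding the runtime overhead described above.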
+
acpi= [HW,ACPI,X86,ARM64,RISCV64]
Advanced Configuration and Power Interface
Format: { force | on | off | strict | noirq | rsdt |
memory region [offset, offset + size] for that kernel
image. If '@offset' is omitted, then a suitable offset
is selected automatically.
- [KNL, X86-64, ARM64, RISCV] Select a region under 4G first, and
- fall back to reserve region above 4G when '@offset'
- hasn't been specified.
+ [KNL, X86-64, ARM64, RISCV, LoongArch] Select a region
+ under 4G first, and fall back to reserve region above
+ 4G when '@offset' hasn't been specified.
See Documentation/admin-guide/kdump/kdump.rst for further details.
crashkernel=range1:size1[,range2:size2,...][@offset]
Documentation/admin-guide/kdump/kdump.rst for an example.
crashkernel=size[KMG],high
- [KNL, X86-64, ARM64, RISCV] range could be above 4G.
+ [KNL, X86-64, ARM64, RISCV, LoongArch] range could be
+ above 4G.
 Allow the kernel to allocate the physical memory region from the
 top, so it could be above 4G if the system has more than 4G of
 RAM installed. Otherwise the memory region will be allocated
 below 4G, if available.
It will be ignored if crashkernel=X is specified.
crashkernel=size[KMG],low
- [KNL, X86-64, ARM64, RISCV] range under 4G. When crashkernel=X,high
- is passed, kernel could allocate physical memory region
- above 4G, that cause second kernel crash on system
- that require some amount of low memory, e.g. swiotlb
- requires at least 64M+32K low memory, also enough extra
- low memory is needed to make sure DMA buffers for 32-bit
- devices won't run out. Kernel would try to allocate
+ [KNL, X86-64, ARM64, RISCV, LoongArch] range under 4G.
+ When crashkernel=X,high is passed, the kernel could allocate
+ the physical memory region above 4G, which causes the second
+ kernel to crash on systems that require some amount of low
+ memory, e.g. swiotlb requires at least 64M+32K low memory,
+ and enough extra low memory is needed to make sure DMA
+ buffers for 32-bit devices won't run out. The kernel will try to allocate a
default size of memory below 4G automatically. The default
size is platform dependent.
--> x86: max(swiotlb_size_or_default() + 8MiB, 256MiB)
--> arm64: 128MiB
--> riscv: 128MiB
+ --> loongarch: 128MiB
 This one lets the user specify their own low range under 4G
 for the second kernel instead.
0: to disable low allocation.
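 An illustrative combination (hypothetical sizes) on a machine
 with more than 4G of RAM:
 crashkernel=512M,high crashkernel=256M,low
 This allows the main crash kernel region to sit above 4G while
 keeping a smaller region under 4G for swiotlb and 32-bit DMA
 buffers.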
between unregistering the boot console and initializing
the real console.
- keepinitrd [HW,ARM]
+ keepinitrd [HW,ARM] See retain_initrd.
kernelcore= [KNL,X86,IA-64,PPC]
Format: nn[KMGTPE] | nn% | "mirror"
vulnerability. System may allow data leaks with this
option.
- no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized
- steal time accounting. steal time is computed, but
- won't influence scheduler behaviour
+ no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable
+ paravirtualized steal time accounting. Steal time is still
+ computed, but won't influence scheduler behaviour.
nosync [HW,M68K] Disables sync negotiation for all devices.
Dump ftrace buffer after reporting RCU CPU
stall warning.
+ rcupdate.rcu_cpu_stall_notifiers= [KNL]
+ Provide RCU CPU stall notifiers, but see the
+ warnings in the RCU_CPU_STALL_NOTIFIER Kconfig
+ option's help text. TL;DR: You almost certainly
+ do not want rcupdate.rcu_cpu_stall_notifiers.
+
rcupdate.rcu_cpu_stall_suppress= [KNL]
Suppress RCU CPU stall warning messages.
Useful for devices that are detected asynchronously
(e.g. USB and MMC devices).
- retain_initrd [RAM] Keep initrd memory after extraction
+ retain_initrd [RAM] Keep initrd memory after extraction. After boot, it will
+ be accessible via /sys/firmware/initrd.
retbleed= [X86] Control mitigation of RETBleed (Arbitrary
Speculative Code Execution with Return Instructions)
Selecting 'on' will, and 'auto' may, choose a
mitigation method at run time according to the
CPU, the available microcode, the setting of the
- CONFIG_RETPOLINE configuration option, and the
- compiler with which the kernel was built.
+ CONFIG_MITIGATION_RETPOLINE configuration option,
+ and the compiler with which the kernel was built.
Selecting 'on' will also enable the mitigation
against user space to user space task attacks.
pause after every control message);
o = USB_QUIRK_HUB_SLOW_RESET (Hub needs extra
delay after resetting its port);
+ p = USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT
+ (Reduce timeout of the SET_ADDRESS
+ request from 5000 ms to 500 ms);
Example: quirks=0781:5580:bk,0a5c:5834:gij
usbhid.mousepoll=
#
select ACPI_LEGACY_TABLES_LOOKUP if ACPI
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
+ select ACPI_HOTPLUG_CPU if ACPI_PROCESSOR && HOTPLUG_CPU
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_INIT
select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CPU_FINALIZE_INIT
+ select ARCH_HAS_CPU_PASID if IOMMU_SVA
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
select GENERIC_CLOCKEVENTS_MIN_ADJUST
select GENERIC_CMOS_UPDATE
select GENERIC_CPU_AUTOPROBE
+ select GENERIC_CPU_DEVICES
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_ENTRY
depends on CPU_SUP_INTEL
depends on X86_64
depends on KVM_INTEL
+ depends on X86_X2APIC
+ select ARCH_KEEP_MEMBLOCK
+ depends on CONTIG_ALLOC
+ depends on !KEXEC_CORE
+ depends on X86_MCE
help
Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
host and certain physical attacks. This option enables necessary TDX
endmenu
+config CC_HAS_NAMED_AS
+ def_bool CC_IS_GCC && GCC_VERSION >= 120100
+
+config USE_X86_SEG_SUPPORT
+ def_bool y
+ depends on CC_HAS_NAMED_AS
+ #
+ # -fsanitize=kernel-address (KASAN) is at the moment incompatible
+ # with named address spaces - see GCC PR sanitizer/111736.
+ #
+ depends on !KASAN
+
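As a rough illustration of what this option enables (a standalone sketch, not part of the patch; the helper name is made up), a __seg_gs-qualified pointer lets the compiler emit a %gs-prefixed load directly, which is how per-CPU data is addressed on x86-64:

	/* Hypothetical example of GCC named address spaces. */
	static inline unsigned int gs_read_u32(unsigned long offset)
	{
		/* Dereferencing a __seg_gs pointer emits a %gs-prefixed load. */
		return *(const volatile __seg_gs unsigned int *)offset;
	}

With this support, this_cpu_read()-style accessors can be plain C dereferences instead of inline assembly, so the compiler is free to cache and combine them.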
config CC_HAS_SLS
def_bool $(cc-option,-mharden-sls=all)
config FINEIBT
def_bool y
- depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE
+ depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE
select CALL_PADDING
config HAVE_CALL_THUNKS
def_bool y
- depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL
+ depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL
config CALL_THUNKS
def_bool n
if SPECULATION_MITIGATIONS
- config PAGE_TABLE_ISOLATION
+ config MITIGATION_PAGE_TABLE_ISOLATION
bool "Remove the kernel mapping in user mode"
default y
depends on (X86_64 || X86_PAE)
See Documentation/arch/x86/pti.rst for more details.
- config RETPOLINE
+ config MITIGATION_RETPOLINE
bool "Avoid speculative indirect branches in kernel"
select OBJTOOL if HAVE_OBJTOOL
default y
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
- config RETHUNK
+ config MITIGATION_RETHUNK
bool "Enable return-thunks"
- depends on RETPOLINE && CC_HAS_RETURN_THUNK
+ depends on MITIGATION_RETPOLINE && CC_HAS_RETURN_THUNK
select OBJTOOL if HAVE_OBJTOOL
default y if X86_64
help
Requires a compiler with -mfunction-return=thunk-extern
support for full protection. The kernel may run slower.
- config CPU_UNRET_ENTRY
+ config MITIGATION_UNRET_ENTRY
bool "Enable UNRET on kernel entry"
- depends on CPU_SUP_AMD && RETHUNK && X86_64
+ depends on CPU_SUP_AMD && MITIGATION_RETHUNK && X86_64
default y
help
Compile the kernel with support for the retbleed=unret mitigation.
- config CALL_DEPTH_TRACKING
+ config MITIGATION_CALL_DEPTH_TRACKING
bool "Mitigate RSB underflow with call depth tracking"
depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS
select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
config CALL_THUNKS_DEBUG
bool "Enable call thunks and call depth tracking debugging"
- depends on CALL_DEPTH_TRACKING
+ depends on MITIGATION_CALL_DEPTH_TRACKING
select FUNCTION_ALIGNMENT_32B
default n
help
Only enable this when you are debugging call thunks as this
creates a noticeable runtime overhead. If unsure say N.
- config CPU_IBPB_ENTRY
+ config MITIGATION_IBPB_ENTRY
bool "Enable IBPB on kernel entry"
depends on CPU_SUP_AMD && X86_64
default y
help
Compile the kernel with support for the retbleed=ibpb mitigation.
- config CPU_IBRS_ENTRY
+ config MITIGATION_IBRS_ENTRY
bool "Enable IBRS on kernel entry"
depends on CPU_SUP_INTEL && X86_64
default y
This mitigates both spectre_v2 and retbleed at great cost to
performance.
- config CPU_SRSO
+ config MITIGATION_SRSO
bool "Mitigate speculative RAS overflow on AMD"
- depends on CPU_SUP_AMD && X86_64 && RETHUNK
+ depends on CPU_SUP_AMD && X86_64 && MITIGATION_RETHUNK
default y
help
Enable the SRSO mitigation needed on AMD Zen1-4 machines.
- config SLS
+ config MITIGATION_SLS
bool "Mitigate Straight-Line-Speculation"
depends on CC_HAS_SLS && X86_64
select OBJTOOL if HAVE_OBJTOOL
against straight line speculation. The kernel image might be slightly
larger.
- config GDS_FORCE_MITIGATION
+ config MITIGATION_GDS_FORCE
bool "Force GDS Mitigation"
depends on CPU_SUP_INTEL
default n
endif
RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
- ifdef CONFIG_RETHUNK
+ ifdef CONFIG_MITIGATION_RETHUNK
RETHUNK_CFLAGS := -mfunction-return=thunk-extern
RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS)
endif
# temporary until string.h is fixed
KBUILD_CFLAGS += -ffreestanding
- ifeq ($(CONFIG_STACKPROTECTOR),y)
- ifeq ($(CONFIG_SMP),y)
+ ifeq ($(CONFIG_STACKPROTECTOR),y)
+ ifeq ($(CONFIG_SMP),y)
KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
- else
+ else
KBUILD_CFLAGS += -mstack-protector-guard=global
- endif
- endif
+ endif
+ endif
else
BITS := 64
UTS_MACHINE := x86_64
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# Avoid indirect branches in kernel to deal with Spectre
- ifdef CONFIG_RETPOLINE
+ ifdef CONFIG_MITIGATION_RETPOLINE
KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
# Additionally, avoid generating expensive indirect jumps which
# are subject to retpolines for small number of switch cases.
endif
endif
- ifdef CONFIG_SLS
+ ifdef CONFIG_MITIGATION_SLS
KBUILD_CFLAGS += -mharden-sls=all
endif
archprepare: checkbin
checkbin:
- ifdef CONFIG_RETPOLINE
+ ifdef CONFIG_MITIGATION_RETPOLINE
ifeq ($(RETPOLINE_CFLAGS),)
@echo "You are building kernel with non-retpoline compiler." >&2
@echo "Please update your compiler." >&2
.endif
.endm
- #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
- * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
* halves:
*/
#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
- /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
.endm
.endm
#define THIS_CPU_user_pcid_flush_mask \
- PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+ PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask)
.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
mov %cr3, \scratch_reg
.Lend_\@:
.endm
- #else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+ #else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
* Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
*/
.macro IBRS_ENTER save_reg
- #ifdef CONFIG_CPU_IBRS_ENTRY
+ #ifdef CONFIG_MITIGATION_IBRS_ENTRY
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
movl $MSR_IA32_SPEC_CTRL, %ecx
* regs. Must be called after the last RET.
*/
.macro IBRS_EXIT save_reg
- #ifdef CONFIG_CPU_IBRS_ENTRY
+ #ifdef CONFIG_MITIGATION_IBRS_ENTRY
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
movl $MSR_IA32_SPEC_CTRL, %ecx
.endm
#endif /* CONFIG_SMP */
+
+ #ifdef CONFIG_X86_64
+
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro THUNK name, func
+ SYM_FUNC_START(\name)
+ pushq %rbp
+ movq %rsp, %rbp
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+
+ call \func
+
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ popq %rbp
+ RET
+ SYM_FUNC_END(\name)
+ _ASM_NOKPROBE(\name)
+ .endm
+
+ #else /* CONFIG_X86_32 */
+
+ /* put return address in eax (arg1) */
+ .macro THUNK name, func, put_ret_addr_in_eax=0
+ SYM_CODE_START_NOALIGN(\name)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ .if \put_ret_addr_in_eax
+ /* Place EIP in the arg1 */
+ movl 3*4(%esp), %eax
+ .endif
+
+ call \func
+ popl %edx
+ popl %ecx
+ popl %eax
+ RET
+ _ASM_NOKPROBE(\name)
+ SYM_CODE_END(\name)
+ .endm
+
+ #endif
#ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx
- movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary)
#endif
/*
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
- #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
#endif
jnz .Lnative_iret
ud2
- #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
.Lpti_restore_regs_and_return_to_usermode:
POP_REGS pop_rdi=0
*
* Registers:
* %r14: Used to save/restore the CR3 of the interrupted context
- * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
+ * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
SYM_CODE_START(asm_exc_nmi)
UNWIND_HINT_IRET_ENTRY
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H
+#include <linux/build_bug.h>
#include <linux/compiler.h>
#ifndef __ASSEMBLY__
struct task_struct *current_task;
int preempt_count;
int cpu_number;
- #ifdef CONFIG_CALL_DEPTH_TRACKING
+ #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
u64 call_depth;
#endif
unsigned long top_of_stack;
DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
+/* const-qualified alias to pcpu_hot, aliased by linker. */
+DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
+ const_pcpu_hot);
+
static __always_inline struct task_struct *get_current(void)
{
+ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+ return this_cpu_read_const(const_pcpu_hot.current_task);
+
return this_cpu_read_stable(pcpu_hot.current_task);
}
#ifdef CONFIG_CALL_THUNKS_DEBUG
# define CALL_THUNKS_DEBUG_INC_CALLS \
- incq %gs:__x86_call_count;
+ incq PER_CPU_VAR(__x86_call_count);
# define CALL_THUNKS_DEBUG_INC_RETS \
- incq %gs:__x86_ret_count;
+ incq PER_CPU_VAR(__x86_ret_count);
# define CALL_THUNKS_DEBUG_INC_STUFFS \
- incq %gs:__x86_stuffs_count;
+ incq PER_CPU_VAR(__x86_stuffs_count);
# define CALL_THUNKS_DEBUG_INC_CTXSW \
- incq %gs:__x86_ctxsw_count;
+ incq PER_CPU_VAR(__x86_ctxsw_count);
#else
# define CALL_THUNKS_DEBUG_INC_CALLS
# define CALL_THUNKS_DEBUG_INC_RETS
# define CALL_THUNKS_DEBUG_INC_CTXSW
#endif
- #if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
+ #if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
#include <asm/asm-offsets.h>
#define CREDIT_CALL_DEPTH \
movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
-#define ASM_CREDIT_CALL_DEPTH \
- movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
-
#define RESET_CALL_DEPTH \
xor %eax, %eax; \
bts $63, %rax; \
CALL_THUNKS_DEBUG_INC_CALLS
#define INCREMENT_CALL_DEPTH \
- sarq $5, %gs:pcpu_hot + X86_call_depth; \
- CALL_THUNKS_DEBUG_INC_CALLS
-
-#define ASM_INCREMENT_CALL_DEPTH \
sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
CALL_THUNKS_DEBUG_INC_CALLS
#else
#define CREDIT_CALL_DEPTH
-#define ASM_CREDIT_CALL_DEPTH
#define RESET_CALL_DEPTH
-#define INCREMENT_CALL_DEPTH
-#define ASM_INCREMENT_CALL_DEPTH
#define RESET_CALL_DEPTH_FROM_CALL
+#define INCREMENT_CALL_DEPTH
#endif
/*
jnz 771b; \
/* barrier for jnz misprediction */ \
lfence; \
- ASM_CREDIT_CALL_DEPTH \
+ CREDIT_CALL_DEPTH \
CALL_THUNKS_DEBUG_INC_CTXSW
#else
/*
*/
.macro VALIDATE_UNRET_END
#if defined(CONFIG_NOINSTR_VALIDATION) && \
- (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
+ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
ANNOTATE_RETPOLINE_SAFE
nop
#endif
* instruction irrespective of kCFI.
*/
.macro JMP_NOSPEC reg:req
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
__CS_PREFIX \reg
jmp __x86_indirect_thunk_\reg
#else
.endm
.macro CALL_NOSPEC reg:req
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
__CS_PREFIX \reg
call __x86_indirect_thunk_\reg
#else
.Lskip_rsb_\@:
.endm
- #if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
+ #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
#define CALL_UNTRAIN_RET "call entry_untrain_ret"
#else
#define CALL_UNTRAIN_RET ""
* where we have a stack but before any RET instruction.
*/
.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
- #if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY)
+ #if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY)
VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
.macro CALL_DEPTH_ACCOUNT
- #ifdef CONFIG_CALL_DEPTH_TRACKING
+ #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
ALTERNATIVE "", \
- __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+ __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
.endm
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
- #ifdef CONFIG_RETHUNK
+ #ifdef CONFIG_MITIGATION_RETHUNK
extern void __x86_return_thunk(void);
#else
static inline void __x86_return_thunk(void) {}
#endif
- #ifdef CONFIG_CPU_UNRET_ENTRY
+ #ifdef CONFIG_MITIGATION_UNRET_ENTRY
extern void retbleed_return_thunk(void);
#else
static inline void retbleed_return_thunk(void) {}
#endif
- #ifdef CONFIG_CPU_SRSO
+ #ifdef CONFIG_MITIGATION_SRSO
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
#else
extern void (*x86_return_thunk)(void);
- #ifdef CONFIG_CALL_DEPTH_TRACKING
+ extern void __warn_thunk(void);
+
+ #ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
extern void call_depth_return_thunk(void);
#define CALL_DEPTH_ACCOUNT \
DECLARE_PER_CPU(u64, __x86_stuffs_count);
DECLARE_PER_CPU(u64, __x86_ctxsw_count);
#endif
- #else /* !CONFIG_CALL_DEPTH_TRACKING */
+ #else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
static inline void call_depth_return_thunk(void) {}
#define CALL_DEPTH_ACCOUNT ""
- #endif /* CONFIG_CALL_DEPTH_TRACKING */
+ #endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
#define GEN(reg) \
extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
/*
* Inline asm uses the %V modifier which is only in newer GCC
- * which is ensured when CONFIG_RETPOLINE is defined.
+ * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
*/
# define CALL_NOSPEC \
ALTERNATIVE_2( \
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
int __read_mostly alternatives_patched;
#define DA_ENDBR 0x08
#define DA_SMP 0x10
-static unsigned int __initdata_or_module debug_alternative;
+static unsigned int debug_alternative;
static int __init debug_alt(char *str)
{
* each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
* *jump* over instead of executing long and daft NOPs.
*/
-static void __init_or_module add_nop(u8 *instr, unsigned int len)
+static void add_nop(u8 *instr, unsigned int len)
{
u8 *target = instr + len;
* Optimize a sequence of NOPs, possibly preceded by an unconditional jump
* to the end of the NOP sequence into a single NOP.
*/
-static bool __init_or_module
+static bool
__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
{
int i = *next - insn->length;
return (target < src || target > src + src_len);
}
-static void __init_or_module noinline
-apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
{
int prev, target = 0;
{
BUG();
}
-EXPORT_SYMBOL_GPL(BUG_func);
+EXPORT_SYMBOL(BUG_func);
#define CALL_RIP_REL_OPCODE 0xff
#define CALL_RIP_REL_MODRM 0x15
return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
}
- #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
+ #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
/*
* CALL/JMP *%\reg
/*
* The compiler is supposed to EMIT an INT3 after every unconditional
* JMP instruction due to AMD BTC. However, if the compiler is too old
- * or SLS isn't enabled, we still need an INT3 after indirect JMPs
- * even on Intel.
+ * or MITIGATION_SLS isn't enabled, we still need an INT3 after
+ * indirect JMPs even on Intel.
*/
if (op == JMP32_INSN_OPCODE && i < insn->length)
bytes[i++] = INT3_INSN_OPCODE;
}
}
- #ifdef CONFIG_RETHUNK
+ #ifdef CONFIG_MITIGATION_RETHUNK
/*
* Rewrite the compiler generated return thunk tail-calls.
}
#else
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
- #endif /* CONFIG_RETHUNK */
+ #endif /* CONFIG_MITIGATION_RETHUNK */
- #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
+ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
- #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
+ #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
#ifdef CONFIG_X86_KERNEL_IBT
#endif /* CONFIG_X86_KERNEL_IBT */
#ifdef CONFIG_FINEIBT
+#define __CFI_DEFAULT CFI_DEFAULT
+#elif defined(CONFIG_CFI_CLANG)
+#define __CFI_DEFAULT CFI_KCFI
+#else
+#define __CFI_DEFAULT CFI_OFF
+#endif
-enum cfi_mode {
- CFI_DEFAULT,
- CFI_OFF,
- CFI_KCFI,
- CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+ const struct bpf_insn *insn);
+
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typeid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
+
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_hash,@object \n"
+" .globl cfi_bpf_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_hash: \n"
+" .long __kcfi_typeid___bpf_prog_runX \n"
+" .size cfi_bpf_hash, 4 \n"
+" .popsection \n"
+);
+
+/* Must match bpf_callback_t */
+extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
+
+__ADDRESSABLE(__bpf_callback_fn);
+
+/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_subprog_hash,@object \n"
+" .globl cfi_bpf_subprog_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_subprog_hash: \n"
+" .long __kcfi_typeid___bpf_callback_fn \n"
+" .size cfi_bpf_subprog_hash, 4 \n"
+" .popsection \n"
+);
+
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+#endif
+
+#ifdef CONFIG_FINEIBT
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
goto err;
if (cfi_rand) {
- if (builtin)
+ if (builtin) {
cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
ret = cfi_rand_preamble(start_cfi, end_cfi);
if (ret)
/* Figure out Zen generations: */
switch (c->x86) {
- case 0x17: {
+ case 0x17:
switch (c->x86_model) {
case 0x00 ... 0x2f:
case 0x50 ... 0x5f:
goto warn;
}
break;
- }
- case 0x19: {
+
+ case 0x19:
switch (c->x86_model) {
case 0x00 ... 0x0f:
case 0x20 ... 0x5f:
goto warn;
}
break;
- }
+
+ case 0x1a:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x2f:
+ case 0x40 ... 0x4f:
+ case 0x70 ... 0x7f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN5);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
default:
break;
}
void init_spectral_chicken(struct cpuinfo_x86 *c)
{
- #ifdef CONFIG_CPU_UNRET_ENTRY
+ #ifdef CONFIG_MITIGATION_UNRET_ENTRY
u64 value;
/*
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
}
+static void init_amd_zen5(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
u64 vm_cr;
init_amd_zen3(c);
else if (boot_cpu_has(X86_FEATURE_ZEN4))
init_amd_zen4(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN5))
+ init_amd_zen5(c);
/*
* Enable workaround for FXSAVE leak on CPUs
UNWIND_HINT_END_OF_STACK
/* Find the idle task stack */
- movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+ movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
movq TASK_threadsp(%rcx), %rsp
jmp .Ljump_to_C_code
#define SYM_DATA_START_PAGE_ALIGNED(name) \
SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
- #ifdef CONFIG_PAGE_TABLE_ISOLATION
+ #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
#endif
jiffies = jiffies_64;
+const_pcpu_hot = pcpu_hot;
#if defined(CONFIG_X86_64)
/*
LOCK_TEXT
KPROBES_TEXT
SOFTIRQENTRY_TEXT
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
*(.text..__x86.indirect_thunk)
*(.text..__x86.return_thunk)
#endif
*(.text..__x86.rethunk_untrain)
ENTRY_TEXT
- #ifdef CONFIG_CPU_SRSO
+ #ifdef CONFIG_MITIGATION_SRSO
/*
* See the comment above srso_alias_untrain_ret()'s
* definition.
}
#endif
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
/*
* List of instructions that call/jmp/jcc to retpoline thunks
* __x86_indirect_thunk_*(). These instructions can be patched along
"fixed_percpu_data is not at start of per-cpu area");
#endif
- #ifdef CONFIG_CPU_UNRET_ENTRY
+ #ifdef CONFIG_MITIGATION_UNRET_ENTRY
. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
#endif
- #ifdef CONFIG_CPU_SRSO
+ #ifdef CONFIG_MITIGATION_SRSO
. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
/*
* GNU ld cannot do XOR until 2.41.
static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu)
{
- if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
return kvm_read_cr3(vcpu);
return mmu->get_guest_pgd(vcpu);
static inline bool kvm_available_flush_remote_tlbs_range(void)
{
+#if IS_ENABLED(CONFIG_HYPERV)
return kvm_x86_ops.flush_remote_tlbs_range;
-}
-
-int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
-{
- if (!kvm_x86_ops.flush_remote_tlbs_range)
- return -EOPNOTSUPP;
-
- return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+#else
+ return false;
+#endif
}
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
return &slot->arch.lpage_info[level - 2][idx];
}
+/*
+ * The most significant bit in disallow_lpage tracks whether or not memory
+ * attributes are mixed, i.e. not identical for all gfns at the current level.
+ * The lower order bits are used to refcount other cases where a hugepage is
+ * disallowed, e.g. if KVM has a shadow page table at the gfn.
+ */
+#define KVM_LPAGE_MIXED_FLAG BIT(31)
+
static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
gfn_t gfn, int count)
{
struct kvm_lpage_info *linfo;
- int i;
+ int old, i;
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
+
+ old = linfo->disallow_lpage;
linfo->disallow_lpage += count;
- WARN_ON_ONCE(linfo->disallow_lpage < 0);
+ WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
}
}
gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
if (READ_ONCE(eager_page_split))
- kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+ kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);
kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
/*
* Recheck after taking the spinlock, a different vCPU
* may have since marked the page unsync. A false
- * positive on the unprotected check above is not
+ * negative on the unprotected check above is not
* possible as clearing sp->unsync _must_ hold mmu_lock
- * for write, i.e. unsync cannot transition from 0->1
+ * for write, i.e. unsync cannot transition from 1->0
* while this CPU holds mmu_lock for read (or write).
*/
if (READ_ONCE(sp->unsync))
*
* There are several ways to safely use this helper:
*
- * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
+ * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
* consuming it. In this case, mmu_lock doesn't need to be held during the
* lookup, but it does need to be held while checking the MMU notifier.
*
return level;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot, gfn_t gfn,
- int max_level)
+static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, int max_level, bool is_private)
{
struct kvm_lpage_info *linfo;
int host_level;
break;
}
+ if (is_private)
+ return max_level;
+
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
return min(host_level, max_level);
}
+int kvm_mmu_max_mapping_level(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ int max_level)
+{
+ bool is_private = kvm_slot_can_be_private(slot) &&
+ kvm_mem_is_private(kvm, gfn);
+
+ return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+}
+
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
- fault->gfn, fault->max_level);
+ fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+ fault->gfn, fault->max_level,
+ fault->is_private);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;
return;
if (is_tdp_mmu_page(sp))
- kvm_tdp_mmu_put_root(kvm, sp, false);
+ kvm_tdp_mmu_put_root(kvm, sp);
else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
kvm_page_track_write_tracking_enabled(kvm))
goto out_success;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(slot, bkt, slots) {
/*
hpa_t root;
root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
- root_gfn = root_pgd >> PAGE_SHIFT;
+ root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
mmu->root.hpa = kvm_mmu_get_dummy_root();
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
}
+static inline u8 kvm_max_level_for_order(int order)
+{
+ BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+ KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+ return PG_LEVEL_1G;
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
+static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int max_order, r;
+
+ if (!kvm_slot_can_be_private(fault->slot)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
+ &max_order);
+ if (r) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return r;
+ }
+
+ fault->max_level = min(kvm_max_level_for_order(max_order),
+ fault->max_level);
+ fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+
+ return RET_PF_CONTINUE;
+}
+
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
return RET_PF_EMULATE;
}
+ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (fault->is_private)
+ return kvm_faultin_pfn_private(vcpu, fault);
+
async = false;
fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
fault->write, &fault->map_writable,
return true;
return fault->slot &&
- mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
+ mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
}
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (!kvm_memslots_have_rmaps(kvm))
return flush;
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
write_lock(&kvm->mmu_lock);
- kvm_mmu_invalidate_begin(kvm, 0, -1ul);
+ kvm_mmu_invalidate_begin(kvm);
+
+ kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
if (flush)
kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
- kvm_mmu_invalidate_end(kvm, 0, -1ul);
+ kvm_mmu_invalidate_end(kvm);
write_unlock(&kvm->mmu_lock);
}
* modifier prior to checking for a wrap of the MMIO generation so
* that a wrap in any address space is detected.
*/
- gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+ gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
/*
* The very rare case: if the MMIO generation number has wrapped,
if (kvm->arch.nx_huge_page_recovery_thread)
kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
+
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ /*
+ * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only
+ * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
+ * can simply ignore such slots. But if userspace is making memory
+ * PRIVATE, then KVM must prevent the guest from accessing the memory
+ * as shared. And if userspace is making memory SHARED and this point
+ * is reached, then at least one page within the range was previously
+ * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
+ * Zapping SPTEs in this case ensures KVM will reassess whether or not
+ * a hugepage can be used for affected ranges.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ return kvm_unmap_gfn_range(kvm, range);
+}
+
+static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
+}
+
+static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
+ int level)
+{
+ lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
+}
+
+static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, int level, unsigned long attrs)
+{
+ const unsigned long start = gfn;
+ const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
+
+ if (level == PG_LEVEL_2M)
+ return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+
+ for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
+ if (hugepage_test_mixed(slot, gfn, level - 1) ||
+ attrs != kvm_get_memory_attributes(kvm, gfn))
+ return false;
+ }
+ return true;
+}
+
+bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
+ struct kvm_gfn_range *range)
+{
+ unsigned long attrs = range->arg.attributes;
+ struct kvm_memory_slot *slot = range->slot;
+ int level;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * Calculate which ranges can be mapped with hugepages even if the slot
+ * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over
+ * a range that has PRIVATE GFNs, and conversely converting a range to
+ * SHARED may now allow hugepages.
+ */
+ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
+ return false;
+
+ /*
+ * The sequence matters here: upper levels consume the result of lower
+ * level's scanning.
+ */
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn = gfn_round_for_level(range->start, level);
+
+ /* Process the head page if it straddles the range. */
+ if (gfn != range->start || gfn + nr_pages > range->end) {
+ /*
+ * Skip mixed tracking if the aligned gfn isn't covered
+ * by the memslot, KVM can't use a hugepage due to the
+ * misaligned address regardless of memory attributes.
+ */
+ if (gfn >= slot->base_gfn) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ gfn += nr_pages;
+ }
+
+ /*
+ * Pages entirely covered by the range are guaranteed to have
+ * only the attributes which were just set.
+ */
+ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
+ hugepage_clear_mixed(slot, gfn, level);
+
+ /*
+ * Process the last tail page if it straddles the range and is
+ * contained by the memslot. Like the head page, KVM can't
+ * create a hugepage if the slot size is misaligned.
+ */
+ if (gfn < range->end &&
+ (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+ return false;
+}
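To make the head/tail handling above concrete (hypothetical numbers): if userspace converts gfns 0x300-0x4ff to PRIVATE inside a large, aligned memslot, then at the 2M level (0x200 gfns per hugepage) the head page [0x200, 0x400) straddles the range and is re-scanned with hugepage_has_attrs(), no hugepage is entirely covered (0x400 + 0x200 exceeds the end gfn 0x500), and the tail page [0x400, 0x600) is likewise re-scanned and typically ends up marked mixed.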
+
+void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ int level;
+
+ if (!kvm_arch_has_private_mem(kvm))
+ return;
+
+ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ /*
+ * Don't bother tracking mixed attributes for pages that can't
+ * be huge due to alignment, i.e. process only pages that are
+ * entirely contained by the memslot.
+ */
+ gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
+ gfn_t start = gfn_round_for_level(slot->base_gfn, level);
+ gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
+ gfn_t gfn;
+
+ if (start < slot->base_gfn)
+ start += nr_pages;
+
+ /*
+ * Unlike setting attributes, every potential hugepage needs to
+ * be manually checked as the attributes may already be mixed.
+ */
+ for (gfn = start; gfn < end; gfn += nr_pages) {
+ unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
+
+ if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
+ hugepage_clear_mixed(slot, gfn, level);
+ else
+ hugepage_set_mixed(slot, gfn, level);
+ }
+ }
+}
+#endif
#endif
/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
+#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12)
#define __PT_LEVEL_SHIFT(level, bits_per_level) \
(PAGE_SHIFT + ((level) - 1) * (bits_per_level))
#define __PT_INDEX(address, level, bits_per_level) \
/* Derived from mmu and global state. */
const bool is_tdp;
+ const bool is_private;
const bool nx_huge_page_workaround_enabled;
/*
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
+ .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
};
int r;
if (!prefetch)
vcpu->stat.pf_taken++;
- if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
if (!svm_check_exit_valid(exit_code))
return svm_handle_invalid_exit(vcpu, exit_code);
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
return msr_interception(vcpu);
else if (exit_code == SVM_EXIT_VINTR)
if (svm->nmi_l1_to_l2)
return;
- svm->nmi_masked = true;
- svm_set_iret_intercept(svm);
+ /*
+ * No need to manually track NMI masking when vNMI is enabled, hardware
+ * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
+ * case where software directly injects an NMI.
+ */
+ if (!is_vnmi_enabled(svm)) {
+ svm->nmi_masked = true;
+ svm_set_iret_intercept(svm);
+ }
++vcpu->stat.nmi_injections;
}
kvm_cpu_cap_set(X86_FEATURE_SVM);
kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
+ /*
+ * KVM currently flushes TLBs on *every* nested SVM transition,
+ * and so for all intents and purposes KVM supports flushing by
+ * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
+ */
+ kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);
+
if (nrips)
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
7: vmload %_ASM_AX
8:
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
RESTORE_GUEST_SPEC_CTRL_BODY
RESTORE_HOST_SPEC_CTRL_BODY
-10: cmpb $0, kvm_rebooting
+10: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 2b
ud2
-30: cmpb $0, kvm_rebooting
+30: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 4b
ud2
-50: cmpb $0, kvm_rebooting
+50: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 6b
ud2
-70: cmpb $0, kvm_rebooting
+70: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 8b
ud2
/* Pop @svm to RDI, guest registers have been saved already. */
pop %_ASM_DI
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
RESTORE_GUEST_SPEC_CTRL_BODY
RESTORE_HOST_SPEC_CTRL_BODY
-3: cmpb $0, kvm_rebooting
+3: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 2b
ud2
#include "vmx.h"
#include "x86.h"
#include "smm.h"
+#include "vmx_onhyperv.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
- struct hv_partition_assist_pg **p_hv_pa_pg =
- &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
- /*
- * Synthetic VM-Exit is not enabled in current code and so All
- * evmcs in singe VM shares same assist page.
- */
- if (!*p_hv_pa_pg)
- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
+ hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
- if (!*p_hv_pa_pg)
+ if (partition_assist_page == INVALID_PAGE)
return -ENOMEM;
evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
- evmcs->partition_assist_page =
- __pa(*p_hv_pa_pg);
+ evmcs->partition_assist_page = partition_assist_page;
evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
*/
static int kvm_cpu_vmxoff(void)
{
- asm_volatile_goto("1: vmxoff\n\t"
+ asm goto("1: vmxoff\n\t"
_ASM_EXTABLE(1b, %l[fault])
::: "cc", "memory" : fault);
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
return 1;
+#ifdef CONFIG_KVM_HYPERV
/*
* Enlightened VMCS v1 doesn't have certain VMCS fields but
* instead of just ignoring the features, different Hyper-V
if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data);
+#endif
break;
case MSR_IA32_RTIT_CTL:
if (!vmx_pt_mode_is_host_guest())
cr4_set_bits(X86_CR4_VMXE);
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ asm goto("1: vmxon %[vmxon_pointer]\n\t"
_ASM_EXTABLE(1b, %l[fault])
: : [vmxon_pointer] "m"(vmxon_pointer)
: : fault);
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
- guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu);
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
+ kvm_get_active_cr3_lam_bits(vcpu);
}
if (update_guest_cr3)
vmx->nested.posted_intr_nv = -1;
vmx->nested.vmxon_ptr = INVALID_GPA;
vmx->nested.current_vmptr = INVALID_GPA;
+
+#ifdef CONFIG_KVM_HYPERV
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+#endif
vcpu->arch.microcode_version = 0x100000000ULL;
vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
* would also use advanced VM-exit information for EPT violations to
* reconstruct the page fault error code.
*/
- if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
+ if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
goto unexpected_vmexit;
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
return;
/*
- * Grab the memslot so that the hva lookup for the mmu_notifier retry
- * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
- * on the pfn lookup's validation of the memslot to ensure a valid hva
- * is used for the retry check.
+ * Explicitly grab the memslot using KVM's internal slot ID to ensure
+ * KVM doesn't unintentionally grab a userspace memslot. It _should_
+ * be impossible for userspace to create a memslot for the APIC when
+ * APICv is enabled, but paranoia won't hurt in this case.
*/
slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return;
read_lock(&vcpu->kvm->mmu_lock);
- if (mmu_invalidate_retry_hva(kvm, mmu_seq,
- gfn_to_hva_memslot(slot, gfn))) {
+ if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
read_unlock(&vcpu->kvm->mmu_lock);
goto out;
cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
+ cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
+
#undef cr4_fixed1_update
}
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
+ kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
vmx_setup_uret_msrs(vmx);
free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
}
+/*
+ * Note, the SDM states that the linear address is masked *after* the modified
+ * canonicality check, whereas KVM masks (untags) the address and then performs
+ * a "normal" canonicality check. Functionally, the two methods are identical,
+ * and when the masking occurs relative to the canonicality check isn't visible
+ * to software, i.e. KVM's behavior doesn't violate the SDM.
+ */
+gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
+{
+ int lam_bit;
+ unsigned long cr3_bits;
+
+ if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
+ return gva;
+
+ if (!is_64_bit_mode(vcpu))
+ return gva;
+
+ /*
+ * Bit 63 determines if the address should be treated as user address
+ * or a supervisor address.
+ */
+ if (!(gva & BIT_ULL(63))) {
+ cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
+ if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
+ return gva;
+
+ /* LAM_U48 is ignored if LAM_U57 is set. */
+ lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
+ } else {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
+ return gva;
+
+ lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
+ }
+
+ /*
+ * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
+ * Bit 63 is retained from the raw virtual address so that untagging
+ * doesn't change a user access to a supervisor access, and vice versa.
+ */
+ return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
+}
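To illustrate the sign-extend-then-restore-bit-63 step above, here is a standalone sketch (not kernel code; the function name is made up) of the LAM_U57 case, where lam_bit is 56:

	#include <stdint.h>

	/* Replace the tag bits 62:57 with copies of bit 56, but keep the
	 * original bit 63 so a user address can never become a supervisor
	 * address (or vice versa). */
	static uint64_t untag_lam_u57(uint64_t gva)
	{
		int64_t ext = (int64_t)(gva << 7) >> 7;	/* sign_extend64(gva, 56) */

		return ((uint64_t)ext & ~(1ULL << 63)) | (gva & (1ULL << 63));
	}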
+
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.name = KBUILD_MODNAME,
.complete_emulated_msr = kvm_complete_insn_gp,
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .get_untagged_addr = vmx_get_untagged_addr,
};
static unsigned int vmx_handle_intel_pt_intr(void)
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
#include <asm/unwind.h>
+#include <asm/cfi.h>
static bool all_callee_regs_used[4] = {true, true, true, true};
do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
#ifdef CONFIG_X86_KERNEL_IBT
-#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4)
#else
#define EMIT_ENDBR()
+#define EMIT_ENDBR_POISON()
#endif
static bool is_imm8(int value)
*pprog = prog;
}
+static void emit_nops(u8 **pprog, int len)
+{
+ u8 *prog = *pprog;
+ int i, noplen;
+
+ while (len > 0) {
+ noplen = len;
+
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+
+ for (i = 0; i < noplen; i++)
+ EMIT1(x86_nops[noplen][i]);
+ len -= noplen;
+ }
+
+ *pprog = prog;
+}
+
+/*
+ * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
+ * in arch/x86/kernel/alternative.c
+ */
+
+static void emit_fineibt(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ EMIT_ENDBR();
+ EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */
+ EMIT2(0x74, 0x07); /* jz.d8 +7 */
+ EMIT2(0x0f, 0x0b); /* ud2 */
+ EMIT1(0x90); /* nop */
+ EMIT_ENDBR_POISON();
+
+ *pprog = prog;
+}
+
+static void emit_kcfi(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ EMIT1_off32(0xb8, hash); /* movl $hash, %eax */
+#ifdef CONFIG_CALL_PADDING
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+#endif
+ EMIT_ENDBR();
+
+ *pprog = prog;
+}
+
+static void emit_cfi(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ emit_fineibt(&prog, hash);
+ break;
+
+ case CFI_KCFI:
+ emit_kcfi(&prog, hash);
+ break;
+
+ default:
+ EMIT_ENDBR();
+ break;
+ }
+
+ *pprog = prog;
+}
+
/*
* Emit x86-64 prologue code for BPF program.
* bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
{
u8 *prog = *pprog;
+ emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
- EMIT_ENDBR();
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ emit_nops(&prog, X86_PATCH_SIZE);
if (!ebpf_from_cbpf) {
if (tail_call_reachable && !is_subprog)
/* When it's the entry of the whole tailcall context,
emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
} else {
EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */
- if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_MITIGATION_SLS))
EMIT1(0xCC); /* int3 */
}
emit_jump(&prog, x86_return_thunk, ip);
} else {
EMIT1(0xC3); /* ret */
- if (IS_ENABLED(CONFIG_SLS))
+ if (IS_ENABLED(CONFIG_MITIGATION_SLS))
EMIT1(0xCC); /* int3 */
}
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ emit_nops(&prog, X86_PATCH_SIZE);
/* out: */
ctx->tail_call_direct_label = prog - start;
}
}
-static void emit_nops(u8 **pprog, int len)
-{
- u8 *prog = *pprog;
- int i, noplen;
-
- while (len > 0) {
- noplen = len;
-
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
-
- for (i = 0; i < noplen; i++)
- EMIT1(x86_nops[noplen][i]);
- len -= noplen;
- }
-
- *pprog = prog;
-}
-
/* emit the 3-byte VEX prefix
*
* r: same as rex.r, extra bit for ModRM reg field
static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_link *l, int stack_size,
- int run_ctx_off, bool save_ret)
+ int run_ctx_off, bool save_ret,
+ void *image, void *rw_image)
{
u8 *prog = *pprog;
u8 *jmp_insn;
else
EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
- if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_enter(p), image + (prog - (u8 *)rw_image)))
return -EINVAL;
/* remember prog start time returned by __bpf_prog_enter */
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
(long) p->insnsi >> 32,
(u32) (long) p->insnsi);
/* call JITed bpf program or interpreter */
- if (emit_rsb_call(&prog, p->bpf_func, prog))
+ if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image)))
return -EINVAL;
/*
EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off);
else
EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
- if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_exit(p), image + (prog - (u8 *)rw_image)))
return -EINVAL;
*pprog = prog;
static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_links *tl, int stack_size,
- int run_ctx_off, bool save_ret)
+ int run_ctx_off, bool save_ret,
+ void *image, void *rw_image)
{
int i;
u8 *prog = *pprog;
for (i = 0; i < tl->nr_links; i++) {
if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
- run_ctx_off, save_ret))
+ run_ctx_off, save_ret, image, rw_image))
return -EINVAL;
}
*pprog = prog;
static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_links *tl, int stack_size,
- int run_ctx_off, u8 **branches)
+ int run_ctx_off, u8 **branches,
+ void *image, void *rw_image)
{
u8 *prog = *pprog;
int i;
emit_mov_imm32(&prog, false, BPF_REG_0, 0);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
for (i = 0; i < tl->nr_links; i++) {
- if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true))
+ if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true,
+ image, rw_image))
return -EINVAL;
/* mod_ret prog stored return value into [rbp - 8]. Emit:
* add rsp, 8 // skip eth_type_trans's frame
* ret // return to its caller
*/
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
- const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_links *tlinks,
- void *func_addr)
+static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
+ void *rw_image_end, void *image,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
{
int i, ret, nr_regs = m->nr_args, stack_size = 0;
int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
u8 *prog;
bool save_ret;
+ /*
+ * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is
+ * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG
+ * because @func_addr.
+ */
+ WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
+ (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
+
/* extra registers for struct arguments */
- for (i = 0; i < m->nr_args; i++)
+ for (i = 0; i < m->nr_args; i++) {
if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+ }
/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
* are passed through regs, the remains are through stack.
orig_call += X86_PATCH_SIZE;
}
- prog = image;
+ prog = rw_image;
- EMIT_ENDBR();
- /*
- * This is the direct-call trampoline, as such it needs accounting
- * for the __fentry__ call.
- */
- x86_call_depth_emit_accounting(&prog, NULL);
+ if (flags & BPF_TRAMP_F_INDIRECT) {
+ /*
+ * Indirect call for bpf_struct_ops
+ */
+ emit_cfi(&prog, cfi_get_func_hash(func_addr));
+ } else {
+ /*
+ * Direct-call fentry stub, as such it needs accounting for the
+ * __fentry__ call.
+ */
+ x86_call_depth_emit_accounting(&prog, NULL);
+ }
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
- if (!is_imm8(stack_size))
+ if (!is_imm8(stack_size)) {
/* sub rsp, stack_size */
EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
- else
+ } else {
/* sub rsp, stack_size */
EMIT4(0x48, 0x83, 0xEC, stack_size);
+ }
if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
EMIT1(0x50); /* push rax */
/* mov QWORD PTR [rbp - rbx_off], rbx */
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_enter,
+ image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
}
- if (fentry->nr_links)
+ if (fentry->nr_links) {
if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
- flags & BPF_TRAMP_F_RET_FENTRY_RET))
+ flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
return -EINVAL;
+ }
if (fmod_ret->nr_links) {
branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
return -ENOMEM;
if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
- run_ctx_off, branches)) {
+ run_ctx_off, branches, image, rw_image)) {
ret = -EINVAL;
goto cleanup;
}
restore_regs(m, &prog, regs_off);
save_args(m, &prog, arg_stack_off, true);
- if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before calling the original function, restore the
* tail_call_cnt from stack to rax.
*/
RESTORE_TAIL_CALL_CNT(stack_size);
+ }
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
EMIT2(0xff, 0xd3); /* call *rbx */
} else {
/* call original function */
- if (emit_rsb_call(&prog, orig_call, prog)) {
+ if (emit_rsb_call(&prog, orig_call, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
}
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- im->ip_after_call = prog;
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ im->ip_after_call = image + (prog - (u8 *)rw_image);
+ emit_nops(&prog, X86_PATCH_SIZE);
}
if (fmod_ret->nr_links) {
/* Update the branches saved in invoke_bpf_mod_ret with the
* aligned address of do_fexit.
*/
- for (i = 0; i < fmod_ret->nr_links; i++)
- emit_cond_near_jump(&branches[i], prog, branches[i],
- X86_JNE);
+ for (i = 0; i < fmod_ret->nr_links; i++) {
+ emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
+ image + (branches[i] - (u8 *)rw_image), X86_JNE);
+ }
}
- if (fexit->nr_links)
- if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) {
+ if (fexit->nr_links) {
+ if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
+ false, image, rw_image)) {
ret = -EINVAL;
goto cleanup;
}
+ }
if (flags & BPF_TRAMP_F_RESTORE_REGS)
restore_regs(m, &prog, regs_off);
* restored to R0.
*/
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- im->ip_epilogue = prog;
+ im->ip_epilogue = image + (prog - (u8 *)rw_image);
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_exit, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
- } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before running the original function, restore the
* tail_call_cnt from stack to rax.
*/
RESTORE_TAIL_CALL_CNT(stack_size);
+ }
/* restore return value of orig_call or fentry prog back into RAX */
if (save_ret)
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
EMIT1(0xC9); /* leave */
- if (flags & BPF_TRAMP_F_SKIP_FRAME)
+ if (flags & BPF_TRAMP_F_SKIP_FRAME) {
/* skip our return address and return to parent */
EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
- emit_return(&prog, prog);
+ }
+ emit_return(&prog, image + (prog - (u8 *)rw_image));
/* Make sure the trampoline generation logic doesn't overflow */
- if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) {
+ if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
ret = -EFAULT;
goto cleanup;
}
- ret = prog - (u8 *)image;
+ ret = prog - (u8 *)rw_image + BPF_INSN_SAFETY;
cleanup:
kfree(branches);
return ret;
}
+void *arch_alloc_bpf_trampoline(unsigned int size)
+{
+ return bpf_prog_pack_alloc(size, jit_fill_hole);
+}
+
+void arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+ bpf_prog_pack_free(image, size);
+}
+
+void arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+}
+
+void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+}
+
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
+{
+ void *rw_image, *tmp;
+ int ret;
+ u32 size = image_end - image;
+
+ /* rw_image doesn't need to be in module memory range, so we can
+ * use kvmalloc.
+ */
+ rw_image = kvmalloc(size, GFP_KERNEL);
+ if (!rw_image)
+ return -ENOMEM;
+
+ ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m,
+ flags, tlinks, func_addr);
+ if (ret < 0)
+ goto out;
+
+ tmp = bpf_arch_text_copy(image, rw_image, size);
+ if (IS_ERR(tmp))
+ ret = PTR_ERR(tmp);
+out:
+ kvfree(rw_image);
+ return ret;
+}
+
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr)
+{
+ struct bpf_tramp_image im;
+ void *image;
+ int ret;
+
+ /* Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
+ * This will NOT cause fragmentation in direct map, as we do not
+ * call set_memory_*() on this buffer.
+ *
+ * We cannot use kvmalloc here, because we need image to be in
+ * module memory range.
+ */
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!image)
+ return -ENOMEM;
+
+ ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
+ m, flags, tlinks, func_addr);
+ bpf_jit_free_exec(image);
+ return ret;
+}
+
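Purely as an illustration of how the entry points above fit together (build_trampoline() and its error handling are hypothetical; only the arch_* calls come from the code above): size the trampoline with a dry run, allocate packed read-only-executable memory, then let arch_prepare_bpf_trampoline() JIT into a scratch rw_image and text-copy it into place.

    static void *build_trampoline(struct bpf_tramp_image *im,
                                  const struct btf_func_model *m, u32 flags,
                                  struct bpf_tramp_links *tlinks, void *func_addr)
    {
            void *image;
            int size;

            /* Dry run into a temporary executable page to learn the size */
            size = arch_bpf_trampoline_size(m, flags, tlinks, func_addr);
            if (size < 0)
                    return NULL;

            /* Read-only-executable memory from the BPF prog pack */
            image = arch_alloc_bpf_trampoline(size);
            if (!image)
                    return NULL;

            /* JITs into a scratch rw_image and text-copies it into @image */
            if (arch_prepare_bpf_trampoline(im, image, image + size, m, flags,
                                            tlinks, func_addr) < 0) {
                    arch_free_bpf_trampoline(image, size);
                    return NULL;
            }

            return image;
    }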
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
u8 *jg_reloc, *prog = *pprog;
jit_data->header = header;
jit_data->rw_header = rw_header;
}
- prog->bpf_func = (void *)image;
+ /*
+ * ctx.prog_offset is used when CFI preambles put code *before*
+ * the function. See emit_cfi(). For FineIBT specifically this code
+ * can also be executed and bpf_prog_kallsyms_add() will
+ * generate an additional symbol to cover this, hence also
+ * decrement proglen.
+ */
+ prog->bpf_func = (void *)image + cfi_get_offset();
prog->jited = 1;
- prog->jited_len = proglen;
+ prog->jited_len = proglen - cfi_get_offset();
} else {
prog = orig_prog;
}
kvfree(jit_data->addrs);
kfree(jit_data);
}
+ prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
(typeof(ptr)) (__ptr + (off)); \
})
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
#define __noretpoline __attribute__((__indirect_branch__("keep")))
#endif
__builtin_unreachable(); \
} while (0)
+/*
+ * GCC 'asm goto' with outputs miscompiles certain code sequences:
+ *
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422
+ *
+ * Work it around via the same compiler barrier quirk that we used
+ * to use for the old 'asm goto' workaround.
+ *
+ * Also, always mark such 'asm goto' statements as volatile: all
+ * asm goto statements are supposed to be volatile as per the
+ * documentation, but some versions of gcc didn't actually do
+ * that for asms with outputs:
+ *
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619
+ */
+#define asm_goto_output(x...) \
+ do { asm volatile goto(x); asm (""); } while (0)
+
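A minimal usage sketch for the wrapper above (word_has_bit0() is hypothetical, uses x86 AT&T syntax, and assumes a toolchain that accepts output operands on asm goto): it reads like a plain asm goto statement with outputs, just spelled through the macro.

    /* Hypothetical helper: copy *addr into *raw, branch out when bit 0 is clear. */
    static inline bool word_has_bit0(unsigned long *addr, unsigned long *raw)
    {
            asm_goto_output("mov %1, %0\n\t"
                            "test $1, %0\n\t"
                            "jz %l[clear]"
                            : "=r" (*raw)
                            : "m" (*addr)
                            : "cc"
                            : clear);
            return true;
    clear:
            return false;
    }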
#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
#define __HAVE_BUILTIN_BSWAP32__
#define __HAVE_BUILTIN_BSWAP64__
#endif
#define __diag_ignore_all(option, comment) \
- __diag_GCC(8, ignore, option)
+ __diag(__diag_GCC_ignore option)
/*
* Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size"
#ifndef _LINUX_INDIRECT_CALL_WRAPPER_H
#define _LINUX_INDIRECT_CALL_WRAPPER_H
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
/*
* INDIRECT_CALL_$NR - wrapper for indirect calls with $NR known builtin
* @__VA_ARGS__: arguments for @f
*
* Avoid retpoline overhead for known builtin, checking @f vs each of them and
- * eventually invoking directly the builtin function. The functions are check
+ * eventually invoking directly the builtin function. The functions are checked
* in the given order. Fallback to the indirect call.
*/
#define INDIRECT_CALL_1(f, f1, ...) \
* @module: the module we should check for
*
* Only try to get a module reference count if the module is not being removed.
- * This call will fail if the module is already being removed.
+ * This call will fail if the module is in the process of being removed.
*
* Care must also be taken to ensure the module exists and is alive prior to
 * usage of this call. This can be guaranteed through two means:
static inline void module_bug_cleanup(struct module *mod) {}
#endif /* CONFIG_GENERIC_BUG */
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
extern bool retpoline_module_ok(bool has_retpoline);
#else
static inline bool retpoline_module_ok(bool has_retpoline)
#include <net/pkt_cls.h>
- #if IS_ENABLED(CONFIG_RETPOLINE)
+ #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
#include <linux/cpufeature.h>
#include <linux/static_key.h>
if (a->ops->act == tcf_ife_act)
return tcf_ife_act(skb, a, res);
#endif
-#if IS_BUILTIN(CONFIG_NET_ACT_IPT)
- if (a->ops->act == tcf_ipt_act)
- return tcf_ipt_act(skb, a, res);
-#endif
#if IS_BUILTIN(CONFIG_NET_ACT_SIMP)
if (a->ops->act == tcf_simp_act)
return tcf_simp_act(skb, a, res);
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <asm/local64.h>
#include <asm/local.h>
/*
unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
+struct buffer_data_read_page {
+ unsigned order; /* order of the page */
+ struct buffer_data_page *data; /* actual data, stored in this page */
+};
+
/*
* Note, the buffer_page list must be first. The buffer pages
* are allocated in cache lines, which means that each buffer
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
+ unsigned order; /* order of the page */
struct buffer_data_page *page; /* Actual data page */
};
static void free_buffer_page(struct buffer_page *bpage)
{
- free_page((unsigned long)bpage->page);
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
return !!(delta & TS_DELTA_TEST);
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
- struct buffer_data_page field;
-
- trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
-
- return !trace_seq_has_overflowed(s);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
RB_CTX_MAX
};
-#if BITS_PER_LONG == 32
-#define RB_TIME_32
-#endif
-
-/* To test on 64 bit machines */
-//#define RB_TIME_32
-
-#ifdef RB_TIME_32
-
-struct rb_time_struct {
- local_t cnt;
- local_t top;
- local_t bottom;
- local_t msb;
-};
-#else
-#include <asm/local64.h>
struct rb_time_struct {
local64_t time;
};
-#endif
typedef struct rb_time_struct rb_time_t;
#define MAX_NEST 5
struct rb_irq_work irq_work;
bool time_stamp_abs;
+
+ unsigned int subbuf_size;
+ unsigned int subbuf_order;
+ unsigned int max_data_size;
};
struct ring_buffer_iter {
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
+ size_t event_size;
int missed_events;
};
-#ifdef RB_TIME_32
-
-/*
- * On 32 bit machines, local64_t is very expensive. As the ring
- * buffer doesn't need all the features of a true 64 bit atomic,
- * on 32 bit, it uses these functions (64 still uses local64_t).
- *
- * For the ring buffer, 64 bit required operations for the time is
- * the following:
- *
- * - Reads may fail if it interrupted a modification of the time stamp.
- * It will succeed if it did not interrupt another write even if
- * the read itself is interrupted by a write.
- * It returns whether it was successful or not.
- *
- * - Writes always succeed and will overwrite other writes and writes
- * that were done by events interrupting the current write.
- *
- * - A write followed by a read of the same time stamp will always succeed,
- * but may not contain the same value.
- *
- * - A cmpxchg will fail if it interrupted another write or cmpxchg.
- * Other than that, it acts like a normal cmpxchg.
- *
- * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
- * (bottom being the least significant 30 bits of the 60 bit time stamp).
- *
- * The two most significant bits of each half holds a 2 bit counter (0-3).
- * Each update will increment this counter by one.
- * When reading the top and bottom, if the two counter bits match then the
- * top and bottom together make a valid 60 bit number.
- */
-#define RB_TIME_SHIFT 30
-#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
-#define RB_TIME_MSB_SHIFT 60
-
-static inline int rb_time_cnt(unsigned long val)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
{
- return (val >> RB_TIME_SHIFT) & 3;
-}
-
-static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
-{
- u64 val;
-
- val = top & RB_TIME_VAL_MASK;
- val <<= RB_TIME_SHIFT;
- val |= bottom & RB_TIME_VAL_MASK;
-
- return val;
-}
-
-static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
-{
- unsigned long top, bottom, msb;
- unsigned long c;
-
- /*
- * If the read is interrupted by a write, then the cnt will
- * be different. Loop until both top and bottom have been read
- * without interruption.
- */
- do {
- c = local_read(&t->cnt);
- top = local_read(&t->top);
- bottom = local_read(&t->bottom);
- msb = local_read(&t->msb);
- } while (c != local_read(&t->cnt));
-
- *cnt = rb_time_cnt(top);
-
- /* If top, msb or bottom counts don't match, this interrupted a write */
- if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
- return false;
-
- /* The shift to msb will lose its cnt bits */
- *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
- return true;
-}
-
-static bool rb_time_read(rb_time_t *t, u64 *ret)
-{
- unsigned long cnt;
-
- return __rb_time_read(t, ret, &cnt);
-}
-
-static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
-{
- return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
-}
-
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
- unsigned long *msb)
-{
- *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
- *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
- *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
-}
+ struct buffer_data_page field;
-static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
-{
- val = rb_time_val_cnt(val, cnt);
- local_set(t, val);
-}
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
-static void rb_time_set(rb_time_t *t, u64 val)
-{
- unsigned long cnt, top, bottom, msb;
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
- rb_time_split(val, &top, &bottom, &msb);
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
- /* Writes always succeed with a valid number even if it gets interrupted. */
- do {
- cnt = local_inc_return(&t->cnt);
- rb_time_val_set(&t->top, top, cnt);
- rb_time_val_set(&t->bottom, bottom, cnt);
- rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
- } while (cnt != local_read(&t->cnt));
-}
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)buffer->subbuf_size,
+ (unsigned int)is_signed_type(char));
-static inline bool
-rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
-{
- return local_try_cmpxchg(l, &expect, set);
+ return !trace_seq_has_overflowed(s);
}
-#else /* 64 bits */
-
-/* local64_t always succeeds */
-
-static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+static inline void rb_time_read(rb_time_t *t, u64 *ret)
{
*ret = local64_read(&t->time);
- return true;
}
static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
-#endif
/*
* Enable this to make sure that the event passed to
WARN_ONCE(1, "nest (%d) greater than max", nest);
fail:
- /* Can only fail on 32 bit */
- if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
- /* Screw it, just read the current time */
- ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_read(&cpu_buffer->write_stamp, &ts);
return ts;
}
full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return -EINVAL;
+ return EPOLLERR;
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
u64 ts;
/* Skip retpolines :-( */
- if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local))
ts = trace_clock_local();
else
ts = buffer->clock();
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
bpage->page = page_address(page);
+ bpage->order = cpu_buffer->buffer->subbuf_order;
rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ /* Default buffer page size - one system page */
+ buffer->subbuf_order = 0;
+ buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+	/* Max payload is buffer page size - header (8 bytes) */
+ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
* @size: the new size.
* @cpu_id: the cpu buffer to resize
*
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
*
* Returns 0 on success and < 0 on failure.
*/
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return 0;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
/* we need a minimum of two pages */
if (nr_pages < 2)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
+ if ((iter->head + length) > commit || length > iter->event_size)
/* Writer corrupted the read? */
goto reset;
}
static __always_inline unsigned
-rb_event_index(struct ring_buffer_event *event)
+rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+ addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
+
+ return addr - BUF_PAGE_HDR_SIZE;
}
static void rb_inc_iter(struct ring_buffer_iter *iter)
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
{
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
unsigned long length = info->length;
* Only the event that crossed the page boundary
* must fill the old tail_page with padding.
*/
- if (tail >= BUF_PAGE_SIZE) {
+ if (tail >= bsize) {
/*
* If the page was filled, then we still need
* to update the real_end. Reset it to zero
* and the reader will ignore it.
*/
- if (tail == BUF_PAGE_SIZE)
+ if (tail == bsize)
tail_page->real_end = 0;
local_sub(length, &tail_page->write);
* If we are less than the minimum size, we don't need to
* worry about it.
*/
- if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
/* No room for any events */
/* Mark the rest of the page with padding */
}
/* Put in a discarded event */
- event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
/* account for padding bytes */
- local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+ local_add(bsize - tail, &cpu_buffer->entries_bytes);
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
/* Set write to end of buffer */
- length = (tail + length) - BUF_PAGE_SIZE;
+ length = (tail + length) - bsize;
local_sub(length, &tail_page->write);
}
/* Slow path */
static struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page, or not delta? */
- if (abs || rb_event_index(event)) {
+ if (abs || rb_event_index(cpu_buffer, event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
(unsigned long long)info->ts,
(unsigned long long)info->before,
(unsigned long long)info->after,
- (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
if (!abs)
info->delta = 0;
}
- *event = rb_add_time_stamp(*event, info->delta, abs);
+ *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
*length -= RB_LEN_TIME_EXTEND;
*delta = 0;
}
struct buffer_page *bpage;
unsigned long addr;
- new_index = rb_event_index(event);
+ new_index = rb_event_index(cpu_buffer, event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
bpage = READ_ONCE(cpu_buffer->tail_page);
#define CHECK_FULL_PAGE 1L
#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+
+static const char *show_irq_str(int bits)
+{
+ const char *type[] = {
+ ".", // 0
+ "s", // 1
+ "h", // 2
+ "Hs", // 3
+ "n", // 4
+ "Ns", // 5
+ "Nh", // 6
+ "NHs", // 7
+ };
+
+ return type[bits];
+}
+
+/* Assume this is a trace event */
+static const char *show_flags(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+ int bits = 0;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "X";
+
+ entry = ring_buffer_event_data(event);
+
+ if (entry->flags & TRACE_FLAG_SOFTIRQ)
+ bits |= 1;
+
+ if (entry->flags & TRACE_FLAG_HARDIRQ)
+ bits |= 2;
+
+ if (entry->flags & TRACE_FLAG_NMI)
+ bits |= 4;
+
+ return show_irq_str(bits);
+}
+
+static const char *show_irq(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "";
+
+ entry = ring_buffer_event_data(event);
+ if (entry->flags & TRACE_FLAG_IRQS_OFF)
+ return "d";
+ return "";
+}
+
+static const char *show_interrupt_level(void)
+{
+ unsigned long pc = preempt_count();
+ unsigned char level = 0;
+
+ if (pc & SOFTIRQ_OFFSET)
+ level |= 1;
+
+ if (pc & HARDIRQ_MASK)
+ level |= 2;
+
+ if (pc & NMI_MASK)
+ level |= 4;
+
+ return show_irq_str(level);
+}
+
static void dump_buffer_page(struct buffer_data_page *bpage,
struct rb_event_info *info,
unsigned long tail)
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
ts += delta;
- pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
ts = rb_fix_abs_ts(delta, ts);
- pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_PADDING:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d PADDING\n",
+ e, ts, event->time_delta);
break;
case RINGBUF_TYPE_DATA:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d %s%s\n",
+ e, ts, event->time_delta,
+ show_flags(event), show_irq(event));
break;
default:
break;
}
}
+ pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
}
static DEFINE_PER_CPU(atomic_t, checking);
static atomic_t ts_dump;
+#define buffer_warn_return(fmt, ...) \
+ do { \
+ /* If another report is happening, ignore this one */ \
+ if (atomic_inc_return(&ts_dump) != 1) { \
+ atomic_dec(&ts_dump); \
+ goto out; \
+ } \
+ atomic_inc(&cpu_buffer->record_disabled); \
+ pr_warn(fmt, ##__VA_ARGS__); \
+ dump_buffer_page(bpage, info, tail); \
+ atomic_dec(&ts_dump); \
+		/* There are some cases during boot up where this can happen */ \
+ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \
+ /* Do not re-enable checking */ \
+ return; \
+ } while (0)
+
/*
* Check if the current event time stamp matches the deltas on
* the buffer page.
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = rb_fix_abs_ts(delta, ts);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ }
+ ts = delta;
break;
case RINGBUF_TYPE_PADDING:
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- /* If another report is happening, ignore this one */
- if (atomic_inc_return(&ts_dump) != 1) {
- atomic_dec(&ts_dump);
- goto out;
- }
- atomic_inc(&cpu_buffer->record_disabled);
- /* There's some cases in boot up that this can happen */
- WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
- pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
- cpu_buffer->cpu,
- ts + info->delta, info->ts, info->delta,
- info->before, info->after,
- full ? " (full)" : "");
- dump_buffer_page(bpage, info, tail);
- atomic_dec(&ts_dump);
- /* Do not re-enable checking */
- return;
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "", show_interrupt_level());
}
out:
atomic_dec(this_cpu_ptr(&checking));
struct ring_buffer_event *event;
struct buffer_page *tail_page;
unsigned long tail, write, w;
- bool a_ok;
- bool b_ok;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
/*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
barrier();
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
info->ts = rb_time_stamp(cpu_buffer->buffer);
if (!w) {
/* Use the sub-buffer timestamp */
info->delta = 0;
- } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
+ } else if (unlikely(info->before != info->after)) {
info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
} else {
tail = write - info->length;
/* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE)) {
+ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
/* SLOW PATH - Interrupted between A and C */
/* Save the old before_stamp */
- a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- RB_WARN_ON(cpu_buffer, !a_ok);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
/*
* Read a new timestamp and update the before_stamp to make
rb_time_set(&cpu_buffer->before_stamp, ts);
barrier();
- /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- /* Was interrupted before here, write_stamp must be valid */
- RB_WARN_ON(cpu_buffer, !a_ok);
+ /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
/*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
info->after == info->before && info->after < ts) {
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
- if (info.length > BUF_MAX_DATA_SIZE)
+ if (info.length > cpu_buffer->buffer->max_data_size)
goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (unlikely(length > BUF_MAX_DATA_SIZE))
+ if (unlikely(length > buffer->max_data_size))
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
struct buffer_page *bpage = cpu_buffer->commit_page;
struct buffer_page *start;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
/* Do the likely case first */
if (likely(bpage->page == (void *)addr)) {
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (length > buffer->max_data_size)
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
#define USECS_WAIT 1000000
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
/* If the write is past the end of page, a writer is still updating it */
- if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ if (likely(!reader || rb_page_write(reader) <= bsize))
break;
udelay(1);
return NULL;
/* Holds the entire event: data and meta data */
- iter->event = kmalloc(BUF_PAGE_SIZE, flags);
+ iter->event_size = buffer->subbuf_size;
+ iter->event = kmalloc(iter->event_size, flags);
if (!iter->event) {
kfree(iter);
return NULL;
*/
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
- /*
- * Earlier, this method returned
- * BUF_PAGE_SIZE * buffer->nr_pages
- * Since the nr_pages field is now removed, we have converted this to
- * return the per cpu buffer value.
- */
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
- return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+ return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+ /* If abs timestamp is requested, events have a timestamp too */
+ if (ring_buffer_time_stamp_abs(buffer))
+ return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+ return buffer->max_data_size;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
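As a usage sketch only (write_sample() is a hypothetical writer, assuming the single-argument ring_buffer_unlock_commit() prototype of the existing reserve/commit API): the new helper lets a writer reject oversized payloads up front instead of failing inside the reserve path.

    static int write_sample(struct trace_buffer *buffer, const void *data, size_t len)
    {
            struct ring_buffer_event *event;

            if (len > ring_buffer_max_event_size(buffer))
                    return -E2BIG;

            event = ring_buffer_lock_reserve(buffer, len);
            if (!event)
                    return -EBUSY;

            memcpy(ring_buffer_event_data(event), data, len);

            return ring_buffer_unlock_commit(buffer);
    }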
static void rb_clear_buffer_page(struct buffer_page *page)
{
local_set(&page->write, 0);
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
+ if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+ goto out;
+
ret = -EAGAIN;
if (atomic_read(&buffer_a->record_disabled))
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = NULL;
+ struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
+ bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ if (!bpage)
+ return ERR_PTR(-ENOMEM);
+
+ bpage->order = buffer->subbuf_order;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (cpu_buffer->free_page) {
- bpage = cpu_buffer->free_page;
+ bpage->data = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage)
+ if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page) {
+ kfree(bpage);
return ERR_PTR(-ENOMEM);
+ }
- bpage = page_address(page);
+ bpage->data = page_address(page);
out:
- rb_init_page(bpage);
+ rb_init_page(bpage->data);
return bpage;
}
* ring_buffer_free_read_page - free an allocated read page
 * @buffer: the buffer the page was allocated for
* @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *data_page)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = data;
+ struct buffer_data_page *bpage = data_page->data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
cpu_buffer = buffer->buffers[cpu];
- /* If the page is still in use someplace else, we can't reuse it */
- if (page_ref_count(page) > 1)
+ /*
+	 * If the page is still in use someplace else, or the order of the page
+	 * is different from the subbuffer order of the buffer,
+	 * we can't reuse it.
+ */
+ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
goto out;
local_irq_save(flags);
local_irq_restore(flags);
out:
- free_page((unsigned long)bpage);
+ free_pages((unsigned long)bpage, data_page->order);
+ kfree(data_page);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
- * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
* if (ret >= 0)
- * process_page(rpage, ret);
+ * process_page(ring_buffer_read_page_data(rpage), ret);
+ * ring_buffer_free_read_page(buffer, cpu, rpage);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
* <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct trace_buffer *buffer,
- void **data_page, size_t len, int cpu, int full)
+ struct buffer_data_read_page *data_page,
+ size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
len -= BUF_PAGE_HDR_SIZE;
- if (!data_page)
+ if (!data_page || !data_page->data)
+ goto out;
+ if (data_page->order != buffer->subbuf_order)
goto out;
- bpage = *data_page;
+ bpage = data_page->data;
if (!bpage)
goto out;
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
- reader->page = *data_page;
+ reader->page = data_page->data;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
- *data_page = bpage;
+ data_page->data = bpage;
/*
* Use the real_end for the data size,
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
- if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
/*
* This page may be off to user land. Zero it out here.
*/
- if (commit < BUF_PAGE_SIZE)
- memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+ if (commit < buffer->subbuf_size)
+ memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
out_unlock:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page: the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+ return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
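Spelled out, a reader using the opaque struct buffer_data_read_page handle ends up looking roughly like the sketch below (drain_one_subbuf() and process_page() are hypothetical, @len is assumed to be the full sub-buffer size, and error handling is trimmed):

    static int drain_one_subbuf(struct trace_buffer *buffer, int cpu, size_t len)
    {
            struct buffer_data_read_page *rpage;
            int ret;

            rpage = ring_buffer_alloc_read_page(buffer, cpu);
            if (IS_ERR(rpage))
                    return PTR_ERR(rpage);

            /* full == 0: also return partially filled sub-buffers */
            ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
            if (ret >= 0)
                    process_page(ring_buffer_read_page_data(rpage), ret);

            ring_buffer_free_read_page(buffer, cpu, rpage);
            return ret;
    }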
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+ return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals one system page. This parameter
+ * is configurable per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be a power-of-two multiple of the system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+ if (!buffer)
+ return -EINVAL;
+
+ return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer page equals one system page. This API can be
+ * used to set a new size of the ring buffer page. The size must be a
+ * power-of-two multiple of the system page size, which is why the input
+ * parameter @order is the order of the system pages that are allocated for
+ * one ring buffer page:
+ *  0 - 1 system page
+ *  1 - 2 system pages
+ *  2 - 4 system pages
+ * ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage, *tmp;
+ int old_order, old_size;
+ int nr_pages;
+ int psize;
+ int err;
+ int cpu;
+
+ if (!buffer || order < 0)
+ return -EINVAL;
+
+ if (buffer->subbuf_order == order)
+ return 0;
+
+ psize = (1 << order) * PAGE_SIZE;
+ if (psize <= BUF_PAGE_HDR_SIZE)
+ return -EINVAL;
+
+ old_order = buffer->subbuf_order;
+ old_size = buffer->subbuf_size;
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&buffer->record_disabled);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buffer->subbuf_order = order;
+ buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+ /* Make sure all new buffers are allocated, before deleting the old ones */
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Update the number of pages to match the new size */
+ nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
+ /* we need a minimum of two pages */
+ if (nr_pages < 2)
+ nr_pages = 2;
+
+ cpu_buffer->nr_pages_to_update = nr_pages;
+
+ /* Include the reader page */
+ nr_pages++;
+
+ /* Allocate the new size buffer */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer, nr_pages,
+ &cpu_buffer->new_pages)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Clear the head bit to make the link list normal to read */
+ rb_head_page_deactivate(cpu_buffer);
+
+ /* Now walk the list and free all the old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+		/* The above loop stopped at the last page needing to be freed */
+ bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ free_buffer_page(bpage);
+
+ /* Free the current reader page */
+ free_buffer_page(cpu_buffer->reader_page);
+
+ /* One page was allocated for the reader page */
+ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
+ struct buffer_page, list);
+ list_del_init(&cpu_buffer->reader_page->list);
+
+ /* The cpu_buffer pages are a link list with no head */
+ cpu_buffer->pages = cpu_buffer->new_pages.next;
+ cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
+ cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
+
+ /* Clear the new_pages list */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
+ cpu_buffer->nr_pages_to_update = 0;
+
+ free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ cpu_buffer->free_page = NULL;
+
+ rb_head_page_activate(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+ }
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ return 0;
+
+error:
+ buffer->subbuf_order = old_order;
+ buffer->subbuf_size = old_size;
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
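For instance, with 4K system pages an @order of 2 gives 16K sub-buffers. A caller switching a buffer over could look like this sketch (use_16k_subbufs() is hypothetical; the three ring_buffer_subbuf_* calls are the ones added above):

    static int use_16k_subbufs(struct trace_buffer *buffer)
    {
            int ret;

            if (ring_buffer_subbuf_order_get(buffer) == 2)
                    return 0;

            ret = ring_buffer_subbuf_order_set(buffer, 2);
            if (ret)
                    return ret;

            /* Full sub-buffer size, header included: (4096 << 2) bytes here */
            pr_info("sub-buffer size is now %d bytes\n",
                    ring_buffer_subbuf_size_get(buffer));
            return 0;
    }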
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
break;
#endif
case NFT_CT_ID:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
+
len = sizeof(u32);
break;
default:
return false;
}
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
static const struct nft_expr_ops nft_ct_get_fast_ops = {
.type = &nft_ct_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
return ERR_PTR(-EINVAL);
if (tb[NFTA_CT_DREG]) {
- #ifdef CONFIG_RETPOLINE
+ #ifdef CONFIG_MITIGATION_RETPOLINE
u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
switch (k) {
if (tb[NFTA_CT_EXPECT_L3PROTO])
priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO]));
+ switch (priv->l3num) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ if (priv->l3num != ctx->family)
+ return -EINVAL;
+
+ fallthrough;
+ case NFPROTO_INET:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]);
+ switch (priv->l4proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_DCCP:
+ case IPPROTO_SCTP:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]);
priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]);
priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]);
return false;
}
+static int qdisc_get_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, u32 clid, struct Qdisc *q,
+ struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (!tc_qdisc_dump_ignore(q, false)) {
+ if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
+ RTM_NEWQDISC, extack) < 0)
+ goto err_out;
+ }
+
+ if (skb->len)
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+
+err_out:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, u32 clid,
struct Qdisc *old, struct Qdisc *new,
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
+ return 0;
+
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
if (err != 0)
return err;
} else {
- qdisc_notify(net, skb, n, clid, NULL, q, NULL);
+ qdisc_get_notify(net, skb, n, clid, q, NULL);
}
return 0;
}
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
+ return 0;
+
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
n->nlmsg_flags & NLM_F_ECHO);
}
+static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct Qdisc *q,
+ unsigned long cl, struct netlink_ext_ack *extack)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
+ extack) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
static int tclass_del_notify(struct net *net,
const struct Qdisc_class_ops *cops,
struct sk_buff *oskb, struct nlmsghdr *n,
if (!cops->delete)
return -EOPNOTSUPP;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
+ if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
- if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
- RTM_DELTCLASS, extack) < 0) {
- kfree_skb(skb);
- return -EINVAL;
+ if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
+ RTM_DELTCLASS, extack) < 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ } else {
+ skb = NULL;
}
err = cops->delete(q, cl, extack);
return err;
}
- err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
+ err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
return err;
}
tc_bind_tclass(q, portid, clid, 0);
goto out;
case RTM_GETTCLASS:
- err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
+ err = tclass_get_notify(net, skb, n, q, cl, extack);
goto out;
default:
err = -EINVAL;
.exit = psched_net_exit,
};
- #if IS_ENABLED(CONFIG_RETPOLINE)
+ #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
#endif
multi-dtb-y := $(call multi-search, $(dtb-y), .dtb, -dtbs)
# Primitive DTB compiled from *.dts
real-dtb-y := $(call real-search, $(dtb-y), .dtb, -dtbs)
-# Base DTB that overlay is applied onto (each first word of $(*-dtbs) expansion)
-base-dtb-y := $(foreach m, $(multi-dtb-y), $(firstword $(call suffix-search, $m, .dtb, -dtbs)))
+# Base DTB that overlay is applied onto
+base-dtb-y := $(filter %.dtb, $(call real-search, $(multi-dtb-y), .dtb, -dtbs))
always-y += $(dtb-y)
objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label
objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr
- objtool-args-$(CONFIG_CALL_DEPTH_TRACKING) += --hacks=skylake
+ objtool-args-$(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) += --hacks=skylake
objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt
objtool-args-$(CONFIG_FINEIBT) += --cfi
objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount
objtool-args-$(CONFIG_HAVE_OBJTOOL_NOP_MCOUNT) += --mnop
endif
objtool-args-$(CONFIG_UNWINDER_ORC) += --orc
- objtool-args-$(CONFIG_RETPOLINE) += --retpoline
- objtool-args-$(CONFIG_RETHUNK) += --rethunk
- objtool-args-$(CONFIG_SLS) += --sls
+ objtool-args-$(CONFIG_MITIGATION_RETPOLINE) += --retpoline
+ objtool-args-$(CONFIG_MITIGATION_RETHUNK) += --rethunk
+ objtool-args-$(CONFIG_MITIGATION_SLS) += --sls
objtool-args-$(CONFIG_STACK_VALIDATION) += --stackval
objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE) += --static-call
objtool-args-$(CONFIG_HAVE_UACCESS_VALIDATION) += --uaccess
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
);
let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string();
- if cfg.has("RETPOLINE") {
+ if cfg.has("MITIGATION_RETPOLINE") {
features += ",+retpoline-external-thunk";
}
ts.push("features", features);
ts.push("llvm-target", "x86_64-linux-gnu");
ts.push("target-pointer-width", "64");
+ } else if cfg.has("LOONGARCH") {
+ ts.push("arch", "loongarch64");
+ ts.push("data-layout", "e-m:e-p:64:64-i64:64-i128:128-n64-S128");
+ ts.push("features", "-f,-d");
+ ts.push("llvm-target", "loongarch64-linux-gnusf");
+ ts.push("llvm-abiname", "lp64s");
+ ts.push("target-pointer-width", "64");
} else {
panic!("Unsupported architecture");
}
#define MODULE_NAME_LEN (64 - sizeof(Elf_Addr))
-void __attribute__((format(printf, 2, 3)))
-modpost_log(enum loglevel loglevel, const char *fmt, ...)
+void modpost_log(enum loglevel loglevel, const char *fmt, ...)
{
va_list arglist;
break;
case LOG_ERROR:
fprintf(stderr, "ERROR: ");
- break;
- case LOG_FATAL:
- fprintf(stderr, "FATAL: ");
+ error_occurred = true;
break;
default: /* invalid loglevel, ignore */
break;
va_start(arglist, fmt);
vfprintf(stderr, fmt, arglist);
va_end(arglist);
-
- if (loglevel == LOG_FATAL)
- exit(1);
- if (loglevel == LOG_ERROR)
- error_occurred = true;
}
static inline bool strends(const char *str, const char *postfix)
fatal("%s: not relocatable object.", filename);
/* Check if file offset is correct */
- if (hdr->e_shoff > info->size) {
+ if (hdr->e_shoff > info->size)
fatal("section header offset=%lu in file '%s' is bigger than filesize=%zu\n",
(unsigned long)hdr->e_shoff, filename, info->size);
- return 0;
- }
if (hdr->e_shnum == SHN_UNDEF) {
/*
const char *secname;
int nobits = sechdrs[i].sh_type == SHT_NOBITS;
- if (!nobits && sechdrs[i].sh_offset > info->size) {
+ if (!nobits && sechdrs[i].sh_offset > info->size)
fatal("%s is truncated. sechdrs[i].sh_offset=%lu > sizeof(*hrd)=%zu\n",
filename, (unsigned long)sechdrs[i].sh_offset,
sizeof(*hdr));
- return 0;
- }
+
secname = secstrings + sechdrs[i].sh_name;
if (strcmp(secname, ".modinfo") == 0) {
if (nobits)
#define DATA_SECTIONS ".data", ".data.rel"
#define TEXT_SECTIONS ".text", ".text.*", ".sched.text", \
- ".kprobes.text", ".cpuidle.text", ".noinstr.text"
+ ".kprobes.text", ".cpuidle.text", ".noinstr.text", \
+ ".ltext", ".ltext.*"
#define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \
".fixup", ".entry.text", ".exception.text", \
".coldtext", ".softirqentry.text"
#define R_LARCH_SUB32 55
#endif
+#ifndef R_LARCH_RELAX
+#define R_LARCH_RELAX 100
+#endif
+
+#ifndef R_LARCH_ALIGN
+#define R_LARCH_ALIGN 102
+#endif
+
static void get_rel_type_and_sym(struct elf_info *elf, uint64_t r_info,
unsigned int *r_type, unsigned int *r_sym)
{
continue;
break;
case EM_LOONGARCH:
- if (!strcmp("__ex_table", fromsec) &&
- r_type == R_LARCH_SUB32)
+ switch (r_type) {
+ case R_LARCH_SUB32:
+ if (!strcmp("__ex_table", fromsec))
+ continue;
+ break;
+ case R_LARCH_RELAX:
+ case R_LARCH_ALIGN:
+ /* These relocs do not refer to symbols */
continue;
+ }
break;
}
for (rel = start; rel < stop; rel++) {
Elf_Sym *tsym;
- Elf_Addr taddr = 0, r_offset;
+ Elf_Addr taddr, r_offset;
unsigned int r_type, r_sym;
void *loc;
buf_printf(b,
"\n"
- "#ifdef CONFIG_RETPOLINE\n"
+ "#ifdef CONFIG_MITIGATION_RETPOLINE\n"
"MODULE_INFO(retpoline, \"Y\");\n"
"#endif\n");