#ifndef _ASM_X86_FPU_H
#define _ASM_X86_FPU_H
+ #include <asm/page_types.h>
+
/*
* The legacy x87 FPU state format, as saved by FSAVE and
* restored by the FRSTOR instructions:
*
* This master permission field is only to be used when
* task.fpu.fpstate based checks fail to validate whether the task
- * is allowed to expand it's xfeatures set which requires to
+ * is allowed to expand its xfeatures set which requires to
* allocate a larger sized fpstate buffer.
*
* Do not access this field directly. Use the provided helper
#include <asm/paravirt_types.h>
+ #ifndef __ASSEMBLY__
+ struct mm_struct;
+ #endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
static __always_inline unsigned long read_cr2(void)
{
return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
- "mov %%cr2, %%rax;",
- ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr2, %%rax;", ALT_NOT_XEN);
}
static __always_inline void write_cr2(unsigned long x)
static inline unsigned long __read_cr3(void)
{
return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
- "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr3, %%rax;", ALT_NOT_XEN);
}
static inline void write_cr3(unsigned long x)
{
- PVOP_ALT_VCALL1(mmu.write_cr3, x,
- "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
}
static inline void __write_cr4(unsigned long x)
static __always_inline void wbinvd(void)
{
- PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
}
static inline u64 paravirt_read_msr(unsigned msr)
static inline pte_t __pte(pteval_t val)
{
return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pteval_t pte_val(pte_t pte)
{
return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline pgd_t __pgd(pgdval_t val)
{
return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
static inline pmd_t __pmd(pmdval_t val)
{
return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void set_pud(pud_t *pudp, pud_t pud)
pudval_t ret;
ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void pud_clear(pud_t *pudp)
static inline p4d_t __p4d(p4dval_t val)
{
p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (p4d_t) { ret };
}
static inline p4dval_t p4d_val(p4d_t p4d)
{
return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
static __always_inline unsigned long arch_local_save_flags(void)
{
return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
- ALT_NOT(X86_FEATURE_XENPV));
+ ALT_NOT_XEN);
}
static __always_inline void arch_local_irq_disable(void)
{
- PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN);
}
static __always_inline void arch_local_irq_enable(void)
{
- PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN);
}
static __always_inline unsigned long arch_local_irq_save(void)
#undef PVOP_VCALL4
#undef PVOP_CALL4
-#define DEFINE_PARAVIRT_ASM(func, instr, sec) \
- asm (".pushsection " #sec ", \"ax\"\n" \
- ".global " #func "\n\t" \
- ".type " #func ", @function\n\t" \
- ASM_FUNC_ALIGN "\n" \
- #func ":\n\t" \
- ASM_ENDBR \
- instr "\n\t" \
- ASM_RET \
- ".size " #func ", . - " #func "\n\t" \
- ".popsection")
-
extern void default_banner(void);
void native_pv_lock_init(void) __init;
#else /* __ASSEMBLY__ */
-#define _PVSITE(ptype, ops, word, algn) \
-771:; \
- ops; \
-772:; \
- .pushsection .parainstructions,"a"; \
- .align algn; \
- word 771b; \
- .byte ptype; \
- .byte 772b-771b; \
- _ASM_ALIGN; \
- .popsection
-
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
-#define PARA_PATCH(off) ((off) / 8)
-#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#ifdef CONFIG_DEBUG_ENTRY
.macro PARA_IRQ_save_fl
- PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
- ANNOTATE_RETPOLINE_SAFE;
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+ ANNOTATE_RETPOLINE_SAFE;
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
.endm
-#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
- ALT_NOT(X86_FEATURE_XENPV)
+#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \
+ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \
+ "pushf; pop %rax;", ALT_NOT_XEN
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
#ifndef _ASM_X86_PARAVIRT_TYPES_H
#define _ASM_X86_PARAVIRT_TYPES_H
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
- u8 *instr; /* original instructions */
- u8 type; /* type of this instruction */
- u8 len; /* length of original instruction */
-};
-#endif
-
#ifdef CONFIG_PARAVIRT
#ifndef __ASSEMBLY__
+ #include <linux/types.h>
#include <asm/desc_defs.h>
#include <asm/pgtable_types.h>
extern struct pv_info pv_info;
extern struct paravirt_patch_template pv_ops;
-#define PARAVIRT_PATCH(x) \
- (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
-
-#define paravirt_type(op) \
- [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "m" (pv_ops.op)
-/*
- * Generate some code, and mark it as patchable by the
- * apply_paravirt() alternate instruction patcher.
- */
-#define _paravirt_alt(insn_string, type) \
- "771:\n\t" insn_string "\n" "772:\n" \
- ".pushsection .parainstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR " 771b\n" \
- " .byte " type "\n" \
- " .byte 772b-771b\n" \
- _ASM_ALIGN "\n" \
- ".popsection\n"
-
-/* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string) \
- _paravirt_alt(insn_string, "%c[paravirt_typenum]")
-
-/* Simple instruction patching code. */
-#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
+#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op)
int paravirt_disable_iospace(void);
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
+/* This generates an indirect call based on the operation type number. */
#define PARAVIRT_CALL \
ANNOTATE_RETPOLINE_SAFE \
"call *%[paravirt_opptr];"
* However, x86_64 also has to clobber all caller saved registers, which
* unfortunately, are quite a bit (r8 - r11)
*
- * The call instruction itself is marked by placing its start address
- * and size into the .parainstructions section, so that
- * apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend pv_init_ops
- * implementation.
- *
* Unfortunately there's no way to get gcc to generate the args setup
* for the call, and then allow the call itself to be generated by an
* inline asm. Because of this, we must do the complete arg setup and
__mask & __eax; \
})
-
+/*
+ * Use alternative patching for paravirt calls:
+ * - For replacing an indirect call with a direct one, use the "normal"
+ * ALTERNATIVE() macro with the indirect call as the initial code sequence,
+ * which will be replaced with the related direct call by using the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ * - In case the replacement is either a direct call or a short code sequence
+ * depending on a feature bit, the ALTERNATIVE_2() macro is being used.
+ * The indirect call is the initial code sequence again, while the special
+ * code sequence is selected with the specified feature bit. In case the
+ * feature is not active, the direct call is used as above via the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ */
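/*
 * Hedged illustration (editor's sketch, not part of the header): a pv_ops
 * call site emitted by the macros below starts life as an indirect call and
 * is rewritten by the alternatives patcher at boot. The instruction
 * sequences are only meant to convey the idea:
 *
 *	call *pv_ops+PV_IRQ_save_fl(%rip)	// initial indirect call
 *	  -> call <current pv_ops.irq.save_fl>	// via ALT_CALL_INSTR/ALT_CALL_ALWAYS
 *	  -> pushf; pop %rax			// when X86_FEATURE_XENPV is not set
 */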
#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(paravirt_alt(PARAVIRT_CALL) \
+ asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \
+ ALT_CALL_ALWAYS) \
: call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
+ : paravirt_ptr(op), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \
- alt, cond) \
+ asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \
+ ALT_CALL_INSTR, ALT_CALL_ALWAYS, \
+ alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
+ : paravirt_ptr(op), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-void _paravirt_nop(void);
-void paravirt_BUG(void);
unsigned long paravirt_ret0(void);
#ifdef CONFIG_PARAVIRT_XXL
u64 _paravirt_ident_64(u64);
unsigned long pv_native_read_cr2(void);
#endif
-#define paravirt_nop ((void *)_paravirt_nop)
-
-extern struct paravirt_patch_site __parainstructions[],
- __parainstructions_end[];
+#define paravirt_nop ((void *)nop_func)
#endif /* __ASSEMBLY__ */
+
+#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV)
+
#endif /* CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_TYPES_H */
#include <linux/coredump.h>
#include <linux/time_namespace.h>
#include <linux/user_events.h>
+ #include <linux/rseq.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
* will be able to manipulate the current directory, etc.
* It would be nice to force an unshare instead...
*/
- t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
- while_each_thread(p, t) {
+ for_other_threads(p, t) {
if (t->fs == p->fs)
n_fs++;
}
unsigned long usage_mask;
const struct lock_trace *usage_traces[LOCK_TRACE_STATES];
+ const char *name;
/*
* Generation counter, when doing certain classes of graph walking,
* to ensure that we check one node only once:
*/
int name_version;
- const char *name;
u8 wait_type_inner;
u8 wait_type_outer;
struct pin_cookie { unsigned int val; };
+ #define MAX_LOCKDEP_KEYS_BITS 13
+ #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
+ #define INITIAL_CHAIN_KEY -1
+
+ struct held_lock {
+ /*
+ * One-way hash of the dependency chain up to this point. We
+ * hash the hashes step by step as the dependency chain grows.
+ *
+ * We use it for dependency-caching and we skip detection
+ * passes and dependency-updates if there is a cache-hit, so
+ * it is absolutely critical for 100% coverage of the validator
+ * to have a unique key value for every unique dependency path
+ * that can occur in the system, to make a unique hash value
+ * as likely as possible - hence the 64-bit width.
+ *
+ * The task struct holds the current hash value (initialized
+ * with zero), here we store the previous hash value:
+ */
+ u64 prev_chain_key;
+ unsigned long acquire_ip;
+ struct lockdep_map *instance;
+ struct lockdep_map *nest_lock;
+ #ifdef CONFIG_LOCK_STAT
+ u64 waittime_stamp;
+ u64 holdtime_stamp;
+ #endif
+ /*
+ * class_idx is zero-indexed; it points to the element in
+ * lock_classes this held lock instance belongs to. class_idx is in
+ * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
+ */
+ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS;
+ /*
+ * The lock-stack is unified in that the lock chains of interrupt
+ * contexts nest on top of process context chains, but we 'separate'
+ * the hashes by starting with 0 if we cross into an interrupt
+ * context, and we also do not add cross-context lock
+ * dependencies - the lock usage graph walking covers that area
+ * anyway, and we'd just unnecessarily increase the number of
+ * dependencies otherwise. [Note: hardirq and softirq contexts
+ * are separated from each other too.]
+ *
+ * The following field is used to detect when we cross into an
+ * interrupt context:
+ */
+ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
+ unsigned int trylock:1; /* 16 bits */
+
+ unsigned int read:2; /* see lock_acquire() comment */
+ unsigned int check:1; /* see lock_acquire() comment */
+ unsigned int hardirqs_off:1;
+ unsigned int sync:1;
+ unsigned int references:11; /* 32 bits */
+ unsigned int pin_count;
+ };
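/*
 * Hedged sketch of the chain-key scheme described in the comment above
 * (the real mixing is done by iterate_chain_key() in
 * kernel/locking/lockdep.c; hash_mix() is only a stand-in name):
 *
 *	u64 chain_key = INITIAL_CHAIN_KEY;
 *
 *	for each lock acquired, in order:
 *		hlock->prev_chain_key = chain_key;
 *		chain_key = hash_mix(chain_key, class index of hlock);
 */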
+
#else /* !CONFIG_LOCKDEP */
/*
#include <linux/osq_lock.h>
#include <linux/debug_locks.h>
#include <linux/cleanup.h>
+ #include <linux/mutex_types.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
#ifndef CONFIG_PREEMPT_RT
- /*
- * Simple, straightforward mutexes with strict semantics:
- *
- * - only one task can hold the mutex at a time
- * - only the owner can unlock the mutex
- * - multiple unlocks are not permitted
- * - recursive locking is not permitted
- * - a mutex object must be initialized via the API
- * - a mutex object must not be initialized via memset or copying
- * - task may not exit with mutex held
- * - memory areas where held locks reside must not be freed
- * - held mutexes must not be reinitialized
- * - mutexes may not be used in hardware or software interrupt
- * contexts such as tasklets and timers
- *
- * These semantics are fully enforced when DEBUG_MUTEXES is
- * enabled. Furthermore, besides enforcing the above rules, the mutex
- * debugging code also implements a number of additional features
- * that make lock debugging easier and faster:
- *
- * - uses symbolic names of mutexes, whenever they are printed in debug output
- * - point-of-acquire tracking, symbolic lookup of function names
- * - list of all locks held in the system, printout of them
- * - owner tracking
- * - detects self-recursing locks and prints out all relevant info
- * - detects multi-task circular deadlocks and prints out all affected
- * locks and tasks (and only those tasks)
- */
- struct mutex {
- atomic_long_t owner;
- raw_spinlock_t wait_lock;
- #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_queue osq; /* Spinner MCS lock */
- #endif
- struct list_head wait_list;
- #ifdef CONFIG_DEBUG_MUTEXES
- void *magic;
- #endif
- #ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
- #endif
- };
-
#ifdef CONFIG_DEBUG_MUTEXES
#define __DEBUG_MUTEX_INITIALIZER(lockname) \
/*
* Preempt-RT variant based on rtmutexes.
*/
- #include <linux/rtmutex.h>
-
- struct mutex {
- struct rt_mutex_base rtmutex;
- #ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
- #endif
- };
#define __MUTEX_INITIALIZER(mutexname) \
{ \
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
-DEFINE_FREE(mutex, struct mutex *, if (_T) mutex_unlock(_T))
+DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
+DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0)
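/*
 * Hedged usage sketch of the conditional guards defined above, assuming the
 * scoped_cond_guard() helper from <linux/cleanup.h>; the function and the
 * data it touches are illustrative only:
 */
static inline int example_bump_counter(struct mutex *lock, int *counter)
{
	scoped_cond_guard(mutex_intr, return -EINTR, lock) {
		(*counter)++;
	}
	return 0;
}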
#endif /* __LINUX_MUTEX_H */
#include <uapi/linux/sched.h>
#include <asm/current.h>
-
- #include <linux/pid.h>
- #include <linux/sem.h>
+ #include <asm/processor.h>
+ #include <linux/thread_info.h>
+ #include <linux/preempt.h>
+ #include <linux/cpumask.h>
+
+ #include <linux/cache.h>
+ #include <linux/irqflags_types.h>
+ #include <linux/smp_types.h>
+ #include <linux/pid_types.h>
+ #include <linux/sem_types.h>
#include <linux/shm.h>
#include <linux/kmsan_types.h>
- #include <linux/mutex.h>
- #include <linux/plist.h>
- #include <linux/hrtimer.h>
- #include <linux/irqflags.h>
- #include <linux/seccomp.h>
- #include <linux/nodemask.h>
- #include <linux/rcupdate.h>
- #include <linux/refcount.h>
+ #include <linux/mutex_types.h>
+ #include <linux/plist_types.h>
+ #include <linux/hrtimer_types.h>
+ #include <linux/timer_types.h>
+ #include <linux/seccomp_types.h>
+ #include <linux/nodemask_types.h>
+ #include <linux/refcount_types.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
- #include <linux/syscall_user_dispatch.h>
+ #include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
- #include <linux/posix-timers.h>
- #include <linux/rseq.h>
- #include <linux/seqlock.h>
+ #include <linux/posix-timers_types.h>
+ #include <linux/restart_block.h>
+ #include <uapi/linux/rseq.h>
+ #include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
#include <linux/livepatch_sched.h>
+ #include <linux/uidgid_types.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct root_domain;
struct rq;
struct sched_attr;
+struct sched_dl_entity;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct task_struct;
struct user_event_mm;
/*
u32 inv_weight;
};
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma: the Exponential Weighted Moving Average (EWMA)
- * utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task: the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
- unsigned int enqueued;
- unsigned int ewma;
-#define UTIL_EST_WEIGHT_SHIFT 2
-#define UTIL_AVG_UNCHANGED 0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
/*
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
unsigned long load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
- struct util_est util_est;
+ unsigned int util_est;
} ____cacheline_aligned;
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT 2
+#define UTIL_AVG_UNCHANGED 0x80000000
+
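/*
 * Hedged sketch of the MSB encoding described above (the helpers are
 * illustrative; the real flag handling lives in kernel/sched/fair.c):
 */
static inline unsigned int example_util_est_mark(unsigned int est)
{
	return est | UTIL_AVG_UNCHANGED;	/* flag mapped in at dequeue time */
}

static inline unsigned int example_util_est_value(unsigned int est)
{
	return est & ~UTIL_AVG_UNCHANGED;	/* strip the flag to read the estimate */
}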
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 block_max;
s64 sum_block_runtime;
- u64 exec_max;
+ s64 exec_max;
u64 slice_max;
u64 nr_migrations_cold;
struct load_weight load;
struct rb_node run_node;
u64 deadline;
- u64 min_deadline;
+ u64 min_vruntime;
struct list_head group_node;
unsigned int on_rq;
#endif
} __randomize_layout;
+typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+
struct sched_dl_entity {
struct rb_node rb_node;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
+ unsigned int dl_server : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
* timer is needed to decrease the active utilization at the correct
* time.
*/
- struct hrtimer inactive_timer;
+ struct hrtimer inactive_timer;
+
+ /*
+ * Bits for DL-server functionality. Also see the comment near
+ * dl_server_update().
+ *
+ * @rq the runqueue this server is for
+ *
+ * @server_has_tasks() returns true if @server_pick returns a
+ * runnable task.
+ */
+ struct rq *rq;
+ dl_server_has_tasks_f server_has_tasks;
+ dl_server_pick_f server_pick;
#ifdef CONFIG_RT_MUTEXES
/*
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
+ struct sched_dl_entity *dl_server;
const struct sched_class *sched_class;
#ifdef CONFIG_SCHED_CORE
*/
};
- static inline struct pid *task_pid(struct task_struct *task)
- {
- return task->thread_pid;
- }
-
- /*
- * the helpers to get the task's different pids as they are seen
- * from various namespaces
- *
- * task_xid_nr() : global id, i.e. the id seen from the init namespace;
- * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
- * current.
- * task_xid_nr_ns() : id seen from the ns specified;
- *
- * see also pid_nr() etc in include/linux/pid.h
- */
- pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
-
- static inline pid_t task_pid_nr(struct task_struct *tsk)
- {
- return tsk->pid;
- }
-
- static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
- }
-
- static inline pid_t task_pid_vnr(struct task_struct *tsk)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
- }
-
-
- static inline pid_t task_tgid_nr(struct task_struct *tsk)
- {
- return tsk->tgid;
- }
-
- /**
- * pid_alive - check that a task structure is not stale
- * @p: Task structure to be checked.
- *
- * Test if a process is not yet dead (at most zombie state)
- * If pid_alive fails, then pointers within the task structure
- * can be stale and must not be dereferenced.
- *
- * Return: 1 if the process is alive. 0 otherwise.
- */
- static inline int pid_alive(const struct task_struct *p)
- {
- return p->thread_pid != NULL;
- }
-
- static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
- }
-
- static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
- }
-
-
- static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
- }
-
- static inline pid_t task_session_vnr(struct task_struct *tsk)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
- }
-
- static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
- }
-
- static inline pid_t task_tgid_vnr(struct task_struct *tsk)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
- }
-
- static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
- {
- pid_t pid = 0;
-
- rcu_read_lock();
- if (pid_alive(tsk))
- pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
- rcu_read_unlock();
-
- return pid;
- }
-
- static inline pid_t task_ppid_nr(const struct task_struct *tsk)
- {
- return task_ppid_nr_ns(tsk, &init_pid_ns);
- }
-
- /* Obsolete, do not use: */
- static inline pid_t task_pgrp_nr(struct task_struct *tsk)
- {
- return task_pgrp_nr_ns(tsk, &init_pid_ns);
- }
-
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
return task_index_to_char(task_state_index(tsk));
}
- /**
- * is_global_init - check if a task structure is init. Since init
- * is free to have sub-threads we need to check tgid.
- * @tsk: Task structure to be checked.
- *
- * Check if a task structure is the first user space task the kernel created.
- *
- * Return: 1 if the task structure is init. 0 otherwise.
- */
- static inline int is_global_init(struct task_struct *tsk)
- {
- return task_tgid_nr(tsk) == 1;
- }
-
extern struct pid *cad_pid;
/*
void yield(void);
union thread_union {
-#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
struct task_struct task;
-#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
__cond_resched_rwlock_write(lock); \
})
- static inline void cond_resched_rcu(void)
- {
- #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
- rcu_read_unlock();
- cond_resched();
- rcu_read_lock();
- #endif
- }
-
#ifdef CONFIG_PREEMPT_DYNAMIC
extern bool preempt_model_none(void);
return preempt_model_full() || preempt_model_rt();
}
- /*
- * Does a critical section need to be broken due to another
- * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
- * but a general need for low latency)
- */
- static inline int spin_needbreak(spinlock_t *lock)
- {
- #ifdef CONFIG_PREEMPTION
- return spin_is_contended(lock);
- #else
- return 0;
- #endif
- }
-
- /*
- * Check if a rwlock is contended.
- * Returns non-zero if there is another task waiting on the rwlock.
- * Returns zero if the lock is not contended or the system / underlying
- * rwlock implementation does not support contention detection.
- * Technically does not depend on CONFIG_PREEMPTION, but a general need
- * for low latency.
- */
- static inline int rwlock_needbreak(rwlock_t *lock)
- {
- #ifdef CONFIG_PREEMPTION
- return rwlock_is_contended(lock);
- #else
- return 0;
- #endif
- }
-
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);
+ #include <linux/spinlock.h>
+
/*
* In order to reduce various lock holder preemption latencies provide an
* interface to see if a vCPU is currently running or not.
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
- #ifdef CONFIG_RSEQ
-
- /*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
- enum rseq_event_mask_bits {
- RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
- RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
- RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
- };
-
- enum rseq_event_mask {
- RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
- RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
- RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
- };
-
- static inline void rseq_set_notify_resume(struct task_struct *t)
- {
- if (t->rseq)
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
- }
-
- void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
-
- static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
- {
- if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
- }
-
- static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
- {
- preempt_disable();
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
- preempt_enable();
- rseq_handle_notify_resume(ksig, regs);
- }
-
- /* rseq_preempt() requires preemption to be disabled. */
- static inline void rseq_preempt(struct task_struct *t)
- {
- __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
- }
-
- /* rseq_migrate() requires preemption to be disabled. */
- static inline void rseq_migrate(struct task_struct *t)
- {
- __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
- }
-
- /*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */
- static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
- {
- if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
- } else {
- t->rseq = current->rseq;
- t->rseq_len = current->rseq_len;
- t->rseq_sig = current->rseq_sig;
- t->rseq_event_mask = current->rseq_event_mask;
- }
- }
-
- static inline void rseq_execve(struct task_struct *t)
- {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
- }
-
- #else
-
- static inline void rseq_set_notify_resume(struct task_struct *t)
- {
- }
- static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
- {
- }
- static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
- {
- }
- static inline void rseq_preempt(struct task_struct *t)
- {
- }
- static inline void rseq_migrate(struct task_struct *t)
- {
- }
- static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
- {
- }
- static inline void rseq_execve(struct task_struct *t)
- {
- }
-
- #endif
-
- #ifdef CONFIG_DEBUG_RSEQ
-
- void rseq_syscall(struct pt_regs *regs);
-
- #else
-
- static inline void rseq_syscall(struct pt_regs *regs)
- {
- }
-
- #endif
-
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
+ #include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>
* This is required every time the blocked sigset_t changes.
* callers must hold sighand->siglock.
*/
-extern void recalc_sigpending_and_wake(struct task_struct *t);
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);
#define while_each_thread(g, t) \
while ((t = next_thread(t)) != g)
+#define for_other_threads(p, t) \
+ for (t = p; (t = next_thread(t)) != p; )
+
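/*
 * Hedged usage sketch for for_other_threads(): walk every thread of @p
 * except @p itself. As with while_each_thread(), the caller is expected to
 * hold the RCU read lock or tasklist_lock; inspect_thread() is illustrative:
 *
 *	struct task_struct *t;
 *
 *	rcu_read_lock();
 *	for_other_threads(p, t)
 *		inspect_thread(t);
 *	rcu_read_unlock();
 */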
#define __for_each_thread(signal, t) \
list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
lockdep_is_held(&tasklist_lock))
* functionality:
*/
+ #include <linux/rcupdate.h>
+ #include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
spin_unlock(&p->alloc_lock);
}
+DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
+
#endif /* _LINUX_SCHED_TASK_H */
return raw_spin_is_contended(&lock->rlock);
}
+ /*
+ * Does a critical section need to be broken due to another
+ * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
+ * but a general need for low latency)
+ */
+ static inline int spin_needbreak(spinlock_t *lock)
+ {
+ #ifdef CONFIG_PREEMPTION
+ return spin_is_contended(lock);
+ #else
+ return 0;
+ #endif
+ }
+
+ /*
+ * Check if a rwlock is contended.
+ * Returns non-zero if there is another task waiting on the rwlock.
+ * Returns zero if the lock is not contended or the system / underlying
+ * rwlock implementation does not support contention detection.
+ * Technically does not depend on CONFIG_PREEMPTION, but a general need
+ * for low latency.
+ */
+ static inline int rwlock_needbreak(rwlock_t *lock)
+ {
+ #ifdef CONFIG_PREEMPTION
+ return rwlock_is_contended(lock);
+ #else
+ return 0;
+ #endif
+ }
+
#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock)
#else /* !CONFIG_PREEMPT_RT */
raw_spin_lock(_T->lock),
raw_spin_unlock(_T->lock))
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock))
+
DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
raw_spin_unlock(_T->lock))
raw_spin_lock_irq(_T->lock),
raw_spin_unlock_irq(_T->lock))
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock))
+
DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
raw_spin_lock_irqsave(_T->lock, _T->flags),
raw_spin_unlock_irqrestore(_T->lock, _T->flags),
unsigned long flags)
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try,
+ raw_spin_trylock_irqsave(_T->lock, _T->flags))
+
DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
spin_lock(_T->lock),
spin_unlock(_T->lock))
+DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock))
+
DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
spin_lock_irq(_T->lock),
spin_unlock_irq(_T->lock))
+DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
+ spin_trylock_irq(_T->lock))
+
DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
spin_lock_irqsave(_T->lock, _T->flags),
spin_unlock_irqrestore(_T->lock, _T->flags),
unsigned long flags)
+DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
+ spin_trylock_irqsave(_T->lock, _T->flags))
+
+DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
+ read_lock(_T->lock),
+ read_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
+ read_lock_irq(_T->lock),
+ read_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
+ read_lock_irqsave(_T->lock, _T->flags),
+ read_unlock_irqrestore(_T->lock, _T->flags),
+ unsigned long flags)
+
+DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
+ write_lock(_T->lock),
+ write_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
+ write_lock_irq(_T->lock),
+ write_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
+ write_lock_irqsave(_T->lock, _T->flags),
+ write_unlock_irqrestore(_T->lock, _T->flags),
+ unsigned long flags)
+
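/*
 * Hedged usage sketch of the rwlock guards defined above, assuming guard()
 * from <linux/cleanup.h>; the functions and data are illustrative only:
 */
static inline void example_publish(rwlock_t *lock, int *shared, int val)
{
	guard(write_lock_irq)(lock);	/* write_unlock_irq() runs on scope exit */
	*shared = val;
}

static inline int example_snapshot(rwlock_t *lock, const int *shared)
{
	guard(read_lock)(lock);		/* read_unlock() runs on scope exit */
	return *shared;
}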
#undef __LINUX_INSIDE_SPINLOCK_H
#endif /* __LINUX_SPINLOCK_H */
* to detect when we overlook these differences.
*
*/
- #include <linux/types.h>
+ #include <linux/uidgid_types.h>
#include <linux/highuid.h>
struct user_namespace;
extern struct user_namespace init_user_ns;
+struct uid_gid_map;
- typedef struct {
- uid_t val;
- } kuid_t;
-
-
- typedef struct {
- gid_t val;
- } kgid_t;
-
#define KUIDT_INIT(value) (kuid_t){ value }
#define KGIDT_INIT(value) (kgid_t){ value }
return from_kgid(ns, gid) != (gid_t) -1;
}
+u32 map_id_down(struct uid_gid_map *map, u32 id);
+u32 map_id_up(struct uid_gid_map *map, u32 id);
+
#else
static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
return gid_valid(gid);
}
+static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+ return id;
+}
+
+static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+ return id;
+}
#endif /* CONFIG_USER_NS */
#endif /* _LINUX_UIDGID_H */
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>
-
- struct workqueue_struct;
-
- struct work_struct;
- typedef void (*work_func_t)(struct work_struct *work);
- void delayed_work_timer_fn(struct timer_list *t);
+ #include <linux/workqueue_types.h>
/*
* The first word is the work queue pointer and the flags rolled into
#define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1)
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
- struct work_struct {
- atomic_long_t data;
- struct list_head entry;
- work_func_t func;
- #ifdef CONFIG_LOCKDEP
- struct lockdep_map lockdep_map;
- #endif
- };
-
#define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT() \
ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs);
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
+extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);
extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work);
#include <linux/audit.h>
#include <linux/numa.h>
#include <linux/scs.h>
+ #include <linux/plist.h>
#include <linux/uaccess.h>
};
#ifdef CONFIG_SHADOW_CALL_STACK
-unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)]
- __init_task_data = {
+unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
};
#endif
* Set up the first task table, touch at your own risk!. Base=0,
* limit=0x1fffff (=2MB)
*/
-struct task_struct init_task
-#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
- __init_task_data
-#endif
- __aligned(L1_CACHE_BYTES)
-= {
+struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
#ifdef CONFIG_THREAD_INFO_IN_TASK
.thread_info = INIT_THREAD_INFO(init_task),
.stack_refcount = REFCOUNT_INIT(1),
#include <linux/async.h>
#include <linux/atomic.h>
- #include <linux/ktime.h>
#include <linux/export.h>
- #include <linux/wait.h>
+ #include <linux/ktime.h>
+ #include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
+ #include <linux/wait.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
wake_up(&async_done);
}
+static async_cookie_t __async_schedule_node_domain(async_func_t func,
+ void *data, int node,
+ struct async_domain *domain,
+ struct async_entry *entry)
+{
+ async_cookie_t newcookie;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
+ entry->data = data;
+ entry->domain = domain;
+
+ spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
+ newcookie = entry->cookie = next_cookie++;
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
+ atomic_inc(&entry_count);
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* schedule for execution */
+ queue_work_node(node, system_unbound_wq, &entry->work);
+
+ return newcookie;
+}
+
/**
* async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
func(data, newcookie);
return newcookie;
}
- INIT_LIST_HEAD(&entry->domain_list);
- INIT_LIST_HEAD(&entry->global_list);
- INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = func;
- entry->data = data;
- entry->domain = domain;
-
- spin_lock_irqsave(&async_lock, flags);
-
- /* allocate cookie and queue */
- newcookie = entry->cookie = next_cookie++;
-
- list_add_tail(&entry->domain_list, &domain->pending);
- if (domain->registered)
- list_add_tail(&entry->global_list, &async_global_pending);
-
- atomic_inc(&entry_count);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* schedule for execution */
- queue_work_node(node, system_unbound_wq, &entry->work);
- return newcookie;
+ return __async_schedule_node_domain(func, data, node, domain, entry);
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);
}
EXPORT_SYMBOL_GPL(async_schedule_node);
+/**
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
+ *
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
+ * that will run the function synchronously then.
+ */
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
+{
+ struct async_entry *entry;
+
+ entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+ /* Give up if there is no memory or too much work. */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ return false;
+ }
+
+ __async_schedule_node_domain(func, dev, dev_to_node(dev),
+ &async_dfl_domain, entry);
+ return true;
+}
+
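/*
 * Hedged usage sketch (caller and callback are illustrative, not taken from
 * the tree): try the asynchronous variant and fall back to calling the
 * function directly if scheduling fails.
 */
static void example_resume_fn(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	dev_info(dev, "resumed (cookie %llu)\n", cookie);
}

static void example_resume(struct device *dev)
{
	if (!async_schedule_dev_nocall(example_resume_fn, dev))
		example_resume_fn(dev, 0);	/* run synchronously instead */
}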
/**
* async_synchronize_full - synchronize all asynchronous function calls
*
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
-
#include <linux/uaccess.h>
+
+ #include <uapi/linux/wait.h>
+
#include <asm/unistd.h>
#include <asm/mmu_context.h>
ptrace_event(PTRACE_EVENT_EXIT, code);
user_events_exit(tsk);
- validate_creds_for_do_exit(tsk);
-
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
- validate_creds_for_do_exit(tsk);
exit_task_stack_account(tsk);
check_stack_usage();
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+ #include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
+ #include <linux/rseq.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
}
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
-#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
-
-static int alloc_thread_stack_node(struct task_struct *tsk, int node)
-{
- unsigned long *stack;
-
- stack = arch_alloc_thread_stack_node(tsk, node);
- tsk->stack = stack;
- return stack ? 0 : -ENOMEM;
-}
-
-static void free_thread_stack(struct task_struct *tsk)
-{
- arch_free_thread_stack(tsk);
- tsk->stack = NULL;
-}
-
-#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
- VMA_ITERATOR(old_vmi, oldmm, 0);
VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
goto out;
khugepaged_fork(mm, oldmm);
- retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
- if (retval)
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
goto out;
mt_clear_in_rcu(vmi.mas.tree);
- for_each_vma(old_vmi, mpnt) {
+ for_each_vma(vmi, mpnt) {
struct file *file;
vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);
- /* Link the vma into the MT */
- if (vma_iter_bulk_store(&vmi, tmp))
- goto fail_nomem_vmi_store;
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
- if (retval)
+ if (retval) {
+ mpnt = vma_next(&vmi);
goto loop_out;
+ }
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
- if (!retval)
+ if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
+ } else if (mpnt) {
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ }
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
uprobe_end_dup_mmap();
return retval;
-fail_nomem_vmi_store:
- unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
int arch_task_struct_size __read_mostly;
#endif
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
else
*offset += offsetof(struct task_struct, thread);
}
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
-#endif
/* do the arch specific task caches init */
arch_task_cache_init();
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
- unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
cgroup_enter_frozen();
get_task_struct(p);
}
- if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/rcuwait_api.h>
+ #include <linux/rseq.h>
#include <linux/sched/wake_q.h>
#include <linux/scs.h>
#include <linux/slab.h>
if (cpu == smp_processor_id())
return;
+ /*
+ * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+ * part of the idle loop. This forces an exit from the idle loop
+ * and a round trip to schedule(). Now this could be optimized
+ * because a simple new idle loop iteration is enough to
+ * re-evaluate the next tick. Provided some re-ordering of tick
+ * nohz functions that would need to follow TIF_NR_POLLING
+ * clearing:
+ *
+ * - On most archs, a simple fetch_or on ti::flags with a
+ * "0" value would be enough to know if an IPI needs to be sent.
+ *
+ * - x86 needs to perform a last need_resched() check between
+ * monitor and mwait which doesn't take timers into account.
+ * There a dedicated TIF_TIMER flag would be required to
+ * fetch_or here and be checked along with TIF_NEED_RESCHED
+ * before mwait().
+ *
+ * However, remote timer enqueue is not such a frequent event
+ * and testing of the above solutions didn't appear to report
+ * much benefit.
+ */
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
enqueue_task(rq, p, flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
dequeue_task(rq, p, flags);
}
rq->idle_stamp = 0;
}
#endif
+
+ p->dl_server = NULL;
}
/*
memset(&p->stats, 0, sizeof(p->stats));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- init_dl_inactive_task_timer(&p->dl);
- __dl_clear_params(p);
+ init_dl_entity(&p->dl);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p = pick_next_task_idle(rq);
}
+ /*
+ * This is the fast path; it cannot be a DL server pick;
+ * therefore even if @p == @prev, ->dl_server must be NULL.
+ */
+ if (p->dl_server)
+ p->dl_server = NULL;
+
return p;
}
restart:
put_prev_task_balance(rq, prev, rf);
+ /*
+ * We've updated @prev and no longer need the server link, clear it.
+ * Must be done before ->pick_next_task() because that can (re)set
+ * ->dl_server.
+ */
+ if (prev->dl_server)
+ prev->dl_server = NULL;
+
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p)
+ unsigned long *min,
+ unsigned long *max)
{
- unsigned long dl_util, util, irq, max;
+ unsigned long util, irq, scale;
struct rq *rq = cpu_rq(cpu);
- max = arch_scale_cpu_capacity(cpu);
-
- if (!uclamp_is_used() &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
+ scale = arch_scale_cpu_capacity(cpu);
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time from the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
/*
* Because the time spent on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
+ util += cpu_util_dl(rq);
/*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
*/
- if (util + dl_util >= max)
- return max;
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
+ if (util >= scale)
+ return scale;
/*
* There is still idle time; further improve the number by using the
*              max - irq
*   U' = irq + --------- * U
*                 max
*/
- util = scale_irq_capacity(util, irq, max);
+ util = scale_irq_capacity(util, irq, scale);
util += irq;
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
+ return min(scale, util);
}
unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
#endif /* CONFIG_SMP */
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
+ #include <linux/rcupdate_wait.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
* ->i_pages lock (try_to_unmap_one)
* ->lruvec->lru_lock (follow_page->mark_page_accessed)
* ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
- * ->private_lock (page_remove_rmap->set_page_dirty)
- * ->i_pages lock (page_remove_rmap->set_page_dirty)
- * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
- * ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock)
+ * ->private_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
+ * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->block_dirty_folio)
static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
struct wait_queue_head *q = folio_waitqueue(folio);
- int ret = 0;
+ int ret;
wait->folio = folio;
wait->bit_nr = PG_locked;
if (nr) {
folio = fbatch->folios[nr - 1];
- *start = folio->index + folio_nr_pages(folio);
+ *start = folio_next_index(folio);
}
out:
rcu_read_unlock();
goto put_folios;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
+ /*
+ * Pairs with a barrier in
+ * block_write_end()->mark_buffer_dirty() or other page
+ * dirtying routines like iomap_write_end() to ensure
+ * changes to page contents are visible before we see
+ * increased inode size.
+ */
+ smp_rmb();
+
/*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
+ #include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/ksm.h>
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_flags_enabled()) {
- if (hugepage_vma_check(vma, vm_flags, false, false, true))
+ if (thp_vma_allowable_order(vma, vm_flags, false, false, true,
+ PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
}
folio_putback_lru(folio);
}
-static void release_pte_page(struct page *page)
-{
- release_pte_folio(page_folio(page));
-}
-
static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct list_head *compound_pagelist)
{
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ struct folio *src_folio;
struct page *src_page;
struct page *tmp;
pte_t *_pte;
}
} else {
src_page = pte_page(pteval);
- if (!PageCompound(src_page))
- release_pte_page(src_page);
+ src_folio = page_folio(src_page);
+ if (!folio_test_large(src_folio))
+ release_pte_folio(src_folio);
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
- * inside page_remove_rmap().
+ * inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page, vma, false);
+ folio_remove_rmap_pte(src_folio, src_page, vma);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
if (!vma)
return SCAN_VMA_NULL;
- if (!transhuge_vma_suitable(vma, address))
+ if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
- cc->is_khugepaged))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
+ cc->is_khugepaged, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
* remapped to file after khugepaged reaquired the mmap_lock.
*
- * hugepage_vma_check may return true for qualified file
+ * thp_vma_allowable_order may return true for qualified file
* vmas.
*/
if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
+ struct folio *folio;
struct page *hpage;
spinlock_t *pmd_ptl, *pte_ptl;
int result = SCAN_FAIL;
* Prevent all access to pagetables with the exception of
* gup_fast later handled by the ptep_clear_flush and the VM
* handled by the anon_vma lock + PG_lock.
+ *
+ * UFFDIO_MOVE is prevented to race as well thanks to the
+ * mmap_lock.
*/
mmap_write_lock(mm);
result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
if (unlikely(result != SCAN_SUCCEED))
goto out_up_write;
+ folio = page_folio(hpage);
/*
- * spin_lock() below is not the equivalent of smp_wmb(), but
- * the smp_wmb() inside __SetPageUptodate() can be reused to
- * avoid the copy_huge_page writes to become visible after
- * the set_pmd_at() write.
+ * The smp_wmb() inside __folio_mark_uptodate() ensures the
+ * copy_huge_page writes become visible before the set_pmd_at()
+ * write.
*/
- __SetPageUptodate(hpage);
+ __folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(hpage, vma, address);
- lru_cache_add_inactive_or_unevictable(hpage, vma);
+ folio_add_new_anon_rmap(folio, vma, address);
+ folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
* analogously elide sysfs THP settings here.
*/
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
+ PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
ptep_clear(mm, addr, pte);
- page_remove_rmap(page, vma, false);
+ folio_remove_rmap_pte(folio, page, vma);
nr_ptes++;
}
xas_lock_irq(&xas);
}
- nr = thp_nr_pages(hpage);
+ folio = page_folio(hpage);
+ nr = folio_nr_pages(folio);
if (is_shmem)
- __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
else
- __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
if (nr_none) {
- __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none);
}
/*
* Mark hpage as uptodate before inserting it into the page cache so
* that it isn't mistaken for a fallocated but unwritten page.
*/
- folio = page_folio(hpage);
folio_mark_uptodate(folio);
folio_ref_add(folio, HPAGE_PMD_NR - 1);
/* Join all the small entries into a single multi-index entry. */
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
- xas_store(&xas, hpage);
+ xas_store(&xas, folio);
WARN_ON_ONCE(xas_error(&xas));
xas_unlock_irq(&xas);
retract_page_tables(mapping, start);
if (cc && !cc->is_khugepaged)
result = SCAN_PTE_MAPPED_HUGEPAGE;
- unlock_page(hpage);
+ folio_unlock(folio);
/*
* The collapse has succeeded, so free the old pages.
progress++;
break;
}
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
+ true, PMD_ORDER)) {
skip:
progress++;
continue;
while (true) {
cond_resched();
- if (unlikely(kthread_should_stop() || try_to_freeze()))
+ if (unlikely(kthread_should_stop()))
break;
spin_lock(&khugepaged_mm_lock);
*prev = vma;
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
+ PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>
+ #include <linux/rcupdate_wait.h>
#include <linux/uaccess.h>
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
- truncate_inode_folio(mapping, folio);
+
+ if (!folio_test_large(folio)) {
+ truncate_inode_folio(mapping, folio);
+ } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
+ /*
+ * If we split a page, reset the loop so
+ * that we pick up the new sub pages.
+ * Otherwise the THP was entirely
+ * dropped or the target range was
+ * zeroed, so just continue the loop as
+ * is.
+ */
+ if (!folio_test_large(folio)) {
+ folio_unlock(folio);
+ index = start;
+ break;
+ }
+ }
}
folio_unlock(folio);
}
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- swap_writepage(&folio->page, wbc);
- return 0;
+ return swap_writepage(&folio->page, wbc);
}
mutex_unlock(&shmem_swaplist_mutex);
{
struct mempolicy *mpol;
pgoff_t ilx;
- struct page *page;
+ struct folio *folio;
mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
- page = swap_cluster_readahead(swap, gfp, mpol, ilx);
+ folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
mpol_cond_put(mpol);
- if (!page)
- return NULL;
- return page_folio(page);
+ return folio;
}
/*
}
/* Keep the page in page cache instead of truncating it */
-static int shmem_error_remove_page(struct address_space *mapping,
- struct page *page)
+static int shmem_error_remove_folio(struct address_space *mapping,
+ struct folio *folio)
{
return 0;
}
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
#endif
- .error_remove_page = shmem_error_remove_page,
+ .error_remove_folio = shmem_error_remove_folio,
};
EXPORT_SYMBOL(shmem_aops);
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/zswap.h>
+ #include <linux/plist.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
BUG();
}
-sector_t swap_page_sector(struct page *page)
+sector_t swap_folio_sector(struct folio *folio)
{
- struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_info_struct *sis = swp_swap_info(folio->swap);
struct swap_extent *se;
sector_t sector;
pgoff_t offset;
- offset = __page_file_index(page);
+ offset = swp_offset(folio->swap);
se = offset_to_swap_extent(sis, offset);
sector = se->start_block + (offset - se->start_page);
return sector << (PAGE_SHIFT - 9);
do {
page = list_next_entry(page, lru);
- map = kmap_atomic(page);
+ map = kmap_local_page(page);
tmp_count = map[offset];
- kunmap_atomic(map);
+ kunmap_local(map);
count += (tmp_count & ~COUNT_CONTINUED) * n;
n *= (SWAP_CONT_MAX + 1);
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct folio *folio)
{
- struct page *page = folio_file_page(folio, swp_offset(entry));
- struct page *swapcache;
+ struct page *page;
+ struct folio *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte, old_pte;
- bool hwpoisoned = PageHWPoison(page);
+ bool hwpoisoned = false;
int ret = 1;
- swapcache = page;
- page = ksm_might_need_to_copy(page, vma, addr);
- if (unlikely(!page))
+ swapcache = folio;
+ folio = ksm_might_need_to_copy(folio, vma, addr);
+ if (unlikely(!folio))
return -ENOMEM;
- else if (unlikely(PTR_ERR(page) == -EHWPOISON))
+ else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
+ hwpoisoned = true;
+ folio = swapcache;
+ }
+
+ page = folio_file_page(folio, swp_offset(entry));
+ if (PageHWPoison(page))
hwpoisoned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
old_pte = ptep_get(pte);
- if (unlikely(hwpoisoned || !PageUptodate(page))) {
+ if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
swp_entry_t swp_entry;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
if (hwpoisoned) {
- swp_entry = make_hwpoison_entry(swapcache);
- page = swapcache;
+ swp_entry = make_hwpoison_entry(page);
} else {
swp_entry = make_poisoned_swp_entry();
}
* when reading from swap. This metadata may be indexed by swap entry
* so this must be called before swap_free().
*/
- arch_swap_restore(entry, page_folio(page));
-
- /* See do_swap_page() */
- BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
- BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+ arch_swap_restore(entry, folio);
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
- get_page(page);
- if (page == swapcache) {
+ folio_get(folio);
+ if (folio == swapcache) {
rmap_t rmap_flags = RMAP_NONE;
/*
- * See do_swap_page(): PageWriteback() would be problematic.
- * However, we do a wait_on_page_writeback() just before this
- * call and have the page locked.
+ * See do_swap_page(): writeback would be problematic.
+ * However, we do a folio_wait_writeback() just before this
+ * call and have the folio locked.
*/
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
if (pte_swp_exclusive(old_pte))
rmap_flags |= RMAP_EXCLUSIVE;
- page_add_anon_rmap(page, vma, addr, rmap_flags);
+ folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_new_anon_rmap(folio, vma, addr);
+ folio_add_lru_vma(folio, vma);
}
new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
if (pte_swp_soft_dirty(old_pte))
out:
if (pte)
pte_unmap_unlock(pte, ptl);
- if (page != swapcache) {
- unlock_page(page);
- put_page(page);
+ if (folio != swapcache) {
+ folio_unlock(folio);
+ folio_put(folio);
}
return ret;
}
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
* onto a contiguous range of disk blocks. An rbtree of swap extents is
- * built at swapon time and is then used at swap_writepage/swap_readpage
+ * built at swapon time and is then used at swap_writepage/swap_read_folio
* time for locating where on disk a page belongs.
*
* If the swapfile is an S_ISBLK block device, a single extent is installed.
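/*
 * Editorial worked example, not part of the patch: take a (hypothetical)
 * extent covering swap pages starting at start_page = 100 that begins at
 * disk block start_block = 5000. For a folio whose swap offset is 130,
 * swap_folio_sector() earlier in this series computes:
 *
 *	sector = start_block + (offset - start_page)
 *	       = 5000 + (130 - 100) = 5030		(PAGE_SIZE blocks)
 *	return 5030 << (PAGE_SHIFT - 9);		(512-byte sectors)
 */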
return swap_type_to_swap_info(swp_type(entry));
}
-struct swap_info_struct *page_swap_info(struct page *page)
-{
- swp_entry_t entry = page_swap_entry(page);
- return swp_swap_info(entry);
-}
-
/*
* out-of-line methods to avoid include hell.
*/
struct address_space *swapcache_mapping(struct folio *folio)
{
- return page_swap_info(&folio->page)->swap_file->f_mapping;
+ return swp_swap_info(folio->swap)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(swapcache_mapping);
if (!(count & COUNT_CONTINUED))
goto out_unlock_cont;
- map = kmap_atomic(list_page) + offset;
+ map = kmap_local_page(list_page) + offset;
count = *map;
- kunmap_atomic(map);
+ kunmap_local(map);
/*
* If this continuation count now has some space in it,
spin_lock(&si->cont_lock);
offset &= ~PAGE_MASK;
page = list_next_entry(head, lru);
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
goto init_map; /* jump over SWAP_CONT_MAX checks */
* Think of how you add 1 to 999
*/
while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
- kunmap_atomic(map);
+ kunmap_local(map);
page = list_next_entry(page, lru);
BUG_ON(page == head);
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
}
if (*map == SWAP_CONT_MAX) {
- kunmap_atomic(map);
+ kunmap_local(map);
page = list_next_entry(page, lru);
if (page == head) {
ret = false; /* add count continuation */
goto out;
}
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
init_map: *map = 0; /* we didn't zero the page */
}
*map += 1;
- kunmap_atomic(map);
+ kunmap_local(map);
while ((page = list_prev_entry(page, lru)) != head) {
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
*map = COUNT_CONTINUED;
- kunmap_atomic(map);
+ kunmap_local(map);
}
ret = true; /* incremented */
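/*
 * Editorial worked example, not part of the patch: the "add 1 to 999"
 * comment above is the carry rule for continuation pages. Each page in
 * the chain stores one count byte per swap entry at the same offset, and
 * a byte saturates at SWAP_CONT_MAX before carrying. If every page in a
 * three-page chain already holds SWAP_CONT_MAX at this offset:
 *
 *	before:	[MAX] [MAX] [MAX]
 *	+1:	the walk finds no room, returns false, and the caller
 *		allocates a fresh continuation page, the analogue of
 *		999 + 1 needing a new leading digit.
 */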
*/
BUG_ON(count != COUNT_CONTINUED);
while (*map == COUNT_CONTINUED) {
- kunmap_atomic(map);
+ kunmap_local(map);
page = list_next_entry(page, lru);
BUG_ON(page == head);
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
}
BUG_ON(*map == 0);
*map -= 1;
if (*map == 0)
count = 0;
- kunmap_atomic(map);
+ kunmap_local(map);
while ((page = list_prev_entry(page, lru)) != head) {
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
*map = SWAP_CONT_MAX | count;
count = COUNT_CONTINUED;
- kunmap_atomic(map);
+ kunmap_local(map);
}
ret = count == COUNT_CONTINUED;
}
#include <linux/export.h>
#include <linux/msg.h>
#include <linux/shm.h>
+ #include <uapi/linux/shm.h>
#include <linux/bpf.h>
#include <linux/kernfs.h>
#include <linux/stringhash.h> /* for hashlen_string() */
#include <linux/fsnotify.h>
#include <linux/fanotify.h>
#include <linux/io_uring.h>
+#include <uapi/linux/lsm.h>
#include "avc.h"
#include "objsec.h"
struct inode_security_struct *isec;
u32 sid;
- validate_creds(cred);
-
if (unlikely(IS_PRIVATE(inode)))
return 0;
new_tsec->keycreate_sid = 0;
new_tsec->sockcreate_sid = 0;
+ /*
+ * Before policy is loaded, label any task outside kernel space
+ * as SECINITSID_INIT, so that any userspace tasks surviving from
+ * early boot end up with a label different from SECINITSID_KERNEL
+ * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL).
+ */
+ if (!selinux_initialized()) {
+ new_tsec->sid = SECINITSID_INIT;
+ /* also clear the exec_sid just in case */
+ new_tsec->exec_sid = 0;
+ return 0;
+ }
+
if (old_tsec->exec_sid) {
new_tsec->sid = old_tsec->exec_sid;
/* Reset exec SID on execve. */
struct inode_security_struct *isec;
u32 sid;
- validate_creds(cred);
-
ad.type = LSM_AUDIT_DATA_DENTRY;
ad.u.dentry = dentry;
sid = cred_sid(cred);
if (!mask)
return 0;
- validate_creds(cred);
-
if (unlikely(IS_PRIVATE(inode)))
return 0;
return error;
}
+static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ /*
+ * If we are in a 64-bit kernel running 32-bit userspace, we need to
+ * make sure we don't compare 32-bit flags to 64-bit flags.
+ */
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ case FS_IOC32_SETVERSION:
+ cmd = FS_IOC_SETVERSION;
+ break;
+ default:
+ break;
+ }
+
+ return selinux_file_ioctl(file, cmd, arg);
+}
+
static int default_noexec __ro_after_init;
static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
if (sksec->sid == SECINITSID_KERNEL)
return 0;
+ /*
+ * Before POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, sockets that
+ * inherited the kernel context from early boot used to be skipped
+ * here, so preserve that behavior unless the capability is set.
+ *
+ * By setting the capability the policy signals that it is ready
+ * for this quirk to be fixed. Note that sockets created by a kernel
+ * thread or a usermode helper executed without a transition will
+ * still be skipped in this check regardless of the policycap
+ * setting.
+ */
+ if (!selinux_policycap_userspace_initial_context() &&
+ sksec->sid == SECINITSID_INIT)
+ return 0;
+
ad_net_init_from_sk(&ad, &net, sk);
return avc_has_perm(current_sid(), sksec->sid, sksec->sclass, perms,
return -EINVAL;
addr4 = (struct sockaddr_in *)address;
if (family_sa == AF_UNSPEC) {
+ if (family == PF_INET6) {
+ /* Length check from inet6_bind_sk() */
+ if (addrlen < SIN6_LEN_RFC2133)
+ return -EINVAL;
+ /* Family check from __inet6_bind() */
+ goto err_af;
+ }
/* see __inet_bind(), we only want to allow
* AF_UNSPEC if the address is INADDR_ANY
*/
inode_doinit_with_dentry(inode, dentry);
}
-static int selinux_getprocattr(struct task_struct *p,
- const char *name, char **value)
+static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p,
+ char **value)
{
const struct task_security_struct *__tsec;
u32 sid;
goto bad;
}
- if (!strcmp(name, "current"))
+ switch (attr) {
+ case LSM_ATTR_CURRENT:
sid = __tsec->sid;
- else if (!strcmp(name, "prev"))
+ break;
+ case LSM_ATTR_PREV:
sid = __tsec->osid;
- else if (!strcmp(name, "exec"))
+ break;
+ case LSM_ATTR_EXEC:
sid = __tsec->exec_sid;
- else if (!strcmp(name, "fscreate"))
+ break;
+ case LSM_ATTR_FSCREATE:
sid = __tsec->create_sid;
- else if (!strcmp(name, "keycreate"))
+ break;
+ case LSM_ATTR_KEYCREATE:
sid = __tsec->keycreate_sid;
- else if (!strcmp(name, "sockcreate"))
+ break;
+ case LSM_ATTR_SOCKCREATE:
sid = __tsec->sockcreate_sid;
- else {
- error = -EINVAL;
+ break;
+ default:
+ error = -EOPNOTSUPP;
goto bad;
}
rcu_read_unlock();
return error;
}
-static int selinux_setprocattr(const char *name, void *value, size_t size)
+static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
{
struct task_security_struct *tsec;
struct cred *new;
/*
* Basic control over ability to set these attributes at all.
*/
- if (!strcmp(name, "exec"))
+ switch (attr) {
+ case LSM_ATTR_EXEC:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETEXEC, NULL);
- else if (!strcmp(name, "fscreate"))
+ break;
+ case LSM_ATTR_FSCREATE:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETFSCREATE, NULL);
- else if (!strcmp(name, "keycreate"))
+ break;
+ case LSM_ATTR_KEYCREATE:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETKEYCREATE, NULL);
- else if (!strcmp(name, "sockcreate"))
+ break;
+ case LSM_ATTR_SOCKCREATE:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETSOCKCREATE, NULL);
- else if (!strcmp(name, "current"))
+ break;
+ case LSM_ATTR_CURRENT:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETCURRENT, NULL);
- else
- error = -EINVAL;
+ break;
+ default:
+ error = -EOPNOTSUPP;
+ break;
+ }
if (error)
return error;
}
error = security_context_to_sid(value, size,
&sid, GFP_KERNEL);
- if (error == -EINVAL && !strcmp(name, "fscreate")) {
+ if (error == -EINVAL && attr == LSM_ATTR_FSCREATE) {
if (!has_cap_mac_admin(true)) {
struct audit_buffer *ab;
size_t audit_size;
- /* We strip a nul only if it is at the end, otherwise the
- * context contains a nul and we should audit that */
+ /* We strip a nul only if it is at the end,
+ * otherwise the context contains a nul and
+ * we should audit that */
if (str[size - 1] == '\0')
audit_size = size - 1;
else
if (!ab)
return error;
audit_log_format(ab, "op=fscreate invalid_context=");
- audit_log_n_untrustedstring(ab, value, audit_size);
+ audit_log_n_untrustedstring(ab, value,
+ audit_size);
audit_log_end(ab);
return error;
checks and may_create for the file creation checks. The
operation will then fail if the context is not permitted. */
tsec = selinux_cred(new);
- if (!strcmp(name, "exec")) {
+ if (attr == LSM_ATTR_EXEC) {
tsec->exec_sid = sid;
- } else if (!strcmp(name, "fscreate")) {
+ } else if (attr == LSM_ATTR_FSCREATE) {
tsec->create_sid = sid;
- } else if (!strcmp(name, "keycreate")) {
+ } else if (attr == LSM_ATTR_KEYCREATE) {
if (sid) {
error = avc_has_perm(mysid, sid,
SECCLASS_KEY, KEY__CREATE, NULL);
goto abort_change;
}
tsec->keycreate_sid = sid;
- } else if (!strcmp(name, "sockcreate")) {
+ } else if (attr == LSM_ATTR_SOCKCREATE) {
tsec->sockcreate_sid = sid;
- } else if (!strcmp(name, "current")) {
+ } else if (attr == LSM_ATTR_CURRENT) {
error = -EINVAL;
if (sid == 0)
goto abort_change;
- /* Only allow single threaded processes to change context */
if (!current_is_single_threaded()) {
error = security_bounded_transition(tsec->sid, sid);
if (error)
return error;
}
+/**
+ * selinux_getselfattr - Get SELinux current task attributes
+ * @attr: the requested attribute
+ * @ctx: buffer to receive the result
+ * @size: buffer size (input), buffer size used (output)
+ * @flags: unused
+ *
+ * Fill the passed user space @ctx with the details of the requested
+ * attribute.
+ *
+ * Returns the number of attributes on success, an error code otherwise.
+ * There will only ever be one attribute.
+ */
+static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+ size_t *size, u32 flags)
+{
+ int rc;
+ char *val;
+ int val_len;
+
+ val_len = selinux_lsm_getattr(attr, current, &val);
+ if (val_len < 0)
+ return val_len;
+ rc = lsm_fill_user_ctx(ctx, size, val, val_len, LSM_ID_SELINUX, 0);
+ kfree(val);
+ return (!rc ? 1 : rc);
+}
+
+static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
+ size_t size, u32 flags)
+{
+ int rc;
+
+ rc = selinux_lsm_setattr(attr, ctx->ctx, ctx->ctx_len);
+ if (rc > 0)
+ return 0;
+ return rc;
+}
+
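/*
 * Editorial sketch, not part of the patch: a minimal userspace consumer of
 * the getselfattr hook above. It assumes the companion lsm_get_self_attr()
 * syscall takes (attr, ctx, &size, flags) mirroring the hook, that the
 * size argument is a 32-bit value, and that struct lsm_ctx from
 * <linux/lsm.h> is id/flags/len/ctx_len followed by the attribute bytes;
 * treat all of that as assumptions, not ABI documentation.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/types.h>
#include <linux/lsm.h>

int main(void)
{
	__u32 size = 4096;
	struct lsm_ctx *ctx = calloc(1, size);
	long n;

	if (!ctx)
		return 1;
	/* returns the number of lsm_ctx entries written (one per LSM) */
	n = syscall(__NR_lsm_get_self_attr, LSM_ATTR_CURRENT, ctx, &size, 0);
	if (n < 0) {
		perror("lsm_get_self_attr");
		free(ctx);
		return 1;
	}
	/* print the first entry, e.g. the SELinux context of this task */
	printf("lsm id %llu: %.*s\n", (unsigned long long)ctx->id,
	       (int)ctx->ctx_len, (const char *)ctx->ctx);
	free(ctx);
	return 0;
}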
+static int selinux_getprocattr(struct task_struct *p,
+ const char *name, char **value)
+{
+ unsigned int attr = lsm_name_to_attr(name);
+ int rc;
+
+ if (attr) {
+ rc = selinux_lsm_getattr(attr, p, value);
+ if (rc != -EOPNOTSUPP)
+ return rc;
+ }
+
+ return -EINVAL;
+}
+
+static int selinux_setprocattr(const char *name, void *value, size_t size)
+{
+ int attr = lsm_name_to_attr(name);
+
+ if (attr)
+ return selinux_lsm_setattr(attr, value, size);
+ return -EINVAL;
+}
+
static int selinux_ismaclabel(const char *name)
{
return (strcmp(name, XATTR_SELINUX_SUFFIX) == 0);
}
#endif /* CONFIG_IO_URING */
+static const struct lsm_id selinux_lsmid = {
+ .name = "selinux",
+ .id = LSM_ID_SELINUX,
+};
+
/*
* IMPORTANT NOTE: When adding new hooks, please be careful to keep this order:
* 1. any hooks that don't belong to (2.) or (3.) below,
LSM_HOOK_INIT(file_permission, selinux_file_permission),
LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate),
+ LSM_HOOK_INIT(getselfattr, selinux_getselfattr),
+ LSM_HOOK_INIT(setselfattr, selinux_setselfattr),
LSM_HOOK_INIT(getprocattr, selinux_getprocattr),
LSM_HOOK_INIT(setprocattr, selinux_setprocattr),
hashtab_cache_init();
- security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux");
+ security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks),
+ &selinux_lsmid);
if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
panic("SELinux: Unable to register AVC netcache callback\n");
#include <linux/personality.h>
#include <linux/msg.h>
#include <linux/shm.h>
+ #include <uapi/linux/shm.h>
#include <linux/binfmts.h>
#include <linux/parser.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/watch_queue.h>
#include <linux/io_uring.h>
+#include <uapi/linux/lsm.h>
#include "smack.h"
#define TRANS_TRUE "TRUE"
return;
}
+/**
+ * smack_getselfattr - Smack current process attribute
+ * @attr: which attribute to fetch
+ * @ctx: buffer to receive the result
+ * @size: available size in, actual size out
+ * @flags: unused
+ *
+ * Fill the passed user space @ctx with the details of the requested
+ * attribute.
+ *
+ * Returns the number of attributes on success, an error code otherwise.
+ * There will only ever be one attribute.
+ */
+static int smack_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+ size_t *size, u32 flags)
+{
+ int rc;
+ struct smack_known *skp;
+
+ if (attr != LSM_ATTR_CURRENT)
+ return -EOPNOTSUPP;
+
+ skp = smk_of_current();
+ rc = lsm_fill_user_ctx(ctx, size,
+ skp->smk_known, strlen(skp->smk_known) + 1,
+ LSM_ID_SMACK, 0);
+ return (!rc ? 1 : rc);
+}
+
/**
* smack_getprocattr - Smack process attribute access
* @p: the object task
}
/**
- * smack_setprocattr - Smack process attribute setting
- * @name: the name of the attribute in /proc/.../attr
+ * do_setattr - Smack process attribute setting
+ * @attr: the ID of the attribute
* @value: the value to set
* @size: the size of the value
*
*
* Returns the length of the smack label or an error code
*/
-static int smack_setprocattr(const char *name, void *value, size_t size)
+static int do_setattr(u64 attr, void *value, size_t size)
{
struct task_smack *tsp = smack_cred(current_cred());
struct cred *new;
if (value == NULL || size == 0 || size >= SMK_LONGLABEL)
return -EINVAL;
- if (strcmp(name, "current") != 0)
- return -EINVAL;
+ if (attr != LSM_ATTR_CURRENT)
+ return -EOPNOTSUPP;
skp = smk_import_entry(value, size);
if (IS_ERR(skp))
return size;
}
+/**
+ * smack_setselfattr - Set a Smack process attribute
+ * @attr: which attribute to set
+ * @ctx: buffer containing the data
+ * @size: size of @ctx
+ * @flags: unused
+ *
+ * Set the Smack attribute of the current task from the data supplied
+ * in @ctx.
+ *
+ * Returns 0 on success, an error code otherwise.
+ */
+static int smack_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
+ size_t size, u32 flags)
+{
+ int rc;
+
+ rc = do_setattr(attr, ctx->ctx, ctx->ctx_len);
+ if (rc > 0)
+ return 0;
+ return rc;
+}
+
+/**
+ * smack_setprocattr - Smack process attribute setting
+ * @name: the name of the attribute in /proc/.../attr
+ * @value: the value to set
+ * @size: the size of the value
+ *
+ * Sets the Smack value of the task. Only setting self
+ * is permitted and only with privilege
+ *
+ * Returns the length of the smack label or an error code
+ */
+static int smack_setprocattr(const char *name, void *value, size_t size)
+{
+ int attr = lsm_name_to_attr(name);
+
+ if (attr != LSM_ATTR_UNDEF)
+ return do_setattr(attr, value, size);
+ return -EINVAL;
+}
+
/**
* smack_unix_stream_connect - Smack access on UDS
* @sock: one sock
.lbs_xattr_count = SMACK_INODE_INIT_XATTRS,
};
+static const struct lsm_id smack_lsmid = {
+ .name = "smack",
+ .id = LSM_ID_SMACK,
+};
+
static struct security_hook_list smack_hooks[] __ro_after_init = {
LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, smack_file_ioctl),
LSM_HOOK_INIT(file_lock, smack_file_lock),
LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
LSM_HOOK_INIT(mmap_file, smack_mmap_file),
LSM_HOOK_INIT(d_instantiate, smack_d_instantiate),
+ LSM_HOOK_INIT(getselfattr, smack_getselfattr),
+ LSM_HOOK_INIT(setselfattr, smack_setselfattr),
LSM_HOOK_INIT(getprocattr, smack_getprocattr),
LSM_HOOK_INIT(setprocattr, smack_setprocattr),
/*
* Register with LSM
*/
- security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
+ security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), &smack_lsmid);
smack_enabled = 1;
pr_info("Smack: Initializing.\n");