--- /dev/null
++ +#ifndef _ASM_X86_UACCESS_64_H
++ +#define _ASM_X86_UACCESS_64_H
++ +
++ +/*
++ + * User space memory access functions
++ + */
++ +#include <linux/compiler.h>
++ +#include <linux/errno.h>
++ +#include <linux/prefetch.h>
++ +#include <linux/lockdep.h>
++ +#include <asm/page.h>
++ +
++ +/*
++ + * Copy To/From Userspace
++ + */
++ +
++ +/* Handles exceptions in both to and from, but doesn't do access_ok */
++ +__must_check unsigned long
++ +copy_user_generic(void *to, const void *from, unsigned len);
++ +
++ +__must_check unsigned long
++ +copy_to_user(void __user *to, const void *from, unsigned len);
++ +__must_check unsigned long
++ +copy_from_user(void *to, const void __user *from, unsigned len);
++ +__must_check unsigned long
++ +copy_in_user(void __user *to, const void __user *from, unsigned len);
++ +
++ +static __always_inline __must_check
++ +int __copy_from_user(void *dst, const void __user *src, unsigned size)
++ +{
++ + int ret = 0;
+++ ++++
+++ ++++ might_fault();
++ + if (!__builtin_constant_p(size))
++ + return copy_user_generic(dst, (__force void *)src, size);
++ + switch (size) {
++ + case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
++ + ret, "b", "b", "=q", 1);
++ + return ret;
++ + case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
++ + ret, "w", "w", "=r", 2);
++ + return ret;
++ + case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
++ + ret, "l", "k", "=r", 4);
++ + return ret;
++ + case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
++ + ret, "q", "", "=r", 8);
++ + return ret;
++ + case 10:
++ + __get_user_asm(*(u64 *)dst, (u64 __user *)src,
+++++ + ret, "q", "", "=r", 10);
++ + if (unlikely(ret))
++ + return ret;
++ + __get_user_asm(*(u16 *)(8 + (char *)dst),
++ + (u16 __user *)(8 + (char __user *)src),
++ + ret, "w", "w", "=r", 2);
++ + return ret;
++ + case 16:
++ + __get_user_asm(*(u64 *)dst, (u64 __user *)src,
++ + ret, "q", "", "=r", 16);
++ + if (unlikely(ret))
++ + return ret;
++ + __get_user_asm(*(u64 *)(8 + (char *)dst),
++ + (u64 __user *)(8 + (char __user *)src),
++ + ret, "q", "", "=r", 8);
++ + return ret;
++ + default:
++ + return copy_user_generic(dst, (__force void *)src, size);
++ + }
++ +}
++ +
++ +static __always_inline __must_check
++ +int __copy_to_user(void __user *dst, const void *src, unsigned size)
++ +{
++ + int ret = 0;
+++ ++++
+++ ++++ might_fault();
++ + if (!__builtin_constant_p(size))
++ + return copy_user_generic((__force void *)dst, src, size);
++ + switch (size) {
++ + case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
++ + ret, "b", "b", "iq", 1);
++ + return ret;
++ + case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
++ + ret, "w", "w", "ir", 2);
++ + return ret;
++ + case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
++ + ret, "l", "k", "ir", 4);
++ + return ret;
++ + case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
++ + ret, "q", "", "ir", 8);
++ + return ret;
++ + case 10:
++ + __put_user_asm(*(u64 *)src, (u64 __user *)dst,
++ + ret, "q", "", "ir", 10);
++ + if (unlikely(ret))
++ + return ret;
++ + asm("":::"memory");
++ + __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
++ + ret, "w", "w", "ir", 2);
++ + return ret;
++ + case 16:
++ + __put_user_asm(*(u64 *)src, (u64 __user *)dst,
++ + ret, "q", "", "ir", 16);
++ + if (unlikely(ret))
++ + return ret;
++ + asm("":::"memory");
++ + __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
++ + ret, "q", "", "ir", 8);
++ + return ret;
++ + default:
++ + return copy_user_generic((__force void *)dst, src, size);
++ + }
++ +}
++ +
++ +static __always_inline __must_check
++ +int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
++ +{
++ + int ret = 0;
+++ ++++
+++ ++++ might_fault();
++ + if (!__builtin_constant_p(size))
++ + return copy_user_generic((__force void *)dst,
++ + (__force void *)src, size);
++ + switch (size) {
++ + case 1: {
++ + u8 tmp;
++ + __get_user_asm(tmp, (u8 __user *)src,
++ + ret, "b", "b", "=q", 1);
++ + if (likely(!ret))
++ + __put_user_asm(tmp, (u8 __user *)dst,
++ + ret, "b", "b", "iq", 1);
++ + return ret;
++ + }
++ + case 2: {
++ + u16 tmp;
++ + __get_user_asm(tmp, (u16 __user *)src,
++ + ret, "w", "w", "=r", 2);
++ + if (likely(!ret))
++ + __put_user_asm(tmp, (u16 __user *)dst,
++ + ret, "w", "w", "ir", 2);
++ + return ret;
++ + }
++ +
++ + case 4: {
++ + u32 tmp;
++ + __get_user_asm(tmp, (u32 __user *)src,
++ + ret, "l", "k", "=r", 4);
++ + if (likely(!ret))
++ + __put_user_asm(tmp, (u32 __user *)dst,
++ + ret, "l", "k", "ir", 4);
++ + return ret;
++ + }
++ + case 8: {
++ + u64 tmp;
++ + __get_user_asm(tmp, (u64 __user *)src,
++ + ret, "q", "", "=r", 8);
++ + if (likely(!ret))
++ + __put_user_asm(tmp, (u64 __user *)dst,
++ + ret, "q", "", "ir", 8);
++ + return ret;
++ + }
++ + default:
++ + return copy_user_generic((__force void *)dst,
++ + (__force void *)src, size);
++ + }
++ +}
++ +
++ +__must_check long
++ +strncpy_from_user(char *dst, const char __user *src, long count);
++ +__must_check long
++ +__strncpy_from_user(char *dst, const char __user *src, long count);
++ +__must_check long strnlen_user(const char __user *str, long n);
++ +__must_check long __strnlen_user(const char __user *str, long n);
++ +__must_check long strlen_user(const char __user *str);
++ +__must_check unsigned long clear_user(void __user *mem, unsigned long len);
++ +__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
++ +
++ +__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
++ + unsigned size);
++ +
++ +static __must_check __always_inline int
++ +__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
++ +{
++ + return copy_user_generic((__force void *)dst, src, size);
++ +}
++ +
++ +extern long __copy_user_nocache(void *dst, const void __user *src,
++ + unsigned size, int zerorest);
++ +
++ +static inline int __copy_from_user_nocache(void *dst, const void __user *src,
++ + unsigned size)
++ +{
++ + might_sleep();
++ + return __copy_user_nocache(dst, src, size, 1);
++ +}
++ +
++ +static inline int __copy_from_user_inatomic_nocache(void *dst,
++ + const void __user *src,
++ + unsigned size)
++ +{
++ + return __copy_user_nocache(dst, src, size, 0);
++ +}
++ +
++ +unsigned long
++ +copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
++ +
++ +#endif /* _ASM_X86_UACCESS_64_H */
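/*
 * Hedged illustration, not part of the diff above: a minimal sketch of how a
 * hypothetical driver would call the copy_to_user()/copy_from_user() helpers
 * declared in this header, treating a nonzero return as "bytes not copied".
 * All names here (demo_args, demo_copy_roundtrip) are made up.
 */
#include <linux/uaccess.h>
#include <linux/errno.h>

struct demo_args {				/* hypothetical ioctl payload */
	int value;
};

static long demo_copy_roundtrip(void __user *uarg)
{
	struct demo_args args;

	if (copy_from_user(&args, uarg, sizeof(args)))
		return -EFAULT;			/* some bytes were left uncopied */

	args.value++;

	if (copy_to_user(uarg, &args, sizeof(args)))
		return -EFAULT;

	return 0;
}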
#include <linux/log2.h>
#include <linux/typecheck.h>
#include <linux/ratelimit.h>
++ #include <linux/dynamic_printk.h>
#include <asm/byteorder.h>
#include <asm/bug.h>
# define might_resched() do { } while (0)
#endif
++ +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
++ + void __might_sleep(char *file, int line);
/**
* might_sleep - annotation for functions that can sleep
*
* be bitten later when the calling function happens to sleep when it is not
* supposed to.
*/
-- -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-- - void __might_sleep(char *file, int line);
# define might_sleep() \
do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
#else
(__x < 0) ? -__x : __x; \
})
+++ ++++#ifdef CONFIG_PROVE_LOCKING
+++ ++++void might_fault(void);
+++ ++++#else
+++ ++++static inline void might_fault(void)
+++ ++++{
+++ ++++ might_sleep();
+++ ++++}
+++ ++++#endif
+++ ++++
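/*
 * Hedged illustration, not part of the diff: a hypothetical helper annotated
 * with the new might_fault(), mirroring how the uaccess routines above now
 * call it before touching user memory. With CONFIG_PROVE_LOCKING this adds a
 * locking check; otherwise it falls back to might_sleep().
 */
#include <linux/types.h>
#include <linux/uaccess.h>

static int demo_peek_user_byte(const u8 __user *uptr, u8 *out)
{
	might_fault();				/* may sleep while faulting in the page */
	return get_user(*out, uptr);		/* 0 on success, -EFAULT on fault */
}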
extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(long time);
NORET_TYPE void panic(const char * fmt, ...)
extern int get_option(char **str, int *pint);
extern char *get_options(const char *str, int nints, int *ints);
-- extern unsigned long long memparse(char *ptr, char **retptr);
++ extern unsigned long long memparse(const char *ptr, char **retptr);
extern int core_kernel_text(unsigned long addr);
extern int __kernel_text_address(unsigned long addr);
extern int kernel_text_address(unsigned long addr);
+ ++++++extern int func_ptr_is_kernel_text(void *ptr);
+ ++++++
struct pid;
extern struct pid *session_of_pgrp(struct pid *pgrp);
++ +/*
++ + * FW_BUG
++ + * Add this to a message where you are sure the firmware is buggy or behaves
++ + * really stupidly or out of spec. Be aware that the responsible BIOS developer
++ + * should be able to fix this issue, or at least get a concrete idea of the
++ + * problem, by reading your message without needing to look at the kernel
++ + * code.
++ + *
++ + * Use it for definite and high priority BIOS bugs.
++ + *
++ + * FW_WARN
++ + * Use it for less clear-cut cases (e.g. the kernel may already have messed
++ + * things up) and medium priority BIOS bugs.
++ + *
++ + * FW_INFO
++ + * Use this one if you want to tell the user or vendor about something
++ + * suspicious, but generally harmless, related to the firmware.
++ + *
++ + * Use it for information or very low priority BIOS bugs.
++ + */
++ +#define FW_BUG "[Firmware Bug]: "
++ +#define FW_WARN "[Firmware Warn]: "
++ +#define FW_INFO "[Firmware Info]: "
++ +
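/*
 * Hedged illustration, not part of the diff: how a driver might use the
 * FW_* prefixes defined above. The messages and conditions are made up.
 */
static void demo_report_firmware_issues(void)
{
	printk(KERN_ERR FW_BUG "ACPI table describes an overlapping resource window\n");
	printk(KERN_WARNING FW_WARN "BIOS left the device in D3hot, re-enabling it\n");
	printk(KERN_INFO FW_INFO "DMI product name string is empty\n");
}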
#ifdef CONFIG_PRINTK
asmlinkage int vprintk(const char *fmt, va_list args)
__attribute__ ((format (printf, 1, 0)));
{ return false; }
#endif
++ extern int printk_needs_cpu(int cpu);
++ extern void printk_tick(void);
++
extern void asmlinkage __attribute__((format(printf, 1, 2)))
early_printk(const char *fmt, ...);
extern int panic_timeout;
extern int panic_on_oops;
extern int panic_on_unrecovered_nmi;
-- extern int tainted;
extern const char *print_tainted(void);
-- extern void add_taint(unsigned);
++ extern void add_taint(unsigned flag);
++ extern int test_taint(unsigned flag);
++ extern unsigned long get_taint(void);
extern int root_mountflags;
/* Values used for system_state */
SYSTEM_SUSPEND_DISK,
} system_state;
-- #define TAINT_PROPRIETARY_MODULE (1<<0)
-- #define TAINT_FORCED_MODULE (1<<1)
-- #define TAINT_UNSAFE_SMP (1<<2)
-- #define TAINT_FORCED_RMMOD (1<<3)
-- #define TAINT_MACHINE_CHECK (1<<4)
-- #define TAINT_BAD_PAGE (1<<5)
-- #define TAINT_USER (1<<6)
-- #define TAINT_DIE (1<<7)
-- #define TAINT_OVERRIDDEN_ACPI_TABLE (1<<8)
-- #define TAINT_WARN (1<<9)
++ #define TAINT_PROPRIETARY_MODULE 0
++ #define TAINT_FORCED_MODULE 1
++ #define TAINT_UNSAFE_SMP 2
++ #define TAINT_FORCED_RMMOD 3
++ #define TAINT_MACHINE_CHECK 4
++ #define TAINT_BAD_PAGE 5
++ #define TAINT_USER 6
++ #define TAINT_DIE 7
++ #define TAINT_OVERRIDDEN_ACPI_TABLE 8
++ #define TAINT_WARN 9
++ #define TAINT_CRAP 10
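/*
 * Hedged illustration, not part of the diff: taint values are now bit numbers
 * rather than (1 << n) masks, so callers pass the flag index directly. The
 * wrapper function below is hypothetical.
 */
static void demo_mark_kernel_crap(void)
{
	add_taint(TAINT_CRAP);			/* bit number, not a mask */

	if (test_taint(TAINT_CRAP))
		printk(KERN_INFO "kernel tainted by staging/crap code\n");
}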
extern void dump_stack(void) __cold;
return buf;
}
----- -#define pr_emerg(fmt, arg...) \
----- - printk(KERN_EMERG fmt, ##arg)
----- -#define pr_alert(fmt, arg...) \
----- - printk(KERN_ALERT fmt, ##arg)
----- -#define pr_crit(fmt, arg...) \
----- - printk(KERN_CRIT fmt, ##arg)
----- -#define pr_err(fmt, arg...) \
----- - printk(KERN_ERR fmt, ##arg)
----- -#define pr_warning(fmt, arg...) \
----- - printk(KERN_WARNING fmt, ##arg)
----- -#define pr_notice(fmt, arg...) \
----- - printk(KERN_NOTICE fmt, ##arg)
----- -#define pr_info(fmt, arg...) \
----- - printk(KERN_INFO fmt, ##arg)
--
-- #ifdef DEBUG
+++++ +#ifndef pr_fmt
+++++ +#define pr_fmt(fmt) fmt
+++++ +#endif
+++++ +
+++++ +#define pr_emerg(fmt, ...) \
+++++ + printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+++++ +#define pr_alert(fmt, ...) \
+++++ + printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+++++ +#define pr_crit(fmt, ...) \
+++++ + printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+++++ +#define pr_err(fmt, ...) \
+++++ + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+++++ +#define pr_warning(fmt, ...) \
+++++ + printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+++++ +#define pr_notice(fmt, ...) \
+++++ + printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+++++ +#define pr_info(fmt, ...) \
+++++ + printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
++
/* If you are writing a driver, please use dev_dbg instead */
-- #define pr_debug(fmt, arg...) \
-- printk(KERN_DEBUG fmt, ##arg)
++ #if defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
++ #define pr_debug(fmt, ...) do { \
--- - dynamic_pr_debug(fmt, ##__VA_ARGS__); \
+++++ + dynamic_pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
++ } while (0)
++ #elif defined(DEBUG)
--- -#define pr_debug(fmt, arg...) \
--- - printk(KERN_DEBUG fmt, ##arg)
+++++ +#define pr_debug(fmt, ...) \
+++++ + printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
----- -#define pr_debug(fmt, arg...) \
----- - ({ if (0) printk(KERN_DEBUG fmt, ##arg); 0; })
+++++ +#define pr_debug(fmt, ...) \
+++++ + ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; })
#endif
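/*
 * Hedged illustration, not part of the diff: a hypothetical driver defines
 * pr_fmt() before its includes so every pr_*() call picks up the prefix.
 */
#define pr_fmt(fmt) "demo_drv: " fmt

#include <linux/kernel.h>

static void demo_report(int err)
{
	if (err)
		pr_err("probe failed: %d\n", err);	/* logs "demo_drv: probe failed: ..." */
	else
		pr_info("probe succeeded\n");
}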
/*
#define NUMA_BUILD 0
#endif
++ /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
++ #ifdef CONFIG_FTRACE_MCOUNT_RECORD
++ # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
++ #endif
++
#endif
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
----- -#include <linux/compat.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
++ #include <trace/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
-- sig->utime = cputime_add(sig->utime, task_utime(tsk));
-- sig->stime = cputime_add(sig->stime, task_stime(tsk));
sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
-- sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig = NULL; /* Marker for below. */
}
if (sig) {
flush_sigqueue(&sig->shared_pending);
taskstats_tgid_free(sig);
++ + + /*
++ + + * Make sure ->signal can't go away under rq->lock,
++ + + * see account_group_exec_runtime().
++ + + */
++ + + task_rq_unlock_wait(tsk);
__cleanup_signal(sig);
}
}
static void delayed_put_task_struct(struct rcu_head *rhp)
{
-- put_task_struct(container_of(rhp, struct task_struct, rcu));
++ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
++
++ trace_sched_process_free(tsk);
++ put_task_struct(tsk);
}
* If there are other users of the mm and the owner (us) is exiting
* we need to find a new owner to take on the responsibility.
*/
- if (!mm)
- return 0;
if (atomic_read(&mm->mm_users) <= 1)
return 0;
if (mm->owner != p)
} while_each_thread(g, c);
read_unlock(&tasklist_lock);
+ /*
+ * We found no owner yet mm_users > 1: this implies that we are
+ * most likely racing with swapoff (try_to_unuse()) or /proc or
+ * ptrace or page migration (get_task_mm()). Mark owner as NULL,
+ * so that subsystems can understand the callback and take action.
+ */
+ down_write(&mm->mmap_sem);
+ cgroup_mm_owner_callbacks(mm->owner, NULL);
+ mm->owner = NULL;
+ up_write(&mm->mmap_sem);
return;
assign_new_owner:
BUG_ON(c == p);
get_task_struct(c);
++ read_unlock(&tasklist_lock);
++ down_write(&mm->mmap_sem);
/*
* The task_lock protects c->mm from changing.
* We always want mm->owner->mm == mm
*/
task_lock(c);
-- /*
-- * Delay read_unlock() till we have the task_lock()
-- * to ensure that c does not slip away underneath us
-- */
-- read_unlock(&tasklist_lock);
if (c->mm != mm) {
task_unlock(c);
++ up_write(&mm->mmap_sem);
put_task_struct(c);
goto retry;
}
cgroup_mm_owner_callbacks(mm->owner, c);
mm->owner = c;
task_unlock(c);
++ up_write(&mm->mmap_sem);
put_task_struct(c);
}
#endif /* CONFIG_MM_OWNER */
exit_itimers(tsk->signal);
}
acct_collect(code, group_dead);
----- -#ifdef CONFIG_FUTEX
----- - if (unlikely(tsk->robust_list))
----- - exit_robust_list(tsk);
----- -#ifdef CONFIG_COMPAT
----- - if (unlikely(tsk->compat_robust_list))
----- - compat_exit_robust_list(tsk);
----- -#endif
----- -#endif
if (group_dead)
tty_audit_exit();
if (unlikely(tsk->audit_context))
if (group_dead)
acct_process();
++ trace_sched_process_exit(tsk);
++
exit_sem(tsk);
exit_files(tsk);
exit_fs(tsk);
if (likely(!traced)) {
struct signal_struct *psig;
struct signal_struct *sig;
++ struct task_cputime cputime;
/*
* The resource counters for the group leader are in its
* need to protect the access to p->parent->signal fields,
* as other threads in the parent group can be right
* here reaping other children at the same time.
++ *
++ * We use thread_group_cputime() to get times for the thread
++ * group, which consolidates times for all threads in the
++ * group including the group leader.
*/
+++++ ++ thread_group_cputime(p, &cputime);
spin_lock_irq(&p->parent->sighand->siglock);
psig = p->parent->signal;
sig = p->signal;
- -- -- thread_group_cputime(p, &cputime);
psig->cutime =
cputime_add(psig->cutime,
-- cputime_add(p->utime,
-- cputime_add(sig->utime,
-- sig->cutime)));
++ cputime_add(cputime.utime,
++ sig->cutime));
psig->cstime =
cputime_add(psig->cstime,
-- cputime_add(p->stime,
-- cputime_add(sig->stime,
-- sig->cstime)));
++ cputime_add(cputime.stime,
++ sig->cstime));
psig->cgtime =
cputime_add(psig->cgtime,
cputime_add(p->gtime,
struct task_struct *tsk;
int retval;
++ trace_sched_process_wait(pid);
++
add_wait_queue(¤t->signal->wait_chldexit,&wait);
repeat:
/*
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
-- -----/*
-- ----- * Take mm->mmap_sem, when futex is shared
-- ----- */
-- -----static inline void futex_lock_mm(struct rw_semaphore *fshared)
-- -----{
-- ----- if (fshared)
-- ----- down_read(fshared);
-- -----}
-- -----
-- -----/*
-- ----- * Release mm->mmap_sem, when the futex is shared
-- ----- */
-- -----static inline void futex_unlock_mm(struct rw_semaphore *fshared)
-- -----{
-- ----- if (fshared)
-- ----- up_read(fshared);
-- -----}
-- -----
/*
* We hash on the keys returned from get_futex_key (see below).
*/
&& key1->both.offset == key2->both.offset);
}
++ +++++/*
++ +++++ * Take a reference to the resource addressed by a key.
++ +++++ * Can be called while holding spinlocks.
++ +++++ *
++ +++++ */
++ +++++static void get_futex_key_refs(union futex_key *key)
++ +++++{
++ +++++ if (!key->both.ptr)
++ +++++ return;
++ +++++
++ +++++ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
++ +++++ case FUT_OFF_INODE:
++ +++++ atomic_inc(&key->shared.inode->i_count);
++ +++++ break;
++ +++++ case FUT_OFF_MMSHARED:
++ +++++ atomic_inc(&key->private.mm->mm_count);
++ +++++ break;
++ +++++ }
++ +++++}
++ +++++
++ +++++/*
++ +++++ * Drop a reference to the resource addressed by a key.
++ +++++ * The hash bucket spinlock must not be held.
++ +++++ */
++ +++++static void drop_futex_key_refs(union futex_key *key)
++ +++++{
++ +++++ if (!key->both.ptr)
++ +++++ return;
++ +++++
++ +++++ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
++ +++++ case FUT_OFF_INODE:
++ +++++ iput(key->shared.inode);
++ +++++ break;
++ +++++ case FUT_OFF_MMSHARED:
++ +++++ mmdrop(key->private.mm);
++ +++++ break;
++ +++++ }
++ +++++}
++ +++++
/**
* get_futex_key - Get parameters which are the keys for a futex.
* @uaddr: virtual address of the futex
* For other futexes, it points to &current->mm->mmap_sem and
* caller must have taken the reader lock, but NOT any spinlocks.
*/
-- -----static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
-- ----- union futex_key *key)
++ +++++static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
-- ----- struct vm_area_struct *vma;
struct page *page;
int err;
return -EFAULT;
key->private.mm = mm;
key->private.address = address;
++ +++++ get_futex_key_refs(key);
return 0;
}
-- ----- /*
-- ----- * The futex is hashed differently depending on whether
-- ----- * it's in a shared or private mapping. So check vma first.
-- ----- */
-- ----- vma = find_extend_vma(mm, address);
-- ----- if (unlikely(!vma))
-- ----- return -EFAULT;
-- ----- /*
-- ----- * Permissions.
-- ----- */
-- ----- if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
-- ----- return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
++ +++++again:
++ +++++ err = get_user_pages_fast(address, 1, 0, &page);
++ +++++ if (err < 0)
++ +++++ return err;
++ +++++
++ +++++ lock_page(page);
++ +++++ if (!page->mapping) {
++ +++++ unlock_page(page);
++ +++++ put_page(page);
++ +++++ goto again;
++ +++++ }
/*
* Private mappings are handled in a simple way.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
-- ----- * the object not the particular process. Therefore we use
-- ----- * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
-- ----- * mappings of _writable_ handles.
++ +++++ * the object not the particular process.
*/
-- ----- if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
-- ----- key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
++ +++++ if (PageAnon(page)) {
++ +++++ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
- - return 0;
- - }
- -
- - /*
- - * Linear file mappings are also simple.
- - */
- - key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- - key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
- - if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
- - key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
- - + vma->vm_pgoff);
-- ----- return 0;
++ +++++ } else {
++ +++++ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
++ +++++ key->shared.inode = page->mapping->host;
++ +++++ key->shared.pgoff = page->index;
}
-- ----- /*
- ---- * Linear file mappings are also simple.
- - * We could walk the page table to read the non-linear
- - * pte, and get the page index without fetching the page
- - * from swap. But that's a lot of code to duplicate here
- - * for a rare case, so we simply fetch the page.
-- ----- */
- ---- key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- ---- key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
- ---- if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
- ---- key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
- ---- + vma->vm_pgoff);
- - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
- - if (err >= 0) {
- - key->shared.pgoff =
- - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- - put_page(page);
-- ----- return 0;
-- ----- }
- - return err;
- -}
++ +++++ get_futex_key_refs(key);
- ---- /*
- ---- * We could walk the page table to read the non-linear
- ---- * pte, and get the page index without fetching the page
- ---- * from swap. But that's a lot of code to duplicate here
- ---- * for a rare case, so we simply fetch the page.
- ---- */
- ---- err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
- ---- if (err >= 0) {
- ---- key->shared.pgoff =
- ---- page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- ---- put_page(page);
- ---- return 0;
- ---- }
- ---- return err;
- ---- }
- ----
-- -----/*
-- ----- * Take a reference to the resource addressed by a key.
-- ----- * Can be called while holding spinlocks.
-- ----- *
-- ----- */
-- -----static void get_futex_key_refs(union futex_key *key)
-- -----{
-- ----- if (key->both.ptr == NULL)
-- ----- return;
-- ----- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-- ----- case FUT_OFF_INODE:
-- ----- atomic_inc(&key->shared.inode->i_count);
-- ----- break;
-- ----- case FUT_OFF_MMSHARED:
-- ----- atomic_inc(&key->private.mm->mm_count);
-- ----- break;
-- ----- }
++ +++++ unlock_page(page);
++ +++++ put_page(page);
++ +++++ return 0;
}
-- -----/*
-- ----- * Drop a reference to the resource addressed by a key.
-- ----- * The hash bucket spinlock must not be held.
-- ----- */
-- -----static void drop_futex_key_refs(union futex_key *key)
++ +++++static inline
++ +++++void put_futex_key(int fshared, union futex_key *key)
{
-- ----- if (!key->both.ptr)
-- ----- return;
-- ----- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-- ----- case FUT_OFF_INODE:
-- ----- iput(key->shared.inode);
-- ----- break;
-- ----- case FUT_OFF_MMSHARED:
-- ----- mmdrop(key->private.mm);
-- ----- break;
-- ----- }
++ +++++ drop_futex_key_refs(key);
}
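/*
 * Hedged illustration, not part of the diff: the key lifecycle the new helpers
 * establish, written as if inside futex.c -- initialise with FUTEX_KEY_INIT,
 * pin the backing mm or inode with get_futex_key(), release it with
 * put_futex_key(). The surrounding function is hypothetical.
 */
static int demo_futex_touch(u32 __user *uaddr, int fshared)
{
	union futex_key key = FUTEX_KEY_INIT;
	int ret;

	ret = get_futex_key(uaddr, fshared, &key);	/* takes a reference */
	if (unlikely(ret != 0))
		return ret;

	/* ... look up the hash bucket for 'key' and do the real work ... */

	put_futex_key(fshared, &key);			/* drops the reference */
	return 0;
}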
static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
/*
* Fault handling.
-- ----- * if fshared is non NULL, current->mm->mmap_sem is already held
*/
-- -----static int futex_handle_fault(unsigned long address,
-- ----- struct rw_semaphore *fshared, int attempt)
++ +++++static int futex_handle_fault(unsigned long address, int attempt)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
if (attempt > 2)
return ret;
-- ----- if (!fshared)
-- ----- down_read(&mm->mmap_sem);
++ +++++ down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
if (vma && address >= vma->vm_start &&
(vma->vm_flags & VM_WRITE)) {
current->min_flt++;
}
}
-- ----- if (!fshared)
-- ----- up_read(&mm->mmap_sem);
++ +++++ up_read(&mm->mmap_sem);
return ret;
}
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
atomic_set(&pi_state->refcount, 1);
++ +++++ pi_state->key = FUTEX_KEY_INIT;
current->pi_state_cache = pi_state;
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
struct futex_hash_bucket *hb;
-- ----- union futex_key key;
++ +++++ union futex_key key = FUTEX_KEY_INIT;
if (!futex_cmpxchg_enabled)
return;
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-- -----static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
-- ----- int nr_wake, u32 bitset)
++ +++++static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
struct plist_head *head;
-- ----- union futex_key key;
++ +++++ union futex_key key = FUTEX_KEY_INIT;
int ret;
if (!bitset)
return -EINVAL;
-- ----- futex_lock_mm(fshared);
-- -----
ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
spin_unlock(&hb->lock);
out:
-- ----- futex_unlock_mm(fshared);
++ +++++ put_futex_key(fshared, &key);
return ret;
}
* to this virtual address:
*/
static int
-- -----futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
-- ----- u32 __user *uaddr2,
++ +++++futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
-- ----- union futex_key key1, key2;
++ +++++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
retryfull:
-- ----- futex_lock_mm(fshared);
-- -----
ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
*/
if (attempt++) {
ret = futex_handle_fault((unsigned long)uaddr2,
-- ----- fshared, attempt);
++ +++++ attempt);
if (ret)
goto out;
goto retry;
}
-- ----- /*
-- ----- * If we would have faulted, release mmap_sem,
-- ----- * fault it in and start all over again.
-- ----- */
-- ----- futex_unlock_mm(fshared);
-- -----
ret = get_user(dummy, uaddr2);
if (ret)
return ret;
if (hb1 != hb2)
spin_unlock(&hb2->lock);
out:
-- ----- futex_unlock_mm(fshared);
++ +++++ put_futex_key(fshared, &key2);
++ +++++ put_futex_key(fshared, &key1);
return ret;
}
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
-- -----static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
-- ----- u32 __user *uaddr2,
++ +++++static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
-- ----- union futex_key key1, key2;
++ +++++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
retry:
-- ----- futex_lock_mm(fshared);
-- -----
ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
if (hb1 != hb2)
spin_unlock(&hb2->lock);
-- ----- /*
-- ----- * If we would have faulted, release mmap_sem, fault
-- ----- * it in and start all over again.
-- ----- */
-- ----- futex_unlock_mm(fshared);
-- -----
ret = get_user(curval, uaddr1);
if (!ret)
drop_futex_key_refs(&key1);
out:
-- ----- futex_unlock_mm(fshared);
++ +++++ put_futex_key(fshared, &key2);
++ +++++ put_futex_key(fshared, &key1);
return ret;
}
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-- ----- struct task_struct *newowner,
-- ----- struct rw_semaphore *fshared)
++ +++++ struct task_struct *newowner, int fshared)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
handle_fault:
spin_unlock(q->lock_ptr);
-- ----- ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
++ +++++ ret = futex_handle_fault((unsigned long)uaddr, attempt++);
spin_lock(q->lock_ptr);
static long futex_wait_restart(struct restart_block *restart);
-- -----static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
++ +++++static int futex_wait(u32 __user *uaddr, int fshared,
u32 val, ktime_t *abs_time, u32 bitset)
{
struct task_struct *curr = current;
q.pi_state = NULL;
q.bitset = bitset;
retry:
-- ----- futex_lock_mm(fshared);
-- -----
++ +++++ q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
if (unlikely(ret)) {
queue_unlock(&q, hb);
-- ----- /*
-- ----- * If we would have faulted, release mmap_sem, fault it in and
-- ----- * start all over again.
-- ----- */
-- ----- futex_unlock_mm(fshared);
-- -----
ret = get_user(uval, uaddr);
if (!ret)
/* Only actually queue if *uaddr contained val. */
queue_me(&q, hb);
-- ----- /*
-- ----- * Now the futex is queued and we have checked the data, we
-- ----- * don't want to hold mmap_sem while we sleep.
-- ----- */
-- ----- futex_unlock_mm(fshared);
-- -----
/*
* There might have been scheduling since the queue_me(), as we
* cannot hold a spinlock across the get_user() in case it
if (!abs_time)
schedule();
else {
++ + unsigned long slack;
++ + slack = current->timer_slack_ns;
++ + if (rt_task(current))
++ + slack = 0;
hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
hrtimer_init_sleeper(&t, current);
-- - t.timer.expires = *abs_time;
++ + hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
-- - hrtimer_start(&t.timer, t.timer.expires,
-- - HRTIMER_MODE_ABS);
++ + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
if (!hrtimer_active(&t.timer))
t.task = NULL;
queue_unlock(&q, hb);
out_release_sem:
-- ----- futex_unlock_mm(fshared);
++ +++++ put_futex_key(fshared, &q.key);
return ret;
}
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
-- ----- struct rw_semaphore *fshared = NULL;
++ +++++ int fshared = 0;
ktime_t t;
t.tv64 = restart->futex.time;
restart->fn = do_no_restart_syscall;
if (restart->futex.flags & FLAGS_SHARED)
-- ----- fshared = &current->mm->mmap_sem;
++ +++++ fshared = 1;
return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
restart->futex.bitset);
}
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-- -----static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
++ +++++static int futex_lock_pi(u32 __user *uaddr, int fshared,
int detect, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
-- - to->timer.expires = *time;
++ + hrtimer_set_expires(&to->timer, *time);
}
q.pi_state = NULL;
retry:
-- ----- futex_lock_mm(fshared);
-- -----
++ +++++ q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
* exit to complete.
*/
queue_unlock(&q, hb);
-- ----- futex_unlock_mm(fshared);
cond_resched();
goto retry;
*/
queue_me(&q, hb);
-- ----- /*
-- ----- * Now the futex is queued and we have checked the data, we
-- ----- * don't want to hold mmap_sem while we sleep.
-- ----- */
-- ----- futex_unlock_mm(fshared);
-- -----
WARN_ON(!q.pi_state);
/*
* Block on the PI mutex:
ret = ret ? 0 : -EWOULDBLOCK;
}
-- ----- futex_lock_mm(fshared);
spin_lock(q.lock_ptr);
if (!ret) {
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
-- ----- futex_unlock_mm(fshared);
if (to)
destroy_hrtimer_on_stack(&to->timer);
queue_unlock(&q, hb);
out_release_sem:
-- ----- futex_unlock_mm(fshared);
++ +++++ put_futex_key(fshared, &q.key);
if (to)
destroy_hrtimer_on_stack(&to->timer);
return ret;
queue_unlock(&q, hb);
if (attempt++) {
-- ----- ret = futex_handle_fault((unsigned long)uaddr, fshared,
-- ----- attempt);
++ +++++ ret = futex_handle_fault((unsigned long)uaddr, attempt);
if (ret)
goto out_release_sem;
goto retry_unlocked;
}
-- ----- futex_unlock_mm(fshared);
-- -----
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
goto retry;
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-- -----static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
++ +++++static int futex_unlock_pi(u32 __user *uaddr, int fshared)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
u32 uval;
struct plist_head *head;
-- ----- union futex_key key;
++ +++++ union futex_key key = FUTEX_KEY_INIT;
int ret, attempt = 0;
retry:
*/
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
-- ----- /*
-- ----- * First take all the futex related locks:
-- ----- */
-- ----- futex_lock_mm(fshared);
ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
out_unlock:
spin_unlock(&hb->lock);
out:
-- ----- futex_unlock_mm(fshared);
++ +++++ put_futex_key(fshared, &key);
return ret;
spin_unlock(&hb->lock);
if (attempt++) {
-- ----- ret = futex_handle_fault((unsigned long)uaddr, fshared,
-- ----- attempt);
++ +++++ ret = futex_handle_fault((unsigned long)uaddr, attempt);
if (ret)
goto out;
uval = 0;
goto retry_unlocked;
}
-- ----- futex_unlock_mm(fshared);
-- -----
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
goto retry;
* PI futexes happens in exit_pi_state():
*/
if (!pi && (uval & FUTEX_WAITERS))
-- ----- futex_wake(uaddr, &curr->mm->mmap_sem, 1,
-- ----- FUTEX_BITSET_MATCH_ANY);
++ +++++ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
}
return 0;
}
{
int ret = -ENOSYS;
int cmd = op & FUTEX_CMD_MASK;
-- ----- struct rw_semaphore *fshared = NULL;
++ +++++ int fshared = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
-- ----- fshared = &current->mm->mmap_sem;
++ +++++ fshared = 1;
switch (cmd) {
case FUTEX_WAIT:
#ifdef CONFIG_LOCK_STAT
static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
--- ----static int lock_contention_point(struct lock_class *class, unsigned long ip)
+++ ++++static int lock_point(unsigned long points[], unsigned long ip)
{
int i;
--- ---- for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
--- ---- if (class->contention_point[i] == 0) {
--- ---- class->contention_point[i] = ip;
+++ ++++ for (i = 0; i < LOCKSTAT_POINTS; i++) {
+++ ++++ if (points[i] == 0) {
+++ ++++ points[i] = ip;
break;
}
--- ---- if (class->contention_point[i] == ip)
+++ ++++ if (points[i] == ip)
break;
}
for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
stats.contention_point[i] += pcs->contention_point[i];
+++ ++++ for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+++ ++++ stats.contending_point[i] += pcs->contending_point[i];
+++ ++++
lock_time_add(&pcs->read_waittime, &stats.read_waittime);
lock_time_add(&pcs->write_waittime, &stats.write_waittime);
memset(cpu_stats, 0, sizeof(struct lock_class_stats));
}
memset(class->contention_point, 0, sizeof(class->contention_point));
+++ ++++ memset(class->contending_point, 0, sizeof(class->contending_point));
}
static struct lock_class_stats *get_lock_stats(struct lock_class *class)
/*
* Hardirqs will be enabled:
*/
-- -void trace_hardirqs_on_caller(unsigned long a0)
++ +void trace_hardirqs_on_caller(unsigned long ip)
{
struct task_struct *curr = current;
-- - unsigned long ip;
-- - time_hardirqs_on(CALLER_ADDR0, a0);
++ + time_hardirqs_on(CALLER_ADDR0, ip);
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
}
/* we'll do an OFF -> ON transition: */
curr->hardirqs_enabled = 1;
-- - ip = (unsigned long) __builtin_return_address(0);
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
/*
* Hardirqs were disabled:
*/
-- -void trace_hardirqs_off_caller(unsigned long a0)
++ +void trace_hardirqs_off_caller(unsigned long ip)
{
struct task_struct *curr = current;
-- - time_hardirqs_off(CALLER_ADDR0, a0);
++ + time_hardirqs_off(CALLER_ADDR0, ip);
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
* We have done an ON -> OFF transition:
*/
curr->hardirqs_enabled = 0;
-- - curr->hardirq_disable_ip = _RET_IP_;
++ + curr->hardirq_disable_ip = ip;
curr->hardirq_disable_event = ++curr->irq_events;
debug_atomic_inc(&hardirqs_off_events);
} else
struct held_lock *hlock, *prev_hlock;
struct lock_class_stats *stats;
unsigned int depth;
--- ---- int i, point;
+++ ++++ int i, contention_point, contending_point;
depth = curr->lockdep_depth;
if (DEBUG_LOCKS_WARN_ON(!depth))
found_it:
hlock->waittime_stamp = sched_clock();
--- ---- point = lock_contention_point(hlock_class(hlock), ip);
+++ ++++ contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+++ ++++ contending_point = lock_point(hlock_class(hlock)->contending_point,
+++ ++++ lock->ip);
stats = get_lock_stats(hlock_class(hlock));
--- ---- if (point < ARRAY_SIZE(stats->contention_point))
--- ---- stats->contention_point[point]++;
+++ ++++ if (contention_point < LOCKSTAT_POINTS)
+++ ++++ stats->contention_point[contention_point]++;
+++ ++++ if (contending_point < LOCKSTAT_POINTS)
+++ ++++ stats->contending_point[contending_point]++;
if (lock->cpu != smp_processor_id())
stats->bounces[bounce_contended + !!hlock->read]++;
put_lock_stats(stats);
}
static void
--- ----__lock_acquired(struct lockdep_map *lock)
+++ ++++__lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
struct held_lock *hlock, *prev_hlock;
put_lock_stats(stats);
lock->cpu = cpu;
+++ ++++ lock->ip = ip;
}
void lock_contended(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_contended);
--- ----void lock_acquired(struct lockdep_map *lock)
+++ ++++void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
raw_local_irq_save(flags);
check_flags(flags);
current->lockdep_recursion = 1;
--- ---- __lock_acquired(lock);
+++ ++++ __lock_acquired(lock, ip);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
{
printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
------ - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
++++++ + printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
------ - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
++++++ + printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
}
printk(" ignoring it.\n");
unlock = 0;
++ + } else {
++ + if (count != 10)
++ + printk(KERN_CONT " locked it.\n");
}
-- - if (count != 10)
-- - printk(" locked it.\n");
do_each_thread(g, p) {
/*
while (nb && nr_to_call) {
next_nb = rcu_dereference(nb->next);
+ ++++++
+ ++++++#ifdef CONFIG_DEBUG_NOTIFIERS
+ ++++++ if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
+ ++++++ WARN(1, "Invalid notifier called!");
+ ++++++ nb = next_nb;
+ ++++++ continue;
+ ++++++ }
+ ++++++#endif
ret = nb->notifier_call(nb, val, v);
if (nr_calls)
static ATOMIC_NOTIFIER_HEAD(die_chain);
-- int notify_die(enum die_val val, const char *str,
++ int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
++ +#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
++ #include <trace/sched.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
-- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
++ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
++ }
++
++ static inline int rt_bandwidth_enabled(void)
++ {
++ return sysctl_sched_rt_runtime >= 0;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
-- if (rt_b->rt_runtime == RUNTIME_INF)
++ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-- - hrtimer_start(&rt_b->rt_period_timer,
-- - rt_b->rt_period_timer.expires,
-- - HRTIMER_MODE_ABS);
++ + hrtimer_start_expires(&rt_b->rt_period_timer,
++ + HRTIMER_MODE_ABS);
}
spin_unlock(&rt_b->rt_runtime_lock);
}
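/*
 * Hedged illustration, not part of the diff: the hrtimer expires API used
 * above -- rather than assigning timer->expires and passing it back to
 * hrtimer_start(), callers set the expiry once and start from it. The helper
 * name is made up.
 */
static void demo_arm_abs_timer(struct hrtimer *timer, ktime_t deadline)
{
	hrtimer_set_expires(timer, deadline);
	hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}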
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
-- #else /* !CONFIG_FAIR_GROUP_SCHED */
++ #else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
-- #endif /* CONFIG_FAIR_GROUP_SCHED */
++ #endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
u64 exec_clock;
u64 min_vruntime;
-- - u64 pair_start;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
-- - - struct sched_entity *curr, *next;
++ + + struct sched_entity *curr, *next, *last;
-- - - unsigned long nr_spread_over;
++ + + unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
++ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
-- rq->curr->sched_class->check_preempt_curr(rq, p);
++ rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}
static inline int cpu_of(struct rq *rq)
*/
unsigned int sysctl_sched_shares_ratelimit = 250000;
++ +/*
++ + * Inject some fuzziness into changing the per-cpu group shares;
++ + * this avoids remote rq-locks at the expense of fairness.
++ + * default: 4
++ + */
++ +unsigned int sysctl_sched_shares_thresh = 4;
++ +
/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
}
}
++ + +void task_rq_unlock_wait(struct task_struct *p)
++ + +{
++ + + struct rq *rq = task_rq(p);
++ + +
++ + + smp_mb(); /* spin-unlock-wait is not a full memory barrier */
++ + + spin_unlock_wait(&rq->lock);
++ + +}
++ + +
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
-- - timer->expires = time;
++ + hrtimer_set_expires(timer, time);
if (rq == this_rq()) {
hrtimer_restart(timer);
return NOTIFY_DONE;
}
- static void init_hrtick(void)
+ static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
}
-- static void init_hrtick(void)
++ static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
-- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
++ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
-- #else
++ #else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void init_hrtick(void)
{
}
-- #endif
++ #endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
update_load_sub(&rq->load, load);
}
-- #ifdef CONFIG_SMP
-- static unsigned long source_load(int cpu, int type);
-- static unsigned long target_load(int cpu, int type);
-- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
--
-- static unsigned long cpu_avg_load_per_task(int cpu)
-- {
-- struct rq *rq = cpu_rq(cpu);
--
-- if (rq->nr_running)
-- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
--
-- return rq->avg_load_per_task;
-- }
--
-- #ifdef CONFIG_FAIR_GROUP_SCHED
--
-- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
++ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
++ typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
-- static void
-- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
++ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
++ int ret;
rcu_read_lock();
parent = &root_task_group;
down:
-- (*down)(parent, cpu, sd);
++ ret = (*down)(parent, data);
++ if (ret)
++ goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
-- (*up)(parent, cpu, sd);
++ ret = (*up)(parent, data);
++ if (ret)
++ goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
++ out_unlock:
rcu_read_unlock();
++
++ return ret;
++ }
++
++ static int tg_nop(struct task_group *tg, void *data)
++ {
++ return 0;
++ }
++ #endif
++
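/*
 * Hedged illustration, not part of the diff: a visitor written against the
 * new tg_visitor signature -- it receives (tg, data) and returns nonzero to
 * abort the walk. The counting callback below is made up.
 */
static int demo_count_task_groups(struct task_group *tg, void *data)
{
	(*(int *)data)++;			/* count each group visited */
	return 0;				/* zero means keep walking */
}

/* usage sketch: int n = 0; walk_tg_tree(demo_count_task_groups, tg_nop, &n); */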
++ #ifdef CONFIG_SMP
++ static unsigned long source_load(int cpu, int type);
++ static unsigned long target_load(int cpu, int type);
++ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
++
++ static unsigned long cpu_avg_load_per_task(int cpu)
++ {
++ struct rq *rq = cpu_rq(cpu);
++
++ if (rq->nr_running)
++ rq->avg_load_per_task = rq->load.weight / rq->nr_running;
++++ + else
++++ + rq->avg_load_per_task = 0;
++
++ return rq->avg_load_per_task;
}
++ #ifdef CONFIG_FAIR_GROUP_SCHED
++
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* Calculate and set the cpu's group shares.
*/
static void
-- -__update_group_shares_cpu(struct task_group *tg, int cpu,
-- - unsigned long sd_shares, unsigned long sd_rq_weight)
++ +update_group_shares_cpu(struct task_group *tg, int cpu,
++ + unsigned long sd_shares, unsigned long sd_rq_weight)
{
int boost = 0;
unsigned long shares;
*
*/
shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
++ + shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-- - /*
-- - * record the actual number of shares, not the boosted amount.
-- - */
-- - tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-- - tg->cfs_rq[cpu]->rq_weight = rq_weight;
++ + if (abs(shares - tg->se[cpu]->load.weight) >
++ + sysctl_sched_shares_thresh) {
++ + struct rq *rq = cpu_rq(cpu);
++ + unsigned long flags;
-- - if (shares < MIN_SHARES)
-- - shares = MIN_SHARES;
-- - else if (shares > MAX_SHARES)
-- - shares = MAX_SHARES;
++ + spin_lock_irqsave(&rq->lock, flags);
++ + /*
++ + * record the actual number of shares, not the boosted amount.
++ + */
++ + tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
++ + tg->cfs_rq[cpu]->rq_weight = rq_weight;
-- - __set_se_shares(tg->se[cpu], shares);
++ + __set_se_shares(tg->se[cpu], shares);
++ + spin_unlock_irqrestore(&rq->lock, flags);
++ + }
}
/*
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
-- static void
-- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
++ static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long shares = 0;
++ struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
if (!rq_weight)
rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-- - for_each_cpu_mask(i, sd->span) {
-- - struct rq *rq = cpu_rq(i);
-- - unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, i, shares, rq_weight);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
++ + for_each_cpu_mask(i, sd->span)
++ + update_group_shares_cpu(tg, i, shares, rq_weight);
-- spin_lock_irqsave(&rq->lock, flags);
-- __update_group_shares_cpu(tg, i, shares, rq_weight);
-- spin_unlock_irqrestore(&rq->lock, flags);
-- }
++ return 0;
}
/*
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
-- static void
-- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
++ static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
++ long cpu = (long)data;
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
}
tg->cfs_rq[cpu]->h_load = load;
-- }
-- static void
-- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-- {
++ return 0;
}
static void update_shares(struct sched_domain *sd)
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
-- walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
++ walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
spin_lock(&rq->lock);
}
-- static void update_h_load(int cpu)
++ static void update_h_load(long cpu)
{
-- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
++ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
/*
* Buddy candidates are cache hot:
*/
-- - - if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
++ + + if (sched_feat(CACHE_HOT_BUDDY) &&
++ + + (&p->se == cfs_rq_of(&p->se)->next ||
++ + + &p->se == cfs_rq_of(&p->se)->last))
return 1;
if (p->sched_class != &fair_sched_class)
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
++ trace_sched_wait_task(rq, p);
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
-- if (!match_state || p->state == match_state) {
-- ncsw = p->nivcsw + p->nvcsw;
-- if (unlikely(!ncsw))
-- ncsw = 1;
-- }
++ if (!match_state || p->state == match_state)
++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, &flags);
/*
success = 1;
out_running:
-- trace_mark(kernel_sched_wakeup,
-- "pid %d state %ld ## rq %p task %p rq->curr %p",
-- p->pid, p->state, rq, p, rq->curr);
-- check_preempt_curr(rq, p);
++ trace_sched_wakeup(rq, p);
++ check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
p->sched_class->task_new(rq, p);
inc_nr_running(rq);
}
-- trace_mark(kernel_sched_wakeup_new,
-- "pid %d state %ld ## rq %p task %p rq->curr %p",
-- p->pid, p->state, rq, p, rq->curr);
-- check_preempt_curr(rq, p);
++ trace_sched_wakeup_new(rq, p);
++ check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
-- trace_mark(kernel_sched_schedule,
-- "prev_pid %d next_pid %d prev_state %ld "
-- "## rq %p prev %p next %p",
-- prev->pid, next->pid, prev->state,
-- rq, prev, next);
++ trace_sched_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
|| unlikely(!cpu_active(dest_cpu)))
goto out;
++ trace_sched_migrate_task(rq, p, dest_cpu);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
-- check_preempt_curr(this_rq, p);
++ check_preempt_curr(this_rq, p, 0);
}
/*
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
-- - if (max_load - this_load + 2*busiest_load_per_task >=
++ + if (max_load - this_load + busiest_load_per_task >=
busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
EXPORT_PER_CPU_SYMBOL(kstat);
/*
-- * Return p->sum_exec_runtime plus any more ns on the sched_clock
-- * that have not yet been banked in case the task is currently running.
++ * Return any ns on the sched_clock that have not yet been banked in
++ * @p in case that task is currently running.
*/
-- unsigned long long task_sched_runtime(struct task_struct *p)
++ unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
-- u64 ns, delta_exec;
struct rq *rq;
++ u64 ns = 0;
rq = task_rq_lock(p, &flags);
-- ns = p->se.sum_exec_runtime;
++
if (task_current(rq, p)) {
++ u64 delta_exec;
++
update_rq_clock(rq);
delta_exec = rq->clock - p->se.exec_start;
if ((s64)delta_exec > 0)
-- ns += delta_exec;
++ ns = delta_exec;
}
++
task_rq_unlock(rq, &flags);
return ns;
cputime64_t tmp;
p->utime = cputime_add(p->utime, cputime);
++ account_group_user_time(p, cputime);
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
tmp = cputime_to_cputime64(cputime);
p->utime = cputime_add(p->utime, cputime);
++ account_group_user_time(p, cputime);
p->gtime = cputime_add(p->gtime, cputime);
cpustat->user = cputime64_add(cpustat->user, tmp);
}
p->stime = cputime_add(p->stime, cputime);
++ account_group_system_time(p, cputime);
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
if (p == rq->idle) {
p->stime = cputime_add(p->stime, steal);
- -- -- account_group_system_time(p, steal);
if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
else
/*
* Underflow?
*/
--- ---- if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+++ ++++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
return;
/*
* Is the spinlock portion underflowing?
if (sched_feat(HRTICK))
hrtick_clear(rq);
-- - /*
-- - * Do the rq-clock update outside the rq lock:
-- - */
-- - local_irq_disable();
++ + spin_lock_irq(&rq->lock);
update_rq_clock(rq);
-- - spin_lock(&rq->lock);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
++ /**
++ * complete: - signals a single thread waiting on this completion
++ * @x: holds the state of this particular completion
++ *
++ * This will wake up a single thread waiting on this completion. Threads will be
++ * awakened in the same order in which they were queued.
++ *
++ * See also complete_all(), wait_for_completion() and related routines.
++ */
void complete(struct completion *x)
{
unsigned long flags;
}
EXPORT_SYMBOL(complete);
++ /**
++ * complete_all: - signals all threads waiting on this completion
++ * @x: holds the state of this particular completion
++ *
++ * This will wake up all threads waiting on this particular completion event.
++ */
void complete_all(struct completion *x)
{
unsigned long flags;
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
-- if ((state == TASK_INTERRUPTIBLE &&
-- signal_pending(current)) ||
-- (state == TASK_KILLABLE &&
-- fatal_signal_pending(current))) {
++ if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
return timeout;
}
++ /**
++ * wait_for_completion: - waits for completion of a task
++ * @x: holds the state of this particular completion
++ *
++ * This waits to be signaled for completion of a specific task. It is NOT
++ * interruptible and there is no timeout.
++ *
++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
++ * and interrupt capability. Also see complete().
++ */
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
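/*
 * Hedged illustration, not part of the diff: the producer/consumer handshake
 * the kerneldoc above describes, with a hypothetical worker thread.
 */
#include <linux/completion.h>

static DECLARE_COMPLETION(demo_done);

static int demo_worker(void *unused)
{
	/* ... perform the work ... */
	complete(&demo_done);			/* wakes exactly one waiter, in FIFO order */
	return 0;
}

static void demo_wait_for_worker(void)
{
	wait_for_completion(&demo_done);	/* uninterruptible, no timeout */
}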
++ /**
++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
++ * @x: holds the state of this particular completion
++ * @timeout: timeout value in jiffies
++ *
++ * This waits for either a completion of a specific task to be signaled or for a
++ * specified timeout to expire. The timeout is in jiffies. It is not
++ * interruptible.
++ */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
}
EXPORT_SYMBOL(wait_for_completion_timeout);
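/*
 * Hedged illustration, not part of the diff: checking the jiffies-based return
 * value described above. The completion object and timeout are hypothetical.
 */
static int demo_wait_with_timeout(struct completion *done)
{
	unsigned long left;

	left = wait_for_completion_timeout(done, msecs_to_jiffies(500));
	if (!left)
		return -ETIMEDOUT;		/* timed out before completion */
	return 0;				/* 'left' jiffies remained */
}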
++ /**
++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
++ * @x: holds the state of this particular completion
++ *
++ * This waits for completion of a specific task to be signaled. It is
++ * interruptible.
++ */
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
++ /**
++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
++ * @x: holds the state of this particular completion
++ * @timeout: timeout value in jiffies
++ *
++ * This waits for either a completion of a specific task to be signaled or for a
++ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
++ */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
++ /**
++ * wait_for_completion_killable: - waits for completion of a task (killable)
++ * @x: holds the state of this particular completion
++ *
++ * This waits to be signaled for completion of a specific task. It can be
++ * interrupted by a kill signal.
++ */
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
-- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
++ if (rt_bandwidth_enabled() && rt_policy(policy) &&
++ task_group(p)->rt_bandwidth.rt_runtime == 0)
return -EPERM;
#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
++++ + spin_lock_irqsave(&rq->lock, flags);
++++ +
__sched_fork(idle);
idle->se.exec_start = sched_clock();
idle->cpus_allowed = cpumask_of_cpu(cpu);
__set_task_cpu(idle, cpu);
---- - spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
idle->oncpu = 1;
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
-- check_preempt_curr(rq_dest, p);
++ check_preempt_curr(rq_dest, p, 0);
}
done:
ret = 1;
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
-- struct ctl_table *table = sd_alloc_ctl_entry(12);
++ struct ctl_table *table = sd_alloc_ctl_entry(13);
if (table == NULL)
return NULL;
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
-- /* &table[11] is terminator */
++ set_table_entry(&table[11], "name", sd->name,
++ CORENAME_MAX_SIZE, 0444, proc_dostring);
++ /* &table[12] is terminator */
return table;
}
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
-- - - for (tmp = sd; tmp; tmp = tmp->parent) {
++ + + for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
++ + +
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
-- - - }
++ + + } else
++ + + tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
++ #ifdef CONFIG_SCHED_DEBUG
++ # define SD_INIT_NAME(sd, type) sd->name = #type
++ #else
++ # define SD_INIT_NAME(sd, type) do { } while (0)
++ #endif
++
#define SD_INIT(sd, type) sd_init_##type(sd)
++
#define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \
{ \
memset(sd, 0, sizeof(*sd)); \
*sd = SD_##type##_INIT; \
sd->level = SD_LV_##type; \
++ SD_INIT_NAME(sd, type); \
}
SD_INIT_FUNC(CPU)
error:
free_sched_groups(cpu_map, tmpmask);
SCHED_CPUMASK_FREE((void *)allmasks);
++ + + kfree(rd);
return -ENOMEM;
#endif
}
*
* The passed in 'doms_new' should be kmalloc'd. This routine takes
* ownership of it and will kfree it when done with it. If the caller
----- - * failed the kmalloc call, then it can pass in doms_new == NULL,
----- - * and partition_sched_domains() will fallback to the single partition
----- - * 'fallback_doms', it also forces the domains to be rebuilt.
+++++ + * failed the kmalloc call, then it can pass in doms_new == NULL &&
+++++ + * ndoms_new == 1, and partition_sched_domains() will fallback to
+++++ + * the single partition 'fallback_doms', it also forces the domains
+++++ + * to be rebuilt.
*
----- - * If doms_new==NULL it will be replaced with cpu_online_map.
----- - * ndoms_new==0 is a special case for destroying existing domains.
----- - * It will not create the default domain.
+++++ + * If doms_new == NULL it will be replaced with cpu_online_map.
+++++ + * ndoms_new == 0 is a special case for destroying existing domains,
+++++ + * and it will not create the default domain.
*
* Call with hotplug lock held
*/
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
-- if ((in_atomic() || irqs_disabled()) &&
-- system_state == SYSTEM_RUNNING && !oops_in_progress) {
-- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-- return;
-- prev_jiffy = jiffies;
-- printk(KERN_ERR "BUG: sleeping function called from invalid"
-- " context at %s:%d\n", file, line);
-- printk("in_atomic():%d, irqs_disabled():%d\n",
-- in_atomic(), irqs_disabled());
-- debug_show_held_locks(current);
-- if (irqs_disabled())
-- print_irqtrace_events(current);
-- dump_stack();
-- }
++ if ((!in_atomic() && !irqs_disabled()) ||
++ system_state != SYSTEM_RUNNING || oops_in_progress)
++ return;
++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
++ return;
++ prev_jiffy = jiffies;
++
++ printk(KERN_ERR
++ "BUG: sleeping function called from invalid context at %s:%d\n",
++ file, line);
++ printk(KERN_ERR
++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
++ in_atomic(), irqs_disabled(),
++ current->pid, current->comm);
++
++ debug_show_held_locks(current);
++ if (irqs_disabled())
++ print_irqtrace_events(current);
++ dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
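For context, the check rewritten above is normally reached through the might_sleep() annotation. A minimal sketch of the kind of call site it is meant to catch, assuming invented names and a kernel configuration where spin_lock() raises the preempt count:

static DEFINE_SPINLOCK(demo_lock);		/* hypothetical lock */

static void demo_blocking_helper(void)
{
	might_sleep();		/* expands to __might_sleep(__FILE__, __LINE__) when enabled */
	/* ... e.g. kmalloc(..., GFP_KERNEL) or mutex_lock() ... */
}

static void demo_broken_caller(void)
{
	spin_lock(&demo_lock);		/* in_atomic() becomes true */
	demo_blocking_helper();		/* would produce the BUG report printed above */
	spin_unlock(&demo_lock);
}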
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
-- return 1ULL << 16;
++ return 1ULL << 20;
-- return div64_u64(runtime << 16, period);
++ return div64_u64(runtime << 20, period);
}
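With the shift widened from 16 to 20 bits, to_ratio() expresses runtime/period as a fixed-point fraction scaled by 2^20, giving the bandwidth comparisons below more resolution. A worked example, assuming the usual defaults of a 1 s RT period and 0.95 s RT runtime (the unit cancels, so microseconds are used here):

	to_ratio(1000000, 950000);	/* (950000 << 20) / 1000000 = 996147, i.e. ~0.95 * 2^20 */
	to_ratio(1000000, RUNTIME_INF);	/* unlimited runtime short-circuits to 1 << 20 = 1048576 */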
-- #ifdef CONFIG_CGROUP_SCHED
-- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
++ /* Must be called with tasklist_lock held */
++ static inline int tg_has_rt_tasks(struct task_group *tg)
{
-- struct task_group *tgi, *parent = tg->parent;
-- unsigned long total = 0;
++ struct task_struct *g, *p;
-- if (!parent) {
-- if (global_rt_period() < period)
-- return 0;
++ do_each_thread(g, p) {
++ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
++ return 1;
++ } while_each_thread(g, p);
-- return to_ratio(period, runtime) <
-- to_ratio(global_rt_period(), global_rt_runtime());
-- }
++ return 0;
++ }
-- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-- return 0;
++ struct rt_schedulable_data {
++ struct task_group *tg;
++ u64 rt_period;
++ u64 rt_runtime;
++ };
-- rcu_read_lock();
-- list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-- if (tgi == tg)
-- continue;
++ static int tg_schedulable(struct task_group *tg, void *data)
++ {
++ struct rt_schedulable_data *d = data;
++ struct task_group *child;
++ unsigned long total, sum = 0;
++ u64 period, runtime;
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
++ runtime = tg->rt_bandwidth.rt_runtime;
++
++ if (tg == d->tg) {
++ period = d->rt_period;
++ runtime = d->rt_runtime;
}
-- rcu_read_unlock();
-- return total + to_ratio(period, runtime) <=
-- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-- parent->rt_bandwidth.rt_runtime);
-- }
-- #elif defined CONFIG_USER_SCHED
-- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-- {
-- struct task_group *tgi;
-- unsigned long total = 0;
-- unsigned long global_ratio =
-- to_ratio(global_rt_period(), global_rt_runtime());
++ /*
++ * Cannot have more runtime than the period.
++ */
++ if (runtime > period && runtime != RUNTIME_INF)
++ return -EINVAL;
-- rcu_read_lock();
-- list_for_each_entry_rcu(tgi, &task_groups, list) {
-- if (tgi == tg)
-- continue;
++ /*
++ * Ensure we don't starve existing RT tasks.
++ */
++ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
++ return -EBUSY;
++
++ total = to_ratio(period, runtime);
-- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-- tgi->rt_bandwidth.rt_runtime);
++ /*
++ * Nobody can have more than the global setting allows.
++ */
++ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
++ return -EINVAL;
++
++ /*
++ * The sum of our children's runtime should not exceed our own.
++ */
++ list_for_each_entry_rcu(child, &tg->children, siblings) {
++ period = ktime_to_ns(child->rt_bandwidth.rt_period);
++ runtime = child->rt_bandwidth.rt_runtime;
++
++ if (child == d->tg) {
++ period = d->rt_period;
++ runtime = d->rt_runtime;
++ }
++
++ sum += to_ratio(period, runtime);
}
-- rcu_read_unlock();
-- return total + to_ratio(period, runtime) < global_ratio;
++ if (sum > total)
++ return -EINVAL;
++
++ return 0;
}
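To make the constraints above concrete with invented numbers: a group granted 0.5 s of RT runtime per 1 s period has a ratio of 0.5 * 2^20 = 524288. If its two children request 0.3 s and 0.25 s per second, their ratios sum to about 0.55 * 2^20 (roughly 576716), which exceeds the parent's 524288, so tg_schedulable() returns -EINVAL for whichever change pushed the sum over; the comparison against to_ratio(global_rt_period(), global_rt_runtime()) caps every group at the system-wide setting in the same way.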
-- #endif
-- /* Must be called with tasklist_lock held */
-- static inline int tg_has_rt_tasks(struct task_group *tg)
++ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
-- struct task_struct *g, *p;
-- do_each_thread(g, p) {
-- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-- return 1;
-- } while_each_thread(g, p);
-- return 0;
++ struct rt_schedulable_data data = {
++ .tg = tg,
++ .rt_period = period,
++ .rt_runtime = runtime,
++ };
++
++ return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
static int tg_set_bandwidth(struct task_group *tg,
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
-- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-- err = -EBUSY;
-- goto unlock;
-- }
-- if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-- err = -EINVAL;
++ err = __rt_schedulable(tg, rt_period, rt_runtime);
++ if (err)
goto unlock;
-- }
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
static int sched_rt_global_constraints(void)
{
-- struct task_group *tg = &root_task_group;
-- u64 rt_runtime, rt_period;
++ u64 runtime, period;
int ret = 0;
- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- rt_runtime = tg->rt_bandwidth.rt_runtime;
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
++ runtime = global_rt_runtime();
++ period = global_rt_period();
++
++ /*
++ * Sanity check on the sysctl variables.
++ */
++ if (runtime > period && runtime != RUNTIME_INF)
++ return -EINVAL;
mutex_lock(&rt_constraints_mutex);
-- if (!__rt_schedulable(tg, rt_period, rt_runtime))
-- ret = -EINVAL;
++ read_lock(&tasklist_lock);
++ ret = __rt_schedulable(NULL, 0, 0);
++ read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
unsigned long flags;
int i;
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
-- init_task_group.css.cgroup = cgrp;
return &init_task_group.css;
}
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
-- /* Bind the cgroup to task_group object we just created */
-- tg->css.cgroup = cgrp;
--
return &tg->css;
}
/*
* Zero means infinite timeout - no checking done:
*/
- ------unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+ ++++++unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
unsigned long __read_mostly sysctl_hung_task_warnings = 10;
* If the system crashed already then all bets are off,
* do not report extra hung tasks:
*/
-- if ((tainted & TAINT_DIE) || did_panic)
++ if (test_taint(TAINT_DIE) || did_panic)
return;
read_lock(&tasklist_lock);
If unsure, say N.
++ config DEBUG_VIRTUAL
++ bool "Debug VM translations"
++ depends on DEBUG_KERNEL && X86
++ help
++ Enable some costly sanity checks in the virtual-to-page translation
++ code. This can catch mistakes with virt_to_page() and friends.
++
++ If unsure, say N.
++
config DEBUG_WRITECOUNT
bool "Debug filesystem writers count"
depends on DEBUG_KERNEL
If unsure, say N.
+ ++++++config DEBUG_NOTIFIERS
+ ++++++ bool "Debug notifier call chains"
+ ++++++ depends on DEBUG_KERNEL
+ ++++++ help
+ ++++++ Enable this to turn on sanity checking for notifier call chains.
+ ++++++ This is most useful for kernel developers to make sure that
+ ++++++ modules properly unregister themselves from notifier chains.
+ ++++++ This is a relatively cheap check but if you care about maximum
+ ++++++ performance, say N.
+ ++++++
config FRAME_POINTER
bool "Compile the kernel with frame pointers"
depends on DEBUG_KERNEL && \
Say N here if you want the RCU torture tests to start only
after being manually enabled via /proc.
++ config RCU_CPU_STALL_DETECTOR
++ bool "Check for stalled CPUs delaying RCU grace periods"
++ depends on CLASSIC_RCU
++ default n
++ help
++ This option causes RCU to printk information on which
++ CPUs are delaying the current grace period, but only when
++ the grace period extends for excessive time periods.
++
++ Say Y if you want RCU to perform such checks.
++
++ Say N if you are unsure.
++
config KPROBES_SANITY_TEST
bool "Kprobes sanity tests"
depends on DEBUG_KERNEL
Say N if you are unsure.
++ config DEBUG_BLOCK_EXT_DEVT
++ bool "Force extended block device numbers and spread them"
++ depends on DEBUG_KERNEL
++ depends on BLOCK
++ default n
++ help
++ BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON
++ SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT
++ YOU ARE DOING. Distros, please enable this and fix whatever
++ is broken.
++
++ Conventionally, block device numbers are allocated from a
++ predetermined contiguous area. However, the extended block area
++ may introduce non-contiguous block device numbers. This
++ option forces most block device numbers to be allocated from
++ the extended space and spreads them out to discover kernel or
++ userland code paths which assume predetermined contiguous
++ device number allocation.
++
++ Note that turning on this debug option shuffles all the
++ device numbers for all IDE and SCSI devices including libata
++ ones, so root partition specified using device number
++ directly (via rdev or root=MAJ:MIN) won't work anymore.
++ Textual device names (root=/dev/sdXn) will continue to work.
++
++ Say N if you are unsure.
++
config LKDTM
tristate "Linux Kernel Dump Test Tool Module"
depends on DEBUG_KERNEL
config FAIL_MAKE_REQUEST
bool "Fault-injection capability for disk IO"
-- depends on FAULT_INJECTION
++ depends on FAULT_INJECTION && BLOCK
help
Provide fault-injection capability for disk IO.
++ config FAIL_IO_TIMEOUT
++ bool "Fault-injection capability for faking disk interrupts"
++ depends on FAULT_INJECTION && BLOCK
++ help
++ Provide fault-injection capability on end IO handling. This
++ will make the block layer "forget" an interrupt as configured,
++ thus exercising the error handling.
++
++ Only works with drivers that use the generic timeout handling;
++ for others it won't do anything.
++
config FAULT_INJECTION_DEBUG_FS
bool "Debugfs entries for fault-injection capabilities"
depends on FAULT_INJECTION && SYSFS && DEBUG_FS
Say N if you are unsure.
++ config DYNAMIC_PRINTK_DEBUG
++ bool "Enable dynamic printk() call support"
++ default n
++ depends on PRINTK
++ select PRINTK_DEBUG
++ help
++
++ Compiles debug-level messages into the kernel; these would not
++ otherwise be available at runtime. The messages can then be
++ enabled/disabled on a per-module basis. This mechanism implicitly
++ enables all pr_debug() and dev_dbg() calls. The impact of this
++ compile option is a larger kernel text size of about 2%.
++
++ Usage:
++
++ Dynamic debugging is controlled by the debugfs file,
++ dynamic_printk/modules. This file contains a list of the modules that
++ can be enabled. The format of the file is the module name, followed
++ by a set of flags that can be enabled. The first flag is always the
++ 'enabled' flag. For example:
++
++ <module_name> <enabled=0/1>
++ .
++ .
++ .
++
++ <module_name> : Name of the module in which the debug call resides
++ <enabled=0/1> : whether the messages are enabled or not
++
++ From a live system:
++
++ snd_hda_intel enabled=0
++ fixup enabled=0
++ driver enabled=0
++
++ Enable a module:
++
++ $ echo "set enabled=1 <module_name>" > dynamic_printk/modules
++
++ Disable a module:
++
++ $ echo "set enabled=0 <module_name>" > dynamic_printk/modules
++
++ Enable all modules:
++
++ $ echo "set enabled=1 all" > dynamic_printk/modules
++
++ Disable all modules:
++
++ $ echo "set enabled=0 all" > dynamic_printk/modules
++
++ Finally, passing "dynamic_printk" at the command line enables
++ debugging for all modules. This mode can be turned off via the above
++ disable command.
++
source "samples/Kconfig"
source "lib/Kconfig.kgdb"