#include <linux/string.h>
#include <linux/wait.h>
#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/completion.h>
#include <linux/file.h>
+ #include <linux/magic.h>
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
{
- return (struct autofs_sb_info *)(sb->s_fs_info);
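+ /* Only trust s_fs_info when the super block really is an autofs mount */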
+ return sb->s_magic != AUTOFS_SUPER_MAGIC ?
+ NULL : (struct autofs_sb_info *)(sb->s_fs_info);
}
static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
struct autofs_sb_info *,
struct autofs_packet_expire __user *);
int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int when);
+ struct autofs_sb_info *sbi, unsigned int how);
int autofs_expire_multi(struct super_block *, struct vfsmount *,
struct autofs_sb_info *, int __user *);
- struct dentry *autofs_expire_direct(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int how);
- struct dentry *autofs_expire_indirect(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int how);
/* Device node initialization */
struct list_head ptrace_entry;
/* PID/PID hash table linkage. */
- struct pid_link pids[PIDTYPE_MAX];
+ struct pid *thread_pid;
+ struct hlist_node pid_links[PIDTYPE_MAX];
struct list_head thread_group;
struct list_head thread_node;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
+ unsigned long last_switch_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
static inline struct pid *task_pid(struct task_struct *task)
{
- return task->pids[PIDTYPE_PID].pid;
-}
-
-static inline struct pid *task_tgid(struct task_struct *task)
-{
- return task->group_leader->pids[PIDTYPE_PID].pid;
-}
-
-/*
- * Without tasklist or RCU lock it is not safe to dereference
- * the result of task_pgrp/task_session even if task == current,
- * we can race with another thread doing sys_setsid/sys_setpgid.
- */
-static inline struct pid *task_pgrp(struct task_struct *task)
-{
- return task->group_leader->pids[PIDTYPE_PGID].pid;
-}
-
-static inline struct pid *task_session(struct task_struct *task)
-{
- return task->group_leader->pids[PIDTYPE_SID].pid;
+ return task->thread_pid;
}
/*
*/
static inline int pid_alive(const struct task_struct *p)
{
- return p->pids[PIDTYPE_PID].pid != NULL;
+ return p->thread_pid != NULL;
}
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
- return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns);
+ return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
}
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
- return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL);
+ return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
}
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
bool checking_timer;
};
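+/*
+ * Collects signals sent to a whole group of processes while a fork is in
+ * progress, so the new child receives them as well.
+ */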
+struct multiprocess_signals {
+ sigset_t signal;
+ struct hlist_node node;
+};
+
/*
* NOTE! "signal_struct" does not have its own
* locking, because a shared signal_struct always
/* shared signal handling: */
struct sigpending shared_pending;
+ /* For collecting multiprocess signals during fork */
+ struct hlist_head multiprocess;
+
/* thread group exit support */
int group_exit_code;
/* overloaded:
#endif
- struct pid *leader_pid;
+ /* PID/PID hash table linkage. */
+ struct pid *pids[PIDTYPE_MAX];
#ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
int force_sig_ptrace_errno_trap(int errno, void __user *addr);
extern int send_sig_info(int, struct siginfo *, struct task_struct *);
- extern int force_sigsegv(int, struct task_struct *);
+ extern void force_sigsegv(int sig, struct task_struct *p);
extern int force_sig_info(int, struct siginfo *, struct task_struct *);
extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
extern int zap_other_threads(struct task_struct *p);
extern struct sigqueue *sigqueue_alloc(void);
extern void sigqueue_free(struct sigqueue *);
-extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
+extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
static inline int restart_syscall(void)
*/
extern void recalc_sigpending_and_wake(struct task_struct *t);
extern void recalc_sigpending(void);
+extern void calculate_sigpending(void);
extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
}
+void task_join_group_stop(struct task_struct *task);
+
#ifdef TIF_RESTORE_SIGMASK
/*
* Legacy restore_sigmask accessors. These are inefficient on
typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);
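+/*
+ * Return the struct pid of @task for @type: the per-thread pid for
+ * PIDTYPE_PID, otherwise the process-wide pid stored in signal_struct.
+ */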
+static inline
+struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
+{
+ struct pid *pid;
+ if (type == PIDTYPE_PID)
+ pid = task_pid(task);
+ else
+ pid = task->signal->pids[type];
+ return pid;
+}
+
+static inline struct pid *task_tgid(struct task_struct *task)
+{
+ return task->signal->pids[PIDTYPE_TGID];
+}
+
+/*
+ * Without tasklist or RCU lock it is not safe to dereference
+ * the result of task_pgrp/task_session even if task == current,
+ * we can race with another thread doing sys_setsid/sys_setpgid.
+ */
+static inline struct pid *task_pgrp(struct task_struct *task)
+{
+ return task->signal->pids[PIDTYPE_PGID];
+}
+
+static inline struct pid *task_session(struct task_struct *task)
+{
+ return task->signal->pids[PIDTYPE_SID];
+}
+
static inline int get_nr_threads(struct task_struct *tsk)
{
return tsk->signal->nr_threads;
*/
static inline bool has_group_leader_pid(struct task_struct *p)
{
- return task_pid(p) == p->signal->leader_pid;
+ return task_pid(p) == task_tgid(p);
}
static inline
struct timespec;
struct pt_regs;
+enum pid_type;
extern int next_signal(struct sigpending *pending, sigset_t *mask);
extern int do_send_sig_info(int sig, struct siginfo *info,
- struct task_struct *p, bool group);
-extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
+ struct task_struct *p, enum pid_type type);
+extern int group_send_sig_info(int sig, struct siginfo *info,
+ struct task_struct *p, enum pid_type type);
extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
extern int sigprocmask(int, sigset_t *, sigset_t *);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;
- extern int get_signal(struct ksignal *ksig);
+ extern bool get_signal(struct ksignal *ksig);
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void exit_signals(struct task_struct *tsk);
extern void kernel_sigaction(int, __sighandler_t);
extern struct kmem_cache *sighand_cachep;
- int unhandled_signal(struct task_struct *tsk, int sig);
+ extern bool unhandled_signal(struct task_struct *tsk, int sig);
/*
* In POSIX a signal is sent either to a specific thread (Linux task)
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
- struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ struct vm_area_struct *vma;
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (vma)
vma_init(vma, mm);
return vma;
tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+ tsk->last_switch_time = 0;
#endif
tsk->mm = NULL;
return -ENOMEM;
atomic_set(&sig->count, 1);
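+ /* Take siglock so the child gets a consistent snapshot of the signal handlers */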
+ spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+ spin_unlock_irq(&current->sighand->siglock);
return 0;
}
init_waitqueue_head(&sig->wait_chldexit);
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
+ INIT_HLIST_HEAD(&sig->multiprocess);
seqlock_init(&sig->stats_lock);
prev_cputime_init(&sig->prev_cputime);
static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
#endif
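+/* Initialize the per-type hash-table linkage before any pids are attached. */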
+static inline void init_task_pid_links(struct task_struct *task)
+{
+ enum pid_type type;
+
+ for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
+ INIT_HLIST_NODE(&task->pid_links[type]);
+ }
+}
+
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
- task->pids[type].pid = pid;
+ if (type == PIDTYPE_PID)
+ task->thread_pid = pid;
+ else
+ task->signal->pids[type] = pid;
}
static inline void rcu_copy_process(struct task_struct *p)
{
int retval;
struct task_struct *p;
+ struct multiprocess_signals delayed;
/*
* Don't allow sharing the root directory with processes in a different
return ERR_PTR(-EINVAL);
}
+ /*
+ * Force any signals received before this point to be delivered
+ * before the fork happens. Collect up signals sent to multiple
+ * processes that happen during the fork and delay them so that
+ * they appear to happen after the fork.
+ */
+ sigemptyset(&delayed.signal);
+ INIT_HLIST_NODE(&delayed.node);
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (!(clone_flags & CLONE_THREAD))
+ hlist_add_head(&delayed.node, &current->signal->multiprocess);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ retval = -ERESTARTNOINTR;
+ if (signal_pending(current))
+ goto fork_out;
+
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
rseq_fork(p, clone_flags);
- /*
- * Process group and session signals need to be delivered to just the
- * parent before the fork or both the parent and the child after the
- * fork. Restart if a signal comes in before we add the new process to
- * it's process group.
- * A fatal signal pending means that current will exit, so the new
- * thread can't slip out of an OOM kill (or normal SIGKILL).
- */
- recalc_sigpending();
- if (signal_pending(current)) {
- retval = -ERESTARTNOINTR;
- goto bad_fork_cancel_cgroup;
- }
+ /* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
+ /* Let kill terminate clone/fork in the middle */
+ if (fatal_signal_pending(current)) {
+ retval = -EINTR;
+ goto bad_fork_cancel_cgroup;
+ }
+
+ init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
+ init_task_pid(p, PIDTYPE_TGID, pid);
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current));
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
-
- p->signal->leader_pid = pid;
+ p->signal->shared_pending.signal = delayed.signal;
p->signal->tty = tty_kref_get(current->signal->tty);
/*
* Inherit has_child_subreaper flag under the same
p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
+ attach_pid(p, PIDTYPE_TGID);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
current->signal->nr_threads++;
atomic_inc(&current->signal->live);
atomic_inc(&current->signal->sigcnt);
+ task_join_group_stop(p);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
-
total_forks++;
+ hlist_del_init(&delayed.node);
spin_unlock(&current->sighand->siglock);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
put_task_stack(p);
free_task(p);
fork_out:
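+ /* delayed.node may or may not be on the multiprocess list here; hlist_del_init() handles both */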
+ spin_lock_irq(&current->sighand->siglock);
+ hlist_del_init(&delayed.node);
+ spin_unlock_irq(&current->sighand->siglock);
return ERR_PTR(retval);
}
-static inline void init_idle_pids(struct pid_link *links)
+static inline void init_idle_pids(struct task_struct *idle)
{
enum pid_type type;
for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
- INIT_HLIST_NODE(&links[type].node); /* not really needed */
- links[type].pid = &init_struct_pid;
+ INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
+ init_task_pid(idle, type, &init_struct_pid);
}
}
task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
cpu_to_node(cpu));
if (!IS_ERR(task)) {
- init_idle_pids(task->pids);
+ init_idle_pids(task);
init_idle(task, cpu);
}
return t->sighand->action[sig - 1].sa.sa_handler;
}
- static int sig_handler_ignored(void __user *handler, int sig)
+ static inline bool sig_handler_ignored(void __user *handler, int sig)
{
/* Is it explicitly or implicitly ignored? */
return handler == SIG_IGN ||
- (handler == SIG_DFL && sig_kernel_ignore(sig));
+ (handler == SIG_DFL && sig_kernel_ignore(sig));
}
- static int sig_task_ignored(struct task_struct *t, int sig, bool force)
+ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
{
void __user *handler;
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
handler == SIG_DFL && !(force && sig_kernel_only(sig)))
- return 1;
+ return true;
return sig_handler_ignored(handler, sig);
}
- static int sig_ignored(struct task_struct *t, int sig, bool force)
+ static bool sig_ignored(struct task_struct *t, int sig, bool force)
{
/*
* Blocked signals are never ignored, since the
* unblocked.
*/
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
- return 0;
+ return false;
/*
* Tracers may want to know about even ignored signal unless it
* by SIGNAL_UNKILLABLE task.
*/
if (t->ptrace && sig != SIGKILL)
- return 0;
+ return false;
return sig_task_ignored(t, sig, force);
}
* Re-calculate pending state from the set of locally pending
* signals, globally pending signals, and blocked signals.
*/
- static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
unsigned long ready;
long i;
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
- static int recalc_sigpending_tsk(struct task_struct *t)
+ static bool recalc_sigpending_tsk(struct task_struct *t)
{
if ((t->jobctl & JOBCTL_PENDING_MASK) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
- return 1;
+ return true;
}
+
/*
* We must never clear the flag in another thread, or in current
* when it's possible the current syscall is returning -ERESTART*.
* So we don't clear it here, and only callers who know they should do.
*/
- return 0;
+ return false;
}
/*
}
+void calculate_sigpending(void)
+{
+ /* Have any signals or users of TIF_SIGPENDING been delayed
+ * until after fork?
+ */
+ spin_lock_irq(&current->sighand->siglock);
+ set_tsk_thread_flag(current, TIF_SIGPENDING);
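+ /* Let recalc_sigpending() decide whether TIF_SIGPENDING should stay set */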
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
/* Given the mask, find the first available signal that should be serviced. */
#define SYNCHRONOUS_MASK \
return false;
}
+void task_join_group_stop(struct task_struct *task)
+{
+ /* Have the new thread join an on-going signal group stop */
+ unsigned long jobctl = current->jobctl;
+ if (jobctl & JOBCTL_STOP_PENDING) {
+ struct signal_struct *sig = current->signal;
+ unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK;
+ unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+ if (task_set_jobctl_pending(task, signr | gstop)) {
+ sig->group_stop_count++;
+ }
+ }
+}
+
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
}
}
- int unhandled_signal(struct task_struct *tsk, int sig)
+ bool unhandled_signal(struct task_struct *tsk, int sig)
{
void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
if (is_global_init(tsk))
- return 1;
+ return true;
+
if (handler != SIG_IGN && handler != SIG_DFL)
- return 0;
+ return false;
+
/* if ptraced, let the tracer determine */
return !tsk->ptrace;
}
*
* All callers must be holding the siglock.
*/
- static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
+ static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
struct sigqueue *q, *n;
sigset_t m;
sigandsets(&m, mask, &s->signal);
if (sigisemptyset(&m))
- return 0;
+ return;
sigandnsets(&s->signal, &s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
__sigqueue_free(q);
}
}
- return 1;
}
static inline int is_si_special(const struct siginfo *info)
/*
* called with RCU read lock from check_kill_permission()
*/
- static int kill_ok_by_cred(struct task_struct *t)
+ static bool kill_ok_by_cred(struct task_struct *t)
{
const struct cred *cred = current_cred();
const struct cred *tcred = __task_cred(t);
- if (uid_eq(cred->euid, tcred->suid) ||
- uid_eq(cred->euid, tcred->uid) ||
- uid_eq(cred->uid, tcred->suid) ||
- uid_eq(cred->uid, tcred->uid))
- return 1;
-
- if (ns_capable(tcred->user_ns, CAP_KILL))
- return 1;
-
- return 0;
+ return uid_eq(cred->euid, tcred->suid) ||
+ uid_eq(cred->euid, tcred->uid) ||
+ uid_eq(cred->uid, tcred->suid) ||
+ uid_eq(cred->uid, tcred->uid) ||
+ ns_capable(tcred->user_ns, CAP_KILL);
}
/*
* as soon as they're available, so putting the signal on the shared queue
* will be equivalent to sending it to one such thread.
*/
- static inline int wants_signal(int sig, struct task_struct *p)
+ static inline bool wants_signal(int sig, struct task_struct *p)
{
if (sigismember(&p->blocked, sig))
- return 0;
+ return false;
+
if (p->flags & PF_EXITING)
- return 0;
+ return false;
+
if (sig == SIGKILL)
- return 1;
+ return true;
+
if (task_is_stopped_or_traced(p))
- return 0;
+ return false;
+
return task_curr(p) || !signal_pending(p);
}
-static void complete_signal(int sig, struct task_struct *p, int group)
+static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
*/
if (wants_signal(sig, p))
t = p;
- else if (!group || thread_group_empty(p))
+ else if ((type == PIDTYPE_PID) || thread_group_empty(p))
/*
* There is just one thread and it does not need to be woken.
* It will dequeue unblocked signals before it runs again.
return;
}
- static inline int legacy_queue(struct sigpending *signals, int sig)
+ static inline bool legacy_queue(struct sigpending *signals, int sig)
{
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
#endif
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
- int group, int from_ancestor_ns)
+ enum pid_type type, int from_ancestor_ns)
{
struct sigpending *pending;
struct sigqueue *q;
from_ancestor_ns || (info == SEND_SIG_FORCED)))
goto ret;
- pending = group ? &t->signal->shared_pending : &t->pending;
+ pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
/*
* Short-circuit ignored signals and support queuing
* exactly one non-rt signal, so that we can get more
out_set:
signalfd_notify(t, sig);
sigaddset(&pending->signal, sig);
- complete_signal(sig, t, group);
+
+ /* Let multiprocess signals appear after on-going forks */
+ if (type > PIDTYPE_TGID) {
+ struct multiprocess_signals *delayed;
+ hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
+ sigset_t *signal = &delayed->signal;
+ /* Can't queue both a stop and a continue signal */
+ if (sig == SIGCONT)
+ sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
+ else if (sig_kernel_stop(sig))
+ sigdelset(signal, SIGCONT);
+ sigaddset(signal, sig);
+ }
+ }
+
+ complete_signal(sig, t, type);
ret:
- trace_signal_generate(sig, info, t, group, result);
+ trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
return ret;
}
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
- int group)
+ enum pid_type type)
{
int from_ancestor_ns = 0;
!task_pid_nr_ns(current, task_active_pid_ns(t));
#endif
- return __send_signal(sig, info, t, group, from_ancestor_ns);
+ return __send_signal(sig, info, t, type, from_ancestor_ns);
}
static void print_fatal_signal(int signr)
int
__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
- return send_signal(sig, info, p, 1);
+ return send_signal(sig, info, p, PIDTYPE_TGID);
}
static int
specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
- return send_signal(sig, info, t, 0);
+ return send_signal(sig, info, t, PIDTYPE_PID);
}
int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
- bool group)
+ enum pid_type type)
{
unsigned long flags;
int ret = -ESRCH;
if (lock_task_sighand(p, &flags)) {
- ret = send_signal(sig, info, p, group);
+ ret = send_signal(sig, info, p, type);
unlock_task_sighand(p, &flags);
}
/*
* send signal info to all the members of a group
*/
-int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
+ enum pid_type type)
{
int ret;
rcu_read_unlock();
if (!ret && sig)
- ret = do_send_sig_info(sig, info, p, true);
+ ret = do_send_sig_info(sig, info, p, type);
return ret;
}
success = 0;
retval = -ESRCH;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- int err = group_send_sig_info(sig, info, p);
+ int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
success |= !err;
retval = err;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (p)
- error = group_send_sig_info(sig, info, p);
+ error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
rcu_read_unlock();
if (likely(!p || error != -ESRCH))
return error;
return error;
}
- static int kill_as_cred_perm(const struct cred *cred,
- struct task_struct *target)
+ static inline bool kill_as_cred_perm(const struct cred *cred,
+ struct task_struct *target)
{
const struct cred *pcred = __task_cred(target);
- if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
- !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid))
- return 0;
- return 1;
+
+ return uid_eq(cred->euid, pcred->suid) ||
+ uid_eq(cred->euid, pcred->uid) ||
+ uid_eq(cred->uid, pcred->suid) ||
+ uid_eq(cred->uid, pcred->uid);
}
/* like kill_pid_info(), but doesn't use uid/euid of "current" */
if (sig) {
if (lock_task_sighand(p, &flags)) {
- ret = __send_signal(sig, info, p, 1, 0);
+ ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0);
unlock_task_sighand(p, &flags);
} else
ret = -ESRCH;
for_each_process(p) {
if (task_pid_vnr(p) > 1 &&
!same_thread_group(p, current)) {
- int err = group_send_sig_info(sig, info, p);
+ int err = group_send_sig_info(sig, info, p,
+ PIDTYPE_MAX);
++count;
if (err != -EPERM)
retval = err;
if (!valid_signal(sig))
return -EINVAL;
- return do_send_sig_info(sig, info, p, false);
+ return do_send_sig_info(sig, info, p, PIDTYPE_PID);
}
#define __si_special(priv) \
return send_sig_info(sig, __si_special(priv), p);
}
- void
- force_sig(int sig, struct task_struct *p)
+ void force_sig(int sig, struct task_struct *p)
{
force_sig_info(sig, SEND_SIG_PRIV, p);
}
* the problem was already a SIGSEGV, we'll want to
* make sure we don't even try to deliver the signal..
*/
- int
- force_sigsegv(int sig, struct task_struct *p)
+ void force_sigsegv(int sig, struct task_struct *p)
{
if (sig == SIGSEGV) {
unsigned long flags;
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
force_sig(SIGSEGV, p);
- return 0;
}
int force_sig_fault(int sig, int code, void __user *addr
__sigqueue_free(q);
}
-int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
+int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
{
int sig = q->info.si_signo;
struct sigpending *pending;
+ struct task_struct *t;
unsigned long flags;
int ret, result;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
ret = -1;
- if (!likely(lock_task_sighand(t, &flags)))
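+ /* Resolve the target task from the pid under RCU; it may already be gone */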
+ rcu_read_lock();
+ t = pid_task(pid, type);
+ if (!t || !likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
q->info.si_overrun = 0;
signalfd_notify(t, sig);
- pending = group ? &t->signal->shared_pending : &t->pending;
+ pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
list_add_tail(&q->list, &pending->list);
sigaddset(&pending->signal, sig);
- complete_signal(sig, t, group);
+ complete_signal(sig, t, type);
result = TRACE_SIGNAL_DELIVERED;
out:
- trace_signal_generate(sig, &q->info, t, group, result);
+ trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
unlock_task_sighand(t, &flags);
ret:
+ rcu_read_unlock();
return ret;
}
spin_unlock_irqrestore(&sighand->siglock, flags);
}
- static inline int may_ptrace_stop(void)
+ static inline bool may_ptrace_stop(void)
{
if (!likely(current->ptrace))
- return 0;
+ return false;
/*
* Are we in the middle of do_coredump?
* If so and our tracer is also part of the coredump stopping
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
* Return non-zero if there is a SIGKILL that should be waking us up.
* Called with the siglock held.
*/
- static int sigkill_pending(struct task_struct *tsk)
+ static bool sigkill_pending(struct task_struct *tsk)
{
- return sigismember(&tsk->pending.signal, SIGKILL) ||
- sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
+ return sigismember(&tsk->pending.signal, SIGKILL) ||
+ sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
}
/*
return signr;
}
- int get_signal(struct ksignal *ksig)
+ bool get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
task_work_run();
if (unlikely(uprobe_deny_signal()))
- return 0;
+ return false;
/*
* Do this once, we can't return to user-mode if freezing() == T.
}
#endif
- static int do_sigpending(sigset_t *set)
+ static void do_sigpending(sigset_t *set)
{
spin_lock_irq(&current->sighand->siglock);
sigorsets(set, &current->pending.signal,
/* Outside the lock because only this thread touches it. */
sigandsets(set, &current->blocked, set);
- return 0;
}
/**
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
sigset_t set;
- int err;
if (sigsetsize > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err && copy_to_user(uset, &set, sigsetsize))
- err = -EFAULT;
- return err;
+ do_sigpending(&set);
+
+ if (copy_to_user(uset, &set, sigsetsize))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
compat_size_t, sigsetsize)
{
sigset_t set;
- int err;
if (sigsetsize > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err)
- err = put_compat_sigset(uset, &set, sigsetsize);
- return err;
+ do_sigpending(&set);
+
+ return put_compat_sigset(uset, &set, sigsetsize);
}
#endif
* probe. No signal is actually delivered.
*/
if (!error && sig) {
- error = do_send_sig_info(sig, info, p, false);
+ error = do_send_sig_info(sig, info, p, PIDTYPE_PID);
/*
* If lock_task_sighand() failed we pretend the task
* dies after receiving the signal. The window is tiny,
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
sigset_t set;
- int err;
if (sizeof(old_sigset_t) > sizeof(*uset))
return -EINVAL;
- err = do_sigpending(&set);
- if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t)))
- err = -EFAULT;
- return err;
+ do_sigpending(&set);
+
+ if (copy_to_user(uset, &set, sizeof(old_sigset_t)))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
sigset_t set;
- int err = do_sigpending(&set);
- if (!err)
- err = put_user(set.sig[0], set32);
- return err;
+
+ do_sigpending(&set);
+
+ return put_user(set.sig[0], set32);
}
#endif
size_t, sigsetsize)
{
struct k_sigaction new_sa, old_sa;
- int ret = -EINVAL;
+ int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
- goto out;
+ return -EINVAL;
- if (act) {
- if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
- return -EFAULT;
- }
+ if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
+ return -EFAULT;
ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
+ if (ret)
+ return ret;
- if (!ret && oact) {
- if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
- return -EFAULT;
- }
- out:
- return ret;
+ if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
+ return -EFAULT;
+
+ return 0;
}
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
"the deadlock.\n");
return;
}
- ret = send_signal(sig, SEND_SIG_PRIV, t, false);
+ ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
spin_unlock(&t->sighand->siglock);
if (ret)
kdb_printf("Fail to deliver Signal %d to process %d.\n",
struct task_struct *p;
struct task_struct *task;
- pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+ pr_info("Tasks state (memory values in pages):\n");
+ pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
mm_pgtables_bytes(task->mm),
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
- void __oom_reap_task_mm(struct mm_struct *mm)
+ bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ bool ret = true;
/*
* Tell all users of get_user/copy_from_user etc... that the content
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, mm, start, end);
- mmu_notifier_invalidate_range_start(mm, start, end);
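+ /*
+ * If a notifier would have to block, skip this vma and report partial
+ * progress so the caller retries later.
+ */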
+ if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
+ ret = false;
+ continue;
+ }
unmap_page_range(&tlb, vma, start, end, NULL);
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
}
+
+ return ret;
}
+ /*
+ * Reaps the address space of the given task.
+ *
+ * Returns true on success and false if none or part of the address space
+ * has been reclaimed and the caller should retry later.
+ */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
bool ret = true;
- /*
- * We have to make sure to not race with the victim exit path
- * and cause premature new oom victim selection:
- * oom_reap_task_mm exit_mm
- * mmget_not_zero
- * mmput
- * atomic_dec_and_test
- * exit_oom_victim
- * [...]
- * out_of_memory
- * select_bad_process
- * # no TIF_MEMDIE task selects new victim
- * unmap_page_range # frees some memory
- */
- mutex_lock(&oom_lock);
-
if (!down_read_trylock(&mm->mmap_sem)) {
- ret = false;
trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
- }
-
- /*
- * If the mm has invalidate_{start,end}() notifiers that could block,
- * sleep to give the oom victim some more time.
- * TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time
- */
- if (mm_has_blockable_invalidate_notifiers(mm)) {
- up_read(&mm->mmap_sem);
- schedule_timeout_idle(HZ);
- goto unlock_oom;
+ return false;
}
/*
* down_write();up_write() cycle in exit_mmap().
*/
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
- up_read(&mm->mmap_sem);
trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
+ goto out_unlock;
}
trace_start_task_reaping(tsk->pid);
- __oom_reap_task_mm(mm);
+ /* failed to reap part of the address space. Try again later */
+ ret = __oom_reap_task_mm(mm);
+ if (!ret)
+ goto out_finish;
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
K(get_mm_counter(mm, MM_FILEPAGES)),
K(get_mm_counter(mm, MM_SHMEMPAGES)));
+ out_finish:
+ trace_finish_task_reaping(tsk->pid);
+ out_unlock:
up_read(&mm->mmap_sem);
- trace_finish_task_reaping(tsk->pid);
- unlock_oom:
- mutex_unlock(&oom_lock);
return ret;
}
return ret;
}
- static void oom_kill_process(struct oom_control *oc, const char *message)
+ static void __oom_kill_process(struct task_struct *victim)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *p;
struct mm_struct *mm;
- unsigned int victim_points = 0;
- static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
bool can_oom_reap = true;
- /*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just give it access to memory reserves
- * so it can die quickly
- */
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
- return;
- }
- task_unlock(p);
-
- if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
-
p = find_lock_task_mm(victim);
if (!p) {
put_task_struct(victim);
* in order to prevent the OOM victim from depleting the memory
* reserves from the user space under its control.
*/
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
*/
if (unlikely(p->flags & PF_KTHREAD))
continue;
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
}
rcu_read_unlock();
}
#undef K
+ /*
+ * Kill the provided task unless it is secured by setting
+ * oom_score_adj to OOM_SCORE_ADJ_MIN.
+ */
+ static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+ {
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ get_task_struct(task);
+ __oom_kill_process(task);
+ }
+ return 0;
+ }
+
+ static void oom_kill_process(struct oom_control *oc, const char *message)
+ {
+ struct task_struct *p = oc->chosen;
+ unsigned int points = oc->chosen_points;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t;
+ struct mem_cgroup *oom_group;
+ unsigned int victim_points = 0;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
+ */
+ task_lock(p);
+ if (task_will_free_mem(p)) {
+ mark_oom_victim(p);
+ wake_oom_reaper(p);
+ task_unlock(p);
+ put_task_struct(p);
+ return;
+ }
+ task_unlock(p);
+
+ if (__ratelimit(&oom_rs))
+ dump_header(oc, p);
+
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ read_lock(&tasklist_lock);
+ for_each_thread(p, t) {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ if (process_shares_mm(child, p->mm))
+ continue;
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, oc->totalpages);
+ if (child_points > victim_points) {
+ put_task_struct(victim);
+ victim = child;
+ victim_points = child_points;
+ get_task_struct(victim);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Do we need to kill the entire memory cgroup?
+ * Or even one of the ancestor memory cgroups?
+ * Check this out before killing the victim task.
+ */
+ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
+
+ __oom_kill_process(victim);
+
+ /*
+ * If necessary, kill all tasks in the selected memory cgroup.
+ */
+ if (oom_group) {
+ mem_cgroup_print_oom_group(oom_group);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_put(oom_group);
+ }
+ }
+
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;
- __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end)
+ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end, bool blockable)
{
+ return 0;
}
bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
srcu_read_unlock(&kvm->srcu, idx);
}
- static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int need_tlb_flush = 0, idx;
+ int ret;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
spin_unlock(&kvm->mmu_lock);
- kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+ ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable);
srcu_read_unlock(&kvm->srcu, idx);
+
+ return ret;
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
if (arg)
goto out;
oldpid = rcu_access_pointer(vcpu->pid);
- if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) {
+ if (unlikely(oldpid != task_pid(current))) {
/* The thread running this VCPU changed. */
struct pid *newpid;