regs->ax = -EINTR;
break;
}
- /* fallthrough */
+ fallthrough;
case -ERESTARTNOINTR:
regs->ax = regs->orig_ax;
regs->ip -= 2;
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
- void arch_do_signal(struct pt_regs *regs)
+ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
{
struct ksignal ksig;
- if (get_signal(&ksig)) {
+ if (has_signal && get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
handle_signal(&ksig, regs);
return;
# define _TIF_UPROBE (0)
#endif
+ #ifndef _TIF_NOTIFY_SIGNAL
+ # define _TIF_NOTIFY_SIGNAL (0)
+ #endif
+
/*
- * TIF flags handled in syscall_enter_from_usermode()
+ * TIF flags handled in syscall_enter_from_user_mode()
*/
#ifndef ARCH_SYSCALL_ENTER_WORK
# define ARCH_SYSCALL_ENTER_WORK (0)
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \
+ _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
#endif
/**
- * syscall_enter_from_user_mode - Check and handle work before invoking
- * a syscall
+ * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
* @regs: Pointer to current's pt_regs
- * @syscall: The syscall number
*
* Invoked from architecture specific syscall entry code with interrupts
* disabled. The calling code has to be non-instrumentable. When the
- * function returns all state is correct and the subsequent functions can be
- * instrumented.
+ * function returns all state is correct, interrupts are enabled and the
+ * subsequent functions can be instrumented.
+ *
+ * This handles lockdep, RCU (context tracking) and tracing state.
+ *
+ * This is invoked when there is extra architecture specific functionality
+ * to be done between establishing state and handling user mode entry work.
+ */
+void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
+
+/**
+ * syscall_enter_from_user_mode_work - Check and handle work before invoking
+ * a syscall
+ * @regs: Pointer to current's pt_regs
+ * @syscall: The syscall number
+ *
+ * Invoked from architecture specific syscall entry code with interrupts
+ * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
+ * architecture specific work.
*
* Returns: The original or a modified syscall number
*
* syscall_set_return_value() first. If neither of those are called and -1
* is returned, then the syscall will fail with ENOSYS.
*
- * The following functionality is handled here:
+ * It handles the following work items:
*
- * 1) Establish state (lockdep, RCU (context tracking), tracing)
- * 2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
+ * 1) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
* __secure_computing(), trace_sys_enter()
- * 3) Invocation of audit_syscall_entry()
+ * 2) Invocation of audit_syscall_entry()
+ */
+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
+
+/**
+ * syscall_enter_from_user_mode - Establish state and check and handle work
+ * before invoking a syscall
+ * @regs: Pointer to current's pt_regs
+ * @syscall: The syscall number
+ *
+ * Invoked from architecture specific syscall entry code with interrupts
+ * disabled. The calling code has to be non-instrumentable. When the
+ * function returns all state is correct, interrupts are enabled and the
+ * subsequent functions can be instrumented.
+ *
+ * This is a combination of syscall_enter_from_user_mode_prepare() and
+ * syscall_enter_from_user_mode_work().
+ *
+ * Returns: The original or a modified syscall number. See
+ * syscall_enter_from_user_mode_work() for further explanation.
*/
long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
#endif
/**
- * arch_do_signal - Architecture specific signal delivery function
+ * arch_do_signal_or_restart - Architecture specific signal delivery function
* @regs: Pointer to current's pt_regs
+ * @has_signal: True if TIF_SIGPENDING was set, i.e. there is an actual signal to handle
*
* Invoked from exit_to_user_mode_loop().
*/
- void arch_do_signal(struct pt_regs *regs);
+ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal);
/**
* arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
*/
static inline void tracehook_notify_resume(struct pt_regs *regs)
{
+ clear_thread_flag(TIF_NOTIFY_RESUME);
/*
- * The caller just cleared TIF_NOTIFY_RESUME. This barrier
- * pairs with task_work_add()->set_notify_resume() after
+ * This barrier pairs with task_work_add()->set_notify_resume() after
* hlist_add_head(task->task_works);
*/
smp_mb__after_atomic();
blkcg_maybe_throttle_current();
}
+ /*
+ * Called when _TIF_NOTIFY_SIGNAL is found set in the thread's work flags,
+ * e.g. on the exit-to-user-mode path. This is currently used by TWA_SIGNAL
+ * based task_work, which requires breaking wait loops to ensure that
+ * task_work is noticed and run.
+ *
+ * TIF_NOTIFY_SIGNAL is cleared first and only then is current->task_works
+ * checked; the barrier after the atomic flag clear sits between the two.
+ * Queued work, if any, is run through task_work_run().
+ *
+ * The body is compiled out if the architecture does not define
+ * TIF_NOTIFY_SIGNAL.
+ */
+ static inline void tracehook_notify_signal(void)
+ {
+ #if defined(TIF_NOTIFY_SIGNAL)
+ clear_thread_flag(TIF_NOTIFY_SIGNAL);
+ smp_mb__after_atomic();
+ if (current->task_works)
+ task_work_run();
+ #endif
+ }
+
+ /*
+ * Called to tell @task that it has work to process from
+ * exit_to_user_mode_loop(). Sets TIF_NOTIFY_SIGNAL on @task; if the flag was
+ * not already set, tries to wake the task out of TASK_INTERRUPTIBLE with
+ * wake_up_state(), and if that returns 0 falls back to kick_process().
+ *
+ * The body is compiled out if the architecture does not define
+ * TIF_NOTIFY_SIGNAL.
+ */
+ static inline void set_notify_signal(struct task_struct *task)
+ {
+ #if defined(TIF_NOTIFY_SIGNAL)
+ if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
+ !wake_up_state(task, TASK_INTERRUPTIBLE))
+ kick_process(task);
+ #endif
+ }
+
#endif /* <linux/tracehook.h> */
return ret;
}
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, syscall);
return ret ? : syscall;
}
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+static __always_inline long
+__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
unsigned long ti_work;
- enter_from_user_mode(regs);
- instrumentation_begin();
-
- local_irq_enable();
ti_work = READ_ONCE(current_thread_info()->flags);
if (ti_work & SYSCALL_ENTER_WORK)
syscall = syscall_trace_enter(regs, syscall, ti_work);
- instrumentation_end();
return syscall;
}
+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+ return __syscall_enter_from_user_work(regs, syscall); /* same helper as syscall_enter_from_user_mode() */
+}
+/*
+ * Combined syscall entry: establish state with enter_from_user_mode(), then,
+ * inside the instrumentation_begin()/instrumentation_end() section, enable
+ * interrupts and run the syscall entry work through
+ * __syscall_enter_from_user_work(). Returns that helper's result.
+ */
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+ long ret;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ local_irq_enable();
+ ret = __syscall_enter_from_user_work(regs, syscall);
+ instrumentation_end();
+
+ return ret;
+}
+/*
+ * First half of syscall entry only: establish state with
+ * enter_from_user_mode() and enable interrupts. No syscall entry work is run
+ * here; that is left to syscall_enter_from_user_mode_work().
+ */
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+ instrumentation_begin();
+ local_irq_enable();
+ instrumentation_end();
+}
+
/**
* exit_to_user_mode - Fixup state when exiting to user mode
*
}
/* Workaround to allow gradual conversion of architecture code */
- void __weak arch_do_signal(struct pt_regs *regs) { }
+ void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
+ /*
+ * Handle _TIF_NOTIFY_SIGNAL and/or _TIF_SIGPENDING work. Runs
+ * tracehook_notify_signal() if _TIF_NOTIFY_SIGNAL is set in @ti_work, then
+ * always calls arch_do_signal_or_restart(), passing whether _TIF_SIGPENDING
+ * was set as its has_signal argument.
+ */
+ static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
+ {
+ if (ti_work & _TIF_NOTIFY_SIGNAL)
+ tracehook_notify_signal();
+
+ arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
+ }
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
- if (ti_work & _TIF_SIGPENDING)
- arch_do_signal(regs);
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ handle_signal_work(regs, ti_work);
if (ti_work & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
rseq_handle_notify_resume(NULL, regs);
}
/*
* If TIF_SYSCALL_EMU is set, then the only reason to report is when
* TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_usermode().
+ * instruction has been already reported in syscall_enter_from_user_mode().
*/
#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
* terminate a grace period, if and only if the timer interrupt is
* not nested into another interrupt.
*
- * Checking for __rcu_is_watching() here would prevent the nesting
+ * Checking for rcu_is_watching() here would prevent the nesting
* interrupt to invoke rcu_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* already contains a warning when RCU is not watching, so no point
* in having another one here.
*/
+ lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
rcu_irq_enter_check_tick();
- /* Use the combo lockdep/tracing function */
- trace_hardirqs_off();
+ trace_hardirqs_off_finish();
instrumentation_end();
return ret;
do {
int ret;
+ if (ti_work & _TIF_NOTIFY_SIGNAL)
+ tracehook_notify_signal();
+
if (ti_work & _TIF_SIGPENDING) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
if (ti_work & _TIF_NEED_RESCHED)
schedule();
- if (ti_work & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
+ if (ti_work & _TIF_NOTIFY_RESUME)
tracehook_notify_resume(NULL);
- }
ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
if (ret)
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);
- if (vma->vm_flags & VM_LOCKED)
+ if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
munlock_vma_page(old_page);
put_page(old_page);
t->utask->dup_xol_addr = area->vaddr;
init_task_work(&t->utask->dup_xol_work, dup_xol_work);
- task_work_add(t, &t->utask->dup_xol_work, true);
+ task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}
/*
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
- if (signal_pending(t)) {
+ if (task_sigpending(t)) {
spin_lock_irq(&t->sighand->siglock);
clear_tsk_thread_flag(t, TIF_SIGPENDING);
spin_unlock_irq(&t->sighand->siglock);
void task_join_group_stop(struct task_struct *task)
{
+ unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK;
+ struct signal_struct *sig = current->signal;
+
+ if (sig->group_stop_count) {
+ sig->group_stop_count++;
+ mask |= JOBCTL_STOP_CONSUME;
+ } else if (!(sig->flags & SIGNAL_STOP_STOPPED))
+ return;
+
/* Have the new thread join an on-going signal group stop */
- unsigned long jobctl = current->jobctl;
- if (jobctl & JOBCTL_STOP_PENDING) {
- struct signal_struct *sig = current->signal;
- unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK;
- unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
- if (task_set_jobctl_pending(task, signr | gstop)) {
- sig->group_stop_count++;
- }
- }
+ task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
}
/*
*/
if (!sid || sid == task_session(current))
break;
- /* fall through */
+ fallthrough;
default:
return -EPERM;
}
if (task_is_stopped_or_traced(p))
return false;
- return task_curr(p) || !signal_pending(p);
+ return task_curr(p) || !task_sigpending(p);
}
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
struct signal_struct *signal = current->signal;
int signr;
+ /*
+ * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
+ * that the arch handlers don't all have to do it. If we get here
+ * without TIF_SIGPENDING, just exit after running signal work.
+ */
+ #ifdef TIF_NOTIFY_SIGNAL
+ if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL))
+ tracehook_notify_signal();
+ if (!task_sigpending(current))
+ return false;
+ }
+ #endif
+
if (unlikely(uprobe_deny_signal()))
return false;
/* Remove the signals this thread can handle. */
sigandsets(&retarget, &retarget, &t->blocked);
- if (!signal_pending(t))
+ if (!task_sigpending(t))
signal_wake_up(t, 0);
if (sigisemptyset(&retarget))
cgroup_threadgroup_change_end(tsk);
- if (!signal_pending(tsk))
+ if (!task_sigpending(tsk))
goto out;
unblocked = tsk->blocked;
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
- if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+ if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
sigset_t newblocked;
/* A set of now blocked but previously unblocked signals. */
sigandnsets(&newblocked, newset, &current->blocked);
static struct callback_head work_exited; /* all we need is ->next == NULL */
+ /*
+ * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster
+ * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is
+ * shared for threads, and can cause contention on sighand->siglock. Even for
+ * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking
+ * or IRQ disabling is involved for notification (or running) purposes.
+ *
+ * When TIF_NOTIFY_SIGNAL is not defined, fall back to marking the task with
+ * JOBCTL_TASK_WORK and calling signal_wake_up() under the sighand lock.
+ */
+ static void task_work_notify_signal(struct task_struct *task)
+ {
+ #if defined(TIF_NOTIFY_SIGNAL)
+ set_notify_signal(task);
+ #else
+ unsigned long flags;
+
+ /*
+ * Only grab the sighand lock if we don't already have some
+ * task_work pending. This pairs with the smp_store_mb()
+ * in get_signal(), see comment there.
+ *
+ * If lock_task_sighand() fails, no notification is sent.
+ */
+ if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
+ lock_task_sighand(task, &flags)) {
+ task->jobctl |= JOBCTL_TASK_WORK;
+ signal_wake_up(task, 0);
+ unlock_task_sighand(task, &flags);
+ }
+ #endif
+ }
+
/**
* task_work_add - ask the @task to execute @work->func()
* @task: the task which should run the callback
* @work: the callback to run
- * @notify: send the notification if true
+ * @notify: how to notify the targeted task
*
- * Queue @work for task_work_run() below and notify the @task if @notify.
- * Fails if the @task is exiting/exited and thus it can't process this @work.
- * Otherwise @work->func() will be called when the @task returns from kernel
- * mode or exits.
+ * Queue @work for task_work_run() below and notify the @task if @notify
+ * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that
+ * it will interrupt the targeted task and run the task_work. @TWA_RESUME
+ * work is run only when the task exits the kernel and returns to user mode,
+ * or before entering guest mode. Fails if the @task is exiting/exited and thus
+ * it can't process this @work. Otherwise @work->func() will be called when the
+ * @task goes through one of the aforementioned transitions, or exits.
*
- * This is like the signal handler which runs in kernel mode, but it doesn't
- * try to wake up the @task.
+ * If the targeted task is exiting, then an error is returned and the work item
+ * is not queued. It's up to the caller to arrange for an alternative mechanism
+ * in that case.
*
- * Note: there is no ordering guarantee on works queued here.
+ * Note: there is no ordering guarantee on works queued here. The task_work
+ * list is LIFO.
*
* RETURNS:
* 0 if succeeds or -ESRCH.
*/
-int
-task_work_add(struct task_struct *task, struct callback_head *work, int notify)
+int task_work_add(struct task_struct *task, struct callback_head *work,
+ enum task_work_notify_mode notify)
{
struct callback_head *head;
- unsigned long flags;
do {
head = READ_ONCE(task->task_works);
} while (cmpxchg(&task->task_works, head, work) != head);
switch (notify) {
+ case TWA_NONE:
+ break;
case TWA_RESUME:
set_notify_resume(task);
break;
case TWA_SIGNAL:
- /*
- * Only grab the sighand lock if we don't already have some
- * task_work pending. This pairs with the smp_store_mb()
- * in get_signal(), see comment there.
- */
- if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
- lock_task_sighand(task, &flags)) {
- task->jobctl |= JOBCTL_TASK_WORK;
- signal_wake_up(task, 0);
- unlock_task_sighand(task, &flags);
- }
+ task_work_notify_signal(task);
break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
return 0;