[linux.git] kernel/entry/common.c (blob at commit "sched: highmem: Store local kmaps in task struct")
// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall/interrupt entry disables interrupts, but user mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
static __always_inline void enter_from_user_mode(struct pt_regs *regs)
{
        arch_check_user_regs(regs);
        lockdep_hardirqs_off(CALLER_ADDR0);

        CT_WARN_ON(ct_state() != CONTEXT_USER);
        user_exit_irqoff();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

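                /* Audit captures the syscall number and its first four arguments */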
                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
                                unsigned long ti_work)
{
        long ret = 0;

        /* Handle ptrace */
        if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
                ret = arch_syscall_enter_tracehook(regs);
                if (ret || (ti_work & _TIF_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (ti_work & _TIF_SECCOMP) {
                ret = __secure_computing(NULL);
                if (ret == -1L)
                        return ret;
        }

        /* Either of the above might have changed the syscall number */
        syscall = syscall_get_nr(current, regs);

        if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
                trace_sys_enter(regs, syscall);

        syscall_enter_audit(regs, syscall);

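        /*
         * GNU "?:" extension: return ret when it is non-zero, i.e. a
         * request to skip the syscall, otherwise the possibly rewritten
         * syscall number.
         */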
        return ret ? : syscall;
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
        unsigned long ti_work;

        ti_work = READ_ONCE(current_thread_info()->flags);
        if (ti_work & SYSCALL_ENTER_WORK)
                syscall = syscall_trace_enter(regs, syscall, ti_work);

        return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
        return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
        long ret;

        enter_from_user_mode(regs);

        instrumentation_begin();
        local_irq_enable();
        ret = __syscall_enter_from_user_work(regs, syscall);
        instrumentation_end();

        return ret;
}
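
/*
 * Illustrative sketch, not part of this file: roughly how an
 * architecture's C-level syscall glue is expected to string these
 * helpers together. The handler name and the pt_regs fields (orig_nr,
 * ret) are hypothetical placeholders, not a real arch interface:
 *
 *	__visible noinstr void arch_do_syscall(struct pt_regs *regs)
 *	{
 *		long nr = syscall_enter_from_user_mode(regs, regs->orig_nr);
 *
 *		instrumentation_begin();
 *		if (nr >= 0 && nr < NR_syscalls)
 *			regs->ret = sys_call_table[nr](regs);
 *		instrumentation_end();
 *
 *		syscall_exit_to_user_mode(regs);
 *	}
 */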

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
        enter_from_user_mode(regs);
        instrumentation_begin();
        local_irq_enable();
        instrumentation_end();
}
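
/*
 * Editorial note: the split into _prepare() above and
 * syscall_enter_from_user_mode_work() exists for architectures that
 * need to do something between establishing entry state and running
 * the entry work; x86's 32-bit fast-syscall path, for instance,
 * fetches the sixth syscall argument from the user stack in between,
 * which is only safe with interrupts enabled.
 */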

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall/interrupt exit enables interrupts, but interrupts are still
 * disabled in the kernel when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Invoke architecture specific last minute exit code, e.g. speculation
 *    mitigations, etc.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        instrumentation_end();

        user_enter_irqoff();
        arch_exit_to_user_mode();
        lockdep_hardirqs_on(CALLER_ADDR0);
}
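
/*
 * Editorial note: arch_exit_to_user_mode() is invoked after
 * user_enter_irqoff() and outside any instrumentation_begin()/end()
 * section, so it must be noinstr-safe; that is why it is reserved for
 * last-minute work such as speculation mitigations.
 */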

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                            unsigned long ti_work)
{
        /*
         * Before returning to user space ensure that all pending work
         * items have been completed.
         */
        while (ti_work & EXIT_TO_USER_MODE_WORK) {

                local_irq_enable_exit_to_user(ti_work);

                if (ti_work & _TIF_NEED_RESCHED)
                        schedule();

                if (ti_work & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (ti_work & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                if (ti_work & _TIF_SIGPENDING)
                        arch_do_signal(regs);

                if (ti_work & _TIF_NOTIFY_RESUME) {
                        tracehook_notify_resume(regs);
                        rseq_handle_notify_resume(NULL, regs);
                }

                /* Architecture specific TIF work */
                arch_exit_to_user_mode_work(regs, ti_work);

                /*
                 * Disable interrupts and reevaluate the work flags as they
                 * might have changed while interrupts and preemption were
                 * enabled above.
                 */
                local_irq_disable_exit_to_user();
                ti_work = READ_ONCE(current_thread_info()->flags);
        }

        /* Return the latest work state for arch_exit_to_user_mode() */
        return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

        lockdep_assert_irqs_disabled();

        if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                ti_work = exit_to_user_mode_loop(regs, ti_work);

        arch_exit_to_user_mode_prepare(regs, ti_work);

        /* Ensure that the address limit is intact and no locks are held */
        addr_limit_user_check();
        kmap_assert_nomap();
        lockdep_assert_irqs_disabled();
        lockdep_sys_exit();
}

#ifndef _TIF_SINGLESTEP
static inline bool report_single_step(unsigned long ti_work)
{
        return false;
}
#else
/*
 * If TIF_SYSCALL_EMU is set, then the only reason to report is when
 * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
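/*
 * Editorial note: (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP below is
 * true only when TIF_SINGLESTEP is set and TIF_SYSCALL_EMU is clear,
 * so a PTRACE_SYSEMU_SINGLESTEP stop is not reported a second time at
 * exit.
 */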
#define SYSEMU_STEP     (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)

static inline bool report_single_step(unsigned long ti_work)
{
        return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
}
#endif

static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
{
        bool step;

        audit_syscall_exit(regs);

        if (ti_work & _TIF_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(ti_work);
        if (step || ti_work & _TIF_SYSCALL_TRACE)
                arch_syscall_exit_tracehook(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
        u32 cached_flags = READ_ONCE(current_thread_info()->flags);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_syscall(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
                syscall_exit_work(regs, cached_flags);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        syscall_exit_to_user_mode_prepare(regs);
        local_irq_disable_exit_to_user();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
        enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        irqentry_state_t ret = {
                .exit_rcu = false,
        };

        if (user_mode(regs)) {
                irqentry_enter_from_user_mode(regs);
                return ret;
        }

        /*
         * If this entry hit the idle task, invoke rcu_irq_enter() whether
         * RCU is watching or not.
         *
         * Interrupts can nest when the first interrupt invokes softirq
         * processing on return which enables interrupts.
         *
         * Scheduler ticks in the idle task can mark quiescent state and
         * terminate a grace period, if and only if the timer interrupt is
         * not nested into another interrupt.
         *
         * Checking for rcu_is_watching() here would prevent the nesting
         * interrupt from invoking rcu_irq_enter(). If that nested interrupt
         * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
         * assume that it is the first interrupt and eventually claim
         * quiescent state and end grace periods prematurely.
         *
         * Unconditionally invoke rcu_irq_enter() so RCU state stays
         * consistent.
         *
         * TINY_RCU does not support EQS, so let the compiler eliminate
         * this part when enabled.
         */
        if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
                /*
                 * If RCU is not watching then the same careful
                 * sequence vs. lockdep and tracing is required
                 * as in irqentry_enter_from_user_mode().
                 */
                lockdep_hardirqs_off(CALLER_ADDR0);
                rcu_irq_enter();
                instrumentation_begin();
                trace_hardirqs_off_finish();
                instrumentation_end();

                ret.exit_rcu = true;
                return ret;
        }

        /*
         * If RCU is watching then RCU only wants to check whether it needs
         * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
         * already contains a warning when RCU is not watching, so no point
         * in having another one here.
         */
        instrumentation_begin();
        rcu_irq_enter_check_tick();
        /* Use the combo lockdep/tracing function */
        trace_hardirqs_off();
        instrumentation_end();

        return ret;
}

void irqentry_exit_cond_resched(void)
{
        if (!preempt_count()) {
                /* Sanity check RCU and thread stack */
                rcu_irq_exit_check_preempt();
                if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                        WARN_ON_ONCE(!on_thread_stack());
                if (need_resched())
                        preempt_schedule_irq();
        }
}

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        lockdep_assert_irqs_disabled();

        /* Check whether this returns to user mode */
        if (user_mode(regs)) {
                irqentry_exit_to_user_mode(regs);
        } else if (!regs_irqs_disabled(regs)) {
                /*
                 * If RCU was not watching on entry this needs to be done
                 * carefully and needs the same ordering of lockdep/tracing
                 * and RCU as the return to user mode path.
                 */
                if (state.exit_rcu) {
                        instrumentation_begin();
                        /* Tell the tracer that IRET will enable interrupts */
                        trace_hardirqs_on_prepare();
                        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
                        instrumentation_end();
                        rcu_irq_exit();
                        lockdep_hardirqs_on(CALLER_ADDR0);
                        return;
                }

                instrumentation_begin();
                if (IS_ENABLED(CONFIG_PREEMPTION))
                        irqentry_exit_cond_resched();
                /* Covers both tracing and lockdep */
                trace_hardirqs_on();
                instrumentation_end();
        } else {
                /*
                 * IRQ flags state is correct already. Just tell RCU if it
                 * was not watching on entry.
                 */
                if (state.exit_rcu)
                        rcu_irq_exit();
        }
}
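
/*
 * Illustrative sketch, not part of this file: the usual pairing of
 * irqentry_enter()/irqentry_exit() in an architecture's interrupt
 * handling glue. The handler names are hypothetical placeholders:
 *
 *	noinstr void arch_handle_irq(struct pt_regs *regs)
 *	{
 *		irqentry_state_t state = irqentry_enter(regs);
 *
 *		instrumentation_begin();
 *		arch_dispatch_irq(regs);	// arch specific dispatch
 *		instrumentation_end();
 *
 *		irqentry_exit(regs, state);
 *	}
 */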