When performing an exec*(), there's a transient period before the return
to userspace where any stacktrace will result in a warning triggered by
kunwind_next_frame_record_meta() encountering a struct frame_record_meta
with an unknown type. This can be seen fairly reliably by enabling KASAN
or KFENCE, e.g.
| WARNING: CPU: 3 PID: 143 at arch/arm64/kernel/stacktrace.c:223 arch_stack_walk+0x264/0x3b0
| Modules linked in:
| CPU: 3 UID: 0 PID: 143 Comm: login Not tainted
6.12.0-rc2-00010-g0f0b9a3f6a50 #1
| Hardware name: linux,dummy-virt (DT)
| pstate:
814000c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
| pc : arch_stack_walk+0x264/0x3b0
| lr : arch_stack_walk+0x1ec/0x3b0
| sp :
ffff80008060b970
| x29:
ffff80008060ba10 x28:
fff00000051133c0 x27:
0000000000000000
| x26:
0000000000000000 x25:
0000000000000000 x24:
fff000007fe84000
| x23:
ffff9d1b3c940af0 x22:
0000000000000000 x21:
fff00000051133c0
| x20:
ffff80008060ba50 x19:
ffff9d1b3c9408e0 x18:
0000000000000014
| x17:
000000006d50da47 x16:
000000008e3f265e x15:
fff0000004e8bf40
| x14:
0000ffffc5e50e48 x13:
000000000000000f x12:
0000ffffc5e50fed
| x11:
000000000000001f x10:
000018007f8bffff x9 :
0000000000000000
| x8 :
ffff80008060b9c0 x7 :
ffff80008060bfd8 x6 :
ffff80008060ba80
| x5 :
ffff80008060ba00 x4 :
ffff80008060c000 x3 :
ffff80008060bff0
| x2 :
0000000000000018 x1 :
ffff80008060bfd8 x0 :
0000000000000000
| Call trace:
| arch_stack_walk+0x264/0x3b0 (P)
| arch_stack_walk+0x1ec/0x3b0 (L)
| stack_trace_save+0x50/0x80
| metadata_update_state+0x98/0xa0
| kfence_guarded_free+0xec/0x2c4
| __kfence_free+0x50/0x100
| kmem_cache_free+0x1a4/0x37c
| putname+0x9c/0xc0
| do_execveat_common.isra.0+0xf0/0x1e4
| __arm64_sys_execve+0x40/0x60
| invoke_syscall+0x48/0x104
| el0_svc_common.constprop.0+0x40/0xe0
| do_el0_svc+0x1c/0x28
| el0_svc+0x34/0xe0
| el0t_64_sync_handler+0x120/0x12c
| el0t_64_sync+0x198/0x19c
This happens because start_thread_common() zeroes the entirety of
current_pt_regs(), including pt_regs::stackframe::type, changing this
from FRAME_META_TYPE_FINAL to 0 and making the final record invalid.
The stacktrace code will reject this until the next return to userspace,
where a subsequent exception entry will reinitialize the type to
FRAME_META_TYPE_FINAL.
This zeroing wasn't a problem prior to commit:
c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries")
... as before that commit the stacktrace code only expected the final
pt_regs::stackframe to contain zeroes, which was unchanged by
start_thread_common().
A stacktrace could occur at any time, either due to instrumentation or
an exception, and so start_thread_common() must ensure that
pt_regs::stackframe is always valid.
Fix this by changing the way start_thread_common() zeroes and
reinitializes the pt_regs fields:
* The '{regs,pc,pstate}' fields are initialized in one go via a struct
assignment to the user_regs, with start_thread() and
compat_start_thread() modified to pass 'pstate' into
start_thread_common().
* The 'sp' and 'compat_sp' fields are zeroed by the struct assignment in
start_thread_common(), and subsequently overwritten in start_thread()
and compat_start_thread respectively, matching existing behaviour.
* The 'syscallno' field is implicitly preserved while the 'orig_x0'
field is explicitly zeroed, maintaining existing ABI.
* The 'pmr' field is explicitly initialized, as necessary for an exec*()
from a kernel thread, matching existing behaviour.
* The 'stackframe' field is implicitly preserved, with a new comment and
some assertions to ensure we don't accidentally break this in future.
* All other fields are implicitly preserved, and should have no
functional impact:
- 'sdei_ttbr1' is only used for SDEI exception entry/exit, and we
never exec*() inside an SDEI handler.
- 'lockdep_hardirqs' and 'exit_rcu' are only used for EL1 exception
entry/exit, and we never exec*() inside an EL1 exception handler.
While updating compat_start_thread() to pass 'pstate' into
start_thread_common(), I've also updated the logic to remove the
ifdeffery, replacing:
| #ifdef __AARCH64EB__
| regs->pstate |= PSR_AA32_E_BIT;
| #endif
... with:
| if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
| pstate |= PSR_AA32_E_BIT;
... which should be functionally equivalent, and matches our preferred
code style.
Fixes: c2c6b27b5aa1 ("arm64: stacktrace: unwind exception boundaries")
Signed-off-by: Mark Rutland <[email protected]>
Cc: Mark Brown <[email protected]>
Cc: Miroslav Benes <[email protected]>
Cc: Puranjay Mohan <[email protected]>
Cc: Will Deacon <[email protected]>
Fixes: c2c6b27b5aa1 ("arm64: stacktrace: unwind exception boundaries")
Tested-by: Puranjay Mohan <[email protected]>
Reviewed-by: Puranjay Mohan <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Catalin Marinas <[email protected]>
.fpsimd_cpu = NR_CPUS, \
}
-static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
+static inline void start_thread_common(struct pt_regs *regs, unsigned long pc,
+ unsigned long pstate)
{
- s32 previous_syscall = regs->syscallno;
- memset(regs, 0, sizeof(*regs));
- regs->syscallno = previous_syscall;
- regs->pc = pc;
+ /*
+ * Ensure all GPRs are zeroed, and initialize PC + PSTATE.
+ * The SP (or compat SP) will be initialized later.
+ */
+ regs->user_regs = (struct user_pt_regs) {
+ .pc = pc,
+ .pstate = pstate,
+ };
+
+ /*
+ * To allow the syscalls:sys_exit_execve tracepoint we need to preserve
+ * syscallno, but do not need orig_x0 or the original GPRs.
+ */
+ regs->orig_x0 = 0;
+ /*
+ * An exec from a kernel thread won't have an existing PMR value.
+ */
if (system_uses_irq_prio_masking())
regs->pmr = GIC_PRIO_IRQON;
+
+ /*
+ * The pt_regs::stackframe field must remain valid throughout this
+ * function as a stacktrace can be taken at any time. Any user or
+ * kernel task should have a valid final frame.
+ */
+ WARN_ON_ONCE(regs->stackframe.record.fp != 0);
+ WARN_ON_ONCE(regs->stackframe.record.lr != 0);
+ WARN_ON_ONCE(regs->stackframe.type != FRAME_META_TYPE_FINAL);
}
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
- start_thread_common(regs, pc);
- regs->pstate = PSR_MODE_EL0t;
+ start_thread_common(regs, pc, PSR_MODE_EL0t);
spectre_v4_enable_task_mitigation(current);
regs->sp = sp;
}
static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
- start_thread_common(regs, pc);
- regs->pstate = PSR_AA32_MODE_USR;
+ unsigned long pstate = PSR_AA32_MODE_USR;
if (pc & 1)
- regs->pstate |= PSR_AA32_T_BIT;
-
-#ifdef __AARCH64EB__
- regs->pstate |= PSR_AA32_E_BIT;
-#endif
+ pstate |= PSR_AA32_T_BIT;
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
+ pstate |= PSR_AA32_E_BIT;
+ start_thread_common(regs, pc, pstate);
spectre_v4_enable_task_mitigation(current);
regs->compat_sp = sp;
}