Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	// SPDX-License-Identifier: GPL-2.0
	2	#include <linux/slab.h>
	3	#include <linux/file.h>
	4	#include <linux/fdtable.h>
	5	#include <linux/freezer.h>
	6	#include <linux/mm.h>
	7	#include <linux/stat.h>
	8	#include <linux/fcntl.h>
	9	#include <linux/swap.h>
	10	#include <linux/ctype.h>
	11	#include <linux/string.h>
	12	#include <linux/init.h>
	13	#include <linux/pagemap.h>
	14	#include <linux/perf_event.h>
	15	#include <linux/highmem.h>
	16	#include <linux/spinlock.h>
	17	#include <linux/key.h>
	18	#include <linux/personality.h>
	19	#include <linux/binfmts.h>
	20	#include <linux/coredump.h>
	21	#include <linux/sched/coredump.h>
	22	#include <linux/sched/signal.h>
	23	#include <linux/sched/task_stack.h>
	24	#include <linux/utsname.h>
	25	#include <linux/pid_namespace.h>
	26	#include <linux/module.h>
	27	#include <linux/namei.h>
	28	#include <linux/mount.h>
	29	#include <linux/security.h>
	30	#include <linux/syscalls.h>
	31	#include <linux/tsacct_kern.h>
	32	#include <linux/cn_proc.h>
	33	#include <linux/audit.h>
	34	#include <linux/tracehook.h>
	35	#include <linux/kmod.h>
	36	#include <linux/fsnotify.h>
	37	#include <linux/fs_struct.h>
	38	#include <linux/pipe_fs_i.h>
	39	#include <linux/oom.h>
	40	#include <linux/compat.h>
	41	#include <linux/fs.h>
	42	#include <linux/path.h>
	43	#include <linux/timekeeping.h>
	44
	45	#include <linux/uaccess.h>
	46	#include <asm/mmu_context.h>
	47	#include <asm/tlb.h>
	48	#include <asm/exec.h>
	49
	50	#include <trace/events/task.h>
	51	#include "internal.h"
	52
	53	#include <trace/events/sched.h>
	54
	55	int core_uses_pid;
	56	unsigned int core_pipe_limit;
	57	char core_pattern[CORENAME_MAX_SIZE] = "core";
	58	static int core_name_size = CORENAME_MAX_SIZE;
	59
	/*
	 * Growable buffer in which a corename is assembled.
	 *
	 * corename: heap buffer, (re)allocated by expand_corename()
	 * used:     offset in corename at which the next write lands
	 * size:     capacity of corename as reported by ksize()
	 */
	struct core_name {
		char *corename;
		int used;
		int size;
	};
	64
	65	/* The maximal length of core_pattern is also specified in sysctl.c */
	66
	67	static int expand_corename(struct core_name *cn, int size)
	68	{
	69	char *corename = krealloc(cn->corename, size, GFP_KERNEL);
	70
	71	if (!corename)
	72	return -ENOMEM;
	73
	74	if (size > core_name_size) /* racy but harmless */
	75	core_name_size = size;
	76
	77	cn->size = ksize(corename);
	78	cn->corename = corename;
	79	return 0;
	80	}
	81
	82	static __printf(2, 0) int cn_vprintf(struct core_name cn, const char fmt,
	83	va_list arg)
	84	{
	85	int free, need;
	86	va_list arg_copy;
	87
	88	again:
	89	free = cn->size - cn->used;
	90
	91	va_copy(arg_copy, arg);
	92	need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
	93	va_end(arg_copy);
	94
	95	if (need < free) {
	96	cn->used += need;
	97	return 0;
	98	}
	99
	100	if (!expand_corename(cn, cn->size + need - free + 1))
	101	goto again;
	102
	103	return -ENOMEM;
	104	}
	105
	106	static __printf(2, 3) int cn_printf(struct core_name cn, const char fmt, ...)
	107	{
	108	va_list arg;
	109	int ret;
	110
	111	va_start(arg, fmt);
	112	ret = cn_vprintf(cn, fmt, arg);
	113	va_end(arg);
	114
	115	return ret;
	116	}
	117
	118	static __printf(2, 3)
	119	int cn_esc_printf(struct core_name cn, const char fmt, ...)
	120	{
	121	int cur = cn->used;
	122	va_list arg;
	123	int ret;
	124
	125	va_start(arg, fmt);
	126	ret = cn_vprintf(cn, fmt, arg);
	127	va_end(arg);
	128
	129	if (ret == 0) {
	130	/*
	131	* Ensure that this coredump name component can't cause the
	132	* resulting corefile path to consist of a ".." or ".".
	133	*/
	134	if ((cn->used - cur == 1 && cn->corename[cur] == '.') \|\|
	135	(cn->used - cur == 2 && cn->corename[cur] == '.'
	136	&& cn->corename[cur+1] == '.'))
	137	cn->corename[cur] = '!';
	138
	139	/*
	140	* Empty names are fishy and could be used to create a "//" in a
	141	* corefile name, causing the coredump to happen one directory
	142	* level too high. Enforce that all components of the core
	143	* pattern are at least one character long.
	144	*/
	145	if (cn->used == cur)
	146	ret = cn_printf(cn, "!");
	147	}
	148
	149	for (; cur < cn->used; ++cur) {
	150	if (cn->corename[cur] == '/')
	151	cn->corename[cur] = '!';
	152	}
	153	return ret;
	154	}
	155
	156	static int cn_print_exe_file(struct core_name *cn, bool name_only)
	157	{
	158	struct file *exe_file;
	159	char pathbuf, path, *ptr;
	160	int ret;
	161
	162	exe_file = get_mm_exe_file(current->mm);
	163	if (!exe_file)
	164	return cn_esc_printf(cn, "%s (path unknown)", current->comm);
	165
	166	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	167	if (!pathbuf) {
	168	ret = -ENOMEM;
	169	goto put_exe_file;
	170	}
	171
	172	path = file_path(exe_file, pathbuf, PATH_MAX);
	173	if (IS_ERR(path)) {
	174	ret = PTR_ERR(path);
	175	goto free_buf;
	176	}
	177
	178	if (name_only) {
	179	ptr = strrchr(path, '/');
	180	if (ptr)
	181	path = ptr + 1;
	182	}
	183	ret = cn_esc_printf(cn, "%s", path);
	184
	185	free_buf:
	186	kfree(pathbuf);
	187	put_exe_file:
	188	fput(exe_file);
	189	return ret;
	190	}
	191
	192	/* format_corename will inspect the pattern parameter, and output a
	193	* name into corename, which must have space for at least
	194	* CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
	195	*/
	196	static int format_corename(struct core_name cn, struct coredump_params cprm,
	197	size_t *argv, int argc)
	198	{
	199	const struct cred *cred = current_cred();
	200	const char *pat_ptr = core_pattern;
	201	int ispipe = (*pat_ptr == '\|');
	202	bool was_space = false;
	203	int pid_in_pattern = 0;
	204	int err = 0;
	205
	206	cn->used = 0;
	207	cn->corename = NULL;
	208	if (expand_corename(cn, core_name_size))
	209	return -ENOMEM;
	210	cn->corename[0] = '\0';
	211
	212	if (ispipe) {
	213	int argvs = sizeof(core_pattern) / 2;
	214	(argv) = kmalloc_array(argvs, sizeof(*argv), GFP_KERNEL);
	215	if (!(*argv))
	216	return -ENOMEM;
	217	(argv)[(argc)++] = 0;
	218	++pat_ptr;
	219	if (!(*pat_ptr))
	220	return -ENOMEM;
	221	}
	222
	223	/* Repeat as long as we have more pattern to process and more output
	224	space */
	225	while (*pat_ptr) {
	226	/*
	227	* Split on spaces before doing template expansion so that
	228	* %e and %E don't get split if they have spaces in them
	229	*/
	230	if (ispipe) {
	231	if (isspace(*pat_ptr)) {
	232	if (cn->used != 0)
	233	was_space = true;
	234	pat_ptr++;
	235	continue;
	236	} else if (was_space) {
	237	was_space = false;
	238	err = cn_printf(cn, "%c", '\0');
	239	if (err)
	240	return err;
	241	(argv)[(argc)++] = cn->used;
	242	}
	243	}
	244	if (*pat_ptr != '%') {
	245	err = cn_printf(cn, "%c", *pat_ptr++);
	246	} else {
	247	switch (*++pat_ptr) {
	248	/* single % at the end, drop that */
	249	case 0:
	250	goto out;
	251	/* Double percent, output one percent */
	252	case '%':
	253	err = cn_printf(cn, "%c", '%');
	254	break;
	255	/* pid */
	256	case 'p':
	257	pid_in_pattern = 1;
	258	err = cn_printf(cn, "%d",
	259	task_tgid_vnr(current));
	260	break;
	261	/* global pid */
	262	case 'P':
	263	err = cn_printf(cn, "%d",
	264	task_tgid_nr(current));
	265	break;
	266	case 'i':
	267	err = cn_printf(cn, "%d",
	268	task_pid_vnr(current));
	269	break;
	270	case 'I':
	271	err = cn_printf(cn, "%d",
	272	task_pid_nr(current));
	273	break;
	274	/* uid */
	275	case 'u':
	276	err = cn_printf(cn, "%u",
	277	from_kuid(&init_user_ns,
	278	cred->uid));
	279	break;
	280	/* gid */
	281	case 'g':
	282	err = cn_printf(cn, "%u",
	283	from_kgid(&init_user_ns,
	284	cred->gid));
	285	break;
	286	case 'd':
	287	err = cn_printf(cn, "%d",
	288	__get_dumpable(cprm->mm_flags));
	289	break;
	290	/* signal that caused the coredump */
	291	case 's':
	292	err = cn_printf(cn, "%d",
	293	cprm->siginfo->si_signo);
	294	break;
	295	/* UNIX time of coredump */
	296	case 't': {
	297	time64_t time;
	298
	299	time = ktime_get_real_seconds();
	300	err = cn_printf(cn, "%lld", time);
	301	break;
	302	}
	303	/* hostname */
	304	case 'h':
	305	down_read(&uts_sem);
	306	err = cn_esc_printf(cn, "%s",
	307	utsname()->nodename);
	308	up_read(&uts_sem);
	309	break;
	310	/* executable, could be changed by prctl PR_SET_NAME etc */
	311	case 'e':
	312	err = cn_esc_printf(cn, "%s", current->comm);
	313	break;
	314	/* file name of executable */
	315	case 'f':
	316	err = cn_print_exe_file(cn, true);
	317	break;
	318	case 'E':
	319	err = cn_print_exe_file(cn, false);
	320	break;
	321	/* core limit size */
	322	case 'c':
	323	err = cn_printf(cn, "%lu",
	324	rlimit(RLIMIT_CORE));
	325	break;
	326	default:
	327	break;
	328	}
	329	++pat_ptr;
	330	}
	331
	332	if (err)
	333	return err;
	334	}
	335
	336	out:
	337	/* Backward compatibility with core_uses_pid:
	338	*
	339	* If core_pattern does not include a %p (as is the default)
	340	* and core_uses_pid is set, then .%pid will be appended to
	341	* the filename. Do not do this for piped commands. */
	342	if (!ispipe && !pid_in_pattern && core_uses_pid) {
	343	err = cn_printf(cn, ".%d", task_tgid_vnr(current));
	344	if (err)
	345	return err;
	346	}
	347	return ispipe;
	348	}
	349
	350	static int zap_process(struct task_struct *start, int exit_code, int flags)
	351	{
	352	struct task_struct *t;
	353	int nr = 0;
	354
	355	/* ignore all signals except SIGKILL, see prepare_signal() */
	356	start->signal->flags = SIGNAL_GROUP_COREDUMP \| flags;
	357	start->signal->group_exit_code = exit_code;
	358	start->signal->group_stop_count = 0;
	359
	360	for_each_thread(start, t) {
	361	task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
	362	if (t != current && t->mm) {
	363	sigaddset(&t->pending.signal, SIGKILL);
	364	signal_wake_up(t, 1);
	365	nr++;
	366	}
	367	}
	368
	369	return nr;
	370	}
	371
	372	static int zap_threads(struct task_struct tsk, struct mm_struct mm,
	373	struct core_state *core_state, int exit_code)
	374	{
	375	struct task_struct g, p;
	376	unsigned long flags;
	377	int nr = -EAGAIN;
	378
	379	spin_lock_irq(&tsk->sighand->siglock);
	380	if (!signal_group_exit(tsk->signal)) {
	381	mm->core_state = core_state;
	382	tsk->signal->group_exit_task = tsk;
	383	nr = zap_process(tsk, exit_code, 0);
	384	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
	385	}
	386	spin_unlock_irq(&tsk->sighand->siglock);
	387	if (unlikely(nr < 0))
	388	return nr;
	389
	390	tsk->flags \|= PF_DUMPCORE;
	391	if (atomic_read(&mm->mm_users) == nr + 1)
	392	goto done;
	393	/*
	394	* We should find and kill all tasks which use this mm, and we should
	395	* count them correctly into ->nr_threads. We don't take tasklist
	396	* lock, but this is safe wrt:
	397	*
	398	* fork:
	399	* None of sub-threads can fork after zap_process(leader). All
	400	* processes which were created before this point should be
	401	* visible to zap_threads() because copy_process() adds the new
	402	* process to the tail of init_task.tasks list, and lock/unlock
	403	* of ->siglock provides a memory barrier.
	404	*
	405	* do_exit:
	406	* The caller holds mm->mmap_lock. This means that the task which
	407	* uses this mm can't pass exit_mm(), so it can't exit or clear
	408	* its ->mm.
	409	*
	410	* de_thread:
	411	* It does list_replace_rcu(&leader->tasks, &current->tasks),
	412	* we must see either old or new leader, this does not matter.
	413	* However, it can change p->sighand, so lock_task_sighand(p)
	414	* must be used. Since p->mm != NULL and we hold ->mmap_lock
	415	* it can't fail.
	416	*
	417	* Note also that "g" can be the old leader with ->mm == NULL
	418	* and already unhashed and thus removed from ->thread_group.
	419	* This is OK, __unhash_process()->list_del_rcu() does not
	420	* clear the ->next pointer, we will find the new leader via
	421	* next_thread().
	422	*/
	423	rcu_read_lock();
	424	for_each_process(g) {
	425	if (g == tsk->group_leader)
	426	continue;
	427	if (g->flags & PF_KTHREAD)
	428	continue;
	429
	430	for_each_thread(g, p) {
	431	if (unlikely(!p->mm))
	432	continue;
	433	if (unlikely(p->mm == mm)) {
	434	lock_task_sighand(p, &flags);
	435	nr += zap_process(p, exit_code,
	436	SIGNAL_GROUP_EXIT);
	437	unlock_task_sighand(p, &flags);
	438	}
	439	break;
	440	}
	441	}
	442	rcu_read_unlock();
	443	done:
	444	atomic_set(&core_state->nr_threads, nr);
	445	return nr;
	446	}
	447
	448	static int coredump_wait(int exit_code, struct core_state *core_state)
	449	{
	450	struct task_struct *tsk = current;
	451	struct mm_struct *mm = tsk->mm;
	452	int core_waiters = -EBUSY;
	453
	454	init_completion(&core_state->startup);
	455	core_state->dumper.task = tsk;
	456	core_state->dumper.next = NULL;
	457
	458	if (mmap_write_lock_killable(mm))
	459	return -EINTR;
	460
	461	if (!mm->core_state)
	462	core_waiters = zap_threads(tsk, mm, core_state, exit_code);
	463	mmap_write_unlock(mm);
	464
	465	if (core_waiters > 0) {
	466	struct core_thread *ptr;
	467
	468	freezer_do_not_count();
	469	wait_for_completion(&core_state->startup);
	470	freezer_count();
	471	/*
	472	* Wait for all the threads to become inactive, so that
	473	* all the thread context (extended register state, like
	474	* fpu etc) gets copied to the memory.
	475	*/
	476	ptr = core_state->dumper.next;
	477	while (ptr != NULL) {
	478	wait_task_inactive(ptr->task, 0);
	479	ptr = ptr->next;
	480	}
	481	}
	482
	483	return core_waiters;
	484	}
	485
	486	static void coredump_finish(struct mm_struct *mm, bool core_dumped)
	487	{
	488	struct core_thread curr, next;
	489	struct task_struct *task;
	490
	491	spin_lock_irq(&current->sighand->siglock);
	492	if (core_dumped && !__fatal_signal_pending(current))
	493	current->signal->group_exit_code \|= 0x80;
	494	current->signal->group_exit_task = NULL;
	495	current->signal->flags = SIGNAL_GROUP_EXIT;
	496	spin_unlock_irq(&current->sighand->siglock);
	497
	498	next = mm->core_state->dumper.next;
	499	while ((curr = next) != NULL) {
	500	next = curr->next;
	501	task = curr->task;
	502	/*
	503	* see exit_mm(), curr->task must not see
	504	* ->task == NULL before we read ->next.
	505	*/
	506	smp_mb();
	507	curr->task = NULL;
	508	wake_up_process(task);
	509	}
	510
	511	mm->core_state = NULL;
	512	}
	513
	514	static bool dump_interrupted(void)
	515	{
	516	/*
	517	* SIGKILL or freezing() interrupt the coredumping. Perhaps we
	518	* can do try_to_freeze() and check __fatal_signal_pending(),
	519	* but then we need to teach dump_write() to restart and clear
	520	* TIF_SIGPENDING.
	521	*/
	522	return signal_pending(current);
	523	}
	524
	525	static void wait_for_dump_helpers(struct file *file)
	526	{
	527	struct pipe_inode_info *pipe = file->private_data;
	528
	529	pipe_lock(pipe);
	530	pipe->readers++;
	531	pipe->writers--;
	532	wake_up_interruptible_sync(&pipe->rd_wait);
	533	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	534	pipe_unlock(pipe);
	535
	536	/*
	537	* We actually want wait_event_freezable() but then we need
	538	* to clear TIF_SIGPENDING and improve dump_interrupted().
	539	*/
	540	wait_event_interruptible(pipe->rd_wait, pipe->readers == 1);
	541
	542	pipe_lock(pipe);
	543	pipe->readers--;
	544	pipe->writers++;
	545	pipe_unlock(pipe);
	546	}
	547
	548	/*
	549	* umh_pipe_setup
	550	* helper function to customize the process used
	551	* to collect the core in userspace. Specifically
	552	* it sets up a pipe and installs it as fd 0 (stdin)
	553	* for the process. Returns 0 on success, or
	554	* PTR_ERR on failure.
	555	* Note that it also sets the core limit to 1. This
	556	* is a special value that we use to trap recursive
	557	* core dumps
	558	*/
	559	static int umh_pipe_setup(struct subprocess_info info, struct cred new)
	560	{
	561	struct file *files[2];
	562	struct coredump_params cp = (struct coredump_params )info->data;
	563	int err = create_pipe_files(files, 0);
	564	if (err)
	565	return err;
	566
	567	cp->file = files[1];
	568
	569	err = replace_fd(0, files[0], 0);
	570	fput(files[0]);
	571	/* and disallow core files too */
	572	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
	573
	574	return err;
	575	}
	576
	577	void do_coredump(const kernel_siginfo_t *siginfo)
	578	{
	579	struct core_state core_state;
	580	struct core_name cn;
	581	struct mm_struct *mm = current->mm;
	582	struct linux_binfmt * binfmt;
	583	const struct cred *old_cred;
	584	struct cred *cred;
	585	int retval = 0;
	586	int ispipe;
	587	size_t *argv = NULL;
	588	int argc = 0;
	589	/* require nonrelative corefile path and be extra careful */
	590	bool need_suid_safe = false;
	591	bool core_dumped = false;
	592	static atomic_t core_dump_count = ATOMIC_INIT(0);
	593	struct coredump_params cprm = {
	594	.siginfo = siginfo,
	595	.regs = signal_pt_regs(),
	596	.limit = rlimit(RLIMIT_CORE),
	597	/*
	598	* We must use the same mm->flags while dumping core to avoid
	599	* inconsistency of bit flags, since this flag is not protected
	600	* by any locks.
	601	*/
	602	.mm_flags = mm->flags,
	603	};
	604
	605	audit_core_dumps(siginfo->si_signo);
	606
	607	binfmt = mm->binfmt;
	608	if (!binfmt \|\| !binfmt->core_dump)
	609	goto fail;
	610	if (!__get_dumpable(cprm.mm_flags))
	611	goto fail;
	612
	613	cred = prepare_creds();
	614	if (!cred)
	615	goto fail;
	616	/*
	617	* We cannot trust fsuid as being the "true" uid of the process
	618	* nor do we know its entire history. We only know it was tainted
	619	* so we dump it as root in mode 2, and only into a controlled
	620	* environment (pipe handler or fully qualified path).
	621	*/
	622	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
	623	/* Setuid core dump mode */
	624	cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
	625	need_suid_safe = true;
	626	}
	627
	628	retval = coredump_wait(siginfo->si_signo, &core_state);
	629	if (retval < 0)
	630	goto fail_creds;
	631
	632	old_cred = override_creds(cred);
	633
	634	ispipe = format_corename(&cn, &cprm, &argv, &argc);
	635
	636	if (ispipe) {
	637	int argi;
	638	int dump_count;
	639	char **helper_argv;
	640	struct subprocess_info *sub_info;
	641
	642	if (ispipe < 0) {
	643	printk(KERN_WARNING "format_corename failed\n");
	644	printk(KERN_WARNING "Aborting core\n");
	645	goto fail_unlock;
	646	}
	647
	648	if (cprm.limit == 1) {
	649	/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
	650	*
	651	* Normally core limits are irrelevant to pipes, since
	652	* we're not writing to the file system, but we use
	653	* cprm.limit of 1 here as a special value, this is a
	654	* consistent way to catch recursive crashes.
	655	* We can still crash if the core_pattern binary sets
	656	* RLIM_CORE = !1, but it runs as root, and can do
	657	* lots of stupid things.
	658	*
	659	* Note that we use task_tgid_vnr here to grab the pid
	660	* of the process group leader. That way we get the
	661	* right pid if a thread in a multi-threaded
	662	* core_pattern process dies.
	663	*/
	664	printk(KERN_WARNING
	665	"Process %d(%s) has RLIMIT_CORE set to 1\n",
	666	task_tgid_vnr(current), current->comm);
	667	printk(KERN_WARNING "Aborting core\n");
	668	goto fail_unlock;
	669	}
	670	cprm.limit = RLIM_INFINITY;
	671
	672	dump_count = atomic_inc_return(&core_dump_count);
	673	if (core_pipe_limit && (core_pipe_limit < dump_count)) {
	674	printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
	675	task_tgid_vnr(current), current->comm);
	676	printk(KERN_WARNING "Skipping core dump\n");
	677	goto fail_dropcount;
	678	}
	679
	680	helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
	681	GFP_KERNEL);
	682	if (!helper_argv) {
	683	printk(KERN_WARNING "%s failed to allocate memory\n",
	684	__func__);
	685	goto fail_dropcount;
	686	}
	687	for (argi = 0; argi < argc; argi++)
	688	helper_argv[argi] = cn.corename + argv[argi];
	689	helper_argv[argi] = NULL;
	690
	691	retval = -ENOMEM;
	692	sub_info = call_usermodehelper_setup(helper_argv[0],
	693	helper_argv, NULL, GFP_KERNEL,
	694	umh_pipe_setup, NULL, &cprm);
	695	if (sub_info)
	696	retval = call_usermodehelper_exec(sub_info,
	697	UMH_WAIT_EXEC);
	698
	699	kfree(helper_argv);
	700	if (retval) {
	701	printk(KERN_INFO "Core dump to \|%s pipe failed\n",
	702	cn.corename);
	703	goto close_fail;
	704	}
	705	} else {
	706	struct user_namespace *mnt_userns;
	707	struct inode *inode;
	708	int open_flags = O_CREAT \| O_RDWR \| O_NOFOLLOW \|
	709	O_LARGEFILE \| O_EXCL;
	710
	711	if (cprm.limit < binfmt->min_coredump)
	712	goto fail_unlock;
	713
	714	if (need_suid_safe && cn.corename[0] != '/') {
	715	printk(KERN_WARNING "Pid %d(%s) can only dump core "\
	716	"to fully qualified path!\n",
	717	task_tgid_vnr(current), current->comm);
	718	printk(KERN_WARNING "Skipping core dump\n");
	719	goto fail_unlock;
	720	}
	721
	722	/*
	723	* Unlink the file if it exists unless this is a SUID
	724	* binary - in that case, we're running around with root
	725	* privs and don't want to unlink another user's coredump.
	726	*/
	727	if (!need_suid_safe) {
	728	/*
	729	* If it doesn't exist, that's fine. If there's some
	730	* other problem, we'll catch it at the filp_open().
	731	*/
	732	do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
	733	}
	734
	735	/*
	736	* There is a race between unlinking and creating the
	737	* file, but if that causes an EEXIST here, that's
	738	* fine - another process raced with us while creating
	739	* the corefile, and the other process won. To userspace,
	740	* what matters is that at least one of the two processes
	741	* writes its coredump successfully, not which one.
	742	*/
	743	if (need_suid_safe) {
	744	/*
	745	* Using user namespaces, normal user tasks can change
	746	* their current->fs->root to point to arbitrary
	747	* directories. Since the intention of the "only dump
	748	* with a fully qualified path" rule is to control where
	749	* coredumps may be placed using root privileges,
	750	* current->fs->root must not be used. Instead, use the
	751	* root directory of init_task.
	752	*/
	753	struct path root;
	754
	755	task_lock(&init_task);
	756	get_fs_root(init_task.fs, &root);
	757	task_unlock(&init_task);
	758	cprm.file = file_open_root(root.dentry, root.mnt,
	759	cn.corename, open_flags, 0600);
	760	path_put(&root);
	761	} else {
	762	cprm.file = filp_open(cn.corename, open_flags, 0600);
	763	}
	764	if (IS_ERR(cprm.file))
	765	goto fail_unlock;
	766
	767	inode = file_inode(cprm.file);
	768	if (inode->i_nlink > 1)
	769	goto close_fail;
	770	if (d_unhashed(cprm.file->f_path.dentry))
	771	goto close_fail;
	772	/*
	773	* AK: actually i see no reason to not allow this for named
	774	* pipes etc, but keep the previous behaviour for now.
	775	*/
	776	if (!S_ISREG(inode->i_mode))
	777	goto close_fail;
	778	/*
	779	* Don't dump core if the filesystem changed owner or mode
	780	* of the file during file creation. This is an issue when
	781	* a process dumps core while its cwd is e.g. on a vfat
	782	* filesystem.
	783	*/
	784	mnt_userns = file_mnt_user_ns(cprm.file);
	785	if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid()))
	786	goto close_fail;
	787	if ((inode->i_mode & 0677) != 0600)
	788	goto close_fail;
	789	if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
	790	goto close_fail;
	791	if (do_truncate(mnt_userns, cprm.file->f_path.dentry,
	792	0, 0, cprm.file))
	793	goto close_fail;
	794	}
	795
	796	/* get us an unshared descriptor table; almost always a no-op */
	797	/* The cell spufs coredump code reads the file descriptor tables */
	798	retval = unshare_files();
	799	if (retval)
	800	goto close_fail;
	801	if (!dump_interrupted()) {
	802	/*
	803	* umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
	804	* have this set to NULL.
	805	*/
	806	if (!cprm.file) {
	807	pr_info("Core dump to \|%s disabled\n", cn.corename);
	808	goto close_fail;
	809	}
	810	file_start_write(cprm.file);
	811	core_dumped = binfmt->core_dump(&cprm);
	812	file_end_write(cprm.file);
	813	}
	814	if (ispipe && core_pipe_limit)
	815	wait_for_dump_helpers(cprm.file);
	816	close_fail:
	817	if (cprm.file)
	818	filp_close(cprm.file, NULL);
	819	fail_dropcount:
	820	if (ispipe)
	821	atomic_dec(&core_dump_count);
	822	fail_unlock:
	823	kfree(argv);
	824	kfree(cn.corename);
	825	coredump_finish(mm, core_dumped);
	826	revert_creds(old_cred);
	827	fail_creds:
	828	put_cred(cred);
	829	fail:
	830	return;
	831	}
	832
	833	/*
	834	* Core dumping helper functions. These are the only things you should
	835	* do on a core-file: use only these functions to write out all the
	836	* necessary info.
	837	*/
	838	int dump_emit(struct coredump_params cprm, const void addr, int nr)
	839	{
	840	struct file *file = cprm->file;
	841	loff_t pos = file->f_pos;
	842	ssize_t n;
	843	if (cprm->written + nr > cprm->limit)
	844	return 0;
	845
	846
	847	if (dump_interrupted())
	848	return 0;
	849	n = __kernel_write(file, addr, nr, &pos);
	850	if (n != nr)
	851	return 0;
	852	file->f_pos = pos;
	853	cprm->written += n;
	854	cprm->pos += n;
	855
	856	return 1;
	857	}
	858	EXPORT_SYMBOL(dump_emit);
	859
	860	int dump_skip(struct coredump_params *cprm, size_t nr)
	861	{
	862	static char zeroes[PAGE_SIZE];
	863	struct file *file = cprm->file;
	864	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
	865	if (dump_interrupted() \|\|
	866	file->f_op->llseek(file, nr, SEEK_CUR) < 0)
	867	return 0;
	868	cprm->pos += nr;
	869	return 1;
	870	} else {
	871	while (nr > PAGE_SIZE) {
	872	if (!dump_emit(cprm, zeroes, PAGE_SIZE))
	873	return 0;
	874	nr -= PAGE_SIZE;
	875	}
	876	return dump_emit(cprm, zeroes, nr);
	877	}
	878	}
	879	EXPORT_SYMBOL(dump_skip);
	880
	#ifdef CONFIG_ELF_CORE
	/*
	 * Dump the user address range [@start, @start + @len) one page at a time.
	 *
	 * Pages returned by get_dump_page() are written with dump_emit(); addresses
	 * for which it returns NULL are skipped with dump_skip().
	 *
	 * Returns 1 if every page was emitted or skipped, 0 on the first failure.
	 */
	int dump_user_range(struct coredump_params *cprm, unsigned long start,
			    unsigned long len)
	{
		unsigned long addr;

		for (addr = start; addr < start + len; addr += PAGE_SIZE) {
			struct page *page;
			int ok;

			/*
			 * To avoid having to allocate page tables for virtual address
			 * ranges that have never been used yet, and also to make it
			 * easy to generate sparse core files, use a helper that returns
			 * NULL when encountering an empty page table entry that would
			 * otherwise have been filled with the zero page.
			 */
			page = get_dump_page(addr);
			if (!page) {
				ok = dump_skip(cprm, PAGE_SIZE);
			} else {
				void *kaddr = kmap_local_page(page);

				ok = dump_emit(cprm, kaddr, PAGE_SIZE);
				kunmap_local(kaddr);
				put_page(page);
			}

			if (!ok)
				return 0;
		}
		return 1;
	}
	#endif
	914
	915	int dump_align(struct coredump_params *cprm, int align)
	916	{
	917	unsigned mod = cprm->pos & (align - 1);
	918	if (align & (align - 1))
	919	return 0;
	920	return mod ? dump_skip(cprm, align - mod) : 1;
	921	}
	922	EXPORT_SYMBOL(dump_align);
	923
	924	/*
	925	* Ensures that file size is big enough to contain the current file
	926	* postion. This prevents gdb from complaining about a truncated file
	927	* if the last "write" to the file was dump_skip.
	928	*/
	929	void dump_truncate(struct coredump_params *cprm)
	930	{
	931	struct file *file = cprm->file;
	932	loff_t offset;
	933
	934	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
	935	offset = file->f_op->llseek(file, 0, SEEK_CUR);
	936	if (i_size_read(file->f_mapping->host) < offset)
	937	do_truncate(file_mnt_user_ns(file), file->f_path.dentry,
	938	offset, 0, file);
	939	}
	940	}
	941	EXPORT_SYMBOL(dump_truncate);
	942
	943	/*
	944	* The purpose of always_dump_vma() is to make sure that special kernel mappings
	945	* that are useful for post-mortem analysis are included in every core dump.
	946	* In that way we ensure that the core dump is fully interpretable later
	947	* without matching up the same kernel and hardware config to see what PC values
	948	* meant. These special mappings include - vDSO, vsyscall, and other
	949	* architecture specific mappings
	950	*/
	951	static bool always_dump_vma(struct vm_area_struct *vma)
	952	{
	953	/* Any vsyscall mappings? */
	954	if (vma == get_gate_vma(vma->vm_mm))
	955	return true;
	956
	957	/*
	958	* Assume that all vmas with a .name op should always be dumped.
	959	* If this changes, a new vm_ops field can easily be added.
	960	*/
	961	if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
	962	return true;
	963
	964	/*
	965	* arch_vma_name() returns non-NULL for special architecture mappings,
	966	* such as vDSO sections.
	967	*/
	968	if (arch_vma_name(vma))
	969	return true;
	970
	971	return false;
	972	}
	973
	974	/*
	975	* Decide how much of @vma's contents should be included in a core dump.
	976	*/
	977	static unsigned long vma_dump_size(struct vm_area_struct *vma,
	978	unsigned long mm_flags)
	979	{
	980	#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
	981
	982	/* always dump the vdso and vsyscall sections */
	983	if (always_dump_vma(vma))
	984	goto whole;
	985
	986	if (vma->vm_flags & VM_DONTDUMP)
	987	return 0;
	988
	989	/* support for DAX */
	990	if (vma_is_dax(vma)) {
	991	if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
	992	goto whole;
	993	if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
	994	goto whole;
	995	return 0;
	996	}
	997
	998	/* Hugetlb memory check */
	999	if (is_vm_hugetlb_page(vma)) {
	1000	if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
	1001	goto whole;
	1002	if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
	1003	goto whole;
	1004	return 0;
	1005	}
	1006
	1007	/* Do not dump I/O mapped devices or special mappings */
	1008	if (vma->vm_flags & VM_IO)
	1009	return 0;
	1010
	1011	/* By default, dump shared memory if mapped from an anonymous file. */
	1012	if (vma->vm_flags & VM_SHARED) {
	1013	if (file_inode(vma->vm_file)->i_nlink == 0 ?
	1014	FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
	1015	goto whole;
	1016	return 0;
	1017	}
	1018
	1019	/* Dump segments that have been written to. */
	1020	if ((!IS_ENABLED(CONFIG_MMU) \|\| vma->anon_vma) && FILTER(ANON_PRIVATE))
	1021	goto whole;
	1022	if (vma->vm_file == NULL)
	1023	return 0;
	1024
	1025	if (FILTER(MAPPED_PRIVATE))
	1026	goto whole;
	1027
	1028	/*
	1029	* If this is the beginning of an executable file mapping,
	1030	* dump the first page to aid in determining what was mapped here.
	1031	*/
	1032	if (FILTER(ELF_HEADERS) &&
	1033	vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
	1034	(READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
	1035	return PAGE_SIZE;
	1036
	1037	#undef FILTER
	1038
	1039	return 0;
	1040
	1041	whole:
	1042	return vma->vm_end - vma->vm_start;
	1043	}
	1044
	1045	static struct vm_area_struct first_vma(struct task_struct tsk,
	1046	struct vm_area_struct *gate_vma)
	1047	{
	1048	struct vm_area_struct *ret = tsk->mm->mmap;
	1049
	1050	if (ret)
	1051	return ret;
	1052	return gate_vma;
	1053	}
	1054
	1055	/*
	1056	* Helper function for iterating across a vma list. It ensures that the caller
	1057	* will visit `gate_vma' prior to terminating the search.
	1058	*/
	1059	static struct vm_area_struct next_vma(struct vm_area_struct this_vma,
	1060	struct vm_area_struct *gate_vma)
	1061	{
	1062	struct vm_area_struct *ret;
	1063
	1064	ret = this_vma->vm_next;
	1065	if (ret)
	1066	return ret;
	1067	if (this_vma == gate_vma)
	1068	return NULL;
	1069	return gate_vma;
	1070	}
	1071
	1072	/*
	1073	* Under the mmap_lock, take a snapshot of relevant information about the task's
	1074	* VMAs.
	1075	*/
	1076	int dump_vma_snapshot(struct coredump_params cprm, int vma_count,
	1077	struct core_vma_metadata **vma_meta,
	1078	size_t *vma_data_size_ptr)
	1079	{
	1080	struct vm_area_struct vma, gate_vma;
	1081	struct mm_struct *mm = current->mm;
	1082	int i;
	1083	size_t vma_data_size = 0;
	1084
	1085	/*
	1086	* Once the stack expansion code is fixed to not change VMA bounds
	1087	* under mmap_lock in read mode, this can be changed to take the
	1088	* mmap_lock in read mode.
	1089	*/
	1090	if (mmap_write_lock_killable(mm))
	1091	return -EINTR;
	1092
	1093	gate_vma = get_gate_vma(mm);
	1094	*vma_count = mm->map_count + (gate_vma ? 1 : 0);
	1095
	1096	vma_meta = kvmalloc_array(vma_count, sizeof(**vma_meta), GFP_KERNEL);
	1097	if (!*vma_meta) {
	1098	mmap_write_unlock(mm);
	1099	return -ENOMEM;
	1100	}
	1101
	1102	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
	1103	vma = next_vma(vma, gate_vma), i++) {
	1104	struct core_vma_metadata m = (vma_meta) + i;
	1105
	1106	m->start = vma->vm_start;
	1107	m->end = vma->vm_end;
	1108	m->flags = vma->vm_flags;
	1109	m->dump_size = vma_dump_size(vma, cprm->mm_flags);
	1110
	1111	vma_data_size += m->dump_size;
	1112	}
	1113
	1114	mmap_write_unlock(mm);
	1115
	1116	if (WARN_ON(i != *vma_count))
	1117	return -EFAULT;
	1118
	1119	*vma_data_size_ptr = vma_data_size;
	1120	return 0;
	1121	}