Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	// SPDX-License-Identifier: GPL-2.0-only
	2	/*
	3	* Copyright (C) 1993 Linus Torvalds
	4	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
	5	* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <[email protected]>, May 2000
	6	* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
	7	* Numa awareness, Christoph Lameter, SGI, June 2005
	8	* Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
	9	*/
	10
	11	#include <linux/vmalloc.h>
	12	#include <linux/mm.h>
	13	#include <linux/module.h>
	14	#include <linux/highmem.h>
	15	#include <linux/sched/signal.h>
	16	#include <linux/slab.h>
	17	#include <linux/spinlock.h>
	18	#include <linux/interrupt.h>
	19	#include <linux/proc_fs.h>
	20	#include <linux/seq_file.h>
	21	#include <linux/set_memory.h>
	22	#include <linux/debugobjects.h>
	23	#include <linux/kallsyms.h>
	24	#include <linux/list.h>
	25	#include <linux/notifier.h>
	26	#include <linux/rbtree.h>
	27	#include <linux/xarray.h>
	28	#include <linux/io.h>
	29	#include <linux/rcupdate.h>
	30	#include <linux/pfn.h>
	31	#include <linux/kmemleak.h>
	32	#include <linux/atomic.h>
	33	#include <linux/compiler.h>
	34	#include <linux/memcontrol.h>
	35	#include <linux/llist.h>
	36	#include <linux/bitops.h>
	37	#include <linux/rbtree_augmented.h>
	38	#include <linux/overflow.h>
	39	#include <linux/pgtable.h>
	40	#include <linux/uaccess.h>
	41	#include <linux/hugetlb.h>
	42	#include <linux/sched/mm.h>
	43	#include <asm/tlbflush.h>
	44	#include <asm/shmparam.h>
	45
	46	#define CREATE_TRACE_POINTS
	47	#include <trace/events/vmalloc.h>
	48
	49	#include "internal.h"
	50	#include "pgalloc-track.h"
	51
	52	#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
	53	static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
	54
	55	static int __init set_nohugeiomap(char *str)
	56	{
	57	ioremap_max_page_shift = PAGE_SHIFT;
	58	return 0;
	59	}
	60	early_param("nohugeiomap", set_nohugeiomap);
	61	#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
	62	static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
	63	#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
	64
	65	#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	66	static bool __ro_after_init vmap_allow_huge = true;
	67
	68	static int __init set_nohugevmalloc(char *str)
	69	{
	70	vmap_allow_huge = false;
	71	return 0;
	72	}
	73	early_param("nohugevmalloc", set_nohugevmalloc);
	74	#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
	75	static const bool vmap_allow_huge = false;
	76	#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
	77
	78	bool is_vmalloc_addr(const void *x)
	79	{
	80	unsigned long addr = (unsigned long)kasan_reset_tag(x);
	81
	82	return addr >= VMALLOC_START && addr < VMALLOC_END;
	83	}
	84	EXPORT_SYMBOL(is_vmalloc_addr);
	85
	86	struct vfree_deferred {
	87	struct llist_head list;
	88	struct work_struct wq;
	89	};
	90	static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
	91
	92	/* Page table manipulation functions */
	93	static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
	94	phys_addr_t phys_addr, pgprot_t prot,
	95	unsigned int max_page_shift, pgtbl_mod_mask *mask)
	96	{
	97	pte_t *pte;
	98	u64 pfn;
	99	unsigned long size = PAGE_SIZE;
	100
	101	pfn = phys_addr >> PAGE_SHIFT;
	102	pte = pte_alloc_kernel_track(pmd, addr, mask);
	103	if (!pte)
	104	return -ENOMEM;
	105	do {
	106	BUG_ON(!pte_none(*pte));
	107
	108	#ifdef CONFIG_HUGETLB_PAGE
	109	size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
	110	if (size != PAGE_SIZE) {
	111	pte_t entry = pfn_pte(pfn, prot);
	112
	113	entry = arch_make_huge_pte(entry, ilog2(size), 0);
	114	set_huge_pte_at(&init_mm, addr, pte, entry);
	115	pfn += PFN_DOWN(size);
	116	continue;
	117	}
	118	#endif
	119	set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
	120	pfn++;
	121	} while (pte += PFN_DOWN(size), addr += size, addr != end);
	122	*mask \|= PGTBL_PTE_MODIFIED;
	123	return 0;
	124	}
	125
	126	static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
	127	phys_addr_t phys_addr, pgprot_t prot,
	128	unsigned int max_page_shift)
	129	{
	130	if (max_page_shift < PMD_SHIFT)
	131	return 0;
	132
	133	if (!arch_vmap_pmd_supported(prot))
	134	return 0;
	135
	136	if ((end - addr) != PMD_SIZE)
	137	return 0;
	138
	139	if (!IS_ALIGNED(addr, PMD_SIZE))
	140	return 0;
	141
	142	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
	143	return 0;
	144
	145	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
	146	return 0;
	147
	148	return pmd_set_huge(pmd, phys_addr, prot);
	149	}
	150
	151	static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
	152	phys_addr_t phys_addr, pgprot_t prot,
	153	unsigned int max_page_shift, pgtbl_mod_mask *mask)
	154	{
	155	pmd_t *pmd;
	156	unsigned long next;
	157
	158	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	159	if (!pmd)
	160	return -ENOMEM;
	161	do {
	162	next = pmd_addr_end(addr, end);
	163
	164	if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
	165	max_page_shift)) {
	166	*mask \|= PGTBL_PMD_MODIFIED;
	167	continue;
	168	}
	169
	170	if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
	171	return -ENOMEM;
	172	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
	173	return 0;
	174	}
	175
	176	static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
	177	phys_addr_t phys_addr, pgprot_t prot,
	178	unsigned int max_page_shift)
	179	{
	180	if (max_page_shift < PUD_SHIFT)
	181	return 0;
	182
	183	if (!arch_vmap_pud_supported(prot))
	184	return 0;
	185
	186	if ((end - addr) != PUD_SIZE)
	187	return 0;
	188
	189	if (!IS_ALIGNED(addr, PUD_SIZE))
	190	return 0;
	191
	192	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
	193	return 0;
	194
	195	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
	196	return 0;
	197
	198	return pud_set_huge(pud, phys_addr, prot);
	199	}
	200
	201	static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
	202	phys_addr_t phys_addr, pgprot_t prot,
	203	unsigned int max_page_shift, pgtbl_mod_mask *mask)
	204	{
	205	pud_t *pud;
	206	unsigned long next;
	207
	208	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	209	if (!pud)
	210	return -ENOMEM;
	211	do {
	212	next = pud_addr_end(addr, end);
	213
	214	if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
	215	max_page_shift)) {
	216	*mask \|= PGTBL_PUD_MODIFIED;
	217	continue;
	218	}
	219
	220	if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
	221	max_page_shift, mask))
	222	return -ENOMEM;
	223	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
	224	return 0;
	225	}
	226
	227	static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
	228	phys_addr_t phys_addr, pgprot_t prot,
	229	unsigned int max_page_shift)
	230	{
	231	if (max_page_shift < P4D_SHIFT)
	232	return 0;
	233
	234	if (!arch_vmap_p4d_supported(prot))
	235	return 0;
	236
	237	if ((end - addr) != P4D_SIZE)
	238	return 0;
	239
	240	if (!IS_ALIGNED(addr, P4D_SIZE))
	241	return 0;
	242
	243	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
	244	return 0;
	245
	246	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
	247	return 0;
	248
	249	return p4d_set_huge(p4d, phys_addr, prot);
	250	}
	251
	252	static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
	253	phys_addr_t phys_addr, pgprot_t prot,
	254	unsigned int max_page_shift, pgtbl_mod_mask *mask)
	255	{
	256	p4d_t *p4d;
	257	unsigned long next;
	258
	259	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	260	if (!p4d)
	261	return -ENOMEM;
	262	do {
	263	next = p4d_addr_end(addr, end);
	264
	265	if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
	266	max_page_shift)) {
	267	*mask \|= PGTBL_P4D_MODIFIED;
	268	continue;
	269	}
	270
	271	if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
	272	max_page_shift, mask))
	273	return -ENOMEM;
	274	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
	275	return 0;
	276	}
	277
	278	static int vmap_range_noflush(unsigned long addr, unsigned long end,
	279	phys_addr_t phys_addr, pgprot_t prot,
	280	unsigned int max_page_shift)
	281	{
	282	pgd_t *pgd;
	283	unsigned long start;
	284	unsigned long next;
	285	int err;
	286	pgtbl_mod_mask mask = 0;
	287
	288	might_sleep();
	289	BUG_ON(addr >= end);
	290
	291	start = addr;
	292	pgd = pgd_offset_k(addr);
	293	do {
	294	next = pgd_addr_end(addr, end);
	295	err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
	296	max_page_shift, &mask);
	297	if (err)
	298	break;
	299	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
	300
	301	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
	302	arch_sync_kernel_mappings(start, end);
	303
	304	return err;
	305	}
	306
	307	int ioremap_page_range(unsigned long addr, unsigned long end,
	308	phys_addr_t phys_addr, pgprot_t prot)
	309	{
	310	int err;
	311
	312	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
	313	ioremap_max_page_shift);
	314	flush_cache_vmap(addr, end);
	315	if (!err)
	316	kmsan_ioremap_page_range(addr, end, phys_addr, prot,
	317	ioremap_max_page_shift);
	318	return err;
	319	}
	320
	321	static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
	322	pgtbl_mod_mask *mask)
	323	{
	324	pte_t *pte;
	325
	326	pte = pte_offset_kernel(pmd, addr);
	327	do {
	328	pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
	329	WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	330	} while (pte++, addr += PAGE_SIZE, addr != end);
	331	*mask \|= PGTBL_PTE_MODIFIED;
	332	}
	333
	334	static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
	335	pgtbl_mod_mask *mask)
	336	{
	337	pmd_t *pmd;
	338	unsigned long next;
	339	int cleared;
	340
	341	pmd = pmd_offset(pud, addr);
	342	do {
	343	next = pmd_addr_end(addr, end);
	344
	345	cleared = pmd_clear_huge(pmd);
	346	if (cleared \|\| pmd_bad(*pmd))
	347	*mask \|= PGTBL_PMD_MODIFIED;
	348
	349	if (cleared)
	350	continue;
	351	if (pmd_none_or_clear_bad(pmd))
	352	continue;
	353	vunmap_pte_range(pmd, addr, next, mask);
	354
	355	cond_resched();
	356	} while (pmd++, addr = next, addr != end);
	357	}
	358
	359	static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
	360	pgtbl_mod_mask *mask)
	361	{
	362	pud_t *pud;
	363	unsigned long next;
	364	int cleared;
	365
	366	pud = pud_offset(p4d, addr);
	367	do {
	368	next = pud_addr_end(addr, end);
	369
	370	cleared = pud_clear_huge(pud);
	371	if (cleared \|\| pud_bad(*pud))
	372	*mask \|= PGTBL_PUD_MODIFIED;
	373
	374	if (cleared)
	375	continue;
	376	if (pud_none_or_clear_bad(pud))
	377	continue;
	378	vunmap_pmd_range(pud, addr, next, mask);
	379	} while (pud++, addr = next, addr != end);
	380	}
	381
	382	static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
	383	pgtbl_mod_mask *mask)
	384	{
	385	p4d_t *p4d;
	386	unsigned long next;
	387
	388	p4d = p4d_offset(pgd, addr);
	389	do {
	390	next = p4d_addr_end(addr, end);
	391
	392	p4d_clear_huge(p4d);
	393	if (p4d_bad(*p4d))
	394	*mask \|= PGTBL_P4D_MODIFIED;
	395
	396	if (p4d_none_or_clear_bad(p4d))
	397	continue;
	398	vunmap_pud_range(p4d, addr, next, mask);
	399	} while (p4d++, addr = next, addr != end);
	400	}
	401
	402	/*
	403	* vunmap_range_noflush is similar to vunmap_range, but does not
	404	* flush caches or TLBs.
	405	*
	406	* The caller is responsible for calling flush_cache_vmap() before calling
	407	* this function, and flush_tlb_kernel_range after it has returned
	408	* successfully (and before the addresses are expected to cause a page fault
	409	* or be re-mapped for something else, if TLB flushes are being delayed or
	410	* coalesced).
	411	*
	412	* This is an internal function only. Do not use outside mm/.
	413	*/
	414	void __vunmap_range_noflush(unsigned long start, unsigned long end)
	415	{
	416	unsigned long next;
	417	pgd_t *pgd;
	418	unsigned long addr = start;
	419	pgtbl_mod_mask mask = 0;
	420
	421	BUG_ON(addr >= end);
	422	pgd = pgd_offset_k(addr);
	423	do {
	424	next = pgd_addr_end(addr, end);
	425	if (pgd_bad(*pgd))
	426	mask \|= PGTBL_PGD_MODIFIED;
	427	if (pgd_none_or_clear_bad(pgd))
	428	continue;
	429	vunmap_p4d_range(pgd, addr, next, &mask);
	430	} while (pgd++, addr = next, addr != end);
	431
	432	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
	433	arch_sync_kernel_mappings(start, end);
	434	}
	435
	436	void vunmap_range_noflush(unsigned long start, unsigned long end)
	437	{
	438	kmsan_vunmap_range_noflush(start, end);
	439	__vunmap_range_noflush(start, end);
	440	}
	441
	442	/**
	443	* vunmap_range - unmap kernel virtual addresses
	444	* @addr: start of the VM area to unmap
	445	* @end: end of the VM area to unmap (non-inclusive)
	446	*
	447	* Clears any present PTEs in the virtual address range, flushes TLBs and
	448	* caches. Any subsequent access to the address before it has been re-mapped
	449	* is a kernel bug.
	450	*/
	451	void vunmap_range(unsigned long addr, unsigned long end)
	452	{
	453	flush_cache_vunmap(addr, end);
	454	vunmap_range_noflush(addr, end);
	455	flush_tlb_kernel_range(addr, end);
	456	}
	457
	458	static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
	459	unsigned long end, pgprot_t prot, struct page *pages, int nr,
	460	pgtbl_mod_mask *mask)
	461	{
	462	pte_t *pte;
	463
	464	/*
	465	* nr is a running index into the array which helps higher level
	466	* callers keep track of where we're up to.
	467	*/
	468
	469	pte = pte_alloc_kernel_track(pmd, addr, mask);
	470	if (!pte)
	471	return -ENOMEM;
	472	do {
	473	struct page page = pages[nr];
	474
	475	if (WARN_ON(!pte_none(*pte)))
	476	return -EBUSY;
	477	if (WARN_ON(!page))
	478	return -ENOMEM;
	479	if (WARN_ON(!pfn_valid(page_to_pfn(page))))
	480	return -EINVAL;
	481
	482	set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
	483	(*nr)++;
	484	} while (pte++, addr += PAGE_SIZE, addr != end);
	485	*mask \|= PGTBL_PTE_MODIFIED;
	486	return 0;
	487	}
	488
	489	static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
	490	unsigned long end, pgprot_t prot, struct page *pages, int nr,
	491	pgtbl_mod_mask *mask)
	492	{
	493	pmd_t *pmd;
	494	unsigned long next;
	495
	496	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	497	if (!pmd)
	498	return -ENOMEM;
	499	do {
	500	next = pmd_addr_end(addr, end);
	501	if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
	502	return -ENOMEM;
	503	} while (pmd++, addr = next, addr != end);
	504	return 0;
	505	}
	506
	507	static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
	508	unsigned long end, pgprot_t prot, struct page *pages, int nr,
	509	pgtbl_mod_mask *mask)
	510	{
	511	pud_t *pud;
	512	unsigned long next;
	513
	514	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	515	if (!pud)
	516	return -ENOMEM;
	517	do {
	518	next = pud_addr_end(addr, end);
	519	if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
	520	return -ENOMEM;
	521	} while (pud++, addr = next, addr != end);
	522	return 0;
	523	}
	524
	525	static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
	526	unsigned long end, pgprot_t prot, struct page *pages, int nr,
	527	pgtbl_mod_mask *mask)
	528	{
	529	p4d_t *p4d;
	530	unsigned long next;
	531
	532	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	533	if (!p4d)
	534	return -ENOMEM;
	535	do {
	536	next = p4d_addr_end(addr, end);
	537	if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
	538	return -ENOMEM;
	539	} while (p4d++, addr = next, addr != end);
	540	return 0;
	541	}
	542
	543	static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
	544	pgprot_t prot, struct page **pages)
	545	{
	546	unsigned long start = addr;
	547	pgd_t *pgd;
	548	unsigned long next;
	549	int err = 0;
	550	int nr = 0;
	551	pgtbl_mod_mask mask = 0;
	552
	553	BUG_ON(addr >= end);
	554	pgd = pgd_offset_k(addr);
	555	do {
	556	next = pgd_addr_end(addr, end);
	557	if (pgd_bad(*pgd))
	558	mask \|= PGTBL_PGD_MODIFIED;
	559	err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
	560	if (err)
	561	return err;
	562	} while (pgd++, addr = next, addr != end);
	563
	564	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
	565	arch_sync_kernel_mappings(start, end);
	566
	567	return 0;
	568	}
	569
	570	/*
	571	* vmap_pages_range_noflush is similar to vmap_pages_range, but does not
	572	* flush caches.
	573	*
	574	* The caller is responsible for calling flush_cache_vmap() after this
	575	* function returns successfully and before the addresses are accessed.
	576	*
	577	* This is an internal function only. Do not use outside mm/.
	578	*/
	579	int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
	580	pgprot_t prot, struct page **pages, unsigned int page_shift)
	581	{
	582	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
	583
	584	WARN_ON(page_shift < PAGE_SHIFT);
	585
	586	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) \|\|
	587	page_shift == PAGE_SHIFT)
	588	return vmap_small_pages_range_noflush(addr, end, prot, pages);
	589
	590	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
	591	int err;
	592
	593	err = vmap_range_noflush(addr, addr + (1UL << page_shift),
	594	page_to_phys(pages[i]), prot,
	595	page_shift);
	596	if (err)
	597	return err;
	598
	599	addr += 1UL << page_shift;
	600	}
	601
	602	return 0;
	603	}
	604
	605	int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
	606	pgprot_t prot, struct page **pages, unsigned int page_shift)
	607	{
	608	kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	609	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	610	}
	611
	612	/**
	613	* vmap_pages_range - map pages to a kernel virtual address
	614	* @addr: start of the VM area to map
	615	* @end: end of the VM area to map (non-inclusive)
	616	* @prot: page protection flags to use
	617	* @pages: pages to map (always PAGE_SIZE pages)
	618	* @page_shift: maximum shift that the pages may be mapped with, @pages must
	619	* be aligned and contiguous up to at least this shift.
	620	*
	621	* RETURNS:
	622	* 0 on success, -errno on failure.
	623	*/
	624	static int vmap_pages_range(unsigned long addr, unsigned long end,
	625	pgprot_t prot, struct page **pages, unsigned int page_shift)
	626	{
	627	int err;
	628
	629	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	630	flush_cache_vmap(addr, end);
	631	return err;
	632	}
	633
	634	int is_vmalloc_or_module_addr(const void *x)
	635	{
	636	/*
	637	* ARM, x86-64 and sparc64 put modules in a special place,
	638	* and fall back on vmalloc() if that fails. Others
	639	* just put it in the vmalloc space.
	640	*/
	641	#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	642	unsigned long addr = (unsigned long)kasan_reset_tag(x);
	643	if (addr >= MODULES_VADDR && addr < MODULES_END)
	644	return 1;
	645	#endif
	646	return is_vmalloc_addr(x);
	647	}
	648
	649	/*
	650	* Walk a vmap address to the struct page it maps. Huge vmap mappings will
	651	* return the tail page that corresponds to the base page address, which
	652	* matches small vmap mappings.
	653	*/
	654	struct page vmalloc_to_page(const void vmalloc_addr)
	655	{
	656	unsigned long addr = (unsigned long) vmalloc_addr;
	657	struct page *page = NULL;
	658	pgd_t *pgd = pgd_offset_k(addr);
	659	p4d_t *p4d;
	660	pud_t *pud;
	661	pmd_t *pmd;
	662	pte_t *ptep, pte;
	663
	664	/*
	665	* XXX we might need to change this if we add VIRTUAL_BUG_ON for
	666	* architectures that do not vmalloc module space
	667	*/
	668	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
	669
	670	if (pgd_none(*pgd))
	671	return NULL;
	672	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
	673	return NULL; /* XXX: no allowance for huge pgd */
	674	if (WARN_ON_ONCE(pgd_bad(*pgd)))
	675	return NULL;
	676
	677	p4d = p4d_offset(pgd, addr);
	678	if (p4d_none(*p4d))
	679	return NULL;
	680	if (p4d_leaf(*p4d))
	681	return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
	682	if (WARN_ON_ONCE(p4d_bad(*p4d)))
	683	return NULL;
	684
	685	pud = pud_offset(p4d, addr);
	686	if (pud_none(*pud))
	687	return NULL;
	688	if (pud_leaf(*pud))
	689	return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	690	if (WARN_ON_ONCE(pud_bad(*pud)))
	691	return NULL;
	692
	693	pmd = pmd_offset(pud, addr);
	694	if (pmd_none(*pmd))
	695	return NULL;
	696	if (pmd_leaf(*pmd))
	697	return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	698	if (WARN_ON_ONCE(pmd_bad(*pmd)))
	699	return NULL;
	700
	701	ptep = pte_offset_map(pmd, addr);
	702	pte = *ptep;
	703	if (pte_present(pte))
	704	page = pte_page(pte);
	705	pte_unmap(ptep);
	706
	707	return page;
	708	}
	709	EXPORT_SYMBOL(vmalloc_to_page);
	710
	711	/*
	712	* Map a vmalloc()-space virtual address to the physical page frame number.
	713	*/
	714	unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
	715	{
	716	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
	717	}
	718	EXPORT_SYMBOL(vmalloc_to_pfn);
	719
	720
	721	/* Global kva allocator */
	722
	723	#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
	724	#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
	725
	726
	727	static DEFINE_SPINLOCK(vmap_area_lock);
	728	static DEFINE_SPINLOCK(free_vmap_area_lock);
	729	/* Export for kexec only */
	730	LIST_HEAD(vmap_area_list);
	731	static struct rb_root vmap_area_root = RB_ROOT;
	732	static bool vmap_initialized __read_mostly;
	733
	734	static struct rb_root purge_vmap_area_root = RB_ROOT;
	735	static LIST_HEAD(purge_vmap_area_list);
	736	static DEFINE_SPINLOCK(purge_vmap_area_lock);
	737
	738	/*
	739	* This kmem_cache is used for vmap_area objects. Instead of
	740	* allocating from slab we reuse an object from this cache to
	741	* make things faster. Especially in "no edge" splitting of
	742	* free block.
	743	*/
	744	static struct kmem_cache *vmap_area_cachep;
	745
	746	/*
	747	* This linked list is used in pair with free_vmap_area_root.
	748	* It gives O(1) access to prev/next to perform fast coalescing.
	749	*/
	750	static LIST_HEAD(free_vmap_area_list);
	751
	752	/*
	753	* This augment red-black tree represents the free vmap space.
	754	* All vmap_area objects in this tree are sorted by va->va_start
	755	* address. It is used for allocation and merging when a vmap
	756	* object is released.
	757	*
	758	* Each vmap_area node contains a maximum available free block
	759	* of its sub-tree, right or left. Therefore it is possible to
	760	* find a lowest match of free area.
	761	*/
	762	static struct rb_root free_vmap_area_root = RB_ROOT;
	763
	764	/*
	765	* Preload a CPU with one object for "no edge" split case. The
	766	* aim is to get rid of allocations from the atomic context, thus
	767	* to use more permissive allocation masks.
	768	*/
	769	static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
	770
	771	static __always_inline unsigned long
	772	va_size(struct vmap_area *va)
	773	{
	774	return (va->va_end - va->va_start);
	775	}
	776
	777	static __always_inline unsigned long
	778	get_subtree_max_size(struct rb_node *node)
	779	{
	780	struct vmap_area *va;
	781
	782	va = rb_entry_safe(node, struct vmap_area, rb_node);
	783	return va ? va->subtree_max_size : 0;
	784	}
	785
	786	RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	787	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
	788
	789	static void purge_vmap_area_lazy(void);
	790	static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
	791	static void drain_vmap_area_work(struct work_struct *work);
	792	static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
	793
	794	static atomic_long_t nr_vmalloc_pages;
	795
	796	unsigned long vmalloc_nr_pages(void)
	797	{
	798	return atomic_long_read(&nr_vmalloc_pages);
	799	}
	800
	801	/* Look up the first VA which satisfies addr < va_end, NULL if none. */
	802	static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
	803	{
	804	struct vmap_area *va = NULL;
	805	struct rb_node *n = vmap_area_root.rb_node;
	806
	807	addr = (unsigned long)kasan_reset_tag((void *)addr);
	808
	809	while (n) {
	810	struct vmap_area *tmp;
	811
	812	tmp = rb_entry(n, struct vmap_area, rb_node);
	813	if (tmp->va_end > addr) {
	814	va = tmp;
	815	if (tmp->va_start <= addr)
	816	break;
	817
	818	n = n->rb_left;
	819	} else
	820	n = n->rb_right;
	821	}
	822
	823	return va;
	824	}
	825
	826	static struct vmap_area __find_vmap_area(unsigned long addr, struct rb_root root)
	827	{
	828	struct rb_node *n = root->rb_node;
	829
	830	addr = (unsigned long)kasan_reset_tag((void *)addr);
	831
	832	while (n) {
	833	struct vmap_area *va;
	834
	835	va = rb_entry(n, struct vmap_area, rb_node);
	836	if (addr < va->va_start)
	837	n = n->rb_left;
	838	else if (addr >= va->va_end)
	839	n = n->rb_right;
	840	else
	841	return va;
	842	}
	843
	844	return NULL;
	845	}
	846
	847	/*
	848	* This function returns back addresses of parent node
	849	* and its left or right link for further processing.
	850	*
	851	* Otherwise NULL is returned. In that case all further
	852	* steps regarding inserting of conflicting overlap range
	853	* have to be declined and actually considered as a bug.
	854	*/
	855	static __always_inline struct rb_node **
	856	find_va_links(struct vmap_area *va,
	857	struct rb_root root, struct rb_node from,
	858	struct rb_node **parent)
	859	{
	860	struct vmap_area *tmp_va;
	861	struct rb_node **link;
	862
	863	if (root) {
	864	link = &root->rb_node;
	865	if (unlikely(!*link)) {
	866	*parent = NULL;
	867	return link;
	868	}
	869	} else {
	870	link = &from;
	871	}
	872
	873	/*
	874	* Go to the bottom of the tree. When we hit the last point
	875	* we end up with parent rb_node and correct direction, i name
	876	* it link, where the new va->rb_node will be attached to.
	877	*/
	878	do {
	879	tmp_va = rb_entry(*link, struct vmap_area, rb_node);
	880
	881	/*
	882	* During the traversal we also do some sanity check.
	883	* Trigger the BUG() if there are sides(left/right)
	884	* or full overlaps.
	885	*/
	886	if (va->va_end <= tmp_va->va_start)
	887	link = &(*link)->rb_left;
	888	else if (va->va_start >= tmp_va->va_end)
	889	link = &(*link)->rb_right;
	890	else {
	891	WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
	892	va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
	893
	894	return NULL;
	895	}
	896	} while (*link);
	897
	898	*parent = &tmp_va->rb_node;
	899	return link;
	900	}
	901
	902	static __always_inline struct list_head *
	903	get_va_next_sibling(struct rb_node parent, struct rb_node *link)
	904	{
	905	struct list_head *list;
	906
	907	if (unlikely(!parent))
	908	/*
	909	* The red-black tree where we try to find VA neighbors
	910	* before merging or inserting is empty, i.e. it means
	911	* there is no free vmap space. Normally it does not
	912	* happen but we handle this case anyway.
	913	*/
	914	return NULL;
	915
	916	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	917	return (&parent->rb_right == link ? list->next : list);
	918	}
	919
	920	static __always_inline void
	921	__link_va(struct vmap_area va, struct rb_root root,
	922	struct rb_node parent, struct rb_node *link,
	923	struct list_head *head, bool augment)
	924	{
	925	/*
	926	* VA is still not in the list, but we can
	927	* identify its future previous list_head node.
	928	*/
	929	if (likely(parent)) {
	930	head = &rb_entry(parent, struct vmap_area, rb_node)->list;
	931	if (&parent->rb_right != link)
	932	head = head->prev;
	933	}
	934
	935	/* Insert to the rb-tree */
	936	rb_link_node(&va->rb_node, parent, link);
	937	if (augment) {
	938	/*
	939	* Some explanation here. Just perform simple insertion
	940	* to the tree. We do not set va->subtree_max_size to
	941	* its current size before calling rb_insert_augmented().
	942	* It is because we populate the tree from the bottom
	943	* to parent levels when the node _is_ in the tree.
	944	*
	945	* Therefore we set subtree_max_size to zero after insertion,
	946	* to let __augment_tree_propagate_from() puts everything to
	947	* the correct order later on.
	948	*/
	949	rb_insert_augmented(&va->rb_node,
	950	root, &free_vmap_area_rb_augment_cb);
	951	va->subtree_max_size = 0;
	952	} else {
	953	rb_insert_color(&va->rb_node, root);
	954	}
	955
	956	/* Address-sort this list */
	957	list_add(&va->list, head);
	958	}
	959
	960	static __always_inline void
	961	link_va(struct vmap_area va, struct rb_root root,
	962	struct rb_node parent, struct rb_node *link,
	963	struct list_head *head)
	964	{
	965	__link_va(va, root, parent, link, head, false);
	966	}
	967
	968	static __always_inline void
	969	link_va_augment(struct vmap_area va, struct rb_root root,
	970	struct rb_node parent, struct rb_node *link,
	971	struct list_head *head)
	972	{
	973	__link_va(va, root, parent, link, head, true);
	974	}
	975
	976	static __always_inline void
	977	__unlink_va(struct vmap_area va, struct rb_root root, bool augment)
	978	{
	979	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
	980	return;
	981
	982	if (augment)
	983	rb_erase_augmented(&va->rb_node,
	984	root, &free_vmap_area_rb_augment_cb);
	985	else
	986	rb_erase(&va->rb_node, root);
	987
	988	list_del_init(&va->list);
	989	RB_CLEAR_NODE(&va->rb_node);
	990	}
	991
	992	static __always_inline void
	993	unlink_va(struct vmap_area va, struct rb_root root)
	994	{
	995	__unlink_va(va, root, false);
	996	}
	997
	998	static __always_inline void
	999	unlink_va_augment(struct vmap_area va, struct rb_root root)
	1000	{
	1001	__unlink_va(va, root, true);
	1002	}
	1003
	1004	#if DEBUG_AUGMENT_PROPAGATE_CHECK
	1005	/*
	1006	* Gets called when remove the node and rotate.
	1007	*/
	1008	static __always_inline unsigned long
	1009	compute_subtree_max_size(struct vmap_area *va)
	1010	{
	1011	return max3(va_size(va),
	1012	get_subtree_max_size(va->rb_node.rb_left),
	1013	get_subtree_max_size(va->rb_node.rb_right));
	1014	}
	1015
	1016	static void
	1017	augment_tree_propagate_check(void)
	1018	{
	1019	struct vmap_area *va;
	1020	unsigned long computed_size;
	1021
	1022	list_for_each_entry(va, &free_vmap_area_list, list) {
	1023	computed_size = compute_subtree_max_size(va);
	1024	if (computed_size != va->subtree_max_size)
	1025	pr_emerg("tree is corrupted: %lu, %lu\n",
	1026	va_size(va), va->subtree_max_size);
	1027	}
	1028	}
	1029	#endif
	1030
	1031	/*
	1032	* This function populates subtree_max_size from bottom to upper
	1033	* levels starting from VA point. The propagation must be done
	1034	* when VA size is modified by changing its va_start/va_end. Or
	1035	* in case of newly inserting of VA to the tree.
	1036	*
	1037	* It means that __augment_tree_propagate_from() must be called:
	1038	* - After VA has been inserted to the tree(free path);
	1039	* - After VA has been shrunk(allocation path);
	1040	* - After VA has been increased(merging path).
	1041	*
	1042	* Please note that, it does not mean that upper parent nodes
	1043	* and their subtree_max_size are recalculated all the time up
	1044	* to the root node.
	1045	*
	1046	* 4--8
	1047	* /\
	1048	* / \
	1049	* / \
	1050	* 2--2 8--8
	1051	*
	1052	* For example if we modify the node 4, shrinking it to 2, then
	1053	* no any modification is required. If we shrink the node 2 to 1
	1054	* its subtree_max_size is updated only, and set to 1. If we shrink
	1055	* the node 8 to 6, then its subtree_max_size is set to 6 and parent
	1056	* node becomes 4--6.
	1057	*/
	1058	static __always_inline void
	1059	augment_tree_propagate_from(struct vmap_area *va)
	1060	{
	1061	/*
	1062	* Populate the tree from bottom towards the root until
	1063	* the calculated maximum available size of checked node
	1064	* is equal to its current one.
	1065	*/
	1066	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
	1067
	1068	#if DEBUG_AUGMENT_PROPAGATE_CHECK
	1069	augment_tree_propagate_check();
	1070	#endif
	1071	}
	1072
	1073	static void
	1074	insert_vmap_area(struct vmap_area *va,
	1075	struct rb_root root, struct list_head head)
	1076	{
	1077	struct rb_node **link;
	1078	struct rb_node *parent;
	1079
	1080	link = find_va_links(va, root, NULL, &parent);
	1081	if (link)
	1082	link_va(va, root, parent, link, head);
	1083	}
	1084
	1085	static void
	1086	insert_vmap_area_augment(struct vmap_area *va,
	1087	struct rb_node from, struct rb_root root,
	1088	struct list_head *head)
	1089	{
	1090	struct rb_node **link;
	1091	struct rb_node *parent;
	1092
	1093	if (from)
	1094	link = find_va_links(va, NULL, from, &parent);
	1095	else
	1096	link = find_va_links(va, root, NULL, &parent);
	1097
	1098	if (link) {
	1099	link_va_augment(va, root, parent, link, head);
	1100	augment_tree_propagate_from(va);
	1101	}
	1102	}
	1103
	1104	/*
	1105	* Merge de-allocated chunk of VA memory with previous
	1106	* and next free blocks. If coalesce is not done a new
	1107	* free area is inserted. If VA has been merged, it is
	1108	* freed.
	1109	*
	1110	* Please note, it can return NULL in case of overlap
	1111	* ranges, followed by WARN() report. Despite it is a
	1112	* buggy behaviour, a system can be alive and keep
	1113	* ongoing.
	1114	*/
	1115	static __always_inline struct vmap_area *
	1116	__merge_or_add_vmap_area(struct vmap_area *va,
	1117	struct rb_root root, struct list_head head, bool augment)
	1118	{
	1119	struct vmap_area *sibling;
	1120	struct list_head *next;
	1121	struct rb_node **link;
	1122	struct rb_node *parent;
	1123	bool merged = false;
	1124
	1125	/*
	1126	* Find a place in the tree where VA potentially will be
	1127	* inserted, unless it is merged with its sibling/siblings.
	1128	*/
	1129	link = find_va_links(va, root, NULL, &parent);
	1130	if (!link)
	1131	return NULL;
	1132
	1133	/*
	1134	* Get next node of VA to check if merging can be done.
	1135	*/
	1136	next = get_va_next_sibling(parent, link);
	1137	if (unlikely(next == NULL))
	1138	goto insert;
	1139
	1140	/*
	1141	* start end
	1142	* \| \|
	1143	* \|<------VA------>\|<-----Next----->\|
	1144	* \| \|
	1145	* start end
	1146	*/
	1147	if (next != head) {
	1148	sibling = list_entry(next, struct vmap_area, list);
	1149	if (sibling->va_start == va->va_end) {
	1150	sibling->va_start = va->va_start;
	1151
	1152	/* Free vmap_area object. */
	1153	kmem_cache_free(vmap_area_cachep, va);
	1154
	1155	/* Point to the new merged area. */
	1156	va = sibling;
	1157	merged = true;
	1158	}
	1159	}
	1160
	1161	/*
	1162	* start end
	1163	* \| \|
	1164	* \|<-----Prev----->\|<------VA------>\|
	1165	* \| \|
	1166	* start end
	1167	*/
	1168	if (next->prev != head) {
	1169	sibling = list_entry(next->prev, struct vmap_area, list);
	1170	if (sibling->va_end == va->va_start) {
	1171	/*
	1172	* If both neighbors are coalesced, it is important
	1173	* to unlink the "next" node first, followed by merging
	1174	* with "previous" one. Otherwise the tree might not be
	1175	* fully populated if a sibling's augmented value is
	1176	* "normalized" because of rotation operations.
	1177	*/
	1178	if (merged)
	1179	__unlink_va(va, root, augment);
	1180
	1181	sibling->va_end = va->va_end;
	1182
	1183	/* Free vmap_area object. */
	1184	kmem_cache_free(vmap_area_cachep, va);
	1185
	1186	/* Point to the new merged area. */
	1187	va = sibling;
	1188	merged = true;
	1189	}
	1190	}
	1191
	1192	insert:
	1193	if (!merged)
	1194	__link_va(va, root, parent, link, head, augment);
	1195
	1196	return va;
	1197	}
	1198
	1199	static __always_inline struct vmap_area *
	1200	merge_or_add_vmap_area(struct vmap_area *va,
	1201	struct rb_root root, struct list_head head)
	1202	{
	1203	return __merge_or_add_vmap_area(va, root, head, false);
	1204	}
	1205
	1206	static __always_inline struct vmap_area *
	1207	merge_or_add_vmap_area_augment(struct vmap_area *va,
	1208	struct rb_root root, struct list_head head)
	1209	{
	1210	va = __merge_or_add_vmap_area(va, root, head, true);
	1211	if (va)
	1212	augment_tree_propagate_from(va);
	1213
	1214	return va;
	1215	}
	1216
	1217	static __always_inline bool
	1218	is_within_this_va(struct vmap_area *va, unsigned long size,
	1219	unsigned long align, unsigned long vstart)
	1220	{
	1221	unsigned long nva_start_addr;
	1222
	1223	if (va->va_start > vstart)
	1224	nva_start_addr = ALIGN(va->va_start, align);
	1225	else
	1226	nva_start_addr = ALIGN(vstart, align);
	1227
	1228	/* Can be overflowed due to big size or alignment. */
	1229	if (nva_start_addr + size < nva_start_addr \|\|
	1230	nva_start_addr < vstart)
	1231	return false;
	1232
	1233	return (nva_start_addr + size <= va->va_end);
	1234	}
	1235
	1236	/*
	1237	* Find the first free block(lowest start address) in the tree,
	1238	* that will accomplish the request corresponding to passing
	1239	* parameters. Please note, with an alignment bigger than PAGE_SIZE,
	1240	* a search length is adjusted to account for worst case alignment
	1241	* overhead.
	1242	*/
	1243	static __always_inline struct vmap_area *
	1244	find_vmap_lowest_match(struct rb_root *root, unsigned long size,
	1245	unsigned long align, unsigned long vstart, bool adjust_search_size)
	1246	{
	1247	struct vmap_area *va;
	1248	struct rb_node *node;
	1249	unsigned long length;
	1250
	1251	/* Start from the root. */
	1252	node = root->rb_node;
	1253
	1254	/* Adjust the search size for alignment overhead. */
	1255	length = adjust_search_size ? size + align - 1 : size;
	1256
	1257	while (node) {
	1258	va = rb_entry(node, struct vmap_area, rb_node);
	1259
	1260	if (get_subtree_max_size(node->rb_left) >= length &&
	1261	vstart < va->va_start) {
	1262	node = node->rb_left;
	1263	} else {
	1264	if (is_within_this_va(va, size, align, vstart))
	1265	return va;
	1266
	1267	/*
	1268	* Does not make sense to go deeper towards the right
	1269	* sub-tree if it does not have a free block that is
	1270	* equal or bigger to the requested search length.
	1271	*/
	1272	if (get_subtree_max_size(node->rb_right) >= length) {
	1273	node = node->rb_right;
	1274	continue;
	1275	}
	1276
	1277	/*
	1278	* OK. We roll back and find the first right sub-tree,
	1279	* that will satisfy the search criteria. It can happen
	1280	* due to "vstart" restriction or an alignment overhead
	1281	* that is bigger then PAGE_SIZE.
	1282	*/
	1283	while ((node = rb_parent(node))) {
	1284	va = rb_entry(node, struct vmap_area, rb_node);
	1285	if (is_within_this_va(va, size, align, vstart))
	1286	return va;
	1287
	1288	if (get_subtree_max_size(node->rb_right) >= length &&
	1289	vstart <= va->va_start) {
	1290	/*
	1291	* Shift the vstart forward. Please note, we update it with
	1292	* parent's start address adding "1" because we do not want
	1293	* to enter same sub-tree after it has already been checked
	1294	* and no suitable free block found there.
	1295	*/
	1296	vstart = va->va_start + 1;
	1297	node = node->rb_right;
	1298	break;
	1299	}
	1300	}
	1301	}
	1302	}
	1303
	1304	return NULL;
	1305	}
	1306
	1307	#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	1308	#include <linux/random.h>
	1309
	1310	static struct vmap_area *
	1311	find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
	1312	unsigned long align, unsigned long vstart)
	1313	{
	1314	struct vmap_area *va;
	1315
	1316	list_for_each_entry(va, head, list) {
	1317	if (!is_within_this_va(va, size, align, vstart))
	1318	continue;
	1319
	1320	return va;
	1321	}
	1322
	1323	return NULL;
	1324	}
	1325
	1326	static void
	1327	find_vmap_lowest_match_check(struct rb_root root, struct list_head head,
	1328	unsigned long size, unsigned long align)
	1329	{
	1330	struct vmap_area va_1, va_2;
	1331	unsigned long vstart;
	1332	unsigned int rnd;
	1333
	1334	get_random_bytes(&rnd, sizeof(rnd));
	1335	vstart = VMALLOC_START + rnd;
	1336
	1337	va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
	1338	va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
	1339
	1340	if (va_1 != va_2)
	1341	pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
	1342	va_1, va_2, vstart);
	1343	}
	1344	#endif
	1345
	1346	enum fit_type {
	1347	NOTHING_FIT = 0,
	1348	FL_FIT_TYPE = 1, /* full fit */
	1349	LE_FIT_TYPE = 2, /* left edge fit */
	1350	RE_FIT_TYPE = 3, /* right edge fit */
	1351	NE_FIT_TYPE = 4 /* no edge fit */
	1352	};
	1353
	1354	static __always_inline enum fit_type
	1355	classify_va_fit_type(struct vmap_area *va,
	1356	unsigned long nva_start_addr, unsigned long size)
	1357	{
	1358	enum fit_type type;
	1359
	1360	/* Check if it is within VA. */
	1361	if (nva_start_addr < va->va_start \|\|
	1362	nva_start_addr + size > va->va_end)
	1363	return NOTHING_FIT;
	1364
	1365	/* Now classify. */
	1366	if (va->va_start == nva_start_addr) {
	1367	if (va->va_end == nva_start_addr + size)
	1368	type = FL_FIT_TYPE;
	1369	else
	1370	type = LE_FIT_TYPE;
	1371	} else if (va->va_end == nva_start_addr + size) {
	1372	type = RE_FIT_TYPE;
	1373	} else {
	1374	type = NE_FIT_TYPE;
	1375	}
	1376
	1377	return type;
	1378	}
	1379
	1380	static __always_inline int
	1381	adjust_va_to_fit_type(struct rb_root root, struct list_head head,
	1382	struct vmap_area *va, unsigned long nva_start_addr,
	1383	unsigned long size)
	1384	{
	1385	struct vmap_area *lva = NULL;
	1386	enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
	1387
	1388	if (type == FL_FIT_TYPE) {
	1389	/*
	1390	* No need to split VA, it fully fits.
	1391	*
	1392	* \| \|
	1393	* V NVA V
	1394	* \|---------------\|
	1395	*/
	1396	unlink_va_augment(va, root);
	1397	kmem_cache_free(vmap_area_cachep, va);
	1398	} else if (type == LE_FIT_TYPE) {
	1399	/*
	1400	* Split left edge of fit VA.
	1401	*
	1402	* \| \|
	1403	* V NVA V R
	1404	* \|-------\|-------\|
	1405	*/
	1406	va->va_start += size;
	1407	} else if (type == RE_FIT_TYPE) {
	1408	/*
	1409	* Split right edge of fit VA.
	1410	*
	1411	* \| \|
	1412	* L V NVA V
	1413	* \|-------\|-------\|
	1414	*/
	1415	va->va_end = nva_start_addr;
	1416	} else if (type == NE_FIT_TYPE) {
	1417	/*
	1418	* Split no edge of fit VA.
	1419	*
	1420	* \| \|
	1421	* L V NVA V R
	1422	* \|---\|-------\|---\|
	1423	*/
	1424	lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
	1425	if (unlikely(!lva)) {
	1426	/*
	1427	* For percpu allocator we do not do any pre-allocation
	1428	* and leave it as it is. The reason is it most likely
	1429	* never ends up with NE_FIT_TYPE splitting. In case of
	1430	* percpu allocations offsets and sizes are aligned to
	1431	* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
	1432	* are its main fitting cases.
	1433	*
	1434	* There are a few exceptions though, as an example it is
	1435	* a first allocation (early boot up) when we have "one"
	1436	* big free space that has to be split.
	1437	*
	1438	* Also we can hit this path in case of regular "vmap"
	1439	* allocations, if "this" current CPU was not preloaded.
	1440	* See the comment in alloc_vmap_area() why. If so, then
	1441	* GFP_NOWAIT is used instead to get an extra object for
	1442	* split purpose. That is rare and most time does not
	1443	* occur.
	1444	*
	1445	* What happens if an allocation gets failed. Basically,
	1446	* an "overflow" path is triggered to purge lazily freed
	1447	* areas to free some memory, then, the "retry" path is
	1448	* triggered to repeat one more time. See more details
	1449	* in alloc_vmap_area() function.
	1450	*/
	1451	lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
	1452	if (!lva)
	1453	return -1;
	1454	}
	1455
	1456	/*
	1457	* Build the remainder.
	1458	*/
	1459	lva->va_start = va->va_start;
	1460	lva->va_end = nva_start_addr;
	1461
	1462	/*
	1463	* Shrink this VA to remaining size.
	1464	*/
	1465	va->va_start = nva_start_addr + size;
	1466	} else {
	1467	return -1;
	1468	}
	1469
	1470	if (type != FL_FIT_TYPE) {
	1471	augment_tree_propagate_from(va);
	1472
	1473	if (lva) /* type == NE_FIT_TYPE */
	1474	insert_vmap_area_augment(lva, &va->rb_node, root, head);
	1475	}
	1476
	1477	return 0;
	1478	}
	1479
	1480	/*
	1481	* Returns a start address of the newly allocated area, if success.
	1482	* Otherwise a vend is returned that indicates failure.
	1483	*/
	1484	static __always_inline unsigned long
	1485	__alloc_vmap_area(struct rb_root root, struct list_head head,
	1486	unsigned long size, unsigned long align,
	1487	unsigned long vstart, unsigned long vend)
	1488	{
	1489	bool adjust_search_size = true;
	1490	unsigned long nva_start_addr;
	1491	struct vmap_area *va;
	1492	int ret;
	1493
	1494	/*
	1495	* Do not adjust when:
	1496	* a) align <= PAGE_SIZE, because it does not make any sense.
	1497	* All blocks(their start addresses) are at least PAGE_SIZE
	1498	* aligned anyway;
	1499	* b) a short range where a requested size corresponds to exactly
	1500	* specified [vstart:vend] interval and an alignment > PAGE_SIZE.
	1501	* With adjusted search length an allocation would not succeed.
	1502	*/
	1503	if (align <= PAGE_SIZE \|\| (align > PAGE_SIZE && (vend - vstart) == size))
	1504	adjust_search_size = false;
	1505
	1506	va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
	1507	if (unlikely(!va))
	1508	return vend;
	1509
	1510	if (va->va_start > vstart)
	1511	nva_start_addr = ALIGN(va->va_start, align);
	1512	else
	1513	nva_start_addr = ALIGN(vstart, align);
	1514
	1515	/* Check the "vend" restriction. */
	1516	if (nva_start_addr + size > vend)
	1517	return vend;
	1518
	1519	/* Update the free vmap_area. */
	1520	ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
	1521	if (WARN_ON_ONCE(ret))
	1522	return vend;
	1523
	1524	#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	1525	find_vmap_lowest_match_check(root, head, size, align);
	1526	#endif
	1527
	1528	return nva_start_addr;
	1529	}
	1530
/*
 * Free a region of KVA allocated by alloc_vmap_area
 *
 * The area is first unlinked from the busy tree under vmap_area_lock
 * and then merged into (or inserted in) the free tree under
 * free_vmap_area_lock. The two locks are never held at the same time
 * here.
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}
	1550
	1551	static inline void
	1552	preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
	1553	{
	1554	struct vmap_area *va = NULL;
	1555
	1556	/*
	1557	* Preload this CPU with one extra vmap_area object. It is used
	1558	* when fit type of free area is NE_FIT_TYPE. It guarantees that
	1559	* a CPU that does an allocation is preloaded.
	1560	*
	1561	* We do it in non-atomic context, thus it allows us to use more
	1562	* permissive allocation masks to be more stable under low memory
	1563	* condition and high memory pressure.
	1564	*/
	1565	if (!this_cpu_read(ne_fit_preload_node))
	1566	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	1567
	1568	spin_lock(lock);
	1569
	1570	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
	1571	kmem_cache_free(vmap_area_cachep, va);
	1572	}
	1573
/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * @size must be non-zero and page aligned, @align must be a power of
 * two; violations trigger BUG_ON(). @gfp_mask is reduced to its
 * GFP_RECLAIM_MASK bits before use.
 *
 * Returns the new busy vmap_area, or an ERR_PTR():
 *   -EBUSY  if vmap is not initialized yet, or no address range could
 *           be found even after purging;
 *   -ENOMEM if the vmap_area object could not be allocated;
 *   the error from kasan_populate_vmalloc() if that step fails.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	unsigned long freed;
	unsigned long addr;
	int purged = 0;
	int ret;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	/* Returns with free_vmap_area_lock held. */
	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
	addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
		size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);

	/*
	 * If an allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;

	/* Publish the area in the busy tree/list. */
	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	/* First failure since the last reset: purge lazy areas and retry. */
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	/*
	 * Still nothing: ask registered notifiers to release space. If any
	 * of them reports progress via "freed", start over with a fresh
	 * purge attempt allowed.
	 */
	freed = 0;
	blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);

	if (freed > 0) {
		purged = 0;
		goto retry;
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}
	1666
/*
 * Add @nb to vmap_notify_list, the blocking notifier chain that
 * alloc_vmap_area() invokes when it cannot find address space.
 * Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
	1671	EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
	1672
/*
 * Remove @nb from vmap_notify_list.
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
	1677	EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
	1678
	1679	/*
	1680	* lazy_max_pages is the maximum amount of virtual address space we gather up
	1681	* before attempting to purge with a TLB flush.
	1682	*
	1683	* There is a tradeoff here: a larger number will cover more kernel page tables
	1684	* and take slightly longer to purge, but it will linearly reduce the number of
	1685	* global TLB flushes that must be performed. It would seem natural to scale
	1686	* this number up linearly with the number of CPUs (because vmapping activity
	1687	* could also scale linearly with the number of CPUs), however it is likely
	1688	* that in practice, workloads might be constrained in other ways that mean
	1689	* vmap activity will not scale linearly with CPUs. Also, I want to be
	1690	* conservative and not introduce a big latency on huge systems, so go with
	1691	* a less aggressive log scale. It will still be an improvement over the old
	1692	* code, and it will be simple to change the scale factor if we find that it
	1693	* becomes a problem on bigger systems.
	1694	*/
	1695	static unsigned long lazy_max_pages(void)
	1696	{
	1697	unsigned int log;
	1698
	1699	log = fls(num_online_cpus());
	1700
	1701	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
	1702	}
	1703
/*
 * Number of pages sitting in lazily-freed areas: increased in
 * free_vmap_area_noflush(), decreased as areas are purged.
 */
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
	1705
	1706	/*
	1707	* Serialize vmap purging. There is no actual critical section protected
	1708	* by this lock, but we want to avoid concurrent calls for performance
	1709	* reasons and to make the pcpu_get_vm_areas more deterministic.
	1710	*/
	1711	static DEFINE_MUTEX(vmap_purge_lock);
	1712
	1713	/* for per-CPU blocks */
	1714	static void purge_fragmented_blocks_allcpus(void);
	1715
	1716	/*
	1717	* Purges all lazily-freed vmap areas.
	1718	*/
	1719	static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
	1720	{
	1721	unsigned long resched_threshold;
	1722	unsigned int num_purged_areas = 0;
	1723	struct list_head local_purge_list;
	1724	struct vmap_area va, n_va;
	1725
	1726	lockdep_assert_held(&vmap_purge_lock);
	1727
	1728	spin_lock(&purge_vmap_area_lock);
	1729	purge_vmap_area_root = RB_ROOT;
	1730	list_replace_init(&purge_vmap_area_list, &local_purge_list);
	1731	spin_unlock(&purge_vmap_area_lock);
	1732
	1733	if (unlikely(list_empty(&local_purge_list)))
	1734	goto out;
	1735
	1736	start = min(start,
	1737	list_first_entry(&local_purge_list,
	1738	struct vmap_area, list)->va_start);
	1739
	1740	end = max(end,
	1741	list_last_entry(&local_purge_list,
	1742	struct vmap_area, list)->va_end);
	1743
	1744	flush_tlb_kernel_range(start, end);
	1745	resched_threshold = lazy_max_pages() << 1;
	1746
	1747	spin_lock(&free_vmap_area_lock);
	1748	list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
	1749	unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
	1750	unsigned long orig_start = va->va_start;
	1751	unsigned long orig_end = va->va_end;
	1752
	1753	/*
	1754	* Finally insert or merge lazily-freed area. It is
	1755	* detached and there is no need to "unlink" it from
	1756	* anything.
	1757	*/
	1758	va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
	1759	&free_vmap_area_list);
	1760
	1761	if (!va)
	1762	continue;
	1763
	1764	if (is_vmalloc_or_module_addr((void *)orig_start))
	1765	kasan_release_vmalloc(orig_start, orig_end,
	1766	va->va_start, va->va_end);
	1767
	1768	atomic_long_sub(nr, &vmap_lazy_nr);
	1769	num_purged_areas++;
	1770
	1771	if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
	1772	cond_resched_lock(&free_vmap_area_lock);
	1773	}
	1774	spin_unlock(&free_vmap_area_lock);
	1775
	1776	out:
	1777	trace_purge_vmap_area_lazy(start, end, num_purged_areas);
	1778	return num_purged_areas > 0;
	1779	}
	1780
/*
 * Kick off a purge of the outstanding lazy areas.
 *
 * Under vmap_purge_lock, fragmented per-CPU blocks are purged first
 * and then all lazily-freed areas. The (ULONG_MAX, 0) range is an
 * empty one, so the flush range is determined by the purged areas
 * alone.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}
	1791
	1792	static void drain_vmap_area_work(struct work_struct *work)
	1793	{
	1794	unsigned long nr_lazy;
	1795
	1796	do {
	1797	mutex_lock(&vmap_purge_lock);
	1798	__purge_vmap_area_lazy(ULONG_MAX, 0);
	1799	mutex_unlock(&vmap_purge_lock);
	1800
	1801	/* Recheck if further work is required. */
	1802	nr_lazy = atomic_long_read(&vmap_lazy_nr);
	1803	} while (nr_lazy > lazy_max_pages());
	1804	}
	1805
	1806	/*
	1807	* Free a vmap area, caller ensuring that the area has been unmapped,
	1808	* unlinked and flush_cache_vunmap had been called for the correct
	1809	* range previously.
	1810	*/
	1811	static void free_vmap_area_noflush(struct vmap_area *va)
	1812	{
	1813	unsigned long nr_lazy_max = lazy_max_pages();
	1814	unsigned long va_start = va->va_start;
	1815	unsigned long nr_lazy;
	1816
	1817	if (WARN_ON_ONCE(!list_empty(&va->list)))
	1818	return;
	1819
	1820	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
	1821	PAGE_SHIFT, &vmap_lazy_nr);
	1822
	1823	/*
	1824	* Merge or place it to the purge tree/list.
	1825	*/
	1826	spin_lock(&purge_vmap_area_lock);
	1827	merge_or_add_vmap_area(va,
	1828	&purge_vmap_area_root, &purge_vmap_area_list);
	1829	spin_unlock(&purge_vmap_area_lock);
	1830
	1831	trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
	1832
	1833	/* After this point, we may free va at any time */
	1834	if (unlikely(nr_lazy > nr_lazy_max))
	1835	schedule_work(&drain_vmap_work);
	1836	}
	1837
	1838	/*
	1839	* Free and unmap a vmap area
	1840	*/
	1841	static void free_unmap_vmap_area(struct vmap_area *va)
	1842	{
	1843	flush_cache_vunmap(va->va_start, va->va_end);
	1844	vunmap_range_noflush(va->va_start, va->va_end);
	1845	if (debug_pagealloc_enabled_static())
	1846	flush_tlb_kernel_range(va->va_start, va->va_end);
	1847
	1848	free_vmap_area_noflush(va);
	1849	}
	1850
/*
 * Look up @addr in the busy tree (vmap_area_root) under
 * vmap_area_lock. Returns whatever __find_vmap_area() returns; the
 * lock is dropped before returning.
 */
struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	return va;
}
	1861
/*
 * Look up @addr in the busy tree and, if an area is found, unlink it
 * from that tree within the same vmap_area_lock critical section.
 * Returns the unlinked area, or NULL if the lookup found nothing.
 */
static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr, &vmap_area_root);
	if (va)
		unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	return va;
}
	1874
	1875	/* Per cpu kva allocator */
	1876
	1877	/*
	1878	* vmap space is limited especially on 32 bit architectures. Ensure there is
	1879	* room for at least 16 percpu vmap blocks per CPU.
	1880	*/
	1881	/*
	1882	* If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
	1883	* to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
	1884	* instead (we just need a rough idea)
	1885	*/
	1886	#if BITS_PER_LONG == 32
	1887	#define VMALLOC_SPACE (128UL10241024)
	1888	#else
	1889	#define VMALLOC_SPACE (128UL10241024*1024)
	1890	#endif
	1891
	1892	#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
	1893	#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
	1894	#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
	1895	#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
	1896	#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
	1897	#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
	1898	#define VMAP_BBMAP_BITS \
	1899	VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
	1900	VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
	1901	VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
	1902
	1903	#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
	1904
/* Per-CPU queue of vmap blocks (see the vmap_block_queue per-CPU variable). */
struct vmap_block_queue {
	spinlock_t lock;	/* held while adding to / deleting from @free */
	struct list_head free;	/* blocks, linked through vmap_block.free_list */
};
	1909
	1910	struct vmap_block {
	1911	spinlock_t lock;
	1912	struct vmap_area *va;
	1913	unsigned long free, dirty;
	1914	unsigned long dirty_min, dirty_max; /< dirty range /
	1915	struct list_head free_list;
	1916	struct rcu_head rcu_head;
	1917	struct list_head purge;
	1918	};
	1919
	1920	/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
	1921	static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
	1922
	1923	/*
	1924	* XArray of vmap blocks, indexed by address, to quickly find a vmap block
	1925	* in the free path. Could get rid of this if we change the API to return a
	1926	* "cookie" from alloc, to be passed to free. But no big deal yet.
	1927	*/
	1928	static DEFINE_XARRAY(vmap_blocks);
	1929
	1930	/*
	1931	* We should probably have a fallback mechanism to allocate virtual memory
	1932	* out of partially filled vmap blocks. However vmap block sizing should be
	1933	* fairly reasonable according to the vmalloc size, so it shouldn't be a
	1934	* big problem.
	1935	*/
	1936
	1937	static unsigned long addr_to_vb_idx(unsigned long addr)
	1938	{
	1939	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	1940	addr /= VMAP_BLOCK_SIZE;
	1941	return addr;
	1942	}
	1943
	1944	static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
	1945	{
	1946	unsigned long addr;
	1947
	1948	addr = va_start + (pages_off << PAGE_SHIFT);
	1949	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	1950	return (void *)addr;
	1951	}
	1952
/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * The new block is registered in the vmap_blocks XArray and appended to
 * the current CPU's vmap_block_queue free list.
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	/* Size and alignment are both VMAP_BLOCK_SIZE. */
	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	/* The caller's pages sit at the very start of the block. */
	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	/* Empty dirty range: min above max. */
	vb->dirty_min = VMAP_BBMAP_BITS;
	vb->dirty_max = 0;
	INIT_LIST_HEAD(&vb->free_list);

	vb_idx = addr_to_vb_idx(va->va_start);
	err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
	if (err) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	vbq = raw_cpu_ptr(&vmap_block_queue);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);

	return vaddr;
}
	2011
/*
 * Tear down a vmap block: erase it from the vmap_blocks XArray, unlink
 * its area from the busy tree, queue the area for lazy freeing and
 * release the block itself via kfree_rcu().
 *
 * BUGs if the XArray slot did not hold @vb.
 */
static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;

	tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
	BUG_ON(tmp != vb);

	spin_lock(&vmap_area_lock);
	unlink_va(vb->va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
}
	2026
/*
 * Release the blocks on @cpu's free queue in which every page is either
 * free or dirty (free + dirty == VMAP_BBMAP_BITS) but which are not
 * fully dirty.
 *
 * Matching blocks are taken off the queue and collected on a local
 * list inside an RCU read-side section; free_vmap_block() is called
 * on them afterwards, outside of it.
 */
static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		/* Unlocked pre-check; repeated below with vb->lock held. */
		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			vb->dirty_min = 0;
			vb->dirty_max = VMAP_BBMAP_BITS;
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}
	2061
/* Run purge_fragmented_blocks() for every possible CPU. */
static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}
	2069
/*
 * Allocate @size bytes of KVA from a per-CPU vmap block.
 *
 * @size must be page aligned and at most PAGE_SIZE * VMAP_MAX_ALLOC
 * (BUG_ON otherwise). The request is rounded up to 2^get_order(size)
 * pages.
 *
 * The current CPU's free queue is scanned for the first block with
 * enough free pages; if none is found a new block is created with
 * new_vmap_block().
 *
 * Returns the address, NULL (with a warning) for a zero @size, or the
 * ERR_PTR() produced by new_vmap_block().
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	void *vaddr = NULL;
	unsigned int order;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what caller wants since
		 * get_order(0) returns funny result. Just warn and terminate
		 * early.
		 */
		return NULL;
	}
	order = get_order(size);

	rcu_read_lock();
	vbq = raw_cpu_ptr(&vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		spin_lock(&vb->lock);
		if (vb->free < (1UL << order)) {
			spin_unlock(&vb->lock);
			continue;
		}

		/* Pages are handed out front to back within a block. */
		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		vb->free -= 1UL << order;
		/* A block with nothing left leaves the free queue. */
		if (vb->free == 0) {
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}
	2121
/*
 * Return @size bytes at @addr to the vmap block they came from.
 *
 * The range is unmapped, and its 2^get_order(size) pages are added to
 * the block's dirty count and dirty range. Once every page of the
 * block is dirty the whole block is released with free_vmap_block().
 *
 * @size must be page aligned and at most PAGE_SIZE * VMAP_MAX_ALLOC
 * (BUG_ON otherwise).
 */
static void vb_free(unsigned long addr, unsigned long size)
{
	unsigned long offset;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(offset_in_page(size));
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap(addr, addr + size);

	order = get_order(size);
	/* Page offset of @addr inside its block. */
	offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
	/* NOTE(review): vb is dereferenced below without a NULL check. */
	vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));

	vunmap_range_noflush(addr, addr + size);

	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(addr, addr + size);

	spin_lock(&vb->lock);

	/* Expand dirty range */
	vb->dirty_min = min(vb->dirty_min, offset);
	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}
	2156
	2157	static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
	2158	{
	2159	int cpu;
	2160
	2161	if (unlikely(!vmap_initialized))
	2162	return;
	2163
	2164	might_sleep();
	2165
	2166	for_each_possible_cpu(cpu) {
	2167	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
	2168	struct vmap_block *vb;
	2169
	2170	rcu_read_lock();
	2171	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
	2172	spin_lock(&vb->lock);
	2173	if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
	2174	unsigned long va_start = vb->va->va_start;
	2175	unsigned long s, e;
	2176
	2177	s = va_start + (vb->dirty_min << PAGE_SHIFT);
	2178	e = va_start + (vb->dirty_max << PAGE_SHIFT);
	2179
	2180	start = min(s, start);
	2181	end = max(e, end);
	2182
	2183	flush = 1;
	2184	}
	2185	spin_unlock(&vb->lock);
	2186	}
	2187	rcu_read_unlock();
	2188	}
	2189
	2190	mutex_lock(&vmap_purge_lock);
	2191	purge_fragmented_blocks_allcpus();
	2192	if (!__purge_vmap_area_lazy(start, end) && flush)
	2193	flush_tlb_kernel_range(start, end);
	2194	mutex_unlock(&vmap_purge_lock);
	2195	}
	2196
	2197	/**
	2198	* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
	2199	*
	2200	* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
	2201	* to amortize TLB flushing overheads. What this means is that any page you
	2202	* have now, may, in a former life, have been mapped into kernel virtual
	2203	* address by the vmap layer and so there might be some CPUs with TLB entries
	2204	* still referencing that page (additional to the regular 1:1 kernel mapping).
	2205	*
	2206	* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
	2207	* be sure that none of the pages we have control over will have any aliases
	2208	* from the vmap layer.
	2209	*/
	2210	void vm_unmap_aliases(void)
	2211	{
	2212	unsigned long start = ULONG_MAX, end = 0;
	2213	int flush = 0;
	2214
	2215	_vm_unmap_aliases(start, end, flush);
	2216	}
	2217	EXPORT_SYMBOL_GPL(vm_unmap_aliases);
	2218
	2219	/**
	2220	* vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
	2221	* @mem: the pointer returned by vm_map_ram
	2222	* @count: the count passed to that vm_map_ram call (cannot unmap partial)
	2223	*/
	2224	void vm_unmap_ram(const void *mem, unsigned int count)
	2225	{
	2226	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	2227	unsigned long addr = (unsigned long)kasan_reset_tag(mem);
	2228	struct vmap_area *va;
	2229
	2230	might_sleep();
	2231	BUG_ON(!addr);
	2232	BUG_ON(addr < VMALLOC_START);
	2233	BUG_ON(addr > VMALLOC_END);
	2234	BUG_ON(!PAGE_ALIGNED(addr));
	2235
	2236	kasan_poison_vmalloc(mem, size);
	2237
	2238	if (likely(count <= VMAP_MAX_ALLOC)) {
	2239	debug_check_no_locks_freed(mem, size);
	2240	vb_free(addr, size);
	2241	return;
	2242	}
	2243
	2244	va = find_unlink_vmap_area(addr);
	2245	if (WARN_ON_ONCE(!va))
	2246	return;
	2247
	2248	debug_check_no_locks_freed((void *)va->va_start,
	2249	(va->va_end - va->va_start));
	2250	free_unmap_vmap_area(va);
	2251	}
	2252	EXPORT_SYMBOL(vm_unmap_ram);
	2253
	2254	/**
	2255	* vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
	2256	* @pages: an array of pointers to the pages to be mapped
	2257	* @count: number of pages
	2258	* @node: prefer to allocate data structures on this node
	2259	*
	2260	* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
	2261	* faster than vmap so it's good. But if you mix long-life and short-life
	2262	* objects with vm_map_ram(), it could consume lots of address space through
	2263	* fragmentation (especially on a 32bit machine). You could see failures in
	2264	* the end. Please use this function for short-lived objects.
	2265	*
	2266	* Returns: a pointer to the address that has been mapped, or %NULL on failure
	2267	*/
	2268	void vm_map_ram(struct page *pages, unsigned int count, int node)
	2269	{
	2270	unsigned long size = (unsigned long)count << PAGE_SHIFT;
	2271	unsigned long addr;
	2272	void *mem;
	2273
	2274	if (likely(count <= VMAP_MAX_ALLOC)) {
	2275	mem = vb_alloc(size, GFP_KERNEL);
	2276	if (IS_ERR(mem))
	2277	return NULL;
	2278	addr = (unsigned long)mem;
	2279	} else {
	2280	struct vmap_area *va;
	2281	va = alloc_vmap_area(size, PAGE_SIZE,
	2282	VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
	2283	if (IS_ERR(va))
	2284	return NULL;
	2285
	2286	addr = va->va_start;
	2287	mem = (void *)addr;
	2288	}
	2289
	2290	if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
	2291	pages, PAGE_SHIFT) < 0) {
	2292	vm_unmap_ram(mem, count);
	2293	return NULL;
	2294	}
	2295
	2296	/*
	2297	* Mark the pages as accessible, now that they are mapped.
	2298	* With hardware tag-based KASAN, marking is skipped for
	2299	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
	2300	*/
	2301	mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
	2302
	2303	return mem;
	2304	}
	2305	EXPORT_SYMBOL(vm_map_ram);
	2306
	/*
	 * Head of the singly linked list (chained through ->next) that
	 * vm_area_add_early() and vm_area_register_early() populate before
	 * vmalloc_init() is called.
	 */
	2307	static struct vm_struct *vmlist __initdata;
	2308
	2309	static inline unsigned int vm_area_page_order(struct vm_struct *vm)
	2310	{
	2311	#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	2312	return vm->page_order;
	2313	#else
	2314	return 0;
	2315	#endif
	2316	}
	2317
	2318	static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
	2319	{
	2320	#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	2321	vm->page_order = order;
	2322	#else
	2323	BUG_ON(order != 0);
	2324	#endif
	2325	}
	2326
	2327	/**
	2328	* vm_area_add_early - add vmap area early during boot
	2329	* @vm: vm_struct to add
	2330	*
	2331	* This function is used to add fixed kernel vm area to vmlist before
	2332	* vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
	2333	* should contain proper values and the other fields should be zero.
	2334	*
	2335	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
	2336	*/
	2337	void __init vm_area_add_early(struct vm_struct *vm)
	2338	{
	2339	struct vm_struct tmp, *p;
	2340
	2341	BUG_ON(vmap_initialized);
	2342	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
	2343	if (tmp->addr >= vm->addr) {
	2344	BUG_ON(tmp->addr < vm->addr + vm->size);
	2345	break;
	2346	} else
	2347	BUG_ON(tmp->addr + tmp->size > vm->addr);
	2348	}
	2349	vm->next = *p;
	2350	*p = vm;
	2351	}
	2352
	2353	/**
	2354	* vm_area_register_early - register vmap area early during boot
	2355	* @vm: vm_struct to register
	2356	* @align: requested alignment
	2357	*
	2358	* This function is used to register kernel vm area before
	2359	* vmalloc_init() is called. @vm->size and @vm->flags should contain
	2360	* proper values on entry and other fields should be zero. On return,
	2361	* vm->addr contains the allocated address.
	2362	*
	2363	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
	2364	*/
	2365	void __init vm_area_register_early(struct vm_struct *vm, size_t align)
	2366	{
	2367	unsigned long addr = ALIGN(VMALLOC_START, align);
	2368	struct vm_struct cur, *p;
	2369
	2370	BUG_ON(vmap_initialized);
	2371
	2372	for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
	2373	if ((unsigned long)cur->addr - addr >= vm->size)
	2374	break;
	2375	addr = ALIGN((unsigned long)cur->addr + cur->size, align);
	2376	}
	2377
	2378	BUG_ON(addr > VMALLOC_END - vm->size);
	2379	vm->addr = (void *)addr;
	2380	vm->next = *p;
	2381	*p = vm;
	2382	kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
	2383	}
	2384
	2385	static void vmap_init_free_space(void)
	2386	{
	2387	unsigned long vmap_start = 1;
	2388	const unsigned long vmap_end = ULONG_MAX;
	2389	struct vmap_area busy, free;
	2390
	2391	/*
	2392	* B F B B B F
	2393	* -\|-----\|.....\|-----\|-----\|-----\|.....\|-
	2394	* \| The KVA space \|
	2395	* \|<--------------------------------->\|
	2396	*/
	2397	list_for_each_entry(busy, &vmap_area_list, list) {
	2398	if (busy->va_start - vmap_start > 0) {
	2399	free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
	2400	if (!WARN_ON_ONCE(!free)) {
	2401	free->va_start = vmap_start;
	2402	free->va_end = busy->va_start;
	2403
	2404	insert_vmap_area_augment(free, NULL,
	2405	&free_vmap_area_root,
	2406	&free_vmap_area_list);
	2407	}
	2408	}
	2409
	2410	vmap_start = busy->va_end;
	2411	}
	2412
	2413	if (vmap_end - vmap_start > 0) {
	2414	free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
	2415	if (!WARN_ON_ONCE(!free)) {
	2416	free->va_start = vmap_start;
	2417	free->va_end = vmap_end;
	2418
	2419	insert_vmap_area_augment(free, NULL,
	2420	&free_vmap_area_root,
	2421	&free_vmap_area_list);
	2422	}
	2423	}
	2424	}
	2425
	2426	static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
	2427	struct vmap_area va, unsigned long flags, const void caller)
	2428	{
	2429	vm->flags = flags;
	2430	vm->addr = (void *)va->va_start;
	2431	vm->size = va->va_end - va->va_start;
	2432	vm->caller = caller;
	2433	va->vm = vm;
	2434	}
	2435
	2436	static void setup_vmalloc_vm(struct vm_struct vm, struct vmap_area va,
	2437	unsigned long flags, const void *caller)
	2438	{
	2439	spin_lock(&vmap_area_lock);
	2440	setup_vmalloc_vm_locked(vm, va, flags, caller);
	2441	spin_unlock(&vmap_area_lock);
	2442	}
	2443
	2444	static void clear_vm_uninitialized_flag(struct vm_struct *vm)
	2445	{
	2446	/*
	2447	* Before removing VM_UNINITIALIZED,
	2448	* we should make sure that vm has proper values.
	2449	* Pair with smp_rmb() in show_numa_info().
	2450	*/
	2451	smp_wmb();
	2452	vm->flags &= ~VM_UNINITIALIZED;
	2453	}
	2454
	2455	static struct vm_struct *__get_vm_area_node(unsigned long size,
	2456	unsigned long align, unsigned long shift, unsigned long flags,
	2457	unsigned long start, unsigned long end, int node,
	2458	gfp_t gfp_mask, const void *caller)
	2459	{
	2460	struct vmap_area *va;
	2461	struct vm_struct *area;
	2462	unsigned long requested_size = size;
	2463
	2464	BUG_ON(in_interrupt());
	2465	size = ALIGN(size, 1ul << shift);
	2466	if (unlikely(!size))
	2467	return NULL;
	2468
	2469	if (flags & VM_IOREMAP)
	2470	align = 1ul << clamp_t(int, get_count_order_long(size),
	2471	PAGE_SHIFT, IOREMAP_MAX_ORDER);
	2472
	2473	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	2474	if (unlikely(!area))
	2475	return NULL;
	2476
	2477	if (!(flags & VM_NO_GUARD))
	2478	size += PAGE_SIZE;
	2479
	2480	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	2481	if (IS_ERR(va)) {
	2482	kfree(area);
	2483	return NULL;
	2484	}
	2485
	2486	setup_vmalloc_vm(area, va, flags, caller);
	2487
	2488	/*
	2489	* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
	2490	* best-effort approach, as they can be mapped outside of vmalloc code.
	2491	* For VM_ALLOC mappings, the pages are marked as accessible after
	2492	* getting mapped in __vmalloc_node_range().
	2493	* With hardware tag-based KASAN, marking is skipped for
	2494	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
	2495	*/
	2496	if (!(flags & VM_ALLOC))
	2497	area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
	2498	KASAN_VMALLOC_PROT_NORMAL);
	2499
	2500	return area;
	2501	}
	2502
	2503	struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
	2504	unsigned long start, unsigned long end,
	2505	const void *caller)
	2506	{
	2507	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
	2508	NUMA_NO_NODE, GFP_KERNEL, caller);
	2509	}
	2510
	2511	/**
	2512	* get_vm_area - reserve a contiguous kernel virtual area
	2513	* @size: size of the area
	2514	* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
	2515	*
	2516	* Search an area of @size in the kernel virtual mapping area,
	2517	* and reserved it for out purposes. Returns the area descriptor
	2518	* on success or %NULL on failure.
	2519	*
	2520	* Return: the area descriptor on success or %NULL on failure.
	2521	*/
	2522	struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
	2523	{
	2524	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
	2525	VMALLOC_START, VMALLOC_END,
	2526	NUMA_NO_NODE, GFP_KERNEL,
	2527	__builtin_return_address(0));
	2528	}
	2529
	2530	struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
	2531	const void *caller)
	2532	{
	2533	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
	2534	VMALLOC_START, VMALLOC_END,
	2535	NUMA_NO_NODE, GFP_KERNEL, caller);
	2536	}
	2537
	2538	/**
	2539	* find_vm_area - find a continuous kernel virtual area
	2540	* @addr: base address
	2541	*
	2542	* Search for the kernel VM area starting at @addr, and return it.
	2543	* It is up to the caller to do all required locking to keep the returned
	2544	* pointer valid.
	2545	*
	2546	* Return: the area descriptor on success or %NULL on failure.
	2547	*/
	2548	struct vm_struct find_vm_area(const void addr)
	2549	{
	2550	struct vmap_area *va;
	2551
	2552	va = find_vmap_area((unsigned long)addr);
	2553	if (!va)
	2554	return NULL;
	2555
	2556	return va->vm;
	2557	}
	2558
	2559	/**
	2560	* remove_vm_area - find and remove a continuous kernel virtual area
	2561	* @addr: base address
	2562	*
	2563	* Search for the kernel VM area starting at @addr, and remove it.
	2564	* This function returns the found VM area, but using it is NOT safe
	2565	* on SMP machines, except for its size or flags.
	2566	*
	2567	* Return: the area descriptor on success or %NULL on failure.
	2568	*/
	2569	struct vm_struct remove_vm_area(const void addr)
	2570	{
	2571	struct vmap_area *va;
	2572	struct vm_struct *vm;
	2573
	2574	might_sleep();
	2575
	2576	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
	2577	addr))
	2578	return NULL;
	2579
	2580	va = find_unlink_vmap_area((unsigned long)addr);
	2581	if (!va \|\| !va->vm)
	2582	return NULL;
	2583	vm = va->vm;
	2584
	2585	debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
	2586	debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
	2587	kasan_free_module_shadow(vm);
	2588	kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));
	2589
	2590	free_unmap_vmap_area(va);
	2591	return vm;
	2592	}
	2593
	2594	static inline void set_area_direct_map(const struct vm_struct *area,
	2595	int (set_direct_map)(struct page page))
	2596	{
	2597	int i;
	2598
	2599	/* HUGE_VMALLOC passes small pages to set_direct_map */
	2600	for (i = 0; i < area->nr_pages; i++)
	2601	if (page_address(area->pages[i]))
	2602	set_direct_map(area->pages[i]);
	2603	}
	2604
	2605	/*
	2606	* Flush the vm mapping and reset the direct map.
	2607	*/
	2608	static void vm_reset_perms(struct vm_struct *area)
	2609	{
	2610	unsigned long start = ULONG_MAX, end = 0;
	2611	unsigned int page_order = vm_area_page_order(area);
	2612	int flush_dmap = 0;
	2613	int i;
	2614
	2615	/*
	2616	* Find the start and end range of the direct mappings to make sure that
	2617	* the vm_unmap_aliases() flush includes the direct map.
	2618	*/
	2619	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
	2620	unsigned long addr = (unsigned long)page_address(area->pages[i]);
	2621
	2622	if (addr) {
	2623	unsigned long page_size;
	2624
	2625	page_size = PAGE_SIZE << page_order;
	2626	start = min(addr, start);
	2627	end = max(addr + page_size, end);
	2628	flush_dmap = 1;
	2629	}
	2630	}
	2631
	2632	/*
	2633	* Set direct map to something invalid so that it won't be cached if
	2634	* there are any accesses after the TLB flush, then flush the TLB and
	2635	* reset the direct map permissions to the default.
	2636	*/
	2637	set_area_direct_map(area, set_direct_map_invalid_noflush);
	2638	_vm_unmap_aliases(start, end, flush_dmap);
	2639	set_area_direct_map(area, set_direct_map_default_noflush);
	2640	}
	2641
	2642	static void delayed_vfree_work(struct work_struct *w)
	2643	{
	2644	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	2645	struct llist_node t, llnode;
	2646
	2647	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
	2648	vfree(llnode);
	2649	}
	2650
	2651	/**
	2652	* vfree_atomic - release memory allocated by vmalloc()
	2653	* @addr: memory base address
	2654	*
	2655	* This one is just like vfree() but can be called in any atomic context
	2656	* except NMIs.
	2657	*/
	2658	void vfree_atomic(const void *addr)
	2659	{
	2660	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
	2661
	2662	BUG_ON(in_nmi());
	2663	kmemleak_free(addr);
	2664
	2665	/*
	2666	* Use raw_cpu_ptr() because this can be called from preemptible
	2667	* context. Preemption is absolutely fine here, because the llist_add()
	2668	* implementation is lockless, so it works even if we are adding to
	2669	* another cpu's list. schedule_work() should be fine with this too.
	2670	*/
	2671	if (addr && llist_add((struct llist_node *)addr, &p->list))
	2672	schedule_work(&p->wq);
	2673	}
	2674
	2675	/**
	2676	* vfree - Release memory allocated by vmalloc()
	2677	* @addr: Memory base address
	2678	*
	2679	* Free the virtually continuous memory area starting at @addr, as obtained
	2680	* from one of the vmalloc() family of APIs. This will usually also free the
	2681	* physical memory underlying the virtual allocation, but that memory is
	2682	* reference counted, so it will not be freed until the last user goes away.
	2683	*
	2684	* If @addr is NULL, no operation is performed.
	2685	*
	2686	* Context:
	2687	* May sleep if called not from interrupt context.
	2688	* Must not be called in NMI context (strictly speaking, it could be
	2689	* if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
	2690	* conventions for vfree() arch-dependent would be a really bad idea).
	2691	*/
	2692	void vfree(const void *addr)
	2693	{
	2694	struct vm_struct *vm;
	2695	int i;
	2696
	2697	if (unlikely(in_interrupt())) {
	2698	vfree_atomic(addr);
	2699	return;
	2700	}
	2701
	2702	BUG_ON(in_nmi());
	2703	kmemleak_free(addr);
	2704	might_sleep();
	2705
	2706	if (!addr)
	2707	return;
	2708
	2709	vm = remove_vm_area(addr);
	2710	if (unlikely(!vm)) {
	2711	WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
	2712	addr);
	2713	return;
	2714	}
	2715
	2716	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
	2717	vm_reset_perms(vm);
	2718	for (i = 0; i < vm->nr_pages; i++) {
	2719	struct page *page = vm->pages[i];
	2720
	2721	BUG_ON(!page);
	2722	mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
	2723	/*
	2724	* High-order allocs for huge vmallocs are split, so
	2725	* can be freed as an array of order-0 allocations
	2726	*/
	2727	__free_pages(page, 0);
	2728	cond_resched();
	2729	}
	2730	atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
	2731	kvfree(vm->pages);
	2732	kfree(vm);
	2733	}
	2734	EXPORT_SYMBOL(vfree);
	2735
	2736	/**
	2737	* vunmap - release virtual mapping obtained by vmap()
	2738	* @addr: memory base address
	2739	*
	2740	* Free the virtually contiguous memory area starting at @addr,
	2741	* which was created from the page array passed to vmap().
	2742	*
	2743	* Must not be called in interrupt context.
	2744	*/
	2745	void vunmap(const void *addr)
	2746	{
	2747	struct vm_struct *vm;
	2748
	2749	BUG_ON(in_interrupt());
	2750	might_sleep();
	2751
	2752	if (!addr)
	2753	return;
	2754	vm = remove_vm_area(addr);
	2755	if (unlikely(!vm)) {
	2756	WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
	2757	addr);
	2758	return;
	2759	}
	2760	kfree(vm);
	2761	}
	2762	EXPORT_SYMBOL(vunmap);
	2763
	2764	/**
	2765	* vmap - map an array of pages into virtually contiguous space
	2766	* @pages: array of page pointers
	2767	* @count: number of pages to map
	2768	* @flags: vm_area->flags
	2769	* @prot: page protection for the mapping
	2770	*
	2771	* Maps @count pages from @pages into contiguous kernel virtual space.
	2772	* If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
	2773	* (which must be kmalloc or vmalloc memory) and one reference per pages in it
	2774	* are transferred from the caller to vmap(), and will be freed / dropped when
	2775	* vfree() is called on the return value.
	2776	*
	2777	* Return: the address of the area or %NULL on failure
	2778	*/
	2779	void vmap(struct page *pages, unsigned int count,
	2780	unsigned long flags, pgprot_t prot)
	2781	{
	2782	struct vm_struct *area;
	2783	unsigned long addr;
	2784	unsigned long size; /* In bytes */
	2785
	2786	might_sleep();
	2787
	2788	if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
	2789	return NULL;
	2790
	2791	/*
	2792	* Your top guard is someone else's bottom guard. Not having a top
	2793	* guard compromises someone else's mappings too.
	2794	*/
	2795	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
	2796	flags &= ~VM_NO_GUARD;
	2797
	2798	if (count > totalram_pages())
	2799	return NULL;
	2800
	2801	size = (unsigned long)count << PAGE_SHIFT;
	2802	area = get_vm_area_caller(size, flags, __builtin_return_address(0));
	2803	if (!area)
	2804	return NULL;
	2805
	2806	addr = (unsigned long)area->addr;
	2807	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
	2808	pages, PAGE_SHIFT) < 0) {
	2809	vunmap(area->addr);
	2810	return NULL;
	2811	}
	2812
	2813	if (flags & VM_MAP_PUT_PAGES) {
	2814	area->pages = pages;
	2815	area->nr_pages = count;
	2816	}
	2817	return area->addr;
	2818	}
	2819	EXPORT_SYMBOL(vmap);
	2820
	2821	#ifdef CONFIG_VMAP_PFN
	/*
	 * Cursor state that vmap_pfn() passes to vmap_pfn_apply() through
	 * apply_to_page_range().
	 */
	2822	struct vmap_pfn_data {
	/* PFN array being mapped; read at index @idx by vmap_pfn_apply(). */
	2823	unsigned long *pfns;
	/* Protection used for every PTE; vmap_pfn() sets it to pgprot_nx(prot). */
	2824	pgprot_t prot;
	/* Index of the next PFN to consume; advanced by vmap_pfn_apply(). */
	2825	unsigned int idx;
	2826	};
	2827
	2828	static int vmap_pfn_apply(pte_t pte, unsigned long addr, void private)
	2829	{
	2830	struct vmap_pfn_data *data = private;
	2831
	2832	if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
	2833	return -EINVAL;
	2834	*pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
	2835	return 0;
	2836	}
	2837
	2838	/**
	2839	* vmap_pfn - map an array of PFNs into virtually contiguous space
	2840	* @pfns: array of PFNs
	2841	* @count: number of pages to map
	2842	* @prot: page protection for the mapping
	2843	*
	2844	* Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
	2845	* the start address of the mapping.
	2846	*/
	2847	void vmap_pfn(unsigned long pfns, unsigned int count, pgprot_t prot)
	2848	{
	2849	struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
	2850	struct vm_struct *area;
	2851
	2852	area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
	2853	__builtin_return_address(0));
	2854	if (!area)
	2855	return NULL;
	2856	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
	2857	count * PAGE_SIZE, vmap_pfn_apply, &data)) {
	2858	free_vm_area(area);
	2859	return NULL;
	2860	}
	2861	return area->addr;
	2862	}
	2863	EXPORT_SYMBOL_GPL(vmap_pfn);
	2864	#endif /* CONFIG_VMAP_PFN */
	2865
	2866	static inline unsigned int
	2867	vm_area_alloc_pages(gfp_t gfp, int nid,
	2868	unsigned int order, unsigned int nr_pages, struct page **pages)
	2869	{
	2870	unsigned int nr_allocated = 0;
	2871	struct page *page;
	2872	int i;
	2873
	2874	/*
	2875	* For order-0 pages we make use of bulk allocator, if
	2876	* the page array is partly or not at all populated due
	2877	* to fails, fallback to a single page allocator that is
	2878	* more permissive.
	2879	*/
	2880	if (!order) {
	2881	gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
	2882
	2883	while (nr_allocated < nr_pages) {
	2884	unsigned int nr, nr_pages_request;
	2885
	2886	/*
	2887	* A maximum allowed request is hard-coded and is 100
	2888	* pages per call. That is done in order to prevent a
	2889	* long preemption off scenario in the bulk-allocator
	2890	* so the range is [1:100].
	2891	*/
	2892	nr_pages_request = min(100U, nr_pages - nr_allocated);
	2893
	2894	/* memory allocation should consider mempolicy, we can't
	2895	* wrongly use nearest node when nid == NUMA_NO_NODE,
	2896	* otherwise memory may be allocated in only one node,
	2897	* but mempolicy wants to alloc memory by interleaving.
	2898	*/
	2899	if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
	2900	nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
	2901	nr_pages_request,
	2902	pages + nr_allocated);
	2903
	2904	else
	2905	nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
	2906	nr_pages_request,
	2907	pages + nr_allocated);
	2908
	2909	nr_allocated += nr;
	2910	cond_resched();
	2911
	2912	/*
	2913	* If zero or pages were obtained partly,
	2914	* fallback to a single page allocator.
	2915	*/
	2916	if (nr != nr_pages_request)
	2917	break;
	2918	}
	2919	}
	2920
	2921	/* High-order pages or fallback path if "bulk" fails. */
	2922
	2923	while (nr_allocated < nr_pages) {
	2924	if (fatal_signal_pending(current))
	2925	break;
	2926
	2927	if (nid == NUMA_NO_NODE)
	2928	page = alloc_pages(gfp, order);
	2929	else
	2930	page = alloc_pages_node(nid, gfp, order);
	2931	if (unlikely(!page))
	2932	break;
	2933	/*
	2934	* Higher order allocations must be able to be treated as
	2935	* indepdenent small pages by callers (as they can with
	2936	* small-page vmallocs). Some drivers do their own refcounting
	2937	* on vmalloc_to_page() pages, some use page->mapping,
	2938	* page->lru, etc.
	2939	*/
	2940	if (order)
	2941	split_page(page, order);
	2942
	2943	/*
	2944	* Careful, we allocate and map page-order pages, but
	2945	* tracking is done per PAGE_SIZE page so as to keep the
	2946	* vm_struct APIs independent of the physical/mapped size.
	2947	*/
	2948	for (i = 0; i < (1U << order); i++)
	2949	pages[nr_allocated + i] = page + i;
	2950
	2951	cond_resched();
	2952	nr_allocated += 1U << order;
	2953	}
	2954
	2955	return nr_allocated;
	2956	}
	2957
	2958	static void __vmalloc_area_node(struct vm_struct area, gfp_t gfp_mask,
	2959	pgprot_t prot, unsigned int page_shift,
	2960	int node)
	2961	{
	2962	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) \| __GFP_ZERO;
	2963	bool nofail = gfp_mask & __GFP_NOFAIL;
	2964	unsigned long addr = (unsigned long)area->addr;
	2965	unsigned long size = get_vm_area_size(area);
	2966	unsigned long array_size;
	2967	unsigned int nr_small_pages = size >> PAGE_SHIFT;
	2968	unsigned int page_order;
	2969	unsigned int flags;
	2970	int ret;
	2971
	2972	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
	2973
	2974	if (!(gfp_mask & (GFP_DMA \| GFP_DMA32)))
	2975	gfp_mask \|= __GFP_HIGHMEM;
	2976
	2977	/* Please note that the recursion is strictly bounded. */
	2978	if (array_size > PAGE_SIZE) {
	2979	area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
	2980	area->caller);
	2981	} else {
	2982	area->pages = kmalloc_node(array_size, nested_gfp, node);
	2983	}
	2984
	2985	if (!area->pages) {
	2986	warn_alloc(gfp_mask, NULL,
	2987	"vmalloc error: size %lu, failed to allocated page array size %lu",
	2988	nr_small_pages * PAGE_SIZE, array_size);
	2989	free_vm_area(area);
	2990	return NULL;
	2991	}
	2992
	2993	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
	2994	page_order = vm_area_page_order(area);
	2995
	2996	area->nr_pages = vm_area_alloc_pages(gfp_mask \| __GFP_NOWARN,
	2997	node, page_order, nr_small_pages, area->pages);
	2998
	2999	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
	3000	if (gfp_mask & __GFP_ACCOUNT) {
	3001	int i;
	3002
	3003	for (i = 0; i < area->nr_pages; i++)
	3004	mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
	3005	}
	3006
	3007	/*
	3008	* If not enough pages were obtained to accomplish an
	3009	* allocation request, free them via vfree() if any.
	3010	*/
	3011	if (area->nr_pages != nr_small_pages) {
	3012	warn_alloc(gfp_mask, NULL,
	3013	"vmalloc error: size %lu, page order %u, failed to allocate pages",
	3014	area->nr_pages * PAGE_SIZE, page_order);
	3015	goto fail;
	3016	}
	3017
	3018	/*
	3019	* page tables allocations ignore external gfp mask, enforce it
	3020	* by the scope API
	3021	*/
	3022	if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == __GFP_IO)
	3023	flags = memalloc_nofs_save();
	3024	else if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == 0)
	3025	flags = memalloc_noio_save();
	3026
	3027	do {
	3028	ret = vmap_pages_range(addr, addr + size, prot, area->pages,
	3029	page_shift);
	3030	if (nofail && (ret < 0))
	3031	schedule_timeout_uninterruptible(1);
	3032	} while (nofail && (ret < 0));
	3033
	3034	if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == __GFP_IO)
	3035	memalloc_nofs_restore(flags);
	3036	else if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == 0)
	3037	memalloc_noio_restore(flags);
	3038
	3039	if (ret < 0) {
	3040	warn_alloc(gfp_mask, NULL,
	3041	"vmalloc error: size %lu, failed to map pages",
	3042	area->nr_pages * PAGE_SIZE);
	3043	goto fail;
	3044	}
	3045
	3046	return area->addr;
	3047
	3048	fail:
	3049	vfree(area->addr);
	3050	return NULL;
	3051	}
	3052
	3053	/**
	3054	* __vmalloc_node_range - allocate virtually contiguous memory
	3055	* @size: allocation size
	3056	* @align: desired alignment
	3057	* @start: vm area range start
	3058	* @end: vm area range end
	3059	* @gfp_mask: flags for the page level allocator
	3060	* @prot: protection mask for the allocated pages
	3061	* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
	3062	* @node: node to use for allocation or NUMA_NO_NODE
	3063	* @caller: caller's return address
	3064	*
	3065	* Allocate enough pages to cover @size from the page level
	3066	* allocator with @gfp_mask flags. Please note that the full set of gfp
	3067	* flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
	3068	* supported.
	3069	* Zone modifiers are not supported. From the reclaim modifiers
	3070	* __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
	3071	* and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
	3072	* __GFP_RETRY_MAYFAIL are not supported).
	3073	*
	3074	* __GFP_NOWARN can be used to suppress failures messages.
	3075	*
	3076	* Map them into contiguous kernel virtual space, using a pagetable
	3077	* protection of @prot.
	3078	*
	3079	* Return: the address of the area or %NULL on failure
	3080	*/
	3081	void *__vmalloc_node_range(unsigned long size, unsigned long align,
	3082	unsigned long start, unsigned long end, gfp_t gfp_mask,
	3083	pgprot_t prot, unsigned long vm_flags, int node,
	3084	const void *caller)
	3085	{
	3086	struct vm_struct *area;
	3087	void *ret;
	3088	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
	3089	unsigned long real_size = size;
	3090	unsigned long real_align = align;
	3091	unsigned int shift = PAGE_SHIFT;
	3092
	3093	if (WARN_ON_ONCE(!size))
	3094	return NULL;
	3095
	3096	if ((size >> PAGE_SHIFT) > totalram_pages()) {
	3097	warn_alloc(gfp_mask, NULL,
	3098	"vmalloc error: size %lu, exceeds total pages",
	3099	real_size);
	3100	return NULL;
	3101	}
	3102
	3103	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
	3104	unsigned long size_per_node;
	3105
	3106	/*
	3107	* Try huge pages. Only try for PAGE_KERNEL allocations,
	3108	* others like modules don't yet expect huge pages in
	3109	* their allocations due to apply_to_page_range not
	3110	* supporting them.
	3111	*/
	3112
	3113	size_per_node = size;
	3114	if (node == NUMA_NO_NODE)
	3115	size_per_node /= num_online_nodes();
	3116	if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
	3117	shift = PMD_SHIFT;
	3118	else
	3119	shift = arch_vmap_pte_supported_shift(size_per_node);
	3120
	3121	align = max(real_align, 1UL << shift);
	3122	size = ALIGN(real_size, 1UL << shift);
	3123	}
	3124
	3125	again:
	3126	area = __get_vm_area_node(real_size, align, shift, VM_ALLOC \|
	3127	VM_UNINITIALIZED \| vm_flags, start, end, node,
	3128	gfp_mask, caller);
	3129	if (!area) {
	3130	bool nofail = gfp_mask & __GFP_NOFAIL;
	3131	warn_alloc(gfp_mask, NULL,
	3132	"vmalloc error: size %lu, vm_struct allocation failed%s",
	3133	real_size, (nofail) ? ". Retrying." : "");
	3134	if (nofail) {
	3135	schedule_timeout_uninterruptible(1);
	3136	goto again;
	3137	}
	3138	goto fail;
	3139	}
	3140
	3141	/*
	3142	* Prepare arguments for __vmalloc_area_node() and
	3143	* kasan_unpoison_vmalloc().
	3144	*/
	3145	if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
	3146	if (kasan_hw_tags_enabled()) {
	3147	/*
	3148	* Modify protection bits to allow tagging.
	3149	* This must be done before mapping.
	3150	*/
	3151	prot = arch_vmap_pgprot_tagged(prot);
	3152
	3153	/*
	3154	* Skip page_alloc poisoning and zeroing for physical
	3155	* pages backing VM_ALLOC mapping. Memory is instead
	3156	* poisoned and zeroed by kasan_unpoison_vmalloc().
	3157	*/
	3158	gfp_mask \|= __GFP_SKIP_KASAN_UNPOISON \| __GFP_SKIP_ZERO;
	3159	}
	3160
	3161	/* Take note that the mapping is PAGE_KERNEL. */
	3162	kasan_flags \|= KASAN_VMALLOC_PROT_NORMAL;
	3163	}
	3164
	3165	/* Allocate physical pages and map them into vmalloc space. */
	3166	ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
	3167	if (!ret)
	3168	goto fail;
	3169
	3170	/*
	3171	* Mark the pages as accessible, now that they are mapped.
	3172	* The condition for setting KASAN_VMALLOC_INIT should complement the
	3173	* one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
	3174	* to make sure that memory is initialized under the same conditions.
	3175	* Tag-based KASAN modes only assign tags to normal non-executable
	3176	* allocations, see __kasan_unpoison_vmalloc().
	3177	*/
	3178	kasan_flags \|= KASAN_VMALLOC_VM_ALLOC;
	3179	if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
	3180	(gfp_mask & __GFP_SKIP_ZERO))
	3181	kasan_flags \|= KASAN_VMALLOC_INIT;
	3182	/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
	3183	area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
	3184
	3185	/*
	3186	* In this function, newly allocated vm_struct has VM_UNINITIALIZED
	3187	* flag. It means that vm_struct is not fully initialized.
	3188	* Now, it is fully initialized, so remove this flag here.
	3189	*/
	3190	clear_vm_uninitialized_flag(area);
	3191
	3192	size = PAGE_ALIGN(size);
	3193	if (!(vm_flags & VM_DEFER_KMEMLEAK))
	3194	kmemleak_vmalloc(area, size, gfp_mask);
	3195
	3196	return area->addr;
	3197
	3198	fail:
	3199	if (shift > PAGE_SHIFT) {
	3200	shift = PAGE_SHIFT;
	3201	align = real_align;
	3202	size = real_size;
	3203	goto again;
	3204	}
	3205
	3206	return NULL;
	3207	}
	3208
	3209	/**
	3210	* __vmalloc_node - allocate virtually contiguous memory
	3211	* @size: allocation size
	3212	* @align: desired alignment
	3213	* @gfp_mask: flags for the page level allocator
	3214	* @node: node to use for allocation or NUMA_NO_NODE
	3215	* @caller: caller's return address
	3216	*
	3217	* Allocate enough pages to cover @size from the page level allocator with
	3218	* @gfp_mask flags. Map them into contiguous kernel virtual space.
	3219	*
	3220	* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
	3221	* and __GFP_NOFAIL are not supported
	3222	*
	3223	* Any use of gfp flags outside of GFP_KERNEL should be consulted
	3224	* with mm people.
	3225	*
	3226	* Return: pointer to the allocated memory or %NULL on error
	3227	*/
	3228	void *__vmalloc_node(unsigned long size, unsigned long align,
	3229	gfp_t gfp_mask, int node, const void *caller)
	3230	{
	3231	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
	3232	gfp_mask, PAGE_KERNEL, 0, node, caller);
	3233	}
	3234	/*
	3235	* This is only for performance analysis of vmalloc and stress purpose.
	3236	* It is required by vmalloc test module, therefore do not use it other
	3237	* than that.
	3238	*/
	3239	#ifdef CONFIG_TEST_VMALLOC_MODULE
	3240	EXPORT_SYMBOL_GPL(__vmalloc_node);
	3241	#endif
	3242
	3243	void *__vmalloc(unsigned long size, gfp_t gfp_mask)
	3244	{
	3245	return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
	3246	__builtin_return_address(0));
	3247	}
	3248	EXPORT_SYMBOL(__vmalloc);
	3249
	3250	/**
	3251	* vmalloc - allocate virtually contiguous memory
	3252	* @size: allocation size
	3253	*
	3254	* Allocate enough pages to cover @size from the page level
	3255	* allocator and map them into contiguous kernel virtual space.
	3256	*
	3257	* For tight control over page level allocator and protection flags
	3258	* use __vmalloc() instead.
	3259	*
	3260	* Return: pointer to the allocated memory or %NULL on error
	3261	*/
	3262	void *vmalloc(unsigned long size)
	3263	{
	3264	return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
	3265	__builtin_return_address(0));
	3266	}
	3267	EXPORT_SYMBOL(vmalloc);
	3268
	3269	/**
	3270	* vmalloc_huge - allocate virtually contiguous memory, allow huge pages
	3271	* @size: allocation size
	3272	* @gfp_mask: flags for the page level allocator
	3273	*
	3274	* Allocate enough pages to cover @size from the page level
	3275	* allocator and map them into contiguous kernel virtual space.
	3276	* If @size is greater than or equal to PMD_SIZE, allow using
	3277	* huge pages for the memory
	3278	*
	3279	* Return: pointer to the allocated memory or %NULL on error
	3280	*/
	3281	void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
	3282	{
	3283	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
	3284	gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
	3285	NUMA_NO_NODE, __builtin_return_address(0));
	3286	}
	3287	EXPORT_SYMBOL_GPL(vmalloc_huge);
	3288
	3289	/**
	3290	* vzalloc - allocate virtually contiguous memory with zero fill
	3291	* @size: allocation size
	3292	*
	3293	* Allocate enough pages to cover @size from the page level
	3294	* allocator and map them into contiguous kernel virtual space.
	3295	* The memory allocated is set to zero.
	3296	*
	3297	* For tight control over page level allocator and protection flags
	3298	* use __vmalloc() instead.
	3299	*
	3300	* Return: pointer to the allocated memory or %NULL on error
	3301	*/
	3302	void *vzalloc(unsigned long size)
	3303	{
	3304	return __vmalloc_node(size, 1, GFP_KERNEL \| __GFP_ZERO, NUMA_NO_NODE,
	3305	__builtin_return_address(0));
	3306	}
	3307	EXPORT_SYMBOL(vzalloc);
	3308
	3309	/**
	3310	* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
	3311	* @size: allocation size
	3312	*
	3313	* The resulting memory area is zeroed so it can be mapped to userspace
	3314	* without leaking data.
	3315	*
	3316	* Return: pointer to the allocated memory or %NULL on error
	3317	*/
	3318	void *vmalloc_user(unsigned long size)
	3319	{
	3320	return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
	3321	GFP_KERNEL \| __GFP_ZERO, PAGE_KERNEL,
	3322	VM_USERMAP, NUMA_NO_NODE,
	3323	__builtin_return_address(0));
	3324	}
	3325	EXPORT_SYMBOL(vmalloc_user);
	3326
	3327	/**
	3328	* vmalloc_node - allocate memory on a specific node
	3329	* @size: allocation size
	3330	* @node: numa node
	3331	*
	3332	* Allocate enough pages to cover @size from the page level
	3333	* allocator and map them into contiguous kernel virtual space.
	3334	*
	3335	* For tight control over page level allocator and protection flags
	3336	* use __vmalloc() instead.
	3337	*
	3338	* Return: pointer to the allocated memory or %NULL on error
	3339	*/
	3340	void *vmalloc_node(unsigned long size, int node)
	3341	{
	3342	return __vmalloc_node(size, 1, GFP_KERNEL, node,
	3343	__builtin_return_address(0));
	3344	}
	3345	EXPORT_SYMBOL(vmalloc_node);
	3346
	3347	/**
	3348	* vzalloc_node - allocate memory on a specific node with zero fill
	3349	* @size: allocation size
	3350	* @node: numa node
	3351	*
	3352	* Allocate enough pages to cover @size from the page level
	3353	* allocator and map them into contiguous kernel virtual space.
	3354	* The memory allocated is set to zero.
	3355	*
	3356	* Return: pointer to the allocated memory or %NULL on error
	3357	*/
	3358	void *vzalloc_node(unsigned long size, int node)
	3359	{
	3360	return __vmalloc_node(size, 1, GFP_KERNEL \| __GFP_ZERO, node,
	3361	__builtin_return_address(0));
	3362	}
	3363	EXPORT_SYMBOL(vzalloc_node);
	3364
	3365	#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
	3366	#define GFP_VMALLOC32 (GFP_DMA32 \| GFP_KERNEL)
	3367	#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
	3368	#define GFP_VMALLOC32 (GFP_DMA \| GFP_KERNEL)
	3369	#else
	3370	/*
	3371	* 64b systems should always have either DMA or DMA32 zones. For others
	3372	* GFP_DMA32 should do the right thing and use the normal zone.
	3373	*/
	3374	#define GFP_VMALLOC32 (GFP_DMA32 \| GFP_KERNEL)
	3375	#endif
	3376
	3377	/**
	3378	* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
	3379	* @size: allocation size
	3380	*
	3381	* Allocate enough 32bit PA addressable pages to cover @size from the
	3382	* page level allocator and map them into contiguous kernel virtual space.
	3383	*
	3384	* Return: pointer to the allocated memory or %NULL on error
	3385	*/
	3386	void *vmalloc_32(unsigned long size)
	3387	{
	3388	return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
	3389	__builtin_return_address(0));
	3390	}
	3391	EXPORT_SYMBOL(vmalloc_32);
	3392
	3393	/**
	3394	* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
	3395	* @size: allocation size
	3396	*
	3397	* The resulting memory area is 32bit addressable and zeroed so it can be
	3398	* mapped to userspace without leaking data.
	3399	*
	3400	* Return: pointer to the allocated memory or %NULL on error
	3401	*/
	3402	void *vmalloc_32_user(unsigned long size)
	3403	{
	3404	return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
	3405	GFP_VMALLOC32 \| __GFP_ZERO, PAGE_KERNEL,
	3406	VM_USERMAP, NUMA_NO_NODE,
	3407	__builtin_return_address(0));
	3408	}
	3409	EXPORT_SYMBOL(vmalloc_32_user);
	3410
	3411	/*
	3412	* small helper routine , copy contents to buf from addr.
	3413	* If the page is not present, fill zero.
	3414	*/
	3415
	3416	static int aligned_vread(char buf, char addr, unsigned long count)
	3417	{
	3418	struct page *p;
	3419	int copied = 0;
	3420
	3421	while (count) {
	3422	unsigned long offset, length;
	3423
	3424	offset = offset_in_page(addr);
	3425	length = PAGE_SIZE - offset;
	3426	if (length > count)
	3427	length = count;
	3428	p = vmalloc_to_page(addr);
	3429	/*
	3430	* To do safe access to this _mapped_ area, we need
	3431	* lock. But adding lock here means that we need to add
	3432	* overhead of vmalloc()/vfree() calls for this _debug_
	3433	* interface, rarely used. Instead of that, we'll use
	3434	* kmap() and get small overhead in this access function.
	3435	*/
	3436	if (p) {
	3437	/* We can expect USER0 is not used -- see vread() */
	3438	void *map = kmap_atomic(p);
	3439	memcpy(buf, map + offset, length);
	3440	kunmap_atomic(map);
	3441	} else
	3442	memset(buf, 0, length);
	3443
	3444	addr += length;
	3445	buf += length;
	3446	copied += length;
	3447	count -= length;
	3448	}
	3449	return copied;
	3450	}
	3451
	3452	/**
	3453	* vread() - read vmalloc area in a safe way.
	3454	* @buf: buffer for reading data
	3455	* @addr: vm address.
	3456	* @count: number of bytes to be read.
	3457	*
	3458	* This function checks that addr is a valid vmalloc'ed area, and
	3459	* copy data from that area to a given buffer. If the given memory range
	3460	* of [addr...addr+count) includes some valid address, data is copied to
	3461	* proper area of @buf. If there are memory holes, they'll be zero-filled.
	3462	* IOREMAP area is treated as memory hole and no copy is done.
	3463	*
	3464	* If [addr...addr+count) doesn't includes any intersects with alive
	3465	* vm_struct area, returns 0. @buf should be kernel's buffer.
	3466	*
	3467	* Note: In usual ops, vread() is never necessary because the caller
	3468	* should know vmalloc() area is valid and can use memcpy().
	3469	* This is for routines which have to access vmalloc area without
	3470	* any information, as /proc/kcore.
	3471	*
	3472	* Return: number of bytes for which addr and buf should be increased
	3473	* (same number as @count) or %0 if [addr...addr+count) doesn't
	3474	* include any intersection with valid vmalloc area
	3475	*/
	3476	long vread(char buf, char addr, unsigned long count)
	3477	{
	3478	struct vmap_area *va;
	3479	struct vm_struct *vm;
	3480	char vaddr, buf_start = buf;
	3481	unsigned long buflen = count;
	3482	unsigned long n;
	3483
	3484	addr = kasan_reset_tag(addr);
	3485
	3486	/* Don't allow overflow */
	3487	if ((unsigned long) addr + count < count)
	3488	count = -(unsigned long) addr;
	3489
	3490	spin_lock(&vmap_area_lock);
	3491	va = find_vmap_area_exceed_addr((unsigned long)addr);
	3492	if (!va)
	3493	goto finished;
	3494
	3495	/* no intersects with alive vmap_area */
	3496	if ((unsigned long)addr + count <= va->va_start)
	3497	goto finished;
	3498
	3499	list_for_each_entry_from(va, &vmap_area_list, list) {
	3500	if (!count)
	3501	break;
	3502
	3503	if (!va->vm)
	3504	continue;
	3505
	3506	vm = va->vm;
	3507	vaddr = (char *) vm->addr;
	3508	if (addr >= vaddr + get_vm_area_size(vm))
	3509	continue;
	3510	while (addr < vaddr) {
	3511	if (count == 0)
	3512	goto finished;
	3513	*buf = '\0';
	3514	buf++;
	3515	addr++;
	3516	count--;
	3517	}
	3518	n = vaddr + get_vm_area_size(vm) - addr;
	3519	if (n > count)
	3520	n = count;
	3521	if (!(vm->flags & VM_IOREMAP))
	3522	aligned_vread(buf, addr, n);
	3523	else /* IOREMAP area is treated as memory hole */
	3524	memset(buf, 0, n);
	3525	buf += n;
	3526	addr += n;
	3527	count -= n;
	3528	}
	3529	finished:
	3530	spin_unlock(&vmap_area_lock);
	3531
	3532	if (buf == buf_start)
	3533	return 0;
	3534	/* zero-fill memory holes */
	3535	if (buf != buf_start + buflen)
	3536	memset(buf, 0, buflen - (buf - buf_start));
	3537
	3538	return buflen;
	3539	}
	3540
	3541	/**
	3542	* remap_vmalloc_range_partial - map vmalloc pages to userspace
	3543	* @vma: vma to cover
	3544	* @uaddr: target user address to start at
	3545	* @kaddr: virtual address of vmalloc kernel memory
	3546	* @pgoff: offset from @kaddr to start at
	3547	* @size: size of map area
	3548	*
	3549	* Returns: 0 for success, -Exxx on failure
	3550	*
	3551	* This function checks that @kaddr is a valid vmalloc'ed area,
	3552	* and that it is big enough to cover the range starting at
	3553	* @uaddr in @vma. Will return failure if that criteria isn't
	3554	* met.
	3555	*
	3556	* Similar to remap_pfn_range() (see mm/memory.c)
	3557	*/
	3558	int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
	3559	void *kaddr, unsigned long pgoff,
	3560	unsigned long size)
	3561	{
	3562	struct vm_struct *area;
	3563	unsigned long off;
	3564	unsigned long end_index;
	3565
	3566	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
	3567	return -EINVAL;
	3568
	3569	size = PAGE_ALIGN(size);
	3570
	3571	if (!PAGE_ALIGNED(uaddr) \|\| !PAGE_ALIGNED(kaddr))
	3572	return -EINVAL;
	3573
	3574	area = find_vm_area(kaddr);
	3575	if (!area)
	3576	return -EINVAL;
	3577
	3578	if (!(area->flags & (VM_USERMAP \| VM_DMA_COHERENT)))
	3579	return -EINVAL;
	3580
	3581	if (check_add_overflow(size, off, &end_index) \|\|
	3582	end_index > get_vm_area_size(area))
	3583	return -EINVAL;
	3584	kaddr += off;
	3585
	3586	do {
	3587	struct page *page = vmalloc_to_page(kaddr);
	3588	int ret;
	3589
	3590	ret = vm_insert_page(vma, uaddr, page);
	3591	if (ret)
	3592	return ret;
	3593
	3594	uaddr += PAGE_SIZE;
	3595	kaddr += PAGE_SIZE;
	3596	size -= PAGE_SIZE;
	3597	} while (size > 0);
	3598
	3599	vma->vm_flags \|= VM_DONTEXPAND \| VM_DONTDUMP;
	3600
	3601	return 0;
	3602	}
	3603
	3604	/**
	3605	* remap_vmalloc_range - map vmalloc pages to userspace
	3606	* @vma: vma to cover (map full range of vma)
	3607	* @addr: vmalloc memory
	3608	* @pgoff: number of pages into addr before first page to map
	3609	*
	3610	* Returns: 0 for success, -Exxx on failure
	3611	*
	3612	* This function checks that addr is a valid vmalloc'ed area, and
	3613	* that it is big enough to cover the vma. Will return failure if
	3614	* that criteria isn't met.
	3615	*
	3616	* Similar to remap_pfn_range() (see mm/memory.c)
	3617	*/
	3618	int remap_vmalloc_range(struct vm_area_struct vma, void addr,
	3619	unsigned long pgoff)
	3620	{
	3621	return remap_vmalloc_range_partial(vma, vma->vm_start,
	3622	addr, pgoff,
	3623	vma->vm_end - vma->vm_start);
	3624	}
	3625	EXPORT_SYMBOL(remap_vmalloc_range);
	3626
	3627	void free_vm_area(struct vm_struct *area)
	3628	{
	3629	struct vm_struct *ret;
	3630	ret = remove_vm_area(area->addr);
	3631	BUG_ON(ret != area);
	3632	kfree(area);
	3633	}
	3634	EXPORT_SYMBOL_GPL(free_vm_area);
	3635
	3636	#ifdef CONFIG_SMP
	3637	static struct vmap_area node_to_va(struct rb_node n)
	3638	{
	3639	return rb_entry_safe(n, struct vmap_area, rb_node);
	3640	}
	3641
	3642	/**
	3643	* pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
	3644	* @addr: target address
	3645	*
	3646	* Returns: vmap_area if it is found. If there is no such area
	3647	* the first highest(reverse order) vmap_area is returned
	3648	* i.e. va->va_start < addr && va->va_end < addr or NULL
	3649	* if there are no any areas before @addr.
	3650	*/
	3651	static struct vmap_area *
	3652	pvm_find_va_enclose_addr(unsigned long addr)
	3653	{
	3654	struct vmap_area va, tmp;
	3655	struct rb_node *n;
	3656
	3657	n = free_vmap_area_root.rb_node;
	3658	va = NULL;
	3659
	3660	while (n) {
	3661	tmp = rb_entry(n, struct vmap_area, rb_node);
	3662	if (tmp->va_start <= addr) {
	3663	va = tmp;
	3664	if (tmp->va_end >= addr)
	3665	break;
	3666
	3667	n = n->rb_right;
	3668	} else {
	3669	n = n->rb_left;
	3670	}
	3671	}
	3672
	3673	return va;
	3674	}
	3675
	3676	/**
	3677	* pvm_determine_end_from_reverse - find the highest aligned address
	3678	* of free block below VMALLOC_END
	3679	* @va:
	3680	* in - the VA we start the search(reverse order);
	3681	* out - the VA with the highest aligned end address.
	3682	* @align: alignment for required highest address
	3683	*
	3684	* Returns: determined end address within vmap_area
	3685	*/
	3686	static unsigned long
	3687	pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
	3688	{
	3689	unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
	3690	unsigned long addr;
	3691
	3692	if (likely(*va)) {
	3693	list_for_each_entry_from_reverse((*va),
	3694	&free_vmap_area_list, list) {
	3695	addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
	3696	if ((*va)->va_start < addr)
	3697	return addr;
	3698	}
	3699	}
	3700
	3701	return 0;
	3702	}
	3703
	3704	/**
	3705	* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
	3706	* @offsets: array containing offset of each area
	3707	* @sizes: array containing size of each area
	3708	* @nr_vms: the number of areas to allocate
	3709	* @align: alignment, all entries in @offsets and @sizes must be aligned to this
	3710	*
	3711	* Returns: kmalloc'd vm_struct pointer array pointing to allocated
	3712	* vm_structs on success, %NULL on failure
	3713	*
	3714	* Percpu allocator wants to use congruent vm areas so that it can
	3715	* maintain the offsets among percpu areas. This function allocates
	3716	* congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
	3717	* be scattered pretty far, distance between two areas easily going up
	3718	* to gigabytes. To avoid interacting with regular vmallocs, these
	3719	* areas are allocated from top.
	3720	*
	3721	* Despite its complicated look, this allocator is rather simple. It
	3722	* does everything top-down and scans free blocks from the end looking
	3723	* for matching base. While scanning, if any of the areas do not fit the
	3724	* base address is pulled down to fit the area. Scanning is repeated till
	3725	* all the areas fit and then all necessary data structures are inserted
	3726	* and the result is returned.
	3727	*/
	3728	struct vm_struct *pcpu_get_vm_areas(const unsigned long offsets,
	3729	const size_t *sizes, int nr_vms,
	3730	size_t align)
	3731	{
	3732	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
	3733	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
	3734	struct vmap_area *vas, va;
	3735	struct vm_struct **vms;
	3736	int area, area2, last_area, term_area;
	3737	unsigned long base, start, size, end, last_end, orig_start, orig_end;
	3738	bool purged = false;
	3739
	3740	/* verify parameters and allocate data structures */
	3741	BUG_ON(offset_in_page(align) \|\| !is_power_of_2(align));
	3742	for (last_area = 0, area = 0; area < nr_vms; area++) {
	3743	start = offsets[area];
	3744	end = start + sizes[area];
	3745
	3746	/* is everything aligned properly? */
	3747	BUG_ON(!IS_ALIGNED(offsets[area], align));
	3748	BUG_ON(!IS_ALIGNED(sizes[area], align));
	3749
	3750	/* detect the area with the highest address */
	3751	if (start > offsets[last_area])
	3752	last_area = area;
	3753
	3754	for (area2 = area + 1; area2 < nr_vms; area2++) {
	3755	unsigned long start2 = offsets[area2];
	3756	unsigned long end2 = start2 + sizes[area2];
	3757
	3758	BUG_ON(start2 < end && start < end2);
	3759	}
	3760	}
	3761	last_end = offsets[last_area] + sizes[last_area];
	3762
	3763	if (vmalloc_end - vmalloc_start < last_end) {
	3764	WARN_ON(true);
	3765	return NULL;
	3766	}
	3767
	3768	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
	3769	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
	3770	if (!vas \|\| !vms)
	3771	goto err_free2;
	3772
	3773	for (area = 0; area < nr_vms; area++) {
	3774	vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
	3775	vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
	3776	if (!vas[area] \|\| !vms[area])
	3777	goto err_free;
	3778	}
	3779	retry:
	3780	spin_lock(&free_vmap_area_lock);
	3781
	3782	/* start scanning - we scan from the top, begin with the last area */
	3783	area = term_area = last_area;
	3784	start = offsets[area];
	3785	end = start + sizes[area];
	3786
	3787	va = pvm_find_va_enclose_addr(vmalloc_end);
	3788	base = pvm_determine_end_from_reverse(&va, align) - end;
	3789
	3790	while (true) {
	3791	/*
	3792	* base might have underflowed, add last_end before
	3793	* comparing.
	3794	*/
	3795	if (base + last_end < vmalloc_start + last_end)
	3796	goto overflow;
	3797
	3798	/*
	3799	* Fitting base has not been found.
	3800	*/
	3801	if (va == NULL)
	3802	goto overflow;
	3803
	3804	/*
	3805	* If required width exceeds current VA block, move
	3806	* base downwards and then recheck.
	3807	*/
	3808	if (base + end > va->va_end) {
	3809	base = pvm_determine_end_from_reverse(&va, align) - end;
	3810	term_area = area;
	3811	continue;
	3812	}
	3813
	3814	/*
	3815	* If this VA does not fit, move base downwards and recheck.
	3816	*/
	3817	if (base + start < va->va_start) {
	3818	va = node_to_va(rb_prev(&va->rb_node));
	3819	base = pvm_determine_end_from_reverse(&va, align) - end;
	3820	term_area = area;
	3821	continue;
	3822	}
	3823
	3824	/*
	3825	* This area fits, move on to the previous one. If
	3826	* the previous one is the terminal one, we're done.
	3827	*/
	3828	area = (area + nr_vms - 1) % nr_vms;
	3829	if (area == term_area)
	3830	break;
	3831
	3832	start = offsets[area];
	3833	end = start + sizes[area];
	3834	va = pvm_find_va_enclose_addr(base + end);
	3835	}
	3836
	3837	/* we've found a fitting base, insert all va's */
	3838	for (area = 0; area < nr_vms; area++) {
	3839	int ret;
	3840
	3841	start = base + offsets[area];
	3842	size = sizes[area];
	3843
	3844	va = pvm_find_va_enclose_addr(start);
	3845	if (WARN_ON_ONCE(va == NULL))
	3846	/* It is a BUG(), but trigger recovery instead. */
	3847	goto recovery;
	3848
	3849	ret = adjust_va_to_fit_type(&free_vmap_area_root,
	3850	&free_vmap_area_list,
	3851	va, start, size);
	3852	if (WARN_ON_ONCE(unlikely(ret)))
	3853	/* It is a BUG(), but trigger recovery instead. */
	3854	goto recovery;
	3855
	3856	/* Allocated area. */
	3857	va = vas[area];
	3858	va->va_start = start;
	3859	va->va_end = start + size;
	3860	}
	3861
	3862	spin_unlock(&free_vmap_area_lock);
	3863
	3864	/* populate the kasan shadow space */
	3865	for (area = 0; area < nr_vms; area++) {
	3866	if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
	3867	goto err_free_shadow;
	3868	}
	3869
	3870	/* insert all vm's */
	3871	spin_lock(&vmap_area_lock);
	3872	for (area = 0; area < nr_vms; area++) {
	3873	insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
	3874
	3875	setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
	3876	pcpu_get_vm_areas);
	3877	}
	3878	spin_unlock(&vmap_area_lock);
	3879
	3880	/*
	3881	* Mark allocated areas as accessible. Do it now as a best-effort
	3882	* approach, as they can be mapped outside of vmalloc code.
	3883	* With hardware tag-based KASAN, marking is skipped for
	3884	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
	3885	*/
	3886	for (area = 0; area < nr_vms; area++)
	3887	vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
	3888	vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
	3889
	3890	kfree(vas);
	3891	return vms;
	3892
	3893	recovery:
	3894	/*
	3895	* Remove previously allocated areas. There is no
	3896	* need in removing these areas from the busy tree,
	3897	* because they are inserted only on the final step
	3898	* and when pcpu_get_vm_areas() is success.
	3899	*/
	3900	while (area--) {
	3901	orig_start = vas[area]->va_start;
	3902	orig_end = vas[area]->va_end;
	3903	va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
	3904	&free_vmap_area_list);
	3905	if (va)
	3906	kasan_release_vmalloc(orig_start, orig_end,
	3907	va->va_start, va->va_end);
	3908	vas[area] = NULL;
	3909	}
	3910
	3911	overflow:
	3912	spin_unlock(&free_vmap_area_lock);
	3913	if (!purged) {
	3914	purge_vmap_area_lazy();
	3915	purged = true;
	3916
	3917	/* Before "retry", check if we recover. */
	3918	for (area = 0; area < nr_vms; area++) {
	3919	if (vas[area])
	3920	continue;
	3921
	3922	vas[area] = kmem_cache_zalloc(
	3923	vmap_area_cachep, GFP_KERNEL);
	3924	if (!vas[area])
	3925	goto err_free;
	3926	}
	3927
	3928	goto retry;
	3929	}
	3930
	3931	err_free:
	3932	for (area = 0; area < nr_vms; area++) {
	3933	if (vas[area])
	3934	kmem_cache_free(vmap_area_cachep, vas[area]);
	3935
	3936	kfree(vms[area]);
	3937	}
	3938	err_free2:
	3939	kfree(vas);
	3940	kfree(vms);
	3941	return NULL;
	3942
	3943	err_free_shadow:
	3944	spin_lock(&free_vmap_area_lock);
	3945	/*
	3946	* We release all the vmalloc shadows, even the ones for regions that
	3947	* hadn't been successfully added. This relies on kasan_release_vmalloc
	3948	* being able to tolerate this case.
	3949	*/
	3950	for (area = 0; area < nr_vms; area++) {
	3951	orig_start = vas[area]->va_start;
	3952	orig_end = vas[area]->va_end;
	3953	va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
	3954	&free_vmap_area_list);
	3955	if (va)
	3956	kasan_release_vmalloc(orig_start, orig_end,
	3957	va->va_start, va->va_end);
	3958	vas[area] = NULL;
	3959	kfree(vms[area]);
	3960	}
	3961	spin_unlock(&free_vmap_area_lock);
	3962	kfree(vas);
	3963	kfree(vms);
	3964	return NULL;
	3965	}
	3966
	3967	/**
	3968	* pcpu_free_vm_areas - free vmalloc areas for percpu allocator
	3969	* @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
	3970	* @nr_vms: the number of allocated areas
	3971	*
	3972	* Free vm_structs and the array allocated by pcpu_get_vm_areas().
	3973	*/
	3974	void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
	3975	{
	3976	int i;
	3977
	3978	for (i = 0; i < nr_vms; i++)
	3979	free_vm_area(vms[i]);
	3980	kfree(vms);
	3981	}
	3982	#endif /* CONFIG_SMP */
	3983
	3984	#ifdef CONFIG_PRINTK
	3985	bool vmalloc_dump_obj(void *object)
	3986	{
	3987	struct vm_struct *vm;
	3988	void objp = (void )PAGE_ALIGN((unsigned long)object);
	3989
	3990	vm = find_vm_area(objp);
	3991	if (!vm)
	3992	return false;
	3993	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
	3994	vm->nr_pages, (unsigned long)vm->addr, vm->caller);
	3995	return true;
	3996	}
	3997	#endif
	3998
	3999	#ifdef CONFIG_PROC_FS
	4000	static void s_start(struct seq_file m, loff_t *pos)
	4001	__acquires(&vmap_purge_lock)
	4002	__acquires(&vmap_area_lock)
	4003	{
	4004	mutex_lock(&vmap_purge_lock);
	4005	spin_lock(&vmap_area_lock);
	4006
	4007	return seq_list_start(&vmap_area_list, *pos);
	4008	}
	4009
	4010	static void s_next(struct seq_file m, void p, loff_t pos)
	4011	{
	4012	return seq_list_next(p, &vmap_area_list, pos);
	4013	}
	4014
	4015	static void s_stop(struct seq_file m, void p)
	4016	__releases(&vmap_area_lock)
	4017	__releases(&vmap_purge_lock)
	4018	{
	4019	spin_unlock(&vmap_area_lock);
	4020	mutex_unlock(&vmap_purge_lock);
	4021	}
	4022
	4023	static void show_numa_info(struct seq_file m, struct vm_struct v)
	4024	{
	4025	if (IS_ENABLED(CONFIG_NUMA)) {
	4026	unsigned int nr, *counters = m->private;
	4027	unsigned int step = 1U << vm_area_page_order(v);
	4028
	4029	if (!counters)
	4030	return;
	4031
	4032	if (v->flags & VM_UNINITIALIZED)
	4033	return;
	4034	/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
	4035	smp_rmb();
	4036
	4037	memset(counters, 0, nr_node_ids * sizeof(unsigned int));
	4038
	4039	for (nr = 0; nr < v->nr_pages; nr += step)
	4040	counters[page_to_nid(v->pages[nr])] += step;
	4041	for_each_node_state(nr, N_HIGH_MEMORY)
	4042	if (counters[nr])
	4043	seq_printf(m, " N%u=%u", nr, counters[nr]);
	4044	}
	4045	}
	4046
	4047	static void show_purge_info(struct seq_file *m)
	4048	{
	4049	struct vmap_area *va;
	4050
	4051	spin_lock(&purge_vmap_area_lock);
	4052	list_for_each_entry(va, &purge_vmap_area_list, list) {
	4053	seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
	4054	(void )va->va_start, (void )va->va_end,
	4055	va->va_end - va->va_start);
	4056	}
	4057	spin_unlock(&purge_vmap_area_lock);
	4058	}
	4059
	4060	static int s_show(struct seq_file m, void p)
	4061	{
	4062	struct vmap_area *va;
	4063	struct vm_struct *v;
	4064
	4065	va = list_entry(p, struct vmap_area, list);
	4066
	4067	/*
	4068	* s_show can encounter race with remove_vm_area, !vm on behalf
	4069	* of vmap area is being tear down or vm_map_ram allocation.
	4070	*/
	4071	if (!va->vm) {
	4072	seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
	4073	(void )va->va_start, (void )va->va_end,
	4074	va->va_end - va->va_start);
	4075
	4076	goto final;
	4077	}
	4078
	4079	v = va->vm;
	4080
	4081	seq_printf(m, "0x%pK-0x%pK %7ld",
	4082	v->addr, v->addr + v->size, v->size);
	4083
	4084	if (v->caller)
	4085	seq_printf(m, " %pS", v->caller);
	4086
	4087	if (v->nr_pages)
	4088	seq_printf(m, " pages=%d", v->nr_pages);
	4089
	4090	if (v->phys_addr)
	4091	seq_printf(m, " phys=%pa", &v->phys_addr);
	4092
	4093	if (v->flags & VM_IOREMAP)
	4094	seq_puts(m, " ioremap");
	4095
	4096	if (v->flags & VM_ALLOC)
	4097	seq_puts(m, " vmalloc");
	4098
	4099	if (v->flags & VM_MAP)
	4100	seq_puts(m, " vmap");
	4101
	4102	if (v->flags & VM_USERMAP)
	4103	seq_puts(m, " user");
	4104
	4105	if (v->flags & VM_DMA_COHERENT)
	4106	seq_puts(m, " dma-coherent");
	4107
	4108	if (is_vmalloc_addr(v->pages))
	4109	seq_puts(m, " vpages");
	4110
	4111	show_numa_info(m, v);
	4112	seq_putc(m, '\n');
	4113
	4114	/*
	4115	* As a final step, dump "unpurged" areas.
	4116	*/
	4117	final:
	4118	if (list_is_last(&va->list, &vmap_area_list))
	4119	show_purge_info(m);
	4120
	4121	return 0;
	4122	}
	4123
	4124	static const struct seq_operations vmalloc_op = {
	4125	.start = s_start,
	4126	.next = s_next,
	4127	.stop = s_stop,
	4128	.show = s_show,
	4129	};
	4130
	4131	static int __init proc_vmalloc_init(void)
	4132	{
	4133	if (IS_ENABLED(CONFIG_NUMA))
	4134	proc_create_seq_private("vmallocinfo", 0400, NULL,
	4135	&vmalloc_op,
	4136	nr_node_ids * sizeof(unsigned int), NULL);
	4137	else
	4138	proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
	4139	return 0;
	4140	}
	4141	module_init(proc_vmalloc_init);
	4142
	4143	#endif
	4144
	4145	void __init vmalloc_init(void)
	4146	{
	4147	struct vmap_area *va;
	4148	struct vm_struct *tmp;
	4149	int i;
	4150
	4151	/*
	4152	* Create the cache for vmap_area objects.
	4153	*/
	4154	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
	4155
	4156	for_each_possible_cpu(i) {
	4157	struct vmap_block_queue *vbq;
	4158	struct vfree_deferred *p;
	4159
	4160	vbq = &per_cpu(vmap_block_queue, i);
	4161	spin_lock_init(&vbq->lock);
	4162	INIT_LIST_HEAD(&vbq->free);
	4163	p = &per_cpu(vfree_deferred, i);
	4164	init_llist_head(&p->list);
	4165	INIT_WORK(&p->wq, delayed_vfree_work);
	4166	}
	4167
	4168	/* Import existing vmlist entries. */
	4169	for (tmp = vmlist; tmp; tmp = tmp->next) {
	4170	va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
	4171	if (WARN_ON_ONCE(!va))
	4172	continue;
	4173
	4174	va->va_start = (unsigned long)tmp->addr;
	4175	va->va_end = va->va_start + tmp->size;
	4176	va->vm = tmp;
	4177	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	4178	}
	4179
	4180	/*
	4181	* Now we can initialize a free vmap space.
	4182	*/
	4183	vmap_init_free_space();
	4184	vmap_initialized = true;
	4185	}