/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_owner.h>
#include "internal.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool */
	if (free) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}
90 | ||
91 | struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, | |
92 | long min_hpages) | |
93 | { | |
94 | struct hugepage_subpool *spool; | |
95 | ||
96 | spool = kzalloc(sizeof(*spool), GFP_KERNEL); | |
97 | if (!spool) | |
98 | return NULL; | |
99 | ||
100 | spin_lock_init(&spool->lock); | |
101 | spool->count = 1; | |
102 | spool->max_hpages = max_hpages; | |
103 | spool->hstate = h; | |
104 | spool->min_hpages = min_hpages; | |
105 | ||
106 | if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { | |
107 | kfree(spool); | |
108 | return NULL; | |
109 | } | |
110 | spool->rsv_hpages = min_hpages; | |
111 | ||
112 | return spool; | |
113 | } | |
114 | ||
115 | void hugepage_put_subpool(struct hugepage_subpool *spool) | |
116 | { | |
117 | spin_lock(&spool->lock); | |
118 | BUG_ON(!spool->count); | |
119 | spool->count--; | |
120 | unlock_or_release_subpool(spool); | |
121 | } | |
122 | ||
/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock(&spool->lock);
	return ret;
}
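
/*
 * Illustrative example (not part of the original source): with
 * min_hpages == 10 and rsv_hpages == 10, a request for delta == 3 is
 * satisfied entirely from the subpool reserve (returns 0, rsv_hpages
 * drops to 7), while a request for delta == 15 against the full
 * reserve consumes all 10 reserved pages and returns 5, the number
 * of pages the global pool must still provide.
 */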
169 | ||
170 | /* | |
171 | * Subpool accounting for freeing and unreserving pages. | |
172 | * Return the number of global page reservations that must be dropped. | |
173 | * The return value may only be different than the passed value (delta) | |
174 | * in the case where a subpool minimum size must be maintained. | |
175 | */ | |
176 | static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, | |
177 | long delta) | |
178 | { | |
179 | long ret = delta; | |
180 | ||
181 | if (!spool) | |
182 | return delta; | |
183 | ||
184 | spin_lock(&spool->lock); | |
185 | ||
186 | if (spool->max_hpages != -1) /* maximum size accounting */ | |
187 | spool->used_hpages -= delta; | |
188 | ||
189 | /* minimum size accounting */ | |
190 | if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { | |
191 | if (spool->rsv_hpages + delta <= spool->min_hpages) | |
192 | ret = 0; | |
193 | else | |
194 | ret = spool->rsv_hpages + delta - spool->min_hpages; | |
195 | ||
196 | spool->rsv_hpages += delta; | |
197 | if (spool->rsv_hpages > spool->min_hpages) | |
198 | spool->rsv_hpages = spool->min_hpages; | |
199 | } | |
200 | ||
201 | /* | |
202 | * If hugetlbfs_put_super couldn't free spool due to an outstanding | |
203 | * quota reference, free it now. | |
204 | */ | |
205 | unlock_or_release_subpool(spool); | |
206 | ||
207 | return ret; | |
208 | } | |
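
/*
 * Illustrative example (not part of the original source): with
 * min_hpages == 10, rsv_hpages == 0 and used_hpages == 12, putting
 * back delta == 3 pages leaves used_hpages == 9, below the minimum,
 * so all 3 pages are retained as subpool reserves and 0 global
 * reservations are dropped.  With used_hpages == 20 the same call
 * returns 3, since the subpool stays above its minimum.
 */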
209 | ||
210 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) | |
211 | { | |
212 | return HUGETLBFS_SB(inode->i_sb)->spool; | |
213 | } | |
214 | ||
215 | static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) | |
216 | { | |
217 | return subpool_inode(file_inode(vma->vm_file)); | |
218 | } | |
219 | ||
220 | /* | |
221 | * Region tracking -- allows tracking of reservations and instantiated pages | |
222 | * across the pages in a mapping. | |
223 | * | |
224 | * The region data structures are embedded into a resv_map and protected | |
225 | * by a resv_map's lock. The set of regions within the resv_map represent | |
226 | * reservations for huge pages, or huge pages that have already been | |
227 | * instantiated within the map. The from and to elements are huge page | |
228 | * indicies into the associated mapping. from indicates the starting index | |
229 | * of the region. to represents the first index past the end of the region. | |
230 | * | |
231 | * For example, a file region structure with from == 0 and to == 4 represents | |
232 | * four huge pages in a mapping. It is important to note that the to element | |
233 | * represents the first element past the end of the region. This is used in | |
234 | * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. | |
235 | * | |
236 | * Interval notation of the form [from, to) will be used to indicate that | |
237 | * the endpoint from is inclusive and to is exclusive. | |
238 | */ | |
239 | struct file_region { | |
240 | struct list_head link; | |
241 | long from; | |
242 | long to; | |
243 | }; | |
244 | ||
245 | /* | |
246 | * Add the huge page range represented by [f, t) to the reserve | |
247 | * map. In the normal case, existing regions will be expanded | |
248 | * to accommodate the specified range. Sufficient regions should | |
249 | * exist for expansion due to the previous call to region_chg | |
250 | * with the same range. However, it is possible that region_del | |
251 | * could have been called after region_chg and modifed the map | |
252 | * in such a way that no region exists to be expanded. In this | |
253 | * case, pull a region descriptor from the cache associated with | |
254 | * the map and use that for the new range. | |
255 | * | |
256 | * Return the number of new huge pages added to the map. This | |
257 | * number is greater than or equal to zero. | |
258 | */ | |
static long region_add(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg, *trg;
	long add = 0;

	spin_lock(&resv->lock);
	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/*
	 * If no region exists which can be expanded to include the
	 * specified range, the list must have been modified by an
	 * interleaving call to region_del().  Pull a region descriptor
	 * from the cache and use it for this range.
	 */
	if (&rg->link == head || t < rg->from) {
		VM_BUG_ON(resv->region_cache_count <= 0);

		resv->region_cache_count--;
		nrg = list_first_entry(&resv->region_cache, struct file_region,
					link);
		list_del(&nrg->link);

		nrg->from = f;
		nrg->to = t;
		list_add(&nrg->link, rg->link.prev);

		add += t - f;
		goto out_locked;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher, then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			/* Decrement return value by the deleted range.
			 * Another range will span this area so that by
			 * end of routine add will be >= zero
			 */
			add -= (rg->to - rg->from);
			list_del(&rg->link);
			kfree(rg);
		}
	}

	add += (nrg->from - f);		/* Added to beginning of region */
	nrg->from = f;
	add += t - nrg->to;		/* Added to end of region */
	nrg->to = t;

out_locked:
	resv->adds_in_progress--;
	spin_unlock(&resv->lock);
	VM_BUG_ON(add < 0);
	return add;
}
332 | ||
333 | /* | |
334 | * Examine the existing reserve map and determine how many | |
335 | * huge pages in the specified range [f, t) are NOT currently | |
336 | * represented. This routine is called before a subsequent | |
337 | * call to region_add that will actually modify the reserve | |
338 | * map to add the specified range [f, t). region_chg does | |
339 | * not change the number of huge pages represented by the | |
340 | * map. However, if the existing regions in the map can not | |
341 | * be expanded to represent the new range, a new file_region | |
342 | * structure is added to the map as a placeholder. This is | |
343 | * so that the subsequent region_add call will have all the | |
344 | * regions it needs and will not fail. | |
345 | * | |
346 | * Upon entry, region_chg will also examine the cache of region descriptors | |
347 | * associated with the map. If there are not enough descriptors cached, one | |
348 | * will be allocated for the in progress add operation. | |
349 | * | |
350 | * Returns the number of huge pages that need to be added to the existing | |
351 | * reservation map for the range [f, t). This number is greater or equal to | |
352 | * zero. -ENOMEM is returned if a new file_region structure or cache entry | |
353 | * is needed and can not be allocated. | |
354 | */ | |
355 | static long region_chg(struct resv_map *resv, long f, long t) | |
356 | { | |
357 | struct list_head *head = &resv->regions; | |
358 | struct file_region *rg, *nrg = NULL; | |
359 | long chg = 0; | |
360 | ||
361 | retry: | |
362 | spin_lock(&resv->lock); | |
363 | retry_locked: | |
364 | resv->adds_in_progress++; | |
365 | ||
366 | /* | |
367 | * Check for sufficient descriptors in the cache to accommodate | |
368 | * the number of in progress add operations. | |
369 | */ | |
370 | if (resv->adds_in_progress > resv->region_cache_count) { | |
371 | struct file_region *trg; | |
372 | ||
373 | VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1); | |
374 | /* Must drop lock to allocate a new descriptor. */ | |
375 | resv->adds_in_progress--; | |
376 | spin_unlock(&resv->lock); | |
377 | ||
378 | trg = kmalloc(sizeof(*trg), GFP_KERNEL); | |
379 | if (!trg) { | |
380 | kfree(nrg); | |
381 | return -ENOMEM; | |
382 | } | |
383 | ||
384 | spin_lock(&resv->lock); | |
385 | list_add(&trg->link, &resv->region_cache); | |
386 | resv->region_cache_count++; | |
387 | goto retry_locked; | |
388 | } | |
389 | ||
390 | /* Locate the region we are before or in. */ | |
391 | list_for_each_entry(rg, head, link) | |
392 | if (f <= rg->to) | |
393 | break; | |
394 | ||
395 | /* If we are below the current region then a new region is required. | |
396 | * Subtle, allocate a new region at the position but make it zero | |
397 | * size such that we can guarantee to record the reservation. */ | |
398 | if (&rg->link == head || t < rg->from) { | |
399 | if (!nrg) { | |
400 | resv->adds_in_progress--; | |
401 | spin_unlock(&resv->lock); | |
402 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | |
403 | if (!nrg) | |
404 | return -ENOMEM; | |
405 | ||
406 | nrg->from = f; | |
407 | nrg->to = f; | |
408 | INIT_LIST_HEAD(&nrg->link); | |
409 | goto retry; | |
410 | } | |
411 | ||
412 | list_add(&nrg->link, rg->link.prev); | |
413 | chg = t - f; | |
414 | goto out_nrg; | |
415 | } | |
416 | ||
417 | /* Round our left edge to the current segment if it encloses us. */ | |
418 | if (f > rg->from) | |
419 | f = rg->from; | |
420 | chg = t - f; | |
421 | ||
422 | /* Check for and consume any regions we now overlap with. */ | |
423 | list_for_each_entry(rg, rg->link.prev, link) { | |
424 | if (&rg->link == head) | |
425 | break; | |
426 | if (rg->from > t) | |
427 | goto out; | |
428 | ||
429 | /* We overlap with this area, if it extends further than | |
430 | * us then we must extend ourselves. Account for its | |
431 | * existing reservation. */ | |
432 | if (rg->to > t) { | |
433 | chg += rg->to - t; | |
434 | t = rg->to; | |
435 | } | |
436 | chg -= rg->to - rg->from; | |
437 | } | |
438 | ||
439 | out: | |
440 | spin_unlock(&resv->lock); | |
441 | /* We already know we raced and no longer need the new region */ | |
442 | kfree(nrg); | |
443 | return chg; | |
444 | out_nrg: | |
445 | spin_unlock(&resv->lock); | |
446 | return chg; | |
447 | } | |
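
/*
 * Example of the reservation protocol (illustrative, not from the
 * original source): for a map already containing [0, 2), a call to
 * region_chg(resv, 0, 4) returns 2 (two pages are not yet
 * represented) and, if needed, stashes a descriptor so that the
 * later region_add(resv, 0, 4) cannot fail; region_add then expands
 * the existing region to [0, 4) and also returns 2.
 */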
448 | ||
449 | /* | |
450 | * Abort the in progress add operation. The adds_in_progress field | |
451 | * of the resv_map keeps track of the operations in progress between | |
452 | * calls to region_chg and region_add. Operations are sometimes | |
453 | * aborted after the call to region_chg. In such cases, region_abort | |
454 | * is called to decrement the adds_in_progress counter. | |
455 | * | |
456 | * NOTE: The range arguments [f, t) are not needed or used in this | |
457 | * routine. They are kept to make reading the calling code easier as | |
458 | * arguments will match the associated region_chg call. | |
459 | */ | |
460 | static void region_abort(struct resv_map *resv, long f, long t) | |
461 | { | |
462 | spin_lock(&resv->lock); | |
463 | VM_BUG_ON(!resv->region_cache_count); | |
464 | resv->adds_in_progress--; | |
465 | spin_unlock(&resv->lock); | |
466 | } | |
467 | ||
468 | /* | |
469 | * Delete the specified range [f, t) from the reserve map. If the | |
470 | * t parameter is LONG_MAX, this indicates that ALL regions after f | |
471 | * should be deleted. Locate the regions which intersect [f, t) | |
472 | * and either trim, delete or split the existing regions. | |
473 | * | |
474 | * Returns the number of huge pages deleted from the reserve map. | |
475 | * In the normal case, the return value is zero or more. In the | |
476 | * case where a region must be split, a new region descriptor must | |
477 | * be allocated. If the allocation fails, -ENOMEM will be returned. | |
478 | * NOTE: If the parameter t == LONG_MAX, then we will never split | |
479 | * a region and possibly return -ENOMEM. Callers specifying | |
480 | * t == LONG_MAX do not need to check for -ENOMEM error. | |
481 | */ | |
482 | static long region_del(struct resv_map *resv, long f, long t) | |
483 | { | |
484 | struct list_head *head = &resv->regions; | |
485 | struct file_region *rg, *trg; | |
486 | struct file_region *nrg = NULL; | |
487 | long del = 0; | |
488 | ||
489 | retry: | |
490 | spin_lock(&resv->lock); | |
491 | list_for_each_entry_safe(rg, trg, head, link) { | |
492 | /* | |
493 | * Skip regions before the range to be deleted. file_region | |
494 | * ranges are normally of the form [from, to). However, there | |
495 | * may be a "placeholder" entry in the map which is of the form | |
496 | * (from, to) with from == to. Check for placeholder entries | |
497 | * at the beginning of the range to be deleted. | |
498 | */ | |
499 | if (rg->to <= f && (rg->to != rg->from || rg->to != f)) | |
500 | continue; | |
501 | ||
502 | if (rg->from >= t) | |
503 | break; | |
504 | ||
505 | if (f > rg->from && t < rg->to) { /* Must split region */ | |
506 | /* | |
507 | * Check for an entry in the cache before dropping | |
508 | * lock and attempting allocation. | |
509 | */ | |
510 | if (!nrg && | |
511 | resv->region_cache_count > resv->adds_in_progress) { | |
512 | nrg = list_first_entry(&resv->region_cache, | |
513 | struct file_region, | |
514 | link); | |
515 | list_del(&nrg->link); | |
516 | resv->region_cache_count--; | |
517 | } | |
518 | ||
519 | if (!nrg) { | |
520 | spin_unlock(&resv->lock); | |
521 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | |
522 | if (!nrg) | |
523 | return -ENOMEM; | |
524 | goto retry; | |
525 | } | |
526 | ||
527 | del += t - f; | |
528 | ||
529 | /* New entry for end of split region */ | |
530 | nrg->from = t; | |
531 | nrg->to = rg->to; | |
532 | INIT_LIST_HEAD(&nrg->link); | |
533 | ||
534 | /* Original entry is trimmed */ | |
535 | rg->to = f; | |
536 | ||
537 | list_add(&nrg->link, &rg->link); | |
538 | nrg = NULL; | |
539 | break; | |
540 | } | |
541 | ||
542 | if (f <= rg->from && t >= rg->to) { /* Remove entire region */ | |
543 | del += rg->to - rg->from; | |
544 | list_del(&rg->link); | |
545 | kfree(rg); | |
546 | continue; | |
547 | } | |
548 | ||
549 | if (f <= rg->from) { /* Trim beginning of region */ | |
550 | del += t - rg->from; | |
551 | rg->from = t; | |
552 | } else { /* Trim end of region */ | |
553 | del += rg->to - f; | |
554 | rg->to = f; | |
555 | } | |
556 | } | |
557 | ||
558 | spin_unlock(&resv->lock); | |
559 | kfree(nrg); | |
560 | return del; | |
561 | } | |
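
/*
 * Illustrative example (not from the original source): deleting
 * [2, 3) from a map containing [0, 4) splits the region into [0, 2)
 * and [3, 4) and returns 1; this split is the only case that may
 * need a new descriptor and can therefore return -ENOMEM.
 */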
562 | ||
563 | /* | |
564 | * A rare out of memory error was encountered which prevented removal of | |
565 | * the reserve map region for a page. The huge page itself was free'ed | |
566 | * and removed from the page cache. This routine will adjust the subpool | |
567 | * usage count, and the global reserve count if needed. By incrementing | |
568 | * these counts, the reserve map entry which could not be deleted will | |
569 | * appear as a "reserved" entry instead of simply dangling with incorrect | |
570 | * counts. | |
571 | */ | |
572 | void hugetlb_fix_reserve_counts(struct inode *inode) | |
573 | { | |
574 | struct hugepage_subpool *spool = subpool_inode(inode); | |
575 | long rsv_adjust; | |
576 | ||
577 | rsv_adjust = hugepage_subpool_get_pages(spool, 1); | |
578 | if (rsv_adjust) { | |
579 | struct hstate *h = hstate_inode(inode); | |
580 | ||
581 | hugetlb_acct_memory(h, 1); | |
582 | } | |
583 | } | |
584 | ||
585 | /* | |
586 | * Count and return the number of huge pages in the reserve map | |
587 | * that intersect with the range [f, t). | |
588 | */ | |
589 | static long region_count(struct resv_map *resv, long f, long t) | |
590 | { | |
591 | struct list_head *head = &resv->regions; | |
592 | struct file_region *rg; | |
593 | long chg = 0; | |
594 | ||
595 | spin_lock(&resv->lock); | |
596 | /* Locate each segment we overlap with, and count that overlap. */ | |
597 | list_for_each_entry(rg, head, link) { | |
598 | long seg_from; | |
599 | long seg_to; | |
600 | ||
601 | if (rg->to <= f) | |
602 | continue; | |
603 | if (rg->from >= t) | |
604 | break; | |
605 | ||
606 | seg_from = max(rg->from, f); | |
607 | seg_to = min(rg->to, t); | |
608 | ||
609 | chg += seg_to - seg_from; | |
610 | } | |
611 | spin_unlock(&resv->lock); | |
612 | ||
613 | return chg; | |
614 | } | |
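
/*
 * Illustrative example (not from the original source): with regions
 * [0, 2) and [5, 10) in the map, region_count(resv, 1, 7) counts one
 * page from the first region and two from the second and returns 3.
 */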
615 | ||
616 | /* | |
617 | * Convert the address within this vma to the page offset within | |
618 | * the mapping, in pagecache page units; huge pages here. | |
619 | */ | |
620 | static pgoff_t vma_hugecache_offset(struct hstate *h, | |
621 | struct vm_area_struct *vma, unsigned long address) | |
622 | { | |
623 | return ((address - vma->vm_start) >> huge_page_shift(h)) + | |
624 | (vma->vm_pgoff >> huge_page_order(h)); | |
625 | } | |
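
/*
 * Illustrative example (not from the original source): with 2MB huge
 * pages (shift 21, order 9), an address 4MB past vm_start in a VMA
 * mapped at a 6MB file offset (vm_pgoff == 1536 base pages) gives
 * (4MB >> 21) + (1536 >> 9) = 2 + 3 = 5, i.e. the sixth huge page of
 * the file.
 */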
626 | ||
627 | pgoff_t linear_hugepage_index(struct vm_area_struct *vma, | |
628 | unsigned long address) | |
629 | { | |
630 | return vma_hugecache_offset(hstate_vma(vma), vma, address); | |
631 | } | |
632 | EXPORT_SYMBOL_GPL(linear_hugepage_index); | |
633 | ||
634 | /* | |
635 | * Return the size of the pages allocated when backing a VMA. In the majority | |
636 | * cases this will be same size as used by the page table entries. | |
637 | */ | |
638 | unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | |
639 | { | |
640 | if (vma->vm_ops && vma->vm_ops->pagesize) | |
641 | return vma->vm_ops->pagesize(vma); | |
642 | return PAGE_SIZE; | |
643 | } | |
644 | EXPORT_SYMBOL_GPL(vma_kernel_pagesize); | |
645 | ||
646 | /* | |
647 | * Return the page size being used by the MMU to back a VMA. In the majority | |
648 | * of cases, the page size used by the kernel matches the MMU size. On | |
649 | * architectures where it differs, an architecture-specific 'strong' | |
650 | * version of this symbol is required. | |
651 | */ | |
652 | __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) | |
653 | { | |
654 | return vma_kernel_pagesize(vma); | |
655 | } | |
656 | ||
657 | /* | |
658 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom | |
659 | * bits of the reservation map pointer, which are always clear due to | |
660 | * alignment. | |
661 | */ | |
662 | #define HPAGE_RESV_OWNER (1UL << 0) | |
663 | #define HPAGE_RESV_UNMAPPED (1UL << 1) | |
664 | #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) | |
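
/*
 * Illustrative summary (not from the original source): for a
 * MAP_PRIVATE owner, vm_private_data ends up looking like
 * (unsigned long)resv_map | HPAGE_RESV_OWNER, and the helpers below
 * (see vma_resv_map()/set_vma_resv_map()/set_vma_resv_flags()) mask
 * the low flag bits off again whenever the pointer itself is needed.
 */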
665 | ||
666 | /* | |
667 | * These helpers are used to track how many pages are reserved for | |
668 | * faults in a MAP_PRIVATE mapping. Only the process that called mmap() | |
669 | * is guaranteed to have their future faults succeed. | |
670 | * | |
671 | * With the exception of reset_vma_resv_huge_pages() which is called at fork(), | |
672 | * the reserve counters are updated with the hugetlb_lock held. It is safe | |
673 | * to reset the VMA at fork() time as it is not in use yet and there is no | |
674 | * chance of the global counters getting corrupted as a result of the values. | |
675 | * | |
676 | * The private mapping reservation is represented in a subtly different | |
677 | * manner to a shared mapping. A shared mapping has a region map associated | |
678 | * with the underlying file, this region map represents the backing file | |
679 | * pages which have ever had a reservation assigned which this persists even | |
680 | * after the page is instantiated. A private mapping has a region map | |
681 | * associated with the original mmap which is attached to all VMAs which | |
682 | * reference it, this region map represents those offsets which have consumed | |
683 | * reservation ie. where pages have been instantiated. | |
684 | */ | |
685 | static unsigned long get_vma_private_data(struct vm_area_struct *vma) | |
686 | { | |
687 | return (unsigned long)vma->vm_private_data; | |
688 | } | |
689 | ||
690 | static void set_vma_private_data(struct vm_area_struct *vma, | |
691 | unsigned long value) | |
692 | { | |
693 | vma->vm_private_data = (void *)value; | |
694 | } | |
695 | ||
696 | struct resv_map *resv_map_alloc(void) | |
697 | { | |
698 | struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); | |
699 | struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); | |
700 | ||
701 | if (!resv_map || !rg) { | |
702 | kfree(resv_map); | |
703 | kfree(rg); | |
704 | return NULL; | |
705 | } | |
706 | ||
707 | kref_init(&resv_map->refs); | |
708 | spin_lock_init(&resv_map->lock); | |
709 | INIT_LIST_HEAD(&resv_map->regions); | |
710 | ||
711 | resv_map->adds_in_progress = 0; | |
712 | ||
713 | INIT_LIST_HEAD(&resv_map->region_cache); | |
714 | list_add(&rg->link, &resv_map->region_cache); | |
715 | resv_map->region_cache_count = 1; | |
716 | ||
717 | return resv_map; | |
718 | } | |
719 | ||
720 | void resv_map_release(struct kref *ref) | |
721 | { | |
722 | struct resv_map *resv_map = container_of(ref, struct resv_map, refs); | |
723 | struct list_head *head = &resv_map->region_cache; | |
724 | struct file_region *rg, *trg; | |
725 | ||
726 | /* Clear out any active regions before we release the map. */ | |
727 | region_del(resv_map, 0, LONG_MAX); | |
728 | ||
729 | /* ... and any entries left in the cache */ | |
730 | list_for_each_entry_safe(rg, trg, head, link) { | |
731 | list_del(&rg->link); | |
732 | kfree(rg); | |
733 | } | |
734 | ||
735 | VM_BUG_ON(resv_map->adds_in_progress); | |
736 | ||
737 | kfree(resv_map); | |
738 | } | |
739 | ||
740 | static inline struct resv_map *inode_resv_map(struct inode *inode) | |
741 | { | |
742 | return inode->i_mapping->private_data; | |
743 | } | |
744 | ||
745 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | |
746 | { | |
747 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); | |
748 | if (vma->vm_flags & VM_MAYSHARE) { | |
749 | struct address_space *mapping = vma->vm_file->f_mapping; | |
750 | struct inode *inode = mapping->host; | |
751 | ||
752 | return inode_resv_map(inode); | |
753 | ||
754 | } else { | |
755 | return (struct resv_map *)(get_vma_private_data(vma) & | |
756 | ~HPAGE_RESV_MASK); | |
757 | } | |
758 | } | |
759 | ||
760 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | |
761 | { | |
762 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); | |
763 | VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); | |
764 | ||
765 | set_vma_private_data(vma, (get_vma_private_data(vma) & | |
766 | HPAGE_RESV_MASK) | (unsigned long)map); | |
767 | } | |
768 | ||
769 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) | |
770 | { | |
771 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); | |
772 | VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); | |
773 | ||
774 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); | |
775 | } | |
776 | ||
777 | static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | |
778 | { | |
779 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); | |
780 | ||
781 | return (get_vma_private_data(vma) & flag) != 0; | |
782 | } | |
783 | ||
784 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ | |
785 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | |
786 | { | |
787 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); | |
788 | if (!(vma->vm_flags & VM_MAYSHARE)) | |
789 | vma->vm_private_data = (void *)0; | |
790 | } | |
791 | ||
792 | /* Returns true if the VMA has associated reserve pages */ | |
793 | static bool vma_has_reserves(struct vm_area_struct *vma, long chg) | |
794 | { | |
795 | if (vma->vm_flags & VM_NORESERVE) { | |
796 | /* | |
797 | * This address is already reserved by other process(chg == 0), | |
798 | * so, we should decrement reserved count. Without decrementing, | |
799 | * reserve count remains after releasing inode, because this | |
800 | * allocated page will go into page cache and is regarded as | |
801 | * coming from reserved pool in releasing step. Currently, we | |
802 | * don't have any other solution to deal with this situation | |
803 | * properly, so add work-around here. | |
804 | */ | |
805 | if (vma->vm_flags & VM_MAYSHARE && chg == 0) | |
806 | return true; | |
807 | else | |
808 | return false; | |
809 | } | |
810 | ||
811 | /* Shared mappings always use reserves */ | |
812 | if (vma->vm_flags & VM_MAYSHARE) { | |
813 | /* | |
814 | * We know VM_NORESERVE is not set. Therefore, there SHOULD | |
815 | * be a region map for all pages. The only situation where | |
816 | * there is no region map is if a hole was punched via | |
817 | * fallocate. In this case, there really are no reverves to | |
818 | * use. This situation is indicated if chg != 0. | |
819 | */ | |
820 | if (chg) | |
821 | return false; | |
822 | else | |
823 | return true; | |
824 | } | |
825 | ||
826 | /* | |
827 | * Only the process that called mmap() has reserves for | |
828 | * private mappings. | |
829 | */ | |
830 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | |
831 | /* | |
832 | * Like the shared case above, a hole punch or truncate | |
833 | * could have been performed on the private mapping. | |
834 | * Examine the value of chg to determine if reserves | |
835 | * actually exist or were previously consumed. | |
836 | * Very Subtle - The value of chg comes from a previous | |
837 | * call to vma_needs_reserves(). The reserve map for | |
838 | * private mappings has different (opposite) semantics | |
839 | * than that of shared mappings. vma_needs_reserves() | |
840 | * has already taken this difference in semantics into | |
841 | * account. Therefore, the meaning of chg is the same | |
842 | * as in the shared case above. Code could easily be | |
843 | * combined, but keeping it separate draws attention to | |
844 | * subtle differences. | |
845 | */ | |
846 | if (chg) | |
847 | return false; | |
848 | else | |
849 | return true; | |
850 | } | |
851 | ||
852 | return false; | |
853 | } | |
854 | ||
855 | static void enqueue_huge_page(struct hstate *h, struct page *page) | |
856 | { | |
857 | int nid = page_to_nid(page); | |
858 | list_move(&page->lru, &h->hugepage_freelists[nid]); | |
859 | h->free_huge_pages++; | |
860 | h->free_huge_pages_node[nid]++; | |
861 | } | |
862 | ||
863 | static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) | |
864 | { | |
865 | struct page *page; | |
866 | ||
867 | list_for_each_entry(page, &h->hugepage_freelists[nid], lru) | |
868 | if (!PageHWPoison(page)) | |
869 | break; | |
870 | /* | |
871 | * if 'non-isolated free hugepage' not found on the list, | |
872 | * the allocation fails. | |
873 | */ | |
874 | if (&h->hugepage_freelists[nid] == &page->lru) | |
875 | return NULL; | |
876 | list_move(&page->lru, &h->hugepage_activelist); | |
877 | set_page_refcounted(page); | |
878 | h->free_huge_pages--; | |
879 | h->free_huge_pages_node[nid]--; | |
880 | return page; | |
881 | } | |
882 | ||
883 | static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, | |
884 | nodemask_t *nmask) | |
885 | { | |
886 | unsigned int cpuset_mems_cookie; | |
887 | struct zonelist *zonelist; | |
888 | struct zone *zone; | |
889 | struct zoneref *z; | |
890 | int node = -1; | |
891 | ||
892 | zonelist = node_zonelist(nid, gfp_mask); | |
893 | ||
894 | retry_cpuset: | |
895 | cpuset_mems_cookie = read_mems_allowed_begin(); | |
896 | for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { | |
897 | struct page *page; | |
898 | ||
899 | if (!cpuset_zone_allowed(zone, gfp_mask)) | |
900 | continue; | |
901 | /* | |
902 | * no need to ask again on the same node. Pool is node rather than | |
903 | * zone aware | |
904 | */ | |
905 | if (zone_to_nid(zone) == node) | |
906 | continue; | |
907 | node = zone_to_nid(zone); | |
908 | ||
909 | page = dequeue_huge_page_node_exact(h, node); | |
910 | if (page) | |
911 | return page; | |
912 | } | |
913 | if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) | |
914 | goto retry_cpuset; | |
915 | ||
916 | return NULL; | |
917 | } | |
918 | ||
919 | /* Movability of hugepages depends on migration support. */ | |
920 | static inline gfp_t htlb_alloc_mask(struct hstate *h) | |
921 | { | |
922 | if (hugepage_migration_supported(h)) | |
923 | return GFP_HIGHUSER_MOVABLE; | |
924 | else | |
925 | return GFP_HIGHUSER; | |
926 | } | |
927 | ||
928 | static struct page *dequeue_huge_page_vma(struct hstate *h, | |
929 | struct vm_area_struct *vma, | |
930 | unsigned long address, int avoid_reserve, | |
931 | long chg) | |
932 | { | |
933 | struct page *page; | |
934 | struct mempolicy *mpol; | |
935 | gfp_t gfp_mask; | |
936 | nodemask_t *nodemask; | |
937 | int nid; | |
938 | ||
939 | /* | |
940 | * A child process with MAP_PRIVATE mappings created by their parent | |
941 | * have no page reserves. This check ensures that reservations are | |
942 | * not "stolen". The child may still get SIGKILLed | |
943 | */ | |
944 | if (!vma_has_reserves(vma, chg) && | |
945 | h->free_huge_pages - h->resv_huge_pages == 0) | |
946 | goto err; | |
947 | ||
948 | /* If reserves cannot be used, ensure enough pages are in the pool */ | |
949 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) | |
950 | goto err; | |
951 | ||
952 | gfp_mask = htlb_alloc_mask(h); | |
953 | nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); | |
954 | page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); | |
955 | if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { | |
956 | SetPagePrivate(page); | |
957 | h->resv_huge_pages--; | |
958 | } | |
959 | ||
960 | mpol_cond_put(mpol); | |
961 | return page; | |
962 | ||
963 | err: | |
964 | return NULL; | |
965 | } | |
966 | ||
967 | /* | |
968 | * common helper functions for hstate_next_node_to_{alloc|free}. | |
969 | * We may have allocated or freed a huge page based on a different | |
970 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might | |
971 | * be outside of *nodes_allowed. Ensure that we use an allowed | |
972 | * node for alloc or free. | |
973 | */ | |
974 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) | |
975 | { | |
976 | nid = next_node_in(nid, *nodes_allowed); | |
977 | VM_BUG_ON(nid >= MAX_NUMNODES); | |
978 | ||
979 | return nid; | |
980 | } | |
981 | ||
982 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | |
983 | { | |
984 | if (!node_isset(nid, *nodes_allowed)) | |
985 | nid = next_node_allowed(nid, nodes_allowed); | |
986 | return nid; | |
987 | } | |
988 | ||
989 | /* | |
990 | * returns the previously saved node ["this node"] from which to | |
991 | * allocate a persistent huge page for the pool and advance the | |
992 | * next node from which to allocate, handling wrap at end of node | |
993 | * mask. | |
994 | */ | |
995 | static int hstate_next_node_to_alloc(struct hstate *h, | |
996 | nodemask_t *nodes_allowed) | |
997 | { | |
998 | int nid; | |
999 | ||
1000 | VM_BUG_ON(!nodes_allowed); | |
1001 | ||
1002 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | |
1003 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | |
1004 | ||
1005 | return nid; | |
1006 | } | |
1007 | ||
1008 | /* | |
1009 | * helper for free_pool_huge_page() - return the previously saved | |
1010 | * node ["this node"] from which to free a huge page. Advance the | |
1011 | * next node id whether or not we find a free huge page to free so | |
1012 | * that the next attempt to free addresses the next node. | |
1013 | */ | |
1014 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | |
1015 | { | |
1016 | int nid; | |
1017 | ||
1018 | VM_BUG_ON(!nodes_allowed); | |
1019 | ||
1020 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); | |
1021 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); | |
1022 | ||
1023 | return nid; | |
1024 | } | |
1025 | ||
1026 | #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ | |
1027 | for (nr_nodes = nodes_weight(*mask); \ | |
1028 | nr_nodes > 0 && \ | |
1029 | ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ | |
1030 | nr_nodes--) | |
1031 | ||
1032 | #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ | |
1033 | for (nr_nodes = nodes_weight(*mask); \ | |
1034 | nr_nodes > 0 && \ | |
1035 | ((node = hstate_next_node_to_free(hs, mask)) || 1); \ | |
1036 | nr_nodes--) | |
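
/*
 * Illustrative example (not from the original source): with
 * nodes_allowed spanning nodes {0, 1, 2} and next_nid_to_alloc == 1,
 * for_each_node_mask_to_alloc() visits nodes 1, 2, 0 (at most
 * nodes_weight() iterations), advancing next_nid_to_alloc as it goes
 * so that successive allocations stay interleaved across nodes.
 */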
1037 | ||
1038 | #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE | |
1039 | static void destroy_compound_gigantic_page(struct page *page, | |
1040 | unsigned int order) | |
1041 | { | |
1042 | int i; | |
1043 | int nr_pages = 1 << order; | |
1044 | struct page *p = page + 1; | |
1045 | ||
1046 | atomic_set(compound_mapcount_ptr(page), 0); | |
1047 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | |
1048 | clear_compound_head(p); | |
1049 | set_page_refcounted(p); | |
1050 | } | |
1051 | ||
1052 | set_compound_order(page, 0); | |
1053 | __ClearPageHead(page); | |
1054 | } | |
1055 | ||
1056 | static void free_gigantic_page(struct page *page, unsigned int order) | |
1057 | { | |
1058 | free_contig_range(page_to_pfn(page), 1 << order); | |
1059 | } | |
1060 | ||
1061 | static int __alloc_gigantic_page(unsigned long start_pfn, | |
1062 | unsigned long nr_pages, gfp_t gfp_mask) | |
1063 | { | |
1064 | unsigned long end_pfn = start_pfn + nr_pages; | |
1065 | return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, | |
1066 | gfp_mask); | |
1067 | } | |
1068 | ||
1069 | static bool pfn_range_valid_gigantic(struct zone *z, | |
1070 | unsigned long start_pfn, unsigned long nr_pages) | |
1071 | { | |
1072 | unsigned long i, end_pfn = start_pfn + nr_pages; | |
1073 | struct page *page; | |
1074 | ||
1075 | for (i = start_pfn; i < end_pfn; i++) { | |
1076 | if (!pfn_valid(i)) | |
1077 | return false; | |
1078 | ||
1079 | page = pfn_to_page(i); | |
1080 | ||
1081 | if (page_zone(page) != z) | |
1082 | return false; | |
1083 | ||
1084 | if (PageReserved(page)) | |
1085 | return false; | |
1086 | ||
1087 | if (page_count(page) > 0) | |
1088 | return false; | |
1089 | ||
1090 | if (PageHuge(page)) | |
1091 | return false; | |
1092 | } | |
1093 | ||
1094 | return true; | |
1095 | } | |
1096 | ||
1097 | static bool zone_spans_last_pfn(const struct zone *zone, | |
1098 | unsigned long start_pfn, unsigned long nr_pages) | |
1099 | { | |
1100 | unsigned long last_pfn = start_pfn + nr_pages - 1; | |
1101 | return zone_spans_pfn(zone, last_pfn); | |
1102 | } | |
1103 | ||
1104 | static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, | |
1105 | int nid, nodemask_t *nodemask) | |
1106 | { | |
1107 | unsigned int order = huge_page_order(h); | |
1108 | unsigned long nr_pages = 1 << order; | |
1109 | unsigned long ret, pfn, flags; | |
1110 | struct zonelist *zonelist; | |
1111 | struct zone *zone; | |
1112 | struct zoneref *z; | |
1113 | ||
1114 | zonelist = node_zonelist(nid, gfp_mask); | |
1115 | for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { | |
1116 | spin_lock_irqsave(&zone->lock, flags); | |
1117 | ||
1118 | pfn = ALIGN(zone->zone_start_pfn, nr_pages); | |
1119 | while (zone_spans_last_pfn(zone, pfn, nr_pages)) { | |
1120 | if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { | |
1121 | /* | |
1122 | * We release the zone lock here because | |
1123 | * alloc_contig_range() will also lock the zone | |
1124 | * at some point. If there's an allocation | |
1125 | * spinning on this lock, it may win the race | |
1126 | * and cause alloc_contig_range() to fail... | |
1127 | */ | |
1128 | spin_unlock_irqrestore(&zone->lock, flags); | |
1129 | ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); | |
1130 | if (!ret) | |
1131 | return pfn_to_page(pfn); | |
1132 | spin_lock_irqsave(&zone->lock, flags); | |
1133 | } | |
1134 | pfn += nr_pages; | |
1135 | } | |
1136 | ||
1137 | spin_unlock_irqrestore(&zone->lock, flags); | |
1138 | } | |
1139 | ||
1140 | return NULL; | |
1141 | } | |
1142 | ||
1143 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); | |
1144 | static void prep_compound_gigantic_page(struct page *page, unsigned int order); | |
1145 | ||
1146 | #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ | |
1147 | static inline bool gigantic_page_supported(void) { return false; } | |
1148 | static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, | |
1149 | int nid, nodemask_t *nodemask) { return NULL; } | |
1150 | static inline void free_gigantic_page(struct page *page, unsigned int order) { } | |
1151 | static inline void destroy_compound_gigantic_page(struct page *page, | |
1152 | unsigned int order) { } | |
1153 | #endif | |
1154 | ||
1155 | static void update_and_free_page(struct hstate *h, struct page *page) | |
1156 | { | |
1157 | int i; | |
1158 | ||
1159 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) | |
1160 | return; | |
1161 | ||
1162 | h->nr_huge_pages--; | |
1163 | h->nr_huge_pages_node[page_to_nid(page)]--; | |
1164 | for (i = 0; i < pages_per_huge_page(h); i++) { | |
1165 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | | |
1166 | 1 << PG_referenced | 1 << PG_dirty | | |
1167 | 1 << PG_active | 1 << PG_private | | |
1168 | 1 << PG_writeback); | |
1169 | } | |
1170 | VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); | |
1171 | set_compound_page_dtor(page, NULL_COMPOUND_DTOR); | |
1172 | set_page_refcounted(page); | |
1173 | if (hstate_is_gigantic(h)) { | |
1174 | destroy_compound_gigantic_page(page, huge_page_order(h)); | |
1175 | free_gigantic_page(page, huge_page_order(h)); | |
1176 | } else { | |
1177 | __free_pages(page, huge_page_order(h)); | |
1178 | } | |
1179 | } | |
1180 | ||
1181 | struct hstate *size_to_hstate(unsigned long size) | |
1182 | { | |
1183 | struct hstate *h; | |
1184 | ||
1185 | for_each_hstate(h) { | |
1186 | if (huge_page_size(h) == size) | |
1187 | return h; | |
1188 | } | |
1189 | return NULL; | |
1190 | } | |
1191 | ||
1192 | /* | |
1193 | * Test to determine whether the hugepage is "active/in-use" (i.e. being linked | |
1194 | * to hstate->hugepage_activelist.) | |
1195 | * | |
1196 | * This function can be called for tail pages, but never returns true for them. | |
1197 | */ | |
1198 | bool page_huge_active(struct page *page) | |
1199 | { | |
1200 | VM_BUG_ON_PAGE(!PageHuge(page), page); | |
1201 | return PageHead(page) && PagePrivate(&page[1]); | |
1202 | } | |
1203 | ||
1204 | /* never called for tail page */ | |
1205 | static void set_page_huge_active(struct page *page) | |
1206 | { | |
1207 | VM_BUG_ON_PAGE(!PageHeadHuge(page), page); | |
1208 | SetPagePrivate(&page[1]); | |
1209 | } | |
1210 | ||
1211 | static void clear_page_huge_active(struct page *page) | |
1212 | { | |
1213 | VM_BUG_ON_PAGE(!PageHeadHuge(page), page); | |
1214 | ClearPagePrivate(&page[1]); | |
1215 | } | |
1216 | ||
1217 | /* | |
1218 | * Internal hugetlb specific page flag. Do not use outside of the hugetlb | |
1219 | * code | |
1220 | */ | |
1221 | static inline bool PageHugeTemporary(struct page *page) | |
1222 | { | |
1223 | if (!PageHuge(page)) | |
1224 | return false; | |
1225 | ||
1226 | return (unsigned long)page[2].mapping == -1U; | |
1227 | } | |
1228 | ||
1229 | static inline void SetPageHugeTemporary(struct page *page) | |
1230 | { | |
1231 | page[2].mapping = (void *)-1U; | |
1232 | } | |
1233 | ||
1234 | static inline void ClearPageHugeTemporary(struct page *page) | |
1235 | { | |
1236 | page[2].mapping = NULL; | |
1237 | } | |
1238 | ||
1239 | void free_huge_page(struct page *page) | |
1240 | { | |
1241 | /* | |
1242 | * Can't pass hstate in here because it is called from the | |
1243 | * compound page destructor. | |
1244 | */ | |
1245 | struct hstate *h = page_hstate(page); | |
1246 | int nid = page_to_nid(page); | |
1247 | struct hugepage_subpool *spool = | |
1248 | (struct hugepage_subpool *)page_private(page); | |
1249 | bool restore_reserve; | |
1250 | ||
1251 | set_page_private(page, 0); | |
1252 | page->mapping = NULL; | |
1253 | VM_BUG_ON_PAGE(page_count(page), page); | |
1254 | VM_BUG_ON_PAGE(page_mapcount(page), page); | |
1255 | restore_reserve = PagePrivate(page); | |
1256 | ClearPagePrivate(page); | |
1257 | ||
1258 | /* | |
1259 | * A return code of zero implies that the subpool will be under its | |
1260 | * minimum size if the reservation is not restored after page is free. | |
1261 | * Therefore, force restore_reserve operation. | |
1262 | */ | |
1263 | if (hugepage_subpool_put_pages(spool, 1) == 0) | |
1264 | restore_reserve = true; | |
1265 | ||
1266 | spin_lock(&hugetlb_lock); | |
1267 | clear_page_huge_active(page); | |
1268 | hugetlb_cgroup_uncharge_page(hstate_index(h), | |
1269 | pages_per_huge_page(h), page); | |
1270 | if (restore_reserve) | |
1271 | h->resv_huge_pages++; | |
1272 | ||
1273 | if (PageHugeTemporary(page)) { | |
1274 | list_del(&page->lru); | |
1275 | ClearPageHugeTemporary(page); | |
1276 | update_and_free_page(h, page); | |
1277 | } else if (h->surplus_huge_pages_node[nid]) { | |
1278 | /* remove the page from active list */ | |
1279 | list_del(&page->lru); | |
1280 | update_and_free_page(h, page); | |
1281 | h->surplus_huge_pages--; | |
1282 | h->surplus_huge_pages_node[nid]--; | |
1283 | } else { | |
1284 | arch_clear_hugepage_flags(page); | |
1285 | enqueue_huge_page(h, page); | |
1286 | } | |
1287 | spin_unlock(&hugetlb_lock); | |
1288 | } | |
1289 | ||
1290 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | |
1291 | { | |
1292 | INIT_LIST_HEAD(&page->lru); | |
1293 | set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); | |
1294 | spin_lock(&hugetlb_lock); | |
1295 | set_hugetlb_cgroup(page, NULL); | |
1296 | h->nr_huge_pages++; | |
1297 | h->nr_huge_pages_node[nid]++; | |
1298 | spin_unlock(&hugetlb_lock); | |
1299 | } | |
1300 | ||
1301 | static void prep_compound_gigantic_page(struct page *page, unsigned int order) | |
1302 | { | |
1303 | int i; | |
1304 | int nr_pages = 1 << order; | |
1305 | struct page *p = page + 1; | |
1306 | ||
1307 | /* we rely on prep_new_huge_page to set the destructor */ | |
1308 | set_compound_order(page, order); | |
1309 | __ClearPageReserved(page); | |
1310 | __SetPageHead(page); | |
1311 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | |
1312 | /* | |
1313 | * For gigantic hugepages allocated through bootmem at | |
1314 | * boot, it's safer to be consistent with the not-gigantic | |
1315 | * hugepages and clear the PG_reserved bit from all tail pages | |
1316 | * too. Otherwse drivers using get_user_pages() to access tail | |
1317 | * pages may get the reference counting wrong if they see | |
1318 | * PG_reserved set on a tail page (despite the head page not | |
1319 | * having PG_reserved set). Enforcing this consistency between | |
1320 | * head and tail pages allows drivers to optimize away a check | |
1321 | * on the head page when they need know if put_page() is needed | |
1322 | * after get_user_pages(). | |
1323 | */ | |
1324 | __ClearPageReserved(p); | |
1325 | set_page_count(p, 0); | |
1326 | set_compound_head(p, page); | |
1327 | } | |
1328 | atomic_set(compound_mapcount_ptr(page), -1); | |
1329 | } | |
1330 | ||
1331 | /* | |
1332 | * PageHuge() only returns true for hugetlbfs pages, but not for normal or | |
1333 | * transparent huge pages. See the PageTransHuge() documentation for more | |
1334 | * details. | |
1335 | */ | |
1336 | int PageHuge(struct page *page) | |
1337 | { | |
1338 | if (!PageCompound(page)) | |
1339 | return 0; | |
1340 | ||
1341 | page = compound_head(page); | |
1342 | return page[1].compound_dtor == HUGETLB_PAGE_DTOR; | |
1343 | } | |
1344 | EXPORT_SYMBOL_GPL(PageHuge); | |
1345 | ||
1346 | /* | |
1347 | * PageHeadHuge() only returns true for hugetlbfs head page, but not for | |
1348 | * normal or transparent huge pages. | |
1349 | */ | |
1350 | int PageHeadHuge(struct page *page_head) | |
1351 | { | |
1352 | if (!PageHead(page_head)) | |
1353 | return 0; | |
1354 | ||
1355 | return get_compound_page_dtor(page_head) == free_huge_page; | |
1356 | } | |
1357 | ||
1358 | pgoff_t __basepage_index(struct page *page) | |
1359 | { | |
1360 | struct page *page_head = compound_head(page); | |
1361 | pgoff_t index = page_index(page_head); | |
1362 | unsigned long compound_idx; | |
1363 | ||
1364 | if (!PageHuge(page_head)) | |
1365 | return page_index(page); | |
1366 | ||
1367 | if (compound_order(page_head) >= MAX_ORDER) | |
1368 | compound_idx = page_to_pfn(page) - page_to_pfn(page_head); | |
1369 | else | |
1370 | compound_idx = page - page_head; | |
1371 | ||
1372 | return (index << compound_order(page_head)) + compound_idx; | |
1373 | } | |
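
/*
 * Illustrative example (not from the original source): for a 2MB
 * compound page (order 9) whose head sits at page cache index 3, the
 * 10th tail page has compound_idx == 10 and a base page index of
 * (3 << 9) + 10 == 1546.
 */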
1374 | ||
1375 | static struct page *alloc_buddy_huge_page(struct hstate *h, | |
1376 | gfp_t gfp_mask, int nid, nodemask_t *nmask) | |
1377 | { | |
1378 | int order = huge_page_order(h); | |
1379 | struct page *page; | |
1380 | ||
1381 | gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; | |
1382 | if (nid == NUMA_NO_NODE) | |
1383 | nid = numa_mem_id(); | |
1384 | page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); | |
1385 | if (page) | |
1386 | __count_vm_event(HTLB_BUDDY_PGALLOC); | |
1387 | else | |
1388 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | |
1389 | ||
1390 | return page; | |
1391 | } | |
1392 | ||
1393 | /* | |
1394 | * Common helper to allocate a fresh hugetlb page. All specific allocators | |
1395 | * should use this function to get new hugetlb pages | |
1396 | */ | |
1397 | static struct page *alloc_fresh_huge_page(struct hstate *h, | |
1398 | gfp_t gfp_mask, int nid, nodemask_t *nmask) | |
1399 | { | |
1400 | struct page *page; | |
1401 | ||
1402 | if (hstate_is_gigantic(h)) | |
1403 | page = alloc_gigantic_page(h, gfp_mask, nid, nmask); | |
1404 | else | |
1405 | page = alloc_buddy_huge_page(h, gfp_mask, | |
1406 | nid, nmask); | |
1407 | if (!page) | |
1408 | return NULL; | |
1409 | ||
1410 | if (hstate_is_gigantic(h)) | |
1411 | prep_compound_gigantic_page(page, huge_page_order(h)); | |
1412 | prep_new_huge_page(h, page, page_to_nid(page)); | |
1413 | ||
1414 | return page; | |
1415 | } | |
1416 | ||
1417 | /* | |
1418 | * Allocates a fresh page to the hugetlb allocator pool in the node interleaved | |
1419 | * manner. | |
1420 | */ | |
1421 | static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | |
1422 | { | |
1423 | struct page *page; | |
1424 | int nr_nodes, node; | |
1425 | gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; | |
1426 | ||
1427 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { | |
1428 | page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); | |
1429 | if (page) | |
1430 | break; | |
1431 | } | |
1432 | ||
1433 | if (!page) | |
1434 | return 0; | |
1435 | ||
1436 | put_page(page); /* free it into the hugepage allocator */ | |
1437 | ||
1438 | return 1; | |
1439 | } | |
1440 | ||
1441 | /* | |
1442 | * Free huge page from pool from next node to free. | |
1443 | * Attempt to keep persistent huge pages more or less | |
1444 | * balanced over allowed nodes. | |
1445 | * Called with hugetlb_lock locked. | |
1446 | */ | |
1447 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, | |
1448 | bool acct_surplus) | |
1449 | { | |
1450 | int nr_nodes, node; | |
1451 | int ret = 0; | |
1452 | ||
1453 | for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { | |
1454 | /* | |
1455 | * If we're returning unused surplus pages, only examine | |
1456 | * nodes with surplus pages. | |
1457 | */ | |
1458 | if ((!acct_surplus || h->surplus_huge_pages_node[node]) && | |
1459 | !list_empty(&h->hugepage_freelists[node])) { | |
1460 | struct page *page = | |
1461 | list_entry(h->hugepage_freelists[node].next, | |
1462 | struct page, lru); | |
1463 | list_del(&page->lru); | |
1464 | h->free_huge_pages--; | |
1465 | h->free_huge_pages_node[node]--; | |
1466 | if (acct_surplus) { | |
1467 | h->surplus_huge_pages--; | |
1468 | h->surplus_huge_pages_node[node]--; | |
1469 | } | |
1470 | update_and_free_page(h, page); | |
1471 | ret = 1; | |
1472 | break; | |
1473 | } | |
1474 | } | |
1475 | ||
1476 | return ret; | |
1477 | } | |
1478 | ||
1479 | /* | |
1480 | * Dissolve a given free hugepage into free buddy pages. This function does | |
1481 | * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the | |
1482 | * number of free hugepages would be reduced below the number of reserved | |
1483 | * hugepages. | |
1484 | */ | |
1485 | int dissolve_free_huge_page(struct page *page) | |
1486 | { | |
1487 | int rc = 0; | |
1488 | ||
1489 | spin_lock(&hugetlb_lock); | |
1490 | if (PageHuge(page) && !page_count(page)) { | |
1491 | struct page *head = compound_head(page); | |
1492 | struct hstate *h = page_hstate(head); | |
1493 | int nid = page_to_nid(head); | |
1494 | if (h->free_huge_pages - h->resv_huge_pages == 0) { | |
1495 | rc = -EBUSY; | |
1496 | goto out; | |
1497 | } | |
1498 | /* | |
1499 | * Move PageHWPoison flag from head page to the raw error page, | |
1500 | * which makes any subpages rather than the error page reusable. | |
1501 | */ | |
1502 | if (PageHWPoison(head) && page != head) { | |
1503 | SetPageHWPoison(page); | |
1504 | ClearPageHWPoison(head); | |
1505 | } | |
1506 | list_del(&head->lru); | |
1507 | h->free_huge_pages--; | |
1508 | h->free_huge_pages_node[nid]--; | |
1509 | h->max_huge_pages--; | |
1510 | update_and_free_page(h, head); | |
1511 | } | |
1512 | out: | |
1513 | spin_unlock(&hugetlb_lock); | |
1514 | return rc; | |
1515 | } | |
1516 | ||
1517 | /* | |
1518 | * Dissolve free hugepages in a given pfn range. Used by memory hotplug to | |
1519 | * make specified memory blocks removable from the system. | |
1520 | * Note that this will dissolve a free gigantic hugepage completely, if any | |
1521 | * part of it lies within the given range. | |
1522 | * Also note that if dissolve_free_huge_page() returns with an error, all | |
1523 | * free hugepages that were dissolved before that error are lost. | |
1524 | */ | |
1525 | int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) | |
1526 | { | |
1527 | unsigned long pfn; | |
1528 | struct page *page; | |
1529 | int rc = 0; | |
1530 | ||
1531 | if (!hugepages_supported()) | |
1532 | return rc; | |
1533 | ||
1534 | for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { | |
1535 | page = pfn_to_page(pfn); | |
1536 | if (PageHuge(page) && !page_count(page)) { | |
1537 | rc = dissolve_free_huge_page(page); | |
1538 | if (rc) | |
1539 | break; | |
1540 | } | |
1541 | } | |
1542 | ||
1543 | return rc; | |
1544 | } | |
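
/*
 * Note (illustrative, not from the original source): the scan above
 * steps by 1 << minimum_order pfns, the smallest supported huge page
 * size, which is enough to land on the head page of every huge page
 * in a suitably aligned [start_pfn, end_pfn) range without checking
 * each base page individually.
 */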
1545 | ||
1546 | /* | |
1547 | * Allocates a fresh surplus page from the page allocator. | |
1548 | */ | |
1549 | static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, | |
1550 | int nid, nodemask_t *nmask) | |
1551 | { | |
1552 | struct page *page = NULL; | |
1553 | ||
1554 | if (hstate_is_gigantic(h)) | |
1555 | return NULL; | |
1556 | ||
1557 | spin_lock(&hugetlb_lock); | |
1558 | if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) | |
1559 | goto out_unlock; | |
1560 | spin_unlock(&hugetlb_lock); | |
1561 | ||
1562 | page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); | |
1563 | if (!page) | |
1564 | return NULL; | |
1565 | ||
1566 | spin_lock(&hugetlb_lock); | |
1567 | /* | |
1568 | * We could have raced with the pool size change. | |
1569 | * Double check that and simply deallocate the new page | |
1570 | * if we would end up overcommiting the surpluses. Abuse | |
1571 | * temporary page to workaround the nasty free_huge_page | |
1572 | * codeflow | |
1573 | */ | |
1574 | if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { | |
1575 | SetPageHugeTemporary(page); | |
1576 | put_page(page); | |
1577 | page = NULL; | |
1578 | } else { | |
1579 | h->surplus_huge_pages++; | |
1580 | h->surplus_huge_pages_node[page_to_nid(page)]++; | |
1581 | } | |
1582 | ||
1583 | out_unlock: | |
1584 | spin_unlock(&hugetlb_lock); | |
1585 | ||
1586 | return page; | |
1587 | } | |
1588 | ||
1589 | static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, | |
1590 | int nid, nodemask_t *nmask) | |
1591 | { | |
1592 | struct page *page; | |
1593 | ||
1594 | if (hstate_is_gigantic(h)) | |
1595 | return NULL; | |
1596 | ||
1597 | page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); | |
1598 | if (!page) | |
1599 | return NULL; | |
1600 | ||
1601 | /* | |
1602 | * We do not account these pages as surplus because they are only | |
1603 | * temporary and will be released properly on the last reference. | |
1604 | */ | |
1605 | SetPageHugeTemporary(page); | |
1606 | ||
1607 | return page; | |
1608 | } | |
1609 | ||
1610 | /* | |
1611 | * Use the VMA's mpolicy to allocate a huge page from the buddy. | |
1612 | */ | |
1613 | static | |
1614 | struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, | |
1615 | struct vm_area_struct *vma, unsigned long addr) | |
1616 | { | |
1617 | struct page *page; | |
1618 | struct mempolicy *mpol; | |
1619 | gfp_t gfp_mask = htlb_alloc_mask(h); | |
1620 | int nid; | |
1621 | nodemask_t *nodemask; | |
1622 | ||
1623 | nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); | |
1624 | page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); | |
1625 | mpol_cond_put(mpol); | |
1626 | ||
1627 | return page; | |
1628 | } | |
1629 | ||
1630 | /* page migration callback function */ | |
1631 | struct page *alloc_huge_page_node(struct hstate *h, int nid) | |
1632 | { | |
1633 | gfp_t gfp_mask = htlb_alloc_mask(h); | |
1634 | struct page *page = NULL; | |
1635 | ||
1636 | if (nid != NUMA_NO_NODE) | |
1637 | gfp_mask |= __GFP_THISNODE; | |
1638 | ||
1639 | spin_lock(&hugetlb_lock); | |
1640 | if (h->free_huge_pages - h->resv_huge_pages > 0) | |
1641 | page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL); | |
1642 | spin_unlock(&hugetlb_lock); | |
1643 | ||
1644 | if (!page) | |
1645 | page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); | |
1646 | ||
1647 | return page; | |
1648 | } | |
1649 | ||
1650 | /* page migration callback function */ | |
1651 | struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, | |
1652 | nodemask_t *nmask) | |
1653 | { | |
1654 | gfp_t gfp_mask = htlb_alloc_mask(h); | |
1655 | ||
1656 | spin_lock(&hugetlb_lock); | |
1657 | if (h->free_huge_pages - h->resv_huge_pages > 0) { | |
1658 | struct page *page; | |
1659 | ||
1660 | page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); | |
1661 | if (page) { | |
1662 | spin_unlock(&hugetlb_lock); | |
1663 | return page; | |
1664 | } | |
1665 | } | |
1666 | spin_unlock(&hugetlb_lock); | |
1667 | ||
1668 | return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); | |
1669 | } | |
1670 | ||
1671 | /* mempolicy aware migration callback */ | |
1672 | struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, | |
1673 | unsigned long address) | |
1674 | { | |
1675 | struct mempolicy *mpol; | |
1676 | nodemask_t *nodemask; | |
1677 | struct page *page; | |
1678 | gfp_t gfp_mask; | |
1679 | int node; | |
1680 | ||
1681 | gfp_mask = htlb_alloc_mask(h); | |
1682 | node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); | |
1683 | page = alloc_huge_page_nodemask(h, node, nodemask); | |
1684 | mpol_cond_put(mpol); | |
1685 | ||
1686 | return page; | |
1687 | } | |
1688 | ||
1689 | /* | |
1690 | * Increase the hugetlb pool such that it can accommodate a reservation | |
1691 | * of size 'delta'. | |
1692 | */ | |
1693 | static int gather_surplus_pages(struct hstate *h, int delta) | |
1694 | { | |
1695 | struct list_head surplus_list; | |
1696 | struct page *page, *tmp; | |
1697 | int ret, i; | |
1698 | int needed, allocated; | |
1699 | bool alloc_ok = true; | |
1700 | ||
1701 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; | |
1702 | if (needed <= 0) { | |
1703 | h->resv_huge_pages += delta; | |
1704 | return 0; | |
1705 | } | |
1706 | ||
1707 | allocated = 0; | |
1708 | INIT_LIST_HEAD(&surplus_list); | |
1709 | ||
1710 | ret = -ENOMEM; | |
1711 | retry: | |
1712 | spin_unlock(&hugetlb_lock); | |
1713 | for (i = 0; i < needed; i++) { | |
1714 | page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), | |
1715 | NUMA_NO_NODE, NULL); | |
1716 | if (!page) { | |
1717 | alloc_ok = false; | |
1718 | break; | |
1719 | } | |
1720 | list_add(&page->lru, &surplus_list); | |
1721 | cond_resched(); | |
1722 | } | |
1723 | allocated += i; | |
1724 | ||
1725 | /* | |
1726 | * After retaking hugetlb_lock, we need to recalculate 'needed' | |
1727 | * because either resv_huge_pages or free_huge_pages may have changed. | |
1728 | */ | |
1729 | spin_lock(&hugetlb_lock); | |
1730 | needed = (h->resv_huge_pages + delta) - | |
1731 | (h->free_huge_pages + allocated); | |
1732 | if (needed > 0) { | |
1733 | if (alloc_ok) | |
1734 | goto retry; | |
1735 | /* | |
1736 | * We were not able to allocate enough pages to | |
1737 | * satisfy the entire reservation so we free what | |
1738 | * we've allocated so far. | |
1739 | */ | |
1740 | goto free; | |
1741 | } | |
1742 | /* | |
1743 | * The surplus_list now contains _at_least_ the number of extra pages | |
1744 | * needed to accommodate the reservation. Add the appropriate number | |
1745 | * of pages to the hugetlb pool and free the extras back to the buddy | |
1746 | * allocator. Commit the entire reservation here to prevent another | |
1747 | * process from stealing the pages as they are added to the pool but | |
1748 | * before they are reserved. | |
1749 | */ | |
1750 | needed += allocated; | |
1751 | h->resv_huge_pages += delta; | |
1752 | ret = 0; | |
1753 | ||
1754 | /* Free the needed pages to the hugetlb pool */ | |
1755 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | |
1756 | if ((--needed) < 0) | |
1757 | break; | |
1758 | /* | |
1759 | * This page is now managed by the hugetlb allocator and has | |
1760 | * no users -- drop the buddy allocator's reference. | |
1761 | */ | |
1762 | put_page_testzero(page); | |
1763 | VM_BUG_ON_PAGE(page_count(page), page); | |
1764 | enqueue_huge_page(h, page); | |
1765 | } | |
1766 | free: | |
1767 | spin_unlock(&hugetlb_lock); | |
1768 | ||
1769 | /* Free unnecessary surplus pages to the buddy allocator */ | |
1770 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) | |
1771 | put_page(page); | |
1772 | spin_lock(&hugetlb_lock); | |
1773 | ||
1774 | return ret; | |
1775 | } | |
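| /* | |
| * Worked example for gather_surplus_pages() (illustrative numbers only): | |
| * with resv_huge_pages = 10, free_huge_pages = 8 and delta = 4, | |
| * needed = (10 + 4) - 8 = 6, so six surplus pages are allocated with the | |
| * lock dropped. If another thread freed two pages back to the pool in | |
| * the meantime, the recheck gives needed = (10 + 4) - (10 + 6) = -2; | |
| * after needed += allocated it becomes 4, so four of the new pages are | |
| * enqueued in the pool and the remaining two go back to the buddy | |
| * allocator. | |
| */ | |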
1776 | ||
1777 | /* | |
1778 | * This routine has two main purposes: | |
1779 | * 1) Decrement the reservation count (resv_huge_pages) by the value passed | |
1780 | * in unused_resv_pages. This corresponds to the prior adjustments made | |
1781 | * to the associated reservation map. | |
1782 | * 2) Free any unused surplus pages that may have been allocated to satisfy | |
1783 | * the reservation. As many as unused_resv_pages may be freed. | |
1784 | * | |
1785 | * Called with hugetlb_lock held. However, the lock could be dropped (and | |
1786 | * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, | |
1787 | * we must make sure nobody else can claim pages we are in the process of | |
1788 | * freeing. Do this by ensuring resv_huge_pages is always greater than the | |
1789 | * number of huge pages we plan to free when dropping the lock. | |
1790 | */ | |
1791 | static void return_unused_surplus_pages(struct hstate *h, | |
1792 | unsigned long unused_resv_pages) | |
1793 | { | |
1794 | unsigned long nr_pages; | |
1795 | ||
1796 | /* Cannot return gigantic pages currently */ | |
1797 | if (hstate_is_gigantic(h)) | |
1798 | goto out; | |
1799 | ||
1800 | /* | |
1801 | * Part (or even all) of the reservation could have been backed | |
1802 | * by pre-allocated pages. Only free surplus pages. | |
1803 | */ | |
1804 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | |
1805 | ||
1806 | /* | |
1807 | * We want to release as many surplus pages as possible, spread | |
1808 | * evenly across all nodes with memory. Iterate across these nodes | |
1809 | * until we can no longer free unreserved surplus pages. This occurs | |
1810 | * when the nodes with surplus pages have no free pages. | |
1811 | * free_pool_huge_page() will balance the the freed pages across the | |
1812 | * on-line nodes with memory and will handle the hstate accounting. | |
1813 | * | |
1814 | * Note that we decrement resv_huge_pages as we free the pages. If | |
1815 | * we drop the lock, resv_huge_pages will still be sufficiently large | |
1816 | * to cover subsequent pages we may free. | |
1817 | */ | |
1818 | while (nr_pages--) { | |
1819 | h->resv_huge_pages--; | |
1820 | unused_resv_pages--; | |
1821 | if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) | |
1822 | goto out; | |
1823 | cond_resched_lock(&hugetlb_lock); | |
1824 | } | |
1825 | ||
1826 | out: | |
1827 | /* Fully uncommit the reservation */ | |
1828 | h->resv_huge_pages -= unused_resv_pages; | |
1829 | } | |
1830 | ||
1831 | ||
1832 | /* | |
1833 | * vma_needs_reservation, vma_commit_reservation and vma_end_reservation | |
1834 | * are used by the huge page allocation routines to manage reservations. | |
1835 | * | |
1836 | * vma_needs_reservation is called to determine if the huge page at addr | |
1837 | * within the vma has an associated reservation. If a reservation is | |
1838 | * needed, the value 1 is returned. The caller is then responsible for | |
1839 | * managing the global reservation and subpool usage counts. After | |
1840 | * the huge page has been allocated, vma_commit_reservation is called | |
1841 | * to add the page to the reservation map. If the page allocation fails, | |
1842 | * the reservation must be ended instead of committed. vma_end_reservation | |
1843 | * is called in such cases. | |
1844 | * | |
1845 | * In the normal case, vma_commit_reservation returns the same value | |
1846 | * as the preceding vma_needs_reservation call. The only time this | |
1847 | * is not the case is if a reserve map was changed between calls. It | |
1848 | * is the responsibility of the caller to notice the difference and | |
1849 | * take appropriate action. | |
1850 | * | |
1851 | * vma_add_reservation is used in error paths where a reservation must | |
1852 | * be restored when a newly allocated huge page must be freed. It is | |
1853 | * to be called after calling vma_needs_reservation to determine if a | |
1854 | * reservation exists. | |
1855 | */ | |
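| /* | |
| * Illustrative call sequence (a sketch of the protocol described above, | |
| * not an additional interface): a typical allocation path does roughly | |
| * | |
| *	chg = vma_needs_reservation(h, vma, addr); | |
| *	if (chg < 0) | |
| *		return ERR_PTR(-ENOMEM); | |
| *	page = <allocate huge page>; | |
| *	if (!page) { | |
| *		vma_end_reservation(h, vma, addr); | |
| *		return ERR_PTR(-ENOSPC); | |
| *	} | |
| *	vma_commit_reservation(h, vma, addr); | |
| * | |
| * alloc_huge_page() below follows this pattern, interleaved with the | |
| * subpool and hugetlb_cgroup accounting. | |
| */ | |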
1856 | enum vma_resv_mode { | |
1857 | VMA_NEEDS_RESV, | |
1858 | VMA_COMMIT_RESV, | |
1859 | VMA_END_RESV, | |
1860 | VMA_ADD_RESV, | |
1861 | }; | |
1862 | static long __vma_reservation_common(struct hstate *h, | |
1863 | struct vm_area_struct *vma, unsigned long addr, | |
1864 | enum vma_resv_mode mode) | |
1865 | { | |
1866 | struct resv_map *resv; | |
1867 | pgoff_t idx; | |
1868 | long ret; | |
1869 | ||
1870 | resv = vma_resv_map(vma); | |
1871 | if (!resv) | |
1872 | return 1; | |
1873 | ||
1874 | idx = vma_hugecache_offset(h, vma, addr); | |
1875 | switch (mode) { | |
1876 | case VMA_NEEDS_RESV: | |
1877 | ret = region_chg(resv, idx, idx + 1); | |
1878 | break; | |
1879 | case VMA_COMMIT_RESV: | |
1880 | ret = region_add(resv, idx, idx + 1); | |
1881 | break; | |
1882 | case VMA_END_RESV: | |
1883 | region_abort(resv, idx, idx + 1); | |
1884 | ret = 0; | |
1885 | break; | |
1886 | case VMA_ADD_RESV: | |
1887 | if (vma->vm_flags & VM_MAYSHARE) | |
1888 | ret = region_add(resv, idx, idx + 1); | |
1889 | else { | |
1890 | region_abort(resv, idx, idx + 1); | |
1891 | ret = region_del(resv, idx, idx + 1); | |
1892 | } | |
1893 | break; | |
1894 | default: | |
1895 | BUG(); | |
1896 | } | |
1897 | ||
1898 | if (vma->vm_flags & VM_MAYSHARE) | |
1899 | return ret; | |
1900 | else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { | |
1901 | /* | |
1902 | * In most cases, reserves always exist for private mappings. | |
1903 | * However, a file associated with the mapping could have been | |
1904 | * hole punched or truncated after reserves were consumed. A | |
1905 | * subsequent fault on such a range will not use reserves. | |
1906 | * Subtle - The reserve map for private mappings has the | |
1907 | * opposite meaning than that of shared mappings. If NO | |
1908 | * entry is in the reserve map, it means a reservation exists. | |
1909 | * If an entry exists in the reserve map, it means the | |
1910 | * reservation has already been consumed. As a result, the | |
1911 | * return value of this routine is the opposite of the | |
1912 | * value returned from reserve map manipulation routines above. | |
1913 | */ | |
1914 | if (ret) | |
1915 | return 0; | |
1916 | else | |
1917 | return 1; | |
1918 | } | |
1919 | else | |
1920 | return ret < 0 ? ret : 0; | |
1921 | } | |
1922 | ||
1923 | static long vma_needs_reservation(struct hstate *h, | |
1924 | struct vm_area_struct *vma, unsigned long addr) | |
1925 | { | |
1926 | return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); | |
1927 | } | |
1928 | ||
1929 | static long vma_commit_reservation(struct hstate *h, | |
1930 | struct vm_area_struct *vma, unsigned long addr) | |
1931 | { | |
1932 | return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); | |
1933 | } | |
1934 | ||
1935 | static void vma_end_reservation(struct hstate *h, | |
1936 | struct vm_area_struct *vma, unsigned long addr) | |
1937 | { | |
1938 | (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); | |
1939 | } | |
1940 | ||
1941 | static long vma_add_reservation(struct hstate *h, | |
1942 | struct vm_area_struct *vma, unsigned long addr) | |
1943 | { | |
1944 | return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); | |
1945 | } | |
1946 | ||
1947 | /* | |
1948 | * This routine is called to restore a reservation on error paths. In the | |
1949 | * specific error paths, a huge page was allocated (via alloc_huge_page) | |
1950 | * and is about to be freed. If a reservation for the page existed, | |
1951 | * alloc_huge_page would have consumed the reservation and set PagePrivate | |
1952 | * in the newly allocated page. When the page is freed via free_huge_page, | |
1953 | * the global reservation count will be incremented if PagePrivate is set. | |
1954 | * However, free_huge_page can not adjust the reserve map. Adjust the | |
1955 | * reserve map here to be consistent with global reserve count adjustments | |
1956 | * to be made by free_huge_page. | |
1957 | */ | |
1958 | static void restore_reserve_on_error(struct hstate *h, | |
1959 | struct vm_area_struct *vma, unsigned long address, | |
1960 | struct page *page) | |
1961 | { | |
1962 | if (unlikely(PagePrivate(page))) { | |
1963 | long rc = vma_needs_reservation(h, vma, address); | |
1964 | ||
1965 | if (unlikely(rc < 0)) { | |
1966 | /* | |
1967 | * Rare out of memory condition in reserve map | |
1968 | * manipulation. Clear PagePrivate so that | |
1969 | * global reserve count will not be incremented | |
1970 | * by free_huge_page. This will make it appear | |
1971 | * as though the reservation for this page was | |
1972 | * consumed. This may prevent the task from | |
1973 | * faulting in the page at a later time. This | |
1974 | * is better than inconsistent global huge page | |
1975 | * accounting of reserve counts. | |
1976 | */ | |
1977 | ClearPagePrivate(page); | |
1978 | } else if (rc) { | |
1979 | rc = vma_add_reservation(h, vma, address); | |
1980 | if (unlikely(rc < 0)) | |
1981 | /* | |
1982 | * See above comment about rare out of | |
1983 | * memory condition. | |
1984 | */ | |
1985 | ClearPagePrivate(page); | |
1986 | } else | |
1987 | vma_end_reservation(h, vma, address); | |
1988 | } | |
1989 | } | |
1990 | ||
1991 | struct page *alloc_huge_page(struct vm_area_struct *vma, | |
1992 | unsigned long addr, int avoid_reserve) | |
1993 | { | |
1994 | struct hugepage_subpool *spool = subpool_vma(vma); | |
1995 | struct hstate *h = hstate_vma(vma); | |
1996 | struct page *page; | |
1997 | long map_chg, map_commit; | |
1998 | long gbl_chg; | |
1999 | int ret, idx; | |
2000 | struct hugetlb_cgroup *h_cg; | |
2001 | ||
2002 | idx = hstate_index(h); | |
2003 | /* | |
2004 | * Examine the region/reserve map to determine if the process | |
2005 | * has a reservation for the page to be allocated. A return | |
2006 | * code of zero indicates a reservation exists (no change). | |
2007 | */ | |
2008 | map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); | |
2009 | if (map_chg < 0) | |
2010 | return ERR_PTR(-ENOMEM); | |
2011 | ||
2012 | /* | |
2013 | * Processes that did not create the mapping will have no | |
2014 | * reserves as indicated by the region/reserve map. Check | |
2015 | * that the allocation will not exceed the subpool limit. | |
2016 | * Allocations for MAP_NORESERVE mappings also need to be | |
2017 | * checked against any subpool limit. | |
2018 | */ | |
2019 | if (map_chg || avoid_reserve) { | |
2020 | gbl_chg = hugepage_subpool_get_pages(spool, 1); | |
2021 | if (gbl_chg < 0) { | |
2022 | vma_end_reservation(h, vma, addr); | |
2023 | return ERR_PTR(-ENOSPC); | |
2024 | } | |
2025 | ||
2026 | /* | |
2027 | * Even though there was no reservation in the region/reserve | |
2028 | * map, there could be reservations associated with the | |
2029 | * subpool that can be used. This would be indicated if the | |
2030 | * return value of hugepage_subpool_get_pages() is zero. | |
2031 | * However, if avoid_reserve is specified we still avoid even | |
2032 | * the subpool reservations. | |
2033 | */ | |
2034 | if (avoid_reserve) | |
2035 | gbl_chg = 1; | |
2036 | } | |
2037 | ||
2038 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | |
2039 | if (ret) | |
2040 | goto out_subpool_put; | |
2041 | ||
2042 | spin_lock(&hugetlb_lock); | |
2043 | /* | |
2044 | * gbl_chg is passed to indicate whether or not a page must be taken | |
2045 | * from the global free pool (global change). gbl_chg == 0 indicates | |
2046 | * a reservation exists for the allocation. | |
2047 | */ | |
2048 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); | |
2049 | if (!page) { | |
2050 | spin_unlock(&hugetlb_lock); | |
2051 | page = alloc_buddy_huge_page_with_mpol(h, vma, addr); | |
2052 | if (!page) | |
2053 | goto out_uncharge_cgroup; | |
2054 | if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { | |
2055 | SetPagePrivate(page); | |
2056 | h->resv_huge_pages--; | |
2057 | } | |
2058 | spin_lock(&hugetlb_lock); | |
2059 | list_move(&page->lru, &h->hugepage_activelist); | |
2060 | /* Fall through */ | |
2061 | } | |
2062 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); | |
2063 | spin_unlock(&hugetlb_lock); | |
2064 | ||
2065 | set_page_private(page, (unsigned long)spool); | |
2066 | ||
2067 | map_commit = vma_commit_reservation(h, vma, addr); | |
2068 | if (unlikely(map_chg > map_commit)) { | |
2069 | /* | |
2070 | * The page was added to the reservation map between | |
2071 | * vma_needs_reservation and vma_commit_reservation. | |
2072 | * This indicates a race with hugetlb_reserve_pages. | |
2073 | * Adjust for the subpool count incremented above AND | |
2074 | * in hugetlb_reserve_pages for the same page. Also, | |
2075 | * the reservation count added in hugetlb_reserve_pages | |
2076 | * no longer applies. | |
2077 | */ | |
2078 | long rsv_adjust; | |
2079 | ||
2080 | rsv_adjust = hugepage_subpool_put_pages(spool, 1); | |
2081 | hugetlb_acct_memory(h, -rsv_adjust); | |
2082 | } | |
2083 | return page; | |
2084 | ||
2085 | out_uncharge_cgroup: | |
2086 | hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); | |
2087 | out_subpool_put: | |
2088 | if (map_chg || avoid_reserve) | |
2089 | hugepage_subpool_put_pages(spool, 1); | |
2090 | vma_end_reservation(h, vma, addr); | |
2091 | return ERR_PTR(-ENOSPC); | |
2092 | } | |
2093 | ||
2094 | int alloc_bootmem_huge_page(struct hstate *h) | |
2095 | __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); | |
2096 | int __alloc_bootmem_huge_page(struct hstate *h) | |
2097 | { | |
2098 | struct huge_bootmem_page *m; | |
2099 | int nr_nodes, node; | |
2100 | ||
2101 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { | |
2102 | void *addr; | |
2103 | ||
2104 | addr = memblock_virt_alloc_try_nid_nopanic( | |
2105 | huge_page_size(h), huge_page_size(h), | |
2106 | 0, BOOTMEM_ALLOC_ACCESSIBLE, node); | |
2107 | if (addr) { | |
2108 | /* | |
2109 | * Use the beginning of the huge page to store the | |
2110 | * huge_bootmem_page struct (until gather_bootmem_prealloc() | |
2111 | * puts them into the mem_map). | |
2112 | */ | |
2113 | m = addr; | |
2114 | goto found; | |
2115 | } | |
2116 | } | |
2117 | return 0; | |
2118 | ||
2119 | found: | |
2120 | BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); | |
2121 | /* Put them into a private list first because mem_map is not up yet */ | |
2122 | list_add(&m->list, &huge_boot_pages); | |
2123 | m->hstate = h; | |
2124 | return 1; | |
2125 | } | |
2126 | ||
2127 | static void __init prep_compound_huge_page(struct page *page, | |
2128 | unsigned int order) | |
2129 | { | |
2130 | if (unlikely(order > (MAX_ORDER - 1))) | |
2131 | prep_compound_gigantic_page(page, order); | |
2132 | else | |
2133 | prep_compound_page(page, order); | |
2134 | } | |
2135 | ||
2136 | /* Put bootmem huge pages into the standard lists after mem_map is up */ | |
2137 | static void __init gather_bootmem_prealloc(void) | |
2138 | { | |
2139 | struct huge_bootmem_page *m; | |
2140 | ||
2141 | list_for_each_entry(m, &huge_boot_pages, list) { | |
2142 | struct hstate *h = m->hstate; | |
2143 | struct page *page; | |
2144 | ||
2145 | #ifdef CONFIG_HIGHMEM | |
2146 | page = pfn_to_page(m->phys >> PAGE_SHIFT); | |
2147 | memblock_free_late(__pa(m), | |
2148 | sizeof(struct huge_bootmem_page)); | |
2149 | #else | |
2150 | page = virt_to_page(m); | |
2151 | #endif | |
2152 | WARN_ON(page_count(page) != 1); | |
2153 | prep_compound_huge_page(page, h->order); | |
2154 | WARN_ON(PageReserved(page)); | |
2155 | prep_new_huge_page(h, page, page_to_nid(page)); | |
2156 | put_page(page); /* free it into the hugepage allocator */ | |
2157 | ||
2158 | /* | |
2159 | * If we had gigantic hugepages allocated at boot time, we need | |
2160 | * to restore the 'stolen' pages to totalram_pages in order to | |
2161 | * fix confusing memory reports from free(1) and other | |
2162 | * side effects, such as CommitLimit going negative. | |
2163 | */ | |
2164 | if (hstate_is_gigantic(h)) | |
2165 | adjust_managed_page_count(page, 1 << h->order); | |
2166 | cond_resched(); | |
2167 | } | |
2168 | } | |
2169 | ||
2170 | static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |
2171 | { | |
2172 | unsigned long i; | |
2173 | ||
2174 | for (i = 0; i < h->max_huge_pages; ++i) { | |
2175 | if (hstate_is_gigantic(h)) { | |
2176 | if (!alloc_bootmem_huge_page(h)) | |
2177 | break; | |
2178 | } else if (!alloc_pool_huge_page(h, | |
2179 | &node_states[N_MEMORY])) | |
2180 | break; | |
2181 | cond_resched(); | |
2182 | } | |
2183 | if (i < h->max_huge_pages) { | |
2184 | char buf[32]; | |
2185 | ||
2186 | string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); | |
2187 | pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", | |
2188 | h->max_huge_pages, buf, i); | |
2189 | h->max_huge_pages = i; | |
2190 | } | |
2191 | } | |
2192 | ||
2193 | static void __init hugetlb_init_hstates(void) | |
2194 | { | |
2195 | struct hstate *h; | |
2196 | ||
2197 | for_each_hstate(h) { | |
2198 | if (minimum_order > huge_page_order(h)) | |
2199 | minimum_order = huge_page_order(h); | |
2200 | ||
2201 | /* oversize hugepages were init'ed in early boot */ | |
2202 | if (!hstate_is_gigantic(h)) | |
2203 | hugetlb_hstate_alloc_pages(h); | |
2204 | } | |
2205 | VM_BUG_ON(minimum_order == UINT_MAX); | |
2206 | } | |
2207 | ||
2208 | static void __init report_hugepages(void) | |
2209 | { | |
2210 | struct hstate *h; | |
2211 | ||
2212 | for_each_hstate(h) { | |
2213 | char buf[32]; | |
2214 | ||
2215 | string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); | |
2216 | pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", | |
2217 | buf, h->free_huge_pages); | |
2218 | } | |
2219 | } | |
2220 | ||
2221 | #ifdef CONFIG_HIGHMEM | |
2222 | static void try_to_free_low(struct hstate *h, unsigned long count, | |
2223 | nodemask_t *nodes_allowed) | |
2224 | { | |
2225 | int i; | |
2226 | ||
2227 | if (hstate_is_gigantic(h)) | |
2228 | return; | |
2229 | ||
2230 | for_each_node_mask(i, *nodes_allowed) { | |
2231 | struct page *page, *next; | |
2232 | struct list_head *freel = &h->hugepage_freelists[i]; | |
2233 | list_for_each_entry_safe(page, next, freel, lru) { | |
2234 | if (count >= h->nr_huge_pages) | |
2235 | return; | |
2236 | if (PageHighMem(page)) | |
2237 | continue; | |
2238 | list_del(&page->lru); | |
2239 | update_and_free_page(h, page); | |
2240 | h->free_huge_pages--; | |
2241 | h->free_huge_pages_node[page_to_nid(page)]--; | |
2242 | } | |
2243 | } | |
2244 | } | |
2245 | #else | |
2246 | static inline void try_to_free_low(struct hstate *h, unsigned long count, | |
2247 | nodemask_t *nodes_allowed) | |
2248 | { | |
2249 | } | |
2250 | #endif | |
2251 | ||
2252 | /* | |
2253 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | |
2254 | * balanced by operating on them in a round-robin fashion. | |
2255 | * Returns 1 if an adjustment was made. | |
2256 | */ | |
2257 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, | |
2258 | int delta) | |
2259 | { | |
2260 | int nr_nodes, node; | |
2261 | ||
2262 | VM_BUG_ON(delta != -1 && delta != 1); | |
2263 | ||
2264 | if (delta < 0) { | |
2265 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { | |
2266 | if (h->surplus_huge_pages_node[node]) | |
2267 | goto found; | |
2268 | } | |
2269 | } else { | |
2270 | for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { | |
2271 | if (h->surplus_huge_pages_node[node] < | |
2272 | h->nr_huge_pages_node[node]) | |
2273 | goto found; | |
2274 | } | |
2275 | } | |
2276 | return 0; | |
2277 | ||
2278 | found: | |
2279 | h->surplus_huge_pages += delta; | |
2280 | h->surplus_huge_pages_node[node] += delta; | |
2281 | return 1; | |
2282 | } | |
2283 | ||
2284 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | |
2285 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, | |
2286 | nodemask_t *nodes_allowed) | |
2287 | { | |
2288 | unsigned long min_count, ret; | |
2289 | ||
2290 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) | |
2291 | return h->max_huge_pages; | |
2292 | ||
2293 | /* | |
2294 | * Increase the pool size | |
2295 | * First take pages out of surplus state. Then make up the | |
2296 | * remaining difference by allocating fresh huge pages. | |
2297 | * | |
2298 | * We might race with alloc_surplus_huge_page() here and be unable | |
2299 | * to convert a surplus huge page to a normal huge page. That is | |
2300 | * not critical, though, it just means the overall size of the | |
2301 | * pool might be one hugepage larger than it needs to be, but | |
2302 | * within all the constraints specified by the sysctls. | |
2303 | */ | |
2304 | spin_lock(&hugetlb_lock); | |
2305 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { | |
2306 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) | |
2307 | break; | |
2308 | } | |
2309 | ||
2310 | while (count > persistent_huge_pages(h)) { | |
2311 | /* | |
2312 | * If this allocation races such that we no longer need the | |
2313 | * page, free_huge_page will handle it by freeing the page | |
2314 | * and reducing the surplus. | |
2315 | */ | |
2316 | spin_unlock(&hugetlb_lock); | |
2317 | ||
2318 | /* yield cpu to avoid soft lockup */ | |
2319 | cond_resched(); | |
2320 | ||
2321 | ret = alloc_pool_huge_page(h, nodes_allowed); | |
2322 | spin_lock(&hugetlb_lock); | |
2323 | if (!ret) | |
2324 | goto out; | |
2325 | ||
2326 | /* Bail for signals. Probably ctrl-c from user */ | |
2327 | if (signal_pending(current)) | |
2328 | goto out; | |
2329 | } | |
2330 | ||
2331 | /* | |
2332 | * Decrease the pool size | |
2333 | * First return free pages to the buddy allocator (being careful | |
2334 | * to keep enough around to satisfy reservations). Then place | |
2335 | * pages into surplus state as needed so the pool will shrink | |
2336 | * to the desired size as pages become free. | |
2337 | * | |
2338 | * By placing pages into the surplus state independent of the | |
2339 | * overcommit value, we are allowing the surplus pool size to | |
2340 | * exceed overcommit. There are few sane options here. Since | |
2341 | * alloc_surplus_huge_page() is checking the global counter, | |
2342 | * though, we'll note that we're not allowed to exceed surplus | |
2343 | * and won't grow the pool anywhere else. Not until one of the | |
2344 | * sysctls is changed, or the surplus pages go out of use. | |
2345 | */ | |
2346 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; | |
2347 | min_count = max(count, min_count); | |
2348 | try_to_free_low(h, min_count, nodes_allowed); | |
2349 | while (min_count < persistent_huge_pages(h)) { | |
2350 | if (!free_pool_huge_page(h, nodes_allowed, 0)) | |
2351 | break; | |
2352 | cond_resched_lock(&hugetlb_lock); | |
2353 | } | |
2354 | while (count < persistent_huge_pages(h)) { | |
2355 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) | |
2356 | break; | |
2357 | } | |
2358 | out: | |
2359 | ret = persistent_huge_pages(h); | |
2360 | spin_unlock(&hugetlb_lock); | |
2361 | return ret; | |
2362 | } | |
2363 | ||
2364 | #define HSTATE_ATTR_RO(_name) \ | |
2365 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | |
2366 | ||
2367 | #define HSTATE_ATTR(_name) \ | |
2368 | static struct kobj_attribute _name##_attr = \ | |
2369 | __ATTR(_name, 0644, _name##_show, _name##_store) | |
2370 | ||
2371 | static struct kobject *hugepages_kobj; | |
2372 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | |
2373 | ||
2374 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); | |
2375 | ||
2376 | static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) | |
2377 | { | |
2378 | int i; | |
2379 | ||
2380 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | |
2381 | if (hstate_kobjs[i] == kobj) { | |
2382 | if (nidp) | |
2383 | *nidp = NUMA_NO_NODE; | |
2384 | return &hstates[i]; | |
2385 | } | |
2386 | ||
2387 | return kobj_to_node_hstate(kobj, nidp); | |
2388 | } | |
2389 | ||
2390 | static ssize_t nr_hugepages_show_common(struct kobject *kobj, | |
2391 | struct kobj_attribute *attr, char *buf) | |
2392 | { | |
2393 | struct hstate *h; | |
2394 | unsigned long nr_huge_pages; | |
2395 | int nid; | |
2396 | ||
2397 | h = kobj_to_hstate(kobj, &nid); | |
2398 | if (nid == NUMA_NO_NODE) | |
2399 | nr_huge_pages = h->nr_huge_pages; | |
2400 | else | |
2401 | nr_huge_pages = h->nr_huge_pages_node[nid]; | |
2402 | ||
2403 | return sprintf(buf, "%lu\n", nr_huge_pages); | |
2404 | } | |
2405 | ||
2406 | static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, | |
2407 | struct hstate *h, int nid, | |
2408 | unsigned long count, size_t len) | |
2409 | { | |
2410 | int err; | |
2411 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | |
2412 | ||
2413 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) { | |
2414 | err = -EINVAL; | |
2415 | goto out; | |
2416 | } | |
2417 | ||
2418 | if (nid == NUMA_NO_NODE) { | |
2419 | /* | |
2420 | * global hstate attribute | |
2421 | */ | |
2422 | if (!(obey_mempolicy && | |
2423 | init_nodemask_of_mempolicy(nodes_allowed))) { | |
2424 | NODEMASK_FREE(nodes_allowed); | |
2425 | nodes_allowed = &node_states[N_MEMORY]; | |
2426 | } | |
2427 | } else if (nodes_allowed) { | |
2428 | /* | |
2429 | * per node hstate attribute: adjust count to global, | |
2430 | * but restrict alloc/free to the specified node. | |
2431 | */ | |
2432 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | |
2433 | init_nodemask_of_node(nodes_allowed, nid); | |
2434 | } else | |
2435 | nodes_allowed = &node_states[N_MEMORY]; | |
2436 | ||
2437 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | |
2438 | ||
2439 | if (nodes_allowed != &node_states[N_MEMORY]) | |
2440 | NODEMASK_FREE(nodes_allowed); | |
2441 | ||
2442 | return len; | |
2443 | out: | |
2444 | NODEMASK_FREE(nodes_allowed); | |
2445 | return err; | |
2446 | } | |
2447 | ||
2448 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |
2449 | struct kobject *kobj, const char *buf, | |
2450 | size_t len) | |
2451 | { | |
2452 | struct hstate *h; | |
2453 | unsigned long count; | |
2454 | int nid; | |
2455 | int err; | |
2456 | ||
2457 | err = kstrtoul(buf, 10, &count); | |
2458 | if (err) | |
2459 | return err; | |
2460 | ||
2461 | h = kobj_to_hstate(kobj, &nid); | |
2462 | return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); | |
2463 | } | |
2464 | ||
2465 | static ssize_t nr_hugepages_show(struct kobject *kobj, | |
2466 | struct kobj_attribute *attr, char *buf) | |
2467 | { | |
2468 | return nr_hugepages_show_common(kobj, attr, buf); | |
2469 | } | |
2470 | ||
2471 | static ssize_t nr_hugepages_store(struct kobject *kobj, | |
2472 | struct kobj_attribute *attr, const char *buf, size_t len) | |
2473 | { | |
2474 | return nr_hugepages_store_common(false, kobj, buf, len); | |
2475 | } | |
2476 | HSTATE_ATTR(nr_hugepages); | |
2477 | ||
2478 | #ifdef CONFIG_NUMA | |
2479 | ||
2480 | /* | |
2481 | * hstate attribute for optionally mempolicy-based constraint on persistent | |
2482 | * huge page alloc/free. | |
2483 | */ | |
2484 | static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | |
2485 | struct kobj_attribute *attr, char *buf) | |
2486 | { | |
2487 | return nr_hugepages_show_common(kobj, attr, buf); | |
2488 | } | |
2489 | ||
2490 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | |
2491 | struct kobj_attribute *attr, const char *buf, size_t len) | |
2492 | { | |
2493 | return nr_hugepages_store_common(true, kobj, buf, len); | |
2494 | } | |
2495 | HSTATE_ATTR(nr_hugepages_mempolicy); | |
2496 | #endif | |
2497 | ||
2498 | ||
2499 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | |
2500 | struct kobj_attribute *attr, char *buf) | |
2501 | { | |
2502 | struct hstate *h = kobj_to_hstate(kobj, NULL); | |
2503 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | |
2504 | } | |
2505 | ||
2506 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |
2507 | struct kobj_attribute *attr, const char *buf, size_t count) | |
2508 | { | |
2509 | int err; | |
2510 | unsigned long input; | |
2511 | struct hstate *h = kobj_to_hstate(kobj, NULL); | |
2512 | ||
2513 | if (hstate_is_gigantic(h)) | |
2514 | return -EINVAL; | |
2515 | ||
2516 | err = kstrtoul(buf, 10, &input); | |
2517 | if (err) | |
2518 | return err; | |
2519 | ||
2520 | spin_lock(&hugetlb_lock); | |
2521 | h->nr_overcommit_huge_pages = input; | |
2522 | spin_unlock(&hugetlb_lock); | |
2523 | ||
2524 | return count; | |
2525 | } | |
2526 | HSTATE_ATTR(nr_overcommit_hugepages); | |
2527 | ||
2528 | static ssize_t free_hugepages_show(struct kobject *kobj, | |
2529 | struct kobj_attribute *attr, char *buf) | |
2530 | { | |
2531 | struct hstate *h; | |
2532 | unsigned long free_huge_pages; | |
2533 | int nid; | |
2534 | ||
2535 | h = kobj_to_hstate(kobj, &nid); | |
2536 | if (nid == NUMA_NO_NODE) | |
2537 | free_huge_pages = h->free_huge_pages; | |
2538 | else | |
2539 | free_huge_pages = h->free_huge_pages_node[nid]; | |
2540 | ||
2541 | return sprintf(buf, "%lu\n", free_huge_pages); | |
2542 | } | |
2543 | HSTATE_ATTR_RO(free_hugepages); | |
2544 | ||
2545 | static ssize_t resv_hugepages_show(struct kobject *kobj, | |
2546 | struct kobj_attribute *attr, char *buf) | |
2547 | { | |
2548 | struct hstate *h = kobj_to_hstate(kobj, NULL); | |
2549 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | |
2550 | } | |
2551 | HSTATE_ATTR_RO(resv_hugepages); | |
2552 | ||
2553 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | |
2554 | struct kobj_attribute *attr, char *buf) | |
2555 | { | |
2556 | struct hstate *h; | |
2557 | unsigned long surplus_huge_pages; | |
2558 | int nid; | |
2559 | ||
2560 | h = kobj_to_hstate(kobj, &nid); | |
2561 | if (nid == NUMA_NO_NODE) | |
2562 | surplus_huge_pages = h->surplus_huge_pages; | |
2563 | else | |
2564 | surplus_huge_pages = h->surplus_huge_pages_node[nid]; | |
2565 | ||
2566 | return sprintf(buf, "%lu\n", surplus_huge_pages); | |
2567 | } | |
2568 | HSTATE_ATTR_RO(surplus_hugepages); | |
2569 | ||
2570 | static struct attribute *hstate_attrs[] = { | |
2571 | &nr_hugepages_attr.attr, | |
2572 | &nr_overcommit_hugepages_attr.attr, | |
2573 | &free_hugepages_attr.attr, | |
2574 | &resv_hugepages_attr.attr, | |
2575 | &surplus_hugepages_attr.attr, | |
2576 | #ifdef CONFIG_NUMA | |
2577 | &nr_hugepages_mempolicy_attr.attr, | |
2578 | #endif | |
2579 | NULL, | |
2580 | }; | |
2581 | ||
2582 | static const struct attribute_group hstate_attr_group = { | |
2583 | .attrs = hstate_attrs, | |
2584 | }; | |
2585 | ||
2586 | static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, | |
2587 | struct kobject **hstate_kobjs, | |
2588 | const struct attribute_group *hstate_attr_group) | |
2589 | { | |
2590 | int retval; | |
2591 | int hi = hstate_index(h); | |
2592 | ||
2593 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); | |
2594 | if (!hstate_kobjs[hi]) | |
2595 | return -ENOMEM; | |
2596 | ||
2597 | retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); | |
2598 | if (retval) | |
2599 | kobject_put(hstate_kobjs[hi]); | |
2600 | ||
2601 | return retval; | |
2602 | } | |
2603 | ||
2604 | static void __init hugetlb_sysfs_init(void) | |
2605 | { | |
2606 | struct hstate *h; | |
2607 | int err; | |
2608 | ||
2609 | hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); | |
2610 | if (!hugepages_kobj) | |
2611 | return; | |
2612 | ||
2613 | for_each_hstate(h) { | |
2614 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, | |
2615 | hstate_kobjs, &hstate_attr_group); | |
2616 | if (err) | |
2617 | pr_err("Hugetlb: Unable to add hstate %s", h->name); | |
2618 | } | |
2619 | } | |
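| /* | |
| * The kobjects and groups created above appear under /sys/kernel/mm | |
| * (mm_kobj). For a 2 MB hstate, for example, the resulting layout is: | |
| * | |
| *	/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages | |
| *	/sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages | |
| *	/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages | |
| *	/sys/kernel/mm/hugepages/hugepages-2048kB/resv_hugepages | |
| *	/sys/kernel/mm/hugepages/hugepages-2048kB/surplus_hugepages | |
| * | |
| * Writing to nr_hugepages resizes the persistent pool via | |
| * set_max_huge_pages(); nr_overcommit_hugepages bounds the surplus pool. | |
| */ | |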
2620 | ||
2621 | #ifdef CONFIG_NUMA | |
2622 | ||
2623 | /* | |
2624 | * node_hstate/s - associate per node hstate attributes, via their kobjects, | |
2625 | * with node devices in node_devices[] using a parallel array. The array | |
2626 | * index of a node device or node_hstate entry is that node's id. | |
2627 | * This is here to avoid any static dependency of the node device driver, in | |
2628 | * the base kernel, on the hugetlb module. | |
2629 | */ | |
2630 | struct node_hstate { | |
2631 | struct kobject *hugepages_kobj; | |
2632 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | |
2633 | }; | |
2634 | static struct node_hstate node_hstates[MAX_NUMNODES]; | |
2635 | ||
2636 | /* | |
2637 | * A subset of global hstate attributes for node devices | |
2638 | */ | |
2639 | static struct attribute *per_node_hstate_attrs[] = { | |
2640 | &nr_hugepages_attr.attr, | |
2641 | &free_hugepages_attr.attr, | |
2642 | &surplus_hugepages_attr.attr, | |
2643 | NULL, | |
2644 | }; | |
2645 | ||
2646 | static const struct attribute_group per_node_hstate_attr_group = { | |
2647 | .attrs = per_node_hstate_attrs, | |
2648 | }; | |
2649 | ||
2650 | /* | |
2651 | * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. | |
2652 | * Returns node id via non-NULL nidp. | |
2653 | */ | |
2654 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | |
2655 | { | |
2656 | int nid; | |
2657 | ||
2658 | for (nid = 0; nid < nr_node_ids; nid++) { | |
2659 | struct node_hstate *nhs = &node_hstates[nid]; | |
2660 | int i; | |
2661 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | |
2662 | if (nhs->hstate_kobjs[i] == kobj) { | |
2663 | if (nidp) | |
2664 | *nidp = nid; | |
2665 | return &hstates[i]; | |
2666 | } | |
2667 | } | |
2668 | ||
2669 | BUG(); | |
2670 | return NULL; | |
2671 | } | |
2672 | ||
2673 | /* | |
2674 | * Unregister hstate attributes from a single node device. | |
2675 | * No-op if no hstate attributes attached. | |
2676 | */ | |
2677 | static void hugetlb_unregister_node(struct node *node) | |
2678 | { | |
2679 | struct hstate *h; | |
2680 | struct node_hstate *nhs = &node_hstates[node->dev.id]; | |
2681 | ||
2682 | if (!nhs->hugepages_kobj) | |
2683 | return; /* no hstate attributes */ | |
2684 | ||
2685 | for_each_hstate(h) { | |
2686 | int idx = hstate_index(h); | |
2687 | if (nhs->hstate_kobjs[idx]) { | |
2688 | kobject_put(nhs->hstate_kobjs[idx]); | |
2689 | nhs->hstate_kobjs[idx] = NULL; | |
2690 | } | |
2691 | } | |
2692 | ||
2693 | kobject_put(nhs->hugepages_kobj); | |
2694 | nhs->hugepages_kobj = NULL; | |
2695 | } | |
2696 | ||
2697 | ||
2698 | /* | |
2699 | * Register hstate attributes for a single node device. | |
2700 | * No-op if attributes already registered. | |
2701 | */ | |
2702 | static void hugetlb_register_node(struct node *node) | |
2703 | { | |
2704 | struct hstate *h; | |
2705 | struct node_hstate *nhs = &node_hstates[node->dev.id]; | |
2706 | int err; | |
2707 | ||
2708 | if (nhs->hugepages_kobj) | |
2709 | return; /* already allocated */ | |
2710 | ||
2711 | nhs->hugepages_kobj = kobject_create_and_add("hugepages", | |
2712 | &node->dev.kobj); | |
2713 | if (!nhs->hugepages_kobj) | |
2714 | return; | |
2715 | ||
2716 | for_each_hstate(h) { | |
2717 | err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, | |
2718 | nhs->hstate_kobjs, | |
2719 | &per_node_hstate_attr_group); | |
2720 | if (err) { | |
2721 | pr_err("Hugetlb: Unable to add hstate %s for node %d\n", | |
2722 | h->name, node->dev.id); | |
2723 | hugetlb_unregister_node(node); | |
2724 | break; | |
2725 | } | |
2726 | } | |
2727 | } | |
2728 | ||
2729 | /* | |
2730 | * hugetlb init time: register hstate attributes for all registered node | |
2731 | * devices of nodes that have memory. All on-line nodes should have | |
2732 | * registered their associated device by this time. | |
2733 | */ | |
2734 | static void __init hugetlb_register_all_nodes(void) | |
2735 | { | |
2736 | int nid; | |
2737 | ||
2738 | for_each_node_state(nid, N_MEMORY) { | |
2739 | struct node *node = node_devices[nid]; | |
2740 | if (node->dev.id == nid) | |
2741 | hugetlb_register_node(node); | |
2742 | } | |
2743 | ||
2744 | /* | |
2745 | * Let the node device driver know we're here so it can | |
2746 | * [un]register hstate attributes on node hotplug. | |
2747 | */ | |
2748 | register_hugetlbfs_with_node(hugetlb_register_node, | |
2749 | hugetlb_unregister_node); | |
2750 | } | |
2751 | #else /* !CONFIG_NUMA */ | |
2752 | ||
2753 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | |
2754 | { | |
2755 | BUG(); | |
2756 | if (nidp) | |
2757 | *nidp = -1; | |
2758 | return NULL; | |
2759 | } | |
2760 | ||
2761 | static void hugetlb_register_all_nodes(void) { } | |
2762 | ||
2763 | #endif | |
2764 | ||
2765 | static int __init hugetlb_init(void) | |
2766 | { | |
2767 | int i; | |
2768 | ||
2769 | if (!hugepages_supported()) | |
2770 | return 0; | |
2771 | ||
2772 | if (!size_to_hstate(default_hstate_size)) { | |
2773 | if (default_hstate_size != 0) { | |
2774 | pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", | |
2775 | default_hstate_size, HPAGE_SIZE); | |
2776 | } | |
2777 | ||
2778 | default_hstate_size = HPAGE_SIZE; | |
2779 | if (!size_to_hstate(default_hstate_size)) | |
2780 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); | |
2781 | } | |
2782 | default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); | |
2783 | if (default_hstate_max_huge_pages) { | |
2784 | if (!default_hstate.max_huge_pages) | |
2785 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | |
2786 | } | |
2787 | ||
2788 | hugetlb_init_hstates(); | |
2789 | gather_bootmem_prealloc(); | |
2790 | report_hugepages(); | |
2791 | ||
2792 | hugetlb_sysfs_init(); | |
2793 | hugetlb_register_all_nodes(); | |
2794 | hugetlb_cgroup_file_init(); | |
2795 | ||
2796 | #ifdef CONFIG_SMP | |
2797 | num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); | |
2798 | #else | |
2799 | num_fault_mutexes = 1; | |
2800 | #endif | |
2801 | hugetlb_fault_mutex_table = | |
2802 | kmalloc_array(num_fault_mutexes, sizeof(struct mutex), | |
2803 | GFP_KERNEL); | |
2804 | BUG_ON(!hugetlb_fault_mutex_table); | |
2805 | ||
2806 | for (i = 0; i < num_fault_mutexes; i++) | |
2807 | mutex_init(&hugetlb_fault_mutex_table[i]); | |
2808 | return 0; | |
2809 | } | |
2810 | subsys_initcall(hugetlb_init); | |
2811 | ||
2812 | /* Should be called on processing a hugepagesz=... option */ | |
2813 | void __init hugetlb_bad_size(void) | |
2814 | { | |
2815 | parsed_valid_hugepagesz = false; | |
2816 | } | |
2817 | ||
2818 | void __init hugetlb_add_hstate(unsigned int order) | |
2819 | { | |
2820 | struct hstate *h; | |
2821 | unsigned long i; | |
2822 | ||
2823 | if (size_to_hstate(PAGE_SIZE << order)) { | |
2824 | pr_warn("hugepagesz= specified twice, ignoring\n"); | |
2825 | return; | |
2826 | } | |
2827 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); | |
2828 | BUG_ON(order == 0); | |
2829 | h = &hstates[hugetlb_max_hstate++]; | |
2830 | h->order = order; | |
2831 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); | |
2832 | h->nr_huge_pages = 0; | |
2833 | h->free_huge_pages = 0; | |
2834 | for (i = 0; i < MAX_NUMNODES; ++i) | |
2835 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | |
2836 | INIT_LIST_HEAD(&h->hugepage_activelist); | |
2837 | h->next_nid_to_alloc = first_memory_node; | |
2838 | h->next_nid_to_free = first_memory_node; | |
2839 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | |
2840 | huge_page_size(h)/1024); | |
2841 | ||
2842 | parsed_hstate = h; | |
2843 | } | |
2844 | ||
2845 | static int __init hugetlb_nrpages_setup(char *s) | |
2846 | { | |
2847 | unsigned long *mhp; | |
2848 | static unsigned long *last_mhp; | |
2849 | ||
2850 | if (!parsed_valid_hugepagesz) { | |
2851 | pr_warn("hugepages = %s preceded by an unsupported hugepagesz, ignoring\n", s); | |
2853 | parsed_valid_hugepagesz = true; | |
2854 | return 1; | |
2855 | } | |
2856 | /* | |
2857 | * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, | |
2858 | * so this hugepages= parameter goes to the "default hstate". | |
2859 | */ | |
2860 | else if (!hugetlb_max_hstate) | |
2861 | mhp = &default_hstate_max_huge_pages; | |
2862 | else | |
2863 | mhp = &parsed_hstate->max_huge_pages; | |
2864 | ||
2865 | if (mhp == last_mhp) { | |
2866 | pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); | |
2867 | return 1; | |
2868 | } | |
2869 | ||
2870 | if (sscanf(s, "%lu", mhp) <= 0) | |
2871 | *mhp = 0; | |
2872 | ||
2873 | /* | |
2874 | * Global state is always initialized later in hugetlb_init. | |
2875 | * But we need to allocate >= MAX_ORDER hstates here early to still | |
2876 | * use the bootmem allocator. | |
2877 | */ | |
2878 | if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) | |
2879 | hugetlb_hstate_alloc_pages(parsed_hstate); | |
2880 | ||
2881 | last_mhp = mhp; | |
2882 | ||
2883 | return 1; | |
2884 | } | |
2885 | __setup("hugepages=", hugetlb_nrpages_setup); | |
2886 | ||
2887 | static int __init hugetlb_default_setup(char *s) | |
2888 | { | |
2889 | default_hstate_size = memparse(s, &s); | |
2890 | return 1; | |
2891 | } | |
2892 | __setup("default_hugepagesz=", hugetlb_default_setup); | |
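| /* | |
| * Example boot command line (illustrative; the sizes shown are the | |
| * x86-64 ones): | |
| * | |
| *	default_hugepagesz=1G hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512 | |
| * | |
| * Each hugepages= value applies to the most recently parsed hugepagesz= | |
| * (or to the default hstate if no hugepagesz= was seen yet), and | |
| * default_hugepagesz= selects the hstate used when no page size is | |
| * specified explicitly. | |
| */ | |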
2893 | ||
2894 | static unsigned int cpuset_mems_nr(unsigned int *array) | |
2895 | { | |
2896 | int node; | |
2897 | unsigned int nr = 0; | |
2898 | ||
2899 | for_each_node_mask(node, cpuset_current_mems_allowed) | |
2900 | nr += array[node]; | |
2901 | ||
2902 | return nr; | |
2903 | } | |
2904 | ||
2905 | #ifdef CONFIG_SYSCTL | |
2906 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |
2907 | struct ctl_table *table, int write, | |
2908 | void __user *buffer, size_t *length, loff_t *ppos) | |
2909 | { | |
2910 | struct hstate *h = &default_hstate; | |
2911 | unsigned long tmp = h->max_huge_pages; | |
2912 | int ret; | |
2913 | ||
2914 | if (!hugepages_supported()) | |
2915 | return -EOPNOTSUPP; | |
2916 | ||
2917 | table->data = &tmp; | |
2918 | table->maxlen = sizeof(unsigned long); | |
2919 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); | |
2920 | if (ret) | |
2921 | goto out; | |
2922 | ||
2923 | if (write) | |
2924 | ret = __nr_hugepages_store_common(obey_mempolicy, h, | |
2925 | NUMA_NO_NODE, tmp, *length); | |
2926 | out: | |
2927 | return ret; | |
2928 | } | |
2929 | ||
2930 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |
2931 | void __user *buffer, size_t *length, loff_t *ppos) | |
2932 | { | |
2933 | ||
2934 | return hugetlb_sysctl_handler_common(false, table, write, | |
2935 | buffer, length, ppos); | |
2936 | } | |
2937 | ||
2938 | #ifdef CONFIG_NUMA | |
2939 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | |
2940 | void __user *buffer, size_t *length, loff_t *ppos) | |
2941 | { | |
2942 | return hugetlb_sysctl_handler_common(true, table, write, | |
2943 | buffer, length, ppos); | |
2944 | } | |
2945 | #endif /* CONFIG_NUMA */ | |
2946 | ||
2947 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |
2948 | void __user *buffer, | |
2949 | size_t *length, loff_t *ppos) | |
2950 | { | |
2951 | struct hstate *h = &default_hstate; | |
2952 | unsigned long tmp; | |
2953 | int ret; | |
2954 | ||
2955 | if (!hugepages_supported()) | |
2956 | return -EOPNOTSUPP; | |
2957 | ||
2958 | tmp = h->nr_overcommit_huge_pages; | |
2959 | ||
2960 | if (write && hstate_is_gigantic(h)) | |
2961 | return -EINVAL; | |
2962 | ||
2963 | table->data = &tmp; | |
2964 | table->maxlen = sizeof(unsigned long); | |
2965 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); | |
2966 | if (ret) | |
2967 | goto out; | |
2968 | ||
2969 | if (write) { | |
2970 | spin_lock(&hugetlb_lock); | |
2971 | h->nr_overcommit_huge_pages = tmp; | |
2972 | spin_unlock(&hugetlb_lock); | |
2973 | } | |
2974 | out: | |
2975 | return ret; | |
2976 | } | |
2977 | ||
2978 | #endif /* CONFIG_SYSCTL */ | |
2979 | ||
2980 | void hugetlb_report_meminfo(struct seq_file *m) | |
2981 | { | |
2982 | struct hstate *h; | |
2983 | unsigned long total = 0; | |
2984 | ||
2985 | if (!hugepages_supported()) | |
2986 | return; | |
2987 | ||
2988 | for_each_hstate(h) { | |
2989 | unsigned long count = h->nr_huge_pages; | |
2990 | ||
2991 | total += (PAGE_SIZE << huge_page_order(h)) * count; | |
2992 | ||
2993 | if (h == &default_hstate) | |
2994 | seq_printf(m, | |
2995 | "HugePages_Total: %5lu\n" | |
2996 | "HugePages_Free: %5lu\n" | |
2997 | "HugePages_Rsvd: %5lu\n" | |
2998 | "HugePages_Surp: %5lu\n" | |
2999 | "Hugepagesize: %8lu kB\n", | |
3000 | count, | |
3001 | h->free_huge_pages, | |
3002 | h->resv_huge_pages, | |
3003 | h->surplus_huge_pages, | |
3004 | (PAGE_SIZE << huge_page_order(h)) / 1024); | |
3005 | } | |
3006 | ||
3007 | seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); | |
3008 | } | |
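| /* | |
| * Example /proc/meminfo output from the above (values illustrative, | |
| * assuming a 2048 kB default hstate with 20 pages): | |
| * | |
| *	HugePages_Total:      20 | |
| *	HugePages_Free:       18 | |
| *	HugePages_Rsvd:        2 | |
| *	HugePages_Surp:        0 | |
| *	Hugepagesize:       2048 kB | |
| *	Hugetlb:           40960 kB | |
| * | |
| * The per-field lines describe only the default hstate; the final | |
| * "Hugetlb" line sums the memory used by huge pages of all sizes. | |
| */ | |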
3009 | ||
3010 | int hugetlb_report_node_meminfo(int nid, char *buf) | |
3011 | { | |
3012 | struct hstate *h = &default_hstate; | |
3013 | if (!hugepages_supported()) | |
3014 | return 0; | |
3015 | return sprintf(buf, | |
3016 | "Node %d HugePages_Total: %5u\n" | |
3017 | "Node %d HugePages_Free: %5u\n" | |
3018 | "Node %d HugePages_Surp: %5u\n", | |
3019 | nid, h->nr_huge_pages_node[nid], | |
3020 | nid, h->free_huge_pages_node[nid], | |
3021 | nid, h->surplus_huge_pages_node[nid]); | |
3022 | } | |
3023 | ||
3024 | void hugetlb_show_meminfo(void) | |
3025 | { | |
3026 | struct hstate *h; | |
3027 | int nid; | |
3028 | ||
3029 | if (!hugepages_supported()) | |
3030 | return; | |
3031 | ||
3032 | for_each_node_state(nid, N_MEMORY) | |
3033 | for_each_hstate(h) | |
3034 | pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", | |
3035 | nid, | |
3036 | h->nr_huge_pages_node[nid], | |
3037 | h->free_huge_pages_node[nid], | |
3038 | h->surplus_huge_pages_node[nid], | |
3039 | 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); | |
3040 | } | |
3041 | ||
3042 | void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) | |
3043 | { | |
3044 | seq_printf(m, "HugetlbPages:\t%8lu kB\n", | |
3045 | atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); | |
3046 | } | |
3047 | ||
3048 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ | |
3049 | unsigned long hugetlb_total_pages(void) | |
3050 | { | |
3051 | struct hstate *h; | |
3052 | unsigned long nr_total_pages = 0; | |
3053 | ||
3054 | for_each_hstate(h) | |
3055 | nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); | |
3056 | return nr_total_pages; | |
3057 | } | |
3058 | ||
3059 | static int hugetlb_acct_memory(struct hstate *h, long delta) | |
3060 | { | |
3061 | int ret = -ENOMEM; | |
3062 | ||
3063 | spin_lock(&hugetlb_lock); | |
3064 | /* | |
3065 | * When cpuset is configured, it breaks the strict hugetlb page | |
3066 | * reservation as the accounting is done on a global variable. Such | |
3067 | * reservation is completely rubbish in the presence of cpuset because | |
3068 | * the reservation is not checked against page availability for the | |
3069 | * current cpuset. An application can still potentially be OOM'ed by the | |
3070 | * kernel due to a lack of free hugetlb pages in the cpuset the task is in. | |
3071 | * Attempting to enforce strict accounting with cpuset is almost | |
3072 | * impossible (or too ugly) because cpuset is so fluid that | |
3073 | * tasks or memory nodes can be dynamically moved between cpusets. | |
3074 | * | |
3075 | * The change of semantics for shared hugetlb mapping with cpuset is | |
3076 | * undesirable. However, in order to preserve some of the semantics, | |
3077 | * we fall back to check against current free page availability as | |
3078 | * a best attempt and hopefully to minimize the impact of changing | |
3079 | * semantics that cpuset has. | |
3080 | */ | |
3081 | if (delta > 0) { | |
3082 | if (gather_surplus_pages(h, delta) < 0) | |
3083 | goto out; | |
3084 | ||
3085 | if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { | |
3086 | return_unused_surplus_pages(h, delta); | |
3087 | goto out; | |
3088 | } | |
3089 | } | |
3090 | ||
3091 | ret = 0; | |
3092 | if (delta < 0) | |
3093 | return_unused_surplus_pages(h, (unsigned long) -delta); | |
3094 | ||
3095 | out: | |
3096 | spin_unlock(&hugetlb_lock); | |
3097 | return ret; | |
3098 | } | |
3099 | ||
3100 | static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |
3101 | { | |
3102 | struct resv_map *resv = vma_resv_map(vma); | |
3103 | ||
3104 | /* | |
3105 | * This new VMA should share its sibling's reservation map if present. | |
3106 | * The VMA will only ever have a valid reservation map pointer where | |
3107 | * it is being copied for another still existing VMA. As that VMA | |
3108 | * has a reference to the reservation map it cannot disappear until | |
3109 | * after this open call completes. It is therefore safe to take a | |
3110 | * new reference here without additional locking. | |
3111 | */ | |
3112 | if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | |
3113 | kref_get(&resv->refs); | |
3114 | } | |
3115 | ||
3116 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |
3117 | { | |
3118 | struct hstate *h = hstate_vma(vma); | |
3119 | struct resv_map *resv = vma_resv_map(vma); | |
3120 | struct hugepage_subpool *spool = subpool_vma(vma); | |
3121 | unsigned long reserve, start, end; | |
3122 | long gbl_reserve; | |
3123 | ||
3124 | if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | |
3125 | return; | |
3126 | ||
3127 | start = vma_hugecache_offset(h, vma, vma->vm_start); | |
3128 | end = vma_hugecache_offset(h, vma, vma->vm_end); | |
3129 | ||
3130 | reserve = (end - start) - region_count(resv, start, end); | |
3131 | ||
3132 | kref_put(&resv->refs, resv_map_release); | |
3133 | ||
3134 | if (reserve) { | |
3135 | /* | |
3136 | * Decrement reserve counts. The global reserve count may be | |
3137 | * adjusted if the subpool has a minimum size. | |
3138 | */ | |
3139 | gbl_reserve = hugepage_subpool_put_pages(spool, reserve); | |
3140 | hugetlb_acct_memory(h, -gbl_reserve); | |
3141 | } | |
3142 | } | |
3143 | ||
3144 | static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) | |
3145 | { | |
3146 | if (addr & ~(huge_page_mask(hstate_vma(vma)))) | |
3147 | return -EINVAL; | |
3148 | return 0; | |
3149 | } | |
3150 | ||
3151 | static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) | |
3152 | { | |
3153 | struct hstate *hstate = hstate_vma(vma); | |
3154 | ||
3155 | return 1UL << huge_page_shift(hstate); | |
3156 | } | |
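/*
 * Example: for a 2 MiB hstate, huge_page_shift() is 21 and this op returns
 * 2097152. Generic code (e.g. vma_kernel_pagesize()) is expected to use it
 * to report a hugetlb VMA's effective page size.
 */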
3157 | ||
3158 | /* | |
3159 | * We cannot handle pagefaults against hugetlb pages at all. They cause | |
3160 | * handle_mm_fault() to try to instantiate regular-sized pages in the | |
3161 | * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get | |
3162 | * this far. | |
3163 | */ | |
3164 | static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) | |
3165 | { | |
3166 | BUG(); | |
3167 | return 0; | |
3168 | } | |
3169 | ||
3170 | const struct vm_operations_struct hugetlb_vm_ops = { | |
3171 | .fault = hugetlb_vm_op_fault, | |
3172 | .open = hugetlb_vm_op_open, | |
3173 | .close = hugetlb_vm_op_close, | |
3174 | .split = hugetlb_vm_op_split, | |
3175 | .pagesize = hugetlb_vm_op_pagesize, | |
3176 | }; | |
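/*
 * These ops are installed on hugetlb-backed VMAs at mmap time. A minimal
 * sketch of what a hugetlbfs-style ->mmap handler does (simplified; the
 * function name below is illustrative only, and the real handler also
 * validates alignment and sets up reservations):
 */
static int example_hugetlb_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;	/* mark as hugetlb VMA */
	vma->vm_ops = &hugetlb_vm_ops;			/* route faults to the helpers above */
	return 0;
}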
3177 | ||
3178 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, | |
3179 | int writable) | |
3180 | { | |
3181 | pte_t entry; | |
3182 | ||
3183 | if (writable) { | |
3184 | entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, | |
3185 | vma->vm_page_prot))); | |
3186 | } else { | |
3187 | entry = huge_pte_wrprotect(mk_huge_pte(page, | |
3188 | vma->vm_page_prot)); | |
3189 | } | |
3190 | entry = pte_mkyoung(entry); | |
3191 | entry = pte_mkhuge(entry); | |
3192 | entry = arch_make_huge_pte(entry, vma, page, writable); | |
3193 | ||
3194 | return entry; | |
3195 | } | |
3196 | ||
3197 | static void set_huge_ptep_writable(struct vm_area_struct *vma, | |
3198 | unsigned long address, pte_t *ptep) | |
3199 | { | |
3200 | pte_t entry; | |
3201 | ||
3202 | entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); | |
3203 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) | |
3204 | update_mmu_cache(vma, address, ptep); | |
3205 | } | |
3206 | ||
3207 | bool is_hugetlb_entry_migration(pte_t pte) | |
3208 | { | |
3209 | swp_entry_t swp; | |
3210 | ||
3211 | if (huge_pte_none(pte) || pte_present(pte)) | |
3212 | return false; | |
3213 | swp = pte_to_swp_entry(pte); | |
3214 | if (non_swap_entry(swp) && is_migration_entry(swp)) | |
3215 | return true; | |
3216 | else | |
3217 | return false; | |
3218 | } | |
3219 | ||
3220 | static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |
3221 | { | |
3222 | swp_entry_t swp; | |
3223 | ||
3224 | if (huge_pte_none(pte) || pte_present(pte)) | |
3225 | return 0; | |
3226 | swp = pte_to_swp_entry(pte); | |
3227 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) | |
3228 | return 1; | |
3229 | else | |
3230 | return 0; | |
3231 | } | |
3232 | ||
3233 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |
3234 | struct vm_area_struct *vma) | |
3235 | { | |
3236 | pte_t *src_pte, *dst_pte, entry; | |
3237 | struct page *ptepage; | |
3238 | unsigned long addr; | |
3239 | int cow; | |
3240 | struct hstate *h = hstate_vma(vma); | |
3241 | unsigned long sz = huge_page_size(h); | |
3242 | unsigned long mmun_start; /* For mmu_notifiers */ | |
3243 | unsigned long mmun_end; /* For mmu_notifiers */ | |
3244 | int ret = 0; | |
3245 | ||
3246 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | |
3247 | ||
3248 | mmun_start = vma->vm_start; | |
3249 | mmun_end = vma->vm_end; | |
3250 | if (cow) | |
3251 | mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); | |
3252 | ||
3253 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { | |
3254 | spinlock_t *src_ptl, *dst_ptl; | |
3255 | src_pte = huge_pte_offset(src, addr, sz); | |
3256 | if (!src_pte) | |
3257 | continue; | |
3258 | dst_pte = huge_pte_alloc(dst, addr, sz); | |
3259 | if (!dst_pte) { | |
3260 | ret = -ENOMEM; | |
3261 | break; | |
3262 | } | |
3263 | ||
3264 | /* If the pagetables are shared don't copy or take references */ | |
3265 | if (dst_pte == src_pte) | |
3266 | continue; | |
3267 | ||
3268 | dst_ptl = huge_pte_lock(h, dst, dst_pte); | |
3269 | src_ptl = huge_pte_lockptr(h, src, src_pte); | |
3270 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | |
3271 | entry = huge_ptep_get(src_pte); | |
3272 | if (huge_pte_none(entry)) { /* skip none entry */ | |
3273 | ; | |
3274 | } else if (unlikely(is_hugetlb_entry_migration(entry) || | |
3275 | is_hugetlb_entry_hwpoisoned(entry))) { | |
3276 | swp_entry_t swp_entry = pte_to_swp_entry(entry); | |
3277 | ||
3278 | if (is_write_migration_entry(swp_entry) && cow) { | |
3279 | /* | |
3280 | * COW mappings require pages in both | |
3281 | * parent and child to be marked read-only. | |
3282 | */ | |
3283 | make_migration_entry_read(&swp_entry); | |
3284 | entry = swp_entry_to_pte(swp_entry); | |
3285 | set_huge_swap_pte_at(src, addr, src_pte, | |
3286 | entry, sz); | |
3287 | } | |
3288 | set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); | |
3289 | } else { | |
3290 | if (cow) { | |
3291 | /* | |
3292 | * No need to notify as we are downgrading page | |
3293 | * table protection not changing it to point | |
3294 | * to a new page. | |
3295 | * | |
3296 | * See Documentation/vm/mmu_notifier.rst | |
3297 | */ | |
3298 | huge_ptep_set_wrprotect(src, addr, src_pte); | |
3299 | } | |
3300 | entry = huge_ptep_get(src_pte); | |
3301 | ptepage = pte_page(entry); | |
3302 | get_page(ptepage); | |
3303 | page_dup_rmap(ptepage, true); | |
3304 | set_huge_pte_at(dst, addr, dst_pte, entry); | |
3305 | hugetlb_count_add(pages_per_huge_page(h), dst); | |
3306 | } | |
3307 | spin_unlock(src_ptl); | |
3308 | spin_unlock(dst_ptl); | |
3309 | } | |
3310 | ||
3311 | if (cow) | |
3312 | mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); | |
3313 | ||
3314 | return ret; | |
3315 | } | |
3316 | ||
3317 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |
3318 | unsigned long start, unsigned long end, | |
3319 | struct page *ref_page) | |
3320 | { | |
3321 | struct mm_struct *mm = vma->vm_mm; | |
3322 | unsigned long address; | |
3323 | pte_t *ptep; | |
3324 | pte_t pte; | |
3325 | spinlock_t *ptl; | |
3326 | struct page *page; | |
3327 | struct hstate *h = hstate_vma(vma); | |
3328 | unsigned long sz = huge_page_size(h); | |
3329 | const unsigned long mmun_start = start; /* For mmu_notifiers */ | |
3330 | const unsigned long mmun_end = end; /* For mmu_notifiers */ | |
3331 | ||
3332 | WARN_ON(!is_vm_hugetlb_page(vma)); | |
3333 | BUG_ON(start & ~huge_page_mask(h)); | |
3334 | BUG_ON(end & ~huge_page_mask(h)); | |
3335 | ||
3336 | /* | |
3337 | * This is a hugetlb vma, so all the pte entries should point | |
3338 | * to huge pages. | |
3339 | */ | |
3340 | tlb_remove_check_page_size_change(tlb, sz); | |
3341 | tlb_start_vma(tlb, vma); | |
3342 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | |
3343 | address = start; | |
3344 | for (; address < end; address += sz) { | |
3345 | ptep = huge_pte_offset(mm, address, sz); | |
3346 | if (!ptep) | |
3347 | continue; | |
3348 | ||
3349 | ptl = huge_pte_lock(h, mm, ptep); | |
3350 | if (huge_pmd_unshare(mm, &address, ptep)) { | |
3351 | spin_unlock(ptl); | |
3352 | continue; | |
3353 | } | |
3354 | ||
3355 | pte = huge_ptep_get(ptep); | |
3356 | if (huge_pte_none(pte)) { | |
3357 | spin_unlock(ptl); | |
3358 | continue; | |
3359 | } | |
3360 | ||
3361 | /* | |
3362 | * Migrating hugepage or HWPoisoned hugepage is already | |
3363 | * unmapped and its refcount is dropped, so just clear pte here. | |
3364 | */ | |
3365 | if (unlikely(!pte_present(pte))) { | |
3366 | huge_pte_clear(mm, address, ptep, sz); | |
3367 | spin_unlock(ptl); | |
3368 | continue; | |
3369 | } | |
3370 | ||
3371 | page = pte_page(pte); | |
3372 | /* | |
3373 | * If a reference page is supplied, it is because a specific | |
3374 | * page is being unmapped, not a range. Ensure the page we | |
3375 | * are about to unmap is the actual page of interest. | |
3376 | */ | |
3377 | if (ref_page) { | |
3378 | if (page != ref_page) { | |
3379 | spin_unlock(ptl); | |
3380 | continue; | |
3381 | } | |
3382 | /* | |
3383 | * Mark the VMA as having unmapped its page so that | |
3384 | * future faults in this VMA will fail rather than | |
3385 | * looking like data was lost | |
3386 | */ | |
3387 | set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); | |
3388 | } | |
3389 | ||
3390 | pte = huge_ptep_get_and_clear(mm, address, ptep); | |
3391 | tlb_remove_huge_tlb_entry(h, tlb, ptep, address); | |
3392 | if (huge_pte_dirty(pte)) | |
3393 | set_page_dirty(page); | |
3394 | ||
3395 | hugetlb_count_sub(pages_per_huge_page(h), mm); | |
3396 | page_remove_rmap(page, true); | |
3397 | ||
3398 | spin_unlock(ptl); | |
3399 | tlb_remove_page_size(tlb, page, huge_page_size(h)); | |
3400 | /* | |
3401 | * Bail out after unmapping reference page if supplied | |
3402 | */ | |
3403 | if (ref_page) | |
3404 | break; | |
3405 | } | |
3406 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | |
3407 | tlb_end_vma(tlb, vma); | |
3408 | } | |
3409 | ||
3410 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | |
3411 | struct vm_area_struct *vma, unsigned long start, | |
3412 | unsigned long end, struct page *ref_page) | |
3413 | { | |
3414 | __unmap_hugepage_range(tlb, vma, start, end, ref_page); | |
3415 | ||
3416 | /* | |
3417 | * Clear this flag so that x86's huge_pmd_share page_table_shareable | |
3418 | * test will fail on a vma being torn down, and not grab a page table | |
3419 | * on its way out. We're lucky that the flag has such an appropriate | |
3420 | * name, and can in fact be safely cleared here. We could clear it | |
3421 | * before the __unmap_hugepage_range above, but all that's necessary | |
3422 | * is to clear it before releasing the i_mmap_rwsem. This works | |
3423 | * because in the context this is called, the VMA is about to be | |
3424 | * destroyed and the i_mmap_rwsem is held. | |
3425 | */ | |
3426 | vma->vm_flags &= ~VM_MAYSHARE; | |
3427 | } | |
3428 | ||
3429 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |
3430 | unsigned long end, struct page *ref_page) | |
3431 | { | |
3432 | struct mm_struct *mm; | |
3433 | struct mmu_gather tlb; | |
3434 | ||
3435 | mm = vma->vm_mm; | |
3436 | ||
3437 | tlb_gather_mmu(&tlb, mm, start, end); | |
3438 | __unmap_hugepage_range(&tlb, vma, start, end, ref_page); | |
3439 | tlb_finish_mmu(&tlb, start, end); | |
3440 | } | |
3441 | ||
3442 | /* | |
3443 | * This is called when the original mapper is failing to COW a MAP_PRIVATE | |
3444 | * mapping it owns the reserve page for. The intention is to unmap the page | |
3445 | * from other VMAs and let the children be SIGKILLed if they are faulting the | |
3446 | * same region. | |
3447 | */ | |
3448 | static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |
3449 | struct page *page, unsigned long address) | |
3450 | { | |
3451 | struct hstate *h = hstate_vma(vma); | |
3452 | struct vm_area_struct *iter_vma; | |
3453 | struct address_space *mapping; | |
3454 | pgoff_t pgoff; | |
3455 | ||
3456 | /* | |
3457 | * vm_pgoff is in PAGE_SIZE units, hence the different calculation | |
3458 | * from page cache lookup which is in HPAGE_SIZE units. | |
3459 | */ | |
3460 | address = address & huge_page_mask(h); | |
3461 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + | |
3462 | vma->vm_pgoff; | |
3463 | mapping = vma->vm_file->f_mapping; | |
3464 | ||
3465 | /* | |
3466 | * Take the mapping lock for the duration of the table walk. As | |
3467 | * this mapping is shared between all the VMAs, | |
3468 | * __unmap_hugepage_range() is called with the lock already held. | |
3469 | */ | |
3470 | i_mmap_lock_write(mapping); | |
3471 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { | |
3472 | /* Do not unmap the current VMA */ | |
3473 | if (iter_vma == vma) | |
3474 | continue; | |
3475 | ||
3476 | /* | |
3477 | * Shared VMAs have their own reserves and do not affect | |
3478 | * MAP_PRIVATE accounting but it is possible that a shared | |
3479 | * VMA is using the same page so check and skip such VMAs. | |
3480 | */ | |
3481 | if (iter_vma->vm_flags & VM_MAYSHARE) | |
3482 | continue; | |
3483 | ||
3484 | /* | |
3485 | * Unmap the page from other VMAs without their own reserves. | |
3486 | * They get marked to be SIGKILLed if they fault in these | |
3487 | * areas. This is because a future no-page fault on this VMA | |
3488 | * could insert a zeroed page instead of the data existing | |
3489 | * from the time of fork. This would look like data corruption | |
3490 | */ | |
3491 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | |
3492 | unmap_hugepage_range(iter_vma, address, | |
3493 | address + huge_page_size(h), page); | |
3494 | } | |
3495 | i_mmap_unlock_write(mapping); | |
3496 | } | |
3497 | ||
3498 | /* | |
3499 | * Hugetlb_cow() should be called with page lock of the original hugepage held. | |
3500 | * Called with hugetlb_instantiation_mutex held and pte_page locked so we | |
3501 | * cannot race with other handlers or page migration. | |
3502 | * Keep the pte_same checks anyway to make transition from the mutex easier. | |
3503 | */ | |
3504 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |
3505 | unsigned long address, pte_t *ptep, | |
3506 | struct page *pagecache_page, spinlock_t *ptl) | |
3507 | { | |
3508 | pte_t pte; | |
3509 | struct hstate *h = hstate_vma(vma); | |
3510 | struct page *old_page, *new_page; | |
3511 | int ret = 0, outside_reserve = 0; | |
3512 | unsigned long mmun_start; /* For mmu_notifiers */ | |
3513 | unsigned long mmun_end; /* For mmu_notifiers */ | |
3514 | ||
3515 | pte = huge_ptep_get(ptep); | |
3516 | old_page = pte_page(pte); | |
3517 | ||
3518 | retry_avoidcopy: | |
3519 | /* If no-one else is actually using this page, avoid the copy | |
3520 | * and just make the page writable */ | |
3521 | if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { | |
3522 | page_move_anon_rmap(old_page, vma); | |
3523 | set_huge_ptep_writable(vma, address, ptep); | |
3524 | return 0; | |
3525 | } | |
3526 | ||
3527 | /* | |
3528 | * If the process that created a MAP_PRIVATE mapping is about to | |
3529 | * perform a COW due to a shared page count, attempt to satisfy | |
3530 | * the allocation without using the existing reserves. The pagecache | |
3531 | * page is used to determine if the reserve at this address was | |
3532 | * consumed or not. If reserves were used, a partial faulted mapping | |
3533 | * at the time of fork() could consume its reserves on COW instead | |
3534 | * of the full address range. | |
3535 | */ | |
3536 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && | |
3537 | old_page != pagecache_page) | |
3538 | outside_reserve = 1; | |
3539 | ||
3540 | get_page(old_page); | |
3541 | ||
3542 | /* | |
3543 | * Drop page table lock as buddy allocator may be called. It will | |
3544 | * be acquired again before returning to the caller, as expected. | |
3545 | */ | |
3546 | spin_unlock(ptl); | |
3547 | new_page = alloc_huge_page(vma, address, outside_reserve); | |
3548 | ||
3549 | if (IS_ERR(new_page)) { | |
3550 | /* | |
3551 | * If a process owning a MAP_PRIVATE mapping fails to COW, | |
3552 | * it is due to references held by a child and an insufficient | |
3553 | * huge page pool. To guarantee the original mapper's | |
3554 | * reliability, unmap the page from child processes. The child | |
3555 | * may get SIGKILLed if it later faults. | |
3556 | */ | |
3557 | if (outside_reserve) { | |
3558 | put_page(old_page); | |
3559 | BUG_ON(huge_pte_none(pte)); | |
3560 | unmap_ref_private(mm, vma, old_page, address); | |
3561 | BUG_ON(huge_pte_none(pte)); | |
3562 | spin_lock(ptl); | |
3563 | ptep = huge_pte_offset(mm, address & huge_page_mask(h), | |
3564 | huge_page_size(h)); | |
3565 | if (likely(ptep && | |
3566 | pte_same(huge_ptep_get(ptep), pte))) | |
3567 | goto retry_avoidcopy; | |
3568 | /* | |
3569 | * A race occurred while re-acquiring the page table | |
3570 | * lock, and our job is done. | |
3571 | */ | |
3572 | return 0; | |
3573 | } | |
3574 | ||
3575 | ret = (PTR_ERR(new_page) == -ENOMEM) ? | |
3576 | VM_FAULT_OOM : VM_FAULT_SIGBUS; | |
3577 | goto out_release_old; | |
3578 | } | |
3579 | ||
3580 | /* | |
3581 | * When the original hugepage is a shared one, it does not have | |
3582 | * an anon_vma prepared. | |
3583 | */ | |
3584 | if (unlikely(anon_vma_prepare(vma))) { | |
3585 | ret = VM_FAULT_OOM; | |
3586 | goto out_release_all; | |
3587 | } | |
3588 | ||
3589 | copy_user_huge_page(new_page, old_page, address, vma, | |
3590 | pages_per_huge_page(h)); | |
3591 | __SetPageUptodate(new_page); | |
3592 | set_page_huge_active(new_page); | |
3593 | ||
3594 | mmun_start = address & huge_page_mask(h); | |
3595 | mmun_end = mmun_start + huge_page_size(h); | |
3596 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | |
3597 | ||
3598 | /* | |
3599 | * Retake the page table lock to check for racing updates | |
3600 | * before the page tables are altered | |
3601 | */ | |
3602 | spin_lock(ptl); | |
3603 | ptep = huge_pte_offset(mm, address & huge_page_mask(h), | |
3604 | huge_page_size(h)); | |
3605 | if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { | |
3606 | ClearPagePrivate(new_page); | |
3607 | ||
3608 | /* Break COW */ | |
3609 | huge_ptep_clear_flush(vma, address, ptep); | |
3610 | mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); | |
3611 | set_huge_pte_at(mm, address, ptep, | |
3612 | make_huge_pte(vma, new_page, 1)); | |
3613 | page_remove_rmap(old_page, true); | |
3614 | hugepage_add_new_anon_rmap(new_page, vma, address); | |
3615 | /* Make the old page be freed below */ | |
3616 | new_page = old_page; | |
3617 | } | |
3618 | spin_unlock(ptl); | |
3619 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | |
3620 | out_release_all: | |
3621 | restore_reserve_on_error(h, vma, address, new_page); | |
3622 | put_page(new_page); | |
3623 | out_release_old: | |
3624 | put_page(old_page); | |
3625 | ||
3626 | spin_lock(ptl); /* Caller expects lock to be held */ | |
3627 | return ret; | |
3628 | } | |
3629 | ||
3630 | /* Return the pagecache page at a given address within a VMA */ | |
3631 | static struct page *hugetlbfs_pagecache_page(struct hstate *h, | |
3632 | struct vm_area_struct *vma, unsigned long address) | |
3633 | { | |
3634 | struct address_space *mapping; | |
3635 | pgoff_t idx; | |
3636 | ||
3637 | mapping = vma->vm_file->f_mapping; | |
3638 | idx = vma_hugecache_offset(h, vma, address); | |
3639 | ||
3640 | return find_lock_page(mapping, idx); | |
3641 | } | |
3642 | ||
3643 | /* | |
3644 | * Return whether there is a pagecache page to back the given address within the VMA. | |
3645 | * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. | |
3646 | */ | |
3647 | static bool hugetlbfs_pagecache_present(struct hstate *h, | |
3648 | struct vm_area_struct *vma, unsigned long address) | |
3649 | { | |
3650 | struct address_space *mapping; | |
3651 | pgoff_t idx; | |
3652 | struct page *page; | |
3653 | ||
3654 | mapping = vma->vm_file->f_mapping; | |
3655 | idx = vma_hugecache_offset(h, vma, address); | |
3656 | ||
3657 | page = find_get_page(mapping, idx); | |
3658 | if (page) | |
3659 | put_page(page); | |
3660 | return page != NULL; | |
3661 | } | |
3662 | ||
3663 | int huge_add_to_page_cache(struct page *page, struct address_space *mapping, | |
3664 | pgoff_t idx) | |
3665 | { | |
3666 | struct inode *inode = mapping->host; | |
3667 | struct hstate *h = hstate_inode(inode); | |
3668 | int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); | |
3669 | ||
3670 | if (err) | |
3671 | return err; | |
3672 | ClearPagePrivate(page); | |
3673 | ||
3674 | spin_lock(&inode->i_lock); | |
3675 | inode->i_blocks += blocks_per_huge_page(h); | |
3676 | spin_unlock(&inode->i_lock); | |
3677 | return 0; | |
3678 | } | |
3679 | ||
3680 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |
3681 | struct address_space *mapping, pgoff_t idx, | |
3682 | unsigned long address, pte_t *ptep, unsigned int flags) | |
3683 | { | |
3684 | struct hstate *h = hstate_vma(vma); | |
3685 | int ret = VM_FAULT_SIGBUS; | |
3686 | int anon_rmap = 0; | |
3687 | unsigned long size; | |
3688 | struct page *page; | |
3689 | pte_t new_pte; | |
3690 | spinlock_t *ptl; | |
3691 | unsigned long haddr = address & huge_page_mask(h); | |
3692 | ||
3693 | /* | |
3694 | * Currently, we are forced to kill the process in the event the | |
3695 | * original mapper has unmapped pages from the child due to a failed | |
3696 | * COW. Warn that such a situation has occurred as it may not be obvious | |
3697 | */ | |
3698 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { | |
3699 | pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", | |
3700 | current->pid); | |
3701 | return ret; | |
3702 | } | |
3703 | ||
3704 | /* | |
3705 | * Use page lock to guard against racing truncation | |
3706 | * before we get page_table_lock. | |
3707 | */ | |
3708 | retry: | |
3709 | page = find_lock_page(mapping, idx); | |
3710 | if (!page) { | |
3711 | size = i_size_read(mapping->host) >> huge_page_shift(h); | |
3712 | if (idx >= size) | |
3713 | goto out; | |
3714 | ||
3715 | /* | |
3716 | * Check for page in userfault range | |
3717 | */ | |
3718 | if (userfaultfd_missing(vma)) { | |
3719 | u32 hash; | |
3720 | struct vm_fault vmf = { | |
3721 | .vma = vma, | |
3722 | .address = haddr, | |
3723 | .flags = flags, | |
3724 | /* | |
3725 | * Hard to debug if it ends up being | |
3726 | * used by a callee that assumes | |
3727 | * something about the other | |
3728 | * uninitialized fields... same as in | |
3729 | * memory.c | |
3730 | */ | |
3731 | }; | |
3732 | ||
3733 | /* | |
3734 | * hugetlb_fault_mutex must be dropped before | |
3735 | * handling userfault. Reacquire after handling | |
3736 | * fault to make calling code simpler. | |
3737 | */ | |
3738 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, | |
3739 | idx, haddr); | |
3740 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | |
3741 | ret = handle_userfault(&vmf, VM_UFFD_MISSING); | |
3742 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | |
3743 | goto out; | |
3744 | } | |
3745 | ||
3746 | page = alloc_huge_page(vma, haddr, 0); | |
3747 | if (IS_ERR(page)) { | |
3748 | ret = PTR_ERR(page); | |
3749 | if (ret == -ENOMEM) | |
3750 | ret = VM_FAULT_OOM; | |
3751 | else | |
3752 | ret = VM_FAULT_SIGBUS; | |
3753 | goto out; | |
3754 | } | |
3755 | clear_huge_page(page, address, pages_per_huge_page(h)); | |
3756 | __SetPageUptodate(page); | |
3757 | set_page_huge_active(page); | |
3758 | ||
3759 | if (vma->vm_flags & VM_MAYSHARE) { | |
3760 | int err = huge_add_to_page_cache(page, mapping, idx); | |
3761 | if (err) { | |
3762 | put_page(page); | |
3763 | if (err == -EEXIST) | |
3764 | goto retry; | |
3765 | goto out; | |
3766 | } | |
3767 | } else { | |
3768 | lock_page(page); | |
3769 | if (unlikely(anon_vma_prepare(vma))) { | |
3770 | ret = VM_FAULT_OOM; | |
3771 | goto backout_unlocked; | |
3772 | } | |
3773 | anon_rmap = 1; | |
3774 | } | |
3775 | } else { | |
3776 | /* | |
3777 | * If a memory error occurs between mmap() and fault, some processes | |
3778 | * don't have a hwpoisoned swap entry for the errored virtual address. | |
3779 | * So we need to block the hugepage fault with a PG_hwpoison bit check. | |
3780 | */ | |
3781 | if (unlikely(PageHWPoison(page))) { | |
3782 | ret = VM_FAULT_HWPOISON | | |
3783 | VM_FAULT_SET_HINDEX(hstate_index(h)); | |
3784 | goto backout_unlocked; | |
3785 | } | |
3786 | } | |
3787 | ||
3788 | /* | |
3789 | * If we are going to COW a private mapping later, we examine the | |
3790 | * pending reservations for this page now. This will ensure that | |
3791 | * any allocations necessary to record that reservation occur outside | |
3792 | * the spinlock. | |
3793 | */ | |
3794 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { | |
3795 | if (vma_needs_reservation(h, vma, haddr) < 0) { | |
3796 | ret = VM_FAULT_OOM; | |
3797 | goto backout_unlocked; | |
3798 | } | |
3799 | /* Just decrements count, does not deallocate */ | |
3800 | vma_end_reservation(h, vma, haddr); | |
3801 | } | |
3802 | ||
3803 | ptl = huge_pte_lock(h, mm, ptep); | |
3804 | size = i_size_read(mapping->host) >> huge_page_shift(h); | |
3805 | if (idx >= size) | |
3806 | goto backout; | |
3807 | ||
3808 | ret = 0; | |
3809 | if (!huge_pte_none(huge_ptep_get(ptep))) | |
3810 | goto backout; | |
3811 | ||
3812 | if (anon_rmap) { | |
3813 | ClearPagePrivate(page); | |
3814 | hugepage_add_new_anon_rmap(page, vma, haddr); | |
3815 | } else | |
3816 | page_dup_rmap(page, true); | |
3817 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) | |
3818 | && (vma->vm_flags & VM_SHARED))); | |
3819 | set_huge_pte_at(mm, haddr, ptep, new_pte); | |
3820 | ||
3821 | hugetlb_count_add(pages_per_huge_page(h), mm); | |
3822 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { | |
3823 | /* Optimization, do the COW without a second fault */ | |
3824 | ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); | |
3825 | } | |
3826 | ||
3827 | spin_unlock(ptl); | |
3828 | unlock_page(page); | |
3829 | out: | |
3830 | return ret; | |
3831 | ||
3832 | backout: | |
3833 | spin_unlock(ptl); | |
3834 | backout_unlocked: | |
3835 | unlock_page(page); | |
3836 | restore_reserve_on_error(h, vma, haddr, page); | |
3837 | put_page(page); | |
3838 | goto out; | |
3839 | } | |
3840 | ||
3841 | #ifdef CONFIG_SMP | |
3842 | u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, | |
3843 | struct vm_area_struct *vma, | |
3844 | struct address_space *mapping, | |
3845 | pgoff_t idx, unsigned long address) | |
3846 | { | |
3847 | unsigned long key[2]; | |
3848 | u32 hash; | |
3849 | ||
3850 | if (vma->vm_flags & VM_SHARED) { | |
3851 | key[0] = (unsigned long) mapping; | |
3852 | key[1] = idx; | |
3853 | } else { | |
3854 | key[0] = (unsigned long) mm; | |
3855 | key[1] = address >> huge_page_shift(h); | |
3856 | } | |
3857 | ||
3858 | hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); | |
3859 | ||
3860 | return hash & (num_fault_mutexes - 1); | |
3861 | } | |
3862 | #else | |
3863 | /* | |
3864 | * For uniprocessor systems we always use a single mutex, so just | |
3865 | * return 0 and avoid the hashing overhead. | |
3866 | */ | |
3867 | u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, | |
3868 | struct vm_area_struct *vma, | |
3869 | struct address_space *mapping, | |
3870 | pgoff_t idx, unsigned long address) | |
3871 | { | |
3872 | return 0; | |
3873 | } | |
3874 | #endif | |
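/*
 * Sketch of the serialization pattern used by callers of the hash above
 * (compare hugetlb_fault() below and hugetlb_no_page() above); the helper
 * name is illustrative only:
 */
static inline void example_serialize_hugetlb_fault(struct hstate *h,
		struct mm_struct *mm, struct vm_area_struct *vma,
		struct address_space *mapping, pgoff_t idx,
		unsigned long address)
{
	u32 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);

	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	/* ... instantiate or fault in the page for (mapping, idx) ... */
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}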
3875 | ||
3876 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |
3877 | unsigned long address, unsigned int flags) | |
3878 | { | |
3879 | pte_t *ptep, entry; | |
3880 | spinlock_t *ptl; | |
3881 | int ret; | |
3882 | u32 hash; | |
3883 | pgoff_t idx; | |
3884 | struct page *page = NULL; | |
3885 | struct page *pagecache_page = NULL; | |
3886 | struct hstate *h = hstate_vma(vma); | |
3887 | struct address_space *mapping; | |
3888 | int need_wait_lock = 0; | |
3889 | unsigned long haddr = address & huge_page_mask(h); | |
3890 | ||
3891 | ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); | |
3892 | if (ptep) { | |
3893 | entry = huge_ptep_get(ptep); | |
3894 | if (unlikely(is_hugetlb_entry_migration(entry))) { | |
3895 | migration_entry_wait_huge(vma, mm, ptep); | |
3896 | return 0; | |
3897 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | |
3898 | return VM_FAULT_HWPOISON_LARGE | | |
3899 | VM_FAULT_SET_HINDEX(hstate_index(h)); | |
3900 | } else { | |
3901 | ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); | |
3902 | if (!ptep) | |
3903 | return VM_FAULT_OOM; | |
3904 | } | |
3905 | ||
3906 | mapping = vma->vm_file->f_mapping; | |
3907 | idx = vma_hugecache_offset(h, vma, haddr); | |
3908 | ||
3909 | /* | |
3910 | * Serialize hugepage allocation and instantiation, so that we don't | |
3911 | * get spurious allocation failures if two CPUs race to instantiate | |
3912 | * the same page in the page cache. | |
3913 | */ | |
3914 | hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); | |
3915 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | |
3916 | ||
3917 | entry = huge_ptep_get(ptep); | |
3918 | if (huge_pte_none(entry)) { | |
3919 | ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); | |
3920 | goto out_mutex; | |
3921 | } | |
3922 | ||
3923 | ret = 0; | |
3924 | ||
3925 | /* | |
3926 | * entry could be a migration/hwpoison entry at this point, so this | |
3927 | * check prevents the kernel from going below, assuming that we have | |
3928 | * an active hugepage in the pagecache. This goto expects the 2nd page fault, | |
3929 | * and the is_hugetlb_entry_(migration|hwpoisoned) check will properly | |
3930 | * handle it. | |
3931 | */ | |
3932 | if (!pte_present(entry)) | |
3933 | goto out_mutex; | |
3934 | ||
3935 | /* | |
3936 | * If we are going to COW the mapping later, we examine the pending | |
3937 | * reservations for this page now. This will ensure that any | |
3938 | * allocations necessary to record that reservation occur outside the | |
3939 | * spinlock. For private mappings, we also lookup the pagecache | |
3940 | * page now as it is used to determine if a reservation has been | |
3941 | * consumed. | |
3942 | */ | |
3943 | if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { | |
3944 | if (vma_needs_reservation(h, vma, haddr) < 0) { | |
3945 | ret = VM_FAULT_OOM; | |
3946 | goto out_mutex; | |
3947 | } | |
3948 | /* Just decrements count, does not deallocate */ | |
3949 | vma_end_reservation(h, vma, haddr); | |
3950 | ||
3951 | if (!(vma->vm_flags & VM_MAYSHARE)) | |
3952 | pagecache_page = hugetlbfs_pagecache_page(h, | |
3953 | vma, haddr); | |
3954 | } | |
3955 | ||
3956 | ptl = huge_pte_lock(h, mm, ptep); | |
3957 | ||
3958 | /* Check for a racing update before calling hugetlb_cow */ | |
3959 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | |
3960 | goto out_ptl; | |
3961 | ||
3962 | /* | |
3963 | * hugetlb_cow() requires page locks of pte_page(entry) and | |
3964 | * pagecache_page, so here we need to take the former one | |
3965 | * when page != pagecache_page or !pagecache_page. | |
3966 | */ | |
3967 | page = pte_page(entry); | |
3968 | if (page != pagecache_page) | |
3969 | if (!trylock_page(page)) { | |
3970 | need_wait_lock = 1; | |
3971 | goto out_ptl; | |
3972 | } | |
3973 | ||
3974 | get_page(page); | |
3975 | ||
3976 | if (flags & FAULT_FLAG_WRITE) { | |
3977 | if (!huge_pte_write(entry)) { | |
3978 | ret = hugetlb_cow(mm, vma, haddr, ptep, | |
3979 | pagecache_page, ptl); | |
3980 | goto out_put_page; | |
3981 | } | |
3982 | entry = huge_pte_mkdirty(entry); | |
3983 | } | |
3984 | entry = pte_mkyoung(entry); | |
3985 | if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, | |
3986 | flags & FAULT_FLAG_WRITE)) | |
3987 | update_mmu_cache(vma, haddr, ptep); | |
3988 | out_put_page: | |
3989 | if (page != pagecache_page) | |
3990 | unlock_page(page); | |
3991 | put_page(page); | |
3992 | out_ptl: | |
3993 | spin_unlock(ptl); | |
3994 | ||
3995 | if (pagecache_page) { | |
3996 | unlock_page(pagecache_page); | |
3997 | put_page(pagecache_page); | |
3998 | } | |
3999 | out_mutex: | |
4000 | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | |
4001 | /* | |
4002 | * Generally it's safe to hold a refcount while waiting for the page lock. | |
4003 | * But here we just wait to defer the next page fault, to avoid a busy loop, | |
4004 | * and the page is not used after being unlocked before returning from the | |
4005 | * current page fault. So we are safe from accessing a freed page, even if | |
4006 | * we wait here without taking a refcount. | |
4007 | */ | |
4008 | if (need_wait_lock) | |
4009 | wait_on_page_locked(page); | |
4010 | return ret; | |
4011 | } | |
4012 | ||
4013 | /* | |
4014 | * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with | |
4015 | * modifications for huge pages. | |
4016 | */ | |
4017 | int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, | |
4018 | pte_t *dst_pte, | |
4019 | struct vm_area_struct *dst_vma, | |
4020 | unsigned long dst_addr, | |
4021 | unsigned long src_addr, | |
4022 | struct page **pagep) | |
4023 | { | |
4024 | struct address_space *mapping; | |
4025 | pgoff_t idx; | |
4026 | unsigned long size; | |
4027 | int vm_shared = dst_vma->vm_flags & VM_SHARED; | |
4028 | struct hstate *h = hstate_vma(dst_vma); | |
4029 | pte_t _dst_pte; | |
4030 | spinlock_t *ptl; | |
4031 | int ret; | |
4032 | struct page *page; | |
4033 | ||
4034 | if (!*pagep) { | |
4035 | ret = -ENOMEM; | |
4036 | page = alloc_huge_page(dst_vma, dst_addr, 0); | |
4037 | if (IS_ERR(page)) | |
4038 | goto out; | |
4039 | ||
4040 | ret = copy_huge_page_from_user(page, | |
4041 | (const void __user *) src_addr, | |
4042 | pages_per_huge_page(h), false); | |
4043 | ||
4044 | /* fallback to copy_from_user outside mmap_sem */ | |
4045 | if (unlikely(ret)) { | |
4046 | ret = -EFAULT; | |
4047 | *pagep = page; | |
4048 | /* don't free the page */ | |
4049 | goto out; | |
4050 | } | |
4051 | } else { | |
4052 | page = *pagep; | |
4053 | *pagep = NULL; | |
4054 | } | |
4055 | ||
4056 | /* | |
4057 | * The memory barrier inside __SetPageUptodate makes sure that | |
4058 | * preceding stores to the page contents become visible before | |
4059 | * the set_pte_at() write. | |
4060 | */ | |
4061 | __SetPageUptodate(page); | |
4062 | set_page_huge_active(page); | |
4063 | ||
4064 | mapping = dst_vma->vm_file->f_mapping; | |
4065 | idx = vma_hugecache_offset(h, dst_vma, dst_addr); | |
4066 | ||
4067 | /* | |
4068 | * If shared, add to page cache | |
4069 | */ | |
4070 | if (vm_shared) { | |
4071 | size = i_size_read(mapping->host) >> huge_page_shift(h); | |
4072 | ret = -EFAULT; | |
4073 | if (idx >= size) | |
4074 | goto out_release_nounlock; | |
4075 | ||
4076 | /* | |
4077 | * Serialization between remove_inode_hugepages() and | |
4078 | * huge_add_to_page_cache() below happens through the | |
4079 | * hugetlb_fault_mutex_table that here must be held by | |
4080 | * the caller. | |
4081 | */ | |
4082 | ret = huge_add_to_page_cache(page, mapping, idx); | |
4083 | if (ret) | |
4084 | goto out_release_nounlock; | |
4085 | } | |
4086 | ||
4087 | ptl = huge_pte_lockptr(h, dst_mm, dst_pte); | |
4088 | spin_lock(ptl); | |
4089 | ||
4090 | /* | |
4091 | * Recheck the i_size after holding PT lock to make sure not | |
4092 | * to leave any page mapped (as page_mapped()) beyond the end | |
4093 | * of the i_size (remove_inode_hugepages() is strict about | |
4094 | * enforcing that). If we bail out here, we'll also leave a | |
4095 | * page in the radix tree in the vm_shared case beyond the end | |
4096 | * of the i_size, but remove_inode_hugepages() will take care | |
4097 | * of it as soon as we drop the hugetlb_fault_mutex_table. | |
4098 | */ | |
4099 | size = i_size_read(mapping->host) >> huge_page_shift(h); | |
4100 | ret = -EFAULT; | |
4101 | if (idx >= size) | |
4102 | goto out_release_unlock; | |
4103 | ||
4104 | ret = -EEXIST; | |
4105 | if (!huge_pte_none(huge_ptep_get(dst_pte))) | |
4106 | goto out_release_unlock; | |
4107 | ||
4108 | if (vm_shared) { | |
4109 | page_dup_rmap(page, true); | |
4110 | } else { | |
4111 | ClearPagePrivate(page); | |
4112 | hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); | |
4113 | } | |
4114 | ||
4115 | _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); | |
4116 | if (dst_vma->vm_flags & VM_WRITE) | |
4117 | _dst_pte = huge_pte_mkdirty(_dst_pte); | |
4118 | _dst_pte = pte_mkyoung(_dst_pte); | |
4119 | ||
4120 | set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); | |
4121 | ||
4122 | (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, | |
4123 | dst_vma->vm_flags & VM_WRITE); | |
4124 | hugetlb_count_add(pages_per_huge_page(h), dst_mm); | |
4125 | ||
4126 | /* No need to invalidate - it was non-present before */ | |
4127 | update_mmu_cache(dst_vma, dst_addr, dst_pte); | |
4128 | ||
4129 | spin_unlock(ptl); | |
4130 | if (vm_shared) | |
4131 | unlock_page(page); | |
4132 | ret = 0; | |
4133 | out: | |
4134 | return ret; | |
4135 | out_release_unlock: | |
4136 | spin_unlock(ptl); | |
4137 | if (vm_shared) | |
4138 | unlock_page(page); | |
4139 | out_release_nounlock: | |
4140 | put_page(page); | |
4141 | goto out; | |
4142 | } | |
4143 | ||
4144 | long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |
4145 | struct page **pages, struct vm_area_struct **vmas, | |
4146 | unsigned long *position, unsigned long *nr_pages, | |
4147 | long i, unsigned int flags, int *nonblocking) | |
4148 | { | |
4149 | unsigned long pfn_offset; | |
4150 | unsigned long vaddr = *position; | |
4151 | unsigned long remainder = *nr_pages; | |
4152 | struct hstate *h = hstate_vma(vma); | |
4153 | int err = -EFAULT; | |
4154 | ||
4155 | while (vaddr < vma->vm_end && remainder) { | |
4156 | pte_t *pte; | |
4157 | spinlock_t *ptl = NULL; | |
4158 | int absent; | |
4159 | struct page *page; | |
4160 | ||
4161 | /* | |
4162 | * If we have a pending SIGKILL, don't keep faulting pages and | |
4163 | * potentially allocating memory. | |
4164 | */ | |
4165 | if (unlikely(fatal_signal_pending(current))) { | |
4166 | remainder = 0; | |
4167 | break; | |
4168 | } | |
4169 | ||
4170 | /* | |
4171 | * Some archs (sparc64, sh*) have multiple pte_ts mapped to | |
4172 | * each hugepage. We have to make sure we get the | |
4173 | * first, for the page indexing below to work. | |
4174 | * | |
4175 | * Note that page table lock is not held when pte is null. | |
4176 | */ | |
4177 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), | |
4178 | huge_page_size(h)); | |
4179 | if (pte) | |
4180 | ptl = huge_pte_lock(h, mm, pte); | |
4181 | absent = !pte || huge_pte_none(huge_ptep_get(pte)); | |
4182 | ||
4183 | /* | |
4184 | * When coredumping, it suits get_dump_page if we just return | |
4185 | * an error where there's an empty slot with no huge pagecache | |
4186 | * to back it. This way, we avoid allocating a hugepage, and | |
4187 | * the sparse dumpfile avoids allocating disk blocks, but its | |
4188 | * huge holes still show up with zeroes where they need to be. | |
4189 | */ | |
4190 | if (absent && (flags & FOLL_DUMP) && | |
4191 | !hugetlbfs_pagecache_present(h, vma, vaddr)) { | |
4192 | if (pte) | |
4193 | spin_unlock(ptl); | |
4194 | remainder = 0; | |
4195 | break; | |
4196 | } | |
4197 | ||
4198 | /* | |
4199 | * We need to call hugetlb_fault for both hugepages under migration | |
4200 | * (in which case hugetlb_fault waits for the migration,) and | |
4201 | * hwpoisoned hugepages (in which case we need to prevent the | |
4202 | * caller from accessing them.) In order to do this, we use | |
4203 | * is_swap_pte here instead of is_hugetlb_entry_migration and | |
4204 | * is_hugetlb_entry_hwpoisoned. This is because it simply covers | |
4205 | * both cases, and because we can't follow correct pages | |
4206 | * directly from any kind of swap entries. | |
4207 | */ | |
4208 | if (absent || is_swap_pte(huge_ptep_get(pte)) || | |
4209 | ((flags & FOLL_WRITE) && | |
4210 | !huge_pte_write(huge_ptep_get(pte)))) { | |
4211 | int ret; | |
4212 | unsigned int fault_flags = 0; | |
4213 | ||
4214 | if (pte) | |
4215 | spin_unlock(ptl); | |
4216 | if (flags & FOLL_WRITE) | |
4217 | fault_flags |= FAULT_FLAG_WRITE; | |
4218 | if (nonblocking) | |
4219 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | |
4220 | if (flags & FOLL_NOWAIT) | |
4221 | fault_flags |= FAULT_FLAG_ALLOW_RETRY | | |
4222 | FAULT_FLAG_RETRY_NOWAIT; | |
4223 | if (flags & FOLL_TRIED) { | |
4224 | VM_WARN_ON_ONCE(fault_flags & | |
4225 | FAULT_FLAG_ALLOW_RETRY); | |
4226 | fault_flags |= FAULT_FLAG_TRIED; | |
4227 | } | |
4228 | ret = hugetlb_fault(mm, vma, vaddr, fault_flags); | |
4229 | if (ret & VM_FAULT_ERROR) { | |
4230 | err = vm_fault_to_errno(ret, flags); | |
4231 | remainder = 0; | |
4232 | break; | |
4233 | } | |
4234 | if (ret & VM_FAULT_RETRY) { | |
4235 | if (nonblocking) | |
4236 | *nonblocking = 0; | |
4237 | *nr_pages = 0; | |
4238 | /* | |
4239 | * VM_FAULT_RETRY must not return an | |
4240 | * error, it will return zero | |
4241 | * instead. | |
4242 | * | |
4243 | * No need to update "position" as the | |
4244 | * caller will not check it after | |
4245 | * *nr_pages is set to 0. | |
4246 | */ | |
4247 | return i; | |
4248 | } | |
4249 | continue; | |
4250 | } | |
4251 | ||
4252 | pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; | |
4253 | page = pte_page(huge_ptep_get(pte)); | |
4254 | same_page: | |
4255 | if (pages) { | |
4256 | pages[i] = mem_map_offset(page, pfn_offset); | |
4257 | get_page(pages[i]); | |
4258 | } | |
4259 | ||
4260 | if (vmas) | |
4261 | vmas[i] = vma; | |
4262 | ||
4263 | vaddr += PAGE_SIZE; | |
4264 | ++pfn_offset; | |
4265 | --remainder; | |
4266 | ++i; | |
4267 | if (vaddr < vma->vm_end && remainder && | |
4268 | pfn_offset < pages_per_huge_page(h)) { | |
4269 | /* | |
4270 | * We use pfn_offset to avoid touching the pageframes | |
4271 | * of this compound page. | |
4272 | */ | |
4273 | goto same_page; | |
4274 | } | |
4275 | spin_unlock(ptl); | |
4276 | } | |
4277 | *nr_pages = remainder; | |
4278 | /* | |
4279 | * setting position is actually required only if remainder is | |
4280 | * not zero but it's faster not to add a "if (remainder)" | |
4281 | * branch. | |
4282 | */ | |
4283 | *position = vaddr; | |
4284 | ||
4285 | return i ? i : err; | |
4286 | } | |
4287 | ||
4288 | #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE | |
4289 | /* | |
4290 | * ARCHes with special requirements for evicting HUGETLB backing TLB entries can | |
4291 | * implement this. | |
4292 | */ | |
4293 | #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) | |
4294 | #endif | |
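/*
 * Illustrative override (not from this file): an architecture that needs a
 * dedicated hugetlb TLB flush would define, in its <asm/hugetlb.h>,
 * something along the lines of:
 *
 *	#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
 *	void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
 *				     unsigned long start, unsigned long end);
 *
 * in which case the generic fallback above is not used.
 */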
4295 | ||
4296 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |
4297 | unsigned long address, unsigned long end, pgprot_t newprot) | |
4298 | { | |
4299 | struct mm_struct *mm = vma->vm_mm; | |
4300 | unsigned long start = address; | |
4301 | pte_t *ptep; | |
4302 | pte_t pte; | |
4303 | struct hstate *h = hstate_vma(vma); | |
4304 | unsigned long pages = 0; | |
4305 | ||
4306 | BUG_ON(address >= end); | |
4307 | flush_cache_range(vma, address, end); | |
4308 | ||
4309 | mmu_notifier_invalidate_range_start(mm, start, end); | |
4310 | i_mmap_lock_write(vma->vm_file->f_mapping); | |
4311 | for (; address < end; address += huge_page_size(h)) { | |
4312 | spinlock_t *ptl; | |
4313 | ptep = huge_pte_offset(mm, address, huge_page_size(h)); | |
4314 | if (!ptep) | |
4315 | continue; | |
4316 | ptl = huge_pte_lock(h, mm, ptep); | |
4317 | if (huge_pmd_unshare(mm, &address, ptep)) { | |
4318 | pages++; | |
4319 | spin_unlock(ptl); | |
4320 | continue; | |
4321 | } | |
4322 | pte = huge_ptep_get(ptep); | |
4323 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | |
4324 | spin_unlock(ptl); | |
4325 | continue; | |
4326 | } | |
4327 | if (unlikely(is_hugetlb_entry_migration(pte))) { | |
4328 | swp_entry_t entry = pte_to_swp_entry(pte); | |
4329 | ||
4330 | if (is_write_migration_entry(entry)) { | |
4331 | pte_t newpte; | |
4332 | ||
4333 | make_migration_entry_read(&entry); | |
4334 | newpte = swp_entry_to_pte(entry); | |
4335 | set_huge_swap_pte_at(mm, address, ptep, | |
4336 | newpte, huge_page_size(h)); | |
4337 | pages++; | |
4338 | } | |
4339 | spin_unlock(ptl); | |
4340 | continue; | |
4341 | } | |
4342 | if (!huge_pte_none(pte)) { | |
4343 | pte = huge_ptep_get_and_clear(mm, address, ptep); | |
4344 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); | |
4345 | pte = arch_make_huge_pte(pte, vma, NULL, 0); | |
4346 | set_huge_pte_at(mm, address, ptep, pte); | |
4347 | pages++; | |
4348 | } | |
4349 | spin_unlock(ptl); | |
4350 | } | |
4351 | /* | |
4352 | * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare | |
4353 | * may have cleared our pud entry and done put_page on the page table: | |
4354 | * once we release i_mmap_rwsem, another task can do the final put_page | |
4355 | * and that page table be reused and filled with junk. | |
4356 | */ | |
4357 | flush_hugetlb_tlb_range(vma, start, end); | |
4358 | /* | |
4359 | * No need to call mmu_notifier_invalidate_range() we are downgrading | |
4360 | * page table protection not changing it to point to a new page. | |
4361 | * | |
4362 | * See Documentation/vm/mmu_notifier.rst | |
4363 | */ | |
4364 | i_mmap_unlock_write(vma->vm_file->f_mapping); | |
4365 | mmu_notifier_invalidate_range_end(mm, start, end); | |
4366 | ||
4367 | return pages << h->order; | |
4368 | } | |
4369 | ||
4370 | int hugetlb_reserve_pages(struct inode *inode, | |
4371 | long from, long to, | |
4372 | struct vm_area_struct *vma, | |
4373 | vm_flags_t vm_flags) | |
4374 | { | |
4375 | long ret, chg; | |
4376 | struct hstate *h = hstate_inode(inode); | |
4377 | struct hugepage_subpool *spool = subpool_inode(inode); | |
4378 | struct resv_map *resv_map; | |
4379 | long gbl_reserve; | |
4380 | ||
4381 | /* This should never happen */ | |
4382 | if (from > to) { | |
4383 | VM_WARN(1, "%s called with a negative range\n", __func__); | |
4384 | return -EINVAL; | |
4385 | } | |
4386 | ||
4387 | /* | |
4388 | * Only apply hugepage reservation if asked. At fault time, an | |
4389 | * attempt will be made for VM_NORESERVE to allocate a page | |
4390 | * without using reserves | |
4391 | */ | |
4392 | if (vm_flags & VM_NORESERVE) | |
4393 | return 0; | |
4394 | ||
4395 | /* | |
4396 | * Shared mappings base their reservation on the number of pages that | |
4397 | * are already allocated on behalf of the file. Private mappings need | |
4398 | * to reserve the full area even if read-only as mprotect() may be | |
4399 | * called to make the mapping read-write. Assume !vma is a shm mapping | |
4400 | */ | |
4401 | if (!vma || vma->vm_flags & VM_MAYSHARE) { | |
4402 | resv_map = inode_resv_map(inode); | |
4403 | ||
4404 | chg = region_chg(resv_map, from, to); | |
4405 | ||
4406 | } else { | |
4407 | resv_map = resv_map_alloc(); | |
4408 | if (!resv_map) | |
4409 | return -ENOMEM; | |
4410 | ||
4411 | chg = to - from; | |
4412 | ||
4413 | set_vma_resv_map(vma, resv_map); | |
4414 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); | |
4415 | } | |
4416 | ||
4417 | if (chg < 0) { | |
4418 | ret = chg; | |
4419 | goto out_err; | |
4420 | } | |
4421 | ||
4422 | /* | |
4423 | * There must be enough pages in the subpool for the mapping. If | |
4424 | * the subpool has a minimum size, there may be some global | |
4425 | * reservations already in place (gbl_reserve). | |
4426 | */ | |
4427 | gbl_reserve = hugepage_subpool_get_pages(spool, chg); | |
4428 | if (gbl_reserve < 0) { | |
4429 | ret = -ENOSPC; | |
4430 | goto out_err; | |
4431 | } | |
4432 | ||
4433 | /* | |
4434 | * Check that enough hugepages are available for the reservation. | |
4435 | * Hand the pages back to the subpool if there are not enough. | |
4436 | */ | |
4437 | ret = hugetlb_acct_memory(h, gbl_reserve); | |
4438 | if (ret < 0) { | |
4439 | /* put back original number of pages, chg */ | |
4440 | (void)hugepage_subpool_put_pages(spool, chg); | |
4441 | goto out_err; | |
4442 | } | |
4443 | ||
4444 | /* | |
4445 | * Account for the reservations made. Shared mappings record regions | |
4446 | * that have reservations as they are shared by multiple VMAs. | |
4447 | * When the last VMA disappears, the region map says how much | |
4448 | * the reservation was and the page cache tells how much of | |
4449 | * the reservation was consumed. Private mappings are per-VMA and | |
4450 | * only the consumed reservations are tracked. When the VMA | |
4451 | * disappears, the original reservation is the VMA size and the | |
4452 | * consumed reservations are stored in the map. Hence, nothing | |
4453 | * else has to be done for private mappings here | |
4454 | */ | |
4455 | if (!vma || vma->vm_flags & VM_MAYSHARE) { | |
4456 | long add = region_add(resv_map, from, to); | |
4457 | ||
4458 | if (unlikely(chg > add)) { | |
4459 | /* | |
4460 | * pages in this range were added to the reserve | |
4461 | * map between region_chg and region_add. This | |
4462 | * indicates a race with alloc_huge_page. Adjust | |
4463 | * the subpool and reserve counts modified above | |
4464 | * based on the difference. | |
4465 | */ | |
4466 | long rsv_adjust; | |
4467 | ||
4468 | rsv_adjust = hugepage_subpool_put_pages(spool, | |
4469 | chg - add); | |
4470 | hugetlb_acct_memory(h, -rsv_adjust); | |
4471 | } | |
4472 | } | |
4473 | return 0; | |
4474 | out_err: | |
4475 | if (!vma || vma->vm_flags & VM_MAYSHARE) | |
4476 | /* Don't call region_abort if region_chg failed */ | |
4477 | if (chg >= 0) | |
4478 | region_abort(resv_map, from, to); | |
4479 | if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | |
4480 | kref_put(&resv_map->refs, resv_map_release); | |
4481 | return ret; | |
4482 | } | |
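/*
 * Sketch of a caller (modelled loosely on a hugetlbfs-style ->mmap path;
 * the helper name and the assumption of hugepage-aligned vm_start/vm_end
 * are illustrative only): convert the byte range into hugepage indices and
 * reserve them up front so later faults cannot fail for lack of pages.
 */
static int example_reserve_for_mmap(struct inode *inode,
				    struct vm_area_struct *vma)
{
	struct hstate *h = hstate_inode(inode);
	long from = vma->vm_pgoff >> huge_page_order(h);
	long to = from + ((vma->vm_end - vma->vm_start) >> huge_page_shift(h));

	return hugetlb_reserve_pages(inode, from, to, vma, vma->vm_flags);
}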
4483 | ||
4484 | long hugetlb_unreserve_pages(struct inode *inode, long start, long end, | |
4485 | long freed) | |
4486 | { | |
4487 | struct hstate *h = hstate_inode(inode); | |
4488 | struct resv_map *resv_map = inode_resv_map(inode); | |
4489 | long chg = 0; | |
4490 | struct hugepage_subpool *spool = subpool_inode(inode); | |
4491 | long gbl_reserve; | |
4492 | ||
4493 | if (resv_map) { | |
4494 | chg = region_del(resv_map, start, end); | |
4495 | /* | |
4496 | * region_del() can fail in the rare case where a region | |
4497 | * must be split and another region descriptor can not be | |
4498 | * allocated. If end == LONG_MAX, it will not fail. | |
4499 | */ | |
4500 | if (chg < 0) | |
4501 | return chg; | |
4502 | } | |
4503 | ||
4504 | spin_lock(&inode->i_lock); | |
4505 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); | |
4506 | spin_unlock(&inode->i_lock); | |
4507 | ||
4508 | /* | |
4509 | * If the subpool has a minimum size, the number of global | |
4510 | * reservations to be released may be adjusted. | |
4511 | */ | |
4512 | gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); | |
4513 | hugetlb_acct_memory(h, -gbl_reserve); | |
4514 | ||
4515 | return 0; | |
4516 | } | |
4517 | ||
4518 | #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE | |
4519 | static unsigned long page_table_shareable(struct vm_area_struct *svma, | |
4520 | struct vm_area_struct *vma, | |
4521 | unsigned long addr, pgoff_t idx) | |
4522 | { | |
4523 | unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + | |
4524 | svma->vm_start; | |
4525 | unsigned long sbase = saddr & PUD_MASK; | |
4526 | unsigned long s_end = sbase + PUD_SIZE; | |
4527 | ||
4528 | /* Allow segments to share if only one is marked locked */ | |
4529 | unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; | |
4530 | unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; | |
4531 | ||
4532 | /* | |
4533 | * Match the virtual addresses, permissions and the alignment of the | |
4534 | * page table page. | |
4535 | */ | |
4536 | if (pmd_index(addr) != pmd_index(saddr) || | |
4537 | vm_flags != svm_flags || | |
4538 | sbase < svma->vm_start || svma->vm_end < s_end) | |
4539 | return 0; | |
4540 | ||
4541 | return saddr; | |
4542 | } | |
4543 | ||
4544 | static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) | |
4545 | { | |
4546 | unsigned long base = addr & PUD_MASK; | |
4547 | unsigned long end = base + PUD_SIZE; | |
4548 | ||
4549 | /* | |
4550 | * check on proper vm_flags and page table alignment | |
4551 | */ | |
4552 | if (vma->vm_flags & VM_MAYSHARE && | |
4553 | vma->vm_start <= base && end <= vma->vm_end) | |
4554 | return true; | |
4555 | return false; | |
4556 | } | |
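/*
 * Worked example (assuming x86-64, PUD_SIZE == 1 GiB): a VM_MAYSHARE VMA
 * spanning [1 GiB, 3 GiB) is shareable for any address inside it, because
 * every PUD-aligned 1 GiB window at those addresses lies entirely within
 * the VMA. A VMA spanning [1 GiB + 2 MiB, 2 GiB) is never shareable, since
 * no full PUD-aligned window fits between its vm_start and vm_end.
 */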
4557 | ||
4558 | /* | |
4559 | * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() | |
4560 | * and returns the corresponding pte. While this is not necessary for the | |
4561 | * !shared pmd case because we can allocate the pmd later as well, it makes the | |
4562 | * code much cleaner. pmd allocation is essential for the shared case because | |
4563 | * pud has to be populated inside the same i_mmap_rwsem section - otherwise | |
4564 | * racing tasks could either miss the sharing (see huge_pte_offset) or select a | |
4565 | * bad pmd for sharing. | |
4566 | */ | |
4567 | pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |
4568 | { | |
4569 | struct vm_area_struct *vma = find_vma(mm, addr); | |
4570 | struct address_space *mapping = vma->vm_file->f_mapping; | |
4571 | pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + | |
4572 | vma->vm_pgoff; | |
4573 | struct vm_area_struct *svma; | |
4574 | unsigned long saddr; | |
4575 | pte_t *spte = NULL; | |
4576 | pte_t *pte; | |
4577 | spinlock_t *ptl; | |
4578 | ||
4579 | if (!vma_shareable(vma, addr)) | |
4580 | return (pte_t *)pmd_alloc(mm, pud, addr); | |
4581 | ||
4582 | i_mmap_lock_write(mapping); | |
4583 | vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { | |
4584 | if (svma == vma) | |
4585 | continue; | |
4586 | ||
4587 | saddr = page_table_shareable(svma, vma, addr, idx); | |
4588 | if (saddr) { | |
4589 | spte = huge_pte_offset(svma->vm_mm, saddr, | |
4590 | vma_mmu_pagesize(svma)); | |
4591 | if (spte) { | |
4592 | get_page(virt_to_page(spte)); | |
4593 | break; | |
4594 | } | |
4595 | } | |
4596 | } | |
4597 | ||
4598 | if (!spte) | |
4599 | goto out; | |
4600 | ||
4601 | ptl = huge_pte_lock(hstate_vma(vma), mm, spte); | |
4602 | if (pud_none(*pud)) { | |
4603 | pud_populate(mm, pud, | |
4604 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); | |
4605 | mm_inc_nr_pmds(mm); | |
4606 | } else { | |
4607 | put_page(virt_to_page(spte)); | |
4608 | } | |
4609 | spin_unlock(ptl); | |
4610 | out: | |
4611 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | |
4612 | i_mmap_unlock_write(mapping); | |
4613 | return pte; | |
4614 | } | |
4615 | ||
4616 | /* | |
4617 | * unmap huge page backed by shared pte. | |
4618 | * | |
4619 | * The hugetlb pte page is refcounted at the time of mapping. If the pte is | |
4620 | * shared, as indicated by page_count > 1, unmap is achieved by clearing the | |
4621 | * pud and decrementing the ref count. If count == 1, the pte page is not shared. | |
4622 | * | |
4623 | * called with page table lock held. | |
4624 | * | |
4625 | * returns: 1 successfully unmapped a shared pte page | |
4626 | * 0 the underlying pte page is not shared, or it is the last user | |
4627 | */ | |
4628 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |
4629 | { | |
4630 | pgd_t *pgd = pgd_offset(mm, *addr); | |
4631 | p4d_t *p4d = p4d_offset(pgd, *addr); | |
4632 | pud_t *pud = pud_offset(p4d, *addr); | |
4633 | ||
4634 | BUG_ON(page_count(virt_to_page(ptep)) == 0); | |
4635 | if (page_count(virt_to_page(ptep)) == 1) | |
4636 | return 0; | |
4637 | ||
4638 | pud_clear(pud); | |
4639 | put_page(virt_to_page(ptep)); | |
4640 | mm_dec_nr_pmds(mm); | |
4641 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; | |
4642 | return 1; | |
4643 | } | |
4644 | #define want_pmd_share() (1) | |
4645 | #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ | |
4646 | pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |
4647 | { | |
4648 | return NULL; | |
4649 | } | |
4650 | ||
4651 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |
4652 | { | |
4653 | return 0; | |
4654 | } | |
4655 | #define want_pmd_share() (0) | |
4656 | #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ | |
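/*
 * Sketch of the caller pattern used earlier in this file
 * (__unmap_hugepage_range(), hugetlb_change_protection()): the page table
 * lock is taken first, and a non-zero return from huge_pmd_unshare() means
 * the shared page table was detached, so the caller skips the rest of that
 * iteration. The helper name is illustrative only.
 */
static bool example_try_unshare_pmd(struct mm_struct *mm, struct hstate *h,
				    unsigned long *addr, pte_t *ptep)
{
	spinlock_t *ptl = huge_pte_lock(h, mm, ptep);
	bool unshared = huge_pmd_unshare(mm, addr, ptep) != 0;

	spin_unlock(ptl);
	return unshared;
}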
4657 | ||
4658 | #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB | |
4659 | pte_t *huge_pte_alloc(struct mm_struct *mm, | |
4660 | unsigned long addr, unsigned long sz) | |
4661 | { | |
4662 | pgd_t *pgd; | |
4663 | p4d_t *p4d; | |
4664 | pud_t *pud; | |
4665 | pte_t *pte = NULL; | |
4666 | ||
4667 | pgd = pgd_offset(mm, addr); | |
4668 | p4d = p4d_alloc(mm, pgd, addr); | |
4669 | if (!p4d) | |
4670 | return NULL; | |
4671 | pud = pud_alloc(mm, p4d, addr); | |
4672 | if (pud) { | |
4673 | if (sz == PUD_SIZE) { | |
4674 | pte = (pte_t *)pud; | |
4675 | } else { | |
4676 | BUG_ON(sz != PMD_SIZE); | |
4677 | if (want_pmd_share() && pud_none(*pud)) | |
4678 | pte = huge_pmd_share(mm, addr, pud); | |
4679 | else | |
4680 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | |
4681 | } | |
4682 | } | |
4683 | BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); | |
4684 | ||
4685 | return pte; | |
4686 | } | |
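/*
 * Illustrative sketch (not part of this file): a fault-path style caller,
 * loosely modeled on how a hugetlb fault handler would use huge_pte_alloc().
 * Simplified; locking and error handling are omitted and the function name
 * is made up for the example.
 */
static pte_t *example_alloc_fault_pte(struct mm_struct *mm,
                                      struct vm_area_struct *vma,
                                      unsigned long address)
{
        struct hstate *h = hstate_vma(vma);
        unsigned long haddr = address & huge_page_mask(h);

        /*
         * Round the faulting address down to the huge page boundary and let
         * huge_pte_alloc() pick the level (PUD vs PMD) from the hstate size.
         */
        return huge_pte_alloc(mm, haddr, huge_page_size(h));
}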
4687 | ||
4688 | /* | |
4689 | * huge_pte_offset() - Walk the page table to resolve the hugepage | |
4690 | * entry at address @addr | |
4691 | * | |
4692 | * Return: Pointer to page table or swap entry (PUD or PMD) for | |
4693 | * address @addr, or NULL if a p*d_none() entry is encountered and the | |
4694 | * size @sz doesn't match the hugepage size at this level of the page | |
4695 | * table. | |
4696 | */ | |
4697 | pte_t *huge_pte_offset(struct mm_struct *mm, | |
4698 | unsigned long addr, unsigned long sz) | |
4699 | { | |
4700 | pgd_t *pgd; | |
4701 | p4d_t *p4d; | |
4702 | pud_t *pud; | |
4703 | pmd_t *pmd; | |
4704 | ||
4705 | pgd = pgd_offset(mm, addr); | |
4706 | if (!pgd_present(*pgd)) | |
4707 | return NULL; | |
4708 | p4d = p4d_offset(pgd, addr); | |
4709 | if (!p4d_present(*p4d)) | |
4710 | return NULL; | |
4711 | ||
4712 | pud = pud_offset(p4d, addr); | |
4713 | if (sz != PUD_SIZE && pud_none(*pud)) | |
4714 | return NULL; | |
4715 | /* hugepage or swap? */ | |
4716 | if (pud_huge(*pud) || !pud_present(*pud)) | |
4717 | return (pte_t *)pud; | |
4718 | ||
4719 | pmd = pmd_offset(pud, addr); | |
4720 | if (sz != PMD_SIZE && pmd_none(*pmd)) | |
4721 | return NULL; | |
4722 | /* hugepage or swap? */ | |
4723 | if (pmd_huge(*pmd) || !pmd_present(*pmd)) | |
4724 | return (pte_t *)pmd; | |
4725 | ||
4726 | return NULL; | |
4727 | } | |
4728 | ||
4729 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | |
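/*
 * Illustrative sketch (not part of this file): how the two walkers are
 * commonly paired, loosely modeled on fork-time copying of a hugetlb VMA.
 * The source side only looks up existing entries, the destination side may
 * allocate page tables, and an identical pointer means PMD sharing already
 * wired both mms to the same page table. Simplified and assumed shape.
 */
static int example_copy_walk(struct mm_struct *dst, struct mm_struct *src,
                             struct vm_area_struct *vma)
{
        unsigned long sz = vma_mmu_pagesize(vma);
        unsigned long addr;
        pte_t *src_pte, *dst_pte;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;       /* nothing mapped at this step */
                dst_pte = huge_pte_alloc(dst, addr, sz);
                if (!dst_pte)
                        return -ENOMEM;
                /* shared PMD: both mms already point at the same page table */
                if (dst_pte == src_pte)
                        continue;
                /* ... copy or clear the entry under the pte locks ... */
        }
        return 0;
}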
4730 | ||
4731 | /* | |
4732 | * These functions can be overridden if your architecture needs its own | |
4733 | * behavior. | |
4734 | */ | |
4735 | struct page * __weak | |
4736 | follow_huge_addr(struct mm_struct *mm, unsigned long address, | |
4737 | int write) | |
4738 | { | |
4739 | return ERR_PTR(-EINVAL); | |
4740 | } | |
4741 | ||
4742 | struct page * __weak | |
4743 | follow_huge_pd(struct vm_area_struct *vma, | |
4744 | unsigned long address, hugepd_t hpd, int flags, int pdshift) | |
4745 | { | |
4746 | WARN(1, "hugepd follow called with no support for hugepage directory format\n"); | |
4747 | return NULL; | |
4748 | } | |
4749 | ||
4750 | struct page * __weak | |
4751 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |
4752 | pmd_t *pmd, int flags) | |
4753 | { | |
4754 | struct page *page = NULL; | |
4755 | spinlock_t *ptl; | |
4756 | pte_t pte; | |
4757 | retry: | |
4758 | ptl = pmd_lockptr(mm, pmd); | |
4759 | spin_lock(ptl); | |
4760 | /* | |
4761 | * Make sure that the address range covered by this pmd is not | |
4762 | * unmapped by other threads while we look at it. | |
4763 | */ | |
4764 | if (!pmd_huge(*pmd)) | |
4765 | goto out; | |
4766 | pte = huge_ptep_get((pte_t *)pmd); | |
4767 | if (pte_present(pte)) { | |
4768 | page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); | |
4769 | if (flags & FOLL_GET) | |
4770 | get_page(page); | |
4771 | } else { | |
4772 | if (is_hugetlb_entry_migration(pte)) { | |
4773 | spin_unlock(ptl); | |
4774 | __migration_entry_wait(mm, (pte_t *)pmd, ptl); | |
4775 | goto retry; | |
4776 | } | |
4777 | /* | |
4778 | * hwpoisoned entry is treated as no_page_table in | |
4779 | * follow_page_mask(). | |
4780 | */ | |
4781 | } | |
4782 | out: | |
4783 | spin_unlock(ptl); | |
4784 | return page; | |
4785 | } | |
4786 | ||
4787 | struct page * __weak | |
4788 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | |
4789 | pud_t *pud, int flags) | |
4790 | { | |
4791 | if (flags & FOLL_GET) | |
4792 | return NULL; | |
4793 | ||
4794 | return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); | |
4795 | } | |
4796 | ||
4797 | struct page * __weak | |
4798 | follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) | |
4799 | { | |
4800 | if (flags & FOLL_GET) | |
4801 | return NULL; | |
4802 | ||
4803 | return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); | |
4804 | } | |
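/*
 * Illustrative sketch (not part of this file): the follow_huge_*() helpers
 * above are meant to be called from a follow_page()-style walker once it
 * finds a leaf entry at the corresponding level. Simplified, assumed shape
 * of such a dispatch; the real walker handles many more cases.
 */
static struct page *example_follow_level(struct mm_struct *mm,
                                         unsigned long address,
                                         pgd_t *pgd, pud_t *pud, pmd_t *pmd,
                                         int flags)
{
        if (pgd_huge(*pgd))                     /* leaf at the PGD level */
                return follow_huge_pgd(mm, address, pgd, flags);
        if (pud_huge(*pud))                     /* leaf at the PUD level */
                return follow_huge_pud(mm, address, pud, flags);
        if (pmd_huge(*pmd))                     /* leaf at the PMD level */
                return follow_huge_pmd(mm, address, pmd, flags);
        return NULL;                            /* fall back to normal ptes */
}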
4805 | ||
4806 | bool isolate_huge_page(struct page *page, struct list_head *list) | |
4807 | { | |
4808 | bool ret = true; | |
4809 | ||
4810 | VM_BUG_ON_PAGE(!PageHead(page), page); | |
4811 | spin_lock(&hugetlb_lock); | |
4812 | if (!page_huge_active(page) || !get_page_unless_zero(page)) { | |
4813 | ret = false; | |
4814 | goto unlock; | |
4815 | } | |
4816 | clear_page_huge_active(page); | |
4817 | list_move_tail(&page->lru, list); | |
4818 | unlock: | |
4819 | spin_unlock(&hugetlb_lock); | |
4820 | return ret; | |
4821 | } | |
4822 | ||
4823 | void putback_active_hugepage(struct page *page) | |
4824 | { | |
4825 | VM_BUG_ON_PAGE(!PageHead(page), page); | |
4826 | spin_lock(&hugetlb_lock); | |
4827 | set_page_huge_active(page); | |
4828 | list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); | |
4829 | spin_unlock(&hugetlb_lock); | |
4830 | put_page(page); | |
4831 | } | |
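/*
 * Illustrative sketch (not part of this file): the usual migration pattern
 * around isolate_huge_page() and putback. Loosely modeled on hotplug-style
 * callers and simplified; 'alloc_new_page' is an assumed allocation callback
 * supplied by the caller, and the reason code is only an example.
 */
static int example_migrate_huge(struct page *page, new_page_t alloc_new_page)
{
        LIST_HEAD(pagelist);
        int err;

        if (!isolate_huge_page(page, &pagelist))
                return -EBUSY;          /* not active, or reference gone */

        err = migrate_pages(&pagelist, alloc_new_page, NULL, 0,
                            MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
        if (err)
                /* failed pages are returned via putback_active_hugepage() */
                putback_movable_pages(&pagelist);
        return err;
}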
4832 | ||
4833 | void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) | |
4834 | { | |
4835 | struct hstate *h = page_hstate(oldpage); | |
4836 | ||
4837 | hugetlb_cgroup_migrate(oldpage, newpage); | |
4838 | set_page_owner_migrate_reason(newpage, reason); | |
4839 | ||
4840 | /* | |
4841 | * Transfer the temporary state of the new huge page. This is the | |
4842 | * reverse of other transitions because the new page is going to | |
4843 | * be final while the old one will be freed, so the old page takes | |
4844 | * over the temporary status. | |
4845 | * | |
4846 | * Also note that we have to transfer the per-node surplus state | |
4847 | * here as well, otherwise the global surplus count will not match | |
4848 | * the per-node counts. | |
4849 | */ | |
4850 | if (PageHugeTemporary(newpage)) { | |
4851 | int old_nid = page_to_nid(oldpage); | |
4852 | int new_nid = page_to_nid(newpage); | |
4853 | ||
4854 | SetPageHugeTemporary(oldpage); | |
4855 | ClearPageHugeTemporary(newpage); | |
4856 | ||
4857 | spin_lock(&hugetlb_lock); | |
4858 | if (h->surplus_huge_pages_node[old_nid]) { | |
4859 | h->surplus_huge_pages_node[old_nid]--; | |
4860 | h->surplus_huge_pages_node[new_nid]++; | |
4861 | } | |
4862 | spin_unlock(&hugetlb_lock); | |
4863 | } | |
4864 | } |
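/*
 * Illustrative sketch (not part of this file): where move_hugetlb_state()
 * fits in huge page migration, loosely modeled on the migration core's
 * unmap-and-move path for hugetlb pages. Simplified and assumed shape.
 */
static void example_finish_huge_migration(struct page *old_hpage,
                                          struct page *new_hpage, int reason)
{
        /*
         * Once contents and mappings have been moved to new_hpage, transfer
         * cgroup charge, page owner info and the "temporary" surplus
         * accounting from the old page to the new one.
         */
        move_hugetlb_state(old_hpage, new_hpage, reason);

        /* drop the isolation reference taken when the old page was isolated */
        putback_active_hugepage(old_hpage);
}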