Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Generic hugetlb support.
	3	* (C) William Irwin, April 2004
	4	*/
	5	#include <linux/gfp.h>
	6	#include <linux/list.h>
	7	#include <linux/init.h>
	8	#include <linux/module.h>
	9	#include <linux/mm.h>
	10	#include <linux/sysctl.h>
	11	#include <linux/highmem.h>
	12	#include <linux/nodemask.h>
	13	#include <linux/pagemap.h>
	14	#include <linux/mempolicy.h>
	15	#include <linux/cpuset.h>
	16	#include <linux/mutex.h>
	17
	18	#include <asm/page.h>
	19	#include <asm/pgtable.h>
	20
	21	#include <linux/hugetlb.h>
	22	#include "internal.h"
	23
	24	const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
	25	static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
	26	static unsigned long surplus_huge_pages;
	27	unsigned long max_huge_pages;
	28	static struct list_head hugepage_freelists[MAX_NUMNODES];
	29	static unsigned int nr_huge_pages_node[MAX_NUMNODES];
	30	static unsigned int free_huge_pages_node[MAX_NUMNODES];
	31	static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
	32	static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
	33	unsigned long hugepages_treat_as_movable;
	34	int hugetlb_dynamic_pool;
	35	static int hugetlb_next_nid;
	36
	37	/*
	38	* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
	39	*/
	40	static DEFINE_SPINLOCK(hugetlb_lock);
	41
	42	static void clear_huge_page(struct page *page, unsigned long addr)
	43	{
	44	int i;
	45
	46	might_sleep();
	47	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
	48	cond_resched();
	49	clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	50	}
	51	}
	52
	53	static void copy_huge_page(struct page dst, struct page src,
	54	unsigned long addr, struct vm_area_struct *vma)
	55	{
	56	int i;
	57
	58	might_sleep();
	59	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
	60	cond_resched();
	61	copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	62	}
	63	}
	64
	65	static void enqueue_huge_page(struct page *page)
	66	{
	67	int nid = page_to_nid(page);
	68	list_add(&page->lru, &hugepage_freelists[nid]);
	69	free_huge_pages++;
	70	free_huge_pages_node[nid]++;
	71	}
	72
	73	static struct page dequeue_huge_page(struct vm_area_struct vma,
	74	unsigned long address)
	75	{
	76	int nid;
	77	struct page *page = NULL;
	78	struct mempolicy *mpol;
	79	struct zonelist *zonelist = huge_zonelist(vma, address,
	80	htlb_alloc_mask, &mpol);
	81	struct zone **z;
	82
	83	for (z = zonelist->zones; *z; z++) {
	84	nid = zone_to_nid(*z);
	85	if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
	86	!list_empty(&hugepage_freelists[nid])) {
	87	page = list_entry(hugepage_freelists[nid].next,
	88	struct page, lru);
	89	list_del(&page->lru);
	90	free_huge_pages--;
	91	free_huge_pages_node[nid]--;
	92	if (vma && vma->vm_flags & VM_MAYSHARE)
	93	resv_huge_pages--;
	94	break;
	95	}
	96	}
	97	mpol_free(mpol); /* unref if mpol !NULL */
	98	return page;
	99	}
	100
	101	static void update_and_free_page(struct page *page)
	102	{
	103	int i;
	104	nr_huge_pages--;
	105	nr_huge_pages_node[page_to_nid(page)]--;
	106	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
	107	page[i].flags &= ~(1 << PG_locked \| 1 << PG_error \| 1 << PG_referenced \|
	108	1 << PG_dirty \| 1 << PG_active \| 1 << PG_reserved \|
	109	1 << PG_private \| 1<< PG_writeback);
	110	}
	111	set_compound_page_dtor(page, NULL);
	112	set_page_refcounted(page);
	113	__free_pages(page, HUGETLB_PAGE_ORDER);
	114	}
	115
	116	static void free_huge_page(struct page *page)
	117	{
	118	int nid = page_to_nid(page);
	119
	120	BUG_ON(page_count(page));
	121	INIT_LIST_HEAD(&page->lru);
	122
	123	spin_lock(&hugetlb_lock);
	124	if (surplus_huge_pages_node[nid]) {
	125	update_and_free_page(page);
	126	surplus_huge_pages--;
	127	surplus_huge_pages_node[nid]--;
	128	} else {
	129	enqueue_huge_page(page);
	130	}
	131	spin_unlock(&hugetlb_lock);
	132	}
	133
	134	/*
	135	* Increment or decrement surplus_huge_pages. Keep node-specific counters
	136	* balanced by operating on them in a round-robin fashion.
	137	* Returns 1 if an adjustment was made.
	138	*/
	139	static int adjust_pool_surplus(int delta)
	140	{
	141	static int prev_nid;
	142	int nid = prev_nid;
	143	int ret = 0;
	144
	145	VM_BUG_ON(delta != -1 && delta != 1);
	146	do {
	147	nid = next_node(nid, node_online_map);
	148	if (nid == MAX_NUMNODES)
	149	nid = first_node(node_online_map);
	150
	151	/* To shrink on this node, there must be a surplus page */
	152	if (delta < 0 && !surplus_huge_pages_node[nid])
	153	continue;
	154	/* Surplus cannot exceed the total number of pages */
	155	if (delta > 0 && surplus_huge_pages_node[nid] >=
	156	nr_huge_pages_node[nid])
	157	continue;
	158
	159	surplus_huge_pages += delta;
	160	surplus_huge_pages_node[nid] += delta;
	161	ret = 1;
	162	break;
	163	} while (nid != prev_nid);
	164
	165	prev_nid = nid;
	166	return ret;
	167	}
	168
	169	static struct page *alloc_fresh_huge_page_node(int nid)
	170	{
	171	struct page *page;
	172
	173	page = alloc_pages_node(nid,
	174	htlb_alloc_mask\|__GFP_COMP\|__GFP_THISNODE\|__GFP_NOWARN,
	175	HUGETLB_PAGE_ORDER);
	176	if (page) {
	177	set_compound_page_dtor(page, free_huge_page);
	178	spin_lock(&hugetlb_lock);
	179	nr_huge_pages++;
	180	nr_huge_pages_node[nid]++;
	181	spin_unlock(&hugetlb_lock);
	182	put_page(page); /* free it into the hugepage allocator */
	183	}
	184
	185	return page;
	186	}
	187
	188	static int alloc_fresh_huge_page(void)
	189	{
	190	struct page *page;
	191	int start_nid;
	192	int next_nid;
	193	int ret = 0;
	194
	195	start_nid = hugetlb_next_nid;
	196
	197	do {
	198	page = alloc_fresh_huge_page_node(hugetlb_next_nid);
	199	if (page)
	200	ret = 1;
	201	/*
	202	* Use a helper variable to find the next node and then
	203	* copy it back to hugetlb_next_nid afterwards:
	204	* otherwise there's a window in which a racer might
	205	* pass invalid nid MAX_NUMNODES to alloc_pages_node.
	206	* But we don't need to use a spin_lock here: it really
	207	* doesn't matter if occasionally a racer chooses the
	208	* same nid as we do. Move nid forward in the mask even
	209	* if we just successfully allocated a hugepage so that
	210	* the next caller gets hugepages on the next node.
	211	*/
	212	next_nid = next_node(hugetlb_next_nid, node_online_map);
	213	if (next_nid == MAX_NUMNODES)
	214	next_nid = first_node(node_online_map);
	215	hugetlb_next_nid = next_nid;
	216	} while (!page && hugetlb_next_nid != start_nid);
	217
	218	return ret;
	219	}
	220
	221	static struct page alloc_buddy_huge_page(struct vm_area_struct vma,
	222	unsigned long address)
	223	{
	224	struct page *page;
	225
	226	/* Check if the dynamic pool is enabled */
	227	if (!hugetlb_dynamic_pool)
	228	return NULL;
	229
	230	page = alloc_pages(htlb_alloc_mask\|__GFP_COMP\|__GFP_NOWARN,
	231	HUGETLB_PAGE_ORDER);
	232	if (page) {
	233	set_compound_page_dtor(page, free_huge_page);
	234	spin_lock(&hugetlb_lock);
	235	nr_huge_pages++;
	236	nr_huge_pages_node[page_to_nid(page)]++;
	237	surplus_huge_pages++;
	238	surplus_huge_pages_node[page_to_nid(page)]++;
	239	spin_unlock(&hugetlb_lock);
	240	}
	241
	242	return page;
	243	}
	244
	245	/*
	246	* Increase the hugetlb pool such that it can accomodate a reservation
	247	* of size 'delta'.
	248	*/
	249	static int gather_surplus_pages(int delta)
	250	{
	251	struct list_head surplus_list;
	252	struct page page, tmp;
	253	int ret, i;
	254	int needed, allocated;
	255
	256	needed = (resv_huge_pages + delta) - free_huge_pages;
	257	if (needed <= 0)
	258	return 0;
	259
	260	allocated = 0;
	261	INIT_LIST_HEAD(&surplus_list);
	262
	263	ret = -ENOMEM;
	264	retry:
	265	spin_unlock(&hugetlb_lock);
	266	for (i = 0; i < needed; i++) {
	267	page = alloc_buddy_huge_page(NULL, 0);
	268	if (!page) {
	269	/*
	270	* We were not able to allocate enough pages to
	271	* satisfy the entire reservation so we free what
	272	* we've allocated so far.
	273	*/
	274	spin_lock(&hugetlb_lock);
	275	needed = 0;
	276	goto free;
	277	}
	278
	279	list_add(&page->lru, &surplus_list);
	280	}
	281	allocated += needed;
	282
	283	/*
	284	* After retaking hugetlb_lock, we need to recalculate 'needed'
	285	* because either resv_huge_pages or free_huge_pages may have changed.
	286	*/
	287	spin_lock(&hugetlb_lock);
	288	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
	289	if (needed > 0)
	290	goto retry;
	291
	292	/*
	293	* The surplus_list now contains _at_least_ the number of extra pages
	294	* needed to accomodate the reservation. Add the appropriate number
	295	* of pages to the hugetlb pool and free the extras back to the buddy
	296	* allocator.
	297	*/
	298	needed += allocated;
	299	ret = 0;
	300	free:
	301	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
	302	list_del(&page->lru);
	303	if ((--needed) >= 0)
	304	enqueue_huge_page(page);
	305	else {
	306	/*
	307	* Decrement the refcount and free the page using its
	308	* destructor. This must be done with hugetlb_lock
	309	* unlocked which is safe because free_huge_page takes
	310	* hugetlb_lock before deciding how to free the page.
	311	*/
	312	spin_unlock(&hugetlb_lock);
	313	put_page(page);
	314	spin_lock(&hugetlb_lock);
	315	}
	316	}
	317
	318	return ret;
	319	}
	320
	321	/*
	322	* When releasing a hugetlb pool reservation, any surplus pages that were
	323	* allocated to satisfy the reservation must be explicitly freed if they were
	324	* never used.
	325	*/
	326	void return_unused_surplus_pages(unsigned long unused_resv_pages)
	327	{
	328	static int nid = -1;
	329	struct page *page;
	330	unsigned long nr_pages;
	331
	332	nr_pages = min(unused_resv_pages, surplus_huge_pages);
	333
	334	while (nr_pages) {
	335	nid = next_node(nid, node_online_map);
	336	if (nid == MAX_NUMNODES)
	337	nid = first_node(node_online_map);
	338
	339	if (!surplus_huge_pages_node[nid])
	340	continue;
	341
	342	if (!list_empty(&hugepage_freelists[nid])) {
	343	page = list_entry(hugepage_freelists[nid].next,
	344	struct page, lru);
	345	list_del(&page->lru);
	346	update_and_free_page(page);
	347	free_huge_pages--;
	348	free_huge_pages_node[nid]--;
	349	surplus_huge_pages--;
	350	surplus_huge_pages_node[nid]--;
	351	nr_pages--;
	352	}
	353	}
	354	}
	355
	356	static struct page alloc_huge_page(struct vm_area_struct vma,
	357	unsigned long addr)
	358	{
	359	struct page *page = NULL;
	360	int use_reserved_page = vma->vm_flags & VM_MAYSHARE;
	361
	362	spin_lock(&hugetlb_lock);
	363	if (!use_reserved_page && (free_huge_pages <= resv_huge_pages))
	364	goto fail;
	365
	366	page = dequeue_huge_page(vma, addr);
	367	if (!page)
	368	goto fail;
	369
	370	spin_unlock(&hugetlb_lock);
	371	set_page_refcounted(page);
	372	return page;
	373
	374	fail:
	375	spin_unlock(&hugetlb_lock);
	376
	377	/*
	378	* Private mappings do not use reserved huge pages so the allocation
	379	* may have failed due to an undersized hugetlb pool. Try to grab a
	380	* surplus huge page from the buddy allocator.
	381	*/
	382	if (!use_reserved_page)
	383	page = alloc_buddy_huge_page(vma, addr);
	384
	385	return page;
	386	}
	387
	388	static int __init hugetlb_init(void)
	389	{
	390	unsigned long i;
	391
	392	if (HPAGE_SHIFT == 0)
	393	return 0;
	394
	395	for (i = 0; i < MAX_NUMNODES; ++i)
	396	INIT_LIST_HEAD(&hugepage_freelists[i]);
	397
	398	hugetlb_next_nid = first_node(node_online_map);
	399
	400	for (i = 0; i < max_huge_pages; ++i) {
	401	if (!alloc_fresh_huge_page())
	402	break;
	403	}
	404	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	405	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	406	return 0;
	407	}
	408	module_init(hugetlb_init);
	409
	410	static int __init hugetlb_setup(char *s)
	411	{
	412	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
	413	max_huge_pages = 0;
	414	return 1;
	415	}
	416	__setup("hugepages=", hugetlb_setup);
	417
	418	static unsigned int cpuset_mems_nr(unsigned int *array)
	419	{
	420	int node;
	421	unsigned int nr = 0;
	422
	423	for_each_node_mask(node, cpuset_current_mems_allowed)
	424	nr += array[node];
	425
	426	return nr;
	427	}
	428
	429	#ifdef CONFIG_SYSCTL
	430	#ifdef CONFIG_HIGHMEM
	431	static void try_to_free_low(unsigned long count)
	432	{
	433	int i;
	434
	435	for (i = 0; i < MAX_NUMNODES; ++i) {
	436	struct page page, next;
	437	list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
	438	if (count >= nr_huge_pages)
	439	return;
	440	if (PageHighMem(page))
	441	continue;
	442	list_del(&page->lru);
	443	update_and_free_page(page);
	444	free_huge_pages--;
	445	free_huge_pages_node[page_to_nid(page)]--;
	446	}
	447	}
	448	}
	449	#else
	450	static inline void try_to_free_low(unsigned long count)
	451	{
	452	}
	453	#endif
	454
	455	#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
	456	static unsigned long set_max_huge_pages(unsigned long count)
	457	{
	458	unsigned long min_count, ret;
	459
	460	/*
	461	* Increase the pool size
	462	* First take pages out of surplus state. Then make up the
	463	* remaining difference by allocating fresh huge pages.
	464	*/
	465	spin_lock(&hugetlb_lock);
	466	while (surplus_huge_pages && count > persistent_huge_pages) {
	467	if (!adjust_pool_surplus(-1))
	468	break;
	469	}
	470
	471	while (count > persistent_huge_pages) {
	472	int ret;
	473	/*
	474	* If this allocation races such that we no longer need the
	475	* page, free_huge_page will handle it by freeing the page
	476	* and reducing the surplus.
	477	*/
	478	spin_unlock(&hugetlb_lock);
	479	ret = alloc_fresh_huge_page();
	480	spin_lock(&hugetlb_lock);
	481	if (!ret)
	482	goto out;
	483
	484	}
	485
	486	/*
	487	* Decrease the pool size
	488	* First return free pages to the buddy allocator (being careful
	489	* to keep enough around to satisfy reservations). Then place
	490	* pages into surplus state as needed so the pool will shrink
	491	* to the desired size as pages become free.
	492	*/
	493	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	494	min_count = max(count, min_count);
	495	try_to_free_low(min_count);
	496	while (min_count < persistent_huge_pages) {
	497	struct page *page = dequeue_huge_page(NULL, 0);
	498	if (!page)
	499	break;
	500	update_and_free_page(page);
	501	}
	502	while (count < persistent_huge_pages) {
	503	if (!adjust_pool_surplus(1))
	504	break;
	505	}
	506	out:
	507	ret = persistent_huge_pages;
	508	spin_unlock(&hugetlb_lock);
	509	return ret;
	510	}
	511
	512	int hugetlb_sysctl_handler(struct ctl_table *table, int write,
	513	struct file file, void __user buffer,
	514	size_t length, loff_t ppos)
	515	{
	516	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	517	max_huge_pages = set_max_huge_pages(max_huge_pages);
	518	return 0;
	519	}
	520
	521	int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
	522	struct file file, void __user buffer,
	523	size_t length, loff_t ppos)
	524	{
	525	proc_dointvec(table, write, file, buffer, length, ppos);
	526	if (hugepages_treat_as_movable)
	527	htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	528	else
	529	htlb_alloc_mask = GFP_HIGHUSER;
	530	return 0;
	531	}
	532
	533	#endif /* CONFIG_SYSCTL */
	534
	535	int hugetlb_report_meminfo(char *buf)
	536	{
	537	return sprintf(buf,
	538	"HugePages_Total: %5lu\n"
	539	"HugePages_Free: %5lu\n"
	540	"HugePages_Rsvd: %5lu\n"
	541	"HugePages_Surp: %5lu\n"
	542	"Hugepagesize: %5lu kB\n",
	543	nr_huge_pages,
	544	free_huge_pages,
	545	resv_huge_pages,
	546	surplus_huge_pages,
	547	HPAGE_SIZE/1024);
	548	}
	549
	550	int hugetlb_report_node_meminfo(int nid, char *buf)
	551	{
	552	return sprintf(buf,
	553	"Node %d HugePages_Total: %5u\n"
	554	"Node %d HugePages_Free: %5u\n",
	555	nid, nr_huge_pages_node[nid],
	556	nid, free_huge_pages_node[nid]);
	557	}
	558
	559	/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
	560	unsigned long hugetlb_total_pages(void)
	561	{
	562	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
	563	}
	564
	565	/*
	566	* We cannot handle pagefaults against hugetlb pages at all. They cause
	567	* handle_mm_fault() to try to instantiate regular-sized pages in the
	568	* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
	569	* this far.
	570	*/
	571	static int hugetlb_vm_op_fault(struct vm_area_struct vma, struct vm_fault vmf)
	572	{
	573	BUG();
	574	return 0;
	575	}
	576
	577	struct vm_operations_struct hugetlb_vm_ops = {
	578	.fault = hugetlb_vm_op_fault,
	579	};
	580
	581	static pte_t make_huge_pte(struct vm_area_struct vma, struct page page,
	582	int writable)
	583	{
	584	pte_t entry;
	585
	586	if (writable) {
	587	entry =
	588	pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	589	} else {
	590	entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	591	}
	592	entry = pte_mkyoung(entry);
	593	entry = pte_mkhuge(entry);
	594
	595	return entry;
	596	}
	597
	598	static void set_huge_ptep_writable(struct vm_area_struct *vma,
	599	unsigned long address, pte_t *ptep)
	600	{
	601	pte_t entry;
	602
	603	entry = pte_mkwrite(pte_mkdirty(*ptep));
	604	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
	605	update_mmu_cache(vma, address, entry);
	606	}
	607	}
	608
	609
	610	int copy_hugetlb_page_range(struct mm_struct dst, struct mm_struct src,
	611	struct vm_area_struct *vma)
	612	{
	613	pte_t src_pte, dst_pte, entry;
	614	struct page *ptepage;
	615	unsigned long addr;
	616	int cow;
	617
	618	cow = (vma->vm_flags & (VM_SHARED \| VM_MAYWRITE)) == VM_MAYWRITE;
	619
	620	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
	621	src_pte = huge_pte_offset(src, addr);
	622	if (!src_pte)
	623	continue;
	624	dst_pte = huge_pte_alloc(dst, addr);
	625	if (!dst_pte)
	626	goto nomem;
	627	spin_lock(&dst->page_table_lock);
	628	spin_lock(&src->page_table_lock);
	629	if (!pte_none(*src_pte)) {
	630	if (cow)
	631	ptep_set_wrprotect(src, addr, src_pte);
	632	entry = *src_pte;
	633	ptepage = pte_page(entry);
	634	get_page(ptepage);
	635	set_huge_pte_at(dst, addr, dst_pte, entry);
	636	}
	637	spin_unlock(&src->page_table_lock);
	638	spin_unlock(&dst->page_table_lock);
	639	}
	640	return 0;
	641
	642	nomem:
	643	return -ENOMEM;
	644	}
	645
	646	void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
	647	unsigned long end)
	648	{
	649	struct mm_struct *mm = vma->vm_mm;
	650	unsigned long address;
	651	pte_t *ptep;
	652	pte_t pte;
	653	struct page *page;
	654	struct page *tmp;
	655	/*
	656	* A page gathering list, protected by per file i_mmap_lock. The
	657	* lock is used to avoid list corruption from multiple unmapping
	658	* of the same page since we are using page->lru.
	659	*/
	660	LIST_HEAD(page_list);
	661
	662	WARN_ON(!is_vm_hugetlb_page(vma));
	663	BUG_ON(start & ~HPAGE_MASK);
	664	BUG_ON(end & ~HPAGE_MASK);
	665
	666	spin_lock(&mm->page_table_lock);
	667	for (address = start; address < end; address += HPAGE_SIZE) {
	668	ptep = huge_pte_offset(mm, address);
	669	if (!ptep)
	670	continue;
	671
	672	if (huge_pmd_unshare(mm, &address, ptep))
	673	continue;
	674
	675	pte = huge_ptep_get_and_clear(mm, address, ptep);
	676	if (pte_none(pte))
	677	continue;
	678
	679	page = pte_page(pte);
	680	if (pte_dirty(pte))
	681	set_page_dirty(page);
	682	list_add(&page->lru, &page_list);
	683	}
	684	spin_unlock(&mm->page_table_lock);
	685	flush_tlb_range(vma, start, end);
	686	list_for_each_entry_safe(page, tmp, &page_list, lru) {
	687	list_del(&page->lru);
	688	put_page(page);
	689	}
	690	}
	691
	692	void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
	693	unsigned long end)
	694	{
	695	/*
	696	* It is undesirable to test vma->vm_file as it should be non-null
	697	* for valid hugetlb area. However, vm_file will be NULL in the error
	698	* cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
	699	* do_mmap_pgoff() nullifies vma->vm_file before calling this function
	700	* to clean up. Since no pte has actually been setup, it is safe to
	701	* do nothing in this case.
	702	*/
	703	if (vma->vm_file) {
	704	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	705	__unmap_hugepage_range(vma, start, end);
	706	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	707	}
	708	}
	709
	710	static int hugetlb_cow(struct mm_struct mm, struct vm_area_struct vma,
	711	unsigned long address, pte_t *ptep, pte_t pte)
	712	{
	713	struct page old_page, new_page;
	714	int avoidcopy;
	715
	716	old_page = pte_page(pte);
	717
	718	/* If no-one else is actually using this page, avoid the copy
	719	* and just make the page writable */
	720	avoidcopy = (page_count(old_page) == 1);
	721	if (avoidcopy) {
	722	set_huge_ptep_writable(vma, address, ptep);
	723	return 0;
	724	}
	725
	726	page_cache_get(old_page);
	727	new_page = alloc_huge_page(vma, address);
	728
	729	if (!new_page) {
	730	page_cache_release(old_page);
	731	return VM_FAULT_OOM;
	732	}
	733
	734	spin_unlock(&mm->page_table_lock);
	735	copy_huge_page(new_page, old_page, address, vma);
	736	spin_lock(&mm->page_table_lock);
	737
	738	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	739	if (likely(pte_same(*ptep, pte))) {
	740	/* Break COW */
	741	set_huge_pte_at(mm, address, ptep,
	742	make_huge_pte(vma, new_page, 1));
	743	/* Make the old page be freed below */
	744	new_page = old_page;
	745	}
	746	page_cache_release(new_page);
	747	page_cache_release(old_page);
	748	return 0;
	749	}
	750
	751	static int hugetlb_no_page(struct mm_struct mm, struct vm_area_struct vma,
	752	unsigned long address, pte_t *ptep, int write_access)
	753	{
	754	int ret = VM_FAULT_SIGBUS;
	755	unsigned long idx;
	756	unsigned long size;
	757	struct page *page;
	758	struct address_space *mapping;
	759	pte_t new_pte;
	760
	761	mapping = vma->vm_file->f_mapping;
	762	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
	763	+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
	764
	765	/*
	766	* Use page lock to guard against racing truncation
	767	* before we get page_table_lock.
	768	*/
	769	retry:
	770	page = find_lock_page(mapping, idx);
	771	if (!page) {
	772	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	773	if (idx >= size)
	774	goto out;
	775	if (hugetlb_get_quota(mapping))
	776	goto out;
	777	page = alloc_huge_page(vma, address);
	778	if (!page) {
	779	hugetlb_put_quota(mapping);
	780	ret = VM_FAULT_OOM;
	781	goto out;
	782	}
	783	clear_huge_page(page, address);
	784
	785	if (vma->vm_flags & VM_SHARED) {
	786	int err;
	787
	788	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
	789	if (err) {
	790	put_page(page);
	791	hugetlb_put_quota(mapping);
	792	if (err == -EEXIST)
	793	goto retry;
	794	goto out;
	795	}
	796	} else
	797	lock_page(page);
	798	}
	799
	800	spin_lock(&mm->page_table_lock);
	801	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	802	if (idx >= size)
	803	goto backout;
	804
	805	ret = 0;
	806	if (!pte_none(*ptep))
	807	goto backout;
	808
	809	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
	810	&& (vma->vm_flags & VM_SHARED)));
	811	set_huge_pte_at(mm, address, ptep, new_pte);
	812
	813	if (write_access && !(vma->vm_flags & VM_SHARED)) {
	814	/* Optimization, do the COW without a second fault */
	815	ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	816	}
	817
	818	spin_unlock(&mm->page_table_lock);
	819	unlock_page(page);
	820	out:
	821	return ret;
	822
	823	backout:
	824	spin_unlock(&mm->page_table_lock);
	825	hugetlb_put_quota(mapping);
	826	unlock_page(page);
	827	put_page(page);
	828	goto out;
	829	}
	830
	831	int hugetlb_fault(struct mm_struct mm, struct vm_area_struct vma,
	832	unsigned long address, int write_access)
	833	{
	834	pte_t *ptep;
	835	pte_t entry;
	836	int ret;
	837	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
	838
	839	ptep = huge_pte_alloc(mm, address);
	840	if (!ptep)
	841	return VM_FAULT_OOM;
	842
	843	/*
	844	* Serialize hugepage allocation and instantiation, so that we don't
	845	* get spurious allocation failures if two CPUs race to instantiate
	846	* the same page in the page cache.
	847	*/
	848	mutex_lock(&hugetlb_instantiation_mutex);
	849	entry = *ptep;
	850	if (pte_none(entry)) {
	851	ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
	852	mutex_unlock(&hugetlb_instantiation_mutex);
	853	return ret;
	854	}
	855
	856	ret = 0;
	857
	858	spin_lock(&mm->page_table_lock);
	859	/* Check for a racing update before calling hugetlb_cow */
	860	if (likely(pte_same(entry, *ptep)))
	861	if (write_access && !pte_write(entry))
	862	ret = hugetlb_cow(mm, vma, address, ptep, entry);
	863	spin_unlock(&mm->page_table_lock);
	864	mutex_unlock(&hugetlb_instantiation_mutex);
	865
	866	return ret;
	867	}
	868
	869	int follow_hugetlb_page(struct mm_struct mm, struct vm_area_struct vma,
	870	struct page pages, struct vm_area_struct vmas,
	871	unsigned long position, int length, int i)
	872	{
	873	unsigned long pfn_offset;
	874	unsigned long vaddr = *position;
	875	int remainder = *length;
	876
	877	spin_lock(&mm->page_table_lock);
	878	while (vaddr < vma->vm_end && remainder) {
	879	pte_t *pte;
	880	struct page *page;
	881
	882	/*
	883	* Some archs (sparc64, sh*) have multiple pte_ts to
	884	* each hugepage. We have to make * sure we get the
	885	* first, for the page indexing below to work.
	886	*/
	887	pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
	888
	889	if (!pte \|\| pte_none(*pte)) {
	890	int ret;
	891
	892	spin_unlock(&mm->page_table_lock);
	893	ret = hugetlb_fault(mm, vma, vaddr, 0);
	894	spin_lock(&mm->page_table_lock);
	895	if (!(ret & VM_FAULT_ERROR))
	896	continue;
	897
	898	remainder = 0;
	899	if (!i)
	900	i = -EFAULT;
	901	break;
	902	}
	903
	904	pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
	905	page = pte_page(*pte);
	906	same_page:
	907	if (pages) {
	908	get_page(page);
	909	pages[i] = page + pfn_offset;
	910	}
	911
	912	if (vmas)
	913	vmas[i] = vma;
	914
	915	vaddr += PAGE_SIZE;
	916	++pfn_offset;
	917	--remainder;
	918	++i;
	919	if (vaddr < vma->vm_end && remainder &&
	920	pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
	921	/*
	922	* We use pfn_offset to avoid touching the pageframes
	923	* of this compound page.
	924	*/
	925	goto same_page;
	926	}
	927	}
	928	spin_unlock(&mm->page_table_lock);
	929	*length = remainder;
	930	*position = vaddr;
	931
	932	return i;
	933	}
	934
	935	void hugetlb_change_protection(struct vm_area_struct *vma,
	936	unsigned long address, unsigned long end, pgprot_t newprot)
	937	{
	938	struct mm_struct *mm = vma->vm_mm;
	939	unsigned long start = address;
	940	pte_t *ptep;
	941	pte_t pte;
	942
	943	BUG_ON(address >= end);
	944	flush_cache_range(vma, address, end);
	945
	946	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	947	spin_lock(&mm->page_table_lock);
	948	for (; address < end; address += HPAGE_SIZE) {
	949	ptep = huge_pte_offset(mm, address);
	950	if (!ptep)
	951	continue;
	952	if (huge_pmd_unshare(mm, &address, ptep))
	953	continue;
	954	if (!pte_none(*ptep)) {
	955	pte = huge_ptep_get_and_clear(mm, address, ptep);
	956	pte = pte_mkhuge(pte_modify(pte, newprot));
	957	set_huge_pte_at(mm, address, ptep, pte);
	958	}
	959	}
	960	spin_unlock(&mm->page_table_lock);
	961	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	962
	963	flush_tlb_range(vma, start, end);
	964	}
	965
	966	struct file_region {
	967	struct list_head link;
	968	long from;
	969	long to;
	970	};
	971
	972	static long region_add(struct list_head *head, long f, long t)
	973	{
	974	struct file_region rg, nrg, *trg;
	975
	976	/* Locate the region we are either in or before. */
	977	list_for_each_entry(rg, head, link)
	978	if (f <= rg->to)
	979	break;
	980
	981	/* Round our left edge to the current segment if it encloses us. */
	982	if (f > rg->from)
	983	f = rg->from;
	984
	985	/* Check for and consume any regions we now overlap with. */
	986	nrg = rg;
	987	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
	988	if (&rg->link == head)
	989	break;
	990	if (rg->from > t)
	991	break;
	992
	993	/* If this area reaches higher then extend our area to
	994	* include it completely. If this is not the first area
	995	* which we intend to reuse, free it. */
	996	if (rg->to > t)
	997	t = rg->to;
	998	if (rg != nrg) {
	999	list_del(&rg->link);
	1000	kfree(rg);
	1001	}
	1002	}
	1003	nrg->from = f;
	1004	nrg->to = t;
	1005	return 0;
	1006	}
	1007
	1008	static long region_chg(struct list_head *head, long f, long t)
	1009	{
	1010	struct file_region rg, nrg;
	1011	long chg = 0;
	1012
	1013	/* Locate the region we are before or in. */
	1014	list_for_each_entry(rg, head, link)
	1015	if (f <= rg->to)
	1016	break;
	1017
	1018	/* If we are below the current region then a new region is required.
	1019	* Subtle, allocate a new region at the position but make it zero
	1020	* size such that we can guarantee to record the reservation. */
	1021	if (&rg->link == head \|\| t < rg->from) {
	1022	nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
	1023	if (!nrg)
	1024	return -ENOMEM;
	1025	nrg->from = f;
	1026	nrg->to = f;
	1027	INIT_LIST_HEAD(&nrg->link);
	1028	list_add(&nrg->link, rg->link.prev);
	1029
	1030	return t - f;
	1031	}
	1032
	1033	/* Round our left edge to the current segment if it encloses us. */
	1034	if (f > rg->from)
	1035	f = rg->from;
	1036	chg = t - f;
	1037
	1038	/* Check for and consume any regions we now overlap with. */
	1039	list_for_each_entry(rg, rg->link.prev, link) {
	1040	if (&rg->link == head)
	1041	break;
	1042	if (rg->from > t)
	1043	return chg;
	1044
	1045	/* We overlap with this area, if it extends futher than
	1046	* us then we must extend ourselves. Account for its
	1047	* existing reservation. */
	1048	if (rg->to > t) {
	1049	chg += rg->to - t;
	1050	t = rg->to;
	1051	}
	1052	chg -= rg->to - rg->from;
	1053	}
	1054	return chg;
	1055	}
	1056
	1057	static long region_truncate(struct list_head *head, long end)
	1058	{
	1059	struct file_region rg, trg;
	1060	long chg = 0;
	1061
	1062	/* Locate the region we are either in or before. */
	1063	list_for_each_entry(rg, head, link)
	1064	if (end <= rg->to)
	1065	break;
	1066	if (&rg->link == head)
	1067	return 0;
	1068
	1069	/* If we are in the middle of a region then adjust it. */
	1070	if (end > rg->from) {
	1071	chg = rg->to - end;
	1072	rg->to = end;
	1073	rg = list_entry(rg->link.next, typeof(*rg), link);
	1074	}
	1075
	1076	/* Drop any remaining regions. */
	1077	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
	1078	if (&rg->link == head)
	1079	break;
	1080	chg += rg->to - rg->from;
	1081	list_del(&rg->link);
	1082	kfree(rg);
	1083	}
	1084	return chg;
	1085	}
	1086
	1087	static int hugetlb_acct_memory(long delta)
	1088	{
	1089	int ret = -ENOMEM;
	1090
	1091	spin_lock(&hugetlb_lock);
	1092	/*
	1093	* When cpuset is configured, it breaks the strict hugetlb page
	1094	* reservation as the accounting is done on a global variable. Such
	1095	* reservation is completely rubbish in the presence of cpuset because
	1096	* the reservation is not checked against page availability for the
	1097	* current cpuset. Application can still potentially OOM'ed by kernel
	1098	* with lack of free htlb page in cpuset that the task is in.
	1099	* Attempt to enforce strict accounting with cpuset is almost
	1100	* impossible (or too ugly) because cpuset is too fluid that
	1101	* task or memory node can be dynamically moved between cpusets.
	1102	*
	1103	* The change of semantics for shared hugetlb mapping with cpuset is
	1104	* undesirable. However, in order to preserve some of the semantics,
	1105	* we fall back to check against current free page availability as
	1106	* a best attempt and hopefully to minimize the impact of changing
	1107	* semantics that cpuset has.
	1108	*/
	1109	if (delta > 0) {
	1110	if (gather_surplus_pages(delta) < 0)
	1111	goto out;
	1112
	1113	if (delta > cpuset_mems_nr(free_huge_pages_node))
	1114	goto out;
	1115	}
	1116
	1117	ret = 0;
	1118	resv_huge_pages += delta;
	1119	if (delta < 0)
	1120	return_unused_surplus_pages((unsigned long) -delta);
	1121
	1122	out:
	1123	spin_unlock(&hugetlb_lock);
	1124	return ret;
	1125	}
	1126
	1127	int hugetlb_reserve_pages(struct inode *inode, long from, long to)
	1128	{
	1129	long ret, chg;
	1130
	1131	chg = region_chg(&inode->i_mapping->private_list, from, to);
	1132	if (chg < 0)
	1133	return chg;
	1134
	1135	ret = hugetlb_acct_memory(chg);
	1136	if (ret < 0)
	1137	return ret;
	1138	region_add(&inode->i_mapping->private_list, from, to);
	1139	return 0;
	1140	}
	1141
	1142	void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
	1143	{
	1144	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	1145	hugetlb_acct_memory(freed - chg);
	1146	}