arch/x86/mm/init_64.c

   1 /*
   2  *  linux/arch/x86_64/mm/init.c
   3  *
   4  *  Copyright (C) 1995  Linus Torvalds
   5  *  Copyright (C) 2000  Pavel Machek <[email protected]>
   6  *  Copyright (C) 2002,2003 Andi Kleen <[email protected]>
   7  */
   8
   9 #include <linux/signal.h>
  10 #include <linux/sched.h>
  11 #include <linux/kernel.h>
  12 #include <linux/errno.h>
  13 #include <linux/string.h>
  14 #include <linux/types.h>
  15 #include <linux/ptrace.h>
  16 #include <linux/mman.h>
  17 #include <linux/mm.h>
  18 #include <linux/swap.h>
  19 #include <linux/smp.h>
  20 #include <linux/init.h>
  21 #include <linux/initrd.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/bootmem.h>
  24 #include <linux/memblock.h>
  25 #include <linux/proc_fs.h>
  26 #include <linux/pci.h>
  27 #include <linux/pfn.h>
  28 #include <linux/poison.h>
  29 #include <linux/dma-mapping.h>
  30 #include <linux/module.h>
  31 #include <linux/memory_hotplug.h>
  32 #include <linux/nmi.h>
  33 #include <linux/gfp.h>
  34
  35 #include <asm/processor.h>
  36 #include <asm/bios_ebda.h>
  37 #include <asm/system.h>
  38 #include <asm/uaccess.h>
  39 #include <asm/pgtable.h>
  40 #include <asm/pgalloc.h>
  41 #include <asm/dma.h>
  42 #include <asm/fixmap.h>
  43 #include <asm/e820.h>
  44 #include <asm/apic.h>
  45 #include <asm/tlb.h>
  46 #include <asm/mmu_context.h>
  47 #include <asm/proto.h>
  48 #include <asm/smp.h>
  49 #include <asm/sections.h>
  50 #include <asm/kdebug.h>
  51 #include <asm/numa.h>
  52 #include <asm/cacheflush.h>
  53 #include <asm/init.h>
  54
  55 static int __init parse_direct_gbpages_off(char *arg)
  56 {
  57         direct_gbpages = 0;
  58         return 0;
  59 }
  60 early_param("nogbpages", parse_direct_gbpages_off);
  61
  62 static int __init parse_direct_gbpages_on(char *arg)
  63 {
  64         direct_gbpages = 1;
  65         return 0;
  66 }
  67 early_param("gbpages", parse_direct_gbpages_on);
  68
  69 /*
  70  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  71  * physical space so we can cache the place of the first one and move
  72  * around without checking the pgd every time.
  73  */
  74
  75 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
  76 EXPORT_SYMBOL_GPL(__supported_pte_mask);
  77
  78 int force_personality32;
  79
  80 /*
  81  * noexec32=on|off
  82  * Control non executable heap for 32bit processes.
  83  * To control the stack too use noexec=off
  84  *
  85  * on   PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
  86  * off  PROT_READ implies PROT_EXEC
  87  */
  88 static int __init nonx32_setup(char *str)
  89 {
  90         if (!strcmp(str, "on"))
  91                 force_personality32 &= ~READ_IMPLIES_EXEC;
  92         else if (!strcmp(str, "off"))
  93                 force_personality32 |= READ_IMPLIES_EXEC;
  94         return 1;
  95 }
  96 __setup("noexec32=", nonx32_setup);
  97
  98 /*
  99  * When memory was added/removed make sure all the processes MM have
 100  * suitable PGD entries in the local PGD level page.
 101  */
 102 void sync_global_pgds(unsigned long start, unsigned long end)
 103 {
 104         unsigned long address;
 105
 106         for (address = start; address <= end; address += PGDIR_SIZE) {
 107                 const pgd_t *pgd_ref = pgd_offset_k(address);
 108                 unsigned long flags;
 109                 struct page *page;
 110
 111                 if (pgd_none(*pgd_ref))
 112                         continue;
 113
 114                 spin_lock_irqsave(&pgd_lock, flags);
 115                 list_for_each_entry(page, &pgd_list, lru) {
 116                         pgd_t *pgd;
 117                         spinlock_t *pgt_lock;
 118
 119                         pgd = (pgd_t *)page_address(page) + pgd_index(address);
 120                         pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 121                         spin_lock(pgt_lock);
 122
 123                         if (pgd_none(*pgd))
 124                                 set_pgd(pgd, *pgd_ref);
 125                         else
 126                                 BUG_ON(pgd_page_vaddr(*pgd)
 127                                        != pgd_page_vaddr(*pgd_ref));
 128
 129                         spin_unlock(pgt_lock);
 130                 }
 131                 spin_unlock_irqrestore(&pgd_lock, flags);
 132         }
 133 }
 134
 135 /*
 136  * NOTE: This function is marked __ref because it calls __init function
 137  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 138  */
 139 static __ref void *spp_getpage(void)
 140 {
 141         void *ptr;
 142
 143         if (after_bootmem)
 144                 ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
 145         else
 146                 ptr = alloc_bootmem_pages(PAGE_SIZE);
 147
 148         if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
 149                 panic("set_pte_phys: cannot allocate page data %s\n",
 150                         after_bootmem ? "after bootmem" : "");
 151         }
 152
 153         pr_debug("spp_getpage %p\n", ptr);
 154
 155         return ptr;
 156 }
 157
 158 static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
 159 {
 160         if (pgd_none(*pgd)) {
 161                 pud_t *pud = (pud_t *)spp_getpage();
 162                 pgd_populate(&init_mm, pgd, pud);
 163                 if (pud != pud_offset(pgd, 0))
 164                         printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
 165                                pud, pud_offset(pgd, 0));
 166         }
 167         return pud_offset(pgd, vaddr);
 168 }
 169
 170 static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
 171 {
 172         if (pud_none(*pud)) {
 173                 pmd_t *pmd = (pmd_t *) spp_getpage();
 174                 pud_populate(&init_mm, pud, pmd);
 175                 if (pmd != pmd_offset(pud, 0))
 176                         printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
 177                                pmd, pmd_offset(pud, 0));
 178         }
 179         return pmd_offset(pud, vaddr);
 180 }
 181
 182 static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
 183 {
 184         if (pmd_none(*pmd)) {
 185                 pte_t *pte = (pte_t *) spp_getpage();
 186                 pmd_populate_kernel(&init_mm, pmd, pte);
 187                 if (pte != pte_offset_kernel(pmd, 0))
 188                         printk(KERN_ERR "PAGETABLE BUG #02!\n");
 189         }
 190         return pte_offset_kernel(pmd, vaddr);
 191 }
 192
 193 void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 194 {
 195         pud_t *pud;
 196         pmd_t *pmd;
 197         pte_t *pte;
 198
 199         pud = pud_page + pud_index(vaddr);
 200         pmd = fill_pmd(pud, vaddr);
 201         pte = fill_pte(pmd, vaddr);
 202
 203         set_pte(pte, new_pte);
 204
 205         /*
 206          * It's enough to flush this one mapping.
 207          * (PGE mappings get flushed as well)
 208          */
 209         __flush_tlb_one(vaddr);
 210 }
 211
 212 void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 213 {
 214         pgd_t *pgd;
 215         pud_t *pud_page;
 216
 217         pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
 218
 219         pgd = pgd_offset_k(vaddr);
 220         if (pgd_none(*pgd)) {
 221                 printk(KERN_ERR
 222                         "PGD FIXMAP MISSING, it should be setup in head.S!\n");
 223                 return;
 224         }
 225         pud_page = (pud_t*)pgd_page_vaddr(*pgd);
 226         set_pte_vaddr_pud(pud_page, vaddr, pteval);
 227 }
 228
 229 pmd_t * __init populate_extra_pmd(unsigned long vaddr)
 230 {
 231         pgd_t *pgd;
 232         pud_t *pud;
 233
 234         pgd = pgd_offset_k(vaddr);
 235         pud = fill_pud(pgd, vaddr);
 236         return fill_pmd(pud, vaddr);
 237 }
 238
 239 pte_t * __init populate_extra_pte(unsigned long vaddr)
 240 {
 241         pmd_t *pmd;
 242
 243         pmd = populate_extra_pmd(vaddr);
 244         return fill_pte(pmd, vaddr);
 245 }
 246
 247 /*
 248  * Create large page table mappings for a range of physical addresses.
 249  */
 250 static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
 251                                                 pgprot_t prot)
 252 {
 253         pgd_t *pgd;
 254         pud_t *pud;
 255         pmd_t *pmd;
 256
 257         BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
 258         for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
 259                 pgd = pgd_offset_k((unsigned long)__va(phys));
 260                 if (pgd_none(*pgd)) {
 261                         pud = (pud_t *) spp_getpage();
 262                         set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
 263                                                 _PAGE_USER));
 264                 }
 265                 pud = pud_offset(pgd, (unsigned long)__va(phys));
 266                 if (pud_none(*pud)) {
 267                         pmd = (pmd_t *) spp_getpage();
 268                         set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
 269                                                 _PAGE_USER));
 270                 }
 271                 pmd = pmd_offset(pud, phys);
 272                 BUG_ON(!pmd_none(*pmd));
 273                 set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
 274         }
 275 }
 276
 277 void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
 278 {
 279         __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
 280 }
 281
 282 void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
 283 {
 284         __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
 285 }
 286
 287 /*
 288  * The head.S code sets up the kernel high mapping:
 289  *
 290  *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 291  *
 292  * phys_addr holds the negative offset to the kernel, which is added
 293  * to the compile time generated pmds. This results in invalid pmds up
 294  * to the point where we hit the physaddr 0 mapping.
 295  *
 296  * We limit the mappings to the region from _text to _end.  _end is
 297  * rounded up to the 2MB boundary. This catches the invalid pmds as
 298  * well, as they are located before _text:
 299  */
 300 void __init cleanup_highmap(void)
 301 {
 302         unsigned long vaddr = __START_KERNEL_map;
 303         unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
 304         pmd_t *pmd = level2_kernel_pgt;
 305         pmd_t *last_pmd = pmd + PTRS_PER_PMD;
 306
 307         for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
 308                 if (pmd_none(*pmd))
 309                         continue;
 310                 if (vaddr < (unsigned long) _text || vaddr > end)
 311                         set_pmd(pmd, __pmd(0));
 312         }
 313 }
 314
 315 static __ref void *alloc_low_page(unsigned long *phys)
 316 {
 317         unsigned long pfn = e820_table_end++;
 318         void *adr;
 319
 320         if (after_bootmem) {
 321                 adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
 322                 *phys = __pa(adr);
 323
 324                 return adr;
 325         }
 326
 327         if (pfn >= e820_table_top)
 328                 panic("alloc_low_page: ran out of memory");
 329
 330         adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 331         clear_page(adr);
 332         *phys  = pfn * PAGE_SIZE;
 333         return adr;
 334 }
 335
 336 static __ref void unmap_low_page(void *adr)
 337 {
 338         if (after_bootmem)
 339                 return;
 340
 341         early_iounmap(adr, PAGE_SIZE);
 342 }
 343
 344 static unsigned long __meminit
 345 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
 346               pgprot_t prot)
 347 {
 348         unsigned pages = 0;
 349         unsigned long last_map_addr = end;
 350         int i;
 351
 352         pte_t *pte = pte_page + pte_index(addr);
 353
 354         for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
 355
 356                 if (addr >= end) {
 357                         if (!after_bootmem) {
 358                                 for(; i < PTRS_PER_PTE; i++, pte++)
 359                                         set_pte(pte, __pte(0));
 360                         }
 361                         break;
 362                 }
 363
 364                 /*
 365                  * We will re-use the existing mapping.
 366                  * Xen for example has some special requirements, like mapping
 367                  * pagetable pages as RO. So assume someone who pre-setup
 368                  * these mappings are more intelligent.
 369                  */
 370                 if (pte_val(*pte)) {
 371                         pages++;
 372                         continue;
 373                 }
 374
 375                 if (0)
 376                         printk("   pte=%p addr=%lx pte=%016lx\n",
 377                                pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
 378                 pages++;
 379                 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
 380                 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
 381         }
 382
 383         update_page_count(PG_LEVEL_4K, pages);
 384
 385         return last_map_addr;
 386 }
 387
 388 static unsigned long __meminit
 389 phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
 390                 pgprot_t prot)
 391 {
 392         pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
 393
 394         return phys_pte_init(pte, address, end, prot);
 395 }
 396
 397 static unsigned long __meminit
 398 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 399               unsigned long page_size_mask, pgprot_t prot)
 400 {
 401         unsigned long pages = 0;
 402         unsigned long last_map_addr = end;
 403
 404         int i = pmd_index(address);
 405
 406         for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
 407                 unsigned long pte_phys;
 408                 pmd_t *pmd = pmd_page + pmd_index(address);
 409                 pte_t *pte;
 410                 pgprot_t new_prot = prot;
 411
 412                 if (address >= end) {
 413                         if (!after_bootmem) {
 414                                 for (; i < PTRS_PER_PMD; i++, pmd++)
 415                                         set_pmd(pmd, __pmd(0));
 416                         }
 417                         break;
 418                 }
 419
 420                 if (pmd_val(*pmd)) {
 421                         if (!pmd_large(*pmd)) {
 422                                 spin_lock(&init_mm.page_table_lock);
 423                                 last_map_addr = phys_pte_update(pmd, address,
 424                                                                 end, prot);
 425                                 spin_unlock(&init_mm.page_table_lock);
 426                                 continue;
 427                         }
 428                         /*
 429                          * If we are ok with PG_LEVEL_2M mapping, then we will
 430                          * use the existing mapping,
 431                          *
 432                          * Otherwise, we will split the large page mapping but
 433                          * use the same existing protection bits except for
 434                          * large page, so that we don't violate Intel's TLB
 435                          * Application note (317080) which says, while changing
 436                          * the page sizes, new and old translations should
 437                          * not differ with respect to page frame and
 438                          * attributes.
 439                          */
 440                         if (page_size_mask & (1 << PG_LEVEL_2M)) {
 441                                 pages++;
 442                                 continue;
 443                         }
 444                         new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
 445                 }
 446
 447                 if (page_size_mask & (1<<PG_LEVEL_2M)) {
 448                         pages++;
 449                         spin_lock(&init_mm.page_table_lock);
 450                         set_pte((pte_t *)pmd,
 451                                 pfn_pte(address >> PAGE_SHIFT,
 452                                         __pgprot(pgprot_val(prot) | _PAGE_PSE)));
 453                         spin_unlock(&init_mm.page_table_lock);
 454                         last_map_addr = (address & PMD_MASK) + PMD_SIZE;
 455                         continue;
 456                 }
 457
 458                 pte = alloc_low_page(&pte_phys);
 459                 last_map_addr = phys_pte_init(pte, address, end, new_prot);
 460                 unmap_low_page(pte);
 461
 462                 spin_lock(&init_mm.page_table_lock);
 463                 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
 464                 spin_unlock(&init_mm.page_table_lock);
 465         }
 466         update_page_count(PG_LEVEL_2M, pages);
 467         return last_map_addr;
 468 }
 469
 470 static unsigned long __meminit
 471 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
 472                 unsigned long page_size_mask, pgprot_t prot)
 473 {
 474         pmd_t *pmd = pmd_offset(pud, 0);
 475         unsigned long last_map_addr;
 476
 477         last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
 478         __flush_tlb_all();
 479         return last_map_addr;
 480 }
 481
 482 static unsigned long __meminit
 483 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 484                          unsigned long page_size_mask)
 485 {
 486         unsigned long pages = 0;
 487         unsigned long last_map_addr = end;
 488         int i = pud_index(addr);
 489
 490         for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
 491                 unsigned long pmd_phys;
 492                 pud_t *pud = pud_page + pud_index(addr);
 493                 pmd_t *pmd;
 494                 pgprot_t prot = PAGE_KERNEL;
 495
 496                 if (addr >= end)
 497                         break;
 498
 499                 if (!after_bootmem &&
 500                                 !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
 501                         set_pud(pud, __pud(0));
 502                         continue;
 503                 }
 504
 505                 if (pud_val(*pud)) {
 506                         if (!pud_large(*pud)) {
 507                                 last_map_addr = phys_pmd_update(pud, addr, end,
 508                                                          page_size_mask, prot);
 509                                 continue;
 510                         }
 511                         /*
 512                          * If we are ok with PG_LEVEL_1G mapping, then we will
 513                          * use the existing mapping.
 514                          *
 515                          * Otherwise, we will split the gbpage mapping but use
 516                          * the same existing protection  bits except for large
 517                          * page, so that we don't violate Intel's TLB
 518                          * Application note (317080) which says, while changing
 519                          * the page sizes, new and old translations should
 520                          * not differ with respect to page frame and
 521                          * attributes.
 522                          */
 523                         if (page_size_mask & (1 << PG_LEVEL_1G)) {
 524                                 pages++;
 525                                 continue;
 526                         }
 527                         prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
 528                 }
 529
 530                 if (page_size_mask & (1<<PG_LEVEL_1G)) {
 531                         pages++;
 532                         spin_lock(&init_mm.page_table_lock);
 533                         set_pte((pte_t *)pud,
 534                                 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
 535                         spin_unlock(&init_mm.page_table_lock);
 536                         last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
 537                         continue;
 538                 }
 539
 540                 pmd = alloc_low_page(&pmd_phys);
 541                 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
 542                                               prot);
 543                 unmap_low_page(pmd);
 544
 545                 spin_lock(&init_mm.page_table_lock);
 546                 pud_populate(&init_mm, pud, __va(pmd_phys));
 547                 spin_unlock(&init_mm.page_table_lock);
 548         }
 549         __flush_tlb_all();
 550
 551         update_page_count(PG_LEVEL_1G, pages);
 552
 553         return last_map_addr;
 554 }
 555
 556 static unsigned long __meminit
 557 phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 558                  unsigned long page_size_mask)
 559 {
 560         pud_t *pud;
 561
 562         pud = (pud_t *)pgd_page_vaddr(*pgd);
 563
 564         return phys_pud_init(pud, addr, end, page_size_mask);
 565 }
 566
 567 unsigned long __meminit
 568 kernel_physical_mapping_init(unsigned long start,
 569                              unsigned long end,
 570                              unsigned long page_size_mask)
 571 {
 572         bool pgd_changed = false;
 573         unsigned long next, last_map_addr = end;
 574         unsigned long addr;
 575
 576         start = (unsigned long)__va(start);
 577         end = (unsigned long)__va(end);
 578         addr = start;
 579
 580         for (; start < end; start = next) {
 581                 pgd_t *pgd = pgd_offset_k(start);
 582                 unsigned long pud_phys;
 583                 pud_t *pud;
 584
 585                 next = (start + PGDIR_SIZE) & PGDIR_MASK;
 586                 if (next > end)
 587                         next = end;
 588
 589                 if (pgd_val(*pgd)) {
 590                         last_map_addr = phys_pud_update(pgd, __pa(start),
 591                                                  __pa(end), page_size_mask);
 592                         continue;
 593                 }
 594
 595                 pud = alloc_low_page(&pud_phys);
 596                 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
 597                                                  page_size_mask);
 598                 unmap_low_page(pud);
 599
 600                 spin_lock(&init_mm.page_table_lock);
 601                 pgd_populate(&init_mm, pgd, __va(pud_phys));
 602                 spin_unlock(&init_mm.page_table_lock);
 603                 pgd_changed = true;
 604         }
 605
 606         if (pgd_changed)
 607                 sync_global_pgds(addr, end);
 608
 609         __flush_tlb_all();
 610
 611         return last_map_addr;
 612 }
 613
 614 #ifndef CONFIG_NUMA
 615 void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 616                                 int acpi, int k8)
 617 {
 618         memblock_x86_register_active_regions(0, start_pfn, end_pfn);
 619 }
 620 #endif
 621
 622 void __init paging_init(void)
 623 {
 624         unsigned long max_zone_pfns[MAX_NR_ZONES];
 625
 626         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 627         max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 628         max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 629         max_zone_pfns[ZONE_NORMAL] = max_pfn;
 630
 631         sparse_memory_present_with_active_regions(MAX_NUMNODES);
 632         sparse_init();
 633
 634         /*
 635          * clear the default setting with node 0
 636          * note: don't use nodes_clear here, that is really clearing when
 637          *       numa support is not compiled in, and later node_set_state
 638          *       will not set it back.
 639          */
 640         node_clear_state(0, N_NORMAL_MEMORY);
 641
 642         free_area_init_nodes(max_zone_pfns);
 643 }
 644
 645 /*
 646  * Memory hotplug specific functions
 647  */
 648 #ifdef CONFIG_MEMORY_HOTPLUG
 649 /*
 650  * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
 651  * updating.
 652  */
 653 static void  update_end_of_memory_vars(u64 start, u64 size)
 654 {
 655         unsigned long end_pfn = PFN_UP(start + size);
 656
 657         if (end_pfn > max_pfn) {
 658                 max_pfn = end_pfn;
 659                 max_low_pfn = end_pfn;
 660                 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
 661         }
 662 }
 663
 664 /*
 665  * Memory is added always to NORMAL zone. This means you will never get
 666  * additional DMA/DMA32 memory.
 667  */
 668 int arch_add_memory(int nid, u64 start, u64 size)
 669 {
 670         struct pglist_data *pgdat = NODE_DATA(nid);
 671         struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 672         unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
 673         unsigned long nr_pages = size >> PAGE_SHIFT;
 674         int ret;
 675
 676         last_mapped_pfn = init_memory_mapping(start, start + size);
 677         if (last_mapped_pfn > max_pfn_mapped)
 678                 max_pfn_mapped = last_mapped_pfn;
 679
 680         ret = __add_pages(nid, zone, start_pfn, nr_pages);
 681         WARN_ON_ONCE(ret);
 682
 683         /* update max_pfn, max_low_pfn and high_memory */
 684         update_end_of_memory_vars(start, size);
 685
 686         return ret;
 687 }
 688 EXPORT_SYMBOL_GPL(arch_add_memory);
 689
 690 #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
 691 int memory_add_physaddr_to_nid(u64 start)
 692 {
 693         return 0;
 694 }
 695 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 696 #endif
 697
 698 #endif /* CONFIG_MEMORY_HOTPLUG */
 699
 700 static struct kcore_list kcore_vsyscall;
 701
 702 void __init mem_init(void)
 703 {
 704         long codesize, reservedpages, datasize, initsize;
 705         unsigned long absent_pages;
 706
 707         pci_iommu_alloc();
 708
 709         /* clear_bss() already clear the empty_zero_page */
 710
 711         reservedpages = 0;
 712
 713         /* this will put all low memory onto the freelists */
 714 #ifdef CONFIG_NUMA
 715         totalram_pages = numa_free_all_bootmem();
 716 #else
 717         totalram_pages = free_all_bootmem();
 718 #endif
 719
 720         absent_pages = absent_pages_in_range(0, max_pfn);
 721         reservedpages = max_pfn - totalram_pages - absent_pages;
 722         after_bootmem = 1;
 723
 724         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 725         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 726         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 727
 728         /* Register memory areas for /proc/kcore */
 729         kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
 730                          VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
 731
 732         printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
 733                          "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
 734                 nr_free_pages() << (PAGE_SHIFT-10),
 735                 max_pfn << (PAGE_SHIFT-10),
 736                 codesize >> 10,
 737                 absent_pages << (PAGE_SHIFT-10),
 738                 reservedpages << (PAGE_SHIFT-10),
 739                 datasize >> 10,
 740                 initsize >> 10);
 741 }
 742
 743 #ifdef CONFIG_DEBUG_RODATA
 744 const int rodata_test_data = 0xC3;
 745 EXPORT_SYMBOL_GPL(rodata_test_data);
 746
 747 int kernel_set_to_readonly;
 748
 749 void set_kernel_text_rw(void)
 750 {
 751         unsigned long start = PFN_ALIGN(_text);
 752         unsigned long end = PFN_ALIGN(__stop___ex_table);
 753
 754         if (!kernel_set_to_readonly)
 755                 return;
 756
 757         pr_debug("Set kernel text: %lx - %lx for read write\n",
 758                  start, end);
 759
 760         /*
 761          * Make the kernel identity mapping for text RW. Kernel text
 762          * mapping will always be RO. Refer to the comment in
 763          * static_protections() in pageattr.c
 764          */
 765         set_memory_rw(start, (end - start) >> PAGE_SHIFT);
 766 }
 767
 768 void set_kernel_text_ro(void)
 769 {
 770         unsigned long start = PFN_ALIGN(_text);
 771         unsigned long end = PFN_ALIGN(__stop___ex_table);
 772
 773         if (!kernel_set_to_readonly)
 774                 return;
 775
 776         pr_debug("Set kernel text: %lx - %lx for read only\n",
 777                  start, end);
 778
 779         /*
 780          * Set the kernel identity mapping for text RO.
 781          */
 782         set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 783 }
 784
 785 void mark_rodata_ro(void)
 786 {
 787         unsigned long start = PFN_ALIGN(_text);
 788         unsigned long rodata_start =
 789                 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
 790         unsigned long end = (unsigned long) &__end_rodata_hpage_align;
 791         unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
 792         unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
 793         unsigned long data_start = (unsigned long) &_sdata;
 794
 795         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 796                (end - start) >> 10);
 797         set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 798
 799         kernel_set_to_readonly = 1;
 800
 801         /*
 802          * The rodata section (but not the kernel text!) should also be
 803          * not-executable.
 804          */
 805         set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
 806
 807         rodata_test();
 808
 809 #ifdef CONFIG_CPA_DEBUG
 810         printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
 811         set_memory_rw(start, (end-start) >> PAGE_SHIFT);
 812
 813         printk(KERN_INFO "Testing CPA: again\n");
 814         set_memory_ro(start, (end-start) >> PAGE_SHIFT);
 815 #endif
 816
 817         free_init_pages("unused kernel memory",
 818                         (unsigned long) page_address(virt_to_page(text_end)),
 819                         (unsigned long)
 820                                  page_address(virt_to_page(rodata_start)));
 821         free_init_pages("unused kernel memory",
 822                         (unsigned long) page_address(virt_to_page(rodata_end)),
 823                         (unsigned long) page_address(virt_to_page(data_start)));
 824 }
 825
 826 #endif
 827
 828 int kern_addr_valid(unsigned long addr)
 829 {
 830         unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
 831         pgd_t *pgd;
 832         pud_t *pud;
 833         pmd_t *pmd;
 834         pte_t *pte;
 835
 836         if (above != 0 && above != -1UL)
 837                 return 0;
 838
 839         pgd = pgd_offset_k(addr);
 840         if (pgd_none(*pgd))
 841                 return 0;
 842
 843         pud = pud_offset(pgd, addr);
 844         if (pud_none(*pud))
 845                 return 0;
 846
 847         pmd = pmd_offset(pud, addr);
 848         if (pmd_none(*pmd))
 849                 return 0;
 850
 851         if (pmd_large(*pmd))
 852                 return pfn_valid(pmd_pfn(*pmd));
 853
 854         pte = pte_offset_kernel(pmd, addr);
 855         if (pte_none(*pte))
 856                 return 0;
 857
 858         return pfn_valid(pte_pfn(*pte));
 859 }
 860
 861 /*
 862  * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 863  * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 864  * not need special handling anymore:
 865  */
 866 static struct vm_area_struct gate_vma = {
 867         .vm_start       = VSYSCALL_START,
 868         .vm_end         = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
 869         .vm_page_prot   = PAGE_READONLY_EXEC,
 870         .vm_flags       = VM_READ | VM_EXEC
 871 };
 872
 873 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
 874 {
 875 #ifdef CONFIG_IA32_EMULATION
 876         if (test_tsk_thread_flag(tsk, TIF_IA32))
 877                 return NULL;
 878 #endif
 879         return &gate_vma;
 880 }
 881
 882 int in_gate_area(struct task_struct *task, unsigned long addr)
 883 {
 884         struct vm_area_struct *vma = get_gate_vma(task);
 885
 886         if (!vma)
 887                 return 0;
 888
 889         return (addr >= vma->vm_start) && (addr < vma->vm_end);
 890 }
 891
 892 /*
 893  * Use this when you have no reliable task/vma, typically from interrupt
 894  * context. It is less reliable than using the task's vma and may give
 895  * false positives:
 896  */
 897 int in_gate_area_no_task(unsigned long addr)
 898 {
 899         return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 900 }
 901
 902 const char *arch_vma_name(struct vm_area_struct *vma)
 903 {
 904         if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
 905                 return "[vdso]";
 906         if (vma == &gate_vma)
 907                 return "[vsyscall]";
 908         return NULL;
 909 }
 910
 911 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 912 /*
 913  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 914  */
 915 static long __meminitdata addr_start, addr_end;
 916 static void __meminitdata *p_start, *p_end;
 917 static int __meminitdata node_start;
 918
 919 int __meminit
 920 vmemmap_populate(struct page *start_page, unsigned long size, int node)
 921 {
 922         unsigned long addr = (unsigned long)start_page;
 923         unsigned long end = (unsigned long)(start_page + size);
 924         unsigned long next;
 925         pgd_t *pgd;
 926         pud_t *pud;
 927         pmd_t *pmd;
 928
 929         for (; addr < end; addr = next) {
 930                 void *p = NULL;
 931
 932                 pgd = vmemmap_pgd_populate(addr, node);
 933                 if (!pgd)
 934                         return -ENOMEM;
 935
 936                 pud = vmemmap_pud_populate(pgd, addr, node);
 937                 if (!pud)
 938                         return -ENOMEM;
 939
 940                 if (!cpu_has_pse) {
 941                         next = (addr + PAGE_SIZE) & PAGE_MASK;
 942                         pmd = vmemmap_pmd_populate(pud, addr, node);
 943
 944                         if (!pmd)
 945                                 return -ENOMEM;
 946
 947                         p = vmemmap_pte_populate(pmd, addr, node);
 948
 949                         if (!p)
 950                                 return -ENOMEM;
 951
 952                         addr_end = addr + PAGE_SIZE;
 953                         p_end = p + PAGE_SIZE;
 954                 } else {
 955                         next = pmd_addr_end(addr, end);
 956
 957                         pmd = pmd_offset(pud, addr);
 958                         if (pmd_none(*pmd)) {
 959                                 pte_t entry;
 960
 961                                 p = vmemmap_alloc_block_buf(PMD_SIZE, node);
 962                                 if (!p)
 963                                         return -ENOMEM;
 964
 965                                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
 966                                                 PAGE_KERNEL_LARGE);
 967                                 set_pmd(pmd, __pmd(pte_val(entry)));
 968
 969                                 /* check to see if we have contiguous blocks */
 970                                 if (p_end != p || node_start != node) {
 971                                         if (p_start)
 972                                                 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
 973                                                        addr_start, addr_end-1, p_start, p_end-1, node_start);
 974                                         addr_start = addr;
 975                                         node_start = node;
 976                                         p_start = p;
 977                                 }
 978
 979                                 addr_end = addr + PMD_SIZE;
 980                                 p_end = p + PMD_SIZE;
 981                         } else
 982                                 vmemmap_verify((pte_t *)pmd, node, addr, next);
 983                 }
 984
 985         }
 986         sync_global_pgds((unsigned long)start_page, end);
 987         return 0;
 988 }
 989
 990 void __meminit vmemmap_populate_print_last(void)
 991 {
 992         if (p_start) {
 993                 printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
 994                         addr_start, addr_end-1, p_start, p_end-1, node_start);
 995                 p_start = NULL;
 996                 p_end = NULL;
 997                 node_start = 0;
 998         }
 999 }
1000 #endif