Git Repo - linux.git/blame - arch/x86/xen/mmu.c
3b827c1b
JF
1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <[email protected]>, XenSource Inc, 2007
40 */
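/*
 * Illustrative aside (not part of this file): a standalone, user-space
 * sketch of the pfn<->mfn conversion described above, assuming 4 KiB
 * pages and a toy 16-entry p2m/m2p table.  All toy_* names are invented
 * for the demo; the real helpers in this file are pte_pfn_to_mfn() and
 * pte_mfn_to_pfn() further down.
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_PAGE_SHIFT 12
#define TOY_FLAGS_MASK 0xfffULL              /* low flag bits of a pte */
#define TOY_PFN_MASK   (~TOY_FLAGS_MASK)     /* frame-number bits */

static uint64_t toy_p2m[16] = { 7, 3, 12, 9 };  /* guest pfn -> machine mfn */
static uint64_t toy_m2p[16];                    /* machine mfn -> guest pfn */

static uint64_t toy_make_pte(uint64_t pfn, uint64_t flags)
{
        /* the guest passes a pfn; the stored pte holds the machine frame */
        return (toy_p2m[pfn] << TOY_PAGE_SHIFT) | flags;
}

static uint64_t toy_pte_to_pfn(uint64_t pte)
{
        /* reading back converts the mfn into the guest's pfn view */
        return toy_m2p[(pte & TOY_PFN_MASK) >> TOY_PAGE_SHIFT];
}

int main(void)
{
        for (int i = 0; i < 4; i++)
                toy_m2p[toy_p2m[i]] = i;

        uint64_t pte = toy_make_pte(2, 0x3);    /* pfn 2, present|rw flags */
        printf("pte=%#llx maps back to pfn %llu\n",
               (unsigned long long)pte,
               (unsigned long long)toy_pte_to_pfn(pte));
        return 0;
}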
f120f13e 41#include <linux/sched.h>
f4f97b3e 42#include <linux/highmem.h>
994025ca 43#include <linux/debugfs.h>
3b827c1b 44#include <linux/bug.h>
3b827c1b
JF
45
46#include <asm/pgtable.h>
47#include <asm/tlbflush.h>
5deb30d1 48#include <asm/fixmap.h>
3b827c1b 49#include <asm/mmu_context.h>
319f3ba5 50#include <asm/setup.h>
f4f97b3e 51#include <asm/paravirt.h>
cbcd79c2 52#include <asm/linkage.h>
3b827c1b
JF
53
54#include <asm/xen/hypercall.h>
f4f97b3e 55#include <asm/xen/hypervisor.h>
3b827c1b
JF
56
57#include <xen/page.h>
58#include <xen/interface/xen.h>
319f3ba5
JF
59#include <xen/interface/version.h>
60#include <xen/hvc-console.h>
3b827c1b 61
f4f97b3e 62#include "multicalls.h"
3b827c1b 63#include "mmu.h"
994025ca
JF
64#include "debugfs.h"
65
66#define MMU_UPDATE_HISTO 30
67
68#ifdef CONFIG_XEN_DEBUG_FS
69
70static struct {
71 u32 pgd_update;
72 u32 pgd_update_pinned;
73 u32 pgd_update_batched;
74
75 u32 pud_update;
76 u32 pud_update_pinned;
77 u32 pud_update_batched;
78
79 u32 pmd_update;
80 u32 pmd_update_pinned;
81 u32 pmd_update_batched;
82
83 u32 pte_update;
84 u32 pte_update_pinned;
85 u32 pte_update_batched;
86
87 u32 mmu_update;
88 u32 mmu_update_extended;
89 u32 mmu_update_histo[MMU_UPDATE_HISTO];
90
91 u32 prot_commit;
92 u32 prot_commit_batched;
93
94 u32 set_pte_at;
95 u32 set_pte_at_batched;
96 u32 set_pte_at_pinned;
97 u32 set_pte_at_current;
98 u32 set_pte_at_kernel;
99} mmu_stats;
100
101static u8 zero_stats;
102
103static inline void check_zero(void)
104{
105 if (unlikely(zero_stats)) {
106 memset(&mmu_stats, 0, sizeof(mmu_stats));
107 zero_stats = 0;
108 }
109}
110
111#define ADD_STATS(elem, val) \
112 do { check_zero(); mmu_stats.elem += (val); } while(0)
113
114#else /* !CONFIG_XEN_DEBUG_FS */
115
116#define ADD_STATS(elem, val) do { (void)(val); } while(0)
117
118#endif /* CONFIG_XEN_DEBUG_FS */
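/*
 * Illustrative aside (not part of this file): a minimal user-space model
 * of the zero-on-demand counter pattern above.  In the real code,
 * writing 1 to the debugfs "zero_stats" file does not clear anything
 * immediately; the counters are reset lazily by the next ADD_STATS()
 * call via check_zero().  The demo_* names are invented for the sketch.
 */
#include <stdio.h>
#include <string.h>

static struct { unsigned pmd_update, pte_update; } demo_stats;
static unsigned char demo_zero_stats;

static void demo_add(unsigned *counter, unsigned val)
{
        if (demo_zero_stats) {                  /* mirrors check_zero() */
                memset(&demo_stats, 0, sizeof(demo_stats));
                demo_zero_stats = 0;
        }
        *counter += val;
}

int main(void)
{
        demo_add(&demo_stats.pte_update, 5);
        demo_zero_stats = 1;                    /* like "echo 1 > zero_stats" */
        demo_add(&demo_stats.pmd_update, 1);    /* clears first, then counts */
        printf("pte_update=%u pmd_update=%u\n",
               demo_stats.pte_update, demo_stats.pmd_update);
        return 0;
}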
3b827c1b 119
319f3ba5
JF
120
121/*
122 * Identity map, in addition to plain kernel map. This needs to be
123 * large enough to allocate the page table pages needed to map the rest.
124 * Each page can map 2MB.
125 */
126static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
127
128#ifdef CONFIG_X86_64
129/* l3 pud for userspace vsyscall mapping */
130static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
131#endif /* CONFIG_X86_64 */
132
133/*
134 * Note about cr3 (pagetable base) values:
135 *
136 * xen_cr3 contains the current logical cr3 value; it contains the
137 * last set cr3. This may not be the current effective cr3, because
138 * its update may be being lazily deferred. However, a vcpu looking
139 * at its own cr3 can use this value knowing that everything will
140 * be self-consistent.
141 *
142 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
143 * hypercall to set the vcpu cr3 is complete (so it may be a little
144 * out of date, but it will never be set early). If one vcpu is
145 * looking at another vcpu's cr3 value, it should use this variable.
146 */
147DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
148DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
149
150
d6182fbf
JF
151/*
152 * Just beyond the highest usermode address. STACK_TOP_MAX has a
153 * redzone above it, so round it up to a PGD boundary.
154 */
155#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
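/*
 * Illustrative aside (not part of this file): USER_LIMIT above is the
 * usual (x + size - 1) & ~(size - 1) round-up-to-boundary idiom.
 * Standalone demo; the 1ULL << 39 PGD size is an assumed example value
 * (the x86-64 figure when PGDIR_SHIFT == 39), and the demo_* names are
 * local to this sketch.
 */
#include <stdio.h>

#define DEMO_PGDIR_SIZE (1ULL << 39)
#define DEMO_PGDIR_MASK (~(DEMO_PGDIR_SIZE - 1))

static unsigned long long demo_round_up_to_pgd(unsigned long long addr)
{
        return (addr + DEMO_PGDIR_SIZE - 1) & DEMO_PGDIR_MASK;
}

int main(void)
{
        /* one byte past a boundary rounds up to the next boundary */
        printf("%#llx -> %#llx\n", DEMO_PGDIR_SIZE + 1,
               demo_round_up_to_pgd(DEMO_PGDIR_SIZE + 1));
        /* an address already on a boundary maps to itself */
        printf("%#llx -> %#llx\n", DEMO_PGDIR_SIZE,
               demo_round_up_to_pgd(DEMO_PGDIR_SIZE));
        return 0;
}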
156
157
d451bb7a 158#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
cf0923ea 159#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
d451bb7a 160
cf0923ea 161/* Placeholder for holes in the address space */
cbcd79c2 162static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
cf0923ea
JF
163 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
164
165 /* Array of pointers to pages containing p2m entries */
cbcd79c2 166static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
cf0923ea 167 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
d451bb7a 168
d5edbc1f 169/* Arrays of p2m arrays expressed in mfns used for save/restore */
cbcd79c2 170static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
d5edbc1f 171
cbcd79c2
JF
172static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
173 __page_aligned_bss;
d5edbc1f 174
d451bb7a
JF
175static inline unsigned p2m_top_index(unsigned long pfn)
176{
8006ec3e 177 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
d451bb7a
JF
178 return pfn / P2M_ENTRIES_PER_PAGE;
179}
180
181static inline unsigned p2m_index(unsigned long pfn)
182{
183 return pfn % P2M_ENTRIES_PER_PAGE;
184}
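/*
 * Illustrative aside (not part of this file): a standalone, user-space
 * model of the two-level p2m lookup above.  With 4 KiB pages and 8-byte
 * entries each leaf page holds 512 entries, so topidx = pfn / 512 and
 * idx = pfn % 512, and every unpopulated top-level slot points at one
 * shared "missing" page.  The model_* names and sizes are invented for
 * the demo; the real lazy allocation is alloc_p2m() further down.
 */
#include <stdio.h>
#include <stdlib.h>

#define MODEL_ENTRIES_PER_PAGE 512
#define MODEL_TOP_ENTRIES      64               /* covers 64 * 512 pfns */
#define MODEL_INVALID          (~0UL)

static unsigned long model_missing[MODEL_ENTRIES_PER_PAGE];
static unsigned long *model_top[MODEL_TOP_ENTRIES];

static unsigned long model_get(unsigned long pfn)
{
        return model_top[pfn / MODEL_ENTRIES_PER_PAGE]
                        [pfn % MODEL_ENTRIES_PER_PAGE];
}

static void model_set(unsigned long pfn, unsigned long mfn)
{
        unsigned long topidx = pfn / MODEL_ENTRIES_PER_PAGE;

        if (model_top[topidx] == model_missing) {
                /* first real entry: give this slot its own leaf page */
                unsigned long *p = malloc(sizeof(model_missing));
                if (!p)
                        abort();
                for (unsigned i = 0; i < MODEL_ENTRIES_PER_PAGE; i++)
                        p[i] = MODEL_INVALID;
                model_top[topidx] = p;
        }
        model_top[topidx][pfn % MODEL_ENTRIES_PER_PAGE] = mfn;
}

int main(void)
{
        for (unsigned i = 0; i < MODEL_ENTRIES_PER_PAGE; i++)
                model_missing[i] = MODEL_INVALID;
        for (unsigned i = 0; i < MODEL_TOP_ENTRIES; i++)
                model_top[i] = model_missing;

        model_set(1000, 4242);                  /* topidx 1, idx 488 */
        printf("pfn 1000 -> mfn %lu, pfn 5 -> %#lx (missing)\n",
               model_get(1000), model_get(5));
        return 0;
}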
185
d5edbc1f
JF
186/* Build the parallel p2m_top_mfn structures */
187void xen_setup_mfn_list_list(void)
188{
189 unsigned pfn, idx;
190
f63c2f24 191 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
d5edbc1f
JF
192 unsigned topidx = p2m_top_index(pfn);
193
194 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
195 }
196
f63c2f24 197 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
d5edbc1f
JF
198 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
199 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
200 }
201
202 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
203
204 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
205 virt_to_mfn(p2m_top_mfn_list);
206 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
207}
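/*
 * Worked sizing example for the structures above (illustrative only;
 * assumes 4 KiB pages, 8-byte entries and a domain capped at 8 GiB):
 * MAX_DOMAIN_PAGES would be 8 GiB / 4 KiB = 2^21 pfns, so TOP_ENTRIES =
 * 2^21 / 512 = 4096.  p2m_top_mfn then occupies 4096 * 8 bytes = 8
 * pages, so p2m_top_mfn_list needs only 8 entries, and the single mfn
 * of p2m_top_mfn_list itself is what gets handed to Xen via
 * pfn_to_mfn_frame_list_list.
 */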
208
209/* Set up p2m_top to point to the domain-builder provided p2m pages */
d451bb7a
JF
210void __init xen_build_dynamic_phys_to_machine(void)
211{
d451bb7a 212 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
8006ec3e 213 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
d5edbc1f 214 unsigned pfn;
d451bb7a 215
f63c2f24 216 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
d451bb7a
JF
217 unsigned topidx = p2m_top_index(pfn);
218
219 p2m_top[topidx] = &mfn_list[pfn];
220 }
221}
222
223unsigned long get_phys_to_machine(unsigned long pfn)
224{
225 unsigned topidx, idx;
226
8006ec3e
JF
227 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
228 return INVALID_P2M_ENTRY;
229
d451bb7a 230 topidx = p2m_top_index(pfn);
d451bb7a
JF
231 idx = p2m_index(pfn);
232 return p2m_top[topidx][idx];
233}
15ce6005 234EXPORT_SYMBOL_GPL(get_phys_to_machine);
d451bb7a 235
d5edbc1f 236static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
d451bb7a
JF
237{
238 unsigned long *p;
239 unsigned i;
240
241 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
242 BUG_ON(p == NULL);
243
f63c2f24 244 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
d451bb7a
JF
245 p[i] = INVALID_P2M_ENTRY;
246
cf0923ea 247 if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
d451bb7a 248 free_page((unsigned long)p);
d5edbc1f
JF
249 else
250 *mfnp = virt_to_mfn(p);
d451bb7a
JF
251}
252
253void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
254{
255 unsigned topidx, idx;
256
257 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
258 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
8006ec3e
JF
259 return;
260 }
261
262 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
263 BUG_ON(mfn != INVALID_P2M_ENTRY);
d451bb7a
JF
264 return;
265 }
266
267 topidx = p2m_top_index(pfn);
cf0923ea 268 if (p2m_top[topidx] == p2m_missing) {
d451bb7a
JF
269 /* no need to allocate a page to store an invalid entry */
270 if (mfn == INVALID_P2M_ENTRY)
271 return;
d5edbc1f 272 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
d451bb7a
JF
273 }
274
275 idx = p2m_index(pfn);
276 p2m_top[topidx][idx] = mfn;
277}
278
ce803e70 279xmaddr_t arbitrary_virt_to_machine(void *vaddr)
3b827c1b 280{
ce803e70 281 unsigned long address = (unsigned long)vaddr;
da7bfc50 282 unsigned int level;
9f32d21c
CL
283 pte_t *pte;
284 unsigned offset;
3b827c1b 285
9f32d21c
CL
286 /*
287 * if the PFN is in the linear mapped vaddr range, we can just use
288 * the (quick) virt_to_machine() p2m lookup
289 */
290 if (virt_addr_valid(vaddr))
291 return virt_to_machine(vaddr);
292
293 /* otherwise we have to do a (slower) full page-table walk */
3b827c1b 294
9f32d21c
CL
295 pte = lookup_address(address, &level);
296 BUG_ON(pte == NULL);
297 offset = address & ~PAGE_MASK;
ebd879e3 298 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
3b827c1b
JF
299}
300
301void make_lowmem_page_readonly(void *vaddr)
302{
303 pte_t *pte, ptev;
304 unsigned long address = (unsigned long)vaddr;
da7bfc50 305 unsigned int level;
3b827c1b 306
f0646e43 307 pte = lookup_address(address, &level);
3b827c1b
JF
308 BUG_ON(pte == NULL);
309
310 ptev = pte_wrprotect(*pte);
311
312 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
313 BUG();
314}
315
316void make_lowmem_page_readwrite(void *vaddr)
317{
318 pte_t *pte, ptev;
319 unsigned long address = (unsigned long)vaddr;
da7bfc50 320 unsigned int level;
3b827c1b 321
f0646e43 322 pte = lookup_address(address, &level);
3b827c1b
JF
323 BUG_ON(pte == NULL);
324
325 ptev = pte_mkwrite(*pte);
326
327 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
328 BUG();
329}
330
331
7708ad64 332static bool xen_page_pinned(void *ptr)
e2426cf8
JF
333{
334 struct page *page = virt_to_page(ptr);
335
336 return PagePinned(page);
337}
338
7708ad64 339static void xen_extend_mmu_update(const struct mmu_update *update)
3b827c1b 340{
d66bf8fc
JF
341 struct multicall_space mcs;
342 struct mmu_update *u;
3b827c1b 343
400d3494
JF
344 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
345
994025ca
JF
346 if (mcs.mc != NULL) {
347 ADD_STATS(mmu_update_extended, 1);
348 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
349
400d3494 350 mcs.mc->args[1]++;
994025ca
JF
351
352 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
353 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
354 else
355 ADD_STATS(mmu_update_histo[0], 1);
356 } else {
357 ADD_STATS(mmu_update, 1);
400d3494
JF
358 mcs = __xen_mc_entry(sizeof(*u));
359 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
994025ca 360 ADD_STATS(mmu_update_histo[1], 1);
400d3494 361 }
d66bf8fc 362
d66bf8fc 363 u = mcs.args;
400d3494
JF
364 *u = *update;
365}
366
367void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
368{
369 struct mmu_update u;
370
371 preempt_disable();
372
373 xen_mc_batch();
374
ce803e70
JF
375 /* ptr may be ioremapped for 64-bit pagetable setup */
376 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
400d3494 377 u.val = pmd_val_ma(val);
7708ad64 378 xen_extend_mmu_update(&u);
d66bf8fc 379
994025ca
JF
380 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
381
d66bf8fc
JF
382 xen_mc_issue(PARAVIRT_LAZY_MMU);
383
384 preempt_enable();
3b827c1b
JF
385}
386
e2426cf8
JF
387void xen_set_pmd(pmd_t *ptr, pmd_t val)
388{
994025ca
JF
389 ADD_STATS(pmd_update, 1);
390
e2426cf8
JF
391 /* If page is not pinned, we can just update the entry
392 directly */
7708ad64 393 if (!xen_page_pinned(ptr)) {
e2426cf8
JF
394 *ptr = val;
395 return;
396 }
397
994025ca
JF
398 ADD_STATS(pmd_update_pinned, 1);
399
e2426cf8
JF
400 xen_set_pmd_hyper(ptr, val);
401}
402
3b827c1b
JF
403/*
404 * Associate a virtual page frame with a given physical page frame
405 * and protection flags for that frame.
406 */
407void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
408{
836fe2f2 409 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
3b827c1b
JF
410}
411
412void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
413 pte_t *ptep, pte_t pteval)
414{
2bd50036
JF
415 /* updates to init_mm may be done without lock */
416 if (mm == &init_mm)
417 preempt_disable();
418
994025ca
JF
419 ADD_STATS(set_pte_at, 1);
420// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
421 ADD_STATS(set_pte_at_current, mm == current->mm);
422 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
423
d66bf8fc 424 if (mm == current->mm || mm == &init_mm) {
8965c1c0 425 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
d66bf8fc
JF
426 struct multicall_space mcs;
427 mcs = xen_mc_entry(0);
428
429 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
994025ca 430 ADD_STATS(set_pte_at_batched, 1);
d66bf8fc 431 xen_mc_issue(PARAVIRT_LAZY_MMU);
2bd50036 432 goto out;
d66bf8fc
JF
433 } else
434 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
2bd50036 435 goto out;
d66bf8fc
JF
436 }
437 xen_set_pte(ptep, pteval);
2bd50036
JF
438
439out:
440 if (mm == &init_mm)
441 preempt_enable();
3b827c1b
JF
442}
443
f63c2f24
T
444pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
445 unsigned long addr, pte_t *ptep)
947a69c9 446{
e57778a1
JF
447 /* Just return the pte as-is. We preserve the bits on commit */
448 return *ptep;
449}
450
451void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
452 pte_t *ptep, pte_t pte)
453{
400d3494 454 struct mmu_update u;
e57778a1 455
400d3494 456 xen_mc_batch();
947a69c9 457
9f32d21c 458 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
400d3494 459 u.val = pte_val_ma(pte);
7708ad64 460 xen_extend_mmu_update(&u);
947a69c9 461
994025ca
JF
462 ADD_STATS(prot_commit, 1);
463 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
464
e57778a1 465 xen_mc_issue(PARAVIRT_LAZY_MMU);
947a69c9
JF
466}
467
ebb9cfe2
JF
468/* Assume pteval_t is equivalent to all the other *val_t types. */
469static pteval_t pte_mfn_to_pfn(pteval_t val)
947a69c9 470{
ebb9cfe2 471 if (val & _PAGE_PRESENT) {
59438c9f 472 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
77be1fab 473 pteval_t flags = val & PTE_FLAGS_MASK;
d8355aca 474 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
ebb9cfe2 475 }
947a69c9 476
ebb9cfe2 477 return val;
947a69c9
JF
478}
479
ebb9cfe2 480static pteval_t pte_pfn_to_mfn(pteval_t val)
947a69c9 481{
ebb9cfe2 482 if (val & _PAGE_PRESENT) {
59438c9f 483 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
77be1fab 484 pteval_t flags = val & PTE_FLAGS_MASK;
d8355aca 485 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
947a69c9
JF
486 }
487
ebb9cfe2 488 return val;
947a69c9
JF
489}
490
ebb9cfe2 491pteval_t xen_pte_val(pte_t pte)
947a69c9 492{
ebb9cfe2 493 return pte_mfn_to_pfn(pte.pte);
947a69c9 494}
da5de7c2 495PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
947a69c9 496
947a69c9
JF
497pgdval_t xen_pgd_val(pgd_t pgd)
498{
ebb9cfe2 499 return pte_mfn_to_pfn(pgd.pgd);
947a69c9 500}
da5de7c2 501PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
947a69c9
JF
502
503pte_t xen_make_pte(pteval_t pte)
504{
ebb9cfe2
JF
505 pte = pte_pfn_to_mfn(pte);
506 return native_make_pte(pte);
947a69c9 507}
da5de7c2 508PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
947a69c9
JF
509
510pgd_t xen_make_pgd(pgdval_t pgd)
511{
ebb9cfe2
JF
512 pgd = pte_pfn_to_mfn(pgd);
513 return native_make_pgd(pgd);
947a69c9 514}
da5de7c2 515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
947a69c9
JF
516
517pmdval_t xen_pmd_val(pmd_t pmd)
518{
ebb9cfe2 519 return pte_mfn_to_pfn(pmd.pmd);
947a69c9 520}
da5de7c2 521PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
28499143 522
e2426cf8 523void xen_set_pud_hyper(pud_t *ptr, pud_t val)
f4f97b3e 524{
400d3494 525 struct mmu_update u;
f4f97b3e 526
d66bf8fc
JF
527 preempt_disable();
528
400d3494
JF
529 xen_mc_batch();
530
ce803e70
JF
531 /* ptr may be ioremapped for 64-bit pagetable setup */
532 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
400d3494 533 u.val = pud_val_ma(val);
7708ad64 534 xen_extend_mmu_update(&u);
d66bf8fc 535
994025ca
JF
536 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
537
d66bf8fc
JF
538 xen_mc_issue(PARAVIRT_LAZY_MMU);
539
540 preempt_enable();
f4f97b3e
JF
541}
542
e2426cf8
JF
543void xen_set_pud(pud_t *ptr, pud_t val)
544{
994025ca
JF
545 ADD_STATS(pud_update, 1);
546
e2426cf8
JF
547 /* If page is not pinned, we can just update the entry
548 directly */
7708ad64 549 if (!xen_page_pinned(ptr)) {
e2426cf8
JF
550 *ptr = val;
551 return;
552 }
553
994025ca
JF
554 ADD_STATS(pud_update_pinned, 1);
555
e2426cf8
JF
556 xen_set_pud_hyper(ptr, val);
557}
558
f4f97b3e
JF
559void xen_set_pte(pte_t *ptep, pte_t pte)
560{
994025ca
JF
561 ADD_STATS(pte_update, 1);
562// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
563 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
564
f6e58732 565#ifdef CONFIG_X86_PAE
f4f97b3e
JF
566 ptep->pte_high = pte.pte_high;
567 smp_wmb();
568 ptep->pte_low = pte.pte_low;
f6e58732
JF
569#else
570 *ptep = pte;
571#endif
f4f97b3e
JF
572}
573
f6e58732 574#ifdef CONFIG_X86_PAE
3b827c1b
JF
575void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
576{
f6e58732 577 set_64bit((u64 *)ptep, native_pte_val(pte));
3b827c1b
JF
578}
579
580void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
581{
582 ptep->pte_low = 0;
583 smp_wmb(); /* make sure low gets written first */
584 ptep->pte_high = 0;
585}
586
587void xen_pmd_clear(pmd_t *pmdp)
588{
e2426cf8 589 set_pmd(pmdp, __pmd(0));
3b827c1b 590}
f6e58732 591#endif /* CONFIG_X86_PAE */
3b827c1b 592
abf33038 593pmd_t xen_make_pmd(pmdval_t pmd)
3b827c1b 594{
ebb9cfe2 595 pmd = pte_pfn_to_mfn(pmd);
947a69c9 596 return native_make_pmd(pmd);
3b827c1b 597}
da5de7c2 598PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
3b827c1b 599
f6e58732
JF
600#if PAGETABLE_LEVELS == 4
601pudval_t xen_pud_val(pud_t pud)
602{
603 return pte_mfn_to_pfn(pud.pud);
604}
da5de7c2 605PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
f6e58732
JF
606
607pud_t xen_make_pud(pudval_t pud)
608{
609 pud = pte_pfn_to_mfn(pud);
610
611 return native_make_pud(pud);
612}
da5de7c2 613PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
f6e58732 614
d6182fbf 615pgd_t *xen_get_user_pgd(pgd_t *pgd)
f6e58732 616{
d6182fbf
JF
617 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
618 unsigned offset = pgd - pgd_page;
619 pgd_t *user_ptr = NULL;
f6e58732 620
d6182fbf
JF
621 if (offset < pgd_index(USER_LIMIT)) {
622 struct page *page = virt_to_page(pgd_page);
623 user_ptr = (pgd_t *)page->private;
624 if (user_ptr)
625 user_ptr += offset;
626 }
f6e58732 627
d6182fbf
JF
628 return user_ptr;
629}
630
631static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
632{
633 struct mmu_update u;
f6e58732
JF
634
635 u.ptr = virt_to_machine(ptr).maddr;
636 u.val = pgd_val_ma(val);
7708ad64 637 xen_extend_mmu_update(&u);
d6182fbf
JF
638}
639
640/*
641 * Raw hypercall-based set_pgd, intended for in early boot before
642 * there's a page structure. This implies:
643 * 1. The only existing pagetable is the kernel's
644 * 2. It is always pinned
645 * 3. It has no user pagetable attached to it
646 */
647void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
648{
649 preempt_disable();
650
651 xen_mc_batch();
652
653 __xen_set_pgd_hyper(ptr, val);
f6e58732
JF
654
655 xen_mc_issue(PARAVIRT_LAZY_MMU);
656
657 preempt_enable();
658}
659
660void xen_set_pgd(pgd_t *ptr, pgd_t val)
661{
d6182fbf
JF
662 pgd_t *user_ptr = xen_get_user_pgd(ptr);
663
994025ca
JF
664 ADD_STATS(pgd_update, 1);
665
f6e58732
JF
666 /* If page is not pinned, we can just update the entry
667 directly */
7708ad64 668 if (!xen_page_pinned(ptr)) {
f6e58732 669 *ptr = val;
d6182fbf 670 if (user_ptr) {
7708ad64 671 WARN_ON(xen_page_pinned(user_ptr));
d6182fbf
JF
672 *user_ptr = val;
673 }
f6e58732
JF
674 return;
675 }
676
994025ca
JF
677 ADD_STATS(pgd_update_pinned, 1);
678 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
679
d6182fbf
JF
680 /* If it's pinned, then we can at least batch the kernel and
681 user updates together. */
682 xen_mc_batch();
683
684 __xen_set_pgd_hyper(ptr, val);
685 if (user_ptr)
686 __xen_set_pgd_hyper(user_ptr, val);
687
688 xen_mc_issue(PARAVIRT_LAZY_MMU);
f6e58732
JF
689}
690#endif /* PAGETABLE_LEVELS == 4 */
691
f4f97b3e 692/*
5deb30d1
JF
693 * (Yet another) pagetable walker. This one is intended for pinning a
694 * pagetable. This means that it walks a pagetable and calls the
695 * callback function on each page it finds making up the page table,
696 * at every level. It walks the entire pagetable, but it only bothers
697 * pinning pte pages which are below limit. In the normal case this
698 * will be STACK_TOP_MAX, but at boot we need to pin up to
699 * FIXADDR_TOP.
700 *
701 * For 32-bit the important bit is that we don't pin beyond there,
702 * because then we start getting into Xen's ptes.
703 *
704 * For 64-bit, we must skip the Xen hole in the middle of the address
705 * space, just after the big x86-64 virtual hole.
706 */
86bbc2c2
IC
707static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
708 int (*func)(struct mm_struct *mm, struct page *,
709 enum pt_level),
710 unsigned long limit)
3b827c1b 711{
f4f97b3e 712 int flush = 0;
5deb30d1
JF
713 unsigned hole_low, hole_high;
714 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
715 unsigned pgdidx, pudidx, pmdidx;
f4f97b3e 716
5deb30d1
JF
717 /* The limit is the last byte to be touched */
718 limit--;
719 BUG_ON(limit >= FIXADDR_TOP);
3b827c1b
JF
720
721 if (xen_feature(XENFEAT_auto_translated_physmap))
f4f97b3e
JF
722 return 0;
723
5deb30d1
JF
724 /*
725 * 64-bit has a great big hole in the middle of the address
726 * space, which contains the Xen mappings. On 32-bit these
727 * will end up making a zero-sized hole and so is a no-op.
728 */
d6182fbf 729 hole_low = pgd_index(USER_LIMIT);
5deb30d1
JF
730 hole_high = pgd_index(PAGE_OFFSET);
731
732 pgdidx_limit = pgd_index(limit);
733#if PTRS_PER_PUD > 1
734 pudidx_limit = pud_index(limit);
735#else
736 pudidx_limit = 0;
737#endif
738#if PTRS_PER_PMD > 1
739 pmdidx_limit = pmd_index(limit);
740#else
741 pmdidx_limit = 0;
742#endif
743
5deb30d1 744 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
f4f97b3e 745 pud_t *pud;
3b827c1b 746
5deb30d1
JF
747 if (pgdidx >= hole_low && pgdidx < hole_high)
748 continue;
f4f97b3e 749
5deb30d1 750 if (!pgd_val(pgd[pgdidx]))
3b827c1b 751 continue;
f4f97b3e 752
5deb30d1 753 pud = pud_offset(&pgd[pgdidx], 0);
3b827c1b
JF
754
755 if (PTRS_PER_PUD > 1) /* not folded */
eefb47f6 756 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
f4f97b3e 757
5deb30d1 758 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
f4f97b3e 759 pmd_t *pmd;
f4f97b3e 760
5deb30d1
JF
761 if (pgdidx == pgdidx_limit &&
762 pudidx > pudidx_limit)
763 goto out;
3b827c1b 764
5deb30d1 765 if (pud_none(pud[pudidx]))
3b827c1b 766 continue;
f4f97b3e 767
5deb30d1 768 pmd = pmd_offset(&pud[pudidx], 0);
3b827c1b
JF
769
770 if (PTRS_PER_PMD > 1) /* not folded */
eefb47f6 771 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
f4f97b3e 772
5deb30d1
JF
773 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
774 struct page *pte;
775
776 if (pgdidx == pgdidx_limit &&
777 pudidx == pudidx_limit &&
778 pmdidx > pmdidx_limit)
779 goto out;
3b827c1b 780
5deb30d1 781 if (pmd_none(pmd[pmdidx]))
3b827c1b
JF
782 continue;
783
5deb30d1 784 pte = pmd_page(pmd[pmdidx]);
eefb47f6 785 flush |= (*func)(mm, pte, PT_PTE);
3b827c1b
JF
786 }
787 }
788 }
11ad93e5 789
5deb30d1 790out:
11ad93e5
JF
791 /* Do the top level last, so that the callbacks can use it as
792 a cue to do final things like tlb flushes. */
eefb47f6 793 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
f4f97b3e
JF
794
795 return flush;
3b827c1b
JF
796}
797
86bbc2c2
IC
798static int xen_pgd_walk(struct mm_struct *mm,
799 int (*func)(struct mm_struct *mm, struct page *,
800 enum pt_level),
801 unsigned long limit)
802{
803 return __xen_pgd_walk(mm, mm->pgd, func, limit);
804}
805
7708ad64
JF
806/* If we're using split pte locks, then take the page's lock and
807 return a pointer to it. Otherwise return NULL. */
eefb47f6 808static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
74260714
JF
809{
810 spinlock_t *ptl = NULL;
811
f7d0b926 812#if USE_SPLIT_PTLOCKS
74260714 813 ptl = __pte_lockptr(page);
eefb47f6 814 spin_lock_nest_lock(ptl, &mm->page_table_lock);
74260714
JF
815#endif
816
817 return ptl;
818}
819
7708ad64 820static void xen_pte_unlock(void *v)
74260714
JF
821{
822 spinlock_t *ptl = v;
823 spin_unlock(ptl);
824}
825
826static void xen_do_pin(unsigned level, unsigned long pfn)
827{
828 struct mmuext_op *op;
829 struct multicall_space mcs;
830
831 mcs = __xen_mc_entry(sizeof(*op));
832 op = mcs.args;
833 op->cmd = level;
834 op->arg1.mfn = pfn_to_mfn(pfn);
835 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
836}
837
eefb47f6
JF
838static int xen_pin_page(struct mm_struct *mm, struct page *page,
839 enum pt_level level)
f4f97b3e 840{
d60cd46b 841 unsigned pgfl = TestSetPagePinned(page);
f4f97b3e
JF
842 int flush;
843
844 if (pgfl)
845 flush = 0; /* already pinned */
846 else if (PageHighMem(page))
847 /* kmaps need flushing if we found an unpinned
848 highpage */
849 flush = 1;
850 else {
851 void *pt = lowmem_page_address(page);
852 unsigned long pfn = page_to_pfn(page);
853 struct multicall_space mcs = __xen_mc_entry(0);
74260714 854 spinlock_t *ptl;
f4f97b3e
JF
855
856 flush = 0;
857
11ad93e5
JF
858 /*
859 * We need to hold the pagetable lock between the time
860 * we make the pagetable RO and when we actually pin
861 * it. If we don't, then other users may come in and
862 * attempt to update the pagetable by writing it,
863 * which will fail because the memory is RO but not
864 * pinned, so Xen won't do the trap'n'emulate.
865 *
866 * If we're using split pte locks, we can't hold the
867 * entire pagetable's worth of locks during the
868 * traverse, because we may wrap the preempt count (8
869 * bits). The solution is to mark RO and pin each PTE
870 * page while holding the lock. This means the number
871 * of locks we end up holding is never more than a
872 * batch size (~32 entries, at present).
873 *
874 * If we're not using split pte locks, we needn't pin
875 * the PTE pages independently, because we're
876 * protected by the overall pagetable lock.
877 */
74260714
JF
878 ptl = NULL;
879 if (level == PT_PTE)
eefb47f6 880 ptl = xen_pte_lock(page, mm);
74260714 881
f4f97b3e
JF
882 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
883 pfn_pte(pfn, PAGE_KERNEL_RO),
74260714
JF
884 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
885
11ad93e5 886 if (ptl) {
74260714
JF
887 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
888
74260714
JF
889 /* Queue a deferred unlock for when this batch
890 is completed. */
7708ad64 891 xen_mc_callback(xen_pte_unlock, ptl);
74260714 892 }
f4f97b3e
JF
893 }
894
895 return flush;
896}
3b827c1b 897
f4f97b3e
JF
898/* This is called just after a mm has been created, but it has not
899 been used yet. We need to make sure that its pagetable is all
900 read-only, and can be pinned. */
eefb47f6 901static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
3b827c1b 902{
d05fdf31
JF
903 vm_unmap_aliases();
904
f4f97b3e 905 xen_mc_batch();
3b827c1b 906
86bbc2c2 907 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
d05fdf31 908 /* re-enable interrupts for flushing */
f87e4cac 909 xen_mc_issue(0);
d05fdf31 910
f4f97b3e 911 kmap_flush_unused();
d05fdf31 912
f87e4cac
JF
913 xen_mc_batch();
914 }
f4f97b3e 915
d6182fbf
JF
916#ifdef CONFIG_X86_64
917 {
918 pgd_t *user_pgd = xen_get_user_pgd(pgd);
919
920 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
921
922 if (user_pgd) {
eefb47f6 923 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
f63c2f24
T
924 xen_do_pin(MMUEXT_PIN_L4_TABLE,
925 PFN_DOWN(__pa(user_pgd)));
d6182fbf
JF
926 }
927 }
928#else /* CONFIG_X86_32 */
5deb30d1
JF
929#ifdef CONFIG_X86_PAE
930 /* Need to make sure unshared kernel PMD is pinnable */
47cb2ed9 931 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
eefb47f6 932 PT_PMD);
5deb30d1 933#endif
28499143 934 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
d6182fbf 935#endif /* CONFIG_X86_64 */
f4f97b3e 936 xen_mc_issue(0);
3b827c1b
JF
937}
938
eefb47f6
JF
939static void xen_pgd_pin(struct mm_struct *mm)
940{
941 __xen_pgd_pin(mm, mm->pgd);
942}
943
0e91398f
JF
944/*
945 * On save, we need to pin all pagetables to make sure they get their
946 * mfns turned into pfns. Search the list for any unpinned pgds and pin
947 * them (unpinned pgds are not currently in use, probably because the
948 * process is under construction or destruction).
eefb47f6
JF
949 *
950 * Expected to be called in stop_machine() ("equivalent to taking
951 * every spinlock in the system"), so the locking doesn't really
952 * matter all that much.
0e91398f
JF
953 */
954void xen_mm_pin_all(void)
955{
956 unsigned long flags;
957 struct page *page;
74260714 958
0e91398f 959 spin_lock_irqsave(&pgd_lock, flags);
f4f97b3e 960
0e91398f
JF
961 list_for_each_entry(page, &pgd_list, lru) {
962 if (!PagePinned(page)) {
eefb47f6 963 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
0e91398f
JF
964 SetPageSavePinned(page);
965 }
966 }
967
968 spin_unlock_irqrestore(&pgd_lock, flags);
3b827c1b
JF
969}
970
c1f2f09e
EH
971/*
972 * The init_mm pagetable is really pinned as soon as it's created, but
973 * that's before we have page structures to store the bits. So do all
974 * the book-keeping now.
975 */
eefb47f6
JF
976static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
977 enum pt_level level)
3b827c1b 978{
f4f97b3e
JF
979 SetPagePinned(page);
980 return 0;
981}
3b827c1b 982
f4f97b3e
JF
983void __init xen_mark_init_mm_pinned(void)
984{
eefb47f6 985 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
f4f97b3e 986}
3b827c1b 987
eefb47f6
JF
988static int xen_unpin_page(struct mm_struct *mm, struct page *page,
989 enum pt_level level)
f4f97b3e 990{
d60cd46b 991 unsigned pgfl = TestClearPagePinned(page);
3b827c1b 992
f4f97b3e
JF
993 if (pgfl && !PageHighMem(page)) {
994 void *pt = lowmem_page_address(page);
995 unsigned long pfn = page_to_pfn(page);
74260714
JF
996 spinlock_t *ptl = NULL;
997 struct multicall_space mcs;
998
11ad93e5
JF
999 /*
1000 * Do the converse to pin_page. If we're using split
1001 * pte locks, we must be holding the lock while
1002 * the pte page is unpinned but still RO to prevent
1003 * concurrent updates from seeing it in this
1004 * partially-pinned state.
1005 */
74260714 1006 if (level == PT_PTE) {
eefb47f6 1007 ptl = xen_pte_lock(page, mm);
74260714 1008
11ad93e5
JF
1009 if (ptl)
1010 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
74260714
JF
1011 }
1012
1013 mcs = __xen_mc_entry(0);
f4f97b3e
JF
1014
1015 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1016 pfn_pte(pfn, PAGE_KERNEL),
74260714
JF
1017 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1018
1019 if (ptl) {
1020 /* unlock when batch completed */
7708ad64 1021 xen_mc_callback(xen_pte_unlock, ptl);
74260714 1022 }
f4f97b3e
JF
1023 }
1024
1025 return 0; /* never need to flush on unpin */
3b827c1b
JF
1026}
1027
f4f97b3e 1028/* Release a pagetable's pages back as normal RW */
eefb47f6 1029static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
f4f97b3e 1030{
f4f97b3e
JF
1031 xen_mc_batch();
1032
74260714 1033 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
f4f97b3e 1034
d6182fbf
JF
1035#ifdef CONFIG_X86_64
1036 {
1037 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1038
1039 if (user_pgd) {
f63c2f24
T
1040 xen_do_pin(MMUEXT_UNPIN_TABLE,
1041 PFN_DOWN(__pa(user_pgd)));
eefb47f6 1042 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
d6182fbf
JF
1043 }
1044 }
1045#endif
1046
5deb30d1
JF
1047#ifdef CONFIG_X86_PAE
1048 /* Need to make sure unshared kernel PMD is unpinned */
47cb2ed9 1049 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
eefb47f6 1050 PT_PMD);
5deb30d1 1051#endif
d6182fbf 1052
86bbc2c2 1053 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
f4f97b3e
JF
1054
1055 xen_mc_issue(0);
1056}
3b827c1b 1057
eefb47f6
JF
1058static void xen_pgd_unpin(struct mm_struct *mm)
1059{
1060 __xen_pgd_unpin(mm, mm->pgd);
1061}
1062
0e91398f
JF
1063/*
1064 * On resume, undo any pinning done at save, so that the rest of the
1065 * kernel doesn't see any unexpected pinned pagetables.
1066 */
1067void xen_mm_unpin_all(void)
1068{
1069 unsigned long flags;
1070 struct page *page;
1071
1072 spin_lock_irqsave(&pgd_lock, flags);
1073
1074 list_for_each_entry(page, &pgd_list, lru) {
1075 if (PageSavePinned(page)) {
1076 BUG_ON(!PagePinned(page));
eefb47f6 1077 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
0e91398f
JF
1078 ClearPageSavePinned(page);
1079 }
1080 }
1081
1082 spin_unlock_irqrestore(&pgd_lock, flags);
1083}
1084
3b827c1b
JF
1085void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1086{
f4f97b3e 1087 spin_lock(&next->page_table_lock);
eefb47f6 1088 xen_pgd_pin(next);
f4f97b3e 1089 spin_unlock(&next->page_table_lock);
3b827c1b
JF
1090}
1091
1092void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1093{
f4f97b3e 1094 spin_lock(&mm->page_table_lock);
eefb47f6 1095 xen_pgd_pin(mm);
f4f97b3e 1096 spin_unlock(&mm->page_table_lock);
3b827c1b
JF
1097}
1098
3b827c1b 1099
f87e4cac
JF
1100#ifdef CONFIG_SMP
1101/* Another cpu may still have its %cr3 pointing at the pagetable, so
1102 we need to repoint it somewhere else before we can unpin it. */
1103static void drop_other_mm_ref(void *info)
1104{
1105 struct mm_struct *mm = info;
ce87b3d3 1106 struct mm_struct *active_mm;
3b827c1b 1107
9eb912d1 1108 active_mm = percpu_read(cpu_tlbstate.active_mm);
ce87b3d3
JF
1109
1110 if (active_mm == mm)
f87e4cac 1111 leave_mm(smp_processor_id());
9f79991d
JF
1112
1113 /* If this cpu still has a stale cr3 reference, then make sure
1114 it has been flushed. */
6dbde353 1115 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
9f79991d
JF
1116 load_cr3(swapper_pg_dir);
1117 arch_flush_lazy_cpu_mode();
1118 }
f87e4cac 1119}
3b827c1b 1120
7708ad64 1121static void xen_drop_mm_ref(struct mm_struct *mm)
f87e4cac 1122{
e4d98207 1123 cpumask_var_t mask;
9f79991d
JF
1124 unsigned cpu;
1125
f87e4cac
JF
1126 if (current->active_mm == mm) {
1127 if (current->mm == mm)
1128 load_cr3(swapper_pg_dir);
1129 else
1130 leave_mm(smp_processor_id());
9f79991d
JF
1131 arch_flush_lazy_cpu_mode();
1132 }
1133
1134 /* Get the "official" set of cpus referring to our pagetable. */
e4d98207
MT
1135 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1136 for_each_online_cpu(cpu) {
1137 if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
1138 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1139 continue;
1140 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1141 }
1142 return;
1143 }
1144 cpumask_copy(mask, &mm->cpu_vm_mask);
9f79991d
JF
1145
1146 /* It's possible that a vcpu may have a stale reference to our
1147 cr3, because it's in lazy mode and hasn't yet flushed
1148 its set of pending hypercalls. In this case, we can
1149 look at its actual current cr3 value, and force it to flush
1150 if needed. */
1151 for_each_online_cpu(cpu) {
1152 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
e4d98207 1153 cpumask_set_cpu(cpu, mask);
3b827c1b
JF
1154 }
1155
e4d98207
MT
1156 if (!cpumask_empty(mask))
1157 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1158 free_cpumask_var(mask);
f87e4cac
JF
1159}
1160#else
7708ad64 1161static void xen_drop_mm_ref(struct mm_struct *mm)
f87e4cac
JF
1162{
1163 if (current->active_mm == mm)
1164 load_cr3(swapper_pg_dir);
1165}
1166#endif
1167
1168/*
1169 * While a process runs, Xen pins its pagetables, which means that the
1170 * hypervisor forces it to be read-only, and it controls all updates
1171 * to it. This means that all pagetable updates have to go via the
1172 * hypervisor, which is moderately expensive.
1173 *
1174 * Since we're pulling the pagetable down, we switch to use init_mm,
1175 * unpin old process pagetable and mark it all read-write, which
1176 * allows further operations on it to be simple memory accesses.
1177 *
1178 * The only subtle point is that another CPU may still be using the
1179 * pagetable because of lazy tlb flushing. This means we need to
1180 * switch all CPUs off this pagetable before we can unpin it.
1181 */
1182void xen_exit_mmap(struct mm_struct *mm)
1183{
1184 get_cpu(); /* make sure we don't move around */
7708ad64 1185 xen_drop_mm_ref(mm);
f87e4cac 1186 put_cpu();
3b827c1b 1187
f120f13e 1188 spin_lock(&mm->page_table_lock);
df912ea4
JF
1189
1190 /* pgd may not be pinned in the error exit path of execve */
7708ad64 1191 if (xen_page_pinned(mm->pgd))
eefb47f6 1192 xen_pgd_unpin(mm);
74260714 1193
f120f13e 1194 spin_unlock(&mm->page_table_lock);
3b827c1b 1195}
994025ca 1196
319f3ba5
JF
1197static __init void xen_pagetable_setup_start(pgd_t *base)
1198{
1199}
1200
1201static __init void xen_pagetable_setup_done(pgd_t *base)
1202{
1203 xen_setup_shared_info();
1204}
1205
1206static void xen_write_cr2(unsigned long cr2)
1207{
1208 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1209}
1210
1211static unsigned long xen_read_cr2(void)
1212{
1213 return percpu_read(xen_vcpu)->arch.cr2;
1214}
1215
1216unsigned long xen_read_cr2_direct(void)
1217{
1218 return percpu_read(xen_vcpu_info.arch.cr2);
1219}
1220
1221static void xen_flush_tlb(void)
1222{
1223 struct mmuext_op *op;
1224 struct multicall_space mcs;
1225
1226 preempt_disable();
1227
1228 mcs = xen_mc_entry(sizeof(*op));
1229
1230 op = mcs.args;
1231 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1232 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1233
1234 xen_mc_issue(PARAVIRT_LAZY_MMU);
1235
1236 preempt_enable();
1237}
1238
1239static void xen_flush_tlb_single(unsigned long addr)
1240{
1241 struct mmuext_op *op;
1242 struct multicall_space mcs;
1243
1244 preempt_disable();
1245
1246 mcs = xen_mc_entry(sizeof(*op));
1247 op = mcs.args;
1248 op->cmd = MMUEXT_INVLPG_LOCAL;
1249 op->arg1.linear_addr = addr & PAGE_MASK;
1250 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1251
1252 xen_mc_issue(PARAVIRT_LAZY_MMU);
1253
1254 preempt_enable();
1255}
1256
1257static void xen_flush_tlb_others(const struct cpumask *cpus,
1258 struct mm_struct *mm, unsigned long va)
1259{
1260 struct {
1261 struct mmuext_op op;
1262 DECLARE_BITMAP(mask, NR_CPUS);
1263 } *args;
1264 struct multicall_space mcs;
1265
1266 BUG_ON(cpumask_empty(cpus));
1267 BUG_ON(!mm);
1268
1269 mcs = xen_mc_entry(sizeof(*args));
1270 args = mcs.args;
1271 args->op.arg2.vcpumask = to_cpumask(args->mask);
1272
1273 /* Remove us, and any offline CPUS. */
1274 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1275 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
319f3ba5
JF
1276
1277 if (va == TLB_FLUSH_ALL) {
1278 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1279 } else {
1280 args->op.cmd = MMUEXT_INVLPG_MULTI;
1281 args->op.arg1.linear_addr = va;
1282 }
1283
1284 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1285
319f3ba5
JF
1286 xen_mc_issue(PARAVIRT_LAZY_MMU);
1287}
1288
1289static unsigned long xen_read_cr3(void)
1290{
1291 return percpu_read(xen_cr3);
1292}
1293
1294static void set_current_cr3(void *v)
1295{
1296 percpu_write(xen_current_cr3, (unsigned long)v);
1297}
1298
1299static void __xen_write_cr3(bool kernel, unsigned long cr3)
1300{
1301 struct mmuext_op *op;
1302 struct multicall_space mcs;
1303 unsigned long mfn;
1304
1305 if (cr3)
1306 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1307 else
1308 mfn = 0;
1309
1310 WARN_ON(mfn == 0 && kernel);
1311
1312 mcs = __xen_mc_entry(sizeof(*op));
1313
1314 op = mcs.args;
1315 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1316 op->arg1.mfn = mfn;
1317
1318 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1319
1320 if (kernel) {
1321 percpu_write(xen_cr3, cr3);
1322
1323 /* Update xen_current_cr3 once the batch has actually
1324 been submitted. */
1325 xen_mc_callback(set_current_cr3, (void *)cr3);
1326 }
1327}
1328
1329static void xen_write_cr3(unsigned long cr3)
1330{
1331 BUG_ON(preemptible());
1332
1333 xen_mc_batch(); /* disables interrupts */
1334
1335 /* Update while interrupts are disabled, so its atomic with
1336 respect to ipis */
1337 percpu_write(xen_cr3, cr3);
1338
1339 __xen_write_cr3(true, cr3);
1340
1341#ifdef CONFIG_X86_64
1342 {
1343 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1344 if (user_pgd)
1345 __xen_write_cr3(false, __pa(user_pgd));
1346 else
1347 __xen_write_cr3(false, 0);
1348 }
1349#endif
1350
1351 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1352}
1353
1354static int xen_pgd_alloc(struct mm_struct *mm)
1355{
1356 pgd_t *pgd = mm->pgd;
1357 int ret = 0;
1358
1359 BUG_ON(PagePinned(virt_to_page(pgd)));
1360
1361#ifdef CONFIG_X86_64
1362 {
1363 struct page *page = virt_to_page(pgd);
1364 pgd_t *user_pgd;
1365
1366 BUG_ON(page->private != 0);
1367
1368 ret = -ENOMEM;
1369
1370 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1371 page->private = (unsigned long)user_pgd;
1372
1373 if (user_pgd != NULL) {
1374 user_pgd[pgd_index(VSYSCALL_START)] =
1375 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1376 ret = 0;
1377 }
1378
1379 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1380 }
1381#endif
1382
1383 return ret;
1384}
1385
1386static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1387{
1388#ifdef CONFIG_X86_64
1389 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1390
1391 if (user_pgd)
1392 free_page((unsigned long)user_pgd);
1393#endif
1394}
1395
1f4f9315
JF
1396#ifdef CONFIG_HIGHPTE
1397static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1398{
1399 pgprot_t prot = PAGE_KERNEL;
1400
1401 if (PagePinned(page))
1402 prot = PAGE_KERNEL_RO;
1403
1404 if (0 && PageHighMem(page))
1405 printk("mapping highpte %lx type %d prot %s\n",
1406 page_to_pfn(page), type,
1407 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1408
1409 return kmap_atomic_prot(page, type, prot);
1410}
1411#endif
1412
1413#ifdef CONFIG_X86_32
1414static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1415{
1416 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1417 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1418 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1419 pte_val_ma(pte));
1420
1421 return pte;
1422}
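/*
 * Illustrative aside (not part of this file): the mask built in
 * mask_rw_pte() above is ((old & _PAGE_RW) | ~_PAGE_RW), which is
 * all-ones when the existing pte is already writable and ~_PAGE_RW when
 * it is read-only -- so the new pte may only keep _PAGE_RW if the old
 * mapping had it.  Standalone demo; _PAGE_RW is taken as bit 1 here as
 * an assumed example value, and the demo_* names are invented.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_RW 0x2ULL

static uint64_t demo_mask_rw(uint64_t old_pte, uint64_t new_pte)
{
        return ((old_pte & DEMO_PAGE_RW) | ~DEMO_PAGE_RW) & new_pte;
}

int main(void)
{
        /* old pte read-only: _PAGE_RW is stripped from the new value */
        printf("%#llx\n", (unsigned long long)demo_mask_rw(0x1001, 0x2003));
        /* old pte writable: the new value passes through unchanged */
        printf("%#llx\n", (unsigned long long)demo_mask_rw(0x1003, 0x2003));
        return 0;
}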
1423
1424/* Init-time set_pte while constructing initial pagetables, which
1425 doesn't allow RO pagetable pages to be remapped RW */
1426static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1427{
1428 pte = mask_rw_pte(ptep, pte);
1429
1430 xen_set_pte(ptep, pte);
1431}
1432#endif
319f3ba5
JF
1433
1434/* Early in boot, while setting up the initial pagetable, assume
1435 everything is pinned. */
1436static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1437{
1438#ifdef CONFIG_FLATMEM
1439 BUG_ON(mem_map); /* should only be used early */
1440#endif
1441 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1442}
1443
1444/* Early release_pte assumes that all pts are pinned, since there's
1445 only init_mm and anything attached to that is pinned. */
1446static void xen_release_pte_init(unsigned long pfn)
1447{
1448 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1449}
1450
1451static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1452{
1453 struct mmuext_op op;
1454 op.cmd = cmd;
1455 op.arg1.mfn = pfn_to_mfn(pfn);
1456 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1457 BUG();
1458}
1459
1460/* This needs to make sure the new pte page is pinned iff it's being
1461 attached to a pinned pagetable. */
1462static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1463{
1464 struct page *page = pfn_to_page(pfn);
1465
1466 if (PagePinned(virt_to_page(mm->pgd))) {
1467 SetPagePinned(page);
1468
1469 vm_unmap_aliases();
1470 if (!PageHighMem(page)) {
1471 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1472 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1473 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1474 } else {
1475 /* make sure there are no stray mappings of
1476 this page */
1477 kmap_flush_unused();
1478 }
1479 }
1480}
1481
1482static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1483{
1484 xen_alloc_ptpage(mm, pfn, PT_PTE);
1485}
1486
1487static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1488{
1489 xen_alloc_ptpage(mm, pfn, PT_PMD);
1490}
1491
1492/* This should never happen until we're OK to use struct page */
1493static void xen_release_ptpage(unsigned long pfn, unsigned level)
1494{
1495 struct page *page = pfn_to_page(pfn);
1496
1497 if (PagePinned(page)) {
1498 if (!PageHighMem(page)) {
1499 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1500 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1501 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1502 }
1503 ClearPagePinned(page);
1504 }
1505}
1506
1507static void xen_release_pte(unsigned long pfn)
1508{
1509 xen_release_ptpage(pfn, PT_PTE);
1510}
1511
1512static void xen_release_pmd(unsigned long pfn)
1513{
1514 xen_release_ptpage(pfn, PT_PMD);
1515}
1516
1517#if PAGETABLE_LEVELS == 4
1518static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1519{
1520 xen_alloc_ptpage(mm, pfn, PT_PUD);
1521}
1522
1523static void xen_release_pud(unsigned long pfn)
1524{
1525 xen_release_ptpage(pfn, PT_PUD);
1526}
1527#endif
1528
1529void __init xen_reserve_top(void)
1530{
1531#ifdef CONFIG_X86_32
1532 unsigned long top = HYPERVISOR_VIRT_START;
1533 struct xen_platform_parameters pp;
1534
1535 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1536 top = pp.virt_start;
1537
1538 reserve_top_address(-top);
1539#endif /* CONFIG_X86_32 */
1540}
1541
1542/*
1543 * Like __va(), but returns address in the kernel mapping (which is
1544 * all we have until the physical memory mapping has been set up).
1545 */
1546static void *__ka(phys_addr_t paddr)
1547{
1548#ifdef CONFIG_X86_64
1549 return (void *)(paddr + __START_KERNEL_map);
1550#else
1551 return __va(paddr);
1552#endif
1553}
1554
1555/* Convert a machine address to physical address */
1556static unsigned long m2p(phys_addr_t maddr)
1557{
1558 phys_addr_t paddr;
1559
1560 maddr &= PTE_PFN_MASK;
1561 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1562
1563 return paddr;
1564}
1565
1566/* Convert a machine address to kernel virtual */
1567static void *m2v(phys_addr_t maddr)
1568{
1569 return __ka(m2p(maddr));
1570}
1571
1572static void set_page_prot(void *addr, pgprot_t prot)
1573{
1574 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1575 pte_t pte = pfn_pte(pfn, prot);
1576
1577 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1578 BUG();
1579}
1580
1581static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1582{
1583 unsigned pmdidx, pteidx;
1584 unsigned ident_pte;
1585 unsigned long pfn;
1586
1587 ident_pte = 0;
1588 pfn = 0;
1589 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1590 pte_t *pte_page;
1591
1592 /* Reuse or allocate a page of ptes */
1593 if (pmd_present(pmd[pmdidx]))
1594 pte_page = m2v(pmd[pmdidx].pmd);
1595 else {
1596 /* Check for free pte pages */
1597 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1598 break;
1599
1600 pte_page = &level1_ident_pgt[ident_pte];
1601 ident_pte += PTRS_PER_PTE;
1602
1603 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1604 }
1605
1606 /* Install mappings */
1607 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1608 pte_t pte;
1609
1610 if (pfn > max_pfn_mapped)
1611 max_pfn_mapped = pfn;
1612
1613 if (!pte_none(pte_page[pteidx]))
1614 continue;
1615
1616 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1617 pte_page[pteidx] = pte;
1618 }
1619 }
1620
1621 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1622 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1623
1624 set_page_prot(pmd, PAGE_KERNEL_RO);
1625}
1626
1627#ifdef CONFIG_X86_64
1628static void convert_pfn_mfn(void *v)
1629{
1630 pte_t *pte = v;
1631 int i;
1632
1633 /* All levels are converted the same way, so just treat them
1634 as ptes. */
1635 for (i = 0; i < PTRS_PER_PTE; i++)
1636 pte[i] = xen_make_pte(pte[i].pte);
1637}
1638
1639/*
1640 * Set up the initial kernel pagetable.
1641 *
1642 * We can construct this by grafting the Xen provided pagetable into
1643 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1644 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1645 * means that only the kernel has a physical mapping to start with -
1646 * but that's enough to get __va working. We need to fill in the rest
1647 * of the physical mapping once some sort of allocator has been set
1648 * up.
1649 */
1650__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1651 unsigned long max_pfn)
1652{
1653 pud_t *l3;
1654 pmd_t *l2;
1655
1656 /* Zap identity mapping */
1657 init_level4_pgt[0] = __pgd(0);
1658
1659 /* Pre-constructed entries are in pfn, so convert to mfn */
1660 convert_pfn_mfn(init_level4_pgt);
1661 convert_pfn_mfn(level3_ident_pgt);
1662 convert_pfn_mfn(level3_kernel_pgt);
1663
1664 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1665 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1666
1667 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1668 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1669
1670 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1671 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1672 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1673
1674 /* Set up identity map */
1675 xen_map_identity_early(level2_ident_pgt, max_pfn);
1676
1677 /* Make pagetable pieces RO */
1678 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1679 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1680 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1681 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1682 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1683 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1684
1685 /* Pin down new L4 */
1686 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1687 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1688
1689 /* Unpin Xen-provided one */
1690 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1691
1692 /* Switch over */
1693 pgd = init_level4_pgt;
1694
1695 /*
1696 * At this stage there can be no user pgd, and no page
1697 * structure to attach it to, so make sure we just set kernel
1698 * pgd.
1699 */
1700 xen_mc_batch();
1701 __xen_write_cr3(true, __pa(pgd));
1702 xen_mc_issue(PARAVIRT_LAZY_CPU);
1703
1704 reserve_early(__pa(xen_start_info->pt_base),
1705 __pa(xen_start_info->pt_base +
1706 xen_start_info->nr_pt_frames * PAGE_SIZE),
1707 "XEN PAGETABLES");
1708
1709 return pgd;
1710}
1711#else /* !CONFIG_X86_64 */
1712static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1713
1714__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1715 unsigned long max_pfn)
1716{
1717 pmd_t *kernel_pmd;
1718
1719 init_pg_tables_start = __pa(pgd);
1720 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1721 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1722
1723 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1724 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1725
1726 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1727
1728 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1729 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1730 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1731
1732 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1733 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1734 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1735
1736 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1737
1738 xen_write_cr3(__pa(swapper_pg_dir));
1739
1740 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1741
1742 return swapper_pg_dir;
1743}
1744#endif /* CONFIG_X86_64 */
1745
1746static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1747{
1748 pte_t pte;
1749
1750 phys >>= PAGE_SHIFT;
1751
1752 switch (idx) {
1753 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1754#ifdef CONFIG_X86_F00F_BUG
1755 case FIX_F00F_IDT:
1756#endif
1757#ifdef CONFIG_X86_32
1758 case FIX_WP_TEST:
1759 case FIX_VDSO:
1760# ifdef CONFIG_HIGHMEM
1761 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1762# endif
1763#else
1764 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1765#endif
1766#ifdef CONFIG_X86_LOCAL_APIC
1767 case FIX_APIC_BASE: /* maps dummy local APIC */
1768#endif
1769 pte = pfn_pte(phys, prot);
1770 break;
1771
1772 default:
1773 pte = mfn_pte(phys, prot);
1774 break;
1775 }
1776
1777 __native_set_fixmap(idx, pte);
1778
1779#ifdef CONFIG_X86_64
1780 /* Replicate changes to map the vsyscall page into the user
1781 pagetable vsyscall mapping. */
1782 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1783 unsigned long vaddr = __fix_to_virt(idx);
1784 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1785 }
1786#endif
1787}
1788
1789__init void xen_post_allocator_init(void)
1790{
1791 pv_mmu_ops.set_pte = xen_set_pte;
1792 pv_mmu_ops.set_pmd = xen_set_pmd;
1793 pv_mmu_ops.set_pud = xen_set_pud;
1794#if PAGETABLE_LEVELS == 4
1795 pv_mmu_ops.set_pgd = xen_set_pgd;
1796#endif
1797
1798 /* This will work as long as patching hasn't happened yet
1799 (which it hasn't) */
1800 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1801 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1802 pv_mmu_ops.release_pte = xen_release_pte;
1803 pv_mmu_ops.release_pmd = xen_release_pmd;
1804#if PAGETABLE_LEVELS == 4
1805 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1806 pv_mmu_ops.release_pud = xen_release_pud;
1807#endif
1808
1809#ifdef CONFIG_X86_64
1810 SetPagePinned(virt_to_page(level3_user_vsyscall));
1811#endif
1812 xen_mark_init_mm_pinned();
1813}
1814
1815
1816const struct pv_mmu_ops xen_mmu_ops __initdata = {
1817 .pagetable_setup_start = xen_pagetable_setup_start,
1818 .pagetable_setup_done = xen_pagetable_setup_done,
1819
1820 .read_cr2 = xen_read_cr2,
1821 .write_cr2 = xen_write_cr2,
1822
1823 .read_cr3 = xen_read_cr3,
1824 .write_cr3 = xen_write_cr3,
1825
1826 .flush_tlb_user = xen_flush_tlb,
1827 .flush_tlb_kernel = xen_flush_tlb,
1828 .flush_tlb_single = xen_flush_tlb_single,
1829 .flush_tlb_others = xen_flush_tlb_others,
1830
1831 .pte_update = paravirt_nop,
1832 .pte_update_defer = paravirt_nop,
1833
1834 .pgd_alloc = xen_pgd_alloc,
1835 .pgd_free = xen_pgd_free,
1836
1837 .alloc_pte = xen_alloc_pte_init,
1838 .release_pte = xen_release_pte_init,
1839 .alloc_pmd = xen_alloc_pte_init,
1840 .alloc_pmd_clone = paravirt_nop,
1841 .release_pmd = xen_release_pte_init,
1842
1843#ifdef CONFIG_HIGHPTE
1844 .kmap_atomic_pte = xen_kmap_atomic_pte,
1845#endif
1846
1847#ifdef CONFIG_X86_64
1848 .set_pte = xen_set_pte,
1849#else
1850 .set_pte = xen_set_pte_init,
1851#endif
1852 .set_pte_at = xen_set_pte_at,
1853 .set_pmd = xen_set_pmd_hyper,
1854
1855 .ptep_modify_prot_start = __ptep_modify_prot_start,
1856 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1857
da5de7c2
JF
1858 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1859 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
319f3ba5 1860
da5de7c2
JF
1861 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1862 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
319f3ba5
JF
1863
1864#ifdef CONFIG_X86_PAE
1865 .set_pte_atomic = xen_set_pte_atomic,
1866 .set_pte_present = xen_set_pte_at,
1867 .pte_clear = xen_pte_clear,
1868 .pmd_clear = xen_pmd_clear,
1869#endif /* CONFIG_X86_PAE */
1870 .set_pud = xen_set_pud_hyper,
1871
da5de7c2
JF
1872 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1873 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
319f3ba5
JF
1874
1875#if PAGETABLE_LEVELS == 4
da5de7c2
JF
1876 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1877 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
319f3ba5
JF
1878 .set_pgd = xen_set_pgd_hyper,
1879
1880 .alloc_pud = xen_alloc_pte_init,
1881 .release_pud = xen_release_pte_init,
1882#endif /* PAGETABLE_LEVELS == 4 */
1883
1884 .activate_mm = xen_activate_mm,
1885 .dup_mmap = xen_dup_mmap,
1886 .exit_mmap = xen_exit_mmap,
1887
1888 .lazy_mode = {
1889 .enter = paravirt_enter_lazy_mmu,
1890 .leave = xen_leave_lazy,
1891 },
1892
1893 .set_fixmap = xen_set_fixmap,
1894};
1895
1896
994025ca
JF
1897#ifdef CONFIG_XEN_DEBUG_FS
1898
1899static struct dentry *d_mmu_debug;
1900
1901static int __init xen_mmu_debugfs(void)
1902{
1903 struct dentry *d_xen = xen_init_debugfs();
1904
1905 if (d_xen == NULL)
1906 return -ENOMEM;
1907
1908 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1909
1910 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1911
1912 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1913 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1914 &mmu_stats.pgd_update_pinned);
1915 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1916 &mmu_stats.pgd_update_batched);
1917
1918 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1919 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1920 &mmu_stats.pud_update_pinned);
1921 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1922 &mmu_stats.pud_update_batched);
1923
1924 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1925 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1926 &mmu_stats.pmd_update_pinned);
1927 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1928 &mmu_stats.pmd_update_batched);
1929
1930 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1931// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1932// &mmu_stats.pte_update_pinned);
1933 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1934 &mmu_stats.pte_update_batched);
1935
1936 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1937 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1938 &mmu_stats.mmu_update_extended);
1939 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1940 mmu_stats.mmu_update_histo, 20);
1941
1942 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1943 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1944 &mmu_stats.set_pte_at_batched);
1945 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1946 &mmu_stats.set_pte_at_current);
1947 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1948 &mmu_stats.set_pte_at_kernel);
1949
1950 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1951 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1952 &mmu_stats.prot_commit_batched);
1953
1954 return 0;
1955}
1956fs_initcall(xen_mmu_debugfs);
1957
1958#endif /* CONFIG_XEN_DEBUG_FS */