arch/x86/xen/mmu_pv.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
  35  * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched/mm.h>
  42 #include <linux/highmem.h>
  43 #include <linux/debugfs.h>
  44 #include <linux/bug.h>
  45 #include <linux/vmalloc.h>
  46 #include <linux/export.h>
  47 #include <linux/init.h>
  48 #include <linux/gfp.h>
  49 #include <linux/memblock.h>
  50 #include <linux/seq_file.h>
  51 #include <linux/crash_dump.h>
  52 #ifdef CONFIG_KEXEC_CORE
  53 #include <linux/kexec.h>
  54 #endif
  55
  56 #include <trace/events/xen.h>
  57
  58 #include <asm/pgtable.h>
  59 #include <asm/tlbflush.h>
  60 #include <asm/fixmap.h>
  61 #include <asm/mmu_context.h>
  62 #include <asm/setup.h>
  63 #include <asm/paravirt.h>
  64 #include <asm/e820/api.h>
  65 #include <asm/linkage.h>
  66 #include <asm/page.h>
  67 #include <asm/init.h>
  68 #include <asm/pat.h>
  69 #include <asm/smp.h>
  70
  71 #include <asm/xen/hypercall.h>
  72 #include <asm/xen/hypervisor.h>
  73
  74 #include <xen/xen.h>
  75 #include <xen/page.h>
  76 #include <xen/interface/xen.h>
  77 #include <xen/interface/hvm/hvm_op.h>
  78 #include <xen/interface/version.h>
  79 #include <xen/interface/memory.h>
  80 #include <xen/hvc-console.h>
  81
  82 #include "multicalls.h"
  83 #include "mmu.h"
  84 #include "debugfs.h"
  85
  86 #ifdef CONFIG_X86_32
  87 /*
  88  * Identity map, in addition to the plain kernel map.  This needs to be
  89  * large enough to allocate page table pages to allocate the rest.
  90  * Each page can map 2MB; entries for four pte pages are reserved below.
  91  */
  92 #define LEVEL1_IDENT_ENTRIES    (PTRS_PER_PTE * 4)
  93 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
  94 #endif /* CONFIG_X86_32 */
  95 #ifdef CONFIG_X86_64
  96 /* Level-3 (pud) table for the userspace vsyscall mapping */
  97 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
  98 #endif /* CONFIG_X86_64 */
  99
 100 /*
 101  * Note about cr3 (pagetable base) values:
 102  *
 103  * xen_cr3 contains the current logical cr3 value; it contains the
 104  * last set cr3.  This may not be the current effective cr3, because
 105  * its update may be being lazily deferred.  However, a vcpu looking
 106  * at its own cr3 can use this value knowing that everything will
 107  * be self-consistent.
 108  *
 109  * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 110  * hypercall to set the vcpu cr3 is complete (so it may be a little
 111  * out of date, but it will never be set early).  If one vcpu is
 112  * looking at another vcpu's cr3 value, it should use this variable.
 113  */
 114 DEFINE_PER_CPU(unsigned long, xen_cr3);  /* logical cr3, stored as physaddr */
 115 DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual (effective) vcpu cr3 */
 116
 117 static phys_addr_t xen_pt_base, xen_pt_size __initdata; /* NOTE(review): as written, __initdata binds only to xen_pt_size - confirm xen_pt_base is intentionally not init-only */
 118
 119 /*
 120  * First address just beyond the highest usermode address.  STACK_TOP_MAX
 121  * has a redzone above it, so it is rounded up to a PGDIR_SIZE boundary.
 122  */
 123 #define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 124
 125 void make_lowmem_page_readonly(void *vaddr)
 126 {
 127         pte_t *pte, ptev;
 128         unsigned long address = (unsigned long)vaddr;
 129         unsigned int level;
 130
 131         pte = lookup_address(address, &level);
 132         if (pte == NULL)
 133                 return;         /* vaddr missing */
 134
 135         ptev = pte_wrprotect(*pte);
 136
 137         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 138                 BUG();
 139 }
 140
 141 void make_lowmem_page_readwrite(void *vaddr)
 142 {
 143         pte_t *pte, ptev;
 144         unsigned long address = (unsigned long)vaddr;
 145         unsigned int level;
 146
 147         pte = lookup_address(address, &level);
 148         if (pte == NULL)
 149                 return;         /* vaddr missing */
 150
 151         ptev = pte_mkwrite(*pte);
 152
 153         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 154                 BUG();
 155 }
 156
 157
 158 static bool xen_page_pinned(void *ptr)
 159 {
 160         struct page *page = virt_to_page(ptr);
 161
 162         return PagePinned(page);
 163 }
 164
 165 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 166 {
 167         struct multicall_space mcs;
 168         struct mmu_update *u;
 169
 170         trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
 171
 172         mcs = xen_mc_entry(sizeof(*u));
 173         u = mcs.args;
 174
 175         /* ptep might be kmapped when using 32-bit HIGHPTE */
 176         u->ptr = virt_to_machine(ptep).maddr;
 177         u->val = pte_val_ma(pteval);
 178
 179         MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 180
 181         xen_mc_issue(PARAVIRT_LAZY_MMU);
 182 }
 183 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 184
 185 static void xen_extend_mmu_update(const struct mmu_update *update)
 186 {
 187         struct multicall_space mcs;
 188         struct mmu_update *u;
 189
 190         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 191
 192         if (mcs.mc != NULL) {
 193                 mcs.mc->args[1]++;
 194         } else {
 195                 mcs = __xen_mc_entry(sizeof(*u));
 196                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 197         }
 198
 199         u = mcs.args;
 200         *u = *update;
 201 }
 202
 203 static void xen_extend_mmuext_op(const struct mmuext_op *op)
 204 {
 205         struct multicall_space mcs;
 206         struct mmuext_op *u;
 207
 208         mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
 209
 210         if (mcs.mc != NULL) {
 211                 mcs.mc->args[1]++;
 212         } else {
 213                 mcs = __xen_mc_entry(sizeof(*u));
 214                 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 215         }
 216
 217         u = mcs.args;
 218         *u = *op;
 219 }
 220
 221 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 222 {
 223         struct mmu_update u;
 224
 225         preempt_disable();
 226
 227         xen_mc_batch();
 228
 229         /* ptr may be ioremapped for 64-bit pagetable setup */
 230         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 231         u.val = pmd_val_ma(val);
 232         xen_extend_mmu_update(&u);
 233
 234         xen_mc_issue(PARAVIRT_LAZY_MMU);
 235
 236         preempt_enable();
 237 }
 238
 239 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 240 {
 241         trace_xen_mmu_set_pmd(ptr, val);
 242
 243         /* If page is not pinned, we can just update the entry
 244            directly */
 245         if (!xen_page_pinned(ptr)) {
 246                 *ptr = val;
 247                 return;
 248         }
 249
 250         xen_set_pmd_hyper(ptr, val);
 251 }
 252
 253 /*
 254  * Associate a virtual page frame with a given physical page frame
 255  * and protection flags for that frame.
 256  */
 257 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 258 {
 259         set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 260 }
 261
 262 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 263 {
 264         struct mmu_update u;
 265
 266         if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
 267                 return false;
 268
 269         xen_mc_batch();
 270
 271         u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 272         u.val = pte_val_ma(pteval);
 273         xen_extend_mmu_update(&u);
 274
 275         xen_mc_issue(PARAVIRT_LAZY_MMU);
 276
 277         return true;
 278 }
 279
 280 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 281 {
 282         if (!xen_batched_set_pte(ptep, pteval)) {
 283                 /*
 284                  * Could call native_set_pte() here and trap and
 285                  * emulate the PTE write but with 32-bit guests this
 286                  * needs two traps (one for each of the two 32-bit
 287                  * words in the PTE) so do one hypercall directly
 288                  * instead.
 289                  */
 290                 struct mmu_update u;
 291
 292                 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 293                 u.val = pte_val_ma(pteval);
 294                 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 295         }
 296 }
 297
 298 static void xen_set_pte(pte_t *ptep, pte_t pteval)
 299 {
 300         trace_xen_mmu_set_pte(ptep, pteval);
 301         __xen_set_pte(ptep, pteval);
 302 }
 303
 304 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 305                     pte_t *ptep, pte_t pteval)
 306 {
 307         trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
 308         __xen_set_pte(ptep, pteval);
 309 }
 310
 311 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 312                                  unsigned long addr, pte_t *ptep)
 313 {
 314         /* Just return the pte as-is.  We preserve the bits on commit */
 315         trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
 316         return *ptep;
 317 }
 318
 319 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 320                                  pte_t *ptep, pte_t pte)
 321 {
 322         struct mmu_update u;
 323
 324         trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
 325         xen_mc_batch();
 326
 327         u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 328         u.val = pte_val_ma(pte);
 329         xen_extend_mmu_update(&u);
 330
 331         xen_mc_issue(PARAVIRT_LAZY_MMU);
 332 }
 333
 334 /* Assume pteval_t is equivalent to all the other *val_t types. */
 335 static pteval_t pte_mfn_to_pfn(pteval_t val)
 336 {
 337         if (val & _PAGE_PRESENT) {
 338                 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 339                 unsigned long pfn = mfn_to_pfn(mfn);
 340
 341                 pteval_t flags = val & PTE_FLAGS_MASK;
 342                 if (unlikely(pfn == ~0))
 343                         val = flags & ~_PAGE_PRESENT;
 344                 else
 345                         val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 346         }
 347
 348         return val;
 349 }
 350
 351 static pteval_t pte_pfn_to_mfn(pteval_t val)
 352 {
 353         if (val & _PAGE_PRESENT) {
 354                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 355                 pteval_t flags = val & PTE_FLAGS_MASK;
 356                 unsigned long mfn;
 357
 358                 if (!xen_feature(XENFEAT_auto_translated_physmap))
 359                         mfn = __pfn_to_mfn(pfn);
 360                 else
 361                         mfn = pfn;
 362                 /*
 363                  * If there's no mfn for the pfn, then just create an
 364                  * empty non-present pte.  Unfortunately this loses
 365                  * information about the original pfn, so
 366                  * pte_mfn_to_pfn is asymmetric.
 367                  */
 368                 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 369                         mfn = 0;
 370                         flags = 0;
 371                 } else
 372                         mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
 373                 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 374         }
 375
 376         return val;
 377 }
 378
 379 __visible pteval_t xen_pte_val(pte_t pte)
 380 {
 381         pteval_t pteval = pte.pte;
 382
 383         return pte_mfn_to_pfn(pteval);
 384 }
 385 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 386
 387 __visible pgdval_t xen_pgd_val(pgd_t pgd)
 388 {
 389         return pte_mfn_to_pfn(pgd.pgd);
 390 }
 391 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 392
 393 __visible pte_t xen_make_pte(pteval_t pte)
 394 {
 395         pte = pte_pfn_to_mfn(pte);
 396
 397         return native_make_pte(pte);
 398 }
 399 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 400
 401 __visible pgd_t xen_make_pgd(pgdval_t pgd)
 402 {
 403         pgd = pte_pfn_to_mfn(pgd);
 404         return native_make_pgd(pgd);
 405 }
 406 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 407
 408 __visible pmdval_t xen_pmd_val(pmd_t pmd)
 409 {
 410         return pte_mfn_to_pfn(pmd.pmd);
 411 }
 412 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 413
 414 static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 415 {
 416         struct mmu_update u;
 417
 418         preempt_disable();
 419
 420         xen_mc_batch();
 421
 422         /* ptr may be ioremapped for 64-bit pagetable setup */
 423         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 424         u.val = pud_val_ma(val);
 425         xen_extend_mmu_update(&u);
 426
 427         xen_mc_issue(PARAVIRT_LAZY_MMU);
 428
 429         preempt_enable();
 430 }
 431
 432 static void xen_set_pud(pud_t *ptr, pud_t val)
 433 {
 434         trace_xen_mmu_set_pud(ptr, val);
 435
 436         /* If page is not pinned, we can just update the entry
 437            directly */
 438         if (!xen_page_pinned(ptr)) {
 439                 *ptr = val;
 440                 return;
 441         }
 442
 443         xen_set_pud_hyper(ptr, val);
 444 }
 445
 446 #ifdef CONFIG_X86_PAE
 447 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 448 {
 449         trace_xen_mmu_set_pte_atomic(ptep, pte);
 450         set_64bit((u64 *)ptep, native_pte_val(pte));
 451 }
 452
 453 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 454 {
 455         trace_xen_mmu_pte_clear(mm, addr, ptep);
 456         if (!xen_batched_set_pte(ptep, native_make_pte(0)))
 457                 native_pte_clear(mm, addr, ptep);
 458 }
 459
 460 static void xen_pmd_clear(pmd_t *pmdp)
 461 {
 462         trace_xen_mmu_pmd_clear(pmdp);
 463         set_pmd(pmdp, __pmd(0));
 464 }
 465 #endif  /* CONFIG_X86_PAE */
 466
 467 __visible pmd_t xen_make_pmd(pmdval_t pmd)
 468 {
 469         pmd = pte_pfn_to_mfn(pmd);
 470         return native_make_pmd(pmd);
 471 }
 472 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 473
 474 #if CONFIG_PGTABLE_LEVELS == 4
 475 __visible pudval_t xen_pud_val(pud_t pud)
 476 {
 477         return pte_mfn_to_pfn(pud.pud);
 478 }
 479 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 480
 481 __visible pud_t xen_make_pud(pudval_t pud)
 482 {
 483         pud = pte_pfn_to_mfn(pud);
 484
 485         return native_make_pud(pud);
 486 }
 487 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 488
 489 static pgd_t *xen_get_user_pgd(pgd_t *pgd)
 490 {
 491         pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 492         unsigned offset = pgd - pgd_page;
 493         pgd_t *user_ptr = NULL;
 494
 495         if (offset < pgd_index(USER_LIMIT)) {
 496                 struct page *page = virt_to_page(pgd_page);
 497                 user_ptr = (pgd_t *)page->private;
 498                 if (user_ptr)
 499                         user_ptr += offset;
 500         }
 501
 502         return user_ptr;
 503 }
 504
 505 static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
 506 {
 507         struct mmu_update u;
 508
 509         u.ptr = virt_to_machine(ptr).maddr;
 510         u.val = p4d_val_ma(val);
 511         xen_extend_mmu_update(&u);
 512 }
 513
 514 /*
 515  * Raw hypercall-based set_p4d, intended for in early boot before
 516  * there's a page structure.  This implies:
 517  *  1. The only existing pagetable is the kernel's
 518  *  2. It is always pinned
 519  *  3. It has no user pagetable attached to it
 520  */
 521 static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
 522 {
 523         preempt_disable();
 524
 525         xen_mc_batch();
 526
 527         __xen_set_p4d_hyper(ptr, val);
 528
 529         xen_mc_issue(PARAVIRT_LAZY_MMU);
 530
 531         preempt_enable();
 532 }
 533
 534 static void xen_set_p4d(p4d_t *ptr, p4d_t val)
 535 {
 536         pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
 537         pgd_t pgd_val;
 538
 539         trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
 540
 541         /* If page is not pinned, we can just update the entry
 542            directly */
 543         if (!xen_page_pinned(ptr)) {
 544                 *ptr = val;
 545                 if (user_ptr) {
 546                         WARN_ON(xen_page_pinned(user_ptr));
 547                         pgd_val.pgd = p4d_val_ma(val);
 548                         *user_ptr = pgd_val;
 549                 }
 550                 return;
 551         }
 552
 553         /* If it's pinned, then we can at least batch the kernel and
 554            user updates together. */
 555         xen_mc_batch();
 556
 557         __xen_set_p4d_hyper(ptr, val);
 558         if (user_ptr)
 559                 __xen_set_p4d_hyper((p4d_t *)user_ptr, val);
 560
 561         xen_mc_issue(PARAVIRT_LAZY_MMU);
 562 }
 563 #endif  /* CONFIG_PGTABLE_LEVELS == 4 */
 564
 565 static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
 566                 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
 567                 bool last, unsigned long limit)
 568 {
 569         int i, nr, flush = 0;
 570
 571         nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
 572         for (i = 0; i < nr; i++) {
 573                 if (!pmd_none(pmd[i]))
 574                         flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
 575         }
 576         return flush;
 577 }
 578
 579 static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
 580                 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
 581                 bool last, unsigned long limit)
 582 {
 583         int i, nr, flush = 0;
 584
 585         nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
 586         for (i = 0; i < nr; i++) {
 587                 pmd_t *pmd;
 588
 589                 if (pud_none(pud[i]))
 590                         continue;
 591
 592                 pmd = pmd_offset(&pud[i], 0);
 593                 if (PTRS_PER_PMD > 1)
 594                         flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 595                 flush |= xen_pmd_walk(mm, pmd, func,
 596                                 last && i == nr - 1, limit);
 597         }
 598         return flush;
 599 }
 600
 601 static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
 602                 int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
 603                 bool last, unsigned long limit)
 604 {
 605         int i, nr, flush = 0;
 606
 607         nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
 608         for (i = 0; i < nr; i++) {
 609                 pud_t *pud;
 610
 611                 if (p4d_none(p4d[i]))
 612                         continue;
 613
 614                 pud = pud_offset(&p4d[i], 0);
 615                 if (PTRS_PER_PUD > 1)
 616                         flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 617                 flush |= xen_pud_walk(mm, pud, func,
 618                                 last && i == nr - 1, limit);
 619         }
 620         return flush;
 621 }
 622
 623 /*
 624  * (Yet another) pagetable walker.  This one is intended for pinning a
 625  * pagetable.  This means that it walks a pagetable and calls the
 626  * callback function on each page it finds making up the page table,
 627  * at every level.  It walks the entire pagetable, but it only bothers
 628  * pinning pte pages which are below limit.  In the normal case this
 629  * will be STACK_TOP_MAX, but at boot we need to pin up to
 630  * FIXADDR_TOP.
 631  *
 632  * For 32-bit the important bit is that we don't pin beyond there,
 633  * because then we start getting into Xen's ptes.
 634  *
 635  * For 64-bit, we must skip the Xen hole in the middle of the address
 636  * space, just after the big x86-64 virtual hole.
 637  */
 638 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 639                           int (*func)(struct mm_struct *mm, struct page *,
 640                                       enum pt_level),
 641                           unsigned long limit)
 642 {
 643         int i, nr, flush = 0;
 644         unsigned hole_low, hole_high;
 645
 646         /* The limit is the last byte to be touched */
 647         limit--;
 648         BUG_ON(limit >= FIXADDR_TOP);
 649
 650         if (xen_feature(XENFEAT_auto_translated_physmap))
 651                 return 0;
 652
 653         /*
 654          * 64-bit has a great big hole in the middle of the address
 655          * space, which contains the Xen mappings.  On 32-bit these
 656          * will end up making a zero-sized hole and so is a no-op.
 657          */
 658         hole_low = pgd_index(USER_LIMIT);
 659         hole_high = pgd_index(PAGE_OFFSET);
 660
 661         nr = pgd_index(limit) + 1;
 662         for (i = 0; i < nr; i++) {
 663                 p4d_t *p4d;
 664
 665                 if (i >= hole_low && i < hole_high)
 666                         continue;
 667
 668                 if (pgd_none(pgd[i]))
 669                         continue;
 670
 671                 p4d = p4d_offset(&pgd[i], 0);
 672                 if (PTRS_PER_P4D > 1)
 673                         flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
 674                 flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
 675         }
 676
 677         /* Do the top level last, so that the callbacks can use it as
 678            a cue to do final things like tlb flushes. */
 679         flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 680
 681         return flush;
 682 }
 683
 684 static int xen_pgd_walk(struct mm_struct *mm,
 685                         int (*func)(struct mm_struct *mm, struct page *,
 686                                     enum pt_level),
 687                         unsigned long limit)
 688 {
 689         return __xen_pgd_walk(mm, mm->pgd, func, limit);
 690 }
 691
 692 /* If we're using split pte locks, then take the page's lock and
 693    return a pointer to it.  Otherwise return NULL. */
 694 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 695 {
 696         spinlock_t *ptl = NULL;
 697
 698 #if USE_SPLIT_PTE_PTLOCKS
 699         ptl = ptlock_ptr(page);
 700         spin_lock_nest_lock(ptl, &mm->page_table_lock);
 701 #endif
 702
 703         return ptl;
 704 }
 705
 706 static void xen_pte_unlock(void *v)
 707 {
 708         spinlock_t *ptl = v;
 709         spin_unlock(ptl);
 710 }
 711
 712 static void xen_do_pin(unsigned level, unsigned long pfn)
 713 {
 714         struct mmuext_op op;
 715
 716         op.cmd = level;
 717         op.arg1.mfn = pfn_to_mfn(pfn);
 718
 719         xen_extend_mmuext_op(&op);
 720 }
 721
 722 static int xen_pin_page(struct mm_struct *mm, struct page *page,
 723                         enum pt_level level)
 724 {
 725         unsigned pgfl = TestSetPagePinned(page);
 726         int flush;
 727
 728         if (pgfl)
 729                 flush = 0;              /* already pinned */
 730         else if (PageHighMem(page))
 731                 /* kmaps need flushing if we found an unpinned
 732                    highpage */
 733                 flush = 1;
 734         else {
 735                 void *pt = lowmem_page_address(page);
 736                 unsigned long pfn = page_to_pfn(page);
 737                 struct multicall_space mcs = __xen_mc_entry(0);
 738                 spinlock_t *ptl;
 739
 740                 flush = 0;
 741
 742                 /*
 743                  * We need to hold the pagetable lock between the time
 744                  * we make the pagetable RO and when we actually pin
 745                  * it.  If we don't, then other users may come in and
 746                  * attempt to update the pagetable by writing it,
 747                  * which will fail because the memory is RO but not
 748                  * pinned, so Xen won't do the trap'n'emulate.
 749                  *
 750                  * If we're using split pte locks, we can't hold the
 751                  * entire pagetable's worth of locks during the
 752                  * traverse, because we may wrap the preempt count (8
 753                  * bits).  The solution is to mark RO and pin each PTE
 754                  * page while holding the lock.  This means the number
 755                  * of locks we end up holding is never more than a
 756                  * batch size (~32 entries, at present).
 757                  *
 758                  * If we're not using split pte locks, we needn't pin
 759                  * the PTE pages independently, because we're
 760                  * protected by the overall pagetable lock.
 761                  */
 762                 ptl = NULL;
 763                 if (level == PT_PTE)
 764                         ptl = xen_pte_lock(page, mm);
 765
 766                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 767                                         pfn_pte(pfn, PAGE_KERNEL_RO),
 768                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 769
 770                 if (ptl) {
 771                         xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 772
 773                         /* Queue a deferred unlock for when this batch
 774                            is completed. */
 775                         xen_mc_callback(xen_pte_unlock, ptl);
 776                 }
 777         }
 778
 779         return flush;
 780 }
 781
 782 /* This is called just after a mm has been created, but it has not
 783    been used yet.  We need to make sure that its pagetable is all
 784    read-only, and can be pinned. */
 785 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 786 {
 787         trace_xen_mmu_pgd_pin(mm, pgd);
 788
 789         xen_mc_batch();
 790
 791         if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
 792                 /* re-enable interrupts for flushing */
 793                 xen_mc_issue(0);
 794
 795                 kmap_flush_unused();
 796
 797                 xen_mc_batch();
 798         }
 799
 800 #ifdef CONFIG_X86_64
 801         {
 802                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
 803
 804                 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 805
 806                 if (user_pgd) {
 807                         xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
 808                         xen_do_pin(MMUEXT_PIN_L4_TABLE,
 809                                    PFN_DOWN(__pa(user_pgd)));
 810                 }
 811         }
 812 #else /* CONFIG_X86_32 */
 813 #ifdef CONFIG_X86_PAE
 814         /* Need to make sure unshared kernel PMD is pinnable */
 815         xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 816                      PT_PMD);
 817 #endif
 818         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 819 #endif /* CONFIG_X86_64 */
 820         xen_mc_issue(0);
 821 }
 822
 823 static void xen_pgd_pin(struct mm_struct *mm)
 824 {
 825         __xen_pgd_pin(mm, mm->pgd);
 826 }
 827
 828 /*
 829  * On save, we need to pin all pagetables to make sure they get their
 830  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 831  * them (unpinned pgds are not currently in use, probably because the
 832  * process is under construction or destruction).
 833  *
 834  * Expected to be called in stop_machine() ("equivalent to taking
 835  * every spinlock in the system"), so the locking doesn't really
 836  * matter all that much.
 837  */
 838 void xen_mm_pin_all(void)
 839 {
 840         struct page *page;
 841
 842         spin_lock(&pgd_lock);
 843
 844         list_for_each_entry(page, &pgd_list, lru) {
 845                 if (!PagePinned(page)) {
 846                         __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
 847                         SetPageSavePinned(page);
 848                 }
 849         }
 850
 851         spin_unlock(&pgd_lock);
 852 }
 853
 854 /*
 855  * The init_mm pagetable is really pinned as soon as its created, but
 856  * that's before we have page structures to store the bits.  So do all
 857  * the book-keeping now.
 858  */
 859 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
 860                                   enum pt_level level)
 861 {
 862         SetPagePinned(page);
 863         return 0;
 864 }
 865
 866 static void __init xen_mark_init_mm_pinned(void)
 867 {
 868         xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 869 }
 870
 871 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
 872                           enum pt_level level)
 873 {
 874         unsigned pgfl = TestClearPagePinned(page);
 875
 876         if (pgfl && !PageHighMem(page)) {
 877                 void *pt = lowmem_page_address(page);
 878                 unsigned long pfn = page_to_pfn(page);
 879                 spinlock_t *ptl = NULL;
 880                 struct multicall_space mcs;
 881
 882                 /*
 883                  * Do the converse to pin_page.  If we're using split
 884                  * pte locks, we must be holding the lock for while
 885                  * the pte page is unpinned but still RO to prevent
 886                  * concurrent updates from seeing it in this
 887                  * partially-pinned state.
 888                  */
 889                 if (level == PT_PTE) {
 890                         ptl = xen_pte_lock(page, mm);
 891
 892                         if (ptl)
 893                                 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
 894                 }
 895
 896                 mcs = __xen_mc_entry(0);
 897
 898                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 899                                         pfn_pte(pfn, PAGE_KERNEL),
 900                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 901
 902                 if (ptl) {
 903                         /* unlock when batch completed */
 904                         xen_mc_callback(xen_pte_unlock, ptl);
 905                 }
 906         }
 907
 908         return 0;               /* never need to flush on unpin */
 909 }
 910
 911 /* Release a pagetables pages back as normal RW */
 912 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
 913 {
 914         trace_xen_mmu_pgd_unpin(mm, pgd);
 915
 916         xen_mc_batch();
 917
 918         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 919
 920 #ifdef CONFIG_X86_64
 921         {
 922                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
 923
 924                 if (user_pgd) {
 925                         xen_do_pin(MMUEXT_UNPIN_TABLE,
 926                                    PFN_DOWN(__pa(user_pgd)));
 927                         xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
 928                 }
 929         }
 930 #endif
 931
 932 #ifdef CONFIG_X86_PAE
 933         /* Need to make sure unshared kernel PMD is unpinned */
 934         xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 935                        PT_PMD);
 936 #endif
 937
 938         __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
 939
 940         xen_mc_issue(0);
 941 }
 942
 943 static void xen_pgd_unpin(struct mm_struct *mm)
 944 {
 945         __xen_pgd_unpin(mm, mm->pgd);
 946 }
 947
 948 /*
 949  * On resume, undo any pinning done at save, so that the rest of the
 950  * kernel doesn't see any unexpected pinned pagetables.
 951  */
 952 void xen_mm_unpin_all(void)
 953 {
 954         struct page *page;
 955
 956         spin_lock(&pgd_lock);
 957
 958         list_for_each_entry(page, &pgd_list, lru) {
 959                 if (PageSavePinned(page)) {
 960                         BUG_ON(!PagePinned(page));
 961                         __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
 962                         ClearPageSavePinned(page);
 963                 }
 964         }
 965
 966         spin_unlock(&pgd_lock);
 967 }
 968
 969 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 970 {
 971         spin_lock(&next->page_table_lock);
 972         xen_pgd_pin(next);
 973         spin_unlock(&next->page_table_lock);
 974 }
 975
 976 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 977 {
 978         spin_lock(&mm->page_table_lock);
 979         xen_pgd_pin(mm);
 980         spin_unlock(&mm->page_table_lock);
 981 }
 982
 983
 984 #ifdef CONFIG_SMP
 985 /* Another cpu may still have their %cr3 pointing at the pagetable, so
 986    we need to repoint it somewhere else before we can unpin it. */
 987 static void drop_other_mm_ref(void *info)
 988 {
 989         struct mm_struct *mm = info;
 990         struct mm_struct *active_mm;
 991
 992         active_mm = this_cpu_read(cpu_tlbstate.active_mm);
 993
 994         if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
 995                 leave_mm(smp_processor_id());
 996
 997         /* If this cpu still has a stale cr3 reference, then make sure
 998            it has been flushed. */
 999         if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1000                 load_cr3(swapper_pg_dir);
1001 }
1002
1003 static void xen_drop_mm_ref(struct mm_struct *mm)
1004 {
1005         cpumask_var_t mask;
1006         unsigned cpu;
1007
1008         if (current->active_mm == mm) {
1009                 if (current->mm == mm)
1010                         load_cr3(swapper_pg_dir);
1011                 else
1012                         leave_mm(smp_processor_id());
1013         }
1014
1015         /* Get the "official" set of cpus referring to our pagetable. */
1016         if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1017                 for_each_online_cpu(cpu) {
1018                         if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1019                             && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1020                                 continue;
1021                         smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1022                 }
1023                 return;
1024         }
1025         cpumask_copy(mask, mm_cpumask(mm));
1026
1027         /* It's possible that a vcpu may have a stale reference to our
1028            cr3, because its in lazy mode, and it hasn't yet flushed
1029            its set of pending hypercalls yet.  In this case, we can
1030            look at its actual current cr3 value, and force it to flush
1031            if needed. */
1032         for_each_online_cpu(cpu) {
1033                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1034                         cpumask_set_cpu(cpu, mask);
1035         }
1036
1037         if (!cpumask_empty(mask))
1038                 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1039         free_cpumask_var(mask);
1040 }
1041 #else
1042 static void xen_drop_mm_ref(struct mm_struct *mm)
1043 {
1044         if (current->active_mm == mm)
1045                 load_cr3(swapper_pg_dir);
1046 }
1047 #endif
1048
1049 /*
1050  * While a process runs, Xen pins its pagetables, which means that the
1051  * hypervisor forces it to be read-only, and it controls all updates
1052  * to it.  This means that all pagetable updates have to go via the
1053  * hypervisor, which is moderately expensive.
1054  *
1055  * Since we're pulling the pagetable down, we switch to use init_mm,
1056  * unpin old process pagetable and mark it all read-write, which
1057  * allows further operations on it to be simple memory accesses.
1058  *
1059  * The only subtle point is that another CPU may be still using the
1060  * pagetable because of lazy tlb flushing.  This means we need need to
1061  * switch all CPUs off this pagetable before we can unpin it.
1062  */
1063 static void xen_exit_mmap(struct mm_struct *mm)
1064 {
1065         get_cpu();              /* make sure we don't move around */
1066         xen_drop_mm_ref(mm);
1067         put_cpu();
1068
1069         spin_lock(&mm->page_table_lock);
1070
1071         /* pgd may not be pinned in the error exit path of execve */
1072         if (xen_page_pinned(mm->pgd))
1073                 xen_pgd_unpin(mm);
1074
1075         spin_unlock(&mm->page_table_lock);
1076 }
1077
1078 static void xen_post_allocator_init(void);
1079
1080 static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1081 {
1082         struct mmuext_op op;
1083
1084         op.cmd = cmd;
1085         op.arg1.mfn = pfn_to_mfn(pfn);
1086         if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1087                 BUG();
1088 }
1089
1090 #ifdef CONFIG_X86_64
1091 static void __init xen_cleanhighmap(unsigned long vaddr,
1092                                     unsigned long vaddr_end)
1093 {
1094         unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1095         pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1096
1097         /* NOTE: The loop is more greedy than the cleanup_highmap variant.
1098          * We include the PMD passed in on _both_ boundaries. */
1099         for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD));
1100                         pmd++, vaddr += PMD_SIZE) {
1101                 if (pmd_none(*pmd))
1102                         continue;
1103                 if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1104                         set_pmd(pmd, __pmd(0));
1105         }
1106         /* In case we did something silly, we should crash in this function
1107          * instead of somewhere later and be confusing. */
1108         xen_mc_flush();
1109 }
1110
1111 /*
1112  * Make a page range writeable and free it.
1113  */
1114 static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1115 {
1116         void *vaddr = __va(paddr);
1117         void *vaddr_end = vaddr + size;
1118
1119         for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1120                 make_lowmem_page_readwrite(vaddr);
1121
1122         memblock_free(paddr, size);
1123 }
1124
1125 static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
1126 {
1127         unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
1128
1129         if (unpin)
1130                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
1131         ClearPagePinned(virt_to_page(__va(pa)));
1132         xen_free_ro_pages(pa, PAGE_SIZE);
1133 }
1134
1135 static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
1136 {
1137         unsigned long pa;
1138         pte_t *pte_tbl;
1139         int i;
1140
1141         if (pmd_large(*pmd)) {
1142                 pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1143                 xen_free_ro_pages(pa, PMD_SIZE);
1144                 return;
1145         }
1146
1147         pte_tbl = pte_offset_kernel(pmd, 0);
1148         for (i = 0; i < PTRS_PER_PTE; i++) {
1149                 if (pte_none(pte_tbl[i]))
1150                         continue;
1151                 pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
1152                 xen_free_ro_pages(pa, PAGE_SIZE);
1153         }
1154         set_pmd(pmd, __pmd(0));
1155         xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
1156 }
1157
1158 static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
1159 {
1160         unsigned long pa;
1161         pmd_t *pmd_tbl;
1162         int i;
1163
1164         if (pud_large(*pud)) {
1165                 pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1166                 xen_free_ro_pages(pa, PUD_SIZE);
1167                 return;
1168         }
1169
1170         pmd_tbl = pmd_offset(pud, 0);
1171         for (i = 0; i < PTRS_PER_PMD; i++) {
1172                 if (pmd_none(pmd_tbl[i]))
1173                         continue;
1174                 xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
1175         }
1176         set_pud(pud, __pud(0));
1177         xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
1178 }
1179
1180 static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
1181 {
1182         unsigned long pa;
1183         pud_t *pud_tbl;
1184         int i;
1185
1186         if (p4d_large(*p4d)) {
1187                 pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
1188                 xen_free_ro_pages(pa, P4D_SIZE);
1189                 return;
1190         }
1191
1192         pud_tbl = pud_offset(p4d, 0);
1193         for (i = 0; i < PTRS_PER_PUD; i++) {
1194                 if (pud_none(pud_tbl[i]))
1195                         continue;
1196                 xen_cleanmfnmap_pud(pud_tbl + i, unpin);
1197         }
1198         set_p4d(p4d, __p4d(0));
1199         xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
1200 }
1201
1202 /*
1203  * Since it is well isolated we can (and since it is perhaps large we should)
1204  * also free the page tables mapping the initial P->M table.
1205  */
1206 static void __init xen_cleanmfnmap(unsigned long vaddr)
1207 {
1208         pgd_t *pgd;
1209         p4d_t *p4d;
1210         unsigned int i;
1211         bool unpin;
1212
1213         unpin = (vaddr == 2 * PGDIR_SIZE);
1214         vaddr &= PMD_MASK;
1215         pgd = pgd_offset_k(vaddr);
1216         p4d = p4d_offset(pgd, 0);
1217         for (i = 0; i < PTRS_PER_P4D; i++) {
1218                 if (p4d_none(p4d[i]))
1219                         continue;
1220                 xen_cleanmfnmap_p4d(p4d + i, unpin);
1221         }
1222         if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
1223                 set_pgd(pgd, __pgd(0));
1224                 xen_cleanmfnmap_free_pgtbl(p4d, unpin);
1225         }
1226 }
1227
1228 static void __init xen_pagetable_p2m_free(void)
1229 {
1230         unsigned long size;
1231         unsigned long addr;
1232
1233         size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1234
1235         /* No memory or already called. */
1236         if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
1237                 return;
1238
1239         /* using __ka address and sticking INVALID_P2M_ENTRY! */
1240         memset((void *)xen_start_info->mfn_list, 0xff, size);
1241
1242         addr = xen_start_info->mfn_list;
1243         /*
1244          * We could be in __ka space.
1245          * We roundup to the PMD, which means that if anybody at this stage is
1246          * using the __ka address of xen_start_info or
1247          * xen_start_info->shared_info they are in going to crash. Fortunatly
1248          * we have already revectored in xen_setup_kernel_pagetable and in
1249          * xen_setup_shared_info.
1250          */
1251         size = roundup(size, PMD_SIZE);
1252
1253         if (addr >= __START_KERNEL_map) {
1254                 xen_cleanhighmap(addr, addr + size);
1255                 size = PAGE_ALIGN(xen_start_info->nr_pages *
1256                                   sizeof(unsigned long));
1257                 memblock_free(__pa(addr), size);
1258         } else {
1259                 xen_cleanmfnmap(addr);
1260         }
1261 }
1262
1263 static void __init xen_pagetable_cleanhighmap(void)
1264 {
1265         unsigned long size;
1266         unsigned long addr;
1267
1268         /* At this stage, cleanup_highmap has already cleaned __ka space
1269          * from _brk_limit way up to the max_pfn_mapped (which is the end of
1270          * the ramdisk). We continue on, erasing PMD entries that point to page
1271          * tables - do note that they are accessible at this stage via __va.
1272          * For good measure we also round up to the PMD - which means that if
1273          * anybody is using __ka address to the initial boot-stack - and try
1274          * to use it - they are going to crash. The xen_start_info has been
1275          * taken care of already in xen_setup_kernel_pagetable. */
1276         addr = xen_start_info->pt_base;
1277         size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1278
1279         xen_cleanhighmap(addr, addr + size);
1280         xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1281 #ifdef DEBUG
1282         /* This is superfluous and is not necessary, but you know what
1283          * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
1284          * anything at this stage. */
1285         xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1286 #endif
1287 }
1288 #endif
1289
1290 static void __init xen_pagetable_p2m_setup(void)
1291 {
1292         if (xen_feature(XENFEAT_auto_translated_physmap))
1293                 return;
1294
1295         xen_vmalloc_p2m_tree();
1296
1297 #ifdef CONFIG_X86_64
1298         xen_pagetable_p2m_free();
1299
1300         xen_pagetable_cleanhighmap();
1301 #endif
1302         /* And revector! Bye bye old array */
1303         xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
1304 }
1305
1306 static void __init xen_pagetable_init(void)
1307 {
1308         paging_init();
1309         xen_post_allocator_init();
1310
1311         xen_pagetable_p2m_setup();
1312
1313         /* Allocate and initialize top and mid mfn levels for p2m structure */
1314         xen_build_mfn_list_list();
1315
1316         /* Remap memory freed due to conflicts with E820 map */
1317         if (!xen_feature(XENFEAT_auto_translated_physmap))
1318                 xen_remap_memory();
1319
1320         xen_setup_shared_info();
1321 }
1322 static void xen_write_cr2(unsigned long cr2)
1323 {
1324         this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1325 }
1326
1327 static unsigned long xen_read_cr2(void)
1328 {
1329         return this_cpu_read(xen_vcpu)->arch.cr2;
1330 }
1331
1332 unsigned long xen_read_cr2_direct(void)
1333 {
1334         return this_cpu_read(xen_vcpu_info.arch.cr2);
1335 }
1336
1337 static void xen_flush_tlb(void)
1338 {
1339         struct mmuext_op *op;
1340         struct multicall_space mcs;
1341
1342         trace_xen_mmu_flush_tlb(0);
1343
1344         preempt_disable();
1345
1346         mcs = xen_mc_entry(sizeof(*op));
1347
1348         op = mcs.args;
1349         op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1350         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1351
1352         xen_mc_issue(PARAVIRT_LAZY_MMU);
1353
1354         preempt_enable();
1355 }
1356
1357 static void xen_flush_tlb_single(unsigned long addr)
1358 {
1359         struct mmuext_op *op;
1360         struct multicall_space mcs;
1361
1362         trace_xen_mmu_flush_tlb_single(addr);
1363
1364         preempt_disable();
1365
1366         mcs = xen_mc_entry(sizeof(*op));
1367         op = mcs.args;
1368         op->cmd = MMUEXT_INVLPG_LOCAL;
1369         op->arg1.linear_addr = addr & PAGE_MASK;
1370         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1371
1372         xen_mc_issue(PARAVIRT_LAZY_MMU);
1373
1374         preempt_enable();
1375 }
1376
1377 static void xen_flush_tlb_others(const struct cpumask *cpus,
1378                                  struct mm_struct *mm, unsigned long start,
1379                                  unsigned long end)
1380 {
1381         struct {
1382                 struct mmuext_op op;
1383 #ifdef CONFIG_SMP
1384                 DECLARE_BITMAP(mask, num_processors);
1385 #else
1386                 DECLARE_BITMAP(mask, NR_CPUS);
1387 #endif
1388         } *args;
1389         struct multicall_space mcs;
1390
1391         trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1392
1393         if (cpumask_empty(cpus))
1394                 return;         /* nothing to do */
1395
1396         mcs = xen_mc_entry(sizeof(*args));
1397         args = mcs.args;
1398         args->op.arg2.vcpumask = to_cpumask(args->mask);
1399
1400         /* Remove us, and any offline CPUS. */
1401         cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1402         cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1403
1404         args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1405         if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1406                 args->op.cmd = MMUEXT_INVLPG_MULTI;
1407                 args->op.arg1.linear_addr = start;
1408         }
1409
1410         MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1411
1412         xen_mc_issue(PARAVIRT_LAZY_MMU);
1413 }
1414
1415 static unsigned long xen_read_cr3(void)
1416 {
1417         return this_cpu_read(xen_cr3);
1418 }
1419
1420 static void set_current_cr3(void *v)
1421 {
1422         this_cpu_write(xen_current_cr3, (unsigned long)v);
1423 }
1424
1425 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1426 {
1427         struct mmuext_op op;
1428         unsigned long mfn;
1429
1430         trace_xen_mmu_write_cr3(kernel, cr3);
1431
1432         if (cr3)
1433                 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1434         else
1435                 mfn = 0;
1436
1437         WARN_ON(mfn == 0 && kernel);
1438
1439         op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1440         op.arg1.mfn = mfn;
1441
1442         xen_extend_mmuext_op(&op);
1443
1444         if (kernel) {
1445                 this_cpu_write(xen_cr3, cr3);
1446
1447                 /* Update xen_current_cr3 once the batch has actually
1448                    been submitted. */
1449                 xen_mc_callback(set_current_cr3, (void *)cr3);
1450         }
1451 }
1452 static void xen_write_cr3(unsigned long cr3)
1453 {
1454         BUG_ON(preemptible());
1455
1456         xen_mc_batch();  /* disables interrupts */
1457
1458         /* Update while interrupts are disabled, so its atomic with
1459            respect to ipis */
1460         this_cpu_write(xen_cr3, cr3);
1461
1462         __xen_write_cr3(true, cr3);
1463
1464 #ifdef CONFIG_X86_64
1465         {
1466                 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1467                 if (user_pgd)
1468                         __xen_write_cr3(false, __pa(user_pgd));
1469                 else
1470                         __xen_write_cr3(false, 0);
1471         }
1472 #endif
1473
1474         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1475 }
1476
1477 #ifdef CONFIG_X86_64
1478 /*
1479  * At the start of the day - when Xen launches a guest, it has already
1480  * built pagetables for the guest. We diligently look over them
 * in xen_setup_kernel_pagetable and graft them as appropriate into
 * init_level4_pgt and its friends. Then when we are happy we load
1483  * the new init_level4_pgt - and continue on.
1484  *
1485  * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1486  * up the rest of the pagetables. When it has completed it loads the cr3.
1487  * N.B. that baremetal would start at 'start_kernel' (and the early
1488  * #PF handler would create bootstrap pagetables) - so we are running
1489  * with the same assumptions as what to do when write_cr3 is executed
1490  * at this point.
1491  *
1492  * Since there are no user-page tables at all, we have two variants
1493  * of xen_write_cr3 - the early bootup (this one), and the late one
1494  * (xen_write_cr3). The reason we have to do that is that in 64-bit
1495  * the Linux kernel and user-space are both in ring 3 while the
1496  * hypervisor is in ring 0.
1497  */
1498 static void __init xen_write_cr3_init(unsigned long cr3)
1499 {
1500         BUG_ON(preemptible());
1501
1502         xen_mc_batch();  /* disables interrupts */
1503
1504         /* Update while interrupts are disabled, so its atomic with
1505            respect to ipis */
1506         this_cpu_write(xen_cr3, cr3);
1507
1508         __xen_write_cr3(true, cr3);
1509
1510         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1511 }
1512 #endif
1513
1514 static int xen_pgd_alloc(struct mm_struct *mm)
1515 {
1516         pgd_t *pgd = mm->pgd;
1517         int ret = 0;
1518
1519         BUG_ON(PagePinned(virt_to_page(pgd)));
1520
1521 #ifdef CONFIG_X86_64
1522         {
1523                 struct page *page = virt_to_page(pgd);
1524                 pgd_t *user_pgd;
1525
1526                 BUG_ON(page->private != 0);
1527
1528                 ret = -ENOMEM;
1529
1530                 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1531                 page->private = (unsigned long)user_pgd;
1532
1533                 if (user_pgd != NULL) {
1534 #ifdef CONFIG_X86_VSYSCALL_EMULATION
1535                         user_pgd[pgd_index(VSYSCALL_ADDR)] =
1536                                 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1537 #endif
1538                         ret = 0;
1539                 }
1540
1541                 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1542         }
1543 #endif
1544         return ret;
1545 }
1546
1547 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1548 {
1549 #ifdef CONFIG_X86_64
1550         pgd_t *user_pgd = xen_get_user_pgd(pgd);
1551
1552         if (user_pgd)
1553                 free_page((unsigned long)user_pgd);
1554 #endif
1555 }
1556
1557 /*
1558  * Init-time set_pte while constructing initial pagetables, which
1559  * doesn't allow RO page table pages to be remapped RW.
1560  *
1561  * If there is no MFN for this PFN then this page is initially
1562  * ballooned out so clear the PTE (as in decrease_reservation() in
1563  * drivers/xen/balloon.c).
1564  *
1565  * Many of these PTE updates are done on unpinned and writable pages
1566  * and doing a hypercall for these is unnecessary and expensive.  At
1567  * this point it is not possible to tell if a page is pinned or not,
1568  * so always write the PTE directly and rely on Xen trapping and
1569  * emulating any updates as necessary.
1570  */
1571 __visible pte_t xen_make_pte_init(pteval_t pte)
1572 {
1573 #ifdef CONFIG_X86_64
1574         unsigned long pfn;
1575
1576         /*
1577          * Pages belonging to the initial p2m list mapped outside the default
1578          * address range must be mapped read-only. This region contains the
1579          * page tables for mapping the p2m list, too, and page tables MUST be
1580          * mapped read-only.
1581          */
1582         pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
1583         if (xen_start_info->mfn_list < __START_KERNEL_map &&
1584             pfn >= xen_start_info->first_p2m_pfn &&
1585             pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
1586                 pte &= ~_PAGE_RW;
1587 #endif
1588         pte = pte_pfn_to_mfn(pte);
1589         return native_make_pte(pte);
1590 }
1591 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
1592
1593 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1594 {
1595 #ifdef CONFIG_X86_32
1596         /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1597         if (pte_mfn(pte) != INVALID_P2M_ENTRY
1598             && pte_val_ma(*ptep) & _PAGE_PRESENT)
1599                 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1600                                pte_val_ma(pte));
1601 #endif
1602         native_set_pte(ptep, pte);
1603 }
1604
1605 /* Early in boot, while setting up the initial pagetable, assume
1606    everything is pinned. */
1607 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1608 {
1609 #ifdef CONFIG_FLATMEM
1610         BUG_ON(mem_map);        /* should only be used early */
1611 #endif
1612         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1613         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1614 }
1615
1616 /* Used for pmd and pud */
1617 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1618 {
1619 #ifdef CONFIG_FLATMEM
1620         BUG_ON(mem_map);        /* should only be used early */
1621 #endif
1622         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1623 }
1624
1625 /* Early release_pte assumes that all pts are pinned, since there's
1626    only init_mm and anything attached to that is pinned. */
1627 static void __init xen_release_pte_init(unsigned long pfn)
1628 {
1629         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1630         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1631 }
1632
1633 static void __init xen_release_pmd_init(unsigned long pfn)
1634 {
1635         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1636 }
1637
1638 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1639 {
1640         struct multicall_space mcs;
1641         struct mmuext_op *op;
1642
1643         mcs = __xen_mc_entry(sizeof(*op));
1644         op = mcs.args;
1645         op->cmd = cmd;
1646         op->arg1.mfn = pfn_to_mfn(pfn);
1647
1648         MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1649 }
1650
1651 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1652 {
1653         struct multicall_space mcs;
1654         unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1655
1656         mcs = __xen_mc_entry(0);
1657         MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1658                                 pfn_pte(pfn, prot), 0);
1659 }
1660
1661 /* This needs to make sure the new pte page is pinned iff its being
1662    attached to a pinned pagetable. */
1663 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1664                                     unsigned level)
1665 {
1666         bool pinned = PagePinned(virt_to_page(mm->pgd));
1667
1668         trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1669
1670         if (pinned) {
1671                 struct page *page = pfn_to_page(pfn);
1672
1673                 SetPagePinned(page);
1674
1675                 if (!PageHighMem(page)) {
1676                         xen_mc_batch();
1677
1678                         __set_pfn_prot(pfn, PAGE_KERNEL_RO);
1679
1680                         if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1681                                 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1682
1683                         xen_mc_issue(PARAVIRT_LAZY_MMU);
1684                 } else {
1685                         /* make sure there are no stray mappings of
1686                            this page */
1687                         kmap_flush_unused();
1688                 }
1689         }
1690 }
1691
1692 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1693 {
1694         xen_alloc_ptpage(mm, pfn, PT_PTE);
1695 }
1696
1697 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1698 {
1699         xen_alloc_ptpage(mm, pfn, PT_PMD);
1700 }
1701
1702 /* This should never happen until we're OK to use struct page */
1703 static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1704 {
1705         struct page *page = pfn_to_page(pfn);
1706         bool pinned = PagePinned(page);
1707
1708         trace_xen_mmu_release_ptpage(pfn, level, pinned);
1709
1710         if (pinned) {
1711                 if (!PageHighMem(page)) {
1712                         xen_mc_batch();
1713
1714                         if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1715                                 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1716
1717                         __set_pfn_prot(pfn, PAGE_KERNEL);
1718
1719                         xen_mc_issue(PARAVIRT_LAZY_MMU);
1720                 }
1721                 ClearPagePinned(page);
1722         }
1723 }
1724
1725 static void xen_release_pte(unsigned long pfn)
1726 {
1727         xen_release_ptpage(pfn, PT_PTE);
1728 }
1729
1730 static void xen_release_pmd(unsigned long pfn)
1731 {
1732         xen_release_ptpage(pfn, PT_PMD);
1733 }
1734
1735 #if CONFIG_PGTABLE_LEVELS >= 4
1736 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1737 {
1738         xen_alloc_ptpage(mm, pfn, PT_PUD);
1739 }
1740
1741 static void xen_release_pud(unsigned long pfn)
1742 {
1743         xen_release_ptpage(pfn, PT_PUD);
1744 }
1745 #endif
1746
1747 void __init xen_reserve_top(void)
1748 {
1749 #ifdef CONFIG_X86_32
1750         unsigned long top = HYPERVISOR_VIRT_START;
1751         struct xen_platform_parameters pp;
1752
1753         if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1754                 top = pp.virt_start;
1755
1756         reserve_top_address(-top);
1757 #endif  /* CONFIG_X86_32 */
1758 }
1759
1760 /*
1761  * Like __va(), but returns address in the kernel mapping (which is
1762  * all we have until the physical memory mapping has been set up.
1763  */
1764 static void * __init __ka(phys_addr_t paddr)
1765 {
1766 #ifdef CONFIG_X86_64
1767         return (void *)(paddr + __START_KERNEL_map);
1768 #else
1769         return __va(paddr);
1770 #endif
1771 }
1772
1773 /* Convert a machine address to physical address */
1774 static unsigned long __init m2p(phys_addr_t maddr)
1775 {
1776         phys_addr_t paddr;
1777
1778         maddr &= PTE_PFN_MASK;
1779         paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1780
1781         return paddr;
1782 }
1783
1784 /* Convert a machine address to kernel virtual */
1785 static void * __init m2v(phys_addr_t maddr)
1786 {
1787         return __ka(m2p(maddr));
1788 }
1789
1790 /* Set the page permissions on an identity-mapped pages */
1791 static void __init set_page_prot_flags(void *addr, pgprot_t prot,
1792                                        unsigned long flags)
1793 {
1794         unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1795         pte_t pte = pfn_pte(pfn, prot);
1796
1797         if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1798                 BUG();
1799 }
1800 static void __init set_page_prot(void *addr, pgprot_t prot)
1801 {
1802         return set_page_prot_flags(addr, prot, UVMF_NONE);
1803 }
1804 #ifdef CONFIG_X86_32
1805 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1806 {
1807         unsigned pmdidx, pteidx;
1808         unsigned ident_pte;
1809         unsigned long pfn;
1810
1811         level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1812                                       PAGE_SIZE);
1813
1814         ident_pte = 0;
1815         pfn = 0;
1816         for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1817                 pte_t *pte_page;
1818
1819                 /* Reuse or allocate a page of ptes */
1820                 if (pmd_present(pmd[pmdidx]))
1821                         pte_page = m2v(pmd[pmdidx].pmd);
1822                 else {
1823                         /* Check for free pte pages */
1824                         if (ident_pte == LEVEL1_IDENT_ENTRIES)
1825                                 break;
1826
1827                         pte_page = &level1_ident_pgt[ident_pte];
1828                         ident_pte += PTRS_PER_PTE;
1829
1830                         pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1831                 }
1832
1833                 /* Install mappings */
1834                 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1835                         pte_t pte;
1836
1837                         if (pfn > max_pfn_mapped)
1838                                 max_pfn_mapped = pfn;
1839
1840                         if (!pte_none(pte_page[pteidx]))
1841                                 continue;
1842
1843                         pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1844                         pte_page[pteidx] = pte;
1845                 }
1846         }
1847
1848         for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1849                 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1850
1851         set_page_prot(pmd, PAGE_KERNEL_RO);
1852 }
1853 #endif
1854 void __init xen_setup_machphys_mapping(void)
1855 {
1856         struct xen_machphys_mapping mapping;
1857
1858         if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1859                 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1860                 machine_to_phys_nr = mapping.max_mfn + 1;
1861         } else {
1862                 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1863         }
1864 #ifdef CONFIG_X86_32
1865         WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1866                 < machine_to_phys_mapping);
1867 #endif
1868 }
1869
1870 #ifdef CONFIG_X86_64
1871 static void __init convert_pfn_mfn(void *v)
1872 {
1873         pte_t *pte = v;
1874         int i;
1875
1876         /* All levels are converted the same way, so just treat them
1877            as ptes. */
1878         for (i = 0; i < PTRS_PER_PTE; i++)
1879                 pte[i] = xen_make_pte(pte[i].pte);
1880 }
1881 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1882                                  unsigned long addr)
1883 {
1884         if (*pt_base == PFN_DOWN(__pa(addr))) {
1885                 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1886                 clear_page((void *)addr);
1887                 (*pt_base)++;
1888         }
1889         if (*pt_end == PFN_DOWN(__pa(addr))) {
1890                 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1891                 clear_page((void *)addr);
1892                 (*pt_end)--;
1893         }
1894 }
1895 /*
1896  * Set up the initial kernel pagetable.
1897  *
1898  * We can construct this by grafting the Xen provided pagetable into
1899  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1900  * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
1901  * kernel has a physical mapping to start with - but that's enough to
1902  * get __va working.  We need to fill in the rest of the physical
1903  * mapping once some sort of allocator has been set up.
1904  */
1905 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1906 {
1907         pud_t *l3;
1908         pmd_t *l2;
1909         unsigned long addr[3];
1910         unsigned long pt_base, pt_end;
1911         unsigned i;
1912
1913         /* max_pfn_mapped is the last pfn mapped in the initial memory
1914          * mappings. Considering that on Xen after the kernel mappings we
1915          * have the mappings of some pages that don't exist in pfn space, we
1916          * set max_pfn_mapped to the last real pfn mapped. */
1917         if (xen_start_info->mfn_list < __START_KERNEL_map)
1918                 max_pfn_mapped = xen_start_info->first_p2m_pfn;
1919         else
1920                 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1921
1922         pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1923         pt_end = pt_base + xen_start_info->nr_pt_frames;
1924
1925         /* Zap identity mapping */
1926         init_level4_pgt[0] = __pgd(0);
1927
1928         if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1929                 /* Pre-constructed entries are in pfn, so convert to mfn */
1930                 /* L4[272] -> level3_ident_pgt
1931                  * L4[511] -> level3_kernel_pgt */
1932                 convert_pfn_mfn(init_level4_pgt);
1933
1934                 /* L3_i[0] -> level2_ident_pgt */
1935                 convert_pfn_mfn(level3_ident_pgt);
1936                 /* L3_k[510] -> level2_kernel_pgt
1937                  * L3_k[511] -> level2_fixmap_pgt */
1938                 convert_pfn_mfn(level3_kernel_pgt);
1939
1940                 /* L3_k[511][506] -> level1_fixmap_pgt */
1941                 convert_pfn_mfn(level2_fixmap_pgt);
1942         }
1943         /* We get [511][511] and have Xen's version of level2_kernel_pgt */
1944         l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1945         l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1946
1947         addr[0] = (unsigned long)pgd;
1948         addr[1] = (unsigned long)l3;
1949         addr[2] = (unsigned long)l2;
1950         /* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
1951          * Both L4[272][0] and L4[511][510] have entries that point to the same
1952          * L2 (PMD) tables. Meaning that if you modify it in __va space
1953          * it will be also modified in the __ka space! (But if you just
1954          * modify the PMD table to point to other PTE's or none, then you
1955          * are OK - which is what cleanup_highmap does) */
1956         copy_page(level2_ident_pgt, l2);
1957         /* Graft it onto L4[511][510] */
1958         copy_page(level2_kernel_pgt, l2);
1959
1960         /* Copy the initial P->M table mappings if necessary. */
1961         i = pgd_index(xen_start_info->mfn_list);
1962         if (i && i < pgd_index(__START_KERNEL_map))
1963                 init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1964
1965         if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1966                 /* Make pagetable pieces RO */
1967                 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1968                 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1969                 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1970                 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1971                 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1972                 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1973                 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1974                 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
1975
1976                 /* Pin down new L4 */
1977                 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1978                                   PFN_DOWN(__pa_symbol(init_level4_pgt)));
1979
1980                 /* Unpin Xen-provided one */
1981                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1982
1983                 /*
1984                  * At this stage there can be no user pgd, and no page
1985                  * structure to attach it to, so make sure we just set kernel
1986                  * pgd.
1987                  */
1988                 xen_mc_batch();
1989                 __xen_write_cr3(true, __pa(init_level4_pgt));
1990                 xen_mc_issue(PARAVIRT_LAZY_CPU);
1991         } else
1992                 native_write_cr3(__pa(init_level4_pgt));
1993
1994         /* We can't that easily rip out L3 and L2, as the Xen pagetables are
1995          * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ...  for
1996          * the initial domain. For guests using the toolstack, they are in:
1997          * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
1998          * rip out the [L4] (pgd), but for guests we shave off three pages.
1999          */
2000         for (i = 0; i < ARRAY_SIZE(addr); i++)
2001                 check_pt_base(&pt_base, &pt_end, addr[i]);
2002
2003         /* Our (by three pages) smaller Xen pagetable that we are using */
2004         xen_pt_base = PFN_PHYS(pt_base);
2005         xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
2006         memblock_reserve(xen_pt_base, xen_pt_size);
2007
2008         /* Revector the xen_start_info */
2009         xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
2010 }
2011
2012 /*
2013  * Read a value from a physical address.
2014  */
2015 static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
2016 {
2017         unsigned long *vaddr;
2018         unsigned long val;
2019
2020         vaddr = early_memremap_ro(addr, sizeof(val));
2021         val = *vaddr;
2022         early_memunmap(vaddr, sizeof(val));
2023         return val;
2024 }
2025
2026 /*
2027  * Translate a virtual address to a physical one without relying on mapped
2028  * page tables.
2029  */
2030 static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2031 {
2032         phys_addr_t pa;
2033         pgd_t pgd;
2034         pud_t pud;
2035         pmd_t pmd;
2036         pte_t pte;
2037
2038         pa = read_cr3();
2039         pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2040                                                        sizeof(pgd)));
2041         if (!pgd_present(pgd))
2042                 return 0;
2043
2044         pa = pgd_val(pgd) & PTE_PFN_MASK;
2045         pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
2046                                                        sizeof(pud)));
2047         if (!pud_present(pud))
2048                 return 0;
2049         pa = pud_pfn(pud) << PAGE_SHIFT;
2050         if (pud_large(pud))
2051                 return pa + (vaddr & ~PUD_MASK);
2052
2053         pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
2054                                                        sizeof(pmd)));
2055         if (!pmd_present(pmd))
2056                 return 0;
2057         pa = pmd_pfn(pmd) << PAGE_SHIFT;
2058         if (pmd_large(pmd))
2059                 return pa + (vaddr & ~PMD_MASK);
2060
2061         pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
2062                                                        sizeof(pte)));
2063         if (!pte_present(pte))
2064                 return 0;
2065         pa = pte_pfn(pte) << PAGE_SHIFT;
2066
2067         return pa | (vaddr & ~PAGE_MASK);
2068 }
2069
2070 /*
2071  * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
2072  * this area.
2073  */
2074 void __init xen_relocate_p2m(void)
2075 {
2076         phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
2077         unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
2078         int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
2079         pte_t *pt;
2080         pmd_t *pmd;
2081         pud_t *pud;
2082         p4d_t *p4d = NULL;
2083         pgd_t *pgd;
2084         unsigned long *new_p2m;
2085         int save_pud;
2086
2087         size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
2088         n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
2089         n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
2090         n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
2091         n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
2092         if (PTRS_PER_P4D > 1)
2093                 n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
2094         else
2095                 n_p4d = 0;
2096         n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
2097
2098         new_area = xen_find_free_area(PFN_PHYS(n_frames));
2099         if (!new_area) {
2100                 xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
2101                 BUG();
2102         }
2103
2104         /*
2105          * Setup the page tables for addressing the new p2m list.
2106          * We have asked the hypervisor to map the p2m list at the user address
2107          * PUD_SIZE. It may have done so, or it may have used a kernel space
2108          * address depending on the Xen version.
2109          * To avoid any possible virtual address collision, just use
2110          * 2 * PUD_SIZE for the new area.
2111          */
2112         p4d_phys = new_area;
2113         pud_phys = p4d_phys + PFN_PHYS(n_p4d);
2114         pmd_phys = pud_phys + PFN_PHYS(n_pud);
2115         pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2116         p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2117
2118         pgd = __va(read_cr3());
2119         new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
2120         idx_p4d = 0;
2121         save_pud = n_pud;
2122         do {
2123                 if (n_p4d > 0) {
2124                         p4d = early_memremap(p4d_phys, PAGE_SIZE);
2125                         clear_page(p4d);
2126                         n_pud = min(save_pud, PTRS_PER_P4D);
2127                 }
2128                 for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
2129                         pud = early_memremap(pud_phys, PAGE_SIZE);
2130                         clear_page(pud);
2131                         for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
2132                                  idx_pmd++) {
2133                                 pmd = early_memremap(pmd_phys, PAGE_SIZE);
2134                                 clear_page(pmd);
2135                                 for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
2136                                          idx_pt++) {
2137                                         pt = early_memremap(pt_phys, PAGE_SIZE);
2138                                         clear_page(pt);
2139                                         for (idx_pte = 0;
2140                                                  idx_pte < min(n_pte, PTRS_PER_PTE);
2141                                                  idx_pte++) {
2142                                                 set_pte(pt + idx_pte,
2143                                                                 pfn_pte(p2m_pfn, PAGE_KERNEL));
2144                                                 p2m_pfn++;
2145                                         }
2146                                         n_pte -= PTRS_PER_PTE;
2147                                         early_memunmap(pt, PAGE_SIZE);
2148                                         make_lowmem_page_readonly(__va(pt_phys));
2149                                         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
2150                                                         PFN_DOWN(pt_phys));
2151                                         set_pmd(pmd + idx_pt,
2152                                                         __pmd(_PAGE_TABLE | pt_phys));
2153                                         pt_phys += PAGE_SIZE;
2154                                 }
2155                                 n_pt -= PTRS_PER_PMD;
2156                                 early_memunmap(pmd, PAGE_SIZE);
2157                                 make_lowmem_page_readonly(__va(pmd_phys));
2158                                 pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
2159                                                 PFN_DOWN(pmd_phys));
2160                                 set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
2161                                 pmd_phys += PAGE_SIZE;
2162                         }
2163                         n_pmd -= PTRS_PER_PUD;
2164                         early_memunmap(pud, PAGE_SIZE);
2165                         make_lowmem_page_readonly(__va(pud_phys));
2166                         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
2167                         if (n_p4d > 0)
2168                                 set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
2169                         else
2170                                 set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
2171                         pud_phys += PAGE_SIZE;
2172                 }
2173                 if (n_p4d > 0) {
2174                         save_pud -= PTRS_PER_P4D;
2175                         early_memunmap(p4d, PAGE_SIZE);
2176                         make_lowmem_page_readonly(__va(p4d_phys));
2177                         pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
2178                         set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
2179                         p4d_phys += PAGE_SIZE;
2180                 }
2181         } while (++idx_p4d < n_p4d);
2182
2183         /* Now copy the old p2m info to the new area. */
2184         memcpy(new_p2m, xen_p2m_addr, size);
2185         xen_p2m_addr = new_p2m;
2186
2187         /* Release the old p2m list and set new list info. */
2188         p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
2189         BUG_ON(!p2m_pfn);
2190         p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
2191
2192         if (xen_start_info->mfn_list < __START_KERNEL_map) {
2193                 pfn = xen_start_info->first_p2m_pfn;
2194                 pfn_end = xen_start_info->first_p2m_pfn +
2195                           xen_start_info->nr_p2m_frames;
2196                 set_pgd(pgd + 1, __pgd(0));
2197         } else {
2198                 pfn = p2m_pfn;
2199                 pfn_end = p2m_pfn_end;
2200         }
2201
2202         memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
2203         while (pfn < pfn_end) {
2204                 if (pfn == p2m_pfn) {
2205                         pfn = p2m_pfn_end;
2206                         continue;
2207                 }
2208                 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
2209                 pfn++;
2210         }
2211
2212         xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
2213         xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
2214         xen_start_info->nr_p2m_frames = n_frames;
2215 }
2216
2217 #else   /* !CONFIG_X86_64 */
2218 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
2219 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2220
2221 static void __init xen_write_cr3_init(unsigned long cr3)
2222 {
2223         unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2224
2225         BUG_ON(read_cr3() != __pa(initial_page_table));
2226         BUG_ON(cr3 != __pa(swapper_pg_dir));
2227
2228         /*
2229          * We are switching to swapper_pg_dir for the first time (from
2230          * initial_page_table) and therefore need to mark that page
2231          * read-only and then pin it.
2232          *
2233          * Xen disallows sharing of kernel PMDs for PAE
2234          * guests. Therefore we must copy the kernel PMD from
2235          * initial_page_table into a new kernel PMD to be used in
2236          * swapper_pg_dir.
2237          */
2238         swapper_kernel_pmd =
2239                 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2240         copy_page(swapper_kernel_pmd, initial_kernel_pmd);
2241         swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
2242                 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
2243         set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
2244
2245         set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2246         xen_write_cr3(cr3);
2247         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
2248
2249         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
2250                           PFN_DOWN(__pa(initial_page_table)));
2251         set_page_prot(initial_page_table, PAGE_KERNEL);
2252         set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
2253
2254         pv_mmu_ops.write_cr3 = &xen_write_cr3;
2255 }
2256
2257 /*
2258  * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
2259  * not the first page table in the page table pool.
2260  * Iterate through the initial page tables to find the real page table base.
2261  */
2262 static phys_addr_t xen_find_pt_base(pmd_t *pmd)
2263 {
2264         phys_addr_t pt_base, paddr;
2265         unsigned pmdidx;
2266
2267         pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
2268
2269         for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
2270                 if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
2271                         paddr = m2p(pmd[pmdidx].pmd);
2272                         pt_base = min(pt_base, paddr);
2273                 }
2274
2275         return pt_base;
2276 }
2277
2278 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
2279 {
2280         pmd_t *kernel_pmd;
2281
2282         kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2283
2284         xen_pt_base = xen_find_pt_base(kernel_pmd);
2285         xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
2286
2287         initial_kernel_pmd =
2288                 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2289
2290         max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
2291
2292         copy_page(initial_kernel_pmd, kernel_pmd);
2293
2294         xen_map_identity_early(initial_kernel_pmd, max_pfn);
2295
2296         copy_page(initial_page_table, pgd);
2297         initial_page_table[KERNEL_PGD_BOUNDARY] =
2298                 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2299
2300         set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2301         set_page_prot(initial_page_table, PAGE_KERNEL_RO);
2302         set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2303
2304         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2305
2306         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2307                           PFN_DOWN(__pa(initial_page_table)));
2308         xen_write_cr3(__pa(initial_page_table));
2309
2310         memblock_reserve(xen_pt_base, xen_pt_size);
2311 }
2312 #endif  /* CONFIG_X86_64 */
2313
2314 void __init xen_reserve_special_pages(void)
2315 {
2316         phys_addr_t paddr;
2317
2318         memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
2319         if (xen_start_info->store_mfn) {
2320                 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
2321                 memblock_reserve(paddr, PAGE_SIZE);
2322         }
2323         if (!xen_initial_domain()) {
2324                 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
2325                 memblock_reserve(paddr, PAGE_SIZE);
2326         }
2327 }
2328
2329 void __init xen_pt_check_e820(void)
2330 {
2331         if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
2332                 xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
2333                 BUG();
2334         }
2335 }
2336
2337 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2338
2339 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2340 {
2341         pte_t pte;
2342
2343         phys >>= PAGE_SHIFT;
2344
2345         switch (idx) {
2346         case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
2347         case FIX_RO_IDT:
2348 #ifdef CONFIG_X86_32
2349         case FIX_WP_TEST:
2350 # ifdef CONFIG_HIGHMEM
2351         case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2352 # endif
2353 #elif defined(CONFIG_X86_VSYSCALL_EMULATION)
2354         case VSYSCALL_PAGE:
2355 #endif
2356         case FIX_TEXT_POKE0:
2357         case FIX_TEXT_POKE1:
2358         case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
2359                 /* All local page mappings */
2360                 pte = pfn_pte(phys, prot);
2361                 break;
2362
2363 #ifdef CONFIG_X86_LOCAL_APIC
2364         case FIX_APIC_BASE:     /* maps dummy local APIC */
2365                 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2366                 break;
2367 #endif
2368
2369 #ifdef CONFIG_X86_IO_APIC
2370         case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2371                 /*
2372                  * We just don't map the IO APIC - all access is via
2373                  * hypercalls.  Keep the address in the pte for reference.
2374                  */
2375                 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2376                 break;
2377 #endif
2378
2379         case FIX_PARAVIRT_BOOTMAP:
2380                 /* This is an MFN, but it isn't an IO mapping from the
2381                    IO domain */
2382                 pte = mfn_pte(phys, prot);
2383                 break;
2384
2385         default:
2386                 /* By default, set_fixmap is used for hardware mappings */
2387                 pte = mfn_pte(phys, prot);
2388                 break;
2389         }
2390
2391         __native_set_fixmap(idx, pte);
2392
2393 #ifdef CONFIG_X86_VSYSCALL_EMULATION
2394         /* Replicate changes to map the vsyscall page into the user
2395            pagetable vsyscall mapping. */
2396         if (idx == VSYSCALL_PAGE) {
2397                 unsigned long vaddr = __fix_to_virt(idx);
2398                 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2399         }
2400 #endif
2401 }
2402
2403 static void __init xen_post_allocator_init(void)
2404 {
2405         if (xen_feature(XENFEAT_auto_translated_physmap))
2406                 return;
2407
2408         pv_mmu_ops.set_pte = xen_set_pte;
2409         pv_mmu_ops.set_pmd = xen_set_pmd;
2410         pv_mmu_ops.set_pud = xen_set_pud;
2411 #if CONFIG_PGTABLE_LEVELS >= 4
2412         pv_mmu_ops.set_p4d = xen_set_p4d;
2413 #endif
2414
2415         /* This will work as long as patching hasn't happened yet
2416            (which it hasn't) */
2417         pv_mmu_ops.alloc_pte = xen_alloc_pte;
2418         pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2419         pv_mmu_ops.release_pte = xen_release_pte;
2420         pv_mmu_ops.release_pmd = xen_release_pmd;
2421 #if CONFIG_PGTABLE_LEVELS >= 4
2422         pv_mmu_ops.alloc_pud = xen_alloc_pud;
2423         pv_mmu_ops.release_pud = xen_release_pud;
2424 #endif
2425         pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
2426
2427 #ifdef CONFIG_X86_64
2428         pv_mmu_ops.write_cr3 = &xen_write_cr3;
2429         SetPagePinned(virt_to_page(level3_user_vsyscall));
2430 #endif
2431         xen_mark_init_mm_pinned();
2432 }
2433
/*
 * lazy_mode.leave hook: flush the pending multicalls and leave lazy MMU
 * mode, both inside one preemption-disabled section.
 */
static void xen_leave_lazy_mmu(void)
{
	preempt_disable();

	xen_mc_flush();
	paravirt_leave_lazy_mmu();

	preempt_enable();
}
2441
2442 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2443         .read_cr2 = xen_read_cr2,
2444         .write_cr2 = xen_write_cr2,
2445
2446         .read_cr3 = xen_read_cr3,
2447         .write_cr3 = xen_write_cr3_init,
2448
2449         .flush_tlb_user = xen_flush_tlb,
2450         .flush_tlb_kernel = xen_flush_tlb,
2451         .flush_tlb_single = xen_flush_tlb_single,
2452         .flush_tlb_others = xen_flush_tlb_others,
2453
2454         .pte_update = paravirt_nop,
2455
2456         .pgd_alloc = xen_pgd_alloc,
2457         .pgd_free = xen_pgd_free,
2458
2459         .alloc_pte = xen_alloc_pte_init,
2460         .release_pte = xen_release_pte_init,
2461         .alloc_pmd = xen_alloc_pmd_init,
2462         .release_pmd = xen_release_pmd_init,
2463
2464         .set_pte = xen_set_pte_init,
2465         .set_pte_at = xen_set_pte_at,
2466         .set_pmd = xen_set_pmd_hyper,
2467
2468         .ptep_modify_prot_start = __ptep_modify_prot_start,
2469         .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2470
2471         .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2472         .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2473
2474         .make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
2475         .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2476
2477 #ifdef CONFIG_X86_PAE
2478         .set_pte_atomic = xen_set_pte_atomic,
2479         .pte_clear = xen_pte_clear,
2480         .pmd_clear = xen_pmd_clear,
2481 #endif  /* CONFIG_X86_PAE */
2482         .set_pud = xen_set_pud_hyper,
2483
2484         .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2485         .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2486
2487 #if CONFIG_PGTABLE_LEVELS >= 4
2488         .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2489         .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2490         .set_p4d = xen_set_p4d_hyper,
2491
2492         .alloc_pud = xen_alloc_pmd_init,
2493         .release_pud = xen_release_pmd_init,
2494 #endif  /* CONFIG_PGTABLE_LEVELS == 4 */
2495
2496         .activate_mm = xen_activate_mm,
2497         .dup_mmap = xen_dup_mmap,
2498         .exit_mmap = xen_exit_mmap,
2499
2500         .lazy_mode = {
2501                 .enter = paravirt_enter_lazy_mmu,
2502                 .leave = xen_leave_lazy_mmu,
2503                 .flush = paravirt_flush_lazy_mmu,
2504         },
2505
2506         .set_fixmap = xen_set_fixmap,
2507 };
2508
2509 void __init xen_init_mmu_ops(void)
2510 {
2511         x86_init.paging.pagetable_init = xen_pagetable_init;
2512
2513         if (xen_feature(XENFEAT_auto_translated_physmap))
2514                 return;
2515
2516         pv_mmu_ops = xen_mmu_ops;
2517
2518         memset(dummy_mapping, 0xff, PAGE_SIZE);
2519 }
2520
2521 /* Protected by xen_reservation_lock. */
2522 #define MAX_CONTIG_ORDER 9 /* 2MB */
2523 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2524
2525 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2526 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2527                                 unsigned long *in_frames,
2528                                 unsigned long *out_frames)
2529 {
2530         int i;
2531         struct multicall_space mcs;
2532
2533         xen_mc_batch();
2534         for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2535                 mcs = __xen_mc_entry(0);
2536
2537                 if (in_frames)
2538                         in_frames[i] = virt_to_mfn(vaddr);
2539
2540                 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2541                 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2542
2543                 if (out_frames)
2544                         out_frames[i] = virt_to_pfn(vaddr);
2545         }
2546         xen_mc_issue(0);
2547 }
2548
2549 /*
2550  * Update the pfn-to-mfn mappings for a virtual address range, either to
2551  * point to an array of mfns, or contiguously from a single starting
2552  * mfn.
2553  */
2554 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2555                                      unsigned long *mfns,
2556                                      unsigned long first_mfn)
2557 {
2558         unsigned i, limit;
2559         unsigned long mfn;
2560
2561         xen_mc_batch();
2562
2563         limit = 1u << order;
2564         for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2565                 struct multicall_space mcs;
2566                 unsigned flags;
2567
2568                 mcs = __xen_mc_entry(0);
2569                 if (mfns)
2570                         mfn = mfns[i];
2571                 else
2572                         mfn = first_mfn + i;
2573
2574                 if (i < (limit - 1))
2575                         flags = 0;
2576                 else {
2577                         if (order == 0)
2578                                 flags = UVMF_INVLPG | UVMF_ALL;
2579                         else
2580                                 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2581                 }
2582
2583                 MULTI_update_va_mapping(mcs.mc, vaddr,
2584                                 mfn_pte(mfn, PAGE_KERNEL), flags);
2585
2586                 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2587         }
2588
2589         xen_mc_issue(0);
2590 }
2591
2592 /*
2593  * Perform the hypercall to exchange a region of our pfns to point to
2594  * memory with the required contiguous alignment.  Takes the pfns as
2595  * input, and populates mfns as output.
2596  *
2597  * Returns a success code indicating whether the hypervisor was able to
2598  * satisfy the request or not.
2599  */
2600 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2601                                unsigned long *pfns_in,
2602                                unsigned long extents_out,
2603                                unsigned int order_out,
2604                                unsigned long *mfns_out,
2605                                unsigned int address_bits)
2606 {
2607         long rc;
2608         int success;
2609
2610         struct xen_memory_exchange exchange = {
2611                 .in = {
2612                         .nr_extents   = extents_in,
2613                         .extent_order = order_in,
2614                         .extent_start = pfns_in,
2615                         .domid        = DOMID_SELF
2616                 },
2617                 .out = {
2618                         .nr_extents   = extents_out,
2619                         .extent_order = order_out,
2620                         .extent_start = mfns_out,
2621                         .address_bits = address_bits,
2622                         .domid        = DOMID_SELF
2623                 }
2624         };
2625
2626         BUG_ON(extents_in << order_in != extents_out << order_out);
2627
2628         rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2629         success = (exchange.nr_exchanged == extents_in);
2630
2631         BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2632         BUG_ON(success && (rc != 0));
2633
2634         return success;
2635 }
2636
2637 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2638                                  unsigned int address_bits,
2639                                  dma_addr_t *dma_handle)
2640 {
2641         unsigned long *in_frames = discontig_frames, out_frame;
2642         unsigned long  flags;
2643         int            success;
2644         unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2645
2646         /*
2647          * Currently an auto-translated guest will not perform I/O, nor will
2648          * it require PAE page directories below 4GB. Therefore any calls to
2649          * this function are redundant and can be ignored.
2650          */
2651
2652         if (xen_feature(XENFEAT_auto_translated_physmap))
2653                 return 0;
2654
2655         if (unlikely(order > MAX_CONTIG_ORDER))
2656                 return -ENOMEM;
2657
2658         memset((void *) vstart, 0, PAGE_SIZE << order);
2659
2660         spin_lock_irqsave(&xen_reservation_lock, flags);
2661
2662         /* 1. Zap current PTEs, remembering MFNs. */
2663         xen_zap_pfn_range(vstart, order, in_frames, NULL);
2664
2665         /* 2. Get a new contiguous memory extent. */
2666         out_frame = virt_to_pfn(vstart);
2667         success = xen_exchange_memory(1UL << order, 0, in_frames,
2668                                       1, order, &out_frame,
2669                                       address_bits);
2670
2671         /* 3. Map the new extent in place of old pages. */
2672         if (success)
2673                 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2674         else
2675                 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2676
2677         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2678
2679         *dma_handle = virt_to_machine(vstart).maddr;
2680         return success ? 0 : -ENOMEM;
2681 }
2682 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2683
2684 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
2685 {
2686         unsigned long *out_frames = discontig_frames, in_frame;
2687         unsigned long  flags;
2688         int success;
2689         unsigned long vstart;
2690
2691         if (xen_feature(XENFEAT_auto_translated_physmap))
2692                 return;
2693
2694         if (unlikely(order > MAX_CONTIG_ORDER))
2695                 return;
2696
2697         vstart = (unsigned long)phys_to_virt(pstart);
2698         memset((void *) vstart, 0, PAGE_SIZE << order);
2699
2700         spin_lock_irqsave(&xen_reservation_lock, flags);
2701
2702         /* 1. Find start MFN of contiguous extent. */
2703         in_frame = virt_to_mfn(vstart);
2704
2705         /* 2. Zap current PTEs. */
2706         xen_zap_pfn_range(vstart, order, NULL, out_frames);
2707
2708         /* 3. Do the exchange for non-contiguous MFNs. */
2709         success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2710                                         0, out_frames, 0);
2711
2712         /* 4. Map new pages in place of old pages. */
2713         if (success)
2714                 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2715         else
2716                 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2717
2718         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2719 }
2720 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2721
#ifdef CONFIG_KEXEC_CORE
/*
 * Report the address of vmcoreinfo_note: its machine address when
 * running as a Xen PV domain, otherwise the address given by
 * __pa_symbol().
 */
phys_addr_t paddr_vmcoreinfo_note(void)
{
        if (!xen_pv_domain())
                return __pa_symbol(&vmcoreinfo_note);

        return virt_to_machine(&vmcoreinfo_note).maddr;
}
#endif /* CONFIG_KEXEC_CORE */