free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
- if (!tlb_is_full_mm(*tlb))
+ if (!(*tlb)->fullmm)
flush_tlb_pgtables((*tlb)->mm, start, end);
}
}
}
-pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- struct page *new;
+ struct page *new = pte_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free(new);
- goto out;
- }
+ spin_lock(&mm->page_table_lock);
+ if (pmd_present(*pmd)) /* Another has populated it */
+ pte_free(new);
+ else {
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
}
-out:
- return pte_offset_map(pmd, address);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
}
-pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- pte_t *new;
+ pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ if (!new)
+ return -ENOMEM;
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one_kernel(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
+ spin_lock(&init_mm.page_table_lock);
+ if (pmd_present(*pmd)) /* Another has populated it */
+ pte_free_kernel(new);
+ else
+ pmd_populate_kernel(&init_mm, pmd, new);
+ spin_unlock(&init_mm.page_table_lock);
+ return 0;
+}
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free_kernel(new);
- goto out;
- }
- pmd_populate_kernel(mm, pmd, new);
- }
-out:
- return pte_offset_kernel(pmd, address);
+static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+{
+ if (file_rss)
+ add_mm_counter(mm, file_rss, file_rss);
+ if (anon_rss)
+ add_mm_counter(mm, anon_rss, anon_rss);
+}
+
+/*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error.
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+ printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+ "vm_flags = %lx, vaddr = %lx\n",
+ (long long)pte_val(pte),
+ (vma->vm_mm == current->mm ? current->comm : "???"),
+ vma->vm_flags, vaddr);
+ dump_stack();
}
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
*/
static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
- unsigned long addr)
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+ unsigned long addr, int *rss)
{
+ unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
unsigned long pfn;
spin_unlock(&mmlist_lock);
}
}
- set_pte_at(dst_mm, addr, dst_pte, pte);
- return;
+ goto out_set_pte;
}
- pfn = pte_pfn(pte);
- /* the pte points outside of valid memory, the
- * mapping is assumed to be good, meaningful
- * and not mapped via rmap - duplicate the
- * mapping as is.
+ /* If the region is VM_RESERVED, the mapping is not
+ * mapped via rmap - duplicate the pte as is.
*/
- page = NULL;
- if (pfn_valid(pfn))
- page = pfn_to_page(pfn);
+ if (vm_flags & VM_RESERVED)
+ goto out_set_pte;
- if (!page || PageReserved(page)) {
- set_pte_at(dst_mm, addr, dst_pte, pte);
- return;
+ pfn = pte_pfn(pte);
+ /* If the pte points outside of valid memory but
+ * the region is not VM_RESERVED, we have a problem.
+ */
+ if (unlikely(!pfn_valid(pfn))) {
+ print_bad_pte(vma, pte, addr);
+ goto out_set_pte; /* try to do something sane */
}
+ page = pfn_to_page(pfn);
+
/*
* If it's a COW mapping, write protect it both
* in the parent and the child
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
get_page(page);
- inc_mm_counter(dst_mm, rss);
- if (PageAnon(page))
- inc_mm_counter(dst_mm, anon_rss);
- set_pte_at(dst_mm, addr, dst_pte, pte);
page_dup_rmap(page);
+ rss[!!PageAnon(page)]++;
+
+out_set_pte:
+ set_pte_at(dst_mm, addr, dst_pte, pte);
}
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long addr, unsigned long end)
{
pte_t *src_pte, *dst_pte;
- unsigned long vm_flags = vma->vm_flags;
- int progress;
+ spinlock_t *src_ptl, *dst_ptl;
+ int progress = 0;
+ int rss[2];
again:
- dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+ rss[1] = rss[0] = 0;
+ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_ptl = &src_mm->page_table_lock;
+ spin_lock(src_ptl);
- progress = 0;
- spin_lock(&src_mm->page_table_lock);
do {
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
*/
- if (progress >= 32 && (need_resched() ||
- need_lockbreak(&src_mm->page_table_lock) ||
- need_lockbreak(&dst_mm->page_table_lock)))
- break;
+ if (progress >= 32) {
+ progress = 0;
+ if (need_resched() ||
+ need_lockbreak(src_ptl) ||
+ need_lockbreak(dst_ptl))
+ break;
+ }
if (pte_none(*src_pte)) {
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
+ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
- spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
- pte_unmap(dst_pte - 1);
- cond_resched_lock(&dst_mm->page_table_lock);
+ add_mm_rss(dst_mm, rss[0], rss[1]);
+ pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ cond_resched();
if (addr != end)
goto again;
return 0;
return 0;
}
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
+ struct mm_struct *mm = tlb->mm;
pte_t *pte;
+ int file_rss = 0;
+ int anon_rss = 0;
pte = pte_offset_map(pmd, addr);
do {
continue;
if (pte_present(ptent)) {
struct page *page = NULL;
- unsigned long pfn = pte_pfn(ptent);
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageReserved(page))
- page = NULL;
+ if (!(vma->vm_flags & VM_RESERVED)) {
+ unsigned long pfn = pte_pfn(ptent);
+ if (unlikely(!pfn_valid(pfn)))
+ print_bad_pte(vma, ptent, addr);
+ else
+ page = pfn_to_page(pfn);
}
if (unlikely(details) && page) {
/*
page->index > details->last_index))
continue;
}
- ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
addr) != page->index)
- set_pte_at(tlb->mm, addr, pte,
+ set_pte_at(mm, addr, pte,
pgoff_to_pte(page->index));
- if (pte_dirty(ptent))
- set_page_dirty(page);
if (PageAnon(page))
- dec_mm_counter(tlb->mm, anon_rss);
- else if (pte_young(ptent))
- mark_page_accessed(page);
- tlb->freed++;
+ anon_rss--;
+ else {
+ if (pte_dirty(ptent))
+ set_page_dirty(page);
+ if (pte_young(ptent))
+ mark_page_accessed(page);
+ file_rss--;
+ }
page_remove_rmap(page);
tlb_remove_page(tlb, page);
continue;
continue;
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
- pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
+ pte_clear_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
+
+ add_mm_rss(mm, file_rss, anon_rss);
pte_unmap(pte - 1);
}
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- zap_pte_range(tlb, pmd, addr, next, details);
+ zap_pte_range(tlb, vma, pmd, addr, next, details);
} while (pmd++, addr = next, addr != end);
}
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- zap_pmd_range(tlb, pud, addr, next, details);
+ zap_pmd_range(tlb, vma, pud, addr, next, details);
} while (pud++, addr = next, addr != end);
}
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- zap_pud_range(tlb, pgd, addr, next, details);
+ zap_pud_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
}
int tlb_start_valid = 0;
unsigned long start = start_addr;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
- int fullmm = tlb_is_full_mm(*tlbp);
+ int fullmm = (*tlbp)->fullmm;
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long end;
lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
+ update_hiwater_rss(mm);
end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
tlb_finish_mmu(tlb, address, end);
spin_unlock(&mm->page_table_lock);
continue;
}
- if (!vma || (vma->vm_flags & VM_IO)
+ if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
|| !(flags & vma->vm_flags))
return i ? : -EFAULT;
if (pages) {
pages[i] = page;
flush_dcache_page(page);
- if (!PageReserved(page))
- page_cache_get(page);
+ page_cache_get(page);
}
if (vmas)
vmas[i] = vma;
unsigned long addr, unsigned long end, pgprot_t prot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
do {
- pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+ struct page *page = ZERO_PAGE(addr);
+ pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+ page_cache_get(page);
+ page_add_file_rmap(page);
+ inc_mm_counter(mm, file_rss);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
return 0;
}
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = zeromap_pud_range(mm, pgd, addr, next, prot);
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
return err;
}
unsigned long pfn, pgprot_t prot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
do {
BUG_ON(!pte_none(*pte));
- if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
- set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+ set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
return 0;
}
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
- * VM_RESERVED tells swapout not to try to touch
- * this region.
+ * VM_RESERVED tells the core MM not to "manage" these pages
+ * (e.g. refcount, mapcount, try to swap them out).
*/
vma->vm_flags |= VM_IO | VM_RESERVED;
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = remap_pud_range(mm, pgd, addr, next,
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
return pte;
}
-/*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
- pte_t *page_table)
-{
- pte_t entry;
-
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
- vma);
- ptep_establish(vma, address, page_table, entry);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
-}
-
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
- * Goto-purists beware: the only reason for goto's here is that it results
- * in better assembly code.. The "default" path will see no jumps at all.
- *
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
* We hold the mm semaphore and the page_table_lock on entry and exit
* with the page_table_lock released.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ pte_t orig_pte)
{
struct page *old_page, *new_page;
- unsigned long pfn = pte_pfn(pte);
+ unsigned long pfn = pte_pfn(orig_pte);
pte_t entry;
- int ret;
+ int ret = VM_FAULT_MINOR;
+
+ BUG_ON(vma->vm_flags & VM_RESERVED);
if (unlikely(!pfn_valid(pfn))) {
/*
- * This should really halt the system so it can be debugged or
- * at least the kernel stops what it's doing before it corrupts
- * data, but for the moment just pretend this is OOM.
+ * Page table corrupted: show pte and kill process.
*/
- pte_unmap(page_table);
- printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
- address);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_OOM;
+ print_bad_pte(vma, orig_pte, address);
+ ret = VM_FAULT_OOM;
+ goto unlock;
}
old_page = pfn_to_page(pfn);
unlock_page(old_page);
if (reuse) {
flush_cache_page(vma, address, pfn);
- entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
- vma);
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
ptep_set_access_flags(vma, address, page_table, entry, 1);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_MINOR|VM_FAULT_WRITE;
+ ret |= VM_FAULT_WRITE;
+ goto unlock;
}
}
- pte_unmap(page_table);
/*
* Ok, we need to copy. Oh, well..
*/
- if (!PageReserved(old_page))
- page_cache_get(old_page);
+ page_cache_get(old_page);
+ pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
if (unlikely(anon_vma_prepare(vma)))
- goto no_new_page;
+ goto oom;
if (old_page == ZERO_PAGE(address)) {
new_page = alloc_zeroed_user_highpage(vma, address);
if (!new_page)
- goto no_new_page;
+ goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!new_page)
- goto no_new_page;
+ goto oom;
copy_user_highpage(new_page, old_page, address);
}
+
/*
* Re-check the pte - we dropped the lock
*/
- ret = VM_FAULT_MINOR;
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
- if (likely(pte_same(*page_table, pte))) {
- if (PageAnon(old_page))
- dec_mm_counter(mm, anon_rss);
- if (PageReserved(old_page))
- inc_mm_counter(mm, rss);
- else
- page_remove_rmap(old_page);
+ if (likely(pte_same(*page_table, orig_pte))) {
+ page_remove_rmap(old_page);
+ if (!PageAnon(old_page)) {
+ inc_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, file_rss);
+ }
flush_cache_page(vma, address, pfn);
- break_cow(vma, new_page, address, page_table);
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ ptep_establish(vma, address, page_table, entry);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+
lru_cache_add_active(new_page);
page_add_anon_rmap(new_page, vma, address);
new_page = old_page;
ret |= VM_FAULT_WRITE;
}
- pte_unmap(page_table);
page_cache_release(new_page);
page_cache_release(old_page);
+unlock:
+ pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
return ret;
-
-no_new_page:
+oom:
page_cache_release(old_page);
return VM_FAULT_OOM;
}
* We hold the mm semaphore and the page_table_lock on entry and
* should release the pagetable lock on exit..
*/
-static int do_swap_page(struct mm_struct * mm,
- struct vm_area_struct * vma, unsigned long address,
- pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
{
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(orig_pte);
+ swp_entry_t entry;
pte_t pte;
int ret = VM_FAULT_MINOR;
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
+
+ entry = pte_to_swp_entry(orig_pte);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page_table = pte_offset_map(pmd, address);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_MINOR;
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- goto out;
+ goto unlock;
}
/* Had to read the page from swap area: Major fault */
*/
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
- if (unlikely(!pte_same(*page_table, orig_pte))) {
- ret = VM_FAULT_MINOR;
+ if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
- }
if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
/* The page isn't present yet, go ahead with the fault. */
- inc_mm_counter(mm, rss);
+ inc_mm_counter(mm, anon_rss);
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
lazy_mmu_prot_update(pte);
+unlock:
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
out:
spin_unlock(&mm->page_table_lock);
unlock_page(page);
page_cache_release(page);
- goto out;
+ return ret;
}
/*
* spinlock held to protect against concurrent faults in
* multithreaded programs.
*/
-static int
-do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr)
+static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
{
+ struct page *page = ZERO_PAGE(addr);
pte_t entry;
- struct page * page = ZERO_PAGE(addr);
- /* Read-only mapping of ZERO_PAGE. */
- entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+ /* Mapping of ZERO_PAGE - vm_page_prot is readonly */
+ entry = mk_pte(page, vma->vm_page_prot);
- /* ..except if it's a write access */
if (write_access) {
/* Allocate our own private page. */
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
if (unlikely(anon_vma_prepare(vma)))
- goto no_mem;
- page = alloc_zeroed_user_highpage(vma, addr);
+ goto oom;
+ page = alloc_zeroed_user_highpage(vma, address);
if (!page)
- goto no_mem;
+ goto oom;
spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, addr);
+ page_table = pte_offset_map(pmd, address);
if (!pte_none(*page_table)) {
- pte_unmap(page_table);
page_cache_release(page);
- spin_unlock(&mm->page_table_lock);
- goto out;
+ goto unlock;
}
- inc_mm_counter(mm, rss);
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
- vma->vm_page_prot)),
- vma);
+ inc_mm_counter(mm, anon_rss);
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
lru_cache_add_active(page);
SetPageReferenced(page);
- page_add_anon_rmap(page, vma, addr);
+ page_add_anon_rmap(page, vma, address);
+ } else {
+ inc_mm_counter(mm, file_rss);
+ page_add_file_rmap(page);
+ page_cache_get(page);
}
- set_pte_at(mm, addr, page_table, entry);
- pte_unmap(page_table);
+ set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, addr, entry);
+ update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
+unlock:
+ pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
-out:
return VM_FAULT_MINOR;
-no_mem:
+oom:
return VM_FAULT_OOM;
}
* This is called with the MM semaphore held and the page table
* spinlock held. Exit with the spinlock released.
*/
-static int
-do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
{
- struct page * new_page;
+ struct page *new_page;
struct address_space *mapping = NULL;
pte_t entry;
unsigned int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address);
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
smp_rmb(); /* serializes i_size against truncate_count */
}
retry:
- cond_resched();
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
/*
* No smp_rmb is needed here as long as there's a full
* retry getting the page.
*/
if (mapping && unlikely(sequence != mapping->truncate_count)) {
- sequence = mapping->truncate_count;
spin_unlock(&mm->page_table_lock);
page_cache_release(new_page);
+ cond_resched();
+ sequence = mapping->truncate_count;
+ smp_rmb();
goto retry;
}
page_table = pte_offset_map(pmd, address);
*/
/* Only go through if we didn't race with anybody else... */
if (pte_none(*page_table)) {
- if (!PageReserved(new_page))
- inc_mm_counter(mm, rss);
-
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
if (anon) {
+ inc_mm_counter(mm, anon_rss);
lru_cache_add_active(new_page);
page_add_anon_rmap(new_page, vma, address);
- } else
+ } else if (!(vma->vm_flags & VM_RESERVED)) {
+ inc_mm_counter(mm, file_rss);
page_add_file_rmap(new_page);
- pte_unmap(page_table);
+ }
} else {
/* One of our sibling threads was faster, back out. */
- pte_unmap(page_table);
page_cache_release(new_page);
- spin_unlock(&mm->page_table_lock);
- goto out;
+ goto unlock;
}
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
+unlock:
+ pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
-out:
return ret;
oom:
page_cache_release(new_page);
- ret = VM_FAULT_OOM;
- goto out;
+ return VM_FAULT_OOM;
}
/*
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
*/
-static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
{
- unsigned long pgoff;
+ pgoff_t pgoff;
int err;
- BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
- /*
- * Fall back to the linear mapping if the fs does not support
- * ->populate:
- */
- if (!vma->vm_ops->populate ||
- (write_access && !(vma->vm_flags & VM_SHARED))) {
- pte_clear(mm, address, pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd);
- }
-
- pgoff = pte_to_pgoff(*pte);
-
- pte_unmap(pte);
+ pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
- err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+ if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+ /*
+ * Page table corrupted: show pte and kill process.
+ */
+ print_bad_pte(vma, orig_pte, address);
+ return VM_FAULT_OOM;
+ }
+ /* We can then assume vm->vm_ops && vma->vm_ops->populate */
+
+ pgoff = pte_to_pgoff(orig_pte);
+ err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
+ vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
return VM_FAULT_OOM;
if (err)
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct * vma, unsigned long address,
- int write_access, pte_t *pte, pmd_t *pmd)
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, int write_access)
{
pte_t entry;
+ spin_lock(&mm->page_table_lock);
entry = *pte;
if (!pte_present(entry)) {
- /*
- * If it truly wasn't present, we know that kswapd
- * and the PTE updates will not touch it later. So
- * drop the lock.
- */
- if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ if (pte_none(entry)) {
+ if (!vma->vm_ops || !vma->vm_ops->nopage)
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, write_access);
+ return do_no_page(mm, vma, address,
+ pte, pmd, write_access);
+ }
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access, pte, pmd);
- return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+ return do_file_page(mm, vma, address,
+ pte, pmd, write_access, entry);
+ return do_swap_page(mm, vma, address,
+ pte, pmd, write_access, entry);
}
if (write_access) {
/*
* By the time we get here, we already hold the mm semaphore
*/
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
pgd_t *pgd;
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, write_access);
- /*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
- */
pgd = pgd_offset(mm, address);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, address);
if (!pud)
- goto oom;
-
+ return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
- goto oom;
-
+ return VM_FAULT_OOM;
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
- goto oom;
-
- return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+ return VM_FAULT_OOM;
- oom:
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_OOM;
+ return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*/
-pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
- pud_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pud_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ pud_t *new = pud_alloc_one(mm, address);
if (!new)
- return NULL;
+ return -ENOMEM;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pgd_present(*pgd)) {
+ spin_lock(&mm->page_table_lock);
+ if (pgd_present(*pgd)) /* Another has populated it */
pud_free(new);
- goto out;
- }
- pgd_populate(mm, pgd, new);
- out:
- return pud_offset(pgd, address);
+ else
+ pgd_populate(mm, pgd, new);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
- *
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*/
-pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
- pmd_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pmd_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
- return NULL;
+ return -ENOMEM;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
+ spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
- if (pud_present(*pud)) {
+ if (pud_present(*pud)) /* Another has populated it */
pmd_free(new);
- goto out;
- }
- pud_populate(mm, pud, new);
+ else
+ pud_populate(mm, pud, new);
#else
- if (pgd_present(*pud)) {
+ if (pgd_present(*pud)) /* Another has populated it */
pmd_free(new);
- goto out;
- }
- pgd_populate(mm, pud, new);
+ else
+ pgd_populate(mm, pud, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
-
- out:
- return pmd_offset(pud, address);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
EXPORT_SYMBOL(vmalloc_to_pfn);
-/*
- * update_mem_hiwater
- * - update per process rss and vm high water data
- */
-void update_mem_hiwater(struct task_struct *tsk)
-{
- if (tsk->mm) {
- unsigned long rss = get_mm_counter(tsk->mm, rss);
-
- if (tsk->mm->hiwater_rss < rss)
- tsk->mm->hiwater_rss = rss;
- if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
- tsk->mm->hiwater_vm = tsk->mm->total_vm;
- }
-}
-
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
gate_vma.vm_start = FIXADDR_USER_START;
gate_vma.vm_end = FIXADDR_USER_END;
gate_vma.vm_page_prot = PAGE_READONLY;
- gate_vma.vm_flags = 0;
+ gate_vma.vm_flags = VM_RESERVED;
return 0;
}
__initcall(gate_vma_init);