mm: migration: do not lose soft dirty bit if page is in migration state

diff --git a/mm/memory.c b/mm/memory.c
index b3c6bf9a398e9b13f7c5bb1130a72e67034faf75..f7b7692c05edee78faf951778bb3909143ac9ac1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -372,30 +372,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 
 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 
-/*
- * If a p?d_bad entry is found while walking page tables, report
- * the error, before resetting entry to p?d_none.  Usually (but
- * very seldom) called out from the p?d_none_or_clear_bad macros.
- */
-
-void pgd_clear_bad(pgd_t *pgd)
-{
-       pgd_ERROR(*pgd);
-       pgd_clear(pgd);
-}
-
-void pud_clear_bad(pud_t *pud)
-{
-       pud_ERROR(*pud);
-       pud_clear(pud);
-}
-
-void pmd_clear_bad(pmd_t *pmd)
-{
-       pmd_ERROR(*pmd);
-       pmd_clear(pmd);
-}
-
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
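
The three helpers deleted above are thin reporting wrappers: log the corrupt entry via p?d_ERROR(), then reset it to none so page-table walkers can carry on, which is how the p?d_none_or_clear_bad() macros mentioned in the removed comment use them. Below is a minimal, self-contained C model of that report-then-clear idiom; the model_* names, the entry type, and the sentinel "bad" value are illustrative stand-ins, not kernel definitions.

#include <stdio.h>

/* Illustrative entry type and "bad" test; not the kernel's pgd_t. */
typedef unsigned long model_pgd_t;

static int model_pgd_bad(model_pgd_t e)
{
        return e == 0xdeadbeefUL;       /* stand-in for a corrupt entry */
}

/* Same shape as p?d_clear_bad(): report once, then reset to "none". */
static void model_pgd_clear_bad(model_pgd_t *e)
{
        fprintf(stderr, "bad pgd entry %#lx\n", *e);
        *e = 0;                         /* "none": walkers simply skip it */
}

/* The p?d_none_or_clear_bad() idiom: treat bad entries as empty. */
static int model_pgd_none_or_clear_bad(model_pgd_t *e)
{
        if (*e == 0)
                return 1;
        if (model_pgd_bad(*e)) {
                model_pgd_clear_bad(e);
                return 1;
        }
        return 0;
}
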
@@ -861,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                         */
                                        make_migration_entry_read(&entry);
                                        pte = swp_entry_to_pte(entry);
+                                       if (pte_swp_soft_dirty(*src_pte))
+                                               pte = pte_swp_mksoft_dirty(pte);
                                        set_pte_at(src_mm, addr, src_pte, pte);
                                }
                        }
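
The two added lines fix the case where copy_one_pte() downgrades a writable migration entry to read-only: the PTE is rebuilt from the swap entry by swp_entry_to_pte(), which would silently drop the soft-dirty software bit, so the bit is copied over from the old source PTE (this is the bug named in the commit subject). A minimal userspace sketch of that carry-the-flag-across-a-rebuild pattern follows; the model_* helpers and the bit position are assumptions for illustration, not the kernel's swap-PTE layout.

#include <stdbool.h>
#include <stdint.h>

/* Illustrative swap-style PTE: one low bit holds the soft-dirty flag. */
typedef uint64_t model_pte_t;
#define MODEL_SWP_SOFT_DIRTY    (UINT64_C(1) << 1)      /* hypothetical bit */

static bool model_swp_soft_dirty(model_pte_t pte)
{
        return pte & MODEL_SWP_SOFT_DIRTY;
}

static model_pte_t model_swp_mksoft_dirty(model_pte_t pte)
{
        return pte | MODEL_SWP_SOFT_DIRTY;
}

/*
 * Rebuilding a PTE from a swap/migration entry starts from a clean
 * value, so a software bit carried by the old PTE must be copied
 * across explicitly -- the same move the hunk above adds.
 */
static model_pte_t model_rebuild_preserving_soft_dirty(model_pte_t old_pte,
                                                       model_pte_t new_pte)
{
        if (model_swp_soft_dirty(old_pte))
                new_pte = model_swp_mksoft_dirty(new_pte);
        return new_pte;
}
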
@@ -1505,7 +1483,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
        if (pud_none(*pud))
                goto no_page_table;
        if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
-               BUG_ON(flags & FOLL_GET);
+               if (flags & FOLL_GET)
+                       goto out;
                page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
                goto out;
        }
@@ -1516,8 +1495,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
        if (pmd_none(*pmd))
                goto no_page_table;
        if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
-               BUG_ON(flags & FOLL_GET);
                page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+               if (flags & FOLL_GET) {
+                       /*
+                        * Refcounts on tail pages are not well-defined and
+                        * shouldn't be taken. The caller should handle a NULL
+                        * return when trying to follow tail pages.
+                        */
+                       if (PageHead(page))
+                               get_page(page);
+                       else {
+                               page = NULL;
+                               goto out;
+                       }
+               }
                goto out;
        }
        if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
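
Together with the pud_huge() hunk above, this replaces a BUG_ON(flags & FOLL_GET) with a graceful path: when FOLL_GET is set, a reference is taken only on a head page, and a tail page makes follow_page_mask() return NULL for the caller to handle. A small self-contained model of that head-only pinning rule follows; the model_page structure and the MODEL_FOLL_GET value are illustrative stand-ins, not kernel types.

#include <stdbool.h>
#include <stddef.h>

/* Minimal model of a compound page: tail pages point at their head. */
struct model_page {
        struct model_page *head;        /* equals itself for a head page */
        int refcount;
};

#define MODEL_FOLL_GET  0x01            /* illustrative flag value */

static bool model_page_head(struct model_page *page)
{
        return page->head == page;
}

/*
 * Mirrors the hunk above: with the "get" flag, pin only head pages and
 * hand back NULL for tail pages instead of crashing (the old BUG_ON()).
 */
static struct model_page *model_follow_huge(struct model_page *page,
                                            unsigned int flags)
{
        if (flags & MODEL_FOLL_GET) {
                if (!model_page_head(page))
                        return NULL;
                page->refcount++;
        }
        return page;
}
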
@@ -3706,7 +3697,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-int handle_pte_fault(struct mm_struct *mm,
+static int handle_pte_fault(struct mm_struct *mm,
                     struct vm_area_struct *vma, unsigned long address,
                     pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
@@ -3765,22 +3756,14 @@ unlock:
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, unsigned int flags)
+static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+                            unsigned long address, unsigned int flags)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
 
-       __set_current_state(TASK_RUNNING);
-
-       count_vm_event(PGFAULT);
-       mem_cgroup_count_vm_event(mm, PGFAULT);
-
-       /* do counter updates before entering really critical section. */
-       check_sync_rss_stat(current);
-
        if (unlikely(is_vm_hugetlb_page(vma)))
                return hugetlb_fault(mm, vma, address, flags);
 
@@ -3793,9 +3776,12 @@ retry:
        if (!pmd)
                return VM_FAULT_OOM;
        if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+               int ret = VM_FAULT_FALLBACK;
                if (!vma->vm_ops)
-                       return do_huge_pmd_anonymous_page(mm, vma, address,
-                                                         pmd, flags);
+                       ret = do_huge_pmd_anonymous_page(mm, vma, address,
+                                       pmd, flags);
+               if (!(ret & VM_FAULT_FALLBACK))
+                       return ret;
        } else {
                pmd_t orig_pmd = *pmd;
                int ret;
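
With this hunk, do_huge_pmd_anonymous_page() can report VM_FAULT_FALLBACK instead of being treated as the final word, and __handle_mm_fault() falls through to the regular small-page path when it does. Below is a compact model of that try-the-fast-path-then-fall-back shape; the MODEL_* constant, the alignment test, and the model_* functions are illustrative only.

#define MODEL_FAULT_FALLBACK    0x0800  /* illustrative; mirrors the idea of VM_FAULT_FALLBACK */

/* Pretend huge-page handler: gives up unless the address is 2M aligned. */
static int model_huge_fault(unsigned long addr)
{
        if (addr & ((1UL << 21) - 1))
                return MODEL_FAULT_FALLBACK;
        return 0;                       /* handled as a huge mapping */
}

static int model_small_fault(unsigned long addr)
{
        (void)addr;
        return 0;                       /* always handled */
}

/* Same shape as the hunk: only a non-FALLBACK result is final. */
static int model_handle_fault(unsigned long addr)
{
        int ret = model_huge_fault(addr);

        if (!(ret & MODEL_FAULT_FALLBACK))
                return ret;
        return model_small_fault(addr);
}
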
@@ -3861,6 +3847,37 @@ retry:
        return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
 
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+                   unsigned long address, unsigned int flags)
+{
+       int ret;
+
+       __set_current_state(TASK_RUNNING);
+
+       count_vm_event(PGFAULT);
+       mem_cgroup_count_vm_event(mm, PGFAULT);
+
+       /* do counter updates before entering really critical section. */
+       check_sync_rss_stat(current);
+
+       /*
+        * Enable the memcg OOM handling for faults triggered in user
+        * space.  Kernel faults are handled more gracefully.
+        */
+       if (flags & FAULT_FLAG_USER)
+               mem_cgroup_enable_oom();
+
+       ret = __handle_mm_fault(mm, vma, address, flags);
+
+       if (flags & FAULT_FLAG_USER)
+               mem_cgroup_disable_oom();
+
+       if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+               mem_cgroup_oom_synchronize();
+
+       return ret;
+}
+
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.
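
The last two hunks split the fault entry point: __handle_mm_fault() is now only the page-table walk, while the new handle_mm_fault() wrapper carries the per-fault bookkeeping that used to sit at the top and brackets the core call with mem_cgroup_enable_oom()/mem_cgroup_disable_oom(), so the memcg OOM killer is armed only for user-space faults (FAULT_FLAG_USER). A compact userspace model of that wrapper shape follows; everything prefixed model_/MODEL_ is an illustrative stand-in, not kernel API.

#include <stdio.h>

#define MODEL_FAULT_FLAG_USER   0x01    /* illustrative flag value */

static int model_memcg_oom_armed;       /* stands in for per-task memcg OOM state */

/* Page-table work only, as in __handle_mm_fault(). */
static int model_core_fault(unsigned long addr, unsigned int flags)
{
        (void)addr;
        (void)flags;
        return 0;
}

static int model_fault_entry(unsigned long addr, unsigned int flags)
{
        int ret;

        /* per-fault bookkeeping that used to sit at the top of handle_mm_fault() */
        printf("fault at %#lx\n", addr);

        /* arm memcg OOM handling only while servicing a user-space fault */
        if (flags & MODEL_FAULT_FLAG_USER)
                model_memcg_oom_armed = 1;

        ret = model_core_fault(addr, flags);

        if (flags & MODEL_FAULT_FLAG_USER)
                model_memcg_oom_armed = 0;

        return ret;
}
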