mm, oom: fix potential data corruption when oom_reaper races with writer

[karo-tx-linux.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index 0e517be91a89e162bb868af9835c8641aadfd01b..fe2fba27ded2fab229d0ef7a4908551343d31b89 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
  #include <linux/debugfs.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/dax.h>
+#include <linux/oom.h>
  
  #include <asm/io.h>
  #include <asm/mmu_context.h>
@@ -215,12 +216,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
         return true;
  }
  
-/* tlb_gather_mmu
- *     Called to initialize an (on-stack) mmu_gather structure for page-table
- *     tear-down from @mm. The @fullmm argument is used when @mm is without
- *     users and we're going to destroy the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+                               unsigned long start, unsigned long end)
  {
         tlb->mm = mm;
  
@@ -275,10 +272,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
   *     Called at the end of the shootdown operation to free up any resources
   *     that were required.
   */
-void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+void arch_tlb_finish_mmu(struct mmu_gather *tlb,
+               unsigned long start, unsigned long end, bool force)
  {
         struct mmu_gather_batch *batch, *next;
  
+       if (force)
+               __tlb_adjust_range(tlb, start, end - start);
+
         tlb_flush_mmu(tlb);
  
         /* keep the page table cache within bounds */
@@ -398,6 +399,34 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
  
  #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
  
+/* tlb_gather_mmu
+ *     Called to initialize an (on-stack) mmu_gather structure for page-table
+ *     tear-down from @mm. The @fullmm argument is used when @mm is without
+ *     users and we're going to destroy the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+                       unsigned long start, unsigned long end)
+{
+       arch_tlb_gather_mmu(tlb, mm, start, end);
+       inc_tlb_flush_pending(tlb->mm);
+}
+
+void tlb_finish_mmu(struct mmu_gather *tlb,
+               unsigned long start, unsigned long end)
+{
+       /*
+        * If parallel threads are doing PTE changes on the same range under a
+        * non-exclusive lock (e.g., mmap_sem read-side) but defer the TLB
+        * flush by batching, a thread with a stale TLB entry can fail to
+        * flush it because it observes pte_none|!pte_dirty, for example. So
+        * flush the TLB forcefully if we detect parallel PTE batching threads.
+        */
+       bool force = mm_tlb_flush_nested(tlb->mm);
+
+       arch_tlb_finish_mmu(tlb, start, end, force);
+       dec_tlb_flush_pending(tlb->mm);
+}
+
  /*
   * Note: this doesn't free the actual pages themselves. That
   * has been handled earlier when unmapping all the memory regions.
@@ -1197,6 +1226,7 @@ again:
         init_rss_vec(rss);
         start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
         pte = start_pte;
+       flush_tlb_batched_pending(mm);
         arch_enter_lazy_mmu_mode();
         do {
                 pte_t ptent = *pte;
@@ -2864,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
         struct vm_area_struct *vma = vmf->vma;
         struct mem_cgroup *memcg;
         struct page *page;
+       int ret = 0;
         pte_t entry;
  
         /* File mapping without ->vm_ops ? */
@@ -2896,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
                                 vmf->address, &vmf->ptl);
                 if (!pte_none(*vmf->pte))
                         goto unlock;
+               ret = check_stable_address_space(vma->vm_mm);
+               if (ret)
+                       goto unlock;
                 /* Deliver the page fault to userland, check inside PT lock */
                 if (userfaultfd_missing(vma)) {
                         pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2930,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
         if (!pte_none(*vmf->pte))
                 goto release;
  
+       ret = check_stable_address_space(vma->vm_mm);
+       if (ret)
+               goto release;
+
         /* Deliver the page fault to userland, check inside PT lock */
         if (userfaultfd_missing(vma)) {
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2949,7 +2987,7 @@ setpte:
         update_mmu_cache(vma, vmf->address, vmf->pte);
  unlock:
         pte_unmap_unlock(vmf->pte, vmf->ptl);
-       return 0;
+       return ret;
  release:
         mem_cgroup_cancel_charge(page, memcg, false);
         put_page(page);
@@ -3223,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
  int finish_fault(struct vm_fault *vmf)
  {
         struct page *page;
-       int ret;
+       int ret = 0;
  
         /* Did we COW the page? */
         if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3231,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf)
                 page = vmf->cow_page;
         else
                 page = vmf->page;
-       ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+       /*
+        * Check even for read faults, because we might have lost our CoWed
+        * page.
+        */
+       if (!(vmf->vma->vm_flags & VM_SHARED))
+               ret = check_stable_address_space(vmf->vma->vm_mm);
+       if (!ret)
+               ret = alloc_set_pte(vmf, vmf->memcg, page);
         if (vmf->pte)
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
         return ret;
@@ -3871,19 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                         mem_cgroup_oom_synchronize(false);
         }
  
-       /*
-        * This mm has been already reaped by the oom reaper and so the
-        * refault cannot be trusted in general. Anonymous refaults would
-        * lose data and give a zero page instead e.g. This is especially
-        * problem for use_mm() because regular tasks will just die and
-        * the corrupted data will not be visible anywhere while kthread
-        * will outlive the oom victim and potentially propagate the date
-        * further.
-        */
-       if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
-                               && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
-               ret = VM_FAULT_SIGBUS;
-
         return ret;
  }
  EXPORT_SYMBOL_GPL(handle_mm_fault);