mm, oom: fix potential data corruption when oom_reaper races with writer

author Michal Hocko <mhocko@suse.com>

Fri, 18 Aug 2017 22:16:15 +0000 (15:16 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 18 Aug 2017 22:32:01 +0000 (15:32 -0700)
author Michal Hocko <mhocko@suse.com>
Fri, 18 Aug 2017 22:16:15 +0000 (15:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 18 Aug 2017 22:32:01 +0000 (15:32 -0700)
diff --git a/include/linux/oom.h b/include/linux/oom.h

index 8a266e2be5a63a29a2de38ff76f0bdc2e4fac08e..76aac4ce39bcf59e2e5eeb38053cf05f0c9d05ae 100644 (file)
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -6,6 +6,8 @@
  #include <linux/types.h>
  #include <linux/nodemask.h>
  #include <uapi/linux/oom.h>
+#include <linux/sched/coredump.h> /* MMF_* */
+#include <linux/mm.h> /* VM_FAULT* */
  
  struct zonelist;
  struct notifier_block;
@@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
         return tsk->signal->oom_mm;
  }
  
+/*
+ * Checks whether a page fault on the given mm is still reliable.
+ * This is no longer true if the oom reaper started to reap the
+ * address space which is reflected by MMF_UNSTABLE flag set in
+ * the mm. At that moment any !shared mapping would lose the content
+ * and could cause a memory corruption (zero pages instead of the
+ * original content).
+ *
+ * User should call this before establishing a page table entry for
+ * a !shared mapping and under the proper page table lock.
+ *
+ * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
+ */
+static inline int check_stable_address_space(struct mm_struct *mm)
+{
+       if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
+               return VM_FAULT_SIGBUS;
+       return 0;
+}
+
  extern unsigned long oom_badness(struct task_struct *p,
                 struct mem_cgroup *memcg, const nodemask_t *nodemask,
                 unsigned long totalpages);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 216114f6ef0b7f8c09378edd3615d6a39527ead0..90731e3b7e589ea9f83c21827916917a27f71b82 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -32,6 +32,7 @@
  #include <linux/userfaultfd_k.h>
  #include <linux/page_idle.h>
  #include <linux/shmem_fs.h>
+#include <linux/oom.h>
  
  #include <asm/tlb.h>
  #include <asm/pgalloc.h>
@@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
         struct mem_cgroup *memcg;
         pgtable_t pgtable;
         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+       int ret = 0;
  
         VM_BUG_ON_PAGE(!PageCompound(page), page);
  
@@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
  
         pgtable = pte_alloc_one(vma->vm_mm, haddr);
         if (unlikely(!pgtable)) {
-               mem_cgroup_cancel_charge(page, memcg, true);
-               put_page(page);
-               return VM_FAULT_OOM;
+               ret = VM_FAULT_OOM;
+               goto release;
         }
  
         clear_huge_page(page, haddr, HPAGE_PMD_NR);
@@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
  
         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
         if (unlikely(!pmd_none(*vmf->pmd))) {
-               spin_unlock(vmf->ptl);
-               mem_cgroup_cancel_charge(page, memcg, true);
-               put_page(page);
-               pte_free(vma->vm_mm, pgtable);
+               goto unlock_release;
         } else {
                 pmd_t entry;
  
+               ret = check_stable_address_space(vma->vm_mm);
+               if (ret)
+                       goto unlock_release;
+
                 /* Deliver the page fault to userland */
                 if (userfaultfd_missing(vma)) {
                         int ret;
@@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
         }
  
         return 0;
+unlock_release:
+       spin_unlock(vmf->ptl);
+release:
+       if (pgtable)
+               pte_free(vma->vm_mm, pgtable);
+       mem_cgroup_cancel_charge(page, memcg, true);
+       put_page(page);
+       return ret;
+
  }
  
  /*
@@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                 ret = 0;
                 set = false;
                 if (pmd_none(*vmf->pmd)) {
-                       if (userfaultfd_missing(vma)) {
+                       ret = check_stable_address_space(vma->vm_mm);
+                       if (ret) {
+                               spin_unlock(vmf->ptl);
+                       } else if (userfaultfd_missing(vma)) {
                                 spin_unlock(vmf->ptl);
                                 ret = handle_userfault(vmf, VM_UFFD_MISSING);
                                 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
diff --git a/mm/memory.c b/mm/memory.c

index c717b5bcc80e2c3e19664d0638a391708e89fdaf..fe2fba27ded2fab229d0ef7a4908551343d31b89 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
  #include <linux/debugfs.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/dax.h>
+#include <linux/oom.h>
  
  #include <asm/io.h>
  #include <asm/mmu_context.h>
@@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
         struct vm_area_struct *vma = vmf->vma;
         struct mem_cgroup *memcg;
         struct page *page;
+       int ret = 0;
         pte_t entry;
  
         /* File mapping without ->vm_ops ? */
@@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
                                 vmf->address, &vmf->ptl);
                 if (!pte_none(*vmf->pte))
                         goto unlock;
+               ret = check_stable_address_space(vma->vm_mm);
+               if (ret)
+                       goto unlock;
                 /* Deliver the page fault to userland, check inside PT lock */
                 if (userfaultfd_missing(vma)) {
                         pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
         if (!pte_none(*vmf->pte))
                 goto release;
  
+       ret = check_stable_address_space(vma->vm_mm);
+       if (ret)
+               goto release;
+
         /* Deliver the page fault to userland, check inside PT lock */
         if (userfaultfd_missing(vma)) {
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2978,7 +2987,7 @@ setpte:
         update_mmu_cache(vma, vmf->address, vmf->pte);
  unlock:
         pte_unmap_unlock(vmf->pte, vmf->ptl);
-       return 0;
+       return ret;
  release:
         mem_cgroup_cancel_charge(page, memcg, false);
         put_page(page);
@@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
  int finish_fault(struct vm_fault *vmf)
  {
         struct page *page;
-       int ret;
+       int ret = 0;
  
         /* Did we COW the page? */
         if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf)
                 page = vmf->cow_page;
         else
                 page = vmf->page;
-       ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+       /*
+        * check even for read faults because we might have lost our CoWed
+        * page
+        */
+       if (!(vmf->vma->vm_flags & VM_SHARED))
+               ret = check_stable_address_space(vmf->vma->vm_mm);
+       if (!ret)
+               ret = alloc_set_pte(vmf, vmf->memcg, page);
         if (vmf->pte)
                 pte_unmap_unlock(vmf->pte, vmf->ptl);
         return ret;
@@ -3900,29 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                         mem_cgroup_oom_synchronize(false);
         }
  
-       /*
-        * This mm has been already reaped by the oom reaper and so the
-        * refault cannot be trusted in general. Anonymous refaults would
-        * lose data and give a zero page instead e.g. This is especially
-        * problem for use_mm() because regular tasks will just die and
-        * the corrupted data will not be visible anywhere while kthread
-        * will outlive the oom victim and potentially propagate the date
-        * further.
-        */
-       if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
-                               && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) {
-
-               /*
-                * We are going to enforce SIGBUS but the PF path might have
-                * dropped the mmap_sem already so take it again so that
-                * we do not break expectations of all arch specific PF paths
-                * and g-u-p
-                */
-               if (ret & VM_FAULT_RETRY)
-                       down_read(&vma->vm_mm->mmap_sem);
-               ret = VM_FAULT_SIGBUS;
-       }
-
         return ret;
  }
  EXPORT_SYMBOL_GPL(handle_mm_fault);
author	Michal Hocko <mhocko@suse.com>
	Fri, 18 Aug 2017 22:16:15 +0000 (15:16 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 18 Aug 2017 22:32:01 +0000 (15:32 -0700)
include/linux/oom.h		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history