mm: avoid taking rmap locks in move_ptes()
index 66984aab79152f8fba0a55ad5925e8f08dce5cf0..2d942353d681a8b4f08155eebdcfb20b088093e7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                unsigned long start, unsigned long end);
 
-/*
- * WARNING: the debugging will use recursive algorithms so never enable this
- * unless you know what you are doing.
- */
-#undef DEBUG_MM_RB
-
 /* description of effects of mapping type and prot in current implementation.
  * this is due to the limited x86 page protection hardware.  The expected
  * behavior is in parens:
@@ -303,7 +297,7 @@ out:
        return retval;
 }
 
-#ifdef DEBUG_MM_RB
+#ifdef CONFIG_DEBUG_VM_RB
 static int browse_rb(struct rb_root *root)
 {
        int i = 0, j;
@@ -337,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
 {
        int bug = 0;
        int i = 0;
-       struct vm_area_struct *tmp = mm->mmap;
-       while (tmp) {
-               tmp = tmp->vm_next;
+       struct vm_area_struct *vma = mm->mmap;
+       while (vma) {
+               struct anon_vma_chain *avc;
+               list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+                       anon_vma_interval_tree_verify(avc);
+               vma = vma->vm_next;
                i++;
        }
        if (i != mm->map_count)
@@ -353,6 +350,38 @@ void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_sem and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+       struct anon_vma_chain *avc;
+
+       list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+               anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+       struct anon_vma_chain *avc;
+
+       list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+               anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
 static int find_vma_links(struct mm_struct *mm, unsigned long addr,
                unsigned long end, struct vm_area_struct **pprev,
                struct rb_node ***rb_link, struct rb_node **rb_parent)
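For illustration, a minimal sketch of the update protocol described in the comment above, using a hypothetical helper that is not part of this patch; it assumes the caller already holds the exclusive mmap_sem:

/*
 * Hypothetical sketch only: bracket any vm_start/vm_end/vm_pgoff
 * update with the pre/post helpers while holding the root anon_vma
 * lock, as the comment above requires.
 */
static void example_resize_vma(struct vm_area_struct *vma,
                               unsigned long new_end)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        if (anon_vma) {
                anon_vma_lock(anon_vma);        /* root anon_vma mutex */
                anon_vma_interval_tree_pre_update_vma(vma);
        }

        vma->vm_end = new_end;                  /* the interval change */

        if (anon_vma) {
                anon_vma_interval_tree_post_update_vma(vma);
                anon_vma_unlock(anon_vma);
        }
}

This is the same bracketing pattern vma_adjust(), expand_upwards() and expand_downwards() follow in the hunks below.
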
@@ -565,20 +594,17 @@ again:                    remove_next = 1 + (end > next->vm_end);
 
        vma_adjust_trans_huge(vma, start, end, adjust_next);
 
-       /*
-        * When changing only vma->vm_end, we don't really need anon_vma
-        * lock. This is a fairly rare case by itself, but the anon_vma
-        * lock may be shared between many sibling processes.  Skipping
-        * the lock for brk adjustments makes a difference sometimes.
-        */
-       if (vma->anon_vma && (importer || start != vma->vm_start)) {
-               anon_vma = vma->anon_vma;
+       anon_vma = vma->anon_vma;
+       if (!anon_vma && adjust_next)
+               anon_vma = next->anon_vma;
+       if (anon_vma) {
                VM_BUG_ON(adjust_next && next->anon_vma &&
                          anon_vma != next->anon_vma);
-       } else if (adjust_next && next->anon_vma)
-               anon_vma = next->anon_vma;
-       if (anon_vma)
                anon_vma_lock(anon_vma);
+               anon_vma_interval_tree_pre_update_vma(vma);
+               if (adjust_next)
+                       anon_vma_interval_tree_pre_update_vma(next);
+       }
 
        if (root) {
                flush_dcache_mmap_lock(mapping);
@@ -619,8 +645,12 @@ again:                     remove_next = 1 + (end > next->vm_end);
                __insert_vm_struct(mm, insert);
        }
 
-       if (anon_vma)
+       if (anon_vma) {
+               anon_vma_interval_tree_post_update_vma(vma);
+               if (adjust_next)
+                       anon_vma_interval_tree_post_update_vma(next);
                anon_vma_unlock(anon_vma);
+       }
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -1748,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
+                               anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
+                               anon_vma_interval_tree_post_update_vma(vma);
                                perf_event_mmap(vma);
                        }
                }
        }
        vma_unlock_anon_vma(vma);
        khugepaged_enter_vma_merge(vma);
+       validate_mm(vma->vm_mm);
        return error;
 }
 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1798,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma,
                if (grow <= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
+                               anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
+                               anon_vma_interval_tree_post_update_vma(vma);
                                perf_event_mmap(vma);
                        }
                }
        }
        vma_unlock_anon_vma(vma);
        khugepaged_enter_vma_merge(vma);
+       validate_mm(vma->vm_mm);
        return error;
 }
 
@@ -2335,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-       unsigned long addr, unsigned long len, pgoff_t pgoff)
+       unsigned long addr, unsigned long len, pgoff_t pgoff,
+       bool *need_rmap_locks)
 {
        struct vm_area_struct *vma = *vmap;
        unsigned long vma_start = vma->vm_start;
@@ -2377,27 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON(faulted_in_anon_vma);
-                       *vmap = new_vma;
+                       *vmap = vma = new_vma;
                }
+               *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                if (new_vma) {
                        *new_vma = *vma;
+                       new_vma->vm_start = addr;
+                       new_vma->vm_end = addr + len;
+                       new_vma->vm_pgoff = pgoff;
                        pol = mpol_dup(vma_policy(vma));
                        if (IS_ERR(pol))
                                goto out_free_vma;
+                       vma_set_policy(new_vma, pol);
                        INIT_LIST_HEAD(&new_vma->anon_vma_chain);
                        if (anon_vma_clone(new_vma, vma))
                                goto out_free_mempol;
-                       vma_set_policy(new_vma, pol);
-                       new_vma->vm_start = addr;
-                       new_vma->vm_end = addr + len;
-                       new_vma->vm_pgoff = pgoff;
                        if (new_vma->vm_file)
                                get_file(new_vma->vm_file);
                        if (new_vma->vm_ops && new_vma->vm_ops->open)
                                new_vma->vm_ops->open(new_vma);
                        vma_link(mm, new_vma, prev, rb_link, rb_parent);
+                       *need_rmap_locks = false;
                }
        }
        return new_vma;
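The new need_rmap_locks output tells the caller whether move_ptes() must hold the rmap locks while page table entries are moved: they can be skipped when the new vma is ordered after the old one in the rmap interval trees, so a concurrent rmap walk always visits the old vma first. A hedged caller sketch, loosely modeled on the move_vma() path in mm/mremap.c (changed by this patch but not shown in this blobdiff):

/* Sketch of a caller consuming need_rmap_locks; names simplified. */
bool need_rmap_locks;
struct vm_area_struct *new_vma;

/* copy_vma() reports whether moving the PTEs needs the rmap locks */
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks);
if (!new_vma)
        return -ENOMEM;

/* only the overlapping-interval case pays for the rmap locking */
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr,
                             old_len, need_rmap_locks);
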
@@ -2515,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
 
 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 {
-       if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+       if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
                /*
                 * The LSB of head.next can't change from under us
                 * because we hold the mm_all_locks_mutex.
@@ -2531,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
                 * anon_vma->root->mutex.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
-                                      &anon_vma->root->head.next))
+                                      &anon_vma->root->rb_root.rb_node))
                        BUG();
        }
 }
@@ -2572,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * A single task can't take more than one mm_take_all_locks() in a row
  * or it would deadlock.
  *
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
  * mapping->flags avoid to take the same lock twice, if more than one
  * vma in this mm is backed by the same anon_vma or address_space.
  *
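The LSB trick referenced in these comments works because rb_root.rb_node is a pointer with at least word alignment, so bit 0 is normally clear and can serve as a per-anon_vma "already locked by mm_take_all_locks()" marker while mm_all_locks_mutex serializes all users. A minimal sketch of the idea with a hypothetical helper (not in this patch):

/*
 * Hypothetical illustration: mark the root anon_vma as locked by
 * setting bit 0 of its rb_root.rb_node, as vm_lock_anon_vma() above
 * does.  Serialized by mm_all_locks_mutex, so the non-atomic
 * __set_bit() is sufficient.
 */
static bool example_try_mark_anon_vma(struct anon_vma *anon_vma)
{
        unsigned long *flag =
                (unsigned long *) &anon_vma->root->rb_root.rb_node;

        if (test_bit(0, flag))
                return false;   /* already marked via another vma */
        __set_bit(0, flag);
        return true;
}
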
@@ -2619,13 +2658,13 @@ out_unlock:
 
 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-       if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+       if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
                /*
                 * The LSB of head.next can't change to 0 from under
                 * us because we hold the mm_all_locks_mutex.
                 *
                 * We must however clear the bitflag before unlocking
-                * the vma so the users using the anon_vma->head will
+                * the vma so the users using the anon_vma->rb_root will
                 * never see our bitflag.
                 *
                 * No need of atomic instructions here, head.next
@@ -2633,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
                 * anon_vma->root->mutex.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
-                                         &anon_vma->root->head.next))
+                                         &anon_vma->root->rb_root.rb_node))
                        BUG();
                anon_vma_unlock(anon_vma);
        }