diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029b2c99b4ed402898ae0c76a143fde..34d3ca9572d6baed85499d099a0050af3b9bdf66 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
 
        struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+       struct rb_node          tree_node;      /* RB tree node */
+       unsigned long long      usage_in_excess;/* Set to the value by which */
+                                               /* the soft limit is exceeded*/
+       bool                    on_tree;
        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in an RB-tree, independent of
+ * their hierarchy representation.
+ */
+
+struct mem_cgroup_tree_per_zone {
+       struct rb_root rb_root;
+       spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+       struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+       struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
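The declarations above form a per-node, per-zone forest of RB-trees, each root guarded by its own spinlock. A minimal sketch of how one per-zone tree is reached for a given (nid, zid) pair; it simply mirrors the soft_limit_tree_node_zone() helper added further down in this patch, and nid/zid are assumed to come from the page being charged:

	/* Illustrative only: resolve the per-zone tree for (nid, zid). */
	struct mem_cgroup_tree_per_zone *mctz;

	mctz = &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
	spin_lock(&mctz->lock);
	/* mctz->rb_root holds mem_cgroup_per_zone nodes keyed by usage_in_excess */
	spin_unlock(&mctz->lock);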
 struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        u64 threshold;
@@ -303,22 +328,6 @@ struct mem_cgroup {
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
 #endif
-       /*
-        * Protects soft_contributed transitions.
-        * See mem_cgroup_update_soft_limit
-        */
-       spinlock_t soft_lock;
-
-       /*
-        * If true then this group has increased parents' children_in_excess
-        * when it got over the soft limit.
-        * When a group falls bellow the soft limit, parents' children_in_excess
-        * is decreased and soft_contributed changed to false.
-        */
-       bool soft_contributed;
-
-       /* Number of children that are in soft limit excess */
-       atomic_t children_in_excess;
 
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define        MEM_CGROUP_MAX_RECLAIM_LOOPS            100
+#define        MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
        return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+       int nid = page_to_nid(page);
+       int zid = page_zonenum(page);
+
+       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz,
+                               unsigned long long new_usage_in_excess)
+{
+       struct rb_node **p = &mctz->rb_root.rb_node;
+       struct rb_node *parent = NULL;
+       struct mem_cgroup_per_zone *mz_node;
+
+       if (mz->on_tree)
+               return;
+
+       mz->usage_in_excess = new_usage_in_excess;
+       if (!mz->usage_in_excess)
+               return;
+       while (*p) {
+               parent = *p;
+               mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+                                       tree_node);
+               if (mz->usage_in_excess < mz_node->usage_in_excess)
+                       p = &(*p)->rb_left;
+               /*
+                * We can't avoid mem cgroups that are over their soft
+                * limit by the same amount
+                */
+               else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+                       p = &(*p)->rb_right;
+       }
+       rb_link_node(&mz->tree_node, parent, p);
+       rb_insert_color(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       if (!mz->on_tree)
+               return;
+       rb_erase(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       spin_lock(&mctz->lock);
+       __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+       spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+       unsigned long long excess;
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_tree_per_zone *mctz;
+       int nid = page_to_nid(page);
+       int zid = page_zonenum(page);
+       mctz = soft_limit_tree_from_page(page);
+
+       /*
+        * Necessary to update all ancestors when hierarchy is used,
+        * because their event counters are not touched.
+        */
+       for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+               mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+               excess = res_counter_soft_limit_excess(&memcg->res);
+               /*
+                * We have to update the tree if mz is on the RB-tree or
+                * the memcg is over its soft limit.
+                */
+               if (excess || mz->on_tree) {
+                       spin_lock(&mctz->lock);
+                       /* if on-tree, remove it */
+                       if (mz->on_tree)
+                               __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+                       /*
+                        * Insert again. mz->usage_in_excess will be updated.
+                        * If excess is 0, no tree ops.
+                        */
+                       __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+                       spin_unlock(&mctz->lock);
+               }
+       }
+}
+
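As a worked example of the ancestor walk above (numbers are hypothetical): if a page is charged in a child cgroup that is 2MB over its soft limit while its parent is 10MB over, mem_cgroup_update_tree() re-inserts both of their mem_cgroup_per_zone entries for the page's zone, each keyed by its own res_counter_soft_limit_excess() value; an ancestor whose excess is 0 and which is not already on the tree is left untouched.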
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+       int node, zone;
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_tree_per_zone *mctz;
+
+       for_each_node(node) {
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       mz = mem_cgroup_zoneinfo(memcg, node, zone);
+                       mctz = soft_limit_tree_node_zone(node, zone);
+                       mem_cgroup_remove_exceeded(memcg, mz, mctz);
+               }
+       }
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct rb_node *rightmost = NULL;
+       struct mem_cgroup_per_zone *mz;
+
+retry:
+       mz = NULL;
+       rightmost = rb_last(&mctz->rb_root);
+       if (!rightmost)
+               goto done;              /* Nothing to reclaim from */
+
+       mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+       /*
+        * Remove the node now but someone else can add it back,
+        * we will to add it back at the end of reclaim to its correct
+        * we will add it back at the end of reclaim to its correct
+        */
+       __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+       if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+               !css_tryget(&mz->memcg->css))
+               goto retry;
+done:
+       return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct mem_cgroup_per_zone *mz;
+
+       spin_lock(&mctz->lock);
+       mz = __mem_cgroup_largest_soft_limit_node(mctz);
+       spin_unlock(&mctz->lock);
+       return mz;
+}
+
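Because __mem_cgroup_insert_exceeded() orders nodes by usage_in_excess and sends ties to the right, rb_last() always yields the group that is furthest over its soft limit for this zone. A hedged sketch of the consumer pattern, simplified from mem_cgroup_soft_limit_reclaim() further down (mctz is assumed to have been looked up as in the earlier sketch):

	struct mem_cgroup_per_zone *mz;

	mz = mem_cgroup_largest_soft_limit_node(mctz);  /* takes mctz->lock */
	if (mz) {
		/* mz->memcg holds an elevated css reference at this point */
		/* ... reclaim from mz->memcg ... */
		css_put(&mz->memcg->css);
	}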
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -698,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        unsigned long val = 0;
        int cpu;
 
+       get_online_cpus();
        for_each_online_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -705,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        val += memcg->nocpu_base.events[idx];
        spin_unlock(&memcg->pcp_counter_lock);
 #endif
+       put_online_cpus();
        return val;
 }
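For context (this note is not part of the patch text): get_online_cpus()/put_online_cpus() pin CPU hotplug so the online mask cannot change while the per-cpu event counters are summed; the CONFIG_HOTPLUG_CPU branch then folds in counts that were parked in nocpu_base when CPUs went offline. The pattern, using the names from the hunk above:

	get_online_cpus();              /* block CPU hotplug during the walk */
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
	/* plus memcg->nocpu_base.events[idx] under pcp_counter_lock, if hotplug */
	put_online_cpus();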
 
@@ -821,48 +991,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
        return false;
 }
 
-/*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-       unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-       struct mem_cgroup *parent = memcg;
-       int delta = 0;
-
-       spin_lock(&memcg->soft_lock);
-       if (excess) {
-               if (!memcg->soft_contributed) {
-                       delta = 1;
-                       memcg->soft_contributed = true;
-               }
-       } else {
-               if (memcg->soft_contributed) {
-                       delta = -1;
-                       memcg->soft_contributed = false;
-               }
-       }
-
-       /*
-        * Necessary to update all ancestors when hierarchy is used
-        * because their event counter is not touched.
-        * We track children even outside the hierarchy for the root
-        * cgroup because tree walk starting at root should visit
-        * all cgroups and we want to prevent from pointless tree
-        * walk if no children is below the limit.
-        */
-       while (delta && (parent = parent_mem_cgroup(parent)))
-               atomic_add(delta, &parent->children_in_excess);
-       if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-               atomic_add(delta, &root_mem_cgroup->children_in_excess);
-       spin_unlock(&memcg->soft_lock);
-}
-
 /*
  * Check events in order.
  *
@@ -886,7 +1014,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
                mem_cgroup_threshold(memcg);
                if (unlikely(do_softlimit))
-                       mem_cgroup_update_soft_limit(memcg);
+                       mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
                if (unlikely(do_numainfo))
                        atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1057,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
        return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-               mem_cgroup_iter_filter cond)
-{
-       if (!cond)
-               return VISIT;
-       return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1064,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-               struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+               struct mem_cgroup *last_visited)
 {
        struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -963,31 +1082,11 @@ skip_node:
        if (next_css) {
                struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-               switch (mem_cgroup_filter(mem, root, cond)) {
-               case SKIP:
+               if (css_tryget(&mem->css))
+                       return mem;
+               else {
                        prev_css = next_css;
                        goto skip_node;
-               case SKIP_TREE:
-                       if (mem == root)
-                               return NULL;
-                       /*
-                        * css_rightmost_descendant is not an optimal way to
-                        * skip through a subtree (especially for imbalanced
-                        * trees leaning to right) but that's what we have right
-                        * now. More effective solution would be traversing
-                        * right-up for first non-NULL without calling
-                        * css_next_descendant_pre afterwards.
-                        */
-                       prev_css = css_rightmost_descendant(next_css);
-                       goto skip_node;
-               case VISIT:
-                       if (css_tryget(&mem->css))
-                               return mem;
-                       else {
-                               prev_css = next_css;
-                               goto skip_node;
-                       }
-                       break;
                }
        }
 
@@ -1051,7 +1150,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1162,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
-                                  struct mem_cgroup_reclaim_cookie *reclaim,
-                                  mem_cgroup_iter_filter cond)
+                                  struct mem_cgroup_reclaim_cookie *reclaim)
 {
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *last_visited = NULL;
 
-       if (mem_cgroup_disabled()) {
-               /* first call must return non-NULL, second return NULL */
-               return (struct mem_cgroup *)(unsigned long)!prev;
-       }
+       if (mem_cgroup_disabled())
+               return NULL;
 
        if (!root)
                root = root_mem_cgroup;
@@ -1086,9 +1181,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
        if (!root->use_hierarchy && root != root_mem_cgroup) {
                if (prev)
                        goto out_css_put;
-               if (mem_cgroup_filter(root, root, cond) == VISIT)
-                       return root;
-               return NULL;
+               return root;
        }
 
        rcu_read_lock();
@@ -1111,7 +1204,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
                        last_visited = mem_cgroup_iter_load(iter, root, &seq);
                }
 
-               memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+               memcg = __mem_cgroup_iter_next(root, last_visited);
 
                if (reclaim) {
                        mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1215,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
                                reclaim->generation = iter->generation;
                }
 
-               /*
-                * We have finished the whole tree walk or no group has been
-                * visited because filter told us to skip the root node.
-                */
-               if (!memcg && (prev || (cond && !last_visited)))
+               if (prev && !memcg)
                        goto out_unlock;
        }
 out_unlock:
@@ -1767,7 +1856,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
        return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1790,6 +1878,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
        return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1946,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
        return node;
 }
 
+/*
+ * Check all nodes whether they contain reclaimable pages.
+ * For a quick scan, we make use of scan_nodes, which lets us skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough fresh information, so we need to double-check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+       int nid;
+
+       /*
+        * Quick check, making use of scan_nodes:
+        * we can skip unused nodes.
+        */
+       if (!nodes_empty(memcg->scan_nodes)) {
+               for (nid = first_node(memcg->scan_nodes);
+                    nid < MAX_NUMNODES;
+                    nid = next_node(nid, memcg->scan_nodes)) {
+
+                       if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+                               return true;
+               }
+       }
+       /*
+        * Check the rest of the nodes.
+        */
+       for_each_node_state(nid, N_MEMORY) {
+               if (node_isset(nid, memcg->scan_nodes))
+                       continue;
+               if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+                       return true;
+       }
+       return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
        return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *     a) it is over its soft limit
- *     b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-               struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-       struct mem_cgroup *parent;
-
-       if (!memcg)
-               memcg = root_mem_cgroup;
-       parent = memcg;
-
-       if (res_counter_soft_limit_excess(&memcg->res))
-               return VISIT;
+       return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-       /*
-        * If any parent up to the root in the hierarchy is over its soft limit
-        * then we have to obey and reclaim from this group as well.
-        */
-       while ((parent = parent_mem_cgroup(parent))) {
-               if (res_counter_soft_limit_excess(&parent->res))
-                       return VISIT;
-               if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+                                  struct zone *zone,
+                                  gfp_t gfp_mask,
+                                  unsigned long *total_scanned)
+{
+       struct mem_cgroup *victim = NULL;
+       int total = 0;
+       int loop = 0;
+       unsigned long excess;
+       unsigned long nr_scanned;
+       struct mem_cgroup_reclaim_cookie reclaim = {
+               .zone = zone,
+               .priority = 0,
+       };
+
+       excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+       while (1) {
+               victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+               if (!victim) {
+                       loop++;
+                       if (loop >= 2) {
+                               /*
+                                * If we have not been able to reclaim
+                                * anything, it might be because there are
+                                * no reclaimable pages under this hierarchy.
+                                */
+                               if (!total)
+                                       break;
+                               /*
+                                * We want to do more targeted reclaim.
+                                * excess >> 2 is neither so large that we
+                                * reclaim too much, nor so small that we keep
+                                * coming back to reclaim from this cgroup.
+                                */
+                               if (total >= (excess >> 2) ||
+                                       (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+                                       break;
+                       }
+                       continue;
+               }
+               if (!mem_cgroup_reclaimable(victim, false))
+                       continue;
+               total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+                                                    zone, &nr_scanned);
+               *total_scanned += nr_scanned;
+               if (!res_counter_soft_limit_excess(&root_memcg->res))
                        break;
        }
-
-       if (!atomic_read(&memcg->children_in_excess))
-               return SKIP_TREE;
-       return SKIP;
+       mem_cgroup_iter_break(root_memcg, victim);
+       return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -2018,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
                memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-       bool locked;
-       int wakeups;
-
        if (!current->memcg_oom.may_oom)
                return;
-
-       current->memcg_oom.in_memcg_oom = 1;
-
        /*
-        * As with any blocking lock, a contender needs to start
-        * listening for wakeups before attempting the trylock,
-        * otherwise it can miss the wakeup from the unlock and sleep
-        * indefinitely.  This is just open-coded because our locking
-        * is so particular to memcg hierarchies.
+        * We are in the middle of the charge context here, so we
+        * don't want to block when potentially sitting on a callstack
+        * that holds all kinds of filesystem and mm locks.
+        *
+        * Also, the caller may handle a failed allocation gracefully
+        * (like optional page cache readahead) and so an OOM killer
+        * invocation might not even be necessary.
+        *
+        * That's why we don't do anything here except remember the
+        * OOM context and then deal with it at the end of the page
+        * fault when the stack is unwound, the locks are released,
+        * and when we know whether the fault was overall successful.
         */
-       wakeups = atomic_read(&memcg->oom_wakeups);
-       mem_cgroup_mark_under_oom(memcg);
-
-       locked = mem_cgroup_oom_trylock(memcg);
-
-       if (locked)
-               mem_cgroup_oom_notify(memcg);
-
-       if (locked && !memcg->oom_kill_disable) {
-               mem_cgroup_unmark_under_oom(memcg);
-               mem_cgroup_out_of_memory(memcg, mask, order);
-               mem_cgroup_oom_unlock(memcg);
-               /*
-                * There is no guarantee that an OOM-lock contender
-                * sees the wakeups triggered by the OOM kill
-                * uncharges.  Wake any sleepers explicitely.
-                */
-               memcg_oom_recover(memcg);
-       } else {
-               /*
-                * A system call can just return -ENOMEM, but if this
-                * is a page fault and somebody else is handling the
-                * OOM already, we need to sleep on the OOM waitqueue
-                * for this memcg until the situation is resolved.
-                * Which can take some time because it might be
-                * handled by a userspace task.
-                *
-                * However, this is the charge context, which means
-                * that we may sit on a large call stack and hold
-                * various filesystem locks, the mmap_sem etc. and we
-                * don't want the OOM handler to deadlock on them
-                * while we sit here and wait.  Store the current OOM
-                * context in the task_struct, then return -ENOMEM.
-                * At the end of the page fault handler, with the
-                * stack unwound, pagefault_out_of_memory() will check
-                * back with us by calling
-                * mem_cgroup_oom_synchronize(), possibly putting the
-                * task to sleep.
-                */
-               current->memcg_oom.oom_locked = locked;
-               current->memcg_oom.wakeups = wakeups;
-               css_get(&memcg->css);
-               current->memcg_oom.wait_on_memcg = memcg;
-       }
+       css_get(&memcg->css);
+       current->memcg_oom.memcg = memcg;
+       current->memcg_oom.gfp_mask = mask;
+       current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation.  Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+       struct mem_cgroup *memcg = current->memcg_oom.memcg;
        struct oom_wait_info owait;
-       struct mem_cgroup *memcg;
+       bool locked;
 
        /* OOM is global, do not handle */
-       if (!current->memcg_oom.in_memcg_oom)
-               return false;
-
-       /*
-        * We invoked the OOM killer but there is a chance that a kill
-        * did not free up any charges.  Everybody else might already
-        * be sleeping, so restart the fault and keep the rampage
-        * going until some charges are released.
-        */
-       memcg = current->memcg_oom.wait_on_memcg;
        if (!memcg)
-               goto out;
+               return false;
 
-       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-               goto out_memcg;
+       if (!handle)
+               goto cleanup;
 
        owait.memcg = memcg;
        owait.wait.flags = 0;
@@ -2130,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
        INIT_LIST_HEAD(&owait.wait.task_list);
 
        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-       /* Only sleep if we didn't miss any wakeups since OOM */
-       if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+       mem_cgroup_mark_under_oom(memcg);
+
+       locked = mem_cgroup_oom_trylock(memcg);
+
+       if (locked)
+               mem_cgroup_oom_notify(memcg);
+
+       if (locked && !memcg->oom_kill_disable) {
+               mem_cgroup_unmark_under_oom(memcg);
+               finish_wait(&memcg_oom_waitq, &owait.wait);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                        current->memcg_oom.order);
+       } else {
                schedule();
-       finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-       mem_cgroup_unmark_under_oom(memcg);
-       if (current->memcg_oom.oom_locked) {
+               mem_cgroup_unmark_under_oom(memcg);
+               finish_wait(&memcg_oom_waitq, &owait.wait);
+       }
+
+       if (locked) {
                mem_cgroup_oom_unlock(memcg);
                /*
                 * There is no guarantee that an OOM-lock contender
@@ -2145,10 +2249,9 @@ out_memcg:
                 */
                memcg_oom_recover(memcg);
        }
+cleanup:
+       current->memcg_oom.memcg = NULL;
        css_put(&memcg->css);
-       current->memcg_oom.wait_on_memcg = NULL;
-out:
-       current->memcg_oom.in_memcg_oom = 0;
        return true;
 }
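A minimal sketch of the intended call site, as described by the comments above; the removed comment earlier in this diff names pagefault_out_of_memory() as that caller, but the caller itself lives outside this file, so treat the surrounding function body as an assumption:

	/* Sketch only: end-of-fault handling, not part of this diff. */
	void pagefault_out_of_memory(void)
	{
		/* Let a recorded memcg OOM be handled (kill or wait) first. */
		if (mem_cgroup_oom_synchronize(true))
			return;

		/* ... otherwise fall back to the global OOM killer ... */
	}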
 
@@ -2562,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                     || fatal_signal_pending(current)))
                goto bypass;
 
+       if (unlikely(task_in_memcg_oom(current)))
+               goto bypass;
+
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
@@ -2660,6 +2766,8 @@ done:
        return 0;
 nomem:
        *ptr = NULL;
+       if (gfp_mask & __GFP_NOFAIL)
+               return 0;
        return -ENOMEM;
 bypass:
        *ptr = root_mem_cgroup;
@@ -2812,7 +2920,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
        unlock_page_cgroup(pc);
 
        /*
-        * "charge_statistics" updated event counter.
+        * "charge_statistics" updated the event counter. Then, check it,
+        * and insert the ancestors (and their ancestors) into the soft-limit
+        * RB-tree if they exceed their soft limit.
         */
        memcg_check_events(memcg, page);
 }
@@ -4647,6 +4757,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
        return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+                                           gfp_t gfp_mask,
+                                           unsigned long *total_scanned)
+{
+       unsigned long nr_reclaimed = 0;
+       struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+       unsigned long reclaimed;
+       int loop = 0;
+       struct mem_cgroup_tree_per_zone *mctz;
+       unsigned long long excess;
+       unsigned long nr_scanned;
+
+       if (order > 0)
+               return 0;
+
+       mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+       /*
+        * This loop can run for a while, especially if mem_cgroups continuously
+        * keep exceeding their soft limit and putting the system under
+        * pressure.
+        */
+       do {
+               if (next_mz)
+                       mz = next_mz;
+               else
+                       mz = mem_cgroup_largest_soft_limit_node(mctz);
+               if (!mz)
+                       break;
+
+               nr_scanned = 0;
+               reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+                                                   gfp_mask, &nr_scanned);
+               nr_reclaimed += reclaimed;
+               *total_scanned += nr_scanned;
+               spin_lock(&mctz->lock);
+
+               /*
+                * If we failed to reclaim anything from this memory cgroup
+                * it is time to move on to the next cgroup
+                */
+               next_mz = NULL;
+               if (!reclaimed) {
+                       do {
+                               /*
+                                * Loop until we find yet another one.
+                                *
+                                * By the time we get the soft_limit lock
+                                * again, someone might have added the
+                                * group back on the RB tree. Iterate to
+                                * make sure we get a different memcg.
+                                * mem_cgroup_largest_soft_limit_node returns
+                                * NULL if no other cgroup is present on
+                                * the tree
+                                */
+                               next_mz =
+                               __mem_cgroup_largest_soft_limit_node(mctz);
+                               if (next_mz == mz)
+                                       css_put(&next_mz->memcg->css);
+                               else /* next_mz == NULL or other memcg */
+                                       break;
+                       } while (1);
+               }
+               __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+               excess = res_counter_soft_limit_excess(&mz->memcg->res);
+               /*
+                * One school of thought says that we should not add
+                * back the node to the tree if reclaim returns 0.
+                * But our reclaim could return 0 simply because, due
+                * to priority, we are exposing a smaller subset of
+                * memory to reclaim from. Consider this as a longer
+                * term TODO.
+                */
+               /* If excess == 0, no tree ops */
+               __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+               spin_unlock(&mctz->lock);
+               css_put(&mz->memcg->css);
+               loop++;
+               /*
+                * Could not reclaim anything and there are no more
+                * mem cgroups to try or we seem to be looping without
+                * reclaiming anything.
+                */
+               if (!nr_reclaimed &&
+                       (next_mz == NULL ||
+                       loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+                       break;
+       } while (!nr_reclaimed);
+       if (next_mz)
+               css_put(&next_mz->memcg->css);
+       return nr_reclaimed;
+}
+
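For orientation, a hedged sketch of how the reclaim path consumes this function; the real callers are in mm/vmscan.c and are not part of this diff, so the surrounding variable names (sc, zone) are assumptions:

	/* Sketch only: global reclaim tries soft-limit offenders first. */
	unsigned long nr_soft_scanned = 0;
	unsigned long nr_soft_reclaimed;

	nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, sc->order,
							  sc->gfp_mask,
							  &nr_soft_scanned);
	sc->nr_reclaimed += nr_soft_reclaimed;
	sc->nr_scanned += nr_soft_scanned;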
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -5911,6 +6113,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
                lruvec_init(&mz->lruvec);
+               mz->usage_in_excess = 0;
+               mz->on_tree = false;
                mz->memcg = memcg;
        }
        memcg->nodeinfo[node] = pn;
@@ -5966,6 +6170,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
        int node;
        size_t size = memcg_size();
 
+       mem_cgroup_remove_from_trees(memcg);
        free_css_id(&mem_cgroup_subsys, &memcg->css);
 
        for_each_node(node)
@@ -6002,6 +6207,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+       struct mem_cgroup_tree_per_node *rtpn;
+       struct mem_cgroup_tree_per_zone *rtpz;
+       int tmp, node, zone;
+
+       for_each_node(node) {
+               tmp = node;
+               if (!node_state(node, N_NORMAL_MEMORY))
+                       tmp = -1;
+               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+               BUG_ON(!rtpn);
+
+               soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       rtpz = &rtpn->rb_tree_per_zone[zone];
+                       rtpz->rb_root = RB_ROOT;
+                       spin_lock_init(&rtpz->lock);
+               }
+       }
+}
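A note on the tmp = -1 dance above: kzalloc_node() with a node id of -1 means "no preferred node", so for nodes without N_NORMAL_MEMORY the per-node tree is simply allocated from wherever memory is available instead of forcing the allocation onto a memoryless node.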
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6031,7 +6259,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
        vmpressure_init(&memcg->vmpressure);
-       spin_lock_init(&memcg->soft_lock);
 
        return &memcg->css;
 
@@ -6109,13 +6336,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
        mem_cgroup_invalidate_reclaim_iterators(memcg);
        mem_cgroup_reparent_charges(memcg);
-       if (memcg->soft_contributed) {
-               while ((memcg = parent_mem_cgroup(memcg)))
-                       atomic_dec(&memcg->children_in_excess);
-
-               if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-                       atomic_dec(&root_mem_cgroup->children_in_excess);
-       }
        mem_cgroup_destroy_all_caches(memcg);
        vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7010,7 @@ static int __init mem_cgroup_init(void)
 {
        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
        enable_swap_cgroup();
+       mem_cgroup_soft_limit_tree_init();
        memcg_stock_init();
        return 0;
 }