diff --git a/mm/vmscan.c b/mm/vmscan.c
index f80a54da5f7f7e41fec9bc8eefc1a0063cf06501..2f45c0520f43fbcb5696063172cd281553302f9d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -972,7 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
-               int ret = SWAP_SUCCESS;
 
                cond_resched();
 
@@ -1145,13 +1144,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page)) {
-                       switch (ret = try_to_unmap(page,
-                               ttu_flags | TTU_BATCH_FLUSH)) {
-                       case SWAP_FAIL:
+                       if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
                                nr_unmap_fail++;
                                goto activate_locked;
-                       case SWAP_SUCCESS:
-                               ; /* try to free the page below */
                        }
                }
 
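The hunk above leans on try_to_unmap() now returning a plain bool (true once
the page is fully unmapped), which is also why the ret local removed in the
first hunk became dead. A side-by-side sketch of the two calling conventions
(illustrative only, not part of the patch):

/* Old convention: callers dispatch on SWAP_* codes. */
switch (try_to_unmap(page, flags)) {
case SWAP_FAIL:
        goto activate_locked;
case SWAP_SUCCESS:
        break;  /* try to free the page below */
}

/* New convention: only the success/fail split matters. */
if (!try_to_unmap(page, flags))
        goto activate_locked;
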
@@ -2011,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
  * Both inactive lists should also be large enough that each inactive
  * page has a chance to be referenced again before it is reclaimed.
  *
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
  * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
  * on this LRU, maintained by the pageout code. A zone->inactive_ratio
  * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -2027,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-                                               struct scan_control *sc, bool trace)
+                                struct mem_cgroup *memcg,
+                                struct scan_control *sc, bool actual_reclaim)
 {
-       unsigned long inactive_ratio;
-       unsigned long inactive, active;
-       enum lru_list inactive_lru = file * LRU_FILE;
        enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       enum lru_list inactive_lru = file * LRU_FILE;
+       unsigned long inactive, active;
+       unsigned long inactive_ratio;
+       unsigned long refaults;
        unsigned long gb;
 
        /*
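
The table rows follow from the square-root rule applied in the next hunk: gb
is the LRU size in gigabytes and the target ratio is int_sqrt(10 * gb), so the
inactive list is held near total / (ratio + 1). A small userspace sketch that
reproduces the table (int_sqrt() approximated with floor(sqrt()); illustrative
only, compile with -lm):

#include <stdio.h>
#include <math.h>

int main(void)
{
        /* LRU sizes in GB matching the table rows above. */
        unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };

        for (int i = 0; i < 5; i++) {
                unsigned long gb = sizes_gb[i];
                /* int_sqrt(10 * gb): floor of the square root */
                unsigned long ratio = (unsigned long)sqrt(10.0 * gb);

                printf("%7luGB  ratio %4lu  max inactive %6.1fGB\n",
                       gb, ratio, (double)gb / (ratio + 1));
        }
        return 0;
}

This lines up with the table: 1GB gives ratio 3 (250MB inactive), 100GB gives
31 (about 3GB), 10TB gives 320 (about 32GB).
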
@@ -2045,27 +2045,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
        inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
        active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
-       gb = (inactive + active) >> (30 - PAGE_SHIFT);
-       if (gb)
-               inactive_ratio = int_sqrt(10 * gb);
+       if (memcg)
+               refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
        else
-               inactive_ratio = 1;
+               refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+       /*
+        * When refaults are being observed, it means a new workingset
+        * is being established. Disable active list protection to get
+        * rid of the stale workingset quickly.
+        */
+       if (file && actual_reclaim && lruvec->refaults != refaults) {
+               inactive_ratio = 0;
+       } else {
+               gb = (inactive + active) >> (30 - PAGE_SHIFT);
+               if (gb)
+                       inactive_ratio = int_sqrt(10 * gb);
+               else
+                       inactive_ratio = 1;
+       }
 
-       if (trace)
-               trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
-                               sc->reclaim_idx,
-                               lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
-                               lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
-                               inactive_ratio, file);
+       if (actual_reclaim)
+               trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+                       lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+                       lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+                       inactive_ratio, file);
 
        return inactive * inactive_ratio < active;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-                                struct lruvec *lruvec, struct scan_control *sc)
+                                struct lruvec *lruvec, struct mem_cgroup *memcg,
+                                struct scan_control *sc)
 {
        if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru),
+                                        memcg, sc, true))
                        shrink_active_list(nr_to_scan, lruvec, sc, lru);
                return 0;
        }
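
The heart of this hunk is the new override: while the WORKINGSET_ACTIVATE
counter differs from the last snapshot in lruvec->refaults (re-armed by
snapshot_refaults() further down), file LRUs get inactive_ratio = 0, and since
inactive * 0 < active holds for any non-empty active list, the function keeps
answering "low" and shrink_list() keeps deactivating until refaults stop. A
condensed userspace model of the policy (kernel types and counters elided;
illustrative only):

#include <stdbool.h>
#include <stdio.h>
#include <math.h>

/* Condensed model of the new inactive_list_is_low() policy. */
static bool inactive_is_low(unsigned long inactive, unsigned long active,
                            bool file, bool actual_reclaim,
                            unsigned long snapshot, unsigned long refaults)
{
        /* 30 - PAGE_SHIFT = 18 for 4K pages: LRU size in GB */
        unsigned long gb = (inactive + active) >> 18;
        unsigned long ratio;

        if (file && actual_reclaim && snapshot != refaults)
                ratio = 0;      /* refaults seen: drop active list protection */
        else
                ratio = gb ? (unsigned long)sqrt(10.0 * gb) : 1;

        return inactive * ratio < active;
}

int main(void)
{
        /* Counter unchanged since the snapshot: sqrt-based target. */
        printf("%d\n", inactive_is_low(5000, 5000, true, true, 42, 42)); /* 0 */
        /* Counter moved: any non-empty active list reads as low. */
        printf("%d\n", inactive_is_low(9000, 1, true, true, 42, 43));    /* 1 */
        return 0;
}
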
@@ -2174,7 +2189,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * lruvec even if it has plenty of old anonymous pages unless the
         * system is under heavy pressure.
         */
-       if (!inactive_list_is_low(lruvec, true, sc, false) &&
+       if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
            lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
@@ -2325,7 +2340,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
                                nr[lru] -= nr_to_scan;
 
                                nr_reclaimed += shrink_list(lru, nr_to_scan,
-                                                           lruvec, sc);
+                                                           lruvec, memcg, sc);
                        }
                }
 
@@ -2392,7 +2407,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_list_is_low(lruvec, false, sc, true))
+       if (inactive_list_is_low(lruvec, false, memcg, sc, true))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
@@ -2510,7 +2525,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                                        sc->memcg_low_skipped = 1;
                                        continue;
                                }
-                               mem_cgroup_events(memcg, MEMCG_LOW, 1);
+                               mem_cgroup_event(memcg, MEMCG_LOW);
                        }
 
                        reclaimed = sc->nr_reclaimed;
@@ -2708,6 +2723,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        sc->gfp_mask = orig_mask;
 }
 
+static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+{
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
+       do {
+               unsigned long refaults;
+               struct lruvec *lruvec;
+
+               if (memcg)
+                       refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
+               else
+                       refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+               lruvec = mem_cgroup_lruvec(pgdat, memcg);
+               lruvec->refaults = refaults;
+       } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+}
+
 /*
  * This is the main entry point to direct page reclaim.
  *
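
snapshot_refaults() above is the other half of the handshake: it walks the
memcg hierarchy below root_memcg (when memory cgroups are not in use the
iterator yields NULL, hence the node_page_state() fallback) and stores the
current WORKINGSET_ACTIVATE count in each lruvec. The lruvec->refaults field
is assumed to be introduced elsewhere in this series. The next reclaim cycle
then reads any difference between the live counter and the snapshot as
"refaulting is happening now". A minimal model of the handshake (illustrative
only):

/* Minimal model of the snapshot/compare handshake. */
static unsigned long workingset_activate; /* live counter, bumped on refault */
static unsigned long lruvec_refaults;     /* snapshot from the last cycle */

static int refaulting_now(void)
{
        return lruvec_refaults != workingset_activate;
}

static void end_of_reclaim_cycle(void)
{
        lruvec_refaults = workingset_activate;  /* re-arm for the next cycle */
}

Both direct reclaim (the do_try_to_free_pages() hunks below) and kswapd (the
balance_pgdat() hunk) re-arm the snapshot when a reclaim cycle ends.
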
@@ -2728,6 +2762,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                          struct scan_control *sc)
 {
        int initial_priority = sc->priority;
+       pg_data_t *last_pgdat;
+       struct zoneref *z;
+       struct zone *zone;
 retry:
        delayacct_freepages_start();
 
@@ -2754,6 +2791,15 @@ retry:
                        sc->may_writepage = 1;
        } while (--sc->priority >= 0);
 
+       last_pgdat = NULL;
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+                                       sc->nodemask) {
+               if (zone->zone_pgdat == last_pgdat)
+                       continue;
+               last_pgdat = zone->zone_pgdat;
+               snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+       }
+
        delayacct_freepages_end();
 
        if (sc->nr_reclaimed)
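
Two details in the hunk above: the snapshot runs only after the priority loop
has finished, so one do_try_to_free_pages() invocation counts as one cycle,
and it runs once per node rather than once per zone, since consecutive zones
of the same node share a pgdat and the last_pgdat check skips them. The dedup
idiom in isolation (sketch):

/* Once-per-node walk over a zonelist (sketch of the idiom above). */
last_pgdat = NULL;
for_each_zone_zonelist_nodemask(zone, z, zonelist, idx, nodemask) {
        if (zone->zone_pgdat == last_pgdat)     /* node already visited */
                continue;
        last_pgdat = zone->zone_pgdat;
        /* ... per-node work ... */
}
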
@@ -2990,6 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
        struct zonelist *zonelist;
        unsigned long nr_reclaimed;
        int nid;
+       unsigned int noreclaim_flag;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
@@ -3016,9 +3063,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                            sc.gfp_mask,
                                            sc.reclaim_idx);
 
-       current->flags |= PF_MEMALLOC;
+       noreclaim_flag = memalloc_noreclaim_save();
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
-       current->flags &= ~PF_MEMALLOC;
+       memalloc_noreclaim_restore(noreclaim_flag);
 
        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
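The open-coded PF_MEMALLOC juggling gives way to the
memalloc_noreclaim_save()/memalloc_noreclaim_restore() pair. Assuming their
v4.12 definitions in <linux/sched/mm.h>, they amount to roughly this sketch:

/* Rough shape of the v4.12 helpers (see <linux/sched/mm.h>). */
static inline unsigned int memalloc_noreclaim_save(void)
{
        unsigned int flags = current->flags & PF_MEMALLOC;

        current->flags |= PF_MEMALLOC;
        return flags;   /* remember whether the bit was already set */
}

static inline void memalloc_noreclaim_restore(unsigned int flags)
{
        current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}

The point is that restore puts back whatever state the caller found instead of
unconditionally clearing the bit.
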
@@ -3038,7 +3085,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        do {
                struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-               if (inactive_list_is_low(lruvec, false, sc, true))
+               if (inactive_list_is_low(lruvec, false, memcg, sc, true))
                        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                           sc, LRU_ACTIVE_ANON);
 
@@ -3285,6 +3332,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                pgdat->kswapd_failures++;
 
 out:
+       snapshot_refaults(NULL, pgdat);
        /*
         * Return the order kswapd stopped reclaiming at as
         * prepare_kswapd_sleep() takes it into account. If another caller
@@ -3542,8 +3590,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
        struct task_struct *p = current;
        unsigned long nr_reclaimed;
+       unsigned int noreclaim_flag;
 
-       p->flags |= PF_MEMALLOC;
+       noreclaim_flag = memalloc_noreclaim_save();
        lockdep_set_current_reclaim_state(sc.gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
@@ -3552,7 +3601,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 
        p->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();
-       p->flags &= ~PF_MEMALLOC;
+       memalloc_noreclaim_restore(noreclaim_flag);
 
        return nr_reclaimed;
 }
@@ -3717,6 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
        int classzone_idx = gfp_zone(gfp_mask);
+       unsigned int noreclaim_flag;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
@@ -3734,7 +3784,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
         * and we also need to be able to write out pages for RECLAIM_WRITE
         * and RECLAIM_UNMAP.
         */
-       p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+       noreclaim_flag = memalloc_noreclaim_save();
+       p->flags |= PF_SWAPWRITE;
        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
@@ -3750,7 +3801,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
        }
 
        p->reclaim_state = NULL;
-       current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+       current->flags &= ~PF_SWAPWRITE;
+       memalloc_noreclaim_restore(noreclaim_flag);
        lockdep_clear_current_reclaim_state();
        return sc.nr_reclaimed >= nr_pages;
 }
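
The __node_reclaim() hunks show why the pair matters: reclaim can be entered
with PF_MEMALLOC already set by an outer context, and the old unconditional
clear would have erased that outer bit on the way out. PF_SWAPWRITE keeps the
raw set/clear since nothing in this path nests it. A sketch of the hazard the
conversion avoids (illustrative only):

/*
 * Hazard avoided by save/restore (sketch):
 *
 *   outer: current->flags |= PF_MEMALLOC;
 *     inner: noreclaim_flag = memalloc_noreclaim_save();  // remembers "set"
 *     inner: ... reclaim ...
 *     inner: memalloc_noreclaim_restore(noreclaim_flag);  // bit stays set
 *   outer: still relies on PF_MEMALLOC, which is intact
 *
 * With the old 'current->flags &= ~PF_MEMALLOC' the inner path would
 * have cleared the outer context's bit unconditionally.
 */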