git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 28 Jul 2016 23:36:48 +0000 (16:36 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 28 Jul 2016 23:36:48 +0000 (16:36 -0700)
Merge more updates from Andrew Morton:
 "The rest of MM"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (101 commits)
  mm, compaction: simplify contended compaction handling
  mm, compaction: introduce direct compaction priority
  mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations
  mm, page_alloc: make THP-specific decisions more generic
  mm, page_alloc: restructure direct compaction handling in slowpath
  mm, page_alloc: don't retry initial attempt in slowpath
  mm, page_alloc: set alloc_flags only once in slowpath
  lib/stackdepot.c: use __GFP_NOWARN for stack allocations
  mm, kasan: switch SLUB to stackdepot, enable memory quarantine for SLUB
  mm, kasan: account for object redzone in SLUB's nearest_obj()
  mm: fix use-after-free if memory allocation failed in vma_adjust()
  zsmalloc: Delete an unnecessary check before the function call "iput"
  mm/memblock.c: fix index adjustment error in __next_mem_range_rev()
  mem-hotplug: alloc new page from a nearest neighbor node when mem-offline
  mm: optimize copy_page_to/from_iter_iovec
  mm: add cond_resched() to generic_swapfile_activate()
  Revert "mm, mempool: only set __GFP_NOMEMALLOC if there are free elements"
  mm, compaction: don't isolate PageWriteback pages in MIGRATE_SYNC_LIGHT mode
  mm: hwpoison: remove incorrect comments
  make __section_nr() more efficient
  ...

90 files changed:
Documentation/cgroup-v1/memcg_test.txt
Documentation/cgroup-v1/memory.txt
arch/arm64/mm/init.c
arch/s390/appldata/appldata_mem.c
arch/tile/mm/pgtable.c
drivers/base/node.c
drivers/staging/android/lowmemorykiller.c
drivers/staging/lustre/lustre/osc/osc_cache.c
fs/fs-writeback.c
fs/fuse/file.c
fs/nfs/internal.h
fs/nfs/write.c
fs/proc/base.c
fs/proc/meminfo.c
include/linux/backing-dev.h
include/linux/compaction.h
include/linux/gfp.h
include/linux/huge_mm.h
include/linux/kasan.h
include/linux/kdb.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/memremap.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/oom.h
include/linux/sched.h
include/linux/slab_def.h
include/linux/slub_def.h
include/linux/swap.h
include/linux/topology.h
include/linux/vm_event_item.h
include/linux/vmstat.h
include/linux/writeback.h
include/trace/events/compaction.h
include/trace/events/mmflags.h
include/trace/events/vmscan.h
include/trace/events/writeback.h
kernel/cpuset.c
kernel/fork.c
kernel/freezer.c
kernel/memremap.c
kernel/power/snapshot.c
kernel/printk/printk.c
kernel/sysctl.c
lib/Kconfig.kasan
lib/iov_iter.c
lib/stackdepot.c
mm/Kconfig
mm/backing-dev.c
mm/compaction.c
mm/filemap.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/kasan/Makefile
mm/kasan/kasan.c
mm/kasan/kasan.h
mm/kasan/report.c
mm/khugepaged.c
mm/kmemleak.c
mm/memblock.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/mempool.c
mm/migrate.c
mm/mlock.c
mm/mmap.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_idle.c
mm/page_io.c
mm/rmap.c
mm/shmem.c
mm/slab.h
mm/slub.c
mm/sparse.c
mm/swap.c
mm/swap_state.c
mm/util.c
mm/vmscan.c
mm/vmstat.c
mm/workingset.c
mm/zsmalloc.c
tools/perf/builtin-kmem.c

diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
index 8870b02121502430a420e129b61e7d887ee0c762..78a8c2963b38816b152cb07c0f6b82a953b32eaa 100644
@@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 
 8. LRU
         Each memcg has its own private LRU. Now, its handling is under global
-       VM's control (means that it's handled under global zone->lru_lock).
+       VM's control (means that it's handled under global zone_lru_lock).
        Almost all routines around memcg's LRU is called by global LRU's
-       list management functions under zone->lru_lock().
+       list management functions under zone_lru_lock().
 
        A special function is mem_cgroup_isolate_pages(). This scans
        memcg's private LRU and call __isolate_lru_page() to extract a page
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index b14abf217239e892e19cd9f8c6366e3646ee640f..946e69103cdd78134d2071483aa12b22c699e670 100644
@@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered.
    Other lock order is following:
    PG_locked.
    mm->page_table_lock
-       zone->lru_lock
+       zone_lru_lock
          lock_page_cgroup.
   In many cases, just lock_page_cgroup() is called.
   per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-  zone->lru_lock, it has no lock of its own.
+  zone_lru_lock, it has no lock of its own.
 
 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
 
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 2ade7a6a10a704e880837e4fcae2f4ae1cbb6ae9..bbb7ee76e319f933aac802f005c056693bb7d87f 100644
@@ -224,7 +224,7 @@ void __init arm64_memblock_init(void)
         * via the linear mapping.
         */
        if (memory_limit != (phys_addr_t)ULLONG_MAX) {
-               memblock_enforce_memory_limit(memory_limit);
+               memblock_mem_limit_remove_map(memory_limit);
                memblock_add(__pa(_text), (u64)(_end - _text));
        }
 
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index edcf2a70694204c9685cb6ed019f8c93496749c6..598df5708501734307565d3d7ba7c2e8b3472f6e 100644
@@ -102,7 +102,7 @@ static void appldata_get_mem_data(void *data)
        mem_data->totalhigh = P2K(val.totalhigh);
        mem_data->freehigh  = P2K(val.freehigh);
        mem_data->bufferram = P2K(val.bufferram);
-       mem_data->cached    = P2K(global_page_state(NR_FILE_PAGES)
+       mem_data->cached    = P2K(global_node_page_state(NR_FILE_PAGES)
                                - val.bufferram);
 
        si_swapinfo(&val);
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index c4d5bf841a7f19bc727abbff423ed12b20666f29..7cc6ee7f1a58391af570f8ada0507af631639bb9 100644
@@ -45,20 +45,20 @@ void show_mem(unsigned int filter)
        struct zone *zone;
 
        pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n",
-              (global_page_state(NR_ACTIVE_ANON) +
-               global_page_state(NR_ACTIVE_FILE)),
-              (global_page_state(NR_INACTIVE_ANON) +
-               global_page_state(NR_INACTIVE_FILE)),
-              global_page_state(NR_FILE_DIRTY),
-              global_page_state(NR_WRITEBACK),
-              global_page_state(NR_UNSTABLE_NFS),
+              (global_node_page_state(NR_ACTIVE_ANON) +
+               global_node_page_state(NR_ACTIVE_FILE)),
+              (global_node_page_state(NR_INACTIVE_ANON) +
+               global_node_page_state(NR_INACTIVE_FILE)),
+              global_node_page_state(NR_FILE_DIRTY),
+              global_node_page_state(NR_WRITEBACK),
+              global_node_page_state(NR_UNSTABLE_NFS),
               global_page_state(NR_FREE_PAGES),
               (global_page_state(NR_SLAB_RECLAIMABLE) +
                global_page_state(NR_SLAB_UNRECLAIMABLE)),
-              global_page_state(NR_FILE_MAPPED),
+              global_node_page_state(NR_FILE_MAPPED),
               global_page_state(NR_PAGETABLE),
               global_page_state(NR_BOUNCE),
-              global_page_state(NR_FILE_PAGES),
+              global_node_page_state(NR_FILE_PAGES),
               get_nr_swap_pages());
 
        for_each_zone(zone) {
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 51c7db2c4ee2a6bc06cc5ee2c22758fc91cb7c66..29cd96661b3077f5272c40e17fd5d5f1b9b9d952 100644
@@ -56,6 +56,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 {
        int n;
        int nid = dev->id;
+       struct pglist_data *pgdat = NODE_DATA(nid);
        struct sysinfo i;
 
        si_meminfo_node(&i, nid);
@@ -74,16 +75,16 @@ static ssize_t node_read_meminfo(struct device *dev,
                       nid, K(i.totalram),
                       nid, K(i.freeram),
                       nid, K(i.totalram - i.freeram),
-                      nid, K(node_page_state(nid, NR_ACTIVE_ANON) +
-                               node_page_state(nid, NR_ACTIVE_FILE)),
-                      nid, K(node_page_state(nid, NR_INACTIVE_ANON) +
-                               node_page_state(nid, NR_INACTIVE_FILE)),
-                      nid, K(node_page_state(nid, NR_ACTIVE_ANON)),
-                      nid, K(node_page_state(nid, NR_INACTIVE_ANON)),
-                      nid, K(node_page_state(nid, NR_ACTIVE_FILE)),
-                      nid, K(node_page_state(nid, NR_INACTIVE_FILE)),
-                      nid, K(node_page_state(nid, NR_UNEVICTABLE)),
-                      nid, K(node_page_state(nid, NR_MLOCK)));
+                      nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
+                               node_page_state(pgdat, NR_ACTIVE_FILE)),
+                      nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
+                               node_page_state(pgdat, NR_INACTIVE_FILE)),
+                      nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+                      nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+                      nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+                      nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+                      nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
+                      nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));
 
 #ifdef CONFIG_HIGHMEM
        n += sprintf(buf + n,
@@ -117,31 +118,30 @@ static ssize_t node_read_meminfo(struct device *dev,
                       "Node %d ShmemPmdMapped: %8lu kB\n"
 #endif
                        ,
-                      nid, K(node_page_state(nid, NR_FILE_DIRTY)),
-                      nid, K(node_page_state(nid, NR_WRITEBACK)),
-                      nid, K(node_page_state(nid, NR_FILE_PAGES)),
-                      nid, K(node_page_state(nid, NR_FILE_MAPPED)),
-                      nid, K(node_page_state(nid, NR_ANON_PAGES)),
+                      nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
+                      nid, K(node_page_state(pgdat, NR_WRITEBACK)),
+                      nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
+                      nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
+                      nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
                       nid, K(i.sharedram),
-                      nid, node_page_state(nid, NR_KERNEL_STACK) *
-                               THREAD_SIZE / 1024,
-                      nid, K(node_page_state(nid, NR_PAGETABLE)),
-                      nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
-                      nid, K(node_page_state(nid, NR_BOUNCE)),
-                      nid, K(node_page_state(nid, NR_WRITEBACK_TEMP)),
-                      nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE) +
-                               node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
-                      nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
+                      nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
+                      nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
+                      nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+                      nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
+                      nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+                      nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) +
+                               sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
+                      nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-                      nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
-                      nid, K(node_page_state(nid, NR_ANON_THPS) *
+                      nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
+                      nid, K(node_page_state(pgdat, NR_ANON_THPS) *
                                       HPAGE_PMD_NR),
-                      nid, K(node_page_state(nid, NR_SHMEM_THPS) *
+                      nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
                                       HPAGE_PMD_NR),
-                      nid, K(node_page_state(nid, NR_SHMEM_PMDMAPPED) *
+                      nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
                                       HPAGE_PMD_NR));
 #else
-                      nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
+                      nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
 #endif
        n += hugetlb_report_node_meminfo(nid, buf + n);
        return n;
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
                       "interleave_hit %lu\n"
                       "local_node %lu\n"
                       "other_node %lu\n",
-                      node_page_state(dev->id, NUMA_HIT),
-                      node_page_state(dev->id, NUMA_MISS),
-                      node_page_state(dev->id, NUMA_FOREIGN),
-                      node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
-                      node_page_state(dev->id, NUMA_LOCAL),
-                      node_page_state(dev->id, NUMA_OTHER));
+                      sum_zone_node_page_state(dev->id, NUMA_HIT),
+                      sum_zone_node_page_state(dev->id, NUMA_MISS),
+                      sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
+                      sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
+                      sum_zone_node_page_state(dev->id, NUMA_LOCAL),
+                      sum_zone_node_page_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
@@ -173,12 +173,18 @@ static ssize_t node_read_vmstat(struct device *dev,
                                struct device_attribute *attr, char *buf)
 {
        int nid = dev->id;
+       struct pglist_data *pgdat = NODE_DATA(nid);
        int i;
        int n = 0;
 
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
                n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
-                            node_page_state(nid, i));
+                            sum_zone_node_page_state(nid, i));
+
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+               n += sprintf(buf+n, "%s %lu\n",
+                            vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+                            node_page_state(pgdat, i));
 
        return n;
 }
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index 24d2745e943778200c64996510814756854fa461..45a1b4ec4ca33000e96faee0cb1cff7933b91580 100644
@@ -72,10 +72,10 @@ static unsigned long lowmem_deathpending_timeout;
 static unsigned long lowmem_count(struct shrinker *s,
                                  struct shrink_control *sc)
 {
-       return global_page_state(NR_ACTIVE_ANON) +
-               global_page_state(NR_ACTIVE_FILE) +
-               global_page_state(NR_INACTIVE_ANON) +
-               global_page_state(NR_INACTIVE_FILE);
+       return global_node_page_state(NR_ACTIVE_ANON) +
+               global_node_page_state(NR_ACTIVE_FILE) +
+               global_node_page_state(NR_INACTIVE_ANON) +
+               global_node_page_state(NR_INACTIVE_FILE);
 }
 
 static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
@@ -91,8 +91,8 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
        short selected_oom_score_adj;
        int array_size = ARRAY_SIZE(lowmem_adj);
        int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
-       int other_file = global_page_state(NR_FILE_PAGES) -
-                                               global_page_state(NR_SHMEM) -
+       int other_file = global_node_page_state(NR_FILE_PAGES) -
+                                               global_node_page_state(NR_SHMEM) -
                                                total_swapcache_pages();
 
        if (lowmem_adj_size < array_size)
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
index d1a7d6beee60f09187dfab9c021a79b139858c0b..d011135802d59313c109005e4c5f760615740c56 100644
@@ -1864,7 +1864,8 @@ void osc_dec_unstable_pages(struct ptlrpc_request *req)
        LASSERT(page_count >= 0);
 
        for (i = 0; i < page_count; i++)
-               dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+               dec_node_page_state(desc->bd_iov[i].kiov_page,
+                                                       NR_UNSTABLE_NFS);
 
        atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr);
        LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
@@ -1898,7 +1899,8 @@ void osc_inc_unstable_pages(struct ptlrpc_request *req)
        LASSERT(page_count >= 0);
 
        for (i = 0; i < page_count; i++)
-               inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS);
+               inc_node_page_state(desc->bd_iov[i].kiov_page,
+                                                       NR_UNSTABLE_NFS);
 
        LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0);
        atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6f9c9f6f515792acc6fd80f51c568d66a1b05389..56c8fda436c0aa0d6a0883415baa4271f1265e84 100644
@@ -1807,8 +1807,8 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
  */
 static unsigned long get_nr_dirty_pages(void)
 {
-       return global_page_state(NR_FILE_DIRTY) +
-               global_page_state(NR_UNSTABLE_NFS) +
+       return global_node_page_state(NR_FILE_DIRTY) +
+               global_node_page_state(NR_UNSTABLE_NFS) +
                get_nr_dirty_inodes();
 }
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9154f8679024f25b33a7dac1607877da2ce3fab9..2382f22a2a8bfc3af619d62ec8bf00a23b73ed80 100644
@@ -1452,7 +1452,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
        list_del(&req->writepages_entry);
        for (i = 0; i < req->num_pages; i++) {
                dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-               dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+               dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
                wb_writeout_inc(&bdi->wb);
        }
        wake_up(&fi->page_waitq);
@@ -1642,7 +1642,7 @@ static int fuse_writepage_locked(struct page *page)
        req->inode = inode;
 
        inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-       inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+       inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
        spin_lock(&fc->lock);
        list_add(&req->writepages_entry, &fi->writepages);
@@ -1756,7 +1756,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
                spin_unlock(&fc->lock);
 
                dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-               dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+               dec_node_page_state(page, NR_WRITEBACK_TEMP);
                wb_writeout_inc(&bdi->wb);
                fuse_writepage_free(fc, new_req);
                fuse_request_free(new_req);
@@ -1855,7 +1855,7 @@ static int fuse_writepages_fill(struct page *page,
        req->page_descs[req->num_pages].length = PAGE_SIZE;
 
        inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-       inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+       inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
        err = 0;
        if (is_writeback && fuse_writepage_in_flight(req, page)) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5154fa65a2f2a20efad5eeb9667bd10974f8aaed..5ea04d87fc653db7bf578853b1e97b165812864e 100644
@@ -623,7 +623,7 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
        if (!cinfo->dreq) {
                struct inode *inode = page_file_mapping(page)->host;
 
-               inc_zone_page_state(page, NR_UNSTABLE_NFS);
+               inc_node_page_state(page, NR_UNSTABLE_NFS);
                inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
                __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
        }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e1c74d3db64de48851c0d758ade6df2741da9d30..593fa21a02c07a9dca3f45ce0ef8f87edad61a5f 100644
@@ -898,7 +898,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 static void
 nfs_clear_page_commit(struct page *page)
 {
-       dec_zone_page_state(page, NR_UNSTABLE_NFS);
+       dec_node_page_state(page, NR_UNSTABLE_NFS);
        dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
                    WB_RECLAIMABLE);
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a11eb7196ec8b814ce1e9ed098253e0f6cfd24a5..31370da2ee7ce9ca2944eb22efb4d512681bc9dc 100644
@@ -1024,23 +1024,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
        char buffer[PROC_NUMBUF];
        int oom_adj = OOM_ADJUST_MIN;
        size_t len;
-       unsigned long flags;
 
        if (!task)
                return -ESRCH;
-       if (lock_task_sighand(task, &flags)) {
-               if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
-                       oom_adj = OOM_ADJUST_MAX;
-               else
-                       oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
-                                 OOM_SCORE_ADJ_MAX;
-               unlock_task_sighand(task, &flags);
-       }
+       if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+               oom_adj = OOM_ADJUST_MAX;
+       else
+               oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+                         OOM_SCORE_ADJ_MAX;
        put_task_struct(task);
        len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
+static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
+{
+       static DEFINE_MUTEX(oom_adj_mutex);
+       struct mm_struct *mm = NULL;
+       struct task_struct *task;
+       int err = 0;
+
+       task = get_proc_task(file_inode(file));
+       if (!task)
+               return -ESRCH;
+
+       mutex_lock(&oom_adj_mutex);
+       if (legacy) {
+               if (oom_adj < task->signal->oom_score_adj &&
+                               !capable(CAP_SYS_RESOURCE)) {
+                       err = -EACCES;
+                       goto err_unlock;
+               }
+               /*
+                * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+                * /proc/pid/oom_score_adj instead.
+                */
+               pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+                         current->comm, task_pid_nr(current), task_pid_nr(task),
+                         task_pid_nr(task));
+       } else {
+               if ((short)oom_adj < task->signal->oom_score_adj_min &&
+                               !capable(CAP_SYS_RESOURCE)) {
+                       err = -EACCES;
+                       goto err_unlock;
+               }
+       }
+
+       /*
+        * Make sure we will check other processes sharing the mm if this is
+        * not vfork which wants its own oom_score_adj.
+        * pin the mm so it doesn't go away and get reused after task_unlock
+        */
+       if (!task->vfork_done) {
+               struct task_struct *p = find_lock_task_mm(task);
+
+               if (p) {
+                       if (atomic_read(&p->mm->mm_users) > 1) {
+                               mm = p->mm;
+                               atomic_inc(&mm->mm_count);
+                       }
+                       task_unlock(p);
+               }
+       }
+
+       task->signal->oom_score_adj = oom_adj;
+       if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+               task->signal->oom_score_adj_min = (short)oom_adj;
+       trace_oom_score_adj_update(task);
+
+       if (mm) {
+               struct task_struct *p;
+
+               rcu_read_lock();
+               for_each_process(p) {
+                       if (same_thread_group(task, p))
+                               continue;
+
+                       /* do not touch kernel threads or the global init */
+                       if (p->flags & PF_KTHREAD || is_global_init(p))
+                               continue;
+
+                       task_lock(p);
+                       if (!p->vfork_done && process_shares_mm(p, mm)) {
+                               pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
+                                               task_pid_nr(p), p->comm,
+                                               p->signal->oom_score_adj, oom_adj,
+                                               task_pid_nr(task), task->comm);
+                               p->signal->oom_score_adj = oom_adj;
+                               if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+                                       p->signal->oom_score_adj_min = (short)oom_adj;
+                       }
+                       task_unlock(p);
+               }
+               rcu_read_unlock();
+               mmdrop(mm);
+       }
+err_unlock:
+       mutex_unlock(&oom_adj_mutex);
+       put_task_struct(task);
+       return err;
+}
+
 /*
  * /proc/pid/oom_adj exists solely for backwards compatibility with previous
  * kernels.  The effective policy is defined by oom_score_adj, which has a
@@ -1054,10 +1138,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
 {
-       struct task_struct *task;
        char buffer[PROC_NUMBUF];
        int oom_adj;
-       unsigned long flags;
        int err;
 
        memset(buffer, 0, sizeof(buffer));
@@ -1077,23 +1159,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                goto out;
        }
 
-       task = get_proc_task(file_inode(file));
-       if (!task) {
-               err = -ESRCH;
-               goto out;
-       }
-
-       task_lock(task);
-       if (!task->mm) {
-               err = -EINVAL;
-               goto err_task_lock;
-       }
-
-       if (!lock_task_sighand(task, &flags)) {
-               err = -ESRCH;
-               goto err_task_lock;
-       }
-
        /*
         * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
         * value is always attainable.
@@ -1103,27 +1168,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
        else
                oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
 
-       if (oom_adj < task->signal->oom_score_adj &&
-           !capable(CAP_SYS_RESOURCE)) {
-               err = -EACCES;
-               goto err_sighand;
-       }
-
-       /*
-        * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
-        * /proc/pid/oom_score_adj instead.
-        */
-       pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
-                 current->comm, task_pid_nr(current), task_pid_nr(task),
-                 task_pid_nr(task));
-
-       task->signal->oom_score_adj = oom_adj;
-       trace_oom_score_adj_update(task);
-err_sighand:
-       unlock_task_sighand(task, &flags);
-err_task_lock:
-       task_unlock(task);
-       put_task_struct(task);
+       err = __set_oom_adj(file, oom_adj, true);
 out:
        return err < 0 ? err : count;
 }
@@ -1140,15 +1185,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
        struct task_struct *task = get_proc_task(file_inode(file));
        char buffer[PROC_NUMBUF];
        short oom_score_adj = OOM_SCORE_ADJ_MIN;
-       unsigned long flags;
        size_t len;
 
        if (!task)
                return -ESRCH;
-       if (lock_task_sighand(task, &flags)) {
-               oom_score_adj = task->signal->oom_score_adj;
-               unlock_task_sighand(task, &flags);
-       }
+       oom_score_adj = task->signal->oom_score_adj;
        put_task_struct(task);
        len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -1157,9 +1198,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos)
 {
-       struct task_struct *task;
        char buffer[PROC_NUMBUF];
-       unsigned long flags;
        int oom_score_adj;
        int err;
 
@@ -1180,39 +1219,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                goto out;
        }
 
-       task = get_proc_task(file_inode(file));
-       if (!task) {
-               err = -ESRCH;
-               goto out;
-       }
-
-       task_lock(task);
-       if (!task->mm) {
-               err = -EINVAL;
-               goto err_task_lock;
-       }
-
-       if (!lock_task_sighand(task, &flags)) {
-               err = -ESRCH;
-               goto err_task_lock;
-       }
-
-       if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
-                       !capable(CAP_SYS_RESOURCE)) {
-               err = -EACCES;
-               goto err_sighand;
-       }
-
-       task->signal->oom_score_adj = (short)oom_score_adj;
-       if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
-               task->signal->oom_score_adj_min = (short)oom_score_adj;
-       trace_oom_score_adj_update(task);
-
-err_sighand:
-       unlock_task_sighand(task, &flags);
-err_task_lock:
-       task_unlock(task);
-       put_task_struct(task);
+       err = __set_oom_adj(file, oom_score_adj, false);
 out:
        return err < 0 ? err : count;
 }
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index cf301a9ef5123ecd2f9a7f3705bbe026dea7558c..09e18fdf61e5b48dd67234b19972a10dd5131662 100644
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
        si_swapinfo(&i);
        committed = percpu_counter_read_positive(&vm_committed_as);
 
-       cached = global_page_state(NR_FILE_PAGES) -
+       cached = global_node_page_state(NR_FILE_PAGES) -
                        total_swapcache_pages() - i.bufferram;
        if (cached < 0)
                cached = 0;
@@ -138,23 +138,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
                K(i.totalswap),
                K(i.freeswap),
-               K(global_page_state(NR_FILE_DIRTY)),
-               K(global_page_state(NR_WRITEBACK)),
-               K(global_page_state(NR_ANON_PAGES)),
-               K(global_page_state(NR_FILE_MAPPED)),
+               K(global_node_page_state(NR_FILE_DIRTY)),
+               K(global_node_page_state(NR_WRITEBACK)),
+               K(global_node_page_state(NR_ANON_MAPPED)),
+               K(global_node_page_state(NR_FILE_MAPPED)),
                K(i.sharedram),
                K(global_page_state(NR_SLAB_RECLAIMABLE) +
                                global_page_state(NR_SLAB_UNRECLAIMABLE)),
                K(global_page_state(NR_SLAB_RECLAIMABLE)),
                K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-               global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
+               global_page_state(NR_KERNEL_STACK_KB),
                K(global_page_state(NR_PAGETABLE)),
 #ifdef CONFIG_QUICKLIST
                K(quicklist_total_size()),
 #endif
-               K(global_page_state(NR_UNSTABLE_NFS)),
+               K(global_node_page_state(NR_UNSTABLE_NFS)),
                K(global_page_state(NR_BOUNCE)),
-               K(global_page_state(NR_WRITEBACK_TEMP)),
+               K(global_node_page_state(NR_WRITEBACK_TEMP)),
                K(vm_commit_limit()),
                K(committed),
                (unsigned long)VMALLOC_TOTAL >> 10,
@@ -164,9 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
                , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-               , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
-               , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
-               , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
+               , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
+               , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
+               , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
 #endif
 #ifdef CONFIG_CMA
                , K(totalcma_pages)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c82794f20110420582d496ae478bc600f9400233..491a91717788591384230eb7d46a5332adf6298f 100644
@@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 }
 
 long congestion_wait(int sync, long timeout);
-long wait_iff_congested(struct zone *zone, int sync, long timeout);
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos);
 
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 1a02dab16646ebc8917a71cc52b0a5491bd49b1c..d4e106b5dc277a387f159b40770b5f8f2c348038 100644
@@ -1,6 +1,18 @@
 #ifndef _LINUX_COMPACTION_H
 #define _LINUX_COMPACTION_H
 
+/*
+ * Determines how hard direct compaction should try to succeed.
+ * Lower value means higher priority, analogically to reclaim priority.
+ */
+enum compact_priority {
+       COMPACT_PRIO_SYNC_LIGHT,
+       MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+       DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+       COMPACT_PRIO_ASYNC,
+       INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
+};
+
 /* Return values for compact_zone() and try_to_compact_pages() */
 /* When adding new states, please adjust include/trace/events/compaction.h */
 enum compact_result {
@@ -43,14 +55,6 @@ enum compact_result {
        COMPACT_PARTIAL,
 };
 
-/* Used to signal whether compaction detected need_sched() or lock contention */
-/* No contention detected */
-#define COMPACT_CONTENDED_NONE 0
-/* Either need_sched() was true or fatal signal pending */
-#define COMPACT_CONTENDED_SCHED        1
-/* Zone lock or lru_lock was contended in async compaction */
-#define COMPACT_CONTENDED_LOCK 2
-
 struct alloc_context; /* in mm/internal.h */
 
 #ifdef CONFIG_COMPACTION
@@ -64,9 +68,8 @@ extern int sysctl_compact_unevictable_allowed;
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
-                       unsigned int order,
-               unsigned int alloc_flags, const struct alloc_context *ac,
-               enum migrate_mode mode, int *contended);
+               unsigned int order, unsigned int alloc_flags,
+               const struct alloc_context *ac, enum compact_priority prio);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern enum compact_result compaction_suitable(struct zone *zone, int order,
@@ -151,14 +154,6 @@ extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
 
 #else
-static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
-                       unsigned int order, int alloc_flags,
-                       const struct alloc_context *ac,
-                       enum migrate_mode mode, int *contended)
-{
-       return COMPACT_CONTINUE;
-}
-
 static inline void compact_pgdat(pg_data_t *pgdat, int order)
 {
 }
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c29e9d347bc697125e6eb02fe2ca0b033cbf83fa..f8041f9de31efc998916537889dd559b9fc375cb 100644
@@ -237,9 +237,11 @@ struct vm_area_struct;
  *   are expected to be movable via page reclaim or page migration. Typically,
  *   pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
  *
- * GFP_TRANSHUGE is used for THP allocations. They are compound allocations
- *   that will fail quickly if memory is not available and will not wake
- *   kswapd on failure.
+ * GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are
+ *   compound allocations that will generally fail quickly if memory is not
+ *   available and will not wake kswapd/kcompactd on failure. The _LIGHT
+ *   version does not attempt reclaim/compaction at all and is by default used
+ *   in page fault path, while the non-light is used by khugepaged.
  */
 #define GFP_ATOMIC     (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
 #define GFP_KERNEL     (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
@@ -254,9 +256,9 @@ struct vm_area_struct;
 #define GFP_DMA32      __GFP_DMA32
 #define GFP_HIGHUSER   (GFP_USER | __GFP_HIGHMEM)
 #define GFP_HIGHUSER_MOVABLE   (GFP_HIGHUSER | __GFP_MOVABLE)
-#define GFP_TRANSHUGE  ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-                        __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \
-                        ~__GFP_RECLAIM)
+#define GFP_TRANSHUGE_LIGHT    ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+                        __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
+#define GFP_TRANSHUGE  (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
 
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 92ce91c03cd0e0165279c01e90e58f1f271f8a4b..6f14de45b5ce0807e01f7c5093d46ab5c9a546bb 100644
@@ -11,7 +11,7 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                          unsigned long addr,
                                          pmd_t *pmd,
                                          unsigned int flags);
-extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
                        struct vm_area_struct *vma,
                        pmd_t *pmd, unsigned long addr, unsigned long next);
 extern int zap_huge_pmd(struct mmu_gather *tlb,
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index ac4b3c46a84d1493d59e22e241c2cc7b9045ce27..c9cf374445d8ee748c1b2260a190b3998e9473a1 100644
@@ -77,6 +77,7 @@ void kasan_free_shadow(const struct vm_struct *vm);
 
 size_t ksize(const void *);
 static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
+size_t kasan_metadata_size(struct kmem_cache *cache);
 
 #else /* CONFIG_KASAN */
 
@@ -121,6 +122,7 @@ static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
 static inline void kasan_free_shadow(const struct vm_struct *vm) {}
 
 static inline void kasan_unpoison_slab(const void *ptr) { }
+static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
 
 #endif /* CONFIG_KASAN */
 
diff --git a/include/linux/kdb.h b/include/linux/kdb.h
index a19bcf9e762e1d4886e8bd5ed202e4485ba8cf3e..410decacff8fc7907622e99b9cbf7f3c32b1aefb 100644
@@ -177,7 +177,7 @@ extern int kdb_get_kbd_char(void);
 static inline
 int kdb_process_cpu(const struct task_struct *p)
 {
-       unsigned int cpu = task_thread_info(p)->cpu;
+       unsigned int cpu = task_cpu(p);
        if (cpu > num_possible_cpus())
                cpu = 0;
        return cpu;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6c14b6179727010f15c8ea3fcab036a32bfc84ad..2925da23505d1dd7939060b471e4efc266cacaef 100644
@@ -332,6 +332,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
+void memblock_mem_limit_remove_map(phys_addr_t limit);
 bool memblock_is_memory(phys_addr_t addr);
 int memblock_is_map_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 71aff733a4970879aee17c81b26444328e198dae..5d8ca6e02e396bd1eb5a00118ecf6141c43900ac 100644
@@ -52,7 +52,7 @@ enum mem_cgroup_stat_index {
        MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
        MEM_CGROUP_STAT_NSTATS,
        /* default hierarchy stats */
-       MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS,
+       MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS,
        MEMCG_SLAB_RECLAIMABLE,
        MEMCG_SLAB_UNRECLAIMABLE,
        MEMCG_SOCK,
@@ -60,7 +60,7 @@ enum mem_cgroup_stat_index {
 };
 
 struct mem_cgroup_reclaim_cookie {
-       struct zone *zone;
+       pg_data_t *pgdat;
        int priority;
        unsigned int generation;
 };
@@ -118,7 +118,7 @@ struct mem_cgroup_reclaim_iter {
 /*
  * per-zone information in memory controller.
  */
-struct mem_cgroup_per_zone {
+struct mem_cgroup_per_node {
        struct lruvec           lruvec;
        unsigned long           lru_size[NR_LRU_LISTS];
 
@@ -132,10 +132,6 @@ struct mem_cgroup_per_zone {
                                                /* use container_of        */
 };
 
-struct mem_cgroup_per_node {
-       struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
 struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        unsigned long threshold;
@@ -314,8 +310,46 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
 
-struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
+static struct mem_cgroup_per_node *
+mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
+{
+       return memcg->nodeinfo[nid];
+}
+
+/**
+ * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
+ * @node: node of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
+ *
+ * Returns the lru list vector holding pages for a given @node or a given
+ * @memcg and @zone. This can be the node lruvec, if the memory controller
+ * is disabled.
+ */
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
+                               struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_per_node *mz;
+       struct lruvec *lruvec;
+
+       if (mem_cgroup_disabled()) {
+               lruvec = node_lruvec(pgdat);
+               goto out;
+       }
+
+       mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+       lruvec = &mz->lruvec;
+out:
+       /*
+        * Since a node can be onlined after the mem_cgroup was created,
+        * we have to be prepared to initialize lruvec->pgdat here;
+        * and if offlined then reonlined, we need to reinitialize it.
+        */
+       if (unlikely(lruvec->pgdat != pgdat))
+               lruvec->pgdat = pgdat;
+       return lruvec;
+}
+
+struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -404,9 +438,9 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 static inline
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
-       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_per_node *mz;
 
-       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+       mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        return mz->lru_size[lru];
 }
 
@@ -477,7 +511,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
        mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                                gfp_t gfp_mask,
                                                unsigned long *total_scanned);
 
@@ -568,16 +602,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
 {
 }
 
-static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
-                                                   struct mem_cgroup *memcg)
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
+                               struct mem_cgroup *memcg)
 {
-       return &zone->lruvec;
+       return node_lruvec(pgdat);
 }
 
 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-                                                   struct zone *zone)
+                                                   struct pglist_data *pgdat)
 {
-       return &zone->lruvec;
+       return &pgdat->lruvec;
 }
 
 static inline bool mm_match_cgroup(struct mm_struct *mm,
@@ -681,7 +715,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                            gfp_t gfp_mask,
                                            unsigned long *total_scanned)
 {
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bcaa634139a996160247738d94107ce08485e185..93416196ba64f6b4747ad9f37aaf5a913ff3ac29 100644
@@ -26,7 +26,7 @@ struct vmem_altmap {
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
 
-#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
+#ifdef CONFIG_ZONE_DEVICE
 struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
 #else
 static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 192c1bbe5fcddac7d386b95cf7537843e6942cd5..08ed53eeedd5fd203a961310367ef8a206c76696 100644
@@ -933,6 +933,11 @@ static inline struct zone *page_zone(const struct page *page)
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
 }
 
+static inline pg_data_t *page_pgdat(const struct page *page)
+{
+       return NODE_DATA(page_to_nid(page));
+}
+
 #ifdef SECTION_IN_PAGE_FLAGS
 static inline void set_page_section(struct page *page, unsigned long section)
 {
@@ -973,11 +978,21 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 {
        return page->mem_cgroup;
 }
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       return READ_ONCE(page->mem_cgroup);
+}
 #else
 static inline struct mem_cgroup *page_memcg(struct page *page)
 {
        return NULL;
 }
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       return NULL;
+}
 #endif
 
 /*
@@ -2284,6 +2299,8 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
 }
 #endif /* __HAVE_ARCH_GATE_AREA */
 
+extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
+
 #ifdef CONFIG_SYSCTL
 extern int sysctl_drop_caches;
 int drop_caches_sysctl_handler(struct ctl_table *, int,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 5bd29ba4f174f531d1921f8ea2616b7727b069aa..71613e8a720f99b6870f1e3f8957d2761a6ac1a0 100644
@@ -23,25 +23,30 @@ static inline int page_is_file_cache(struct page *page)
 }
 
 static __always_inline void __update_lru_size(struct lruvec *lruvec,
-                               enum lru_list lru, int nr_pages)
+                               enum lru_list lru, enum zone_type zid,
+                               int nr_pages)
 {
-       __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+       __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+       __mod_zone_page_state(&pgdat->node_zones[zid],
+                               NR_ZONE_LRU_BASE + lru, nr_pages);
 }
 
 static __always_inline void update_lru_size(struct lruvec *lruvec,
-                               enum lru_list lru, int nr_pages)
+                               enum lru_list lru, enum zone_type zid,
+                               int nr_pages)
 {
+       __update_lru_size(lruvec, lru, zid, nr_pages);
 #ifdef CONFIG_MEMCG
        mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
-#else
-       __update_lru_size(lruvec, lru, nr_pages);
 #endif
 }
 
 static __always_inline void add_page_to_lru_list(struct page *page,
                                struct lruvec *lruvec, enum lru_list lru)
 {
-       update_lru_size(lruvec, lru, hpage_nr_pages(page));
+       update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
        list_add(&page->lru, &lruvec->lists[lru]);
 }
 
@@ -49,7 +54,7 @@ static __always_inline void del_page_from_lru_list(struct page *page,
                                struct lruvec *lruvec, enum lru_list lru)
 {
        list_del(&page->lru);
-       update_lru_size(lruvec, lru, -hpage_nr_pages(page));
+       update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
 }
 
 /**
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 79472b22d23f0866cd251e3e5e47c902522994e0..903200f4ec41ce03c50c15bfa079d5a01f903afa 100644
@@ -118,7 +118,7 @@ struct page {
         */
        union {
                struct list_head lru;   /* Pageout list, eg. active_list
-                                        * protected by zone->lru_lock !
+                                        * protected by zone_lru_lock !
                                         * Can be used as a generic list
                                         * by the page owner.
                                         */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 19425e988bdc1a6b2d9fc75a71064a9a57be68b7..f2e4e90621ec25c237703bc5b1ecba814d1afce9 100644
@@ -93,7 +93,7 @@ struct free_area {
 struct pglist_data;
 
 /*
- * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
+ * zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
  * So add a wild amount of padding here to ensure that they fall into separate
  * cachelines.  There are very few zone structures in the machine, so space
  * consumption is not a concern here.
@@ -110,36 +110,20 @@ struct zone_padding {
 enum zone_stat_item {
        /* First 128 byte cacheline (assuming 64 bit words) */
        NR_FREE_PAGES,
-       NR_ALLOC_BATCH,
-       NR_LRU_BASE,
-       NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
-       NR_ACTIVE_ANON,         /*  "     "     "   "       "         */
-       NR_INACTIVE_FILE,       /*  "     "     "   "       "         */
-       NR_ACTIVE_FILE,         /*  "     "     "   "       "         */
-       NR_UNEVICTABLE,         /*  "     "     "   "       "         */
+       NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
+       NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
+       NR_ZONE_ACTIVE_ANON,
+       NR_ZONE_INACTIVE_FILE,
+       NR_ZONE_ACTIVE_FILE,
+       NR_ZONE_UNEVICTABLE,
+       NR_ZONE_WRITE_PENDING,  /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,               /* mlock()ed pages found and moved off LRU */
-       NR_ANON_PAGES,  /* Mapped anonymous pages */
-       NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
-                          only modified from process context */
-       NR_FILE_PAGES,
-       NR_FILE_DIRTY,
-       NR_WRITEBACK,
        NR_SLAB_RECLAIMABLE,
        NR_SLAB_UNRECLAIMABLE,
        NR_PAGETABLE,           /* used for pagetables */
-       NR_KERNEL_STACK,
+       NR_KERNEL_STACK_KB,     /* measured in KiB */
        /* Second 128 byte cacheline */
-       NR_UNSTABLE_NFS,        /* NFS unstable pages */
        NR_BOUNCE,
-       NR_VMSCAN_WRITE,
-       NR_VMSCAN_IMMEDIATE,    /* Prioritise for reclaim when writeback ends */
-       NR_WRITEBACK_TEMP,      /* Writeback using temporary buffers */
-       NR_ISOLATED_ANON,       /* Temporary isolated pages from anon lru */
-       NR_ISOLATED_FILE,       /* Temporary isolated pages from file lru */
-       NR_SHMEM,               /* shmem pages (included tmpfs/GEM pages) */
-       NR_DIRTIED,             /* page dirtyings since bootup */
-       NR_WRITTEN,             /* page writings since bootup */
-       NR_PAGES_SCANNED,       /* pages scanned since last reclaim */
 #if IS_ENABLED(CONFIG_ZSMALLOC)
        NR_ZSPAGES,             /* allocated in zsmalloc */
 #endif
@@ -151,14 +135,40 @@ enum zone_stat_item {
        NUMA_LOCAL,             /* allocation from local node */
        NUMA_OTHER,             /* allocation from other node */
 #endif
+       NR_FREE_CMA_PAGES,
+       NR_VM_ZONE_STAT_ITEMS };
+
+enum node_stat_item {
+       NR_LRU_BASE,
+       NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+       NR_ACTIVE_ANON,         /*  "     "     "   "       "         */
+       NR_INACTIVE_FILE,       /*  "     "     "   "       "         */
+       NR_ACTIVE_FILE,         /*  "     "     "   "       "         */
+       NR_UNEVICTABLE,         /*  "     "     "   "       "         */
+       NR_ISOLATED_ANON,       /* Temporary isolated pages from anon lru */
+       NR_ISOLATED_FILE,       /* Temporary isolated pages from file lru */
+       NR_PAGES_SCANNED,       /* pages scanned since last reclaim */
        WORKINGSET_REFAULT,
        WORKINGSET_ACTIVATE,
        WORKINGSET_NODERECLAIM,
-       NR_ANON_THPS,
+       NR_ANON_MAPPED, /* Mapped anonymous pages */
+       NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
+                          only modified from process context */
+       NR_FILE_PAGES,
+       NR_FILE_DIRTY,
+       NR_WRITEBACK,
+       NR_WRITEBACK_TEMP,      /* Writeback using temporary buffers */
+       NR_SHMEM,               /* shmem pages (included tmpfs/GEM pages) */
        NR_SHMEM_THPS,
        NR_SHMEM_PMDMAPPED,
-       NR_FREE_CMA_PAGES,
-       NR_VM_ZONE_STAT_ITEMS };
+       NR_ANON_THPS,
+       NR_UNSTABLE_NFS,        /* NFS unstable pages */
+       NR_VMSCAN_WRITE,
+       NR_VMSCAN_IMMEDIATE,    /* Prioritise for reclaim when writeback ends */
+       NR_DIRTIED,             /* page dirtyings since bootup */
+       NR_WRITTEN,             /* page writings since bootup */
+       NR_VM_NODE_STAT_ITEMS
+};
 
 /*
  * We do arithmetic on the LRU lists in various places in the code,
@@ -215,7 +225,7 @@ struct lruvec {
        /* Evictions & activations on the inactive file list */
        atomic_long_t                   inactive_age;
 #ifdef CONFIG_MEMCG
-       struct zone                     *zone;
+       struct pglist_data *pgdat;
 #endif
 };
 
@@ -267,6 +277,11 @@ struct per_cpu_pageset {
 #endif
 };
 
+struct per_cpu_nodestat {
+       s8 stat_threshold;
+       s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
+};
+
 #endif /* !__GENERATING_BOUNDS.H */
 
 enum zone_type {
@@ -348,22 +363,9 @@ struct zone {
 #ifdef CONFIG_NUMA
        int node;
 #endif
-
-       /*
-        * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-        * this zone's LRU.  Maintained by the pageout code.
-        */
-       unsigned int inactive_ratio;
-
        struct pglist_data      *zone_pgdat;
        struct per_cpu_pageset __percpu *pageset;
 
-       /*
-        * This is a per-zone reserve of pages that are not available
-        * to userspace allocations.
-        */
-       unsigned long           totalreserve_pages;
-
 #ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
@@ -372,14 +374,6 @@ struct zone {
        unsigned long           *pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
 
-#ifdef CONFIG_NUMA
-       /*
-        * zone reclaim becomes active if more unmapped pages exist.
-        */
-       unsigned long           min_unmapped_pages;
-       unsigned long           min_slab_pages;
-#endif /* CONFIG_NUMA */
-
        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long           zone_start_pfn;
 
@@ -472,24 +466,21 @@ struct zone {
        unsigned long           wait_table_hash_nr_entries;
        unsigned long           wait_table_bits;
 
+       /* Write-intensive fields used from the page allocator */
        ZONE_PADDING(_pad1_)
+
        /* free areas of different sizes */
        struct free_area        free_area[MAX_ORDER];
 
        /* zone flags, see below */
        unsigned long           flags;
 
-       /* Write-intensive fields used from the page allocator */
+       /* Primarily protects free_area */
        spinlock_t              lock;
 
+       /* Write-intensive fields used by compaction and vmstats. */
        ZONE_PADDING(_pad2_)
 
-       /* Write-intensive fields used by page reclaim */
-
-       /* Fields commonly accessed by the page reclaim scanner */
-       spinlock_t              lru_lock;
-       struct lruvec           lruvec;
-
        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
@@ -527,19 +518,18 @@ struct zone {
        atomic_long_t           vm_stat[NR_VM_ZONE_STAT_ITEMS];
 } ____cacheline_internodealigned_in_smp;
 
-enum zone_flags {
-       ZONE_RECLAIM_LOCKED,            /* prevents concurrent reclaim */
-       ZONE_CONGESTED,                 /* zone has many dirty pages backed by
+enum pgdat_flags {
+       PGDAT_CONGESTED,                /* pgdat has many dirty pages backed by
                                         * a congested BDI
                                         */
-       ZONE_DIRTY,                     /* reclaim scanning has recently found
+       PGDAT_DIRTY,                    /* reclaim scanning has recently found
                                         * many dirty file pages at the tail
                                         * of the LRU.
                                         */
-       ZONE_WRITEBACK,                 /* reclaim scanning has recently found
+       PGDAT_WRITEBACK,                /* reclaim scanning has recently found
                                         * many pages under writeback
                                         */
-       ZONE_FAIR_DEPLETED,             /* fair zone policy batch depleted */
+       PGDAT_RECLAIM_LOCKED,           /* prevents concurrent reclaim */
 };
 
 static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -663,8 +653,9 @@ typedef struct pglist_data {
        wait_queue_head_t pfmemalloc_wait;
        struct task_struct *kswapd;     /* Protected by
                                           mem_hotplug_begin/end() */
-       int kswapd_max_order;
-       enum zone_type classzone_idx;
+       int kswapd_order;
+       enum zone_type kswapd_classzone_idx;
+
 #ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_classzone_idx;
@@ -681,6 +672,23 @@ typedef struct pglist_data {
        /* Number of pages migrated during the rate limiting time interval */
        unsigned long numabalancing_migrate_nr_pages;
 #endif
+       /*
+        * This is a per-node reserve of pages that are not available
+        * to userspace allocations.
+        */
+       unsigned long           totalreserve_pages;
+
+#ifdef CONFIG_NUMA
+       /*
+        * zone reclaim becomes active if more unmapped pages exist.
+        */
+       unsigned long           min_unmapped_pages;
+       unsigned long           min_slab_pages;
+#endif /* CONFIG_NUMA */
+
+       /* Write-intensive fields used by page reclaim */
+       ZONE_PADDING(_pad1_)
+       spinlock_t              lru_lock;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
@@ -695,6 +703,23 @@ typedef struct pglist_data {
        struct list_head split_queue;
        unsigned long split_queue_len;
 #endif
+
+       /* Fields commonly accessed by the page reclaim scanner */
+       struct lruvec           lruvec;
+
+       /*
+        * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+        * this node's LRU.  Maintained by the pageout code.
+        */
+       unsigned int inactive_ratio;
+
+       unsigned long           flags;
+
+       ZONE_PADDING(_pad2_)
+
+       /* Per-node vmstats */
+       struct per_cpu_nodestat __percpu *per_cpu_nodestats;
+       atomic_long_t           vm_stat[NR_VM_NODE_STAT_ITEMS];
 } pg_data_t;
 
 #define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
@@ -708,6 +733,15 @@ typedef struct pglist_data {
 
 #define node_start_pfn(nid)    (NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
+static inline spinlock_t *zone_lru_lock(struct zone *zone)
+{
+       return &zone->zone_pgdat->lru_lock;
+}
+
+static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
+{
+       return &pgdat->lruvec;
+}
 
 static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
 {
@@ -760,12 +794,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
 
 extern void lruvec_init(struct lruvec *lruvec);
 
-static inline struct zone *lruvec_zone(struct lruvec *lruvec)
+static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 {
 #ifdef CONFIG_MEMCG
-       return lruvec->zone;
+       return lruvec->pgdat;
 #else
-       return container_of(lruvec, struct zone, lruvec);
+       return container_of(lruvec, struct pglist_data, lruvec);
 #endif
 }
 
index 606137b3b778e224291ba89e2df3d87277134aa8..5bc0457ee3a88955f64b750a06858d45cb74b5da 100644 (file)
@@ -73,9 +73,9 @@ static inline bool oom_task_origin(const struct task_struct *p)
 extern void mark_oom_victim(struct task_struct *tsk);
 
 #ifdef CONFIG_MMU
-extern void try_oom_reaper(struct task_struct *tsk);
+extern void wake_oom_reaper(struct task_struct *tsk);
 #else
-static inline void try_oom_reaper(struct task_struct *tsk)
+static inline void wake_oom_reaper(struct task_struct *tsk)
 {
 }
 #endif
@@ -107,27 +107,7 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
-static inline bool task_will_free_mem(struct task_struct *task)
-{
-       struct signal_struct *sig = task->signal;
-
-       /*
-        * A coredumping process may sleep for an extended period in exit_mm(),
-        * so the oom killer cannot assume that the process will promptly exit
-        * and release memory.
-        */
-       if (sig->flags & SIGNAL_GROUP_COREDUMP)
-               return false;
-
-       if (!(task->flags & PF_EXITING))
-               return false;
-
-       /* Make sure that the whole thread group is going down */
-       if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT))
-               return false;
-
-       return true;
-}
+bool task_will_free_mem(struct task_struct *task);
 
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
index d99218a1e04370683dd239de59c38e48a04d2eb7..553af2923824e9611b015835a761436c2e422fe5 100644 (file)
@@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_HAS_UPROBES                19      /* has uprobes */
 #define MMF_RECALC_UPROBES     20      /* MMF_HAS_UPROBES can be wrong */
 #define MMF_OOM_REAPED         21      /* mm has been already reaped */
+#define MMF_OOM_NOT_REAPABLE   22      /* mm couldn't be reaped */
 
 #define MMF_INIT_MASK          (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
@@ -1949,6 +1950,32 @@ static inline int tsk_nr_cpus_allowed(struct task_struct *p)
 #define TNF_FAULT_LOCAL        0x08
 #define TNF_MIGRATE_FAIL 0x10
 
+static inline bool in_vfork(struct task_struct *tsk)
+{
+       bool ret;
+
+       /*
+        * need RCU to access ->real_parent if CLONE_VM was used along with
+        * CLONE_PARENT.
+        *
+        * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
+        * imply CLONE_VM
+        *
+        * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
+        * ->real_parent is not necessarily the task doing vfork(), so in
+        * theory we can't rely on task_lock() if we want to dereference it.
+        *
+        * And in this case we can't trust the real_parent->mm == tsk->mm
+        * check, it can be false negative. But we do not care, if init or
+        * another oom-unkillable task does this it should blame itself.
+        */
+       rcu_read_lock();
+       ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
+       rcu_read_unlock();
+
+       return ret;
+}
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
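
The new in_vfork() helper relies on a vfork() child running in its parent's address space until it calls exec or _exit(), which is what the tsk->real_parent->mm == tsk->mm comparison detects. The userspace sketch below only illustrates that sharing; strictly, POSIX permits a vfork() child to do nothing but _exit() or exec, so the write to the shared variable is a Linux-specific demonstration rather than a recommended pattern:

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

static volatile int shared;

int main(void)
{
	pid_t pid = vfork();

	if (pid == 0) {
		shared = 42;	/* same mm as the parent, so the parent sees this */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("shared = %d\n", shared);	/* prints 42 */
	return 0;
}
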
index 339ba027ade9873eebf91ea4180ee4a6527f4b92..4ad2c5a263999e24cf4d860f088bb4df07ba0e4c 100644 (file)
@@ -88,7 +88,8 @@ struct kmem_cache {
 };
 
 static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
-                               void *x) {
+                               void *x)
+{
        void *object = x - (x - page->s_mem) % cache->size;
        void *last_object = page->s_mem + (cache->num - 1) * cache->size;
 
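
The arithmetic in nearest_obj() above snaps an arbitrary pointer into a slab back to the start of the object containing it (and then clamps to the last object). A self-contained sketch of just the rounding step, with made-up sizes:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	char slab[8 * 64];		/* pretend slab: 8 objects of 64 bytes */
	size_t size = 64;
	char *base = slab;		/* plays the role of page->s_mem */
	char *x = slab + 3 * size + 17;	/* points 17 bytes into object 3 */

	char *object = x - (x - base) % size;	/* round down to object start */

	printf("object index = %zu, offset that was rounded off = %td\n",
	       (size_t)(object - base) / size, x - object);	/* 3 and 17 */
	return 0;
}
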
index 5624c1f3eb0add4347768e1dada1581b794589d8..75f56c2ef2d471f1424b947632b984f431dd46e5 100644 (file)
@@ -104,6 +104,10 @@ struct kmem_cache {
        unsigned int *random_seq;
 #endif
 
+#ifdef CONFIG_KASAN
+       struct kasan_cache kasan_info;
+#endif
+
        struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
@@ -119,15 +123,17 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
 void object_err(struct kmem_cache *s, struct page *page,
                u8 *object, char *reason);
 
+void *fixup_red_left(struct kmem_cache *s, void *p);
+
 static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
                                void *x) {
        void *object = x - (x - page_address(page)) % cache->size;
        void *last_object = page_address(page) +
                (page->objects - 1) * cache->size;
-       if (unlikely(object > last_object))
-               return last_object;
-       else
-               return object;
+       void *result = (unlikely(object > last_object)) ? last_object : object;
+
+       result = fixup_red_left(cache, result);
+       return result;
 }
 
 #endif /* _LINUX_SLUB_DEF_H */
index 0af2bb2028fd51c56b319b264e1d2f4991d7b073..b17cc4830fa670512abdc3987f58fc55e583f0bc 100644 (file)
@@ -157,15 +157,6 @@ enum {
 #define SWAP_CLUSTER_MAX 32UL
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
-/*
- * Ratio between zone->managed_pages and the "gap" that above the per-zone
- * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that
- * do not meet the (high_wmark + gap) watermark, even which already met the
- * high_wmark, in order to provide better per-zone lru behavior. We are ok to
- * spend not more than 1% of the memory for this zone balancing "gap".
- */
-#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
-
 #define SWAP_MAP_MAX   0x3e    /* Max duplication count, in first swap_map */
 #define SWAP_MAP_BAD   0x3f    /* Note pageblock is bad, in first swap_map */
 #define SWAP_HAS_CACHE 0x40    /* Flag page is cached, in first swap_map */
@@ -317,6 +308,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
@@ -324,9 +316,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                                  unsigned long nr_pages,
                                                  gfp_t gfp_mask,
                                                  bool may_swap);
-extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
                                                gfp_t gfp_mask, bool noswap,
-                                               struct zone *zone,
+                                               pg_data_t *pgdat,
                                                unsigned long *nr_scanned);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
@@ -334,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
 #ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
+extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
 extern int sysctl_min_slab_ratio;
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
 #else
-#define zone_reclaim_mode 0
-static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+#define node_reclaim_mode 0
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+                               unsigned int order)
 {
        return 0;
 }
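
Like the old zone_reclaim(), the renamed node_reclaim() keeps a static inline stub for !CONFIG_NUMA so callers stay free of #ifdef blocks. A generic, non-kernel illustration of that stub pattern, using a made-up DEMO_CONFIG_NUMA switch:

#include <stdio.h>

#define DEMO_CONFIG_NUMA 0	/* flip to 1 to emulate CONFIG_NUMA=y */

#if DEMO_CONFIG_NUMA
static int node_reclaim_demo(int node, unsigned int order)
{
	/* real reclaim work would go here */
	return 1;
}
#else
static inline int node_reclaim_demo(int node, unsigned int order)
{
	return 0;		/* feature compiled out: cheap no-op for callers */
}
#endif

int main(void)
{
	printf("node_reclaim_demo() returned %d\n", node_reclaim_demo(0, 0));
	return 0;
}
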
index afce69296ac05699880b80c6ae42c1a46b409b8c..cb0775e1ee4bd5f2dfc6d004a5f3fb4438390893 100644 (file)
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
 /*
  * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
  * (in whatever arch specific measurement units returned by node_distance())
- * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
  * on nodes within this distance.
  */
 #define RECLAIM_DISTANCE 30
index 42604173f122423303d06371e7e8bfb45774df6d..4d6ec58a8d45b04aa6bae51585e49d44319dd155 100644 (file)
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                FOR_ALL_ZONES(PGALLOC),
+               FOR_ALL_ZONES(ALLOCSTALL),
+               FOR_ALL_ZONES(PGSCAN_SKIP),
                PGFREE, PGACTIVATE, PGDEACTIVATE,
                PGFAULT, PGMAJFAULT,
                PGLAZYFREED,
-               FOR_ALL_ZONES(PGREFILL),
-               FOR_ALL_ZONES(PGSTEAL_KSWAPD),
-               FOR_ALL_ZONES(PGSTEAL_DIRECT),
-               FOR_ALL_ZONES(PGSCAN_KSWAPD),
-               FOR_ALL_ZONES(PGSCAN_DIRECT),
+               PGREFILL,
+               PGSTEAL_KSWAPD,
+               PGSTEAL_DIRECT,
+               PGSCAN_KSWAPD,
+               PGSCAN_DIRECT,
                PGSCAN_DIRECT_THROTTLE,
 #ifdef CONFIG_NUMA
                PGSCAN_ZONE_RECLAIM_FAILED,
 #endif
                PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
                KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
-               PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+               PAGEOUTRUN, PGROTATED,
                DROP_PAGECACHE, DROP_SLAB,
 #ifdef CONFIG_NUMA_BALANCING
                NUMA_PTE_UPDATES,
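
FOR_ALL_ZONES() expands one event name into a counter per zone, so dropping it around PGREFILL, PGSTEAL and PGSCAN collapses each of those into a single zone-agnostic counter, while ALLOCSTALL and the new PGSCAN_SKIP gain per-zone breakdowns. A rough, simplified illustration of the expansion (not the kernel's exact macro, which depends on which zones the kernel is configured with):

#include <stdio.h>

#define FOR_ALL_ZONES(x) x##_DMA, x##_NORMAL, x##_HIGHMEM, x##_MOVABLE

enum demo_vm_event { FOR_ALL_ZONES(PGALLOC), PGREFILL, NR_DEMO_EVENTS };

int main(void)
{
	/* Four per-zone PGALLOC slots, then one zone-agnostic PGREFILL slot. */
	printf("PGREFILL index = %d, total events = %d\n",
	       PGREFILL, NR_DEMO_EVENTS);	/* prints 4 and 5 */
	return 0;
}
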
index d2da8e053210041bfcefb9e04b59d195880d2d0e..613771909b6ee1f284806c49b444b8779926a373 100644 (file)
@@ -101,25 +101,42 @@ static inline void vm_events_fold_cpu(int cpu)
 #define count_vm_vmacache_event(x) do {} while (0)
 #endif
 
-#define __count_zone_vm_events(item, zone, delta) \
-               __count_vm_events(item##_NORMAL - ZONE_NORMAL + \
-               zone_idx(zone), delta)
+#define __count_zid_vm_events(item, zid, delta) \
+       __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
 
 /*
- * Zone based page accounting with per cpu differentials.
+ * Zone and node-based page accounting with per cpu differentials.
  */
-extern atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
+extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
 
 static inline void zone_page_state_add(long x, struct zone *zone,
                                 enum zone_stat_item item)
 {
        atomic_long_add(x, &zone->vm_stat[item]);
-       atomic_long_add(x, &vm_stat[item]);
+       atomic_long_add(x, &vm_zone_stat[item]);
+}
+
+static inline void node_page_state_add(long x, struct pglist_data *pgdat,
+                                enum node_stat_item item)
+{
+       atomic_long_add(x, &pgdat->vm_stat[item]);
+       atomic_long_add(x, &vm_node_stat[item]);
 }
 
 static inline unsigned long global_page_state(enum zone_stat_item item)
 {
-       long x = atomic_long_read(&vm_stat[item]);
+       long x = atomic_long_read(&vm_zone_stat[item]);
+#ifdef CONFIG_SMP
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
+}
+
+static inline unsigned long global_node_page_state(enum node_stat_item item)
+{
+       long x = atomic_long_read(&vm_node_stat[item]);
 #ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
@@ -160,32 +177,61 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
        return x;
 }
 
-#ifdef CONFIG_NUMA
+static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
+                                       enum node_stat_item item)
+{
+       long x = atomic_long_read(&pgdat->vm_stat[item]);
 
-extern unsigned long node_page_state(int node, enum zone_stat_item item);
+#ifdef CONFIG_SMP
+       int cpu;
+       for_each_online_cpu(cpu)
+               x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item];
 
-#else
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
+}
 
-#define node_page_state(node, item) global_page_state(item)
 
+#ifdef CONFIG_NUMA
+extern unsigned long sum_zone_node_page_state(int node,
+                                               enum zone_stat_item item);
+extern unsigned long node_page_state(struct pglist_data *pgdat,
+                                               enum node_stat_item item);
+#else
+#define sum_zone_node_page_state(node, item) global_page_state(item)
+#define node_page_state(node, item) global_node_page_state(item)
 #endif /* CONFIG_NUMA */
 
 #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
 #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
+#define add_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, __d)
+#define sub_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, -(__d))
 
 #ifdef CONFIG_SMP
 void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
 void __inc_zone_page_state(struct page *, enum zone_stat_item);
 void __dec_zone_page_state(struct page *, enum zone_stat_item);
 
+void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
+void __inc_node_page_state(struct page *, enum node_stat_item);
+void __dec_node_page_state(struct page *, enum node_stat_item);
+
 void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
 void inc_zone_page_state(struct page *, enum zone_stat_item);
 void dec_zone_page_state(struct page *, enum zone_stat_item);
 
-extern void inc_zone_state(struct zone *, enum zone_stat_item);
+void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
+void inc_node_page_state(struct page *, enum node_stat_item);
+void dec_node_page_state(struct page *, enum node_stat_item);
+
+extern void inc_node_state(struct pglist_data *, enum node_stat_item);
 extern void __inc_zone_state(struct zone *, enum zone_stat_item);
+extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
 extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
+extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
 
 void quiet_vmstat(void);
 void cpu_vm_stats_fold(int cpu);
@@ -213,16 +259,34 @@ static inline void __mod_zone_page_state(struct zone *zone,
        zone_page_state_add(delta, zone, item);
 }
 
+static inline void __mod_node_page_state(struct pglist_data *pgdat,
+                       enum node_stat_item item, int delta)
+{
+       node_page_state_add(delta, pgdat, item);
+}
+
 static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
        atomic_long_inc(&zone->vm_stat[item]);
-       atomic_long_inc(&vm_stat[item]);
+       atomic_long_inc(&vm_zone_stat[item]);
+}
+
+static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       atomic_long_inc(&pgdat->vm_stat[item]);
+       atomic_long_inc(&vm_node_stat[item]);
 }
 
 static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
        atomic_long_dec(&zone->vm_stat[item]);
-       atomic_long_dec(&vm_stat[item]);
+       atomic_long_dec(&vm_zone_stat[item]);
+}
+
+static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       atomic_long_dec(&pgdat->vm_stat[item]);
+       atomic_long_dec(&vm_node_stat[item]);
 }
 
 static inline void __inc_zone_page_state(struct page *page,
@@ -231,12 +295,26 @@ static inline void __inc_zone_page_state(struct page *page,
        __inc_zone_state(page_zone(page), item);
 }
 
+static inline void __inc_node_page_state(struct page *page,
+                       enum node_stat_item item)
+{
+       __inc_node_state(page_pgdat(page), item);
+}
+
+
 static inline void __dec_zone_page_state(struct page *page,
                        enum zone_stat_item item)
 {
        __dec_zone_state(page_zone(page), item);
 }
 
+static inline void __dec_node_page_state(struct page *page,
+                       enum node_stat_item item)
+{
+       __dec_node_state(page_pgdat(page), item);
+}
+
+
 /*
  * We only use atomic operations to update counters. So there is no need to
  * disable interrupts.
@@ -245,7 +323,12 @@ static inline void __dec_zone_page_state(struct page *page,
 #define dec_zone_page_state __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
+#define inc_node_page_state __inc_node_page_state
+#define dec_node_page_state __dec_node_page_state
+#define mod_node_page_state __mod_node_page_state
+
 #define inc_zone_state __inc_zone_state
+#define inc_node_state __inc_node_state
 #define dec_zone_state __dec_zone_state
 
 #define set_pgdat_percpu_threshold(pgdat, callback) { }
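
The new node counters reuse the same per-cpu differential scheme as the zone counters: each CPU accumulates a small signed delta (vm_node_stat_diff) that is folded into the shared atomic only once it crosses stat_threshold, and snapshot readers add the per-cpu deltas back in, clamping negative results to zero because folding is deferred. A simplified, single-threaded userspace sketch of the idea (the threshold and CPU count are arbitrary example values):

#include <stdio.h>

#define NR_CPUS		4
#define STAT_THRESHOLD	8

static long vm_node_stat;		/* shared counter (atomic_long_t in the kernel) */
static signed char diff[NR_CPUS];	/* per-cpu vm_node_stat_diff analogue */

static void mod_node_state(int cpu, int delta)
{
	int x = diff[cpu] + delta;

	if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
		vm_node_stat += x;	/* fold into the shared counter */
		x = 0;
	}
	diff[cpu] = x;
}

static long node_state_snapshot(void)
{
	long x = vm_node_stat;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		x += diff[cpu];
	return x < 0 ? 0 : x;		/* clamp, as the kernel helpers do */
}

int main(void)
{
	for (int i = 0; i < 20; i++)
		mod_node_state(i % NR_CPUS, 1);

	/* The deltas stayed below the threshold, so nothing was folded yet. */
	printf("shared counter = %ld, snapshot = %ld\n",
	       vm_node_stat, node_state_snapshot());	/* 0 and 20 */
	return 0;
}
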
index 717e6149e753364084698d3b32b1004d4087a281..fc1e16c25a296877e85d37d267bd953929ee93cd 100644 (file)
@@ -320,7 +320,7 @@ void laptop_mode_timer_fn(unsigned long data);
 static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
-bool zone_dirty_ok(struct zone *zone);
+bool node_dirty_ok(struct pglist_data *pgdat);
 int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
 #ifdef CONFIG_CGROUP_WRITEBACK
 void wb_domain_exit(struct wb_domain *dom);
index 36e2d6fb1360a722183e9cabd47b8583b557b13f..c2ba402ab25651009c5b67e90145b81124fd110e 100644 (file)
@@ -226,26 +226,26 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages,
        TP_PROTO(
                int order,
                gfp_t gfp_mask,
-               enum migrate_mode mode),
+               int prio),
 
-       TP_ARGS(order, gfp_mask, mode),
+       TP_ARGS(order, gfp_mask, prio),
 
        TP_STRUCT__entry(
                __field(int, order)
                __field(gfp_t, gfp_mask)
-               __field(enum migrate_mode, mode)
+               __field(int, prio)
        ),
 
        TP_fast_assign(
                __entry->order = order;
                __entry->gfp_mask = gfp_mask;
-               __entry->mode = mode;
+               __entry->prio = prio;
        ),
 
-       TP_printk("order=%d gfp_mask=0x%x mode=%d",
+       TP_printk("order=%d gfp_mask=0x%x priority=%d",
                __entry->order,
                __entry->gfp_mask,
-               (int)__entry->mode)
+               __entry->prio)
 );
 
 DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
index 43cedbf0c759e5074871baf4d600ab5f3e2bb3e8..5a81ab48a2fb2921328e098c388e77d58a765147 100644 (file)
@@ -11,6 +11,7 @@
 
 #define __def_gfpflag_names                                            \
        {(unsigned long)GFP_TRANSHUGE,          "GFP_TRANSHUGE"},       \
+       {(unsigned long)GFP_TRANSHUGE_LIGHT,    "GFP_TRANSHUGE_LIGHT"}, \
        {(unsigned long)GFP_HIGHUSER_MOVABLE,   "GFP_HIGHUSER_MOVABLE"},\
        {(unsigned long)GFP_HIGHUSER,           "GFP_HIGHUSER"},        \
        {(unsigned long)GFP_USER,               "GFP_USER"},            \
index 0101ef37f1eed78a39ef095f3ced183cb4e3f7c8..c88fd0934e7ead911941d13272bb89215584ba67 100644 (file)
@@ -55,21 +55,23 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
 
 TRACE_EVENT(mm_vmscan_kswapd_wake,
 
-       TP_PROTO(int nid, int order),
+       TP_PROTO(int nid, int zid, int order),
 
-       TP_ARGS(nid, order),
+       TP_ARGS(nid, zid, order),
 
        TP_STRUCT__entry(
                __field(        int,    nid     )
+               __field(        int,    zid     )
                __field(        int,    order   )
        ),
 
        TP_fast_assign(
                __entry->nid    = nid;
+               __entry->zid    = zid;
                __entry->order  = order;
        ),
 
-       TP_printk("nid=%d order=%d", __entry->nid, __entry->order)
+       TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order)
 );
 
 TRACE_EVENT(mm_vmscan_wakeup_kswapd,
@@ -98,47 +100,50 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
 
 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
 
-       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
 
-       TP_ARGS(order, may_writepage, gfp_flags),
+       TP_ARGS(order, may_writepage, gfp_flags, classzone_idx),
 
        TP_STRUCT__entry(
                __field(        int,    order           )
                __field(        int,    may_writepage   )
                __field(        gfp_t,  gfp_flags       )
+               __field(        int,    classzone_idx   )
        ),
 
        TP_fast_assign(
                __entry->order          = order;
                __entry->may_writepage  = may_writepage;
                __entry->gfp_flags      = gfp_flags;
+               __entry->classzone_idx  = classzone_idx;
        ),
 
-       TP_printk("order=%d may_writepage=%d gfp_flags=%s",
+       TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d",
                __entry->order,
                __entry->may_writepage,
-               show_gfp_flags(__entry->gfp_flags))
+               show_gfp_flags(__entry->gfp_flags),
+               __entry->classzone_idx)
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
 
-       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
 
-       TP_ARGS(order, may_writepage, gfp_flags)
+       TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
 
-       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
 
-       TP_ARGS(order, may_writepage, gfp_flags)
+       TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
 
-       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
 
-       TP_ARGS(order, may_writepage, gfp_flags)
+       TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
 );
 
 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
@@ -266,16 +271,18 @@ TRACE_EVENT(mm_shrink_slab_end,
 
 DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 
-       TP_PROTO(int order,
+       TP_PROTO(int classzone_idx,
+               int order,
                unsigned long nr_requested,
                unsigned long nr_scanned,
                unsigned long nr_taken,
                isolate_mode_t isolate_mode,
                int file),
 
-       TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
+       TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
 
        TP_STRUCT__entry(
+               __field(int, classzone_idx)
                __field(int, order)
                __field(unsigned long, nr_requested)
                __field(unsigned long, nr_scanned)
@@ -285,6 +292,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
        ),
 
        TP_fast_assign(
+               __entry->classzone_idx = classzone_idx;
                __entry->order = order;
                __entry->nr_requested = nr_requested;
                __entry->nr_scanned = nr_scanned;
@@ -293,8 +301,9 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
                __entry->file = file;
        ),
 
-       TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
+       TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
                __entry->isolate_mode,
+               __entry->classzone_idx,
                __entry->order,
                __entry->nr_requested,
                __entry->nr_scanned,
@@ -304,27 +313,29 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 
 DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
 
-       TP_PROTO(int order,
+       TP_PROTO(int classzone_idx,
+               int order,
                unsigned long nr_requested,
                unsigned long nr_scanned,
                unsigned long nr_taken,
                isolate_mode_t isolate_mode,
                int file),
 
-       TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
+       TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
 
 );
 
 DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
 
-       TP_PROTO(int order,
+       TP_PROTO(int classzone_idx,
+               int order,
                unsigned long nr_requested,
                unsigned long nr_scanned,
                unsigned long nr_taken,
                isolate_mode_t isolate_mode,
                int file),
 
-       TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
+       TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
 
 );
 
@@ -352,15 +363,14 @@ TRACE_EVENT(mm_vmscan_writepage,
 
 TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 
-       TP_PROTO(struct zone *zone,
+       TP_PROTO(int nid,
                unsigned long nr_scanned, unsigned long nr_reclaimed,
                int priority, int file),
 
-       TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file),
+       TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file),
 
        TP_STRUCT__entry(
                __field(int, nid)
-               __field(int, zid)
                __field(unsigned long, nr_scanned)
                __field(unsigned long, nr_reclaimed)
                __field(int, priority)
@@ -368,16 +378,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
        ),
 
        TP_fast_assign(
-               __entry->nid = zone_to_nid(zone);
-               __entry->zid = zone_idx(zone);
+               __entry->nid = nid;
                __entry->nr_scanned = nr_scanned;
                __entry->nr_reclaimed = nr_reclaimed;
                __entry->priority = priority;
                __entry->reclaim_flags = trace_shrink_flags(file);
        ),
 
-       TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
-               __entry->nid, __entry->zid,
+       TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
+               __entry->nid,
                __entry->nr_scanned, __entry->nr_reclaimed,
                __entry->priority,
                show_reclaim_flags(__entry->reclaim_flags))
index 531f5811ff6be07dd264d7cacd11507de9083fab..2ccd9ccbf9efeaa7794eda0ca20589d42cf67784 100644 (file)
@@ -412,11 +412,11 @@ TRACE_EVENT(global_dirty_state,
        ),
 
        TP_fast_assign(
-               __entry->nr_dirty       = global_page_state(NR_FILE_DIRTY);
-               __entry->nr_writeback   = global_page_state(NR_WRITEBACK);
-               __entry->nr_unstable    = global_page_state(NR_UNSTABLE_NFS);
-               __entry->nr_dirtied     = global_page_state(NR_DIRTIED);
-               __entry->nr_written     = global_page_state(NR_WRITTEN);
+               __entry->nr_dirty       = global_node_page_state(NR_FILE_DIRTY);
+               __entry->nr_writeback   = global_node_page_state(NR_WRITEBACK);
+               __entry->nr_unstable    = global_node_page_state(NR_UNSTABLE_NFS);
+               __entry->nr_dirtied     = global_node_page_state(NR_DIRTIED);
+               __entry->nr_written     = global_node_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh   = dirty_thresh;
                __entry->dirty_limit    = global_wb_domain.dirty_limit;
index 73e93e53884d1098ceec4b3c8d31049c8d9fc70b..c7fd2778ed50edc52b7c0e5f1f651dcb141c9495 100644 (file)
@@ -1034,15 +1034,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 {
        bool need_loop;
 
-       /*
-        * Allow tasks that have access to memory reserves because they have
-        * been OOM killed to get memory anywhere.
-        */
-       if (unlikely(test_thread_flag(TIF_MEMDIE)))
-               return;
-       if (current->flags & PF_EXITING) /* Let dying task have memory */
-               return;
-
        task_lock(tsk);
        /*
         * Determine if a loop is necessary if another thread is doing
index de21f25e0d2ce9755b80ecc653269b8b5f8e5415..52e725d4a866b4ac30f16b4db075b9b197e4e53d 100644 (file)
@@ -165,20 +165,12 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
 
-       if (page)
-               memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-                                           1 << THREAD_SIZE_ORDER);
-
        return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_stack(unsigned long *stack)
 {
-       struct page *page = virt_to_page(stack);
-
-       memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-                                   -(1 << THREAD_SIZE_ORDER));
-       __free_pages(page, THREAD_SIZE_ORDER);
+       __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -223,9 +215,15 @@ static struct kmem_cache *mm_cachep;
 
 static void account_kernel_stack(unsigned long *stack, int account)
 {
-       struct zone *zone = page_zone(virt_to_page(stack));
+       /* All stack pages are in the same zone and belong to the same memcg. */
+       struct page *first_page = virt_to_page(stack);
+
+       mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                           THREAD_SIZE / 1024 * account);
 
-       mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+       memcg_kmem_update_page_stat(
+               first_page, MEMCG_KERNEL_STACK_KB,
+               account * (THREAD_SIZE / 1024));
 }
 
 void free_task(struct task_struct *tsk)
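
With NR_KERNEL_STACK_KB the stack counter is kept in kilobytes rather than in number of stacks, so each allocation or free adjusts it by THREAD_SIZE / 1024. Purely illustrative arithmetic below; the 16 KB THREAD_SIZE is an assumed example value, not something this patch defines:

#include <stdio.h>

#define THREAD_SIZE (16 * 1024)		/* example value only */

int main(void)
{
	long nr_kernel_stack_kb = 0;
	int account;

	account = 1;			/* a new stack is accounted (+1) */
	nr_kernel_stack_kb += THREAD_SIZE / 1024 * account;

	account = -1;			/* the stack is released (-1) */
	nr_kernel_stack_kb += THREAD_SIZE / 1024 * account;

	printf("delta per stack = %d KB, counter after alloc+free = %ld\n",
	       THREAD_SIZE / 1024, nr_kernel_stack_kb);	/* 16 KB and 0 */
	return 0;
}
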
index a8900a3bc27a895b65580a23a144e581bfe0149d..6f56a9e219facf003d802bd1e9c0cd2dea9824d3 100644 (file)
@@ -42,7 +42,7 @@ bool freezing_slow_path(struct task_struct *p)
        if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
                return false;
 
-       if (test_thread_flag(TIF_MEMDIE))
+       if (test_tsk_thread_flag(p, TIF_MEMDIE))
                return false;
 
        if (pm_nosig_freezing || cgroup_freezing(p))
index 017532193fb1c009b83801b27172f8210988f074..ddb3247a872a60b3ec5e716ac503f8bd13254d97 100644 (file)
@@ -308,12 +308,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
        if (is_ram == REGION_INTERSECTS)
                return __va(res->start);
 
-       if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) {
-               dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n",
-                               __func__);
-               return ERR_PTR(-ENXIO);
-       }
-
        if (!ref)
                return ERR_PTR(-EINVAL);
 
@@ -401,7 +395,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
        altmap->alloc -= nr_pfns;
 }
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
 struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 {
        /*
@@ -427,5 +420,4 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
 
        return pgmap ? pgmap->altmap : NULL;
 }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 #endif /* CONFIG_ZONE_DEVICE */
index d90df926b59fbdf8e76d057d85b55b31ef615f1d..9a0178c2ac1df6b68f52a31e96beda95a1517781 100644 (file)
@@ -1627,11 +1627,11 @@ static unsigned long minimum_image_size(unsigned long saveable)
        unsigned long size;
 
        size = global_page_state(NR_SLAB_RECLAIMABLE)
-               + global_page_state(NR_ACTIVE_ANON)
-               + global_page_state(NR_INACTIVE_ANON)
-               + global_page_state(NR_ACTIVE_FILE)
-               + global_page_state(NR_INACTIVE_FILE)
-               - global_page_state(NR_FILE_MAPPED);
+               + global_node_page_state(NR_ACTIVE_ANON)
+               + global_node_page_state(NR_INACTIVE_ANON)
+               + global_node_page_state(NR_ACTIVE_FILE)
+               + global_node_page_state(NR_INACTIVE_FILE)
+               - global_node_page_state(NR_FILE_MAPPED);
 
        return saveable <= size ? 0 : saveable - size;
 }
index 60cdf63867632bdca556d7ab32a2824138a48f4c..d4de33934dacc41d481269bd7fe7e7b2660e1efd 100644 (file)
@@ -3177,9 +3177,8 @@ void show_regs_print_info(const char *log_lvl)
 {
        dump_stack_print_info(log_lvl);
 
-       printk("%stask: %p ti: %p task.ti: %p\n",
-              log_lvl, current, current_thread_info(),
-              task_thread_info(current));
+       printk("%stask: %p task.stack: %p\n",
+              log_lvl, current, task_stack_page(current));
 }
 
 #endif
index 35f0dcb1cb4f6db187679edc586b5df543e85046..53954631a4e192e9c8a35d806b31345b5ece2423 100644 (file)
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_NUMA
        {
                .procname       = "zone_reclaim_mode",
-               .data           = &zone_reclaim_mode,
-               .maxlen         = sizeof(zone_reclaim_mode),
+               .data           = &node_reclaim_mode,
+               .maxlen         = sizeof(node_reclaim_mode),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
                .extra1         = &zero,
index 67d8c6838ba95fe244060e1d67362220e646514f..bd38aab05929813acae4db2cd09f609a83d2025c 100644 (file)
@@ -5,9 +5,9 @@ if HAVE_ARCH_KASAN
 
 config KASAN
        bool "KASan: runtime memory debugger"
-       depends on SLUB_DEBUG || (SLAB && !DEBUG_SLAB)
+       depends on SLUB || (SLAB && !DEBUG_SLAB)
        select CONSTRUCTORS
-       select STACKDEPOT if SLAB
+       select STACKDEPOT
        help
          Enables kernel address sanitizer - runtime memory debugger,
          designed to find out-of-bounds accesses and use-after-free bugs.
index d67c8288d95dba435bf608b6258c3168c8206d7a..9e8c7386b3a03e8d418dd09847c9c7f295e06e77 100644 (file)
@@ -144,7 +144,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
        buf = iov->iov_base + skip;
        copy = min(bytes, iov->iov_len - skip);
 
-       if (!fault_in_pages_writeable(buf, copy)) {
+       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
                kaddr = kmap_atomic(page);
                from = kaddr + offset;
 
@@ -175,6 +175,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
                copy = min(bytes, iov->iov_len - skip);
        }
        /* Too bad - revert to non-atomic kmap */
+
        kaddr = kmap(page);
        from = kaddr + offset;
        left = __copy_to_user(buf, from, copy);
@@ -193,6 +194,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
                bytes -= copy;
        }
        kunmap(page);
+
 done:
        if (skip == iov->iov_len) {
                iov++;
@@ -225,7 +227,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
        buf = iov->iov_base + skip;
        copy = min(bytes, iov->iov_len - skip);
 
-       if (!fault_in_pages_readable(buf, copy)) {
+       if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
                kaddr = kmap_atomic(page);
                to = kaddr + offset;
 
@@ -256,6 +258,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
                copy = min(bytes, iov->iov_len - skip);
        }
        /* Too bad - revert to non-atomic kmap */
+
        kaddr = kmap(page);
        to = kaddr + offset;
        left = __copy_from_user(to, buf, copy);
@@ -274,6 +277,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
                bytes -= copy;
        }
        kunmap(page);
+
 done:
        if (skip == iov->iov_len) {
                iov++;
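
The added IS_ENABLED(CONFIG_HIGHMEM) guard costs nothing at run time: IS_ENABLED() evaluates to a compile-time 0 or 1, so on kernels without highmem the fault-in plus kmap_atomic fast path is discarded by the compiler and only the plain copy path remains. A simplified userspace analogue of that pattern (this is not the kernel's actual IS_ENABLED() machinery; DEMO_CONFIG_HIGHMEM and try_fast_path() are made up for the illustration):

#include <stdio.h>

#define DEMO_CONFIG_HIGHMEM 0	/* flip to 1 to emulate CONFIG_HIGHMEM=y */

static int fast_path_attempted;

static int try_fast_path(void)
{
	fast_path_attempted = 1;
	return 0;		/* pretend the atomic attempt did not finish the copy */
}

int main(void)
{
	if (DEMO_CONFIG_HIGHMEM && try_fast_path()) {
		printf("copied via the fast path\n");
	} else {
		/* With the option off, the branch above is compiled away. */
		printf("slow path taken, fast path attempted: %d\n",
		       fast_path_attempted);
	}
	return 0;
}
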
index 53ad6c0831aebe6d3c9ec7cfc1387fa34ca7ea72..60f77f1d470a0589ccdeacbb52a9eb45a8a1cec1 100644 (file)
@@ -242,6 +242,7 @@ depot_stack_handle_t depot_save_stack(struct stack_trace *trace,
                 */
                alloc_flags &= ~GFP_ZONEMASK;
                alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
+               alloc_flags |= __GFP_NOWARN;
                page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER);
                if (page)
                        prealloc = page_address(page);
index 3c81803b00a32c44690893758cf0bb35d6f4db95..c0837845c17c408a123d5864c1bec354ee074723 100644 (file)
@@ -681,7 +681,7 @@ config IDLE_PAGE_TRACKING
          See Documentation/vm/idle_page_tracking.txt for more details.
 
 config ZONE_DEVICE
-       bool "Device memory (pmem, etc...) hotplug support" if EXPERT
+       bool "Device memory (pmem, etc...) hotplug support"
        depends on MEMORY_HOTPLUG
        depends on MEMORY_HOTREMOVE
        depends on SPARSEMEM_VMEMMAP
index ed173b8ae8f24a317e11da29735f0a5faa9e361e..efe2377420744d4d557b801ce4a265d3815cfb8f 100644 (file)
@@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout)
 EXPORT_SYMBOL(congestion_wait);
 
 /**
- * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
- * @zone: A zone to check if it is heavily congested
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
+ * @pgdat: A pgdat to check if it is heavily congested
  * @sync: SYNC or ASYNC IO
  * @timeout: timeout in jiffies
  *
  * In the event of a congested backing_dev (any backing_dev) and the given
- * @zone has experienced recent congestion, this waits for up to @timeout
+ * @pgdat has experienced recent congestion, this waits for up to @timeout
  * jiffies for either a BDI to exit congestion of the given @sync queue
  * or a write to complete.
  *
- * In the absence of zone congestion, cond_resched() is called to yield
+ * In the absence of pgdat congestion, cond_resched() is called to yield
  * the processor if necessary but otherwise does not sleep.
  *
  * The return value is 0 if the sleep is for the full timeout. Otherwise,
  * it is the number of jiffies that were still remaining when the function
  * returned. return_value == timeout implies the function did not sleep.
  */
-long wait_iff_congested(struct zone *zone, int sync, long timeout)
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
 {
        long ret;
        unsigned long start = jiffies;
@@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 
        /*
         * If there is no congestion, or heavy congestion is not being
-        * encountered in the current zone, yield if necessary instead
+        * encountered in the current pgdat, yield if necessary instead
         * of sleeping on the congestion queue
         */
        if (atomic_read(&nr_wb_congested[sync]) == 0 ||
-           !test_bit(ZONE_CONGESTED, &zone->flags)) {
+           !test_bit(PGDAT_CONGESTED, &pgdat->flags)) {
                cond_resched();
+
                /* In case we scheduled, work out time remaining */
                ret = timeout - (jiffies - start);
                if (ret < 0)
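
Whichever branch it takes, wait_iff_congested() reports how much of the caller's timeout remains, clamped at zero, so a caller that got preempted for longer than its budget simply sees no time left. The same remaining-time arithmetic as a standalone sketch with made-up tick values:

#include <stdio.h>

int main(void)
{
	long timeout = 100;		/* caller was willing to wait 100 ticks */
	long start = 1000;		/* tick count on entry (jiffies stand-in) */
	long now = 1160;		/* rescheduling cost exceeded the budget */
	long ret = timeout - (now - start);

	if (ret < 0)
		ret = 0;
	printf("time remaining = %ld ticks\n", ret);	/* prints 0 */
	return 0;
}
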
index 64df5fe052db656dcfd38999abdf561942dfd4c5..9affb290830421de71f82eac65d7e4f7ce78cb9d 100644 (file)
@@ -331,7 +331,7 @@ static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
 {
        if (cc->mode == MIGRATE_ASYNC) {
                if (!spin_trylock_irqsave(lock, *flags)) {
-                       cc->contended = COMPACT_CONTENDED_LOCK;
+                       cc->contended = true;
                        return false;
                }
        } else {
@@ -365,13 +365,13 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
        }
 
        if (fatal_signal_pending(current)) {
-               cc->contended = COMPACT_CONTENDED_SCHED;
+               cc->contended = true;
                return true;
        }
 
        if (need_resched()) {
                if (cc->mode == MIGRATE_ASYNC) {
-                       cc->contended = COMPACT_CONTENDED_SCHED;
+                       cc->contended = true;
                        return true;
                }
                cond_resched();
@@ -394,7 +394,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
        /* async compaction aborts if contended */
        if (need_resched()) {
                if (cc->mode == MIGRATE_ASYNC) {
-                       cc->contended = COMPACT_CONTENDED_SCHED;
+                       cc->contended = true;
                        return true;
                }
 
@@ -646,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
        list_for_each_entry(page, &cc->migratepages, lru)
                count[!!page_is_file_cache(page)]++;
 
-       mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
-       mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+       mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
+       mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -655,12 +655,12 @@ static bool too_many_isolated(struct zone *zone)
 {
        unsigned long active, inactive, isolated;
 
-       inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
-                                       zone_page_state(zone, NR_INACTIVE_ANON);
-       active = zone_page_state(zone, NR_ACTIVE_FILE) +
-                                       zone_page_state(zone, NR_ACTIVE_ANON);
-       isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
-                                       zone_page_state(zone, NR_ISOLATED_ANON);
+       inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
+                       node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
+       active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
+                       node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
+       isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
+                       node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
 
        return isolated > (inactive + active) / 2;
 }
@@ -752,7 +752,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                 * if contended.
                 */
                if (!(low_pfn % SWAP_CLUSTER_MAX)
-                   && compact_unlock_should_abort(&zone->lru_lock, flags,
+                   && compact_unlock_should_abort(zone_lru_lock(zone), flags,
                                                                &locked, cc))
                        break;
 
@@ -813,7 +813,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        if (unlikely(__PageMovable(page)) &&
                                        !PageIsolated(page)) {
                                if (locked) {
-                                       spin_unlock_irqrestore(&zone->lru_lock,
+                                       spin_unlock_irqrestore(zone_lru_lock(zone),
                                                                        flags);
                                        locked = false;
                                }
@@ -836,7 +836,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
                /* If we already hold the lock, we can skip some rechecking */
                if (!locked) {
-                       locked = compact_trylock_irqsave(&zone->lru_lock,
+                       locked = compact_trylock_irqsave(zone_lru_lock(zone),
                                                                &flags, cc);
                        if (!locked)
                                break;
@@ -856,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        }
                }
 
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 
                /* Try isolate the page */
                if (__isolate_lru_page(page, isolate_mode) != 0)
@@ -899,7 +899,7 @@ isolate_fail:
                 */
                if (nr_isolated) {
                        if (locked) {
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
+                               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
                                locked = false;
                        }
                        acct_isolated(zone, cc);
@@ -927,7 +927,7 @@ isolate_fail:
                low_pfn = end_pfn;
 
        if (locked)
-               spin_unlock_irqrestore(&zone->lru_lock, flags);
+               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
 
        /*
         * Update the pageblock-skip information and cached scanner pfn,
@@ -1200,7 +1200,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
        struct page *page;
        const isolate_mode_t isolate_mode =
                (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
-               (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
+               (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
 
        /*
         * Start at where we last stopped, or beginning of the zone as
@@ -1619,14 +1619,11 @@ out:
        trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync, ret);
 
-       if (ret == COMPACT_CONTENDED)
-               ret = COMPACT_PARTIAL;
-
        return ret;
 }
 
 static enum compact_result compact_zone_order(struct zone *zone, int order,
-               gfp_t gfp_mask, enum migrate_mode mode, int *contended,
+               gfp_t gfp_mask, enum compact_priority prio,
                unsigned int alloc_flags, int classzone_idx)
 {
        enum compact_result ret;
@@ -1636,7 +1633,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
                .order = order,
                .gfp_mask = gfp_mask,
                .zone = zone,
-               .mode = mode,
+               .mode = (prio == COMPACT_PRIO_ASYNC) ?
+                                       MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
                .alloc_flags = alloc_flags,
                .classzone_idx = classzone_idx,
                .direct_compaction = true,
@@ -1649,7 +1647,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
        VM_BUG_ON(!list_empty(&cc.freepages));
        VM_BUG_ON(!list_empty(&cc.migratepages));
 
-       *contended = cc.contended;
        return ret;
 }
 
@@ -1662,50 +1659,38 @@ int sysctl_extfrag_threshold = 500;
  * @alloc_flags: The allocation flags of the current allocation
  * @ac: The context of current allocation
  * @mode: The migration mode for async, sync light, or sync migration
- * @contended: Return value that determines if compaction was aborted due to
- *            need_resched() or lock contention
  *
  * This is the main entry point for direct page compaction.
  */
 enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
-               enum migrate_mode mode, int *contended)
+               enum compact_priority prio)
 {
        int may_enter_fs = gfp_mask & __GFP_FS;
        int may_perform_io = gfp_mask & __GFP_IO;
        struct zoneref *z;
        struct zone *zone;
        enum compact_result rc = COMPACT_SKIPPED;
-       int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
-
-       *contended = COMPACT_CONTENDED_NONE;
 
        /* Check if the GFP flags allow compaction */
-       if (!order || !may_enter_fs || !may_perform_io)
+       if (!may_enter_fs || !may_perform_io)
                return COMPACT_SKIPPED;
 
-       trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
+       trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
 
        /* Compact each zone in the list */
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                                                ac->nodemask) {
                enum compact_result status;
-               int zone_contended;
 
                if (compaction_deferred(zone, order)) {
                        rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
                        continue;
                }
 
-               status = compact_zone_order(zone, order, gfp_mask, mode,
-                               &zone_contended, alloc_flags,
-                               ac_classzone_idx(ac));
+               status = compact_zone_order(zone, order, gfp_mask, prio,
+                                       alloc_flags, ac_classzone_idx(ac));
                rc = max(status, rc);
-               /*
-                * It takes at least one zone that wasn't lock contended
-                * to clear all_zones_contended.
-                */
-               all_zones_contended &= zone_contended;
 
                /* If a normal allocation would succeed, stop compacting */
                if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
@@ -1717,59 +1702,29 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
                         * succeeds in this zone.
                         */
                        compaction_defer_reset(zone, order, false);
-                       /*
-                        * It is possible that async compaction aborted due to
-                        * need_resched() and the watermarks were ok thanks to
-                        * somebody else freeing memory. The allocation can
-                        * however still fail so we better signal the
-                        * need_resched() contention anyway (this will not
-                        * prevent the allocation attempt).
-                        */
-                       if (zone_contended == COMPACT_CONTENDED_SCHED)
-                               *contended = COMPACT_CONTENDED_SCHED;
 
-                       goto break_loop;
+                       break;
                }
 
-               if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE ||
-                                       status == COMPACT_PARTIAL_SKIPPED)) {
+               if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
+                                       status == COMPACT_PARTIAL_SKIPPED))
                        /*
                         * We think that allocation won't succeed in this zone
                         * so we defer compaction there. If it ends up
                         * succeeding after all, it will be reset.
                         */
                        defer_compaction(zone, order);
-               }
 
                /*
                 * We might have stopped compacting due to need_resched() in
                 * async compaction, or due to a fatal signal detected. In that
-                * case do not try further zones and signal need_resched()
-                * contention.
-                */
-               if ((zone_contended == COMPACT_CONTENDED_SCHED)
-                                       || fatal_signal_pending(current)) {
-                       *contended = COMPACT_CONTENDED_SCHED;
-                       goto break_loop;
-               }
-
-               continue;
-break_loop:
-               /*
-                * We might not have tried all the zones, so  be conservative
-                * and assume they are not all lock contended.
+                * case do not try further zones
                 */
-               all_zones_contended = 0;
-               break;
+               if ((prio == COMPACT_PRIO_ASYNC && need_resched())
+                                       || fatal_signal_pending(current))
+                       break;
        }
 
-       /*
-        * If at least one zone wasn't deferred or skipped, we report if all
-        * zones that were tried were lock contended.
-        */
-       if (rc > COMPACT_INACTIVE && all_zones_contended)
-               *contended = COMPACT_CONTENDED_LOCK;
-
        return rc;
 }
 
index e90c1543ec2dc0fa577bf07c5a6eea824c0eecf6..c5f5e46c6f7ff855f36efd934cce2d78ce73c549 100644 (file)
@@ -95,8 +95,8 @@
  *    ->swap_lock              (try_to_unmap_one)
  *    ->private_lock           (try_to_unmap_one)
  *    ->tree_lock              (try_to_unmap_one)
- *    ->zone.lru_lock          (follow_page->mark_page_accessed)
- *    ->zone.lru_lock          (check_pte_range->isolate_lru_page)
+ *    ->zone_lru_lock(zone)    (follow_page->mark_page_accessed)
+ *    ->zone_lru_lock(zone)    (check_pte_range->isolate_lru_page)
  *    ->private_lock           (page_remove_rmap->set_page_dirty)
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
@@ -218,11 +218,11 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 
        /* hugetlb pages do not participate in page cache accounting. */
        if (!PageHuge(page))
-               __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr);
+               __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
        if (PageSwapBacked(page)) {
-               __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr);
+               __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
                if (PageTransHuge(page))
-                       __dec_zone_page_state(page, NR_SHMEM_THPS);
+                       __dec_node_page_state(page, NR_SHMEM_THPS);
        } else {
                VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
        }
@@ -568,9 +568,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                 * hugetlb pages do not participate in page cache accounting.
                 */
                if (!PageHuge(new))
-                       __inc_zone_page_state(new, NR_FILE_PAGES);
+                       __inc_node_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
-                       __inc_zone_page_state(new, NR_SHMEM);
+                       __inc_node_page_state(new, NR_SHMEM);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
                mem_cgroup_migrate(old, new);
                radix_tree_preload_end();
@@ -677,7 +677,7 @@ static int __add_to_page_cache_locked(struct page *page,
 
        /* hugetlb pages do not participate in page cache accounting. */
        if (!huge)
-               __inc_zone_page_state(page, NR_FILE_PAGES);
+               __inc_node_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false, false);
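
[Editor's note] The filemap.c hunks above are part of the series moving file and shmem accounting from per-zone to per-node counters. A rough userspace sketch of the idea follows; the struct name and counter list are illustrative, and the real kernel helpers batch updates through per-cpu deltas.

#include <stdio.h>

/* Illustrative per-node counters; the kernel tracks many more. */
enum node_stat_item { NR_FILE_PAGES, NR_SHMEM, NR_SHMEM_THPS, NR_NODE_STAT_ITEMS };

struct pglist_data_sketch {
        long stat[NR_NODE_STAT_ITEMS];  /* one set per NUMA node, not per zone */
};

static void mod_node_state(struct pglist_data_sketch *pgdat,
                           enum node_stat_item item, long delta)
{
        pgdat->stat[item] += delta;
}

int main(void)
{
        struct pglist_data_sketch node0 = { { 0 } };

        mod_node_state(&node0, NR_FILE_PAGES, 512);   /* pages added to the page cache */
        mod_node_state(&node0, NR_FILE_PAGES, -512);  /* __delete_from_page_cache path */
        printf("NR_FILE_PAGES on node 0: %ld\n", node0.stat[NR_FILE_PAGES]);
        return 0;
}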
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3647334c2ef916ce9824153a359cb13cff3d9aba..2373f0a7d3405bb869ac4a3a4be175289af38114 100644 (file)
@@ -539,23 +539,26 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
 }
 
 /*
- * If THP is set to always then directly reclaim/compact as necessary
- * If set to defer then do no reclaim and defer to khugepaged
+ * If THP defrag is set to always then directly reclaim/compact as necessary
+ * If set to defer then do only background reclaim/compact and defer to khugepaged
  * If set to madvise and the VMA is flagged then directly reclaim/compact
+ * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
  */
 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
-       gfp_t reclaim_flags = 0;
-
-       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
-           (vma->vm_flags & VM_HUGEPAGE))
-               reclaim_flags = __GFP_DIRECT_RECLAIM;
-       else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-               reclaim_flags = __GFP_KSWAPD_RECLAIM;
-       else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
-               reclaim_flags = __GFP_DIRECT_RECLAIM;
-
-       return GFP_TRANSHUGE | reclaim_flags;
+       bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+
+       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+                               &transparent_hugepage_flags) && vma_madvised)
+               return GFP_TRANSHUGE;
+       else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+                                               &transparent_hugepage_flags))
+               return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+       else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+                                               &transparent_hugepage_flags))
+               return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+       return GFP_TRANSHUGE_LIGHT;
 }
 
 /* Caller must hold page table lock. */
@@ -1249,25 +1252,26 @@ out:
        return 0;
 }
 
-int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+/*
+ * Return true if we do MADV_FREE successfully on entire pmd page.
+ * Otherwise, return false.
+ */
+bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                pmd_t *pmd, unsigned long addr, unsigned long next)
-
 {
        spinlock_t *ptl;
        pmd_t orig_pmd;
        struct page *page;
        struct mm_struct *mm = tlb->mm;
-       int ret = 0;
+       bool ret = false;
 
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                goto out_unlocked;
 
        orig_pmd = *pmd;
-       if (is_huge_zero_pmd(orig_pmd)) {
-               ret = 1;
+       if (is_huge_zero_pmd(orig_pmd))
                goto out;
-       }
 
        page = pmd_page(orig_pmd);
        /*
@@ -1309,7 +1313,7 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                set_pmd_at(mm, addr, pmd, orig_pmd);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        }
-       ret = 1;
+       ret = true;
 out:
        spin_unlock(ptl);
 out_unlocked:
@@ -1586,7 +1590,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
        if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
                /* Last compound_mapcount is gone. */
-               __dec_zone_page_state(page, NR_ANON_THPS);
+               __dec_node_page_state(page, NR_ANON_THPS);
                if (TestClearPageDoubleMap(page)) {
                        /* No need in mapcount reference anymore */
                        for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -1818,7 +1822,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        pgoff_t end = -1;
        int i;
 
-       lruvec = mem_cgroup_page_lruvec(head, zone);
+       lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
 
        /* complete memcg works before add pages to LRU */
        mem_cgroup_split_huge_fixup(head);
@@ -1848,7 +1852,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
                spin_unlock(&head->mapping->tree_lock);
        }
 
-       spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
+       spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
        unfreeze_page(head);
 
@@ -2034,7 +2038,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                lru_add_drain();
 
        /* prevent PageLRU to go away from under us, and freeze lru stats */
-       spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
+       spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
 
        if (mapping) {
                void **pslot;
@@ -2061,7 +2065,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                        list_del(page_deferred_list(head));
                }
                if (mapping)
-                       __dec_zone_page_state(page, NR_SHMEM_THPS);
+                       __dec_node_page_state(page, NR_SHMEM_THPS);
                spin_unlock(&pgdata->split_queue_lock);
                __split_huge_page(page, list, flags);
                ret = 0;
@@ -2077,7 +2081,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                spin_unlock(&pgdata->split_queue_lock);
 fail:          if (mapping)
                        spin_unlock(&mapping->tree_lock);
-               spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
+               spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
                unfreeze_page(head);
                ret = -EBUSY;
        }
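
[Editor's note] The alloc_hugepage_direct_gfpmask() hunk earlier in this file encodes the THP defrag policy: "always" allows direct reclaim/compaction, "defer" allows only background (kswapd) effort, "madvise" allows direct effort only for VM_HUGEPAGE VMAs, and non-madvised direct attempts additionally get __GFP_NORETRY. A compressed userspace sketch of that decision follows; the flag values are placeholders, not the real gfp bits.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits; real values live in include/linux/gfp.h. */
#define GFP_TRANSHUGE_LIGHT   0x1u
#define GFP_TRANSHUGE         0x3u   /* LIGHT plus direct reclaim */
#define __GFP_KSWAPD_RECLAIM  0x4u
#define __GFP_NORETRY         0x8u

enum defrag_mode { DEFRAG_ALWAYS, DEFRAG_DEFER, DEFRAG_MADVISE, DEFRAG_NEVER };

static unsigned int hugepage_gfpmask(enum defrag_mode mode, bool vma_madvised)
{
        switch (mode) {
        case DEFRAG_MADVISE:
                return vma_madvised ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
        case DEFRAG_DEFER:
                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
        case DEFRAG_ALWAYS:
                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
        default:
                return GFP_TRANSHUGE_LIGHT;
        }
}

int main(void)
{
        printf("always, not madvised: %#x\n", hugepage_gfpmask(DEFRAG_ALWAYS, false));
        printf("madvise, madvised:    %#x\n", hugepage_gfpmask(DEFRAG_MADVISE, true));
        return 0;
}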
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51a04e5e937351d6f138c91b211505f549e3bd5d..f904246a8fd5ae29d0d727f9eb1ccad6b9272cca 100644 (file)
@@ -4391,7 +4391,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 /*
  * This function is called from memory failure code.
- * Assume the caller holds page lock of the head page.
  */
 int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
diff --git a/mm/internal.h b/mm/internal.h
index 9b6a6c43ac391d5548d7a6e62eca30f4aadca875..1501304f87a41aaa48b0b32d16917a51ba73a1b7 100644 (file)
@@ -78,7 +78,7 @@ extern unsigned long highest_memmap_pfn;
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
-extern bool zone_reclaimable(struct zone *zone);
+extern bool pgdat_reclaimable(struct pglist_data *pgdat);
 
 /*
  * in mm/rmap.c:
@@ -185,10 +185,7 @@ struct compact_control {
        const unsigned int alloc_flags; /* alloc flags of a direct compactor */
        const int classzone_idx;        /* zone index of a direct compactor */
        struct zone *zone;
-       int contended;                  /* Signal need_sched() or lock
-                                        * contention detected during
-                                        * compaction
-                                        */
+       bool contended;                 /* Signal lock or sched contention */
 };
 
 unsigned long
@@ -433,10 +430,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
-#define ZONE_RECLAIM_NOSCAN    -2
-#define ZONE_RECLAIM_FULL      -1
-#define ZONE_RECLAIM_SOME      0
-#define ZONE_RECLAIM_SUCCESS   1
+#define NODE_RECLAIM_NOSCAN    -2
+#define NODE_RECLAIM_FULL      -1
+#define NODE_RECLAIM_SOME      0
+#define NODE_RECLAIM_SUCCESS   1
 
 extern int hwpoison_filter(struct page *p);
 
@@ -467,7 +464,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET           0x40 /* check for correct cpuset */
 #define ALLOC_CMA              0x80 /* allow allocations from CMA areas */
-#define ALLOC_FAIR             0x100 /* fair zone allocation */
 
 enum ttu_flags;
 struct tlbflush_unmap_batch;
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1548749a3d452735c78028f4fb30ec3ac6bb10e8..2976a9ee104f9248731dc07c3619e34b79998041 100644 (file)
@@ -7,5 +7,4 @@ CFLAGS_REMOVE_kasan.o = -pg
 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
 
-obj-y := kasan.o report.o kasan_init.o
-obj-$(CONFIG_SLAB) += quarantine.o
+obj-y := kasan.o report.o kasan_init.o quarantine.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6845f9294696cedb9d6567bd09a7a86862bd5c3b..b6f99e81bfebdce9cc13ee7fb72e665d117b1ce8 100644 (file)
@@ -351,7 +351,6 @@ void kasan_free_pages(struct page *page, unsigned int order)
                                KASAN_FREE_PAGE);
 }
 
-#ifdef CONFIG_SLAB
 /*
  * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
  * For larger allocations larger redzones are used.
@@ -373,16 +372,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
                        unsigned long *flags)
 {
        int redzone_adjust;
-       /* Make sure the adjusted size is still less than
-        * KMALLOC_MAX_CACHE_SIZE.
-        * TODO: this check is only useful for SLAB, but not SLUB. We'll need
-        * to skip it for SLUB when it starts using kasan_cache_create().
-        */
-       if (*size > KMALLOC_MAX_CACHE_SIZE -
-           sizeof(struct kasan_alloc_meta) -
-           sizeof(struct kasan_free_meta))
-               return;
-       *flags |= SLAB_KASAN;
+       int orig_size = *size;
+
        /* Add alloc meta. */
        cache->kasan_info.alloc_meta_offset = *size;
        *size += sizeof(struct kasan_alloc_meta);
@@ -395,14 +386,26 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
        }
        redzone_adjust = optimal_redzone(cache->object_size) -
                (*size - cache->object_size);
+
        if (redzone_adjust > 0)
                *size += redzone_adjust;
-       *size = min(KMALLOC_MAX_CACHE_SIZE,
-                   max(*size,
-                       cache->object_size +
-                       optimal_redzone(cache->object_size)));
+
+       *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size +
+                                       optimal_redzone(cache->object_size)));
+
+       /*
+        * If the metadata doesn't fit, don't enable KASAN at all.
+        */
+       if (*size <= cache->kasan_info.alloc_meta_offset ||
+                       *size <= cache->kasan_info.free_meta_offset) {
+               cache->kasan_info.alloc_meta_offset = 0;
+               cache->kasan_info.free_meta_offset = 0;
+               *size = orig_size;
+               return;
+       }
+
+       *flags |= SLAB_KASAN;
 }
-#endif
 
 void kasan_cache_shrink(struct kmem_cache *cache)
 {
@@ -414,6 +417,14 @@ void kasan_cache_destroy(struct kmem_cache *cache)
        quarantine_remove_cache(cache);
 }
 
+size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+       return (cache->kasan_info.alloc_meta_offset ?
+               sizeof(struct kasan_alloc_meta) : 0) +
+               (cache->kasan_info.free_meta_offset ?
+               sizeof(struct kasan_free_meta) : 0);
+}
+
 void kasan_poison_slab(struct page *page)
 {
        kasan_poison_shadow(page_address(page),
@@ -431,16 +442,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
        kasan_poison_shadow(object,
                        round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
                        KASAN_KMALLOC_REDZONE);
-#ifdef CONFIG_SLAB
        if (cache->flags & SLAB_KASAN) {
                struct kasan_alloc_meta *alloc_info =
                        get_alloc_info(cache, object);
                alloc_info->state = KASAN_STATE_INIT;
        }
-#endif
 }
 
-#ifdef CONFIG_SLAB
 static inline int in_irqentry_text(unsigned long ptr)
 {
        return (ptr >= (unsigned long)&__irqentry_text_start &&
@@ -501,7 +509,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
        BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
        return (void *)object + cache->kasan_info.free_meta_offset;
 }
-#endif
 
 void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
 {
@@ -522,16 +529,16 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
 
 bool kasan_slab_free(struct kmem_cache *cache, void *object)
 {
-#ifdef CONFIG_SLAB
        /* RCU slabs could be legally used after free within the RCU period */
        if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
                return false;
 
        if (likely(cache->flags & SLAB_KASAN)) {
-               struct kasan_alloc_meta *alloc_info =
-                       get_alloc_info(cache, object);
-               struct kasan_free_meta *free_info =
-                       get_free_info(cache, object);
+               struct kasan_alloc_meta *alloc_info;
+               struct kasan_free_meta *free_info;
+
+               alloc_info = get_alloc_info(cache, object);
+               free_info = get_free_info(cache, object);
 
                switch (alloc_info->state) {
                case KASAN_STATE_ALLOC:
@@ -550,10 +557,6 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
                }
        }
        return false;
-#else
-       kasan_poison_slab_free(cache, object);
-       return false;
-#endif
 }
 
 void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
@@ -576,7 +579,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
        kasan_unpoison_shadow(object, size);
        kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
                KASAN_KMALLOC_REDZONE);
-#ifdef CONFIG_SLAB
        if (cache->flags & SLAB_KASAN) {
                struct kasan_alloc_meta *alloc_info =
                        get_alloc_info(cache, object);
@@ -585,7 +587,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
                alloc_info->alloc_size = size;
                set_track(&alloc_info->track, flags);
        }
-#endif
 }
 EXPORT_SYMBOL(kasan_kmalloc);
 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index fb87923552ef2b90c65847fdb5c56644560ca7ec..31972cdba433a97f114147ecf2a823ac91cdc5bc 100644 (file)
@@ -95,7 +95,6 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
 struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
                                        const void *object);
 
-
 static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 {
        return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
@@ -110,7 +109,7 @@ static inline bool kasan_report_enabled(void)
 void kasan_report(unsigned long addr, size_t size,
                bool is_write, unsigned long ip);
 
-#ifdef CONFIG_SLAB
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
 void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
 void quarantine_reduce(void);
 void quarantine_remove_cache(struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b3c122ddd45483566de8311ab1e1cf814ff18176..861b9776841a94ee69f485fa91e13cdeebc0d8be 100644 (file)
@@ -116,7 +116,6 @@ static inline bool init_task_stack_addr(const void *addr)
                        sizeof(init_thread_union.stack));
 }
 
-#ifdef CONFIG_SLAB
 static void print_track(struct kasan_track *track)
 {
        pr_err("PID = %u\n", track->pid);
@@ -130,8 +129,8 @@ static void print_track(struct kasan_track *track)
        }
 }
 
-static void object_err(struct kmem_cache *cache, struct page *page,
-                       void *object, char *unused_reason)
+static void kasan_object_err(struct kmem_cache *cache, struct page *page,
+                               void *object, char *unused_reason)
 {
        struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
        struct kasan_free_meta *free_info;
@@ -162,7 +161,6 @@ static void object_err(struct kmem_cache *cache, struct page *page,
                break;
        }
 }
-#endif
 
 static void print_address_description(struct kasan_access_info *info)
 {
@@ -177,7 +175,7 @@ static void print_address_description(struct kasan_access_info *info)
                        struct kmem_cache *cache = page->slab_cache;
                        object = nearest_obj(cache, page,
                                                (void *)info->access_addr);
-                       object_err(cache, page, object,
+                       kasan_object_err(cache, page, object,
                                        "kasan: bad access detected");
                        return;
                }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7dbee698d6aa8594c7a65fded02a5b48265f348d..79c52d0061af591b0417e5d67462f49fda8ac632 100644 (file)
@@ -480,7 +480,7 @@ void __khugepaged_exit(struct mm_struct *mm)
 static void release_pte_page(struct page *page)
 {
        /* 0 stands for page_is_file_cache(page) == false */
-       dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+       dec_node_page_state(page, NR_ISOLATED_ANON + 0);
        unlock_page(page);
        putback_lru_page(page);
 }
@@ -576,7 +576,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                        goto out;
                }
                /* 0 stands for page_is_file_cache(page) == false */
-               inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+               inc_node_page_state(page, NR_ISOLATED_ANON + 0);
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(PageLRU(page), page);
 
@@ -672,10 +672,10 @@ static bool khugepaged_scan_abort(int nid)
        int i;
 
        /*
-        * If zone_reclaim_mode is disabled, then no extra effort is made to
+        * If node_reclaim_mode is disabled, then no extra effort is made to
         * allocate memory locally.
         */
-       if (!zone_reclaim_mode)
+       if (!node_reclaim_mode)
                return false;
 
        /* If there is a count for this node already, it must be acceptable */
@@ -694,7 +694,7 @@ static bool khugepaged_scan_abort(int nid)
 /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
 static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
 {
-       return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
+       return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
 }
 
 #ifdef CONFIG_NUMA
@@ -1483,10 +1483,10 @@ tree_unlocked:
                }
 
                local_irq_save(flags);
-               __inc_zone_page_state(new_page, NR_SHMEM_THPS);
+               __inc_node_page_state(new_page, NR_SHMEM_THPS);
                if (nr_none) {
-                       __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none);
-                       __mod_zone_page_state(zone, NR_SHMEM, nr_none);
+                       __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
+                       __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
                }
                local_irq_restore(flags);
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 04320d3adbef9922edda52e43cc516463e9e33c6..086292f7c59d1a69881068758150c2140feb8253 100644 (file)
@@ -1485,8 +1485,10 @@ static int kmemleak_scan_thread(void *arg)
         * Wait before the first scan to allow the system to fully initialize.
         */
        if (first_run) {
+               signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
                first_run = 0;
-               ssleep(SECS_FIRST_SCAN);
+               while (timeout && !kthread_should_stop())
+                       timeout = schedule_timeout_interruptible(timeout);
        }
 
        while (!kthread_should_stop()) {
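
[Editor's note] The kmemleak hunk above replaces the uninterruptible ssleep() before the first scan with a loop that keeps re-arming the remaining timeout but bails out as soon as the thread is asked to stop. A userspace analogue of that pattern is sketched below, using a plain stop flag and short sleeps in place of kthread_should_stop()/schedule_timeout_interruptible().

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static volatile bool stop_requested;    /* would be set elsewhere, e.g. by a signal handler */

/* Wait up to total_ms, but check the stop flag every slice. */
static void interruptible_wait(long total_ms)
{
        const long slice_ms = 100;

        while (total_ms > 0 && !stop_requested) {
                struct timespec ts = { 0, slice_ms * 1000000L };

                nanosleep(&ts, NULL);
                total_ms -= slice_ms;
        }
}

int main(void)
{
        interruptible_wait(1000);       /* kmemleak waits SECS_FIRST_SCAN before scanning */
        printf("first scan would start now (stop=%d)\n", (int)stop_requested);
        return 0;
}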
diff --git a/mm/memblock.c b/mm/memblock.c
index ca099159b45a82c2f1fcfbad28fc14221953a8ee..ff5ff3b5f1ea774403b2231aa3053865a52ee31d 100644 (file)
@@ -20,7 +20,7 @@
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
 
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
 #include <linux/io.h>
 
 #include "internal.h"
@@ -1027,7 +1027,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
                                *out_end = m_end;
                        if (out_nid)
                                *out_nid = m_nid;
-                       idx_a++;
+                       idx_a--;
                        *idx = (u32)idx_a | (u64)idx_b << 32;
                        return;
                }
@@ -1465,15 +1465,16 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
        return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
 }
 
-void __init memblock_enforce_memory_limit(phys_addr_t limit)
+static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
 {
        phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
        struct memblock_region *r;
 
-       if (!limit)
-               return;
-
-       /* find out max address */
+       /*
+        * translate the memory @limit size into the max address within one of
+        * the memory memblock regions, if the @limit exceeds the total size
+        * of those regions, max_addr will keep original value ULLONG_MAX
+        */
        for_each_memblock(memory, r) {
                if (limit <= r->size) {
                        max_addr = r->base + limit;
@@ -1482,6 +1483,22 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
                limit -= r->size;
        }
 
+       return max_addr;
+}
+
+void __init memblock_enforce_memory_limit(phys_addr_t limit)
+{
+       phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+
+       if (!limit)
+               return;
+
+       max_addr = __find_max_addr(limit);
+
+       /* @limit exceeds the total size of the memory, do nothing */
+       if (max_addr == (phys_addr_t)ULLONG_MAX)
+               return;
+
        /* truncate both memory and reserved regions */
        memblock_remove_range(&memblock.memory, max_addr,
                              (phys_addr_t)ULLONG_MAX);
@@ -1489,6 +1506,36 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
                              (phys_addr_t)ULLONG_MAX);
 }
 
+void __init memblock_mem_limit_remove_map(phys_addr_t limit)
+{
+       struct memblock_type *type = &memblock.memory;
+       phys_addr_t max_addr;
+       int i, ret, start_rgn, end_rgn;
+
+       if (!limit)
+               return;
+
+       max_addr = __find_max_addr(limit);
+
+       /* @limit exceeds the total size of the memory, do nothing */
+       if (max_addr == (phys_addr_t)ULLONG_MAX)
+               return;
+
+       ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX,
+                               &start_rgn, &end_rgn);
+       if (ret)
+               return;
+
+       /* remove all the MAP regions above the limit */
+       for (i = end_rgn - 1; i >= start_rgn; i--) {
+               if (!memblock_is_nomap(&type->regions[i]))
+                       memblock_remove_region(type, i);
+       }
+       /* truncate the reserved regions */
+       memblock_remove_range(&memblock.reserved, max_addr,
+                             (phys_addr_t)ULLONG_MAX);
+}
+
 static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
 {
        unsigned int left = 0, right = type->cnt;
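
[Editor's note] __find_max_addr() above walks the memory regions, consuming the requested size limit until it falls inside one of them; both memblock_enforce_memory_limit() and the new memblock_mem_limit_remove_map() then truncate everything above the returned address. A self-contained sketch of that walk over a plain array of regions follows (types and names are simplified stand-ins).

#include <stdint.h>
#include <stdio.h>

struct region { uint64_t base, size; };

#define ADDR_MAX UINT64_MAX

/* Translate a size limit into the highest usable address, as in __find_max_addr(). */
static uint64_t find_max_addr(const struct region *r, int nr, uint64_t limit)
{
        uint64_t max_addr = ADDR_MAX;

        for (int i = 0; i < nr; i++) {
                if (limit <= r[i].size) {
                        max_addr = r[i].base + limit;
                        break;
                }
                limit -= r[i].size;
        }
        /* ADDR_MAX means the limit exceeds all memory: callers then do nothing. */
        return max_addr;
}

int main(void)
{
        struct region mem[] = {
                { 0x00000000, 0x20000000 },     /* 512 MiB */
                { 0x40000000, 0x20000000 },     /* 512 MiB, with a hole before it */
        };

        printf("limit 768M -> max addr %#llx\n",
               (unsigned long long)find_max_addr(mem, 2, 0x30000000));
        return 0;
}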
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3a84c64f35cf807c5e6d6ddbf6c601d5ff334a3..c265212bec8c0f412e9f23e260a5ff0d490ea046 100644 (file)
@@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = {
  * their hierarchy representation
  */
 
-struct mem_cgroup_tree_per_zone {
+struct mem_cgroup_tree_per_node {
        struct rb_root rb_root;
        spinlock_t lock;
 };
 
-struct mem_cgroup_tree_per_node {
-       struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
-};
-
 struct mem_cgroup_tree {
        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 };
@@ -323,15 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 #endif /* !CONFIG_SLOB */
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
-{
-       int nid = zone_to_nid(zone);
-       int zid = zone_idx(zone);
-
-       return &memcg->nodeinfo[nid]->zoneinfo[zid];
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -383,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page)
        return ino;
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
+static struct mem_cgroup_per_node *
+mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 {
        int nid = page_to_nid(page);
-       int zid = page_zonenum(page);
 
-       return &memcg->nodeinfo[nid]->zoneinfo[zid];
+       return memcg->nodeinfo[nid];
 }
 
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_node_zone(int nid, int zid)
+static struct mem_cgroup_tree_per_node *
+soft_limit_tree_node(int nid)
 {
-       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+       return soft_limit_tree.rb_tree_per_node[nid];
 }
 
-static struct mem_cgroup_tree_per_zone *
+static struct mem_cgroup_tree_per_node *
 soft_limit_tree_from_page(struct page *page)
 {
        int nid = page_to_nid(page);
-       int zid = page_zonenum(page);
 
-       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+       return soft_limit_tree.rb_tree_per_node[nid];
 }
 
-static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
-                                        struct mem_cgroup_tree_per_zone *mctz,
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
+                                        struct mem_cgroup_tree_per_node *mctz,
                                         unsigned long new_usage_in_excess)
 {
        struct rb_node **p = &mctz->rb_root.rb_node;
        struct rb_node *parent = NULL;
-       struct mem_cgroup_per_zone *mz_node;
+       struct mem_cgroup_per_node *mz_node;
 
        if (mz->on_tree)
                return;
@@ -423,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
                return;
        while (*p) {
                parent = *p;
-               mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+               mz_node = rb_entry(parent, struct mem_cgroup_per_node,
                                        tree_node);
                if (mz->usage_in_excess < mz_node->usage_in_excess)
                        p = &(*p)->rb_left;
@@ -439,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
        mz->on_tree = true;
 }
 
-static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
-                                        struct mem_cgroup_tree_per_zone *mctz)
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+                                        struct mem_cgroup_tree_per_node *mctz)
 {
        if (!mz->on_tree)
                return;
@@ -448,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
        mz->on_tree = false;
 }
 
-static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
-                                      struct mem_cgroup_tree_per_zone *mctz)
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+                                      struct mem_cgroup_tree_per_node *mctz)
 {
        unsigned long flags;
 
@@ -473,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 {
        unsigned long excess;
-       struct mem_cgroup_per_zone *mz;
-       struct mem_cgroup_tree_per_zone *mctz;
+       struct mem_cgroup_per_node *mz;
+       struct mem_cgroup_tree_per_node *mctz;
 
        mctz = soft_limit_tree_from_page(page);
        /*
@@ -482,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
         * because their event counter is not touched.
         */
        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-               mz = mem_cgroup_page_zoneinfo(memcg, page);
+               mz = mem_cgroup_page_nodeinfo(memcg, page);
                excess = soft_limit_excess(memcg);
                /*
                 * We have to update the tree if mz is on RB-tree or
@@ -507,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 
 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 {
-       struct mem_cgroup_tree_per_zone *mctz;
-       struct mem_cgroup_per_zone *mz;
-       int nid, zid;
+       struct mem_cgroup_tree_per_node *mctz;
+       struct mem_cgroup_per_node *mz;
+       int nid;
 
        for_each_node(nid) {
-               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-                       mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
-                       mctz = soft_limit_tree_node_zone(nid, zid);
-                       mem_cgroup_remove_exceeded(mz, mctz);
-               }
+               mz = mem_cgroup_nodeinfo(memcg, nid);
+               mctz = soft_limit_tree_node(nid);
+               mem_cgroup_remove_exceeded(mz, mctz);
        }
 }
 
-static struct mem_cgroup_per_zone *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+static struct mem_cgroup_per_node *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 {
        struct rb_node *rightmost = NULL;
-       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_per_node *mz;
 
 retry:
        mz = NULL;
@@ -532,7 +515,7 @@ retry:
        if (!rightmost)
                goto done;              /* Nothing to reclaim from */
 
-       mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+       mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
        /*
         * Remove the node now but someone else can add it back,
         * we will to add it back at the end of reclaim to its correct
@@ -546,10 +529,10 @@ done:
        return mz;
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+static struct mem_cgroup_per_node *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 {
-       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_per_node *mz;
 
        spin_lock_irq(&mctz->lock);
        mz = __mem_cgroup_largest_soft_limit_node(mctz);
@@ -643,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
                                           int nid, unsigned int lru_mask)
 {
        unsigned long nr = 0;
-       int zid;
+       struct mem_cgroup_per_node *mz;
+       enum lru_list lru;
 
        VM_BUG_ON((unsigned)nid >= nr_node_ids);
 
-       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-               struct mem_cgroup_per_zone *mz;
-               enum lru_list lru;
-
-               for_each_lru(lru) {
-                       if (!(BIT(lru) & lru_mask))
-                               continue;
-                       mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
-                       nr += mz->lru_size[lru];
-               }
+       for_each_lru(lru) {
+               if (!(BIT(lru) & lru_mask))
+                       continue;
+               mz = mem_cgroup_nodeinfo(memcg, nid);
+               nr += mz->lru_size[lru];
        }
        return nr;
 }
@@ -809,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
        rcu_read_lock();
 
        if (reclaim) {
-               struct mem_cgroup_per_zone *mz;
+               struct mem_cgroup_per_node *mz;
 
-               mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+               mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
                iter = &mz->iter[reclaim->priority];
 
                if (prev && reclaim->generation != iter->generation)
@@ -910,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 {
        struct mem_cgroup *memcg = dead_memcg;
        struct mem_cgroup_reclaim_iter *iter;
-       struct mem_cgroup_per_zone *mz;
-       int nid, zid;
+       struct mem_cgroup_per_node *mz;
+       int nid;
        int i;
 
        while ((memcg = parent_mem_cgroup(memcg))) {
                for_each_node(nid) {
-                       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-                               mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
-                               for (i = 0; i <= DEF_PRIORITY; i++) {
-                                       iter = &mz->iter[i];
-                                       cmpxchg(&iter->position,
-                                               dead_memcg, NULL);
-                               }
+                       mz = mem_cgroup_nodeinfo(memcg, nid);
+                       for (i = 0; i <= DEF_PRIORITY; i++) {
+                               iter = &mz->iter[i];
+                               cmpxchg(&iter->position,
+                                       dead_memcg, NULL);
                        }
                }
        }
@@ -943,39 +920,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
             iter != NULL;                              \
             iter = mem_cgroup_iter(NULL, iter, NULL))
 
-/**
- * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
- * @zone: zone of the wanted lruvec
- * @memcg: memcg of the wanted lruvec
- *
- * Returns the lru list vector holding pages for the given @zone and
- * @mem.  This can be the global zone lruvec, if the memory controller
- * is disabled.
- */
-struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
-                                     struct mem_cgroup *memcg)
-{
-       struct mem_cgroup_per_zone *mz;
-       struct lruvec *lruvec;
-
-       if (mem_cgroup_disabled()) {
-               lruvec = &zone->lruvec;
-               goto out;
-       }
-
-       mz = mem_cgroup_zone_zoneinfo(memcg, zone);
-       lruvec = &mz->lruvec;
-out:
-       /*
-        * Since a node can be onlined after the mem_cgroup was created,
-        * we have to be prepared to initialize lruvec->zone here;
-        * and if offlined then reonlined, we need to reinitialize it.
-        */
-       if (unlikely(lruvec->zone != zone))
-               lruvec->zone = zone;
-       return lruvec;
-}
-
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
@@ -985,14 +929,14 @@ out:
  * and putback protocol: the LRU lock must be held, and the page must
  * either be PageLRU() or the caller must have isolated/allocated it.
  */
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 {
-       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_per_node *mz;
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
        if (mem_cgroup_disabled()) {
-               lruvec = &zone->lruvec;
+               lruvec = &pgdat->lruvec;
                goto out;
        }
 
@@ -1004,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
        if (!memcg)
                memcg = root_mem_cgroup;
 
-       mz = mem_cgroup_page_zoneinfo(memcg, page);
+       mz = mem_cgroup_page_nodeinfo(memcg, page);
        lruvec = &mz->lruvec;
 out:
        /*
@@ -1012,8 +956,8 @@ out:
         * we have to be prepared to initialize lruvec->zone here;
         * and if offlined then reonlined, we need to reinitialize it.
         */
-       if (unlikely(lruvec->zone != zone))
-               lruvec->zone = zone;
+       if (unlikely(lruvec->pgdat != pgdat))
+               lruvec->pgdat = pgdat;
        return lruvec;
 }
 
@@ -1030,17 +974,15 @@ out:
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                                int nr_pages)
 {
-       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_per_node *mz;
        unsigned long *lru_size;
        long size;
        bool empty;
 
-       __update_lru_size(lruvec, lru, nr_pages);
-
        if (mem_cgroup_disabled())
                return;
 
-       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+       mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        lru_size = mz->lru_size + lru;
        empty = list_empty(lruvec->lists + lru);
 
@@ -1276,9 +1218,9 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
-       if (fatal_signal_pending(current) || task_will_free_mem(current)) {
+       if (task_will_free_mem(current)) {
                mark_oom_victim(current);
-               try_oom_reaper(current);
+               wake_oom_reaper(current);
                goto unlock;
        }
 
@@ -1433,7 +1375,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 #endif
 
 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
-                                  struct zone *zone,
+                                  pg_data_t *pgdat,
                                   gfp_t gfp_mask,
                                   unsigned long *total_scanned)
 {
@@ -1443,7 +1385,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
        unsigned long excess;
        unsigned long nr_scanned;
        struct mem_cgroup_reclaim_cookie reclaim = {
-               .zone = zone,
+               .pgdat = pgdat,
                .priority = 0,
        };
 
@@ -1473,8 +1415,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
                        }
                        continue;
                }
-               total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
-                                                    zone, &nr_scanned);
+               total += mem_cgroup_shrink_node(victim, gfp_mask, false,
+                                       pgdat, &nr_scanned);
                *total_scanned += nr_scanned;
                if (!soft_limit_excess(root_memcg))
                        break;
@@ -2107,11 +2049,11 @@ static void lock_page_lru(struct page *page, int *isolated)
 {
        struct zone *zone = page_zone(page);
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        if (PageLRU(page)) {
                struct lruvec *lruvec;
 
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
                ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_lru(page));
                *isolated = 1;
@@ -2126,12 +2068,12 @@ static void unlock_page_lru(struct page *page, int isolated)
        if (isolated) {
                struct lruvec *lruvec;
 
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
                VM_BUG_ON_PAGE(PageLRU(page), page);
                SetPageLRU(page);
                add_page_to_lru_list(page, lruvec, page_lru(page));
        }
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 }
 
 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -2431,7 +2373,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
 
 /*
  * Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock and migration entries setup in all page mappings.
+ * zone_lru_lock and migration entries setup in all page mappings.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
@@ -2601,22 +2543,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
        return ret;
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                            gfp_t gfp_mask,
                                            unsigned long *total_scanned)
 {
        unsigned long nr_reclaimed = 0;
-       struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+       struct mem_cgroup_per_node *mz, *next_mz = NULL;
        unsigned long reclaimed;
        int loop = 0;
-       struct mem_cgroup_tree_per_zone *mctz;
+       struct mem_cgroup_tree_per_node *mctz;
        unsigned long excess;
        unsigned long nr_scanned;
 
        if (order > 0)
                return 0;
 
-       mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+       mctz = soft_limit_tree_node(pgdat->node_id);
        /*
         * This loop can run a while, specially if mem_cgroup's continuously
         * keep exceeding their soft limit and putting the system under
@@ -2631,7 +2573,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                        break;
 
                nr_scanned = 0;
-               reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+               reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
                                                    gfp_mask, &nr_scanned);
                nr_reclaimed += reclaimed;
                *total_scanned += nr_scanned;
@@ -3252,22 +3194,21 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_DEBUG_VM
        {
-               int nid, zid;
-               struct mem_cgroup_per_zone *mz;
+               pg_data_t *pgdat;
+               struct mem_cgroup_per_node *mz;
                struct zone_reclaim_stat *rstat;
                unsigned long recent_rotated[2] = {0, 0};
                unsigned long recent_scanned[2] = {0, 0};
 
-               for_each_online_node(nid)
-                       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-                               mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
-                               rstat = &mz->lruvec.reclaim_stat;
+               for_each_online_pgdat(pgdat) {
+                       mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+                       rstat = &mz->lruvec.reclaim_stat;
 
-                               recent_rotated[0] += rstat->recent_rotated[0];
-                               recent_rotated[1] += rstat->recent_rotated[1];
-                               recent_scanned[0] += rstat->recent_scanned[0];
-                               recent_scanned[1] += rstat->recent_scanned[1];
-                       }
+                       recent_rotated[0] += rstat->recent_rotated[0];
+                       recent_rotated[1] += rstat->recent_rotated[1];
+                       recent_scanned[0] += rstat->recent_scanned[0];
+                       recent_scanned[1] += rstat->recent_scanned[1];
+               }
                seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
                seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
                seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
@@ -4147,11 +4088,10 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
        return idr_find(&mem_cgroup_idr, id);
 }
 
-static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn;
-       struct mem_cgroup_per_zone *mz;
-       int zone, tmp = node;
+       int tmp = node;
        /*
         * This routine is called against possible nodes.
         * But it's BUG to call kmalloc() against offline node.
@@ -4166,18 +4106,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
        if (!pn)
                return 1;
 
-       for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-               mz = &pn->zoneinfo[zone];
-               lruvec_init(&mz->lruvec);
-               mz->usage_in_excess = 0;
-               mz->on_tree = false;
-               mz->memcg = memcg;
-       }
+       lruvec_init(&pn->lruvec);
+       pn->usage_in_excess = 0;
+       pn->on_tree = false;
+       pn->memcg = memcg;
+
        memcg->nodeinfo[node] = pn;
        return 0;
 }
 
-static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
        kfree(memcg->nodeinfo[node]);
 }
@@ -4188,7 +4126,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
 
        memcg_wb_domain_exit(memcg);
        for_each_node(node)
-               free_mem_cgroup_per_zone_info(memcg, node);
+               free_mem_cgroup_per_node_info(memcg, node);
        free_percpu(memcg->stat);
        kfree(memcg);
 }
@@ -4217,7 +4155,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
                goto fail;
 
        for_each_node(node)
-               if (alloc_mem_cgroup_per_zone_info(memcg, node))
+               if (alloc_mem_cgroup_per_node_info(memcg, node))
                        goto fail;
 
        if (memcg_wb_domain_init(memcg, GFP_KERNEL))
@@ -5233,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
        seq_printf(m, "file %llu\n",
                   (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
        seq_printf(m, "kernel_stack %llu\n",
-                  (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+                  (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
        seq_printf(m, "slab %llu\n",
                   (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
                         stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
@@ -5820,18 +5758,12 @@ static int __init mem_cgroup_init(void)
 
        for_each_node(node) {
                struct mem_cgroup_tree_per_node *rtpn;
-               int zone;
 
                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
                                    node_online(node) ? node : NUMA_NO_NODE);
 
-               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-                       struct mem_cgroup_tree_per_zone *rtpz;
-
-                       rtpz = &rtpn->rb_tree_per_zone[zone];
-                       rtpz->rb_root = RB_ROOT;
-                       spin_lock_init(&rtpz->lock);
-               }
+               rtpn->rb_root = RB_ROOT;
+               spin_lock_init(&rtpn->lock);
                soft_limit_tree.rb_tree_per_node[node] = rtpn;
        }
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2fcca6b0e005f0b973af6f13a91ee3a0659267fc..de88f33519c0d6398de8fcc06bfa82f2a477dd95 100644 (file)
@@ -741,8 +741,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
         * page->lru because it can be used in other hugepage operations,
         * such as __unmap_hugepage_range() and gather_surplus_pages().
         * So instead we use page_mapping() and PageAnon().
-        * We assume that this function is called with page lock held,
-        * so there is no race between isolation and mapping/unmapping.
         */
        if (!(page_mapping(hpage) || PageAnon(hpage))) {
                res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1663,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags)
        put_hwpoison_page(page);
        if (!ret) {
                LIST_HEAD(pagelist);
-               inc_zone_page_state(page, NR_ISOLATED_ANON +
+               inc_node_page_state(page, NR_ISOLATED_ANON +
                                        page_is_file_cache(page));
                list_add(&page->lru, &pagelist);
                ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
@@ -1671,7 +1669,7 @@ static int __soft_offline_page(struct page *page, int flags)
                if (ret) {
                        if (!list_empty(&pagelist)) {
                                list_del(&page->lru);
-                               dec_zone_page_state(page, NR_ISOLATED_ANON +
+                               dec_node_page_state(page, NR_ISOLATED_ANON +
                                                page_is_file_cache(page));
                                putback_lru_page(page);
                        }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 82d0b98d27f878015153dd4d360ad7825bab9c48..3894b65b155555f11076f0cae90f71e2475b6929 100644 (file)
@@ -1209,9 +1209,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 
                arch_refresh_nodedata(nid, pgdat);
        } else {
-               /* Reset the nr_zones and classzone_idx to 0 before reuse */
+               /* Reset the nr_zones, order and classzone_idx before reuse */
                pgdat->nr_zones = 0;
-               pgdat->classzone_idx = 0;
+               pgdat->kswapd_order = 0;
+               pgdat->kswapd_classzone_idx = 0;
        }
 
        /* we can use NODE_DATA(nid) from here */
@@ -1547,6 +1548,37 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
        return 0;
 }
 
+static struct page *new_node_page(struct page *page, unsigned long private,
+               int **result)
+{
+       gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+       int nid = page_to_nid(page);
+       nodemask_t nmask = node_online_map;
+       struct page *new_page;
+
+       /*
+        * TODO: allocate a destination hugepage from a nearest neighbor node,
+        * accordance with memory policy of the user process if possible. For
+        * now as a simple work-around, we use the next node for destination.
+        */
+       if (PageHuge(page))
+               return alloc_huge_page_node(page_hstate(compound_head(page)),
+                                       next_node_in(nid, nmask));
+
+       node_clear(nid, nmask);
+       if (PageHighMem(page)
+           || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
+               gfp_mask |= __GFP_HIGHMEM;
+
+       new_page = __alloc_pages_nodemask(gfp_mask, 0,
+                                       node_zonelist(nid, gfp_mask), &nmask);
+       if (!new_page)
+               new_page = __alloc_pages(gfp_mask, 0,
+                                       node_zonelist(nid, gfp_mask));
+
+       return new_page;
+}
+
 #define NR_OFFLINE_AT_ONCE_PAGES       (256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -1586,7 +1618,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                        put_page(page);
                        list_add_tail(&page->lru, &source);
                        move_pages--;
-                       inc_zone_page_state(page, NR_ISOLATED_ANON +
+                       inc_node_page_state(page, NR_ISOLATED_ANON +
                                            page_is_file_cache(page));
 
                } else {
@@ -1610,11 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                        goto out;
                }
 
-               /*
-                * alloc_migrate_target should be improooooved!!
-                * migrate_pages returns # of failed pages.
-                */
-               ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
+               /* Allocate a new page from the nearest neighbor node */
+               ret = migrate_pages(&source, new_node_page, NULL, 0,
                                        MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
                if (ret)
                        putback_movable_pages(&source);
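
[Editor's note] new_node_page() above prefers any online node other than the one being offlined (for hugetlb pages, simply the next node in the online mask) and only falls back to an unrestricted allocation if that fails. A toy sketch of the node-selection part follows; the node mask is modeled as a plain bitmask and nothing here is the kernel's nodemask API.

#include <stdio.h>

#define MAX_NODES 8

/* Pick the next online node after 'nid', wrapping around, skipping 'nid' itself. */
static int next_node_excluding(unsigned int online_mask, int nid)
{
        for (int i = 1; i < MAX_NODES; i++) {
                int candidate = (nid + i) % MAX_NODES;

                if (online_mask & (1u << candidate))
                        return candidate;
        }
        return nid;     /* nothing else online: caller falls back to any node */
}

int main(void)
{
        unsigned int online = 0x0b;     /* nodes 0, 1 and 3 online */

        printf("offlining node 1, allocate from node %d\n",
               next_node_excluding(online & ~(1u << 1), 1));
        return 0;
}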
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 53e40d3f39335d2602c94517c5f4e150538067bc..d8c4e38fb5f4be1d9748dc77f214c8a285374f8d 100644 (file)
@@ -962,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
                if (!isolate_lru_page(page)) {
                        list_add_tail(&page->lru, pagelist);
-                       inc_zone_page_state(page, NR_ISOLATED_ANON +
+                       inc_node_page_state(page, NR_ISOLATED_ANON +
                                            page_is_file_cache(page));
                }
        }
diff --git a/mm/mempool.c b/mm/mempool.c
index 8f65464da5de84d8bca0a8d97352f3c800f90614..47a659dedd44405a277190d3bf9b1655636573d5 100644 (file)
@@ -306,7 +306,7 @@ EXPORT_SYMBOL(mempool_resize);
  * returns NULL. Note that due to preallocation, this function
  * *never* fails when called from process contexts. (it might
  * fail if called from an IRQ context.)
- * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported.
+ * Note: using __GFP_ZERO is not supported.
  */
 void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 {
@@ -315,27 +315,16 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
        wait_queue_t wait;
        gfp_t gfp_temp;
 
-       /* If oom killed, memory reserves are essential to prevent livelock */
-       VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
-       /* No element size to zero on allocation */
        VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
-
        might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
+       gfp_mask |= __GFP_NOMEMALLOC;   /* don't allocate emergency reserves */
        gfp_mask |= __GFP_NORETRY;      /* don't loop in __alloc_pages */
        gfp_mask |= __GFP_NOWARN;       /* failures are OK */
 
        gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
 
 repeat_alloc:
-       if (likely(pool->curr_nr)) {
-               /*
-                * Don't allocate from emergency reserves if there are
-                * elements available.  This check is racy, but it will
-                * be rechecked each loop.
-                */
-               gfp_temp |= __GFP_NOMEMALLOC;
-       }
 
        element = pool->alloc(gfp_temp, pool->pool_data);
        if (likely(element != NULL))
@@ -359,12 +348,11 @@ repeat_alloc:
         * We use gfp mask w/o direct reclaim or IO for the first round.  If
         * alloc failed with that and @pool was empty, retry immediately.
         */
-       if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) {
+       if (gfp_temp != gfp_mask) {
                spin_unlock_irqrestore(&pool->lock, flags);
                gfp_temp = gfp_mask;
                goto repeat_alloc;
        }
-       gfp_temp = gfp_mask;
 
        /* We must not sleep if !__GFP_DIRECT_RECLAIM */
        if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
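
[Editor's note] The mempool_alloc() hunks revert to keeping allocations away from emergency reserves unconditionally: __GFP_NOMEMALLOC, __GFP_NORETRY and __GFP_NOWARN are always OR'd in, and the first pass additionally drops direct reclaim and IO so a failed attempt can fall back to the pool immediately. A tiny sketch of that flag preparation follows; the bit values are placeholders, not the real gfp flags.

#include <stdio.h>

/* Placeholder gfp bits for illustration only. */
#define __GFP_DIRECT_RECLAIM 0x01u
#define __GFP_IO             0x02u
#define __GFP_NOMEMALLOC     0x04u
#define __GFP_NORETRY        0x08u
#define __GFP_NOWARN         0x10u

struct gfp_pair { unsigned int full, first_try; };

static struct gfp_pair mempool_gfp(unsigned int gfp_mask)
{
        struct gfp_pair g;

        gfp_mask |= __GFP_NOMEMALLOC;   /* never dip into emergency reserves */
        gfp_mask |= __GFP_NORETRY;      /* don't loop in the page allocator */
        gfp_mask |= __GFP_NOWARN;       /* failures fall back to the pool */

        g.full = gfp_mask;
        g.first_try = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
        return g;
}

int main(void)
{
        struct gfp_pair g = mempool_gfp(__GFP_DIRECT_RECLAIM | __GFP_IO);

        printf("first attempt %#x, retry %#x\n", g.first_try, g.full);
        return 0;
}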
diff --git a/mm/migrate.c b/mm/migrate.c
index 2232f6923cc720963d4cd4a2ac105d3b984e9a7a..f7ee04a5ae27a2934de9fab746667597bffafef3 100644 (file)
@@ -168,7 +168,7 @@ void putback_movable_pages(struct list_head *l)
                        continue;
                }
                list_del(&page->lru);
-               dec_zone_page_state(page, NR_ISOLATED_ANON +
+               dec_node_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
                /*
                 * We isolated non-lru movable page so here we can use
@@ -501,19 +501,21 @@ int migrate_page_move_mapping(struct address_space *mapping,
         * new page and drop references to the old page.
         *
         * Note that anonymous pages are accounted for
-        * via NR_FILE_PAGES and NR_ANON_PAGES if they
+        * via NR_FILE_PAGES and NR_ANON_MAPPED if they
         * are mapped to swap space.
         */
        if (newzone != oldzone) {
-               __dec_zone_state(oldzone, NR_FILE_PAGES);
-               __inc_zone_state(newzone, NR_FILE_PAGES);
+               __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
+               __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
                if (PageSwapBacked(page) && !PageSwapCache(page)) {
-                       __dec_zone_state(oldzone, NR_SHMEM);
-                       __inc_zone_state(newzone, NR_SHMEM);
+                       __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
+                       __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
                }
                if (dirty && mapping_cap_account_dirty(mapping)) {
-                       __dec_zone_state(oldzone, NR_FILE_DIRTY);
-                       __inc_zone_state(newzone, NR_FILE_DIRTY);
+                       __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
+                       __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
+                       __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
+                       __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
                }
        }
        local_irq_enable();
@@ -1119,7 +1121,7 @@ out:
                 * restored.
                 */
                list_del(&page->lru);
-               dec_zone_page_state(page, NR_ISOLATED_ANON +
+               dec_node_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
        }
 
@@ -1460,7 +1462,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                err = isolate_lru_page(page);
                if (!err) {
                        list_add_tail(&page->lru, &pagelist);
-                       inc_zone_page_state(page, NR_ISOLATED_ANON +
+                       inc_node_page_state(page, NR_ISOLATED_ANON +
                                            page_is_file_cache(page));
                }
 put_and_set:
@@ -1726,15 +1728,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
                                   unsigned long nr_migrate_pages)
 {
        int z;
+
+       if (!pgdat_reclaimable(pgdat))
+               return false;
+
        for (z = pgdat->nr_zones - 1; z >= 0; z--) {
                struct zone *zone = pgdat->node_zones + z;
 
                if (!populated_zone(zone))
                        continue;
 
-               if (!zone_reclaimable(zone))
-                       continue;
-
                /* Avoid waking kswapd by allocating pages_to_migrate pages. */
                if (!zone_watermark_ok(zone, 0,
                                       high_wmark_pages(zone) +
@@ -1828,7 +1831,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
        }
 
        page_lru = page_is_file_cache(page);
-       mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+       mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
                                hpage_nr_pages(page));
 
        /*
@@ -1886,7 +1889,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
        if (nr_remaining) {
                if (!list_empty(&migratepages)) {
                        list_del(&page->lru);
-                       dec_zone_page_state(page, NR_ISOLATED_ANON +
+                       dec_node_page_state(page, NR_ISOLATED_ANON +
                                        page_is_file_cache(page));
                        putback_lru_page(page);
                }
@@ -1931,7 +1934,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                goto out_dropref;
 
        new_page = alloc_pages_node(node,
-               (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
+               (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
                HPAGE_PMD_ORDER);
        if (!new_page)
                goto out_fail;
@@ -1979,7 +1982,7 @@ fail_putback:
                /* Retake the callers reference and putback on LRU */
                get_page(page);
                putback_lru_page(page);
-               mod_zone_page_state(page_zone(page),
+               mod_node_page_state(page_pgdat(page),
                         NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
 
                goto out_unlock;
@@ -2030,7 +2033,7 @@ fail_putback:
        count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
        count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 
-       mod_zone_page_state(page_zone(page),
+       mod_node_page_state(page_pgdat(page),
                        NR_ISOLATED_ANON + page_lru,
                        -HPAGE_PMD_NR);
        return isolated;
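
Note: a recurring pattern in the migrate.c hunks above is that isolation accounting moves from per-zone to per-node counters while keeping the NR_ISOLATED_ANON + page_is_file_cache(page) indexing trick. The sketch below is a simplified user-space model of that indexing; the enum, struct and helper are stand-ins, not the kernel's types.

#include <stdio.h>

enum node_stat_item { NR_ISOLATED_ANON, NR_ISOLATED_FILE, NR_STAT_ITEMS };

struct pglist_data { long vm_stat[NR_STAT_ITEMS]; };

/* hypothetical counterpart of mod_node_page_state() */
static void mod_node_state(struct pglist_data *pgdat,
                           enum node_stat_item item, long delta)
{
        pgdat->vm_stat[item] += delta;
}

int main(void)
{
        struct pglist_data node = { { 0, 0 } };
        int is_file = 1;        /* page_is_file_cache(page) == 1 for file pages */

        /* isolate a file page: NR_ISOLATED_ANON + 1 selects NR_ISOLATED_FILE */
        mod_node_state(&node, NR_ISOLATED_ANON + is_file, 1);
        /* put it back */
        mod_node_state(&node, NR_ISOLATED_ANON + is_file, -1);

        printf("anon=%ld file=%ld\n",
               node.vm_stat[NR_ISOLATED_ANON], node.vm_stat[NR_ISOLATED_FILE]);
        return 0;
}
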
index ef8dc9f395c4cb52e5e80c73f31654dc6936223b..14645be06e301cfc4a62301e25818365e766874e 100644 (file)
@@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
        if (PageLRU(page)) {
                struct lruvec *lruvec;
 
-               lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+               lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
                if (getpage)
                        get_page(page);
                ClearPageLRU(page);
@@ -188,7 +188,7 @@ unsigned int munlock_vma_page(struct page *page)
         * might otherwise copy PageMlocked to part of the tail pages before
         * we clear it in the head page. It also stabilizes hpage_nr_pages().
         */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
 
        nr_pages = hpage_nr_pages(page);
        if (!TestClearPageMlocked(page))
@@ -197,14 +197,14 @@ unsigned int munlock_vma_page(struct page *page)
        __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
 
        if (__munlock_isolate_lru_page(page, true)) {
-               spin_unlock_irq(&zone->lru_lock);
+               spin_unlock_irq(zone_lru_lock(zone));
                __munlock_isolated_page(page);
                goto out;
        }
        __munlock_isolation_failed(page);
 
 unlock_out:
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
 out:
        return nr_pages - 1;
@@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
        pagevec_init(&pvec_putback, 0);
 
        /* Phase 1: page isolation */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        for (i = 0; i < nr; i++) {
                struct page *page = pvec->pages[i];
 
@@ -315,7 +315,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
        }
        delta_munlocked = -nr + pagevec_count(&pvec_putback);
        __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
 
        /* Now we can release pins of pages that we are not munlocking */
        pagevec_release(&pvec_putback);
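
Note: the mlock.c hunks replace direct &zone->lru_lock usage with a zone_lru_lock() accessor, reflecting that in this series the LRU lock is owned by the node rather than the zone. Below is a toy user-space sketch of that indirection; the structures are simplified assumptions, and only the "one lock per node, shared by its zones" shape is taken from the series.

#include <pthread.h>
#include <stdio.h>

struct pglist_data { pthread_mutex_t lru_lock; };
struct zone { struct pglist_data *zone_pgdat; };

/* sketch of the accessor the hunks above call */
static pthread_mutex_t *zone_lru_lock(struct zone *zone)
{
        return &zone->zone_pgdat->lru_lock;
}

int main(void)
{
        struct pglist_data node = { PTHREAD_MUTEX_INITIALIZER };
        struct zone dma = { &node }, normal = { &node };

        pthread_mutex_lock(zone_lru_lock(&dma));
        /* ... isolate/munlock pages under the node-wide LRU lock ... */
        pthread_mutex_unlock(zone_lru_lock(&dma));

        printf("zones share one LRU lock: %d\n",
               zone_lru_lock(&dma) == zone_lru_lock(&normal));
        return 0;
}
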
index 86b18f334f4f759d8342a5220f8ac850c770ab03..d44bee96a5fe4f5141d56916591889424a195684 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -621,7 +621,6 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 {
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next = vma->vm_next;
-       struct vm_area_struct *importer = NULL;
        struct address_space *mapping = NULL;
        struct rb_root *root = NULL;
        struct anon_vma *anon_vma = NULL;
@@ -631,17 +630,25 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        int remove_next = 0;
 
        if (next && !insert) {
-               struct vm_area_struct *exporter = NULL;
+               struct vm_area_struct *exporter = NULL, *importer = NULL;
 
                if (end >= next->vm_end) {
                        /*
                         * vma expands, overlapping all the next, and
                         * perhaps the one after too (mprotect case 6).
                         */
-again:                 remove_next = 1 + (end > next->vm_end);
+                       remove_next = 1 + (end > next->vm_end);
                        end = next->vm_end;
                        exporter = next;
                        importer = vma;
+
+                       /*
+                        * If next doesn't have anon_vma, import from vma after
+                        * next, if the vma overlaps with it.
+                        */
+                       if (remove_next == 2 && next && !next->anon_vma)
+                               exporter = next->vm_next;
+
                } else if (end > next->vm_start) {
                        /*
                         * vma expands, overlapping part of the next:
@@ -675,7 +682,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
                                return error;
                }
        }
-
+again:
        vma_adjust_trans_huge(vma, start, end, adjust_next);
 
        if (file) {
@@ -796,8 +803,11 @@ again:                     remove_next = 1 + (end > next->vm_end);
                 * up the code too much to do both in one go.
                 */
                next = vma->vm_next;
-               if (remove_next == 2)
+               if (remove_next == 2) {
+                       remove_next = 1;
+                       end = next->vm_end;
                        goto again;
+               }
                else if (next)
                        vma_gap_update(next);
                else
index d4a929d79470aa0c9edc6131854dbf309ca39d47..7d0a275df822e9e14c55e5d472cfc473ac3ae173 100644 (file)
@@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 
        /*
         * Do not even consider tasks which are explicitly marked oom
-        * unkillable or have been already oom reaped.
+        * unkillable or have already been oom reaped or are in
+        * the middle of vfork.
         */
        adj = (long)p->signal->oom_score_adj;
        if (adj == OOM_SCORE_ADJ_MIN ||
-                       test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
+                       test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
+                       in_vfork(p)) {
                task_unlock(p);
                return 0;
        }
@@ -281,10 +283,22 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 
        /*
         * This task already has access to memory reserves and is being killed.
-        * Don't allow any other task to have access to the reserves.
+        * Don't allow any other task to have access to the reserves unless
+        * the task has MMF_OOM_REAPED, because the chances that it would
+        * release any memory are quite low.
         */
-       if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
-               return OOM_SCAN_ABORT;
+       if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
+               struct task_struct *p = find_lock_task_mm(task);
+               enum oom_scan_t ret = OOM_SCAN_ABORT;
+
+               if (p) {
+                       if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
+                               ret = OOM_SCAN_CONTINUE;
+                       task_unlock(p);
+               }
+
+               return ret;
+       }
 
        /*
         * If task is allocating a lot of memory and has been marked to be
@@ -415,7 +429,7 @@ bool oom_killer_disabled __read_mostly;
  * task's threads: if one of those is using this mm then this task was also
  * using it.
  */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
 {
        struct task_struct *t;
 
@@ -554,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk)
                schedule_timeout_idle(HZ/10);
 
        if (attempts > MAX_OOM_REAP_RETRIES) {
+               struct task_struct *p;
+
                pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
                                task_pid_nr(tsk), tsk->comm);
+
+               /*
+                * If we've already tried to reap this task in the past and
+                * failed, it probably doesn't make much sense to try yet again,
+                * so hide the mm from the oom killer so that it can move on
+                * to another task with a different mm struct.
+                */
+               p = find_lock_task_mm(tsk);
+               if (p) {
+                       if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
+                               pr_info("oom_reaper: giving up pid:%d (%s)\n",
+                                               task_pid_nr(tsk), tsk->comm);
+                               set_bit(MMF_OOM_REAPED, &p->mm->flags);
+                       }
+                       task_unlock(p);
+               }
+
                debug_show_all_locks();
        }
 
@@ -594,7 +627,7 @@ static int oom_reaper(void *unused)
        return 0;
 }
 
-static void wake_oom_reaper(struct task_struct *tsk)
+void wake_oom_reaper(struct task_struct *tsk)
 {
        if (!oom_reaper_th)
                return;
@@ -612,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
        wake_up(&oom_reaper_wait);
 }
 
-/* Check if we can reap the given task. This has to be called with stable
- * tsk->mm
- */
-void try_oom_reaper(struct task_struct *tsk)
-{
-       struct mm_struct *mm = tsk->mm;
-       struct task_struct *p;
-
-       if (!mm)
-               return;
-
-       /*
-        * There might be other threads/processes which are either not
-        * dying or even not killable.
-        */
-       if (atomic_read(&mm->mm_users) > 1) {
-               rcu_read_lock();
-               for_each_process(p) {
-                       if (!process_shares_mm(p, mm))
-                               continue;
-                       if (fatal_signal_pending(p))
-                               continue;
-
-                       /*
-                        * If the task is exiting make sure the whole thread group
-                        * is exiting and cannot acces mm anymore.
-                        */
-                       if (signal_group_exit(p->signal))
-                               continue;
-
-                       /* Give up */
-                       rcu_read_unlock();
-                       return;
-               }
-               rcu_read_unlock();
-       }
-
-       wake_oom_reaper(tsk);
-}
-
 static int __init oom_init(void)
 {
        oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -663,10 +656,6 @@ static int __init oom_init(void)
        return 0;
 }
 subsys_initcall(oom_init)
-#else
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-}
 #endif
 
 /**
@@ -743,6 +732,80 @@ void oom_killer_enable(void)
        oom_killer_disabled = false;
 }
 
+static inline bool __task_will_free_mem(struct task_struct *task)
+{
+       struct signal_struct *sig = task->signal;
+
+       /*
+        * A coredumping process may sleep for an extended period in exit_mm(),
+        * so the oom killer cannot assume that the process will promptly exit
+        * and release memory.
+        */
+       if (sig->flags & SIGNAL_GROUP_COREDUMP)
+               return false;
+
+       if (sig->flags & SIGNAL_GROUP_EXIT)
+               return true;
+
+       if (thread_group_empty(task) && (task->flags & PF_EXITING))
+               return true;
+
+       return false;
+}
+
+/*
+ * Checks whether the given task is dying or exiting and likely to
+ * release its address space. This means that all threads and processes
+ * sharing the same mm have to be killed or exiting.
+ * Caller has to make sure that task->mm is stable (hold task_lock or
+ * operate on current).
+ */
+bool task_will_free_mem(struct task_struct *task)
+{
+       struct mm_struct *mm = task->mm;
+       struct task_struct *p;
+       bool ret;
+
+       /*
+        * Skip tasks without mm because it might have passed its exit_mm and
+        * exit_oom_victim. oom_reaper could have rescued that but do not rely
+        * on that for now. We can consider find_lock_task_mm in future.
+        */
+       if (!mm)
+               return false;
+
+       if (!__task_will_free_mem(task))
+               return false;
+
+       /*
+        * This task has already been drained by the oom reaper, so there is
+        * only a small chance it will free any more memory.
+        */
+       if (test_bit(MMF_OOM_REAPED, &mm->flags))
+               return false;
+
+       if (atomic_read(&mm->mm_users) <= 1)
+               return true;
+
+       /*
+        * This is really pessimistic but we do not have any reliable way
+        * to check that external processes share with our mm
+        */
+       rcu_read_lock();
+       for_each_process(p) {
+               if (!process_shares_mm(p, mm))
+                       continue;
+               if (same_thread_group(task, p))
+                       continue;
+               ret = __task_will_free_mem(p);
+               if (!ret)
+                       break;
+       }
+       rcu_read_unlock();
+
+       return ret;
+}
+
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
@@ -765,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
         * its children or threads, just set TIF_MEMDIE so it can die quickly
         */
        task_lock(p);
-       if (p->mm && task_will_free_mem(p)) {
+       if (task_will_free_mem(p)) {
                mark_oom_victim(p);
-               try_oom_reaper(p);
+               wake_oom_reaper(p);
                task_unlock(p);
                put_task_struct(p);
                return;
@@ -850,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                        continue;
                if (same_thread_group(p, victim))
                        continue;
-               if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
-                   p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+               if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
                        /*
                         * We cannot use oom_reaper for the mm shared by this
                         * process because it wouldn't get killed and so the
-                        * memory might be still used.
+                        * memory might be still used. Hide the mm from the oom
+                        * killer to guarantee OOM forward progress.
                         */
                        can_oom_reap = false;
+                       set_bit(MMF_OOM_REAPED, &mm->flags);
+                       pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
+                                       task_pid_nr(victim), victim->comm,
+                                       task_pid_nr(p), p->comm);
                        continue;
                }
                do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
@@ -939,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc)
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
-        *
-        * But don't select if current has already released its mm and cleared
-        * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
         */
-       if (current->mm &&
-           (fatal_signal_pending(current) || task_will_free_mem(current))) {
+       if (task_will_free_mem(current)) {
                mark_oom_victim(current);
-               try_oom_reaper(current);
+               wake_oom_reaper(current);
                return true;
        }
 
index d578d2a56b192d5cc566fea4b88ecae81a431818..f4cd7d8005c9071dc1caf77a155eb3f2dc611b61 100644 (file)
@@ -267,26 +267,35 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
  */
 
 /**
- * zone_dirtyable_memory - number of dirtyable pages in a zone
- * @zone: the zone
+ * node_dirtyable_memory - number of dirtyable pages in a node
+ * @pgdat: the node
  *
- * Returns the zone's number of pages potentially available for dirty
- * page cache.  This is the base value for the per-zone dirty limits.
+ * Returns the node's number of pages potentially available for dirty
+ * page cache.  This is the base value for the per-node dirty limits.
  */
-static unsigned long zone_dirtyable_memory(struct zone *zone)
+static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
 {
-       unsigned long nr_pages;
+       unsigned long nr_pages = 0;
+       int z;
+
+       for (z = 0; z < MAX_NR_ZONES; z++) {
+               struct zone *zone = pgdat->node_zones + z;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               nr_pages += zone_page_state(zone, NR_FREE_PAGES);
+       }
 
-       nr_pages = zone_page_state(zone, NR_FREE_PAGES);
        /*
         * Pages reserved for the kernel should not be considered
         * dirtyable, to prevent a situation where reclaim has to
         * clean pages in order to balance the zones.
         */
-       nr_pages -= min(nr_pages, zone->totalreserve_pages);
+       nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
 
-       nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
-       nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+       nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
+       nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
 
        return nr_pages;
 }
@@ -299,13 +308,26 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
        int i;
 
        for_each_node_state(node, N_HIGH_MEMORY) {
-               for (i = 0; i < MAX_NR_ZONES; i++) {
-                       struct zone *z = &NODE_DATA(node)->node_zones[i];
+               for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
+                       struct zone *z;
+                       unsigned long nr_pages;
+
+                       if (!is_highmem_idx(i))
+                               continue;
+
+                       z = &NODE_DATA(node)->node_zones[i];
+                       if (!populated_zone(z))
+                               continue;
 
-                       if (is_highmem(z))
-                               x += zone_dirtyable_memory(z);
+                       nr_pages = zone_page_state(z, NR_FREE_PAGES);
+                       /* watch for underflows */
+                       nr_pages -= min(nr_pages, high_wmark_pages(z));
+                       nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
+                       nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
+                       x += nr_pages;
                }
        }
+
        /*
         * Unreclaimable memory (kernel memory or anonymous memory
         * without swap) can bring down the dirtyable pages below
@@ -348,8 +370,8 @@ static unsigned long global_dirtyable_memory(void)
         */
        x -= min(x, totalreserve_pages);
 
-       x += global_page_state(NR_INACTIVE_FILE);
-       x += global_page_state(NR_ACTIVE_FILE);
+       x += global_node_page_state(NR_INACTIVE_FILE);
+       x += global_node_page_state(NR_ACTIVE_FILE);
 
        if (!vm_highmem_is_dirtyable)
                x -= highmem_dirtyable_memory(x);
@@ -445,23 +467,23 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 }
 
 /**
- * zone_dirty_limit - maximum number of dirty pages allowed in a zone
- * @zone: the zone
+ * node_dirty_limit - maximum number of dirty pages allowed in a node
+ * @pgdat: the node
  *
- * Returns the maximum number of dirty pages allowed in a zone, based
- * on the zone's dirtyable memory.
+ * Returns the maximum number of dirty pages allowed in a node, based
+ * on the node's dirtyable memory.
  */
-static unsigned long zone_dirty_limit(struct zone *zone)
+static unsigned long node_dirty_limit(struct pglist_data *pgdat)
 {
-       unsigned long zone_memory = zone_dirtyable_memory(zone);
+       unsigned long node_memory = node_dirtyable_memory(pgdat);
        struct task_struct *tsk = current;
        unsigned long dirty;
 
        if (vm_dirty_bytes)
                dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
-                       zone_memory / global_dirtyable_memory();
+                       node_memory / global_dirtyable_memory();
        else
-               dirty = vm_dirty_ratio * zone_memory / 100;
+               dirty = vm_dirty_ratio * node_memory / 100;
 
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
                dirty += dirty / 4;
@@ -470,19 +492,22 @@ static unsigned long zone_dirty_limit(struct zone *zone)
 }
 
 /**
- * zone_dirty_ok - tells whether a zone is within its dirty limits
- * @zone: the zone to check
+ * node_dirty_ok - tells whether a node is within its dirty limits
+ * @pgdat: the node to check
  *
- * Returns %true when the dirty pages in @zone are within the zone's
+ * Returns %true when the dirty pages in @pgdat are within the node's
  * dirty limit, %false if the limit is exceeded.
  */
-bool zone_dirty_ok(struct zone *zone)
+bool node_dirty_ok(struct pglist_data *pgdat)
 {
-       unsigned long limit = zone_dirty_limit(zone);
+       unsigned long limit = node_dirty_limit(pgdat);
+       unsigned long nr_pages = 0;
+
+       nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
+       nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
+       nr_pages += node_page_state(pgdat, NR_WRITEBACK);
 
-       return zone_page_state(zone, NR_FILE_DIRTY) +
-              zone_page_state(zone, NR_UNSTABLE_NFS) +
-              zone_page_state(zone, NR_WRITEBACK) <= limit;
+       return nr_pages <= limit;
 }
 
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -1570,10 +1595,10 @@ static void balance_dirty_pages(struct address_space *mapping,
                 * written to the server's write cache, but has not yet
                 * been flushed to permanent storage.
                 */
-               nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-                                       global_page_state(NR_UNSTABLE_NFS);
+               nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
+                                       global_node_page_state(NR_UNSTABLE_NFS);
                gdtc->avail = global_dirtyable_memory();
-               gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+               gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
 
                domain_dirty_limits(gdtc);
 
@@ -1910,8 +1935,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
         * as we're trying to decide whether to put more under writeback.
         */
        gdtc->avail = global_dirtyable_memory();
-       gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
-                     global_page_state(NR_UNSTABLE_NFS);
+       gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
+                     global_node_page_state(NR_UNSTABLE_NFS);
        domain_dirty_limits(gdtc);
 
        if (gdtc->dirty > gdtc->bg_thresh)
@@ -1955,8 +1980,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
                  */
                 dirty_thresh += dirty_thresh / 10;      /* wheeee... */
 
-                if (global_page_state(NR_UNSTABLE_NFS) +
-                       global_page_state(NR_WRITEBACK) <= dirty_thresh)
+                if (global_node_page_state(NR_UNSTABLE_NFS) +
+                       global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
                                break;
                 congestion_wait(BLK_RW_ASYNC, HZ/10);
 
@@ -1984,8 +2009,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
 void laptop_mode_timer_fn(unsigned long data)
 {
        struct request_queue *q = (struct request_queue *)data;
-       int nr_pages = global_page_state(NR_FILE_DIRTY) +
-               global_page_state(NR_UNSTABLE_NFS);
+       int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
+               global_node_page_state(NR_UNSTABLE_NFS);
        struct bdi_writeback *wb;
 
        /*
@@ -2436,8 +2461,9 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
                wb = inode_to_wb(inode);
 
                mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
-               __inc_zone_page_state(page, NR_FILE_DIRTY);
-               __inc_zone_page_state(page, NR_DIRTIED);
+               __inc_node_page_state(page, NR_FILE_DIRTY);
+               __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+               __inc_node_page_state(page, NR_DIRTIED);
                __inc_wb_stat(wb, WB_RECLAIMABLE);
                __inc_wb_stat(wb, WB_DIRTIED);
                task_io_account_write(PAGE_SIZE);
@@ -2457,7 +2483,8 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 {
        if (mapping_cap_account_dirty(mapping)) {
                mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
-               dec_zone_page_state(page, NR_FILE_DIRTY);
+               dec_node_page_state(page, NR_FILE_DIRTY);
+               dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                dec_wb_stat(wb, WB_RECLAIMABLE);
                task_io_account_cancelled_write(PAGE_SIZE);
        }
@@ -2525,7 +2552,7 @@ void account_page_redirty(struct page *page)
 
                wb = unlocked_inode_to_wb_begin(inode, &locked);
                current->nr_dirtied--;
-               dec_zone_page_state(page, NR_DIRTIED);
+               dec_node_page_state(page, NR_DIRTIED);
                dec_wb_stat(wb, WB_DIRTIED);
                unlocked_inode_to_wb_end(inode, locked);
        }
@@ -2713,7 +2740,8 @@ int clear_page_dirty_for_io(struct page *page)
                wb = unlocked_inode_to_wb_begin(inode, &locked);
                if (TestClearPageDirty(page)) {
                        mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
-                       dec_zone_page_state(page, NR_FILE_DIRTY);
+                       dec_node_page_state(page, NR_FILE_DIRTY);
+                       dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                        dec_wb_stat(wb, WB_RECLAIMABLE);
                        ret = 1;
                }
@@ -2759,8 +2787,9 @@ int test_clear_page_writeback(struct page *page)
        }
        if (ret) {
                mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
-               dec_zone_page_state(page, NR_WRITEBACK);
-               inc_zone_page_state(page, NR_WRITTEN);
+               dec_node_page_state(page, NR_WRITEBACK);
+               dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+               inc_node_page_state(page, NR_WRITTEN);
        }
        unlock_page_memcg(page);
        return ret;
@@ -2813,7 +2842,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
        }
        if (!ret) {
                mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
-               inc_zone_page_state(page, NR_WRITEBACK);
+               inc_node_page_state(page, NR_WRITEBACK);
+               inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
        }
        unlock_page_memcg(page);
        return ret;
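
Note: the page-writeback.c hunks convert the dirty limits from per-zone to per-node. As a rough worked example of the arithmetic in node_dirty_limit() and node_dirty_ok(), the sketch below uses made-up numbers; only the formula (vm_dirty_ratio percent of the node's dirtyable memory, 25% extra headroom for PF_LESS_THROTTLE/realtime tasks, compared against the node's dirty + unstable NFS + writeback pages) follows the hunks.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned long node_memory = 4UL << 20;  /* dirtyable pages on the node (made up) */
        unsigned long vm_dirty_ratio = 20;      /* default sysctl value */
        bool less_throttle = false;             /* PF_LESS_THROTTLE || rt_task() */
        unsigned long limit, nr_pages;

        limit = vm_dirty_ratio * node_memory / 100;
        if (less_throttle)
                limit += limit / 4;

        /* NR_FILE_DIRTY + NR_UNSTABLE_NFS + NR_WRITEBACK for the node (made up) */
        nr_pages = 100000 + 0 + 50000;

        printf("limit=%lu dirty=%lu node_dirty_ok=%d\n",
               limit, nr_pages, nr_pages <= limit);
        return 0;
}
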
index 452513bf02ceca4d331a2bd1781d0e1f9ec47b79..ea759b9353603bcea22bf77fb5fef633877c4a22 100644 (file)
@@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
        return false;
 }
 
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
-       if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
-               return true;
-
-       return false;
-}
-
 /*
  * Returns false when the remaining initialisation should be deferred until
  * later in the boot cycle when it can be parallelised.
@@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn)
        return false;
 }
 
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
-       return false;
-}
-
 static inline bool update_defer_init(pg_data_t *pgdat,
                                unsigned long pfn, unsigned long zone_end,
                                unsigned long *nr_initialised)
@@ -1091,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 
        spin_lock(&zone->lock);
        isolated_pageblocks = has_isolate_pageblock(zone);
-       nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+       nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
        if (nr_scanned)
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+               __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
        while (count) {
                struct page *page;
@@ -1148,9 +1135,9 @@ static void free_one_page(struct zone *zone,
 {
        unsigned long nr_scanned;
        spin_lock(&zone->lock);
-       nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+       nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
        if (nr_scanned)
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+               __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
        if (unlikely(has_isolate_pageblock(zone) ||
                is_migrate_isolate(migratetype))) {
@@ -2517,7 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order)
        zone->free_area[order].nr_free--;
        rmv_page_order(page);
 
-       /* Set the pageblock if the isolated page is at least a pageblock */
+       /*
+        * Set the pageblock if the isolated page is at least half of a
+        * pageblock
+        */
        if (order >= pageblock_order - 1) {
                struct page *endpage = page + (1 << order) - 1;
                for (; page < endpage; page += pageblock_nr_pages) {
@@ -2597,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                        else
                                page = list_first_entry(list, struct page, lru);
 
-                       __dec_zone_state(zone, NR_ALLOC_BATCH);
                        list_del(&page->lru);
                        pcp->count--;
 
@@ -2623,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
-               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
                __mod_zone_freepage_state(zone, -(1 << order),
                                          get_pcppage_migratetype(page));
        }
 
-       if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
-           !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
-               set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-
-       __count_zone_vm_events(PGALLOC, zone, 1 << order);
+       __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
        local_irq_restore(flags);
 
@@ -2842,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
-       return local_zone->node == zone->node;
-}
-
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
                                RECLAIM_DISTANCE;
 }
 #else  /* CONFIG_NUMA */
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
-       return true;
-}
-
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
        return true;
 }
 #endif /* CONFIG_NUMA */
 
-static void reset_alloc_batches(struct zone *preferred_zone)
-{
-       struct zone *zone = preferred_zone->zone_pgdat->node_zones;
-
-       do {
-               mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                       high_wmark_pages(zone) - low_wmark_pages(zone) -
-                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-               clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-       } while (zone++ != preferred_zone);
-}
-
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -2886,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 {
        struct zoneref *z = ac->preferred_zoneref;
        struct zone *zone;
-       bool fair_skipped = false;
-       bool apply_fair = (alloc_flags & ALLOC_FAIR);
+       struct pglist_data *last_pgdat_dirty_limit = NULL;
 
-zonelist_scan:
        /*
         * Scan zonelist, looking for a zone with enough free.
         * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@@ -2903,51 +2863,34 @@ zonelist_scan:
                        (alloc_flags & ALLOC_CPUSET) &&
                        !__cpuset_zone_allowed(zone, gfp_mask))
                                continue;
-               /*
-                * Distribute pages in proportion to the individual
-                * zone size to ensure fair page aging.  The zone a
-                * page was allocated in should have no effect on the
-                * time the page has in memory before being reclaimed.
-                */
-               if (apply_fair) {
-                       if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
-                               fair_skipped = true;
-                               continue;
-                       }
-                       if (!zone_local(ac->preferred_zoneref->zone, zone)) {
-                               if (fair_skipped)
-                                       goto reset_fair;
-                               apply_fair = false;
-                       }
-               }
                /*
                 * When allocating a page cache page for writing, we
-                * want to get it from a zone that is within its dirty
-                * limit, such that no single zone holds more than its
+                * want to get it from a node that is within its dirty
+                * limit, such that no single node holds more than its
                 * proportional share of globally allowed dirty pages.
-                * The dirty limits take into account the zone's
+                * The dirty limits take into account the node's
                 * lowmem reserves and high watermark so that kswapd
                 * should be able to balance it without having to
                 * write pages from its LRU list.
                 *
-                * This may look like it could increase pressure on
-                * lower zones by failing allocations in higher zones
-                * before they are full.  But the pages that do spill
-                * over are limited as the lower zones are protected
-                * by this very same mechanism.  It should not become
-                * a practical burden to them.
-                *
                 * XXX: For now, allow allocations to potentially
-                * exceed the per-zone dirty limit in the slowpath
+                * exceed the per-node dirty limit in the slowpath
                 * (spread_dirty_pages unset) before going into reclaim,
                 * which is important when on a NUMA setup the allowed
-                * zones are together not big enough to reach the
+                * nodes are together not big enough to reach the
                 * global limit.  The proper fix for these situations
-                * will require awareness of zones in the
+                * will require awareness of nodes in the
                 * dirty-throttling and the flusher threads.
                 */
-               if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
-                       continue;
+               if (ac->spread_dirty_pages) {
+                       if (last_pgdat_dirty_limit == zone->zone_pgdat)
+                               continue;
+
+                       if (!node_dirty_ok(zone->zone_pgdat)) {
+                               last_pgdat_dirty_limit = zone->zone_pgdat;
+                               continue;
+                       }
+               }
 
                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
                if (!zone_watermark_fast(zone, order, mark,
@@ -2959,16 +2902,16 @@ zonelist_scan:
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
 
-                       if (zone_reclaim_mode == 0 ||
+                       if (node_reclaim_mode == 0 ||
                            !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                                continue;
 
-                       ret = zone_reclaim(zone, gfp_mask, order);
+                       ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
                        switch (ret) {
-                       case ZONE_RECLAIM_NOSCAN:
+                       case NODE_RECLAIM_NOSCAN:
                                /* did not scan */
                                continue;
-                       case ZONE_RECLAIM_FULL:
+                       case NODE_RECLAIM_FULL:
                                /* scanned but unreclaimable */
                                continue;
                        default:
@@ -2998,23 +2941,6 @@ try_this_zone:
                }
        }
 
-       /*
-        * The first pass makes sure allocations are spread fairly within the
-        * local node.  However, the local node might have free pages left
-        * after the fairness batches are exhausted, and remote zones haven't
-        * even been considered yet.  Try once more without fairness, and
-        * include remote zones now, before entering the slowpath and waking
-        * kswapd: prefer spilling to a remote zone over swapping locally.
-        */
-       if (fair_skipped) {
-reset_fair:
-               apply_fair = false;
-               fair_skipped = false;
-               reset_alloc_batches(ac->preferred_zoneref->zone);
-               z = ac->preferred_zoneref;
-               goto zonelist_scan;
-       }
-
        return NULL;
 }
 
@@ -3159,7 +3085,6 @@ out:
        return page;
 }
 
-
 /*
  * Maximum number of compaction retries with progress before the OOM
  * killer is considered the only way to move forward.
@@ -3171,17 +3096,16 @@ out:
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
-               enum migrate_mode mode, enum compact_result *compact_result)
+               enum compact_priority prio, enum compact_result *compact_result)
 {
        struct page *page;
-       int contended_compaction;
 
        if (!order)
                return NULL;
 
        current->flags |= PF_MEMALLOC;
        *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
-                                               mode, &contended_compaction);
+                                                                       prio);
        current->flags &= ~PF_MEMALLOC;
 
        if (*compact_result <= COMPACT_INACTIVE)
@@ -3193,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         */
        count_vm_event(COMPACTSTALL);
 
-       page = get_page_from_freelist(gfp_mask, order,
-                                       alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
 
        if (page) {
                struct zone *zone = page_zone(page);
@@ -3211,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         */
        count_vm_event(COMPACTFAIL);
 
-       /*
-        * In all zones where compaction was attempted (and not
-        * deferred or skipped), lock contention has been detected.
-        * For THP allocation we do not want to disrupt the others
-        * so we fallback to base pages instead.
-        */
-       if (contended_compaction == COMPACT_CONTENDED_LOCK)
-               *compact_result = COMPACT_CONTENDED;
-
-       /*
-        * If compaction was aborted due to need_resched(), we do not
-        * want to further increase allocation latency, unless it is
-        * khugepaged trying to collapse.
-        */
-       if (contended_compaction == COMPACT_CONTENDED_SCHED
-               && !(current->flags & PF_KTHREAD))
-               *compact_result = COMPACT_CONTENDED;
-
        cond_resched();
 
        return NULL;
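
Note: the direct-compaction hunks above replace the migrate_mode + contended_compaction plumbing with a single compact_priority that should_compact_retry() lowers toward MIN_COMPACT_PRIORITY on each failed attempt. A minimal model of that stepping is sketched below; the enumerator layout and values are assumptions, only the "start at INIT, decrement until MIN" behaviour mirrors the hunks.

#include <stdio.h>

/* Illustrative priorities; lower numeric value means more aggressive compaction. */
enum compact_priority {
        COMPACT_PRIO_SYNC_LIGHT,
        COMPACT_PRIO_ASYNC,
        MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
        INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
};

static int should_compact_retry(enum compact_priority *prio)
{
        /* on a failed attempt, escalate by lowering the numeric priority */
        if (*prio > MIN_COMPACT_PRIORITY) {
                (*prio)--;
                return 1;
        }
        return 0;
}

int main(void)
{
        enum compact_priority prio = INIT_COMPACT_PRIORITY;
        int attempt = 0;

        do {
                printf("attempt %d at priority %d\n", ++attempt, prio);
        } while (should_compact_retry(&prio));
        return 0;
}
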
@@ -3236,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 static inline bool
 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
-                    enum compact_result compact_result, enum migrate_mode *migrate_mode,
+                    enum compact_result compact_result,
+                    enum compact_priority *compact_priority,
                     int compaction_retries)
 {
        int max_retries = MAX_COMPACT_RETRIES;
@@ -3247,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
        /*
         * compaction considers all the zone as desperately out of memory
         * so it doesn't really make much sense to retry except when the
-        * failure could be caused by weak migration mode.
+        * failure could be caused by insufficient priority
         */
        if (compaction_failed(compact_result)) {
-               if (*migrate_mode == MIGRATE_ASYNC) {
-                       *migrate_mode = MIGRATE_SYNC_LIGHT;
+               if (*compact_priority > MIN_COMPACT_PRIORITY) {
+                       (*compact_priority)--;
                        return true;
                }
                return false;
@@ -3285,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
-               enum migrate_mode mode, enum compact_result *compact_result)
+               enum compact_priority prio, enum compact_result *compact_result)
 {
        *compact_result = COMPACT_SKIPPED;
        return NULL;
@@ -3294,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 static inline bool
 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
                     enum compact_result compact_result,
-                    enum migrate_mode *migrate_mode,
+                    enum compact_priority *compact_priority,
                     int compaction_retries)
 {
        struct zone *zone;
@@ -3362,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
                return NULL;
 
 retry:
-       page = get_page_from_freelist(gfp_mask, order,
-                                       alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
 
        /*
         * If an allocation failed after direct reclaim, it could be because
@@ -3384,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 {
        struct zoneref *z;
        struct zone *zone;
+       pg_data_t *last_pgdat = NULL;
 
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
-                                               ac->high_zoneidx, ac->nodemask)
-               wakeup_kswapd(zone, order, ac_classzone_idx(ac));
+                                       ac->high_zoneidx, ac->nodemask) {
+               if (last_pgdat != zone->zone_pgdat)
+                       wakeup_kswapd(zone, order, ac->high_zoneidx);
+               last_pgdat = zone->zone_pgdat;
+       }
 }
 
 static inline unsigned int
@@ -3421,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
        } else if (unlikely(rt_task(current)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
 
-       if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-               if (gfp_mask & __GFP_MEMALLOC)
-                       alloc_flags |= ALLOC_NO_WATERMARKS;
-               else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
-                       alloc_flags |= ALLOC_NO_WATERMARKS;
-               else if (!in_interrupt() &&
-                               ((current->flags & PF_MEMALLOC) ||
-                                unlikely(test_thread_flag(TIF_MEMDIE))))
-                       alloc_flags |= ALLOC_NO_WATERMARKS;
-       }
 #ifdef CONFIG_CMA
        if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
@@ -3440,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 
 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 {
-       return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
-}
+       if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+               return false;
 
-static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
-{
-       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+       if (gfp_mask & __GFP_MEMALLOC)
+               return true;
+       if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+               return true;
+       if (!in_interrupt() &&
+                       ((current->flags & PF_MEMALLOC) ||
+                        unlikely(test_thread_flag(TIF_MEMDIE))))
+               return true;
+
+       return false;
 }
 
 /*
@@ -3481,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                return false;
 
        /*
-        * Keep reclaiming pages while there is a chance this will lead somewhere.
-        * If none of the target zones can satisfy our allocation request even
-        * if all reclaimable pages are considered then we are screwed and have
-        * to go OOM.
+        * Keep reclaiming pages while there is a chance this will lead
+        * somewhere.  If none of the target zones can satisfy our allocation
+        * request even if all reclaimable pages are considered then we are
+        * screwed and have to go OOM.
         */
        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                        ac->nodemask) {
@@ -3509,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                         * prevent from pre mature OOM
                         */
                        if (!did_some_progress) {
-                               unsigned long writeback;
-                               unsigned long dirty;
+                               unsigned long write_pending;
 
-                               writeback = zone_page_state_snapshot(zone,
-                                                                    NR_WRITEBACK);
-                               dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+                               write_pending = zone_page_state_snapshot(zone,
+                                                       NR_ZONE_WRITE_PENDING);
 
-                               if (2*(writeback + dirty) > reclaimable) {
+                               if (2 * write_pending > reclaimable) {
                                        congestion_wait(BLK_RW_ASYNC, HZ/10);
                                        return true;
                                }
@@ -3551,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        struct page *page = NULL;
        unsigned int alloc_flags;
        unsigned long did_some_progress;
-       enum migrate_mode migration_mode = MIGRATE_ASYNC;
+       enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
        enum compact_result compact_result;
        int compaction_retries = 0;
        int no_progress_loops = 0;
@@ -3575,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                gfp_mask &= ~__GFP_ATOMIC;
 
-retry:
+       /*
+        * The fast path uses conservative alloc_flags to succeed only until
+        * kswapd needs to be woken up, and to avoid the cost of setting up
+        * alloc_flags precisely. So we do that now.
+        */
+       alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                wake_all_kswapds(order, ac);
 
        /*
-        * OK, we're below the kswapd watermark and have kicked background
-        * reclaim. Now things get more complex, so set up alloc_flags according
-        * to how we want to proceed.
+        * The adjusted alloc_flags might result in immediate success, so try
+        * that first
         */
-       alloc_flags = gfp_to_alloc_flags(gfp_mask);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+       if (page)
+               goto got_pg;
+
+       /*
+        * For costly allocations, try direct compaction first, as it's likely
+        * that we have enough base pages and don't need to reclaim. Don't try
+        * that for allocations that are allowed to ignore watermarks, as the
+        * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+        */
+       if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
+               !gfp_pfmemalloc_allowed(gfp_mask)) {
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                               alloc_flags, ac,
+                                               INIT_COMPACT_PRIORITY,
+                                               &compact_result);
+               if (page)
+                       goto got_pg;
+
+               /*
+                * Checks for costly allocations with __GFP_NORETRY, which
+                * includes THP page fault allocations
+                */
+               if (gfp_mask & __GFP_NORETRY) {
+                       /*
+                        * If compaction is deferred for high-order allocations,
+                        * it is because sync compaction recently failed. If
+                        * this is the case and the caller requested a THP
+                        * allocation, we do not want to heavily disrupt the
+                        * system, so we fail the allocation instead of entering
+                        * direct reclaim.
+                        */
+                       if (compact_result == COMPACT_DEFERRED)
+                               goto nopage;
+
+                       /*
+                        * Looks like reclaim/compaction is worth trying, but
+                        * sync compaction could be very expensive, so keep
+                        * using async compaction.
+                        */
+                       compact_priority = INIT_COMPACT_PRIORITY;
+               }
+       }
+
+retry:
+       /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
+       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+               wake_all_kswapds(order, ac);
+
+       if (gfp_pfmemalloc_allowed(gfp_mask))
+               alloc_flags = ALLOC_NO_WATERMARKS;
 
        /*
         * Reset the zonelist iterators if memory policies can be ignored.
         * These allocations are high priority and system rather than user
         * orientated.
         */
-       if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+       if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
                ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
                ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                                        ac->high_zoneidx, ac->nodemask);
        }
 
-       /* This is the last chance, in general, before the goto nopage. */
-       page = get_page_from_freelist(gfp_mask, order,
-                               alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+       /* Attempt with potentially adjusted zonelist and alloc_flags */
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
        if (page)
                goto got_pg;
 
-       /* Allocate without watermarks if the context allows */
-       if (alloc_flags & ALLOC_NO_WATERMARKS) {
-               page = get_page_from_freelist(gfp_mask, order,
-                                               ALLOC_NO_WATERMARKS, ac);
-               if (page)
-                       goto got_pg;
-       }
-
        /* Caller is not willing to reclaim, we can't balance anything */
        if (!can_direct_reclaim) {
                /*
@@ -3640,38 +3590,6 @@ retry:
        if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
                goto nopage;
 
-       /*
-        * Try direct compaction. The first pass is asynchronous. Subsequent
-        * attempts after direct reclaim are synchronous
-        */
-       page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
-                                       migration_mode,
-                                       &compact_result);
-       if (page)
-               goto got_pg;
-
-       /* Checks for THP-specific high-order allocations */
-       if (is_thp_gfp_mask(gfp_mask)) {
-               /*
-                * If compaction is deferred for high-order allocations, it is
-                * because sync compaction recently failed. If this is the case
-                * and the caller requested a THP allocation, we do not want
-                * to heavily disrupt the system, so we fail the allocation
-                * instead of entering direct reclaim.
-                */
-               if (compact_result == COMPACT_DEFERRED)
-                       goto nopage;
-
-               /*
-                * Compaction is contended so rather back off than cause
-                * excessive stalls.
-                */
-               if(compact_result == COMPACT_CONTENDED)
-                       goto nopage;
-       }
-
-       if (order && compaction_made_progress(compact_result))
-               compaction_retries++;
 
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3679,16 +3597,25 @@ retry:
        if (page)
                goto got_pg;
 
+       /* Try direct compaction and then allocating */
+       page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+                                       compact_priority, &compact_result);
+       if (page)
+               goto got_pg;
+
+       if (order && compaction_made_progress(compact_result))
+               compaction_retries++;
+
        /* Do not loop if specifically requested */
        if (gfp_mask & __GFP_NORETRY)
-               goto noretry;
+               goto nopage;
 
        /*
         * Do not retry costly high order allocations unless they are
         * __GFP_REPEAT
         */
        if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
-               goto noretry;
+               goto nopage;
 
        /*
         * Costly allocations might have made progress but this doesn't mean
@@ -3712,7 +3639,7 @@ retry:
         */
        if (did_some_progress > 0 &&
                        should_compact_retry(ac, order, alloc_flags,
-                               compact_result, &migration_mode,
+                               compact_result, &compact_priority,
                                compaction_retries))
                goto retry;
 
@@ -3727,25 +3654,6 @@ retry:
                goto retry;
        }
 
-noretry:
-       /*
-        * High-order allocations do not necessarily loop after direct reclaim
-        * and reclaim/compaction depends on compaction being called after
-        * reclaim so call directly if necessary.
-        * It can become very expensive to allocate transparent hugepages at
-        * fault, so use asynchronous memory compaction for THP unless it is
-        * khugepaged trying to collapse. All other requests should tolerate
-        * at least light sync migration.
-        */
-       if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
-               migration_mode = MIGRATE_ASYNC;
-       else
-               migration_mode = MIGRATE_SYNC_LIGHT;
-       page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
-                                           ac, migration_mode,
-                                           &compact_result);
-       if (page)
-               goto got_pg;
 nopage:
        warn_alloc_failed(gfp_mask, order, NULL);
 got_pg:
@@ -3761,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
        struct page *page;
        unsigned int cpuset_mems_cookie;
-       unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+       unsigned int alloc_flags = ALLOC_WMARK_LOW;
        gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
        struct alloc_context ac = {
                .high_zoneidx = gfp_zone(gfp_mask),
@@ -4192,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available);
 void si_meminfo(struct sysinfo *val)
 {
        val->totalram = totalram_pages;
-       val->sharedram = global_page_state(NR_SHMEM);
+       val->sharedram = global_node_page_state(NR_SHMEM);
        val->freeram = global_page_state(NR_FREE_PAGES);
        val->bufferram = nr_blockdev_pages();
        val->totalhigh = totalhigh_pages;
@@ -4214,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
                managed_pages += pgdat->node_zones[zone_type].managed_pages;
        val->totalram = managed_pages;
-       val->sharedram = node_page_state(nid, NR_SHMEM);
-       val->freeram = node_page_state(nid, NR_FREE_PAGES);
+       val->sharedram = node_page_state(pgdat, NR_SHMEM);
+       val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
                struct zone *zone = &pgdat->node_zones[zone_type];
@@ -4298,6 +4206,7 @@ void show_free_areas(unsigned int filter)
        unsigned long free_pcp = 0;
        int cpu;
        struct zone *zone;
+       pg_data_t *pgdat;
 
        for_each_populated_zone(zone) {
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4312,35 +4221,74 @@ void show_free_areas(unsigned int filter)
                " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
                " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-               " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n"
-#endif
                " free:%lu free_pcp:%lu free_cma:%lu\n",
-               global_page_state(NR_ACTIVE_ANON),
-               global_page_state(NR_INACTIVE_ANON),
-               global_page_state(NR_ISOLATED_ANON),
-               global_page_state(NR_ACTIVE_FILE),
-               global_page_state(NR_INACTIVE_FILE),
-               global_page_state(NR_ISOLATED_FILE),
-               global_page_state(NR_UNEVICTABLE),
-               global_page_state(NR_FILE_DIRTY),
-               global_page_state(NR_WRITEBACK),
-               global_page_state(NR_UNSTABLE_NFS),
+               global_node_page_state(NR_ACTIVE_ANON),
+               global_node_page_state(NR_INACTIVE_ANON),
+               global_node_page_state(NR_ISOLATED_ANON),
+               global_node_page_state(NR_ACTIVE_FILE),
+               global_node_page_state(NR_INACTIVE_FILE),
+               global_node_page_state(NR_ISOLATED_FILE),
+               global_node_page_state(NR_UNEVICTABLE),
+               global_node_page_state(NR_FILE_DIRTY),
+               global_node_page_state(NR_WRITEBACK),
+               global_node_page_state(NR_UNSTABLE_NFS),
                global_page_state(NR_SLAB_RECLAIMABLE),
                global_page_state(NR_SLAB_UNRECLAIMABLE),
-               global_page_state(NR_FILE_MAPPED),
-               global_page_state(NR_SHMEM),
+               global_node_page_state(NR_FILE_MAPPED),
+               global_node_page_state(NR_SHMEM),
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-               global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR,
-               global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR,
-               global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR,
-#endif
                global_page_state(NR_FREE_PAGES),
                free_pcp,
                global_page_state(NR_FREE_CMA_PAGES));
 
+       for_each_online_pgdat(pgdat) {
+               printk("Node %d"
+                       " active_anon:%lukB"
+                       " inactive_anon:%lukB"
+                       " active_file:%lukB"
+                       " inactive_file:%lukB"
+                       " unevictable:%lukB"
+                       " isolated(anon):%lukB"
+                       " isolated(file):%lukB"
+                       " mapped:%lukB"
+                       " dirty:%lukB"
+                       " writeback:%lukB"
+                       " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                       " shmem_thp: %lukB"
+                       " shmem_pmdmapped: %lukB"
+                       " anon_thp: %lukB"
+#endif
+                       " writeback_tmp:%lukB"
+                       " unstable:%lukB"
+                       " pages_scanned:%lu"
+                       " all_unreclaimable? %s"
+                       "\n",
+                       pgdat->node_id,
+                       K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+                       K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+                       K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+                       K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+                       K(node_page_state(pgdat, NR_UNEVICTABLE)),
+                       K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+                       K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+                       K(node_page_state(pgdat, NR_FILE_MAPPED)),
+                       K(node_page_state(pgdat, NR_FILE_DIRTY)),
+                       K(node_page_state(pgdat, NR_WRITEBACK)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                       K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+                       K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
+                                       * HPAGE_PMD_NR),
+                       K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+#endif
+                       K(node_page_state(pgdat, NR_SHMEM)),
+                       K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+                       K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+                       node_page_state(pgdat, NR_PAGES_SCANNED),
+                       !pgdat_reclaimable(pgdat) ? "yes" : "no");
+       }
+
        for_each_populated_zone(zone) {
                int i;
 
@@ -4362,72 +4310,41 @@ void show_free_areas(unsigned int filter)
                        " active_file:%lukB"
                        " inactive_file:%lukB"
                        " unevictable:%lukB"
-                       " isolated(anon):%lukB"
-                       " isolated(file):%lukB"
+                       " writepending:%lukB"
                        " present:%lukB"
                        " managed:%lukB"
                        " mlocked:%lukB"
-                       " dirty:%lukB"
-                       " writeback:%lukB"
-                       " mapped:%lukB"
-                       " shmem:%lukB"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-                       " shmem_thp: %lukB"
-                       " shmem_pmdmapped: %lukB"
-                       " anon_thp: %lukB"
-#endif
                        " slab_reclaimable:%lukB"
                        " slab_unreclaimable:%lukB"
                        " kernel_stack:%lukB"
                        " pagetables:%lukB"
-                       " unstable:%lukB"
                        " bounce:%lukB"
                        " free_pcp:%lukB"
                        " local_pcp:%ukB"
                        " free_cma:%lukB"
-                       " writeback_tmp:%lukB"
-                       " pages_scanned:%lu"
-                       " all_unreclaimable? %s"
                        "\n",
                        zone->name,
                        K(zone_page_state(zone, NR_FREE_PAGES)),
                        K(min_wmark_pages(zone)),
                        K(low_wmark_pages(zone)),
                        K(high_wmark_pages(zone)),
-                       K(zone_page_state(zone, NR_ACTIVE_ANON)),
-                       K(zone_page_state(zone, NR_INACTIVE_ANON)),
-                       K(zone_page_state(zone, NR_ACTIVE_FILE)),
-                       K(zone_page_state(zone, NR_INACTIVE_FILE)),
-                       K(zone_page_state(zone, NR_UNEVICTABLE)),
-                       K(zone_page_state(zone, NR_ISOLATED_ANON)),
-                       K(zone_page_state(zone, NR_ISOLATED_FILE)),
+                       K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+                       K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+                       K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+                       K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+                       K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+                       K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
                        K(zone->present_pages),
                        K(zone->managed_pages),
                        K(zone_page_state(zone, NR_MLOCK)),
-                       K(zone_page_state(zone, NR_FILE_DIRTY)),
-                       K(zone_page_state(zone, NR_WRITEBACK)),
-                       K(zone_page_state(zone, NR_FILE_MAPPED)),
-                       K(zone_page_state(zone, NR_SHMEM)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-                       K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR),
-                       K(zone_page_state(zone, NR_SHMEM_PMDMAPPED)
-                                       * HPAGE_PMD_NR),
-                       K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR),
-#endif
                        K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
                        K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
-                       zone_page_state(zone, NR_KERNEL_STACK) *
-                               THREAD_SIZE / 1024,
+                       zone_page_state(zone, NR_KERNEL_STACK_KB),
                        K(zone_page_state(zone, NR_PAGETABLE)),
-                       K(zone_page_state(zone, NR_UNSTABLE_NFS)),
                        K(zone_page_state(zone, NR_BOUNCE)),
                        K(free_pcp),
                        K(this_cpu_read(zone->pageset->pcp.count)),
-                       K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
-                       K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-                       K(zone_page_state(zone, NR_PAGES_SCANNED)),
-                       (!zone_reclaimable(zone) ? "yes" : "no")
-                       );
+                       K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
                printk("lowmem_reserve[]:");
                for (i = 0; i < MAX_NR_ZONES; i++)
                        printk(" %ld", zone->lowmem_reserve[i]);
@@ -4469,7 +4386,7 @@ void show_free_areas(unsigned int filter)
 
        hugetlb_show_meminfo();
 
-       printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+       printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
 
        show_swap_cache_info();
 }
@@ -5340,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone)
        zone->pageset = alloc_percpu(struct per_cpu_pageset);
        for_each_possible_cpu(cpu)
                zone_pageset_init(zone, cpu);
+
+       if (!zone->zone_pgdat->per_cpu_nodestats) {
+               zone->zone_pgdat->per_cpu_nodestats =
+                       alloc_percpu(struct per_cpu_nodestat);
+       }
 }
 
 /*
@@ -5909,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
        init_waitqueue_head(&pgdat->kcompactd_wait);
 #endif
        pgdat_page_ext_init(pgdat);
+       spin_lock_init(&pgdat->lru_lock);
+       lruvec_init(node_lruvec(pgdat));
 
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
@@ -5958,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
                zone->node = nid;
-               zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+               pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
                                                / 100;
-               zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+               pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
 #endif
                zone->name = zone_names[j];
+               zone->zone_pgdat = pgdat;
                spin_lock_init(&zone->lock);
-               spin_lock_init(&zone->lru_lock);
                zone_seqlock_init(zone);
-               zone->zone_pgdat = pgdat;
                zone_pcp_init(zone);
 
-               /* For bootup, initialized properly in watermark setup */
-               mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
-
-               lruvec_init(&zone->lruvec);
                if (!size)
                        continue;
 
@@ -6038,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        unsigned long end_pfn = 0;
 
        /* pg_data_t should be reset to zero when it's allocated */
-       WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+       WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
 
        reset_deferred_meminit(pgdat);
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
+       pgdat->per_cpu_nodestats = NULL;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
        pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
@@ -6699,6 +6619,9 @@ static void calculate_totalreserve_pages(void)
        enum zone_type i, j;
 
        for_each_online_pgdat(pgdat) {
+
+               pgdat->totalreserve_pages = 0;
+
                for (i = 0; i < MAX_NR_ZONES; i++) {
                        struct zone *zone = pgdat->node_zones + i;
                        long max = 0;
@@ -6715,7 +6638,7 @@ static void calculate_totalreserve_pages(void)
                        if (max > zone->managed_pages)
                                max = zone->managed_pages;
 
-                       zone->totalreserve_pages = max;
+                       pgdat->totalreserve_pages += max;
 
                        reserve_pages += max;
                }
@@ -6816,10 +6739,6 @@ static void __setup_per_zone_wmarks(void)
                zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
                zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
-               __mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                       high_wmark_pages(zone) - low_wmark_pages(zone) -
-                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-
                spin_unlock_irqrestore(&zone->lock, flags);
        }
 
@@ -6930,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int rc;
 
@@ -6937,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
        if (rc)
                return rc;
 
+       for_each_online_pgdat(pgdat)
+               pgdat->min_unmapped_pages = 0;
+
        for_each_zone(zone)
-               zone->min_unmapped_pages = (zone->managed_pages *
+               zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
                                sysctl_min_unmapped_ratio) / 100;
        return 0;
 }
@@ -6946,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int rc;
 
@@ -6953,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
        if (rc)
                return rc;
 
+       for_each_online_pgdat(pgdat)
+               pgdat->min_slab_pages = 0;
+
        for_each_zone(zone)
-               zone->min_slab_pages = (zone->managed_pages *
+               zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
                                sysctl_min_slab_ratio) / 100;
        return 0;
 }
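
The net effect of the page_alloc.c hunks above: the slowpath makes one up-front asynchronous compaction attempt for costly allocations, and the retry loop afterwards always tries direct reclaim first and direct compaction second, with a single compact_priority replacing the old migration_mode plus the separate noretry compaction pass; should_compact_retry() may escalate that priority between iterations, and __GFP_NORETRY (or a costly order without __GFP_REPEAT) now drops straight to nopage. A minimal control-flow sketch of the loop, with hypothetical helpers try_reclaim(), try_compact() and may_retry() standing in for the real __alloc_pages_direct_*() and should_compact_retry() machinery:

	/* Illustrative sketch only -- not the kernel code. */
	static struct page *slowpath_sketch(gfp_t gfp_mask, unsigned int order)
	{
		int prio = INIT_COMPACT_PRIORITY;	/* starts asynchronous */
		struct page *page;

		for (;;) {
			page = try_reclaim(gfp_mask, order);		/* hypothetical */
			if (page)
				return page;
			page = try_compact(gfp_mask, order, prio);	/* hypothetical */
			if (page)
				return page;
			if (gfp_mask & __GFP_NORETRY)
				return NULL;			/* "goto nopage" above */
			if (!may_retry(gfp_mask, order, &prio))	/* may raise prio */
				return NULL;
		}
	}
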
index 4ea9c4ef5146b8b784848a6710b70e5fc0dfcd41..ae11aa914e5569afbd5ee67f5349b81912121d02 100644 (file)
@@ -41,12 +41,12 @@ static struct page *page_idle_get_page(unsigned long pfn)
                return NULL;
 
        zone = page_zone(page);
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
        if (unlikely(!PageLRU(page))) {
                put_page(page);
                page = NULL;
        }
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(zone_lru_lock(zone));
        return page;
 }
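
The spin_lock_irq(zone_lru_lock(zone)) calls above lean on a small accessor added elsewhere in this series; presumably it does nothing more than redirect the old per-zone lock to the owning node, roughly:

	/* assumed shape of the accessor used above (sketch, not the header) */
	static inline spinlock_t *zone_lru_lock(struct zone *zone)
	{
		return &zone->zone_pgdat->lru_lock;
	}
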
 
index dcc5d3769608088a8c100f04f4e01dcccf5b74da..fb1fa269d3a06416151a2b3ef9e57a1e99ae4db5 100644 (file)
@@ -166,6 +166,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
                unsigned block_in_page;
                sector_t first_block;
 
+               cond_resched();
+
                first_block = bmap(inode, probe_block);
                if (first_block == 0)
                        goto bad_bmap;
index 8a13d9f7b5666b153d64788576f46029bdc1d99f..709bc83703b1bfef419fa674ef5b5e28f5d70f05 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -27,7 +27,7 @@
  *         mapping->i_mmap_rwsem
  *           anon_vma->rwsem
  *             mm->page_table_lock or pte_lock
- *               zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)
  *               swap_lock (in swap_duplicate, swap_info_get)
  *                 mmlist_lock (in mmput, drain_mmlist and others)
  *                 mapping->private_lock (in __set_page_dirty_buffers)
@@ -1213,8 +1213,8 @@ void do_page_add_anon_rmap(struct page *page,
                 * disabled.
                 */
                if (compound)
-                       __inc_zone_page_state(page, NR_ANON_THPS);
-               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
+                       __inc_node_page_state(page, NR_ANON_THPS);
+               __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
        }
        if (unlikely(PageKsm(page)))
                return;
@@ -1251,14 +1251,14 @@ void page_add_new_anon_rmap(struct page *page,
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                /* increment count (starts at -1) */
                atomic_set(compound_mapcount_ptr(page), 0);
-               __inc_zone_page_state(page, NR_ANON_THPS);
+               __inc_node_page_state(page, NR_ANON_THPS);
        } else {
                /* Anon THP always mapped first with PMD */
                VM_BUG_ON_PAGE(PageTransCompound(page), page);
                /* increment count (starts at -1) */
                atomic_set(&page->_mapcount, 0);
        }
-       __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
+       __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
        __page_set_anon_rmap(page, vma, address, 1);
 }
 
@@ -1282,7 +1282,7 @@ void page_add_file_rmap(struct page *page, bool compound)
                if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
                        goto out;
                VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
-               __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED);
+               __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
        } else {
                if (PageTransCompound(page)) {
                        VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1293,7 +1293,7 @@ void page_add_file_rmap(struct page *page, bool compound)
                if (!atomic_inc_and_test(&page->_mapcount))
                        goto out;
        }
-       __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr);
+       __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
        mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
 out:
        unlock_page_memcg(page);
@@ -1322,18 +1322,18 @@ static void page_remove_file_rmap(struct page *page, bool compound)
                if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
                        goto out;
                VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
-               __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED);
+               __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
        } else {
                if (!atomic_add_negative(-1, &page->_mapcount))
                        goto out;
        }
 
        /*
-        * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+        * We use the irq-unsafe __{inc|mod}_node_page_state because
         * these counters are not modified in interrupt context, and
         * pte lock(a spinlock) is held, which implies preemption disabled.
         */
-       __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr);
+       __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
        mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
 
        if (unlikely(PageMlocked(page)))
@@ -1356,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return;
 
-       __dec_zone_page_state(page, NR_ANON_THPS);
+       __dec_node_page_state(page, NR_ANON_THPS);
 
        if (TestClearPageDoubleMap(page)) {
                /*
@@ -1375,7 +1375,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
                clear_page_mlock(page);
 
        if (nr) {
-               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+               __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
                deferred_split_huge_page(page);
        }
 }
@@ -1404,7 +1404,7 @@ void page_remove_rmap(struct page *page, bool compound)
         * these counters are not modified in interrupt context, and
         * pte lock(a spinlock) is held, which implies preemption disabled.
         */
-       __dec_zone_page_state(page, NR_ANON_PAGES);
+       __dec_node_page_state(page, NR_ANON_MAPPED);
 
        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);
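
The rmap.c conversion is mechanical: each per-zone mapped/anon counter update becomes a per-node one, keyed by page_pgdat(page) rather than page_zone(page). Presumably the per-page wrapper reduces to the per-pgdat form along these lines (sketch only; the real helpers live in the vmstat code):

	static inline void inc_node_page_state_sketch(struct page *page,
						      enum node_stat_item item)
	{
		__mod_node_page_state(page_pgdat(page), item, 1);
	}
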
index 62e42c7d544c148cd7f1fc10839e6e7a2205884e..2ac19a61d5655b82d6aa6b01a37eb6d05ea3a72a 100644 (file)
@@ -575,9 +575,9 @@ static int shmem_add_to_page_cache(struct page *page,
        if (!error) {
                mapping->nrpages += nr;
                if (PageTransHuge(page))
-                       __inc_zone_page_state(page, NR_SHMEM_THPS);
-               __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr);
-               __mod_zone_page_state(page_zone(page), NR_SHMEM, nr);
+                       __inc_node_page_state(page, NR_SHMEM_THPS);
+               __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+               __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
                spin_unlock_irq(&mapping->tree_lock);
        } else {
                page->mapping = NULL;
@@ -601,8 +601,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
        error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
        page->mapping = NULL;
        mapping->nrpages--;
-       __dec_zone_page_state(page, NR_FILE_PAGES);
-       __dec_zone_page_state(page, NR_SHMEM);
+       __dec_node_page_state(page, NR_FILE_PAGES);
+       __dec_node_page_state(page, NR_SHMEM);
        spin_unlock_irq(&mapping->tree_lock);
        put_page(page);
        BUG_ON(error);
@@ -1493,8 +1493,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
        error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
                                                                   newpage);
        if (!error) {
-               __inc_zone_page_state(newpage, NR_FILE_PAGES);
-               __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+               __inc_node_page_state(newpage, NR_FILE_PAGES);
+               __dec_node_page_state(oldpage, NR_FILE_PAGES);
        }
        spin_unlock_irq(&swap_mapping->tree_lock);
 
index f33980ab0406980edc03bb4a54f6f08c5e9aaa4b..9653f2e2591ad0982d2dc74c668323a43e5b026d 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -369,6 +369,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
                return s->object_size;
 # endif
+       if (s->flags & SLAB_KASAN)
+               return s->object_size;
        /*
         * If we have the need to store the freelist pointer
         * back there or track user information then we can
index f9da8716b8b358fb6a0d7184a5c2ccb95d7369a5..74e7c8c30db8e15ef3455b7bbdd0b2ab9412d452 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
 #endif
 }
 
-static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+inline void *fixup_red_left(struct kmem_cache *s, void *p)
 {
        if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
                p += s->red_left_pad;
@@ -454,8 +454,6 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p)
  */
 #if defined(CONFIG_SLUB_DEBUG_ON)
 static int slub_debug = DEBUG_DEFAULT_FLAGS;
-#elif defined(CONFIG_KASAN)
-static int slub_debug = SLAB_STORE_USER;
 #else
 static int slub_debug;
 #endif
@@ -660,6 +658,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
        if (s->flags & SLAB_STORE_USER)
                off += 2 * sizeof(struct track);
 
+       off += kasan_metadata_size(s);
+
        if (off != size_from_object(s))
                /* Beginning of the filler is the free pointer */
                print_section("Padding ", p + off, size_from_object(s) - off);
@@ -787,6 +787,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
                /* We also have user information there */
                off += 2 * sizeof(struct track);
 
+       off += kasan_metadata_size(s);
+
        if (size_from_object(s) == off)
                return 1;
 
@@ -1322,8 +1324,10 @@ static inline void kfree_hook(const void *x)
        kasan_kfree_large(x);
 }
 
-static inline void slab_free_hook(struct kmem_cache *s, void *x)
+static inline void *slab_free_hook(struct kmem_cache *s, void *x)
 {
+       void *freeptr;
+
        kmemleak_free_recursive(x, s->flags);
 
        /*
@@ -1344,7 +1348,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
        if (!(s->flags & SLAB_DEBUG_OBJECTS))
                debug_check_no_obj_freed(x, s->object_size);
 
+       freeptr = get_freepointer(s, x);
+       /*
+        * kasan_slab_free() may put x into memory quarantine, delaying its
+        * reuse. In this case the object's freelist pointer is changed.
+        */
        kasan_slab_free(s, x);
+       return freeptr;
 }
 
 static inline void slab_free_freelist_hook(struct kmem_cache *s,
@@ -1362,11 +1372,11 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
 
        void *object = head;
        void *tail_obj = tail ? : head;
+       void *freeptr;
 
        do {
-               slab_free_hook(s, object);
-       } while ((object != tail_obj) &&
-                (object = get_freepointer(s, object)));
+               freeptr = slab_free_hook(s, object);
+       } while ((object != tail_obj) && (object = freeptr));
 #endif
 }
 
@@ -2878,16 +2888,13 @@ slab_empty:
  * same page) possible by specifying head and tail ptr, plus objects
  * count (cnt). Bulk free indicated by tail pointer being set.
  */
-static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
-                                     void *head, void *tail, int cnt,
-                                     unsigned long addr)
+static __always_inline void do_slab_free(struct kmem_cache *s,
+                               struct page *page, void *head, void *tail,
+                               int cnt, unsigned long addr)
 {
        void *tail_obj = tail ? : head;
        struct kmem_cache_cpu *c;
        unsigned long tid;
-
-       slab_free_freelist_hook(s, head, tail);
-
 redo:
        /*
         * Determine the current cpu's per cpu slab.
@@ -2921,6 +2928,27 @@ redo:
 
 }
 
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+                                     void *head, void *tail, int cnt,
+                                     unsigned long addr)
+{
+       slab_free_freelist_hook(s, head, tail);
+       /*
+        * slab_free_freelist_hook() could have put the items into quarantine.
+        * If so, no need to free them.
+        */
+       if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU))
+               return;
+       do_slab_free(s, page, head, tail, cnt, addr);
+}
+
+#ifdef CONFIG_KASAN
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
+{
+       do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
+}
+#endif
+
 void kmem_cache_free(struct kmem_cache *s, void *x)
 {
        s = cache_from_obj(s, x);
@@ -3363,7 +3391,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
 static int calculate_sizes(struct kmem_cache *s, int forced_order)
 {
        unsigned long flags = s->flags;
-       unsigned long size = s->object_size;
+       size_t size = s->object_size;
        int order;
 
        /*
@@ -3422,7 +3450,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
                 * the object.
                 */
                size += 2 * sizeof(struct track);
+#endif
 
+       kasan_cache_create(s, &size, &s->flags);
+#ifdef CONFIG_SLUB_DEBUG
        if (flags & SLAB_RED_ZONE) {
                /*
                 * Add some empty padding so that we can catch
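
The slub.c hunks wire SLUB into the KASAN quarantine: slab_free_hook() now returns the next freelist pointer because kasan_slab_free() may capture the object instead of letting it go back to the allocator, slab_free() skips do_slab_free() for SLAB_KASAN caches so quarantined objects never touch the freelist early, and ___cache_free() is the entry point the quarantine presumably uses when it finally drains. A sketch of that drain step, with a hypothetical array of parked objects:

	/* Illustrative sketch: returning quarantined objects to SLUB. */
	static void quarantine_drain_sketch(struct kmem_cache *cache,
					    void **parked, int nr)
	{
		int i;

		for (i = 0; i < nr; i++)
			___cache_free(cache, parked[i], _RET_IP_);
	}
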
index 5d0cf45403646e54b0e92673f8855e693ad1cf84..36d7bbb80e4932ceed6887f27f9e5d15d0311b39 100644 (file)
@@ -100,11 +100,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 }
 #endif
 
-/*
- * Although written for the SPARSEMEM_EXTREME case, this happens
- * to also work for the flat array case because
- * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
- */
+#ifdef CONFIG_SPARSEMEM_EXTREME
 int __section_nr(struct mem_section* ms)
 {
        unsigned long root_nr;
@@ -123,6 +119,12 @@ int __section_nr(struct mem_section* ms)
 
        return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
+#else
+int __section_nr(struct mem_section* ms)
+{
+       return (int)(ms - mem_section[0]);
+}
+#endif
 
 /*
  * During early boot, before section_mem_map is used for an actual
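
In the !SPARSEMEM_EXTREME branch the section number now falls out of plain pointer arithmetic: mem_section is a single statically sized, contiguous table, so subtracting its base from a section pointer is the index, with no search over roots. A self-contained toy model of that arithmetic (the 4x8 table is an assumption of the model, not a kernel constant):

	#include <stdio.h>

	struct section_model { unsigned long section_mem_map; };

	/* rows of a static 2-D array sit back to back, so a pointer
	 * difference from the first row recovers the flat index */
	static struct section_model table[4][8];

	int main(void)
	{
		struct section_model *ms = &table[2][5];

		printf("section nr = %td\n", ms - table[0]);	/* prints 21 */
		return 0;
	}
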
index 616df4ddd8708ee7dc4a16fcd6f4814256a86d82..75c63bb2a1da1dc0c3e55600e9db1618949df87a 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -62,12 +62,12 @@ static void __page_cache_release(struct page *page)
                struct lruvec *lruvec;
                unsigned long flags;
 
-               spin_lock_irqsave(&zone->lru_lock, flags);
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               spin_lock_irqsave(zone_lru_lock(zone), flags);
+               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
                VM_BUG_ON_PAGE(!PageLRU(page), page);
                __ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
-               spin_unlock_irqrestore(&zone->lru_lock, flags);
+               spin_unlock_irqrestore(zone_lru_lock(zone), flags);
        }
        mem_cgroup_uncharge(page);
 }
@@ -179,26 +179,26 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
        void *arg)
 {
        int i;
-       struct zone *zone = NULL;
+       struct pglist_data *pgdat = NULL;
        struct lruvec *lruvec;
        unsigned long flags = 0;
 
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
-               struct zone *pagezone = page_zone(page);
+               struct pglist_data *pagepgdat = page_pgdat(page);
 
-               if (pagezone != zone) {
-                       if (zone)
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
-                       zone = pagezone;
-                       spin_lock_irqsave(&zone->lru_lock, flags);
+               if (pagepgdat != pgdat) {
+                       if (pgdat)
+                               spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+                       pgdat = pagepgdat;
+                       spin_lock_irqsave(&pgdat->lru_lock, flags);
                }
 
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
                (*move_fn)(page, lruvec, arg);
        }
-       if (zone)
-               spin_unlock_irqrestore(&zone->lru_lock, flags);
+       if (pgdat)
+               spin_unlock_irqrestore(&pgdat->lru_lock, flags);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
 }
@@ -318,9 +318,9 @@ void activate_page(struct page *page)
        struct zone *zone = page_zone(page);
 
        page = compound_head(page);
-       spin_lock_irq(&zone->lru_lock);
-       __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
-       spin_unlock_irq(&zone->lru_lock);
+       spin_lock_irq(zone_lru_lock(zone));
+       __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
+       spin_unlock_irq(zone_lru_lock(zone));
 }
 #endif
 
@@ -445,16 +445,16 @@ void lru_cache_add(struct page *page)
  */
 void add_page_to_unevictable_list(struct page *page)
 {
-       struct zone *zone = page_zone(page);
+       struct pglist_data *pgdat = page_pgdat(page);
        struct lruvec *lruvec;
 
-       spin_lock_irq(&zone->lru_lock);
-       lruvec = mem_cgroup_page_lruvec(page, zone);
+       spin_lock_irq(&pgdat->lru_lock);
+       lruvec = mem_cgroup_page_lruvec(page, pgdat);
        ClearPageActive(page);
        SetPageUnevictable(page);
        SetPageLRU(page);
        add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
 }
 
 /**
@@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold)
 {
        int i;
        LIST_HEAD(pages_to_free);
-       struct zone *zone = NULL;
+       struct pglist_data *locked_pgdat = NULL;
        struct lruvec *lruvec;
        unsigned long uninitialized_var(flags);
        unsigned int uninitialized_var(lock_batch);
@@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold)
                /*
                 * Make sure the IRQ-safe lock-holding time does not get
                 * excessive with a continuous string of pages from the
-                * same zone. The lock is held only if zone != NULL.
+                * same pgdat. The lock is held only if locked_pgdat != NULL.
                 */
-               if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
-                       spin_unlock_irqrestore(&zone->lru_lock, flags);
-                       zone = NULL;
+               if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
+                       spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+                       locked_pgdat = NULL;
                }
 
                if (is_huge_zero_page(page)) {
@@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold)
                        continue;
 
                if (PageCompound(page)) {
-                       if (zone) {
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
-                               zone = NULL;
+                       if (locked_pgdat) {
+                               spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+                               locked_pgdat = NULL;
                        }
                        __put_compound_page(page);
                        continue;
                }
 
                if (PageLRU(page)) {
-                       struct zone *pagezone = page_zone(page);
+                       struct pglist_data *pgdat = page_pgdat(page);
 
-                       if (pagezone != zone) {
-                               if (zone)
-                                       spin_unlock_irqrestore(&zone->lru_lock,
+                       if (pgdat != locked_pgdat) {
+                               if (locked_pgdat)
+                                       spin_unlock_irqrestore(&locked_pgdat->lru_lock,
                                                                        flags);
                                lock_batch = 0;
-                               zone = pagezone;
-                               spin_lock_irqsave(&zone->lru_lock, flags);
+                               locked_pgdat = pgdat;
+                               spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
                        }
 
-                       lruvec = mem_cgroup_page_lruvec(page, zone);
+                       lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
                        VM_BUG_ON_PAGE(!PageLRU(page), page);
                        __ClearPageLRU(page);
                        del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold)
 
                list_add(&page->lru, &pages_to_free);
        }
-       if (zone)
-               spin_unlock_irqrestore(&zone->lru_lock, flags);
+       if (locked_pgdat)
+               spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
 
        mem_cgroup_uncharge_list(&pages_to_free);
        free_hot_cold_page_list(&pages_to_free, cold);
@@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
        VM_BUG_ON_PAGE(PageCompound(page_tail), page);
        VM_BUG_ON_PAGE(PageLRU(page_tail), page);
        VM_BUG_ON(NR_CPUS != 1 &&
-                 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
+                 !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
 
        if (!list)
                SetPageLRU(page_tail);
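
pagevec_lru_move_fn() and release_pages() above keep the old batching idea, just keyed on the node: take the lru_lock when the first page of a node shows up, keep holding it while consecutive pages stay on that node, and drop it on a node change or (in release_pages()) after SWAP_CLUSTER_MAX pages to bound IRQ-off time. Stripped of the LRU bookkeeping, the pattern is roughly:

	/* sketch of the per-node lock batching pattern (LRU work elided) */
	static void walk_pages_sketch(struct page **pages, int nr)
	{
		struct pglist_data *locked_pgdat = NULL;
		unsigned long flags = 0;
		int i;

		for (i = 0; i < nr; i++) {
			struct pglist_data *pgdat = page_pgdat(pages[i]);

			if (pgdat != locked_pgdat) {
				if (locked_pgdat)
					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
							       flags);
				locked_pgdat = pgdat;
				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
			}
			/* ... operate on pages[i] under the node's lru_lock ... */
		}
		if (locked_pgdat)
			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
	}
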
index c99463ac02fb56d270fdc4ea7ab749e1f58fe666..c8310a37be3abbcf267472f30bd0e3677c1336d4 100644 (file)
@@ -95,7 +95,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
                                        entry.val, page);
        if (likely(!error)) {
                address_space->nrpages++;
-               __inc_zone_page_state(page, NR_FILE_PAGES);
+               __inc_node_page_state(page, NR_FILE_PAGES);
                INC_CACHE_INFO(add_total);
        }
        spin_unlock_irq(&address_space->tree_lock);
@@ -147,7 +147,7 @@ void __delete_from_swap_cache(struct page *page)
        set_page_private(page, 0);
        ClearPageSwapCache(page);
        address_space->nrpages--;
-       __dec_zone_page_state(page, NR_FILE_PAGES);
+       __dec_node_page_state(page, NR_FILE_PAGES);
        INC_CACHE_INFO(del_total);
 }
 
index 8d010ef2ce1c8e094919b85e95130fe82474bd5b..662cddf914af2048ab6c9c674f59d37a7aceba69 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -528,7 +528,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                free = global_page_state(NR_FREE_PAGES);
-               free += global_page_state(NR_FILE_PAGES);
+               free += global_node_page_state(NR_FILE_PAGES);
 
                /*
                 * shmem pages shouldn't be counted as free in this
@@ -536,7 +536,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                 * that won't affect the overall amount of available
                 * memory in the system.
                 */
-               free -= global_page_state(NR_SHMEM);
+               free -= global_node_page_state(NR_SHMEM);
 
                free += get_nr_swap_pages();
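
With this hunk the OVERCOMMIT_GUESS estimate keeps its shape and only sources the file-cache and shmem terms from node counters: available is roughly free pages + file pages - shmem pages + free swap, before the further reserves subtracted later in the function. A toy worked example with made-up page counts:

	#include <stdio.h>

	int main(void)
	{
		/* made-up counts, in pages */
		long nr_free = 10000, nr_file = 50000, nr_shmem = 8000, nr_swap = 20000;
		long estimate = nr_free + nr_file - nr_shmem + nr_swap;

		printf("overcommit guess: %ld pages\n", estimate);	/* 72000 */
		return 0;
	}
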
 
index 21d417ccff69d4efb8154d78c976b97e353a3609..650d26832569a9c41399f3a5da49c6eb56015d46 100644 (file)
@@ -84,6 +84,9 @@ struct scan_control {
        /* Scan (total_size >> priority) pages at once */
        int priority;
 
+       /* The highest zone to isolate pages for reclaim from */
+       enum zone_type reclaim_idx;
+
        unsigned int may_writepage:1;
 
        /* Can mapped pages be reclaimed? */
@@ -191,26 +194,44 @@ static bool sane_reclaim(struct scan_control *sc)
 }
 #endif
 
+/*
+ * This misses isolated pages, which are not accounted for to avoid extra counters.
+ * As the data only determines if reclaim or compaction continues, it is
+ * not expected that isolated pages will be a dominating factor.
+ */
 unsigned long zone_reclaimable_pages(struct zone *zone)
 {
        unsigned long nr;
 
-       nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
-            zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
-            zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
+       nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+               zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
+       if (get_nr_swap_pages() > 0)
+               nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+                       zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+
+       return nr;
+}
+
+unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
+{
+       unsigned long nr;
+
+       nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
+            node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
+            node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
 
        if (get_nr_swap_pages() > 0)
-               nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
-                     zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
-                     zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+               nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
+                     node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
+                     node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
 
        return nr;
 }
 
-bool zone_reclaimable(struct zone *zone)
+bool pgdat_reclaimable(struct pglist_data *pgdat)
 {
-       return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
-               zone_reclaimable_pages(zone) * 6;
+       return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
+               pgdat_reclaimable_pages(pgdat) * 6;
 }
 
 unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -218,7 +239,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
        if (!mem_cgroup_disabled())
                return mem_cgroup_get_lru_size(lruvec, lru);
 
-       return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
+       return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 }
 
 /*
@@ -593,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                        ClearPageReclaim(page);
                }
                trace_mm_vmscan_writepage(page);
-               inc_zone_page_state(page, NR_VMSCAN_WRITE);
+               inc_node_page_state(page, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }
 
@@ -877,7 +898,7 @@ static void page_check_dirty_writeback(struct page *page,
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-                                     struct zone *zone,
+                                     struct pglist_data *pgdat,
                                      struct scan_control *sc,
                                      enum ttu_flags ttu_flags,
                                      unsigned long *ret_nr_dirty,
@@ -917,7 +938,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        goto keep;
 
                VM_BUG_ON_PAGE(PageActive(page), page);
-               VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 
                sc->nr_scanned++;
 
@@ -996,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        /* Case 1 above */
                        if (current_is_kswapd() &&
                            PageReclaim(page) &&
-                           test_bit(ZONE_WRITEBACK, &zone->flags)) {
+                           test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
                                nr_immediate++;
                                goto keep_locked;
 
@@ -1092,14 +1112,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         */
                        if (page_is_file_cache(page) &&
                                        (!current_is_kswapd() ||
-                                        !test_bit(ZONE_DIRTY, &zone->flags))) {
+                                        !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
                                /*
                                 * Immediately reclaim when written back.
                                 * Similar in principal to deactivate_page()
                                 * except we already have the page isolated
                                 * and know it's dirty
                                 */
-                               inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+                               inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
                                SetPageReclaim(page);
 
                                goto keep_locked;
@@ -1266,11 +1286,11 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
                }
        }
 
-       ret = shrink_page_list(&clean_pages, zone, &sc,
+       ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
                        TTU_UNMAP|TTU_IGNORE_ACCESS,
                        &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
        list_splice(&clean_pages, page_list);
-       mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+       mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
        return ret;
 }
 
@@ -1348,8 +1368,31 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
        return ret;
 }
 
+
 /*
- * zone->lru_lock is heavily contended.  Some of the functions that
+ * Update LRU sizes after isolating pages. The LRU size updates must
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
+ */
+static __always_inline void update_lru_sizes(struct lruvec *lruvec,
+                       enum lru_list lru, unsigned long *nr_zone_taken,
+                       unsigned long nr_taken)
+{
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               if (!nr_zone_taken[zid])
+                       continue;
+
+               __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
+       }
+
+#ifdef CONFIG_MEMCG
+       mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+#endif
+}
+
+/*
+ * zone_lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
  * and working on them outside the LRU lock.
  *
@@ -1375,10 +1418,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 {
        struct list_head *src = &lruvec->lists[lru];
        unsigned long nr_taken = 0;
-       unsigned long scan;
+       unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+       unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
+       unsigned long scan, nr_pages;
+       LIST_HEAD(pages_skipped);
 
        for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
-                                       !list_empty(src); scan++) {
+                                       !list_empty(src);) {
                struct page *page;
 
                page = lru_to_page(src);
@@ -1386,9 +1432,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
                VM_BUG_ON_PAGE(!PageLRU(page), page);
 
+               if (page_zonenum(page) > sc->reclaim_idx) {
+                       list_move(&page->lru, &pages_skipped);
+                       nr_skipped[page_zonenum(page)]++;
+                       continue;
+               }
+
+               /*
+                * Account for scanned and skipped separately to avoid the pgdat
+                * being prematurely marked unreclaimable by pgdat_reclaimable.
+                */
+               scan++;
+
                switch (__isolate_lru_page(page, mode)) {
                case 0:
-                       nr_taken += hpage_nr_pages(page);
+                       nr_pages = hpage_nr_pages(page);
+                       nr_taken += nr_pages;
+                       nr_zone_taken[page_zonenum(page)] += nr_pages;
                        list_move(&page->lru, dst);
                        break;
 
@@ -1402,9 +1462,38 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                }
        }
 
+       /*
+        * Splice any skipped pages to the start of the LRU list. Note that
+        * this disrupts the LRU order when reclaiming for lower zones but
+        * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
+        * scanning would soon rescan the same pages to skip and put the
+        * system at risk of premature OOM.
+        */
+       if (!list_empty(&pages_skipped)) {
+               int zid;
+               unsigned long total_skipped = 0;
+
+               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                       if (!nr_skipped[zid])
+                               continue;
+
+                       __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+                       total_skipped += nr_skipped[zid];
+               }
+
+               /*
+                * Account skipped pages as a partial scan as the pgdat may be
+                * close to unreclaimable. If the LRU list is empty, account
+                * skipped pages as a full scan.
+                */
+               scan += list_empty(src) ? total_skipped : total_skipped >> 2;
+
+               list_splice(&pages_skipped, src);
+       }
        *nr_scanned = scan;
-       trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+       trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
                                    nr_taken, mode, is_file_lru(lru));
+       update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
        return nr_taken;
 }
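
A quick worked example of the skip accounting above, assuming 32 pages were moved to pages_skipped because they sit in zones above sc->reclaim_idx:

	/*
	 * LRU still holds eligible pages:  scan += 32 >> 2;  -> +8
	 * LRU was drained completely:      scan += 32;       -> +32
	 * so a node whose LRUs contain only ineligible pages still racks up
	 * scan work and is eventually treated as unreclaimable instead of
	 * being rescanned forever.
	 */
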
 
@@ -1444,8 +1533,8 @@ int isolate_lru_page(struct page *page)
                struct zone *zone = page_zone(page);
                struct lruvec *lruvec;
 
-               spin_lock_irq(&zone->lru_lock);
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               spin_lock_irq(zone_lru_lock(zone));
+               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
                if (PageLRU(page)) {
                        int lru = page_lru(page);
                        get_page(page);
@@ -1453,7 +1542,7 @@ int isolate_lru_page(struct page *page)
                        del_page_from_lru_list(page, lruvec, lru);
                        ret = 0;
                }
-               spin_unlock_irq(&zone->lru_lock);
+               spin_unlock_irq(zone_lru_lock(zone));
        }
        return ret;
 }
@@ -1465,7 +1554,7 @@ int isolate_lru_page(struct page *page)
  * the LRU list will go small and be scanned faster than necessary, leading to
  * unnecessary swapping, thrashing and OOM.
  */
-static int too_many_isolated(struct zone *zone, int file,
+static int too_many_isolated(struct pglist_data *pgdat, int file,
                struct scan_control *sc)
 {
        unsigned long inactive, isolated;
@@ -1477,11 +1566,11 @@ static int too_many_isolated(struct zone *zone, int file,
                return 0;
 
        if (file) {
-               inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-               isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+               inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
+               isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
        } else {
-               inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-               isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+               inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
+               isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
        }
 
        /*
@@ -1499,7 +1588,7 @@ static noinline_for_stack void
 putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 {
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        LIST_HEAD(pages_to_free);
 
        /*
@@ -1512,13 +1601,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
                VM_BUG_ON_PAGE(PageLRU(page), page);
                list_del(&page->lru);
                if (unlikely(!page_evictable(page))) {
-                       spin_unlock_irq(&zone->lru_lock);
+                       spin_unlock_irq(&pgdat->lru_lock);
                        putback_lru_page(page);
-                       spin_lock_irq(&zone->lru_lock);
+                       spin_lock_irq(&pgdat->lru_lock);
                        continue;
                }
 
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
                SetPageLRU(page);
                lru = page_lru(page);
@@ -1535,10 +1624,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
                        del_page_from_lru_list(page, lruvec, lru);
 
                        if (unlikely(PageCompound(page))) {
-                               spin_unlock_irq(&zone->lru_lock);
+                               spin_unlock_irq(&pgdat->lru_lock);
                                mem_cgroup_uncharge(page);
                                (*get_compound_page_dtor(page))(page);
-                               spin_lock_irq(&zone->lru_lock);
+                               spin_lock_irq(&pgdat->lru_lock);
                        } else
                                list_add(&page->lru, &pages_to_free);
                }
@@ -1563,8 +1652,32 @@ static int current_may_throttle(void)
                bdi_write_congested(current->backing_dev_info);
 }
 
+static bool inactive_reclaimable_pages(struct lruvec *lruvec,
+                               struct scan_control *sc, enum lru_list lru)
+{
+       int zid;
+       struct zone *zone;
+       int file = is_file_lru(lru);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+       if (!global_reclaim(sc))
+               return true;
+
+       for (zid = sc->reclaim_idx; zid >= 0; zid--) {
+               zone = &pgdat->node_zones[zid];
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
+                               LRU_FILE * file) >= SWAP_CLUSTER_MAX)
+                       return true;
+       }
+
+       return false;
+}
+
 /*
- * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
  */
 static noinline_for_stack unsigned long
@@ -1582,10 +1695,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        unsigned long nr_immediate = 0;
        isolate_mode_t isolate_mode = 0;
        int file = is_file_lru(lru);
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
-       while (unlikely(too_many_isolated(zone, file, sc))) {
+       if (!inactive_reclaimable_pages(lruvec, sc, lru))
+               return 0;
+
+       while (unlikely(too_many_isolated(pgdat, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
 
                /* We are about to die and free our memory. Return now. */
@@ -1600,48 +1716,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        if (!sc->may_writepage)
                isolate_mode |= ISOLATE_CLEAN;
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
 
        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
                                     &nr_scanned, sc, isolate_mode, lru);
 
-       update_lru_size(lruvec, lru, -nr_taken);
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
        reclaim_stat->recent_scanned[file] += nr_taken;
 
        if (global_reclaim(sc)) {
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+               __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
                if (current_is_kswapd())
-                       __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+                       __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
                else
-                       __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
+                       __count_vm_events(PGSCAN_DIRECT, nr_scanned);
        }
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
 
        if (nr_taken == 0)
                return 0;
 
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+       nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
                                &nr_dirty, &nr_unqueued_dirty, &nr_congested,
                                &nr_writeback, &nr_immediate,
                                false);
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
 
        if (global_reclaim(sc)) {
                if (current_is_kswapd())
-                       __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
-                                              nr_reclaimed);
+                       __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
                else
-                       __count_zone_vm_events(PGSTEAL_DIRECT, zone,
-                                              nr_reclaimed);
+                       __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
        }
 
        putback_inactive_pages(lruvec, &page_list);
 
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
 
        mem_cgroup_uncharge_list(&page_list);
        free_hot_cold_page_list(&page_list, true);
@@ -1661,7 +1774,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         * are encountered in the nr_immediate check below.
         */
        if (nr_writeback && nr_writeback == nr_taken)
-               set_bit(ZONE_WRITEBACK, &zone->flags);
+               set_bit(PGDAT_WRITEBACK, &pgdat->flags);
 
        /*
         * Legacy memcg will stall in page writeback so avoid forcibly
@@ -1673,16 +1786,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                 * backed by a congested BDI and wait_iff_congested will stall.
                 */
                if (nr_dirty && nr_dirty == nr_congested)
-                       set_bit(ZONE_CONGESTED, &zone->flags);
+                       set_bit(PGDAT_CONGESTED, &pgdat->flags);
 
                /*
                 * If dirty pages are scanned that are not queued for IO, it
                 * implies that flushers are not keeping up. In this case, flag
-                * the zone ZONE_DIRTY and kswapd will start writing pages from
+                * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
                 * reclaim context.
                 */
                if (nr_unqueued_dirty == nr_taken)
-                       set_bit(ZONE_DIRTY, &zone->flags);
+                       set_bit(PGDAT_DIRTY, &pgdat->flags);
 
                /*
                 * If kswapd scans pages marked for immediate
@@ -1701,9 +1814,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         */
        if (!sc->hibernation_mode && !current_is_kswapd() &&
            current_may_throttle())
-               wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+               wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
 
-       trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
+       trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+                       nr_scanned, nr_reclaimed,
                        sc->priority, file);
        return nr_reclaimed;
 }
@@ -1715,9 +1829,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  * processes, from rmap.
  *
  * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation.  But if
+ * appropriate to hold zone_lru_lock across the whole operation.  But if
  * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page.  It's impossible to balance
+ * should drop zone_lru_lock around each page.  It's impossible to balance
  * this, so instead we remove the pages from the LRU while processing them.
  * It is safe to rely on PG_active against the non-LRU pages in here because
  * nobody will play with that bit on a non-LRU page.
@@ -1731,20 +1845,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
                                     struct list_head *pages_to_free,
                                     enum lru_list lru)
 {
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        unsigned long pgmoved = 0;
        struct page *page;
        int nr_pages;
 
        while (!list_empty(list)) {
                page = lru_to_page(list);
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
                VM_BUG_ON_PAGE(PageLRU(page), page);
                SetPageLRU(page);
 
                nr_pages = hpage_nr_pages(page);
-               update_lru_size(lruvec, lru, nr_pages);
+               update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
                list_move(&page->lru, &lruvec->lists[lru]);
                pgmoved += nr_pages;
 
@@ -1754,10 +1868,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
                        del_page_from_lru_list(page, lruvec, lru);
 
                        if (unlikely(PageCompound(page))) {
-                               spin_unlock_irq(&zone->lru_lock);
+                               spin_unlock_irq(&pgdat->lru_lock);
                                mem_cgroup_uncharge(page);
                                (*get_compound_page_dtor(page))(page);
-                               spin_lock_irq(&zone->lru_lock);
+                               spin_lock_irq(&pgdat->lru_lock);
                        } else
                                list_add(&page->lru, pages_to_free);
                }
@@ -1783,7 +1897,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
        unsigned long nr_rotated = 0;
        isolate_mode_t isolate_mode = 0;
        int file = is_file_lru(lru);
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
        lru_add_drain();
 
@@ -1792,20 +1906,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
        if (!sc->may_writepage)
                isolate_mode |= ISOLATE_CLEAN;
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
 
        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                                     &nr_scanned, sc, isolate_mode, lru);
 
-       update_lru_size(lruvec, lru, -nr_taken);
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
        reclaim_stat->recent_scanned[file] += nr_taken;
 
        if (global_reclaim(sc))
-               __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
-       __count_zone_vm_events(PGREFILL, zone, nr_scanned);
+               __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
+       __count_vm_events(PGREFILL, nr_scanned);
 
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
 
        while (!list_empty(&l_hold)) {
                cond_resched();
@@ -1850,7 +1963,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
        /*
         * Move pages back to the lru list.
         */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
        /*
         * Count referenced pages from currently used mappings as rotated,
         * even though only some of them are actually re-activated.  This
@@ -1861,8 +1974,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
        move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
        move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
-       __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
-       spin_unlock_irq(&zone->lru_lock);
+       __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+       spin_unlock_irq(&pgdat->lru_lock);
 
        mem_cgroup_uncharge_list(&l_hold);
        free_hot_cold_page_list(&l_hold, true);
@@ -1894,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *    1TB     101        10GB
  *   10TB     320        32GB
  */
-static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+                                               struct scan_control *sc)
 {
        unsigned long inactive_ratio;
        unsigned long inactive;
        unsigned long active;
        unsigned long gb;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       int zid;
 
        /*
         * If we don't have swap space, anonymous page deactivation
@@ -1911,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
        inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
        active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
 
+       /*
+        * For zone-constrained allocations, it is necessary to check if
+        * deactivations are required for lowmem to be reclaimed. This
+        * calculates the inactive/active pages available in eligible zones.
+        */
+       for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
+               struct zone *zone = &pgdat->node_zones[zid];
+               unsigned long inactive_zone, active_zone;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               inactive_zone = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + (file * LRU_FILE));
+               active_zone = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
+
+               inactive -= min(inactive, inactive_zone);
+               active -= min(active, active_zone);
+       }
+
        gb = (inactive + active) >> (30 - PAGE_SHIFT);
        if (gb)
                inactive_ratio = int_sqrt(10 * gb);
@@ -1924,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                 struct lruvec *lruvec, struct scan_control *sc)
 {
        if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc))
                        shrink_active_list(nr_to_scan, lruvec, sc, lru);
                return 0;
        }
@@ -1956,7 +2093,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
        u64 fraction[2];
        u64 denominator = 0;    /* gcc */
-       struct zone *zone = lruvec_zone(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        unsigned long anon_prio, file_prio;
        enum scan_balance scan_balance;
        unsigned long anon, file;
@@ -1977,7 +2114,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * well.
         */
        if (current_is_kswapd()) {
-               if (!zone_reclaimable(zone))
+               if (!pgdat_reclaimable(pgdat))
                        force_scan = true;
                if (!mem_cgroup_online(memcg))
                        force_scan = true;
@@ -2023,14 +2160,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * anon pages.  Try to detect this based on file LRU size.
         */
        if (global_reclaim(sc)) {
-               unsigned long zonefile;
-               unsigned long zonefree;
+               unsigned long pgdatfile;
+               unsigned long pgdatfree;
+               int z;
+               unsigned long total_high_wmark = 0;
 
-               zonefree = zone_page_state(zone, NR_FREE_PAGES);
-               zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
-                          zone_page_state(zone, NR_INACTIVE_FILE);
+               pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+               pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
+                          node_page_state(pgdat, NR_INACTIVE_FILE);
 
-               if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
+               for (z = 0; z < MAX_NR_ZONES; z++) {
+                       struct zone *zone = &pgdat->node_zones[z];
+                       if (!populated_zone(zone))
+                               continue;
+
+                       total_high_wmark += high_wmark_pages(zone);
+               }
+
+               if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
                        scan_balance = SCAN_ANON;
                        goto out;
                }
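
The hunk above forces anon scanning once free plus file pages for the whole node no longer cover the sum of the zones' high watermarks, i.e. the page cache is effectively depleted. A small standalone illustration of that decision (the watermark and page counts are invented for the example):

#include <stdio.h>
#include <stdbool.h>

/* Model of the SCAN_ANON trigger: when file cache plus free memory of the
 * node would not even cover the combined high watermarks, reclaim must
 * target anon pages instead of thrashing the remaining page cache. */
static bool force_scan_anon(unsigned long pgdatfree, unsigned long pgdatfile,
			    const unsigned long *high_wmark, int nr_zones)
{
	unsigned long total_high_wmark = 0;
	int z;

	for (z = 0; z < nr_zones; z++)
		total_high_wmark += high_wmark[z];

	return pgdatfile + pgdatfree <= total_high_wmark;
}

int main(void)
{
	/* Two populated zones with invented high watermarks, in pages. */
	unsigned long high_wmark[] = { 16384, 65536 };

	printf("%d\n", force_scan_anon(40000, 30000, high_wmark, 2)); /* 70000 <= 81920: 1 */
	printf("%d\n", force_scan_anon(90000, 50000, high_wmark, 2)); /* 140000 > 81920: 0 */
	return 0;
}
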
@@ -2045,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * lruvec even if it has plenty of old anonymous pages unless the
         * system is under heavy pressure.
         */
-       if (!inactive_list_is_low(lruvec, true) &&
+       if (!inactive_list_is_low(lruvec, true, sc) &&
            lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
@@ -2077,7 +2224,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
        file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
                lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
 
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&pgdat->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
                reclaim_stat->recent_scanned[0] /= 2;
                reclaim_stat->recent_rotated[0] /= 2;
@@ -2098,7 +2245,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 
        fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
        fp /= reclaim_stat->recent_rotated[1] + 1;
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&pgdat->lru_lock);
 
        fraction[0] = ap;
        fraction[1] = fp;
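
For reference, the fractions above weight anon versus file pressure by how recently each list's pages were rotated back as referenced. A worked example with invented counters, reproducing only the arithmetic visible in the hunk:

#include <stdio.h>

int main(void)
{
	/* Invented sample values: swappiness 60 and recent scan/rotate counters. */
	unsigned long long anon_prio = 60, file_prio = 200 - 60;
	unsigned long long recent_scanned[2] = { 4000, 12000 };	/* [0]=anon [1]=file */
	unsigned long long recent_rotated[2] = { 3000,   500 };

	/* Pressure rises with pages scanned and falls with pages rotated
	 * back to the active list (i.e. found referenced and still hot). */
	unsigned long long ap = anon_prio * (recent_scanned[0] + 1) / (recent_rotated[0] + 1);
	unsigned long long fp = file_prio * (recent_scanned[1] + 1) / (recent_rotated[1] + 1);

	printf("ap=%llu fp=%llu\n", ap, fp);	/* ap=79, fp=3353: the file LRU gets most of the pressure */
	return 0;
}
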
@@ -2174,12 +2321,12 @@ static inline void init_tlb_ubc(void)
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 
 /*
- * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
+ * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
+static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
                              struct scan_control *sc, unsigned long *lru_pages)
 {
-       struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
        unsigned long nr[NR_LRU_LISTS];
        unsigned long targets[NR_LRU_LISTS];
        unsigned long nr_to_scan;
@@ -2287,7 +2434,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_list_is_low(lruvec, false))
+       if (inactive_list_is_low(lruvec, false, sc))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 
@@ -2312,13 +2459,14 @@ static bool in_reclaim_compaction(struct scan_control *sc)
  * calls try_to_compact_zone() that it will have enough free pages to succeed.
  * It will give up earlier than that if there is difficulty reclaiming pages.
  */
-static inline bool should_continue_reclaim(struct zone *zone,
+static inline bool should_continue_reclaim(struct pglist_data *pgdat,
                                        unsigned long nr_reclaimed,
                                        unsigned long nr_scanned,
                                        struct scan_control *sc)
 {
        unsigned long pages_for_compaction;
        unsigned long inactive_lru_pages;
+       int z;
 
        /* If not in reclaim/compaction mode, stop */
        if (!in_reclaim_compaction(sc))
@@ -2352,25 +2500,32 @@ static inline bool should_continue_reclaim(struct zone *zone,
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
-       inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+       inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
        if (get_nr_swap_pages() > 0)
-               inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
+               inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
        if (sc->nr_reclaimed < pages_for_compaction &&
                        inactive_lru_pages > pages_for_compaction)
                return true;
 
        /* If compaction would go ahead or the allocation would succeed, stop */
-       switch (compaction_suitable(zone, sc->order, 0, 0)) {
-       case COMPACT_PARTIAL:
-       case COMPACT_CONTINUE:
-               return false;
-       default:
-               return true;
+       for (z = 0; z <= sc->reclaim_idx; z++) {
+               struct zone *zone = &pgdat->node_zones[z];
+               if (!populated_zone(zone))
+                       continue;
+
+               switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
+               case COMPACT_PARTIAL:
+               case COMPACT_CONTINUE:
+                       return false;
+               default:
+                       /* check next zone */
+                       ;
+               }
        }
+       return true;
 }
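
pages_for_compaction above is 2 << order, i.e. twice the requested allocation size in pages. For an order-9 request (a 2MB THP with 4K pages, assumed here) reclaim keeps going while fewer than 1024 pages have been reclaimed and the inactive lists still hold more than that. A sketch of just that threshold, with invented counts:

#include <stdio.h>
#include <stdbool.h>

/* Model of the continue-reclaim threshold used above. */
static bool keep_reclaiming(int order, unsigned long nr_reclaimed,
			    unsigned long inactive_lru_pages)
{
	unsigned long pages_for_compaction = 2UL << order;

	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
	/* order-9: threshold is 2 << 9 = 1024 pages (4MB with 4K pages). */
	printf("%d\n", keep_reclaiming(9,  300, 50000));	/* 1: keep going */
	printf("%d\n", keep_reclaiming(9, 2000, 50000));	/* 0: enough reclaimed */
	return 0;
}
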
 
-static bool shrink_zone(struct zone *zone, struct scan_control *sc,
-                       bool is_classzone)
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long nr_reclaimed, nr_scanned;
@@ -2379,10 +2534,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
        do {
                struct mem_cgroup *root = sc->target_mem_cgroup;
                struct mem_cgroup_reclaim_cookie reclaim = {
-                       .zone = zone,
+                       .pgdat = pgdat,
                        .priority = sc->priority,
                };
-               unsigned long zone_lru_pages = 0;
+               unsigned long node_lru_pages = 0;
                struct mem_cgroup *memcg;
 
                nr_reclaimed = sc->nr_reclaimed;
@@ -2403,11 +2558,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                        reclaimed = sc->nr_reclaimed;
                        scanned = sc->nr_scanned;
 
-                       shrink_zone_memcg(zone, memcg, sc, &lru_pages);
-                       zone_lru_pages += lru_pages;
+                       shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
+                       node_lru_pages += lru_pages;
 
-                       if (memcg && is_classzone)
-                               shrink_slab(sc->gfp_mask, zone_to_nid(zone),
+                       if (!global_reclaim(sc))
+                               shrink_slab(sc->gfp_mask, pgdat->node_id,
                                            memcg, sc->nr_scanned - scanned,
                                            lru_pages);
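
The scanned/LRU-size pair passed to shrink_slab() here is what keeps slab pressure proportional to LRU pressure. A deliberately simplified model of that proportionality (this is not the kernel's exact do_shrink_slab() math, and the counts are invented):

#include <stdio.h>

/* Simplified proportional shrink target: if N% of the LRU pages were
 * scanned, ask shrinkers to scan roughly N% of their freeable objects. */
static unsigned long slab_scan_target(unsigned long freeable_objects,
				      unsigned long nr_scanned,
				      unsigned long lru_pages)
{
	if (!lru_pages)
		return 0;
	return freeable_objects * nr_scanned / lru_pages;
}

int main(void)
{
	/* 1% of the LRU scanned -> roughly 1% of 200000 cached objects targeted. */
	printf("%lu\n", slab_scan_target(200000, 1000, 100000));	/* 2000 */
	return 0;
}
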
 
@@ -2419,7 +2574,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                        /*
                         * Direct reclaim and kswapd have to scan all memory
                         * cgroups to fulfill the overall scan target for the
-                        * zone.
+                        * node.
                         *
                         * Limit reclaim, on the other hand, only cares about
                         * nr_to_reclaim pages to be reclaimed and it will
@@ -2437,10 +2592,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                 * Shrink the slab caches in the same proportion that
                 * the eligible LRU pages were scanned.
                 */
-               if (global_reclaim(sc) && is_classzone)
-                       shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
+               if (global_reclaim(sc))
+                       shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
                                    sc->nr_scanned - nr_scanned,
-                                   zone_lru_pages);
+                                   node_lru_pages);
 
                if (reclaim_state) {
                        sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2455,7 +2610,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                if (sc->nr_reclaimed - nr_reclaimed)
                        reclaimable = true;
 
-       } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+       } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
 
        return reclaimable;
@@ -2465,9 +2620,9 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
  * Returns true if compaction should go ahead for a high-order request, or
  * the high-order allocation would succeed without compaction.
  */
-static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 {
-       unsigned long balance_gap, watermark;
+       unsigned long watermark;
        bool watermark_ok;
 
        /*
@@ -2476,23 +2631,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
         * there is a buffer of free pages available to give compaction
         * a reasonable chance of completing and allocating the page
         */
-       balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
-                       zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
-       watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
+       watermark = high_wmark_pages(zone) + (2UL << sc->order);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
 
        /*
         * If compaction is deferred, reclaim up to a point where
         * compaction will have a chance of success when re-enabled
         */
-       if (compaction_deferred(zone, order))
+       if (compaction_deferred(zone, sc->order))
                return watermark_ok;
 
        /*
         * If compaction is not ready to start and allocation is not likely
         * to succeed without it, then keep reclaiming.
         */
-       if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED)
+       if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
                return false;
 
        return watermark_ok;
@@ -2503,14 +2656,6 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
  * try to reclaim pages from zones which will satisfy the caller's allocation
  * request.
  *
- * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
- * Because:
- * a) The caller may be trying to free *extra* pages to satisfy a higher-order
- *    allocation or
- * b) The target zone may be at high_wmark_pages(zone) but the lower zones
- *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
- *    zone defense algorithm.
- *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
@@ -2521,7 +2666,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
        gfp_t orig_mask;
-       enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+       pg_data_t *last_pgdat = NULL;
 
        /*
         * If the number of buffer_heads in the machine exceeds the maximum
@@ -2529,21 +2674,13 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
         * highmem pages could be pinning lowmem pages storing buffer_heads
         */
        orig_mask = sc->gfp_mask;
-       if (buffer_heads_over_limit)
+       if (buffer_heads_over_limit) {
                sc->gfp_mask |= __GFP_HIGHMEM;
+               sc->reclaim_idx = gfp_zone(sc->gfp_mask);
+       }
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
-                                       gfp_zone(sc->gfp_mask), sc->nodemask) {
-               enum zone_type classzone_idx;
-
-               if (!populated_zone(zone))
-                       continue;
-
-               classzone_idx = requested_highidx;
-               while (!populated_zone(zone->zone_pgdat->node_zones +
-                                                       classzone_idx))
-                       classzone_idx--;
-
+                                       sc->reclaim_idx, sc->nodemask) {
                /*
                 * Take care memory controller reclaiming has small influence
                 * to global LRU.
@@ -2554,7 +2691,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                continue;
 
                        if (sc->priority != DEF_PRIORITY &&
-                           !zone_reclaimable(zone))
+                           !pgdat_reclaimable(zone->zone_pgdat))
                                continue;       /* Let kswapd poll it */
 
                        /*
@@ -2568,12 +2705,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                         */
                        if (IS_ENABLED(CONFIG_COMPACTION) &&
                            sc->order > PAGE_ALLOC_COSTLY_ORDER &&
-                           zonelist_zone_idx(z) <= requested_highidx &&
-                           compaction_ready(zone, sc->order, requested_highidx)) {
+                           compaction_ready(zone, sc)) {
                                sc->compaction_ready = true;
                                continue;
                        }
 
+                       /*
+                        * Shrink each node in the zonelist once. If the
+                        * zonelist is ordered by zone (not the default) then a
+                        * node may be shrunk multiple times but in that case
+                        * the user prefers lower zones being preserved.
+                        */
+                       if (zone->zone_pgdat == last_pgdat)
+                               continue;
+
                        /*
                         * This steals pages from memory cgroups over softlimit
                         * and returns the number of reclaimed pages and
@@ -2581,7 +2726,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                         * and balancing, not for a memcg's limit.
                         */
                        nr_soft_scanned = 0;
-                       nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+                       nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
                                                sc->order, sc->gfp_mask,
                                                &nr_soft_scanned);
                        sc->nr_reclaimed += nr_soft_reclaimed;
@@ -2589,7 +2734,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                        /* need some check for avoid more shrink_zone() */
                }
 
-               shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+               /* See comment about same check for global reclaim above */
+               if (zone->zone_pgdat == last_pgdat)
+                       continue;
+               last_pgdat = zone->zone_pgdat;
+               shrink_node(zone->zone_pgdat, sc);
        }
 
        /*
@@ -2625,7 +2774,7 @@ retry:
        delayacct_freepages_start();
 
        if (global_reclaim(sc))
-               count_vm_event(ALLOCSTALL);
+               __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
        do {
                vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2692,7 +2841,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
                if (!populated_zone(zone) ||
-                   zone_reclaimable_pages(zone) == 0)
+                   pgdat_reclaimable_pages(pgdat) == 0)
                        continue;
 
                pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2707,7 +2856,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 
        /* kswapd must be awake if processes are being throttled */
        if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
-               pgdat->classzone_idx = min(pgdat->classzone_idx,
+               pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
                                                (enum zone_type)ZONE_NORMAL);
                wake_up_interruptible(&pgdat->kswapd_wait);
        }
@@ -2815,6 +2964,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        struct scan_control sc = {
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .reclaim_idx = gfp_zone(gfp_mask),
                .order = order,
                .nodemask = nodemask,
                .priority = DEF_PRIORITY,
@@ -2833,7 +2983,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
        trace_mm_vmscan_direct_reclaim_begin(order,
                                sc.may_writepage,
-                               gfp_mask);
+                               gfp_mask,
+                               sc.reclaim_idx);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
@@ -2844,9 +2995,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #ifdef CONFIG_MEMCG
 
-unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
+unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
                                                gfp_t gfp_mask, bool noswap,
-                                               struct zone *zone,
+                                               pg_data_t *pgdat,
                                                unsigned long *nr_scanned)
 {
        struct scan_control sc = {
@@ -2854,6 +3005,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                .target_mem_cgroup = memcg,
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
+               .reclaim_idx = MAX_NR_ZONES - 1,
                .may_swap = !noswap,
        };
        unsigned long lru_pages;
@@ -2863,16 +3015,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
                                                      sc.may_writepage,
-                                                     sc.gfp_mask);
+                                                     sc.gfp_mask,
+                                                     sc.reclaim_idx);
 
        /*
         * NOTE: Although we can get the priority field, using it
         * here is not a good idea, since it limits the pages we can scan.
-        * if we don't reclaim here, the shrink_zone from balance_pgdat
+        * if we don't reclaim here, the shrink_node from balance_pgdat
         * will pick up pages from other mem cgroup's as well. We hack
         * the priority and make it zero.
         */
-       shrink_zone_memcg(zone, memcg, &sc, &lru_pages);
+       shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
 
        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2892,6 +3045,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+               .reclaim_idx = MAX_NR_ZONES - 1,
                .target_mem_cgroup = memcg,
                .priority = DEF_PRIORITY,
                .may_writepage = !laptop_mode,
@@ -2910,7 +3064,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_reclaim_begin(0,
                                            sc.may_writepage,
-                                           sc.gfp_mask);
+                                           sc.gfp_mask,
+                                           sc.reclaim_idx);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
@@ -2920,7 +3075,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 }
 #endif
 
-static void age_active_anon(struct zone *zone, struct scan_control *sc)
+static void age_active_anon(struct pglist_data *pgdat,
+                               struct scan_control *sc)
 {
        struct mem_cgroup *memcg;
 
@@ -2929,9 +3085,9 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 
        memcg = mem_cgroup_iter(NULL, NULL, NULL);
        do {
-               struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+               struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-               if (inactive_list_is_low(lruvec, false))
+               if (inactive_list_is_low(lruvec, false, sc))
                        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                           sc, LRU_ACTIVE_ANON);
 
@@ -2939,82 +3095,21 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
        } while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order, bool highorder,
-                       unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
 {
-       unsigned long mark = high_wmark_pages(zone) + balance_gap;
+       unsigned long mark = high_wmark_pages(zone);
+
+       if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+               return false;
 
        /*
-        * When checking from pgdat_balanced(), kswapd should stop and sleep
-        * when it reaches the high order-0 watermark and let kcompactd take
-        * over. Other callers such as wakeup_kswapd() want to determine the
-        * true high-order watermark.
+        * If any eligible zone is balanced then the node is not considered
+        * to be congested or dirty
         */
-       if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
-               mark += (1UL << order);
-               order = 0;
-       }
-
-       return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
-}
-
-/*
- * pgdat_balanced() is used when checking if a node is balanced.
- *
- * For order-0, all zones must be balanced!
- *
- * For high-order allocations only zones that meet watermarks and are in a
- * zone allowed by the callers classzone_idx are added to balanced_pages. The
- * total of balanced pages must be at least 25% of the zones allowed by
- * classzone_idx for the node to be considered balanced. Forcing all zones to
- * be balanced for high orders can cause excessive reclaim when there are
- * imbalanced zones.
- * The choice of 25% is due to
- *   o a 16M DMA zone that is balanced will not balance a zone on any
- *     reasonable sized machine
- *   o On all other machines, the top zone must be at least a reasonable
- *     percentage of the middle zones. For example, on 32-bit x86, highmem
- *     would need to be at least 256M for it to be balance a whole node.
- *     Similarly, on x86-64 the Normal zone would need to be at least 1G
- *     to balance a node on its own. These seemed like reasonable ratios.
- */
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
-{
-       unsigned long managed_pages = 0;
-       unsigned long balanced_pages = 0;
-       int i;
-
-       /* Check the watermark levels */
-       for (i = 0; i <= classzone_idx; i++) {
-               struct zone *zone = pgdat->node_zones + i;
-
-               if (!populated_zone(zone))
-                       continue;
-
-               managed_pages += zone->managed_pages;
-
-               /*
-                * A special case here:
-                *
-                * balance_pgdat() skips over all_unreclaimable after
-                * DEF_PRIORITY. Effectively, it considers them balanced so
-                * they must be considered balanced here as well!
-                */
-               if (!zone_reclaimable(zone)) {
-                       balanced_pages += zone->managed_pages;
-                       continue;
-               }
+       clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
+       clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
 
-               if (zone_balanced(zone, order, false, 0, i))
-                       balanced_pages += zone->managed_pages;
-               else if (!order)
-                       return false;
-       }
-
-       if (order)
-               return balanced_pages >= (managed_pages >> 2);
-       else
-               return true;
+       return true;
 }
 
 /*
@@ -3023,12 +3118,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
  *
  * Returns true if kswapd is ready to sleep
  */
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
-                                       int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 {
-       /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
-       if (remaining)
-               return false;
+       int i;
 
        /*
         * The throttled processes are normally woken up in balance_pgdat() as
@@ -3046,91 +3138,81 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
        if (waitqueue_active(&pgdat->pfmemalloc_wait))
                wake_up_all(&pgdat->pfmemalloc_wait);
 
-       return pgdat_balanced(pgdat, order, classzone_idx);
+       for (i = 0; i <= classzone_idx; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               if (!zone_balanced(zone, order, classzone_idx))
+                       return false;
+       }
+
+       return true;
 }
 
 /*
- * kswapd shrinks the zone by the number of pages required to reach
- * the high watermark.
+ * kswapd shrinks a node of pages that are at or below the highest usable
+ * zone that is currently unbalanced.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
  * reclaim or if the lack of progress was due to pages under writeback.
  * This is used to determine if the scanning priority needs to be raised.
  */
-static bool kswapd_shrink_zone(struct zone *zone,
-                              int classzone_idx,
+static bool kswapd_shrink_node(pg_data_t *pgdat,
                               struct scan_control *sc)
 {
-       unsigned long balance_gap;
-       bool lowmem_pressure;
+       struct zone *zone;
+       int z;
 
-       /* Reclaim above the high watermark. */
-       sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+       /* Reclaim a number of pages proportional to the number of zones */
+       sc->nr_to_reclaim = 0;
+       for (z = 0; z <= sc->reclaim_idx; z++) {
+               zone = pgdat->node_zones + z;
+               if (!populated_zone(zone))
+                       continue;
 
-       /*
-        * We put equal pressure on every zone, unless one zone has way too
-        * many pages free already. The "too many pages" is defined as the
-        * high wmark plus a "gap" where the gap is either the low
-        * watermark or 1% of the zone, whichever is smaller.
-        */
-       balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
-                       zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+               sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
+       }
 
        /*
-        * If there is no low memory pressure or the zone is balanced then no
-        * reclaim is necessary
+        * Historically care was taken to put equal pressure on all zones but
+        * now pressure is applied based on node LRU order.
         */
-       lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-       if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
-                                               balance_gap, classzone_idx))
-               return true;
-
-       shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
-
-       clear_bit(ZONE_WRITEBACK, &zone->flags);
+       shrink_node(pgdat, sc);
 
        /*
-        * If a zone reaches its high watermark, consider it to be no longer
-        * congested. It's possible there are dirty pages backed by congested
-        * BDIs but as pressure is relieved, speculatively avoid congestion
-        * waits.
+        * Fragmentation may mean that the system cannot be rebalanced for
+        * high-order allocations. If twice the allocation size has been
+        * reclaimed then recheck watermarks only at order-0 to prevent
+        * excessive reclaim. Assume that a process requested a high-order
+        * excessive reclaim. Assume that a process that requested a
+        * high-order allocation can direct reclaim/compact.
-       if (zone_reclaimable(zone) &&
-           zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
-               clear_bit(ZONE_CONGESTED, &zone->flags);
-               clear_bit(ZONE_DIRTY, &zone->flags);
-       }
+       if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
+               sc->order = 0;
 
        return sc->nr_scanned >= sc->nr_to_reclaim;
 }
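
The reclaim target above is the sum of the eligible zones' high watermarks, with each zone floored at SWAP_CLUSTER_MAX, and a high-order pass that has already reclaimed twice the request drops back to order-0 checks. A small sketch of both steps with invented watermark values:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	/* Invented per-zone high watermarks for one node, in pages. */
	unsigned long high_wmark[] = { 20, 16384, 65536 };	/* tiny DMA, DMA32, Normal */
	unsigned long nr_to_reclaim = 0;
	unsigned long nr_reclaimed = 1500;
	int order = 9;

	for (unsigned int z = 0; z < sizeof(high_wmark) / sizeof(high_wmark[0]); z++)
		nr_to_reclaim += max_ul(high_wmark[z], SWAP_CLUSTER_MAX);

	printf("nr_to_reclaim=%lu\n", nr_to_reclaim);	/* 32 + 16384 + 65536 = 81952 */

	/* Twice the order-9 request (1024 pages) already reclaimed:
	 * recheck watermarks at order-0 for the rest of this pass. */
	if (order && nr_reclaimed >= 2UL << order)
		order = 0;
	printf("order=%d\n", order);	/* 0 */
	return 0;
}
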
 
 /*
- * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at high_wmark_pages(zone).
+ * For kswapd, balance_pgdat() will reclaim pages across a node from zones
+ * that are eligible for use by the caller until at least one zone is
+ * balanced.
  *
- * Returns the highest zone idx kswapd was reclaiming at
- *
- * There is special handling here for zones which are full of pinned pages.
- * This can happen if the pages are all mlocked, or if they are all used by
- * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
- * What we do is to detect the case where all pages in the zone have been
- * scanned twice and there has been zero successful reclaim.  Mark the zone as
- * dead and from now on, only perform a short scan.  Basically we're polling
- * the zone for when the problem goes away.
+ * Returns the order kswapd finished reclaiming at.
  *
  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
- * lower zones regardless of the number of free pages in the lower zones. This
- * interoperates with the page allocator fallback scheme to ensure that aging
- * of pages is balanced across the zones.
+ * found to have free_pages <= high_wmark_pages(zone), any page in that zone
+ * or lower is eligible for reclaim until at least one usable zone is
+ * balanced.
  */
 static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 {
        int i;
-       int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
+       struct zone *zone;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .order = order,
@@ -3145,100 +3227,77 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                bool raise_priority = true;
 
                sc.nr_reclaimed = 0;
+               sc.reclaim_idx = classzone_idx;
 
                /*
-                * Scan in the highmem->dma direction for the highest
-                * zone which needs scanning
+                * If the number of buffer_heads exceeds the maximum allowed
+                * then consider reclaiming from all zones. This has a dual
+                * purpose -- on 64-bit systems it is expected that
+                * buffer_heads are stripped during active rotation. On 32-bit
+                * systems, highmem pages can pin lowmem memory and shrinking
+                * buffers can relieve lowmem pressure. Reclaim may still not
+                * go ahead if all eligible zones for the original allocation
+                * request are balanced to avoid excessive reclaim from kswapd.
                 */
-               for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-                       struct zone *zone = pgdat->node_zones + i;
-
-                       if (!populated_zone(zone))
-                               continue;
-
-                       if (sc.priority != DEF_PRIORITY &&
-                           !zone_reclaimable(zone))
-                               continue;
-
-                       /*
-                        * Do some background aging of the anon list, to give
-                        * pages a chance to be referenced before reclaiming.
-                        */
-                       age_active_anon(zone, &sc);
+               if (buffer_heads_over_limit) {
+                       for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
+                               zone = pgdat->node_zones + i;
+                               if (!populated_zone(zone))
+                                       continue;
 
-                       /*
-                        * If the number of buffer_heads in the machine
-                        * exceeds the maximum allowed level and this node
-                        * has a highmem zone, force kswapd to reclaim from
-                        * it to relieve lowmem pressure.
-                        */
-                       if (buffer_heads_over_limit && is_highmem_idx(i)) {
-                               end_zone = i;
+                               sc.reclaim_idx = i;
                                break;
                        }
+               }
 
-                       if (!zone_balanced(zone, order, false, 0, 0)) {
-                               end_zone = i;
-                               break;
-                       } else {
-                               /*
-                                * If balanced, clear the dirty and congested
-                                * flags
-                                */
-                               clear_bit(ZONE_CONGESTED, &zone->flags);
-                               clear_bit(ZONE_DIRTY, &zone->flags);
-                       }
+               /*
+                * Only reclaim if there are no eligible zones. Check from
+                * high to low zone as allocations prefer higher zones.
+                * Scanning from low to high zone would allow congestion to be
+                * cleared during a very small window when a small low
+                * zone was balanced even under extreme pressure when the
+                * overall node may be congested. Note that sc.reclaim_idx
+                * is not used as buffer_heads_over_limit may have adjusted
+                * it.
+                */
+               for (i = classzone_idx; i >= 0; i--) {
+                       zone = pgdat->node_zones + i;
+                       if (!populated_zone(zone))
+                               continue;
+
+                       if (zone_balanced(zone, sc.order, classzone_idx))
+                               goto out;
                }
 
-               if (i < 0)
-                       goto out;
+               /*
+                * Do some background aging of the anon list, to give
+                * pages a chance to be referenced before reclaiming. All
+                * pages are rotated regardless of classzone as this is
+                * about consistent aging.
+                */
+               age_active_anon(pgdat, &sc);
 
                /*
                 * If we're getting trouble reclaiming, start doing writepage
                 * even in laptop mode.
                 */
-               if (sc.priority < DEF_PRIORITY - 2)
+               if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
                        sc.may_writepage = 1;
 
+               /* Call soft limit reclaim before calling shrink_node. */
+               sc.nr_scanned = 0;
+               nr_soft_scanned = 0;
+               nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
+                                               sc.gfp_mask, &nr_soft_scanned);
+               sc.nr_reclaimed += nr_soft_reclaimed;
+
                /*
-                * Now scan the zone in the dma->highmem direction, stopping
-                * at the last zone which needs scanning.
-                *
-                * We do this because the page allocator works in the opposite
-                * direction.  This prevents the page allocator from allocating
-                * pages behind kswapd's direction of progress, which would
-                * cause too much scanning of the lower zones.
+                * There should be no need to raise the scanning priority if
+                * enough pages are already being scanned that the high
+                * watermark would be met at 100% efficiency.
                 */
-               for (i = 0; i <= end_zone; i++) {
-                       struct zone *zone = pgdat->node_zones + i;
-
-                       if (!populated_zone(zone))
-                               continue;
-
-                       if (sc.priority != DEF_PRIORITY &&
-                           !zone_reclaimable(zone))
-                               continue;
-
-                       sc.nr_scanned = 0;
-
-                       nr_soft_scanned = 0;
-                       /*
-                        * Call soft limit reclaim before calling shrink_zone.
-                        */
-                       nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-                                                       order, sc.gfp_mask,
-                                                       &nr_soft_scanned);
-                       sc.nr_reclaimed += nr_soft_reclaimed;
-
-                       /*
-                        * There should be no need to raise the scanning
-                        * priority if enough pages are already being scanned
-                        * that that high watermark would be met at 100%
-                        * efficiency.
-                        */
-                       if (kswapd_shrink_zone(zone, end_zone, &sc))
-                               raise_priority = false;
-               }
+               if (kswapd_shrink_node(pgdat, &sc))
+                       raise_priority = false;
 
                /*
                 * If the low watermark is met there is no need for processes
@@ -3259,19 +3318,20 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 */
                if (raise_priority || !sc.nr_reclaimed)
                        sc.priority--;
-       } while (sc.priority >= 1 &&
-                       !pgdat_balanced(pgdat, order, classzone_idx));
+       } while (sc.priority >= 1);
 
 out:
        /*
-        * Return the highest zone idx we were reclaiming at so
-        * prepare_kswapd_sleep() makes the same decisions as here.
+        * Return the order kswapd stopped reclaiming at as
+        * prepare_kswapd_sleep() takes it into account. If another caller
+        * entered the allocator slow path while kswapd was awake, order will
+        * remain at the higher level.
         */
-       return end_zone;
+       return sc.order;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
-                               int classzone_idx, int balanced_classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
+                               unsigned int classzone_idx)
 {
        long remaining = 0;
        DEFINE_WAIT(wait);
@@ -3282,8 +3342,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
        /* Try to sleep for a short interval */
-       if (prepare_kswapd_sleep(pgdat, order, remaining,
-                                               balanced_classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                /*
                 * Compaction records what page blocks it recently failed to
                 * isolate pages from and skips them in the future scanning.
@@ -3296,9 +3355,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
                 * We have freed the memory, now we should compact it to make
                 * allocation of the requested order possible.
                 */
-               wakeup_kcompactd(pgdat, order, classzone_idx);
+               wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
 
                remaining = schedule_timeout(HZ/10);
+
+               /*
+                * If woken prematurely then reset kswapd_classzone_idx and
+                * order. The values will either be from a wakeup request or
+                * the previous request that slept prematurely.
+                */
+               if (remaining) {
+                       pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+                       pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
+               }
+
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
        }
@@ -3307,8 +3377,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
-       if (prepare_kswapd_sleep(pgdat, order, remaining,
-                                               balanced_classzone_idx)) {
+       if (!remaining &&
+           prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
                /*
@@ -3349,9 +3419,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
  */
 static int kswapd(void *p)
 {
-       unsigned long order, new_order;
-       int classzone_idx, new_classzone_idx;
-       int balanced_classzone_idx;
+       unsigned int alloc_order, reclaim_order, classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
 
@@ -3381,38 +3449,20 @@ static int kswapd(void *p)
        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
        set_freezable();
 
-       order = new_order = 0;
-       classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
-       balanced_classzone_idx = classzone_idx;
+       pgdat->kswapd_order = alloc_order = reclaim_order = 0;
+       pgdat->kswapd_classzone_idx = classzone_idx = 0;
        for ( ; ; ) {
                bool ret;
 
-               /*
-                * While we were reclaiming, there might have been another
-                * wakeup, so check the values.
-                */
-               new_order = pgdat->kswapd_max_order;
-               new_classzone_idx = pgdat->classzone_idx;
-               pgdat->kswapd_max_order =  0;
-               pgdat->classzone_idx = pgdat->nr_zones - 1;
+kswapd_try_sleep:
+               kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
+                                       classzone_idx);
 
-               if (order < new_order || classzone_idx > new_classzone_idx) {
-                       /*
-                        * Don't sleep if someone wants a larger 'order'
-                        * allocation or has tighter zone constraints
-                        */
-                       order = new_order;
-                       classzone_idx = new_classzone_idx;
-               } else {
-                       kswapd_try_to_sleep(pgdat, order, classzone_idx,
-                                               balanced_classzone_idx);
-                       order = pgdat->kswapd_max_order;
-                       classzone_idx = pgdat->classzone_idx;
-                       new_order = order;
-                       new_classzone_idx = classzone_idx;
-                       pgdat->kswapd_max_order = 0;
-                       pgdat->classzone_idx = pgdat->nr_zones - 1;
-               }
+               /* Read the new order and classzone_idx */
+               alloc_order = reclaim_order = pgdat->kswapd_order;
+               classzone_idx = pgdat->kswapd_classzone_idx;
+               pgdat->kswapd_order = 0;
+               pgdat->kswapd_classzone_idx = 0;
 
                ret = try_to_freeze();
                if (kthread_should_stop())
@@ -3422,11 +3472,25 @@ static int kswapd(void *p)
                 * We can speed up thawing tasks if we don't call balance_pgdat
                 * after returning from the refrigerator
                 */
-               if (!ret) {
-                       trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       balanced_classzone_idx = balance_pgdat(pgdat, order,
-                                                               classzone_idx);
-               }
+               if (ret)
+                       continue;
+
+               /*
+                * Reclaim begins at the requested order but if a high-order
+                * reclaim fails then kswapd falls back to reclaiming for
+                * order-0. If that happens, kswapd will consider sleeping
+                * for the order it finished reclaiming at (reclaim_order)
+                * but kcompactd is woken to compact for the original
+                * request (alloc_order).
+                */
+               trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+                                               alloc_order);
+               reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+               if (reclaim_order < alloc_order)
+                       goto kswapd_try_sleep;
+
+               alloc_order = reclaim_order = pgdat->kswapd_order;
+               classzone_idx = pgdat->kswapd_classzone_idx;
        }
 
        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3442,6 +3506,7 @@ static int kswapd(void *p)
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 {
        pg_data_t *pgdat;
+       int z;
 
        if (!populated_zone(zone))
                return;
@@ -3449,14 +3514,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
                return;
        pgdat = zone->zone_pgdat;
-       if (pgdat->kswapd_max_order < order) {
-               pgdat->kswapd_max_order = order;
-               pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
-       }
+       pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+       pgdat->kswapd_order = max(pgdat->kswapd_order, order);
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
-       if (zone_balanced(zone, order, true, 0, 0))
-               return;
+
+       /* Only wake kswapd if all zones are unbalanced */
+       for (z = 0; z <= classzone_idx; z++) {
+               zone = pgdat->node_zones + z;
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone_balanced(zone, order, classzone_idx))
+                       return;
+       }
 
        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
        wake_up_interruptible(&pgdat->kswapd_wait);
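
As a rough illustration of the wakeup rule added above: kswapd is now woken only when every populated zone up to and including the requested classzone_idx fails its watermark check, so a single balanced eligible zone suppresses the wakeup. A minimal userspace C sketch of that decision (fake_zone, should_wake_kswapd and the zone layout are invented for illustration; zone_balanced() itself is not modelled):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ZONES 4

struct fake_zone {
	bool populated;
	bool balanced;	/* stands in for zone_balanced(zone, order, classzone_idx) */
};

static bool should_wake_kswapd(const struct fake_zone *zones, int classzone_idx)
{
	for (int z = 0; z <= classzone_idx; z++) {
		if (!zones[z].populated)
			continue;
		if (zones[z].balanced)
			return false;	/* one balanced eligible zone skips the wakeup */
	}
	return true;		/* all eligible zones unbalanced: wake kswapd */
}

int main(void)
{
	struct fake_zone zones[MAX_ZONES] = {
		{ .populated = true,  .balanced = false },	/* e.g. DMA    */
		{ .populated = true,  .balanced = true  },	/* e.g. NORMAL */
		{ .populated = false },
		{ .populated = false },
	};

	/* NORMAL is balanced, so no wakeup for a classzone_idx of 1 */
	printf("wake kswapd? %s\n", should_wake_kswapd(zones, 1) ? "yes" : "no");
	return 0;
}
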
@@ -3477,6 +3548,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
        struct scan_control sc = {
                .nr_to_reclaim = nr_to_reclaim,
                .gfp_mask = GFP_HIGHUSER_MOVABLE,
+               .reclaim_idx = MAX_NR_ZONES - 1,
                .priority = DEF_PRIORITY,
                .may_writepage = 1,
                .may_unmap = 1,
@@ -3578,12 +3650,12 @@ module_init(kswapd_init)
 
 #ifdef CONFIG_NUMA
 /*
- * Zone reclaim mode
+ * Node reclaim mode
  *
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
  * the watermarks.
  */
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
 
 #define RECLAIM_OFF 0
 #define RECLAIM_ZONE (1<<0)    /* Run shrink_inactive_list on the zone */
@@ -3591,14 +3663,14 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
 
 /*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of
  * a zone.
  */
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
 
 /*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
  * occur.
  */
 int sysctl_min_unmapped_ratio = 1;
@@ -3609,11 +3681,11 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;
 
-static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
 {
-       unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
-       unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
-               zone_page_state(zone, NR_ACTIVE_FILE);
+       unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
+       unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
+               node_page_state(pgdat, NR_ACTIVE_FILE);
 
        /*
         * It's possible for there to be more file mapped pages than
@@ -3624,7 +3696,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
 }
 
 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
 {
        unsigned long nr_pagecache_reclaimable;
        unsigned long delta = 0;
@@ -3632,17 +3704,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
        /*
         * If RECLAIM_UNMAP is set, then all file pages are considered
         * potentially reclaimable. Otherwise, we have to worry about
-        * pages like swapcache and zone_unmapped_file_pages() provides
+        * pages like swapcache and node_unmapped_file_pages() provides
         * a better estimate
         */
-       if (zone_reclaim_mode & RECLAIM_UNMAP)
-               nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+       if (node_reclaim_mode & RECLAIM_UNMAP)
+               nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
        else
-               nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+               nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
 
        /* If we can't clean pages, remove dirty pages from consideration */
-       if (!(zone_reclaim_mode & RECLAIM_WRITE))
-               delta += zone_page_state(zone, NR_FILE_DIRTY);
+       if (!(node_reclaim_mode & RECLAIM_WRITE))
+               delta += node_page_state(pgdat, NR_FILE_DIRTY);
 
        /* Watch for any possible underflows due to delta */
        if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3652,22 +3724,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
 }
 
 /*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
  */
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 {
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
+       int classzone_idx = gfp_zone(gfp_mask);
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                .order = order,
-               .priority = ZONE_RECLAIM_PRIORITY,
-               .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-               .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+               .priority = NODE_RECLAIM_PRIORITY,
+               .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+               .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
                .may_swap = 1,
+               .reclaim_idx = classzone_idx,
        };
 
        cond_resched();
@@ -3681,13 +3755,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
-       if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+       if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink zone with increasing
                 * priorities until we have enough memory freed.
                 */
                do {
-                       shrink_zone(zone, &sc, true);
+                       shrink_node(pgdat, &sc);
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }
 
@@ -3697,49 +3771,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        return sc.nr_reclaimed >= nr_pages;
 }
 
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 {
-       int node_id;
        int ret;
 
        /*
-        * Zone reclaim reclaims unmapped file backed pages and
+        * Node reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
-        * thrown out if the zone is overallocated. So we do not reclaim
-        * if less than a specified percentage of the zone is used by
+        * thrown out if the node is overallocated. So we do not reclaim
+        * if less than a specified percentage of the node is used by
         * unmapped file backed pages.
         */
-       if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
-           zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
-               return ZONE_RECLAIM_FULL;
+       if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+           sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+               return NODE_RECLAIM_FULL;
 
-       if (!zone_reclaimable(zone))
-               return ZONE_RECLAIM_FULL;
+       if (!pgdat_reclaimable(pgdat))
+               return NODE_RECLAIM_FULL;
 
        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
-               return ZONE_RECLAIM_NOSCAN;
+               return NODE_RECLAIM_NOSCAN;
 
        /*
-        * Only run zone reclaim on the local zone or on zones that do not
+        * Only run node reclaim on the local node or on nodes that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
-       node_id = zone_to_nid(zone);
-       if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-               return ZONE_RECLAIM_NOSCAN;
+       if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+               return NODE_RECLAIM_NOSCAN;
 
-       if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
-               return ZONE_RECLAIM_NOSCAN;
+       if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+               return NODE_RECLAIM_NOSCAN;
 
-       ret = __zone_reclaim(zone, gfp_mask, order);
-       clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+       ret = __node_reclaim(pgdat, gfp_mask, order);
+       clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
 
        if (!ret)
                count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3778,24 +3850,23 @@ int page_evictable(struct page *page)
 void check_move_unevictable_pages(struct page **pages, int nr_pages)
 {
        struct lruvec *lruvec;
-       struct zone *zone = NULL;
+       struct pglist_data *pgdat = NULL;
        int pgscanned = 0;
        int pgrescued = 0;
        int i;
 
        for (i = 0; i < nr_pages; i++) {
                struct page *page = pages[i];
-               struct zone *pagezone;
+               struct pglist_data *pagepgdat = page_pgdat(page);
 
                pgscanned++;
-               pagezone = page_zone(page);
-               if (pagezone != zone) {
-                       if (zone)
-                               spin_unlock_irq(&zone->lru_lock);
-                       zone = pagezone;
-                       spin_lock_irq(&zone->lru_lock);
+               if (pagepgdat != pgdat) {
+                       if (pgdat)
+                               spin_unlock_irq(&pgdat->lru_lock);
+                       pgdat = pagepgdat;
+                       spin_lock_irq(&pgdat->lru_lock);
                }
-               lruvec = mem_cgroup_page_lruvec(page, zone);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
                if (!PageLRU(page) || !PageUnevictable(page))
                        continue;
@@ -3811,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
                }
        }
 
-       if (zone) {
+       if (pgdat) {
                __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
-               spin_unlock_irq(&zone->lru_lock);
+               spin_unlock_irq(&pgdat->lru_lock);
        }
 }
 #endif /* CONFIG_SHMEM */
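
For context on the NODE_RECLAIM_PRIORITY comment and the priority loop in __node_reclaim() above: each drop in sc.priority roughly doubles the fraction of the LRU examined per pass (the actual scan target is computed in get_scan_count(), outside these hunks, as approximately lru_size >> priority). A small illustrative C program with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long lru_size = 1UL << 20;	/* pretend 1M pages on the file LRU */

	/* NODE_RECLAIM_PRIORITY == 4 starts at ~1/16th of the list */
	for (int priority = 4; priority >= 0; priority--)
		printf("priority %d scans ~%lu pages\n",
		       priority, lru_size >> priority);
	return 0;
}
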
index 7997f52935c9b39a07ef67ef90de6bffffa3ab53..89cec42d19ffa8da5ad1e3c8e64ff4df1b3e562b 100644 (file)
@@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu)
  *
  * vm_stat contains the global counters
  */
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
-EXPORT_SYMBOL(vm_stat);
+atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_node_stat);
 
 #ifdef CONFIG_SMP
 
@@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone)
  */
 void refresh_zone_stat_thresholds(void)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int cpu;
        int threshold;
 
+       /* Zero current pgdat thresholds */
+       for_each_online_pgdat(pgdat) {
+               for_each_online_cpu(cpu) {
+                       per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
+               }
+       }
+
        for_each_populated_zone(zone) {
+               struct pglist_data *pgdat = zone->zone_pgdat;
                unsigned long max_drift, tolerate_drift;
 
                threshold = calculate_normal_threshold(zone);
 
-               for_each_online_cpu(cpu)
+               for_each_online_cpu(cpu) {
+                       int pgdat_threshold;
+
                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
                                                        = threshold;
 
+                       /* Base nodestat threshold on the largest populated zone. */
+                       pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
+                       per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
+                               = max(threshold, pgdat_threshold);
+               }
+
                /*
                 * Only set percpu_drift_mark if there is a danger that
                 * NR_FREE_PAGES reports the low watermark is ok when in fact
@@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                               long delta)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       long x;
+       long t;
+
+       x = delta + __this_cpu_read(*p);
+
+       t = __this_cpu_read(pcp->stat_threshold);
+
+       if (unlikely(x > t || x < -t)) {
+               node_page_state_add(x, pgdat, item);
+               x = 0;
+       }
+       __this_cpu_write(*p, x);
+}
+EXPORT_SYMBOL(__mod_node_page_state);
+
 /*
  * Optimized increment and decrement functions.
  *
@@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
        }
 }
 
+void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       s8 v, t;
+
+       v = __this_cpu_inc_return(*p);
+       t = __this_cpu_read(pcp->stat_threshold);
+       if (unlikely(v > t)) {
+               s8 overstep = t >> 1;
+
+               node_page_state_add(v + overstep, pgdat, item);
+               __this_cpu_write(*p, -overstep);
+       }
+}
+
 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
        __inc_zone_state(page_zone(page), item);
 }
 EXPORT_SYMBOL(__inc_zone_page_state);
 
+void __inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+       __inc_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__inc_node_page_state);
+
 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
        struct per_cpu_pageset __percpu *pcp = zone->pageset;
@@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
        }
 }
 
+void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       s8 v, t;
+
+       v = __this_cpu_dec_return(*p);
+       t = __this_cpu_read(pcp->stat_threshold);
+       if (unlikely(v < - t)) {
+               s8 overstep = t >> 1;
+
+               node_page_state_add(v - overstep, pgdat, item);
+               __this_cpu_write(*p, overstep);
+       }
+}
+
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
        __dec_zone_state(page_zone(page), item);
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
+void __dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+       __dec_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__dec_node_page_state);
+
 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
 /*
  * If we have cmpxchg_local support then we do not need to incur the overhead
@@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state);
  *     1       Overstepping half of threshold
  *     -1      Overstepping minus half of threshold
 */
-static inline void mod_state(struct zone *zone, enum zone_stat_item item,
-                            long delta, int overstep_mode)
+static inline void mod_zone_state(struct zone *zone,
+       enum zone_stat_item item, long delta, int overstep_mode)
 {
        struct per_cpu_pageset __percpu *pcp = zone->pageset;
        s8 __percpu *p = pcp->vm_stat_diff + item;
@@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item,
 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                         long delta)
 {
-       mod_state(zone, item, delta, 0);
+       mod_zone_state(zone, item, delta, 0);
 }
 EXPORT_SYMBOL(mod_zone_page_state);
 
-void inc_zone_state(struct zone *zone, enum zone_stat_item item)
-{
-       mod_state(zone, item, 1, 1);
-}
-
 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-       mod_state(page_zone(page), item, 1, 1);
+       mod_zone_state(page_zone(page), item, 1, 1);
 }
 EXPORT_SYMBOL(inc_zone_page_state);
 
 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-       mod_state(page_zone(page), item, -1, -1);
+       mod_zone_state(page_zone(page), item, -1, -1);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+
+static inline void mod_node_state(struct pglist_data *pgdat,
+       enum node_stat_item item, int delta, int overstep_mode)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       long o, n, t, z;
+
+       do {
+               z = 0;  /* overflow to node counters */
+
+               /*
+                * The fetching of the stat_threshold is racy. We may apply
+                * a counter threshold to the wrong the cpu if we get
+                * a counter threshold to the wrong cpu if we get
+                * counter update will apply the threshold again and
+                * therefore bring the counter under the threshold again.
+                *
+                * Most of the time the thresholds are the same anyways
+                * for all cpus in a node.
+                */
+               t = this_cpu_read(pcp->stat_threshold);
+
+               o = this_cpu_read(*p);
+               n = delta + o;
+
+               if (n > t || n < -t) {
+                       int os = overstep_mode * (t >> 1) ;
+
+                       /* Overflow must be added to node counters */
+                       z = n + os;
+                       n = -os;
+               }
+       } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+       if (z)
+               node_page_state_add(z, pgdat, item);
+}
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                       long delta)
+{
+       mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       mod_node_state(pgdat, item, 1, 1);
+}
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+       mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+       mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_node_page_state);
 #else
 /*
  * Use interrupt disable to serialize counter updates
@@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 }
 EXPORT_SYMBOL(mod_zone_page_state);
 
-void inc_zone_state(struct zone *zone, enum zone_stat_item item)
-{
-       unsigned long flags;
-
-       local_irq_save(flags);
-       __inc_zone_state(zone, item);
-       local_irq_restore(flags);
-}
-
 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
        unsigned long flags;
@@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
-#endif
 
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __inc_node_state(pgdat, item);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_state);
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                       long delta)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __mod_node_page_state(pgdat, item, delta);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+       unsigned long flags;
+       struct pglist_data *pgdat;
+
+       pgdat = page_pgdat(page);
+       local_irq_save(flags);
+       __inc_node_state(pgdat, item);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __dec_node_page_state(page, item);
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_node_page_state);
+#endif
 
 /*
  * Fold a differential into the global counters.
  * Returns the number of counters updated.
  */
-static int fold_diff(int *diff)
+static int fold_diff(int *zone_diff, int *node_diff)
 {
        int i;
        int changes = 0;
 
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-               if (diff[i]) {
-                       atomic_long_add(diff[i], &vm_stat[i]);
+               if (zone_diff[i]) {
+                       atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+                       changes++;
+       }
+
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+               if (node_diff[i]) {
+                       atomic_long_add(node_diff[i], &vm_node_stat[i]);
                        changes++;
        }
        return changes;
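
The per-node counters added in this file follow the same per-cpu differential scheme as the existing zone counters: __mod_node_page_state() accumulates small deltas in a per-cpu diff and only folds them into the shared atomic once the stat_threshold is crossed, and fold_diff()/refresh_cpu_vm_stats() sweep up whatever is left. A simplified single-threaded userspace model of that behaviour (node_counter, mod_node_stat and the threshold value are invented for illustration):

#include <stdio.h>

struct node_counter {
	long global;	/* stands in for pgdat->vm_stat[item] */
	long cpu_diff;	/* stands in for one cpu's vm_node_stat_diff[item] */
	long threshold;	/* stands in for pcp->stat_threshold */
};

static void mod_node_stat(struct node_counter *c, long delta)
{
	long x = c->cpu_diff + delta;

	if (x > c->threshold || x < -c->threshold) {
		c->global += x;		/* fold the whole diff into the global counter */
		x = 0;
	}
	c->cpu_diff = x;
}

int main(void)
{
	struct node_counter c = { .threshold = 32 };

	for (int i = 0; i < 100; i++)
		mod_node_stat(&c, 1);

	/* the global counter lags by whatever is still parked in cpu_diff */
	printf("global=%ld cpu_diff=%ld\n", c.global, c.cpu_diff);
	return 0;
}
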
@@ -462,9 +641,11 @@ static int fold_diff(int *diff)
  */
 static int refresh_cpu_vm_stats(bool do_pagesets)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int i;
-       int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
        int changes = 0;
 
        for_each_populated_zone(zone) {
@@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
                        if (v) {
 
                                atomic_long_add(v, &zone->vm_stat[i]);
-                               global_diff[i] += v;
+                               global_zone_diff[i] += v;
 #ifdef CONFIG_NUMA
                                /* 3 seconds idle till flush */
                                __this_cpu_write(p->expire, 3);
@@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
                }
 #endif
        }
-       changes += fold_diff(global_diff);
+
+       for_each_online_pgdat(pgdat) {
+               struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
+
+               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+                       int v;
+
+                       v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
+                       if (v) {
+                               atomic_long_add(v, &pgdat->vm_stat[i]);
+                               global_node_diff[i] += v;
+                       }
+               }
+       }
+
+       changes += fold_diff(global_zone_diff, global_node_diff);
        return changes;
 }
 
@@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
  */
 void cpu_vm_stats_fold(int cpu)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int i;
-       int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+       int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 
        for_each_populated_zone(zone) {
                struct per_cpu_pageset *p;
@@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu)
                                v = p->vm_stat_diff[i];
                                p->vm_stat_diff[i] = 0;
                                atomic_long_add(v, &zone->vm_stat[i]);
-                               global_diff[i] += v;
+                               global_zone_diff[i] += v;
                        }
        }
 
-       fold_diff(global_diff);
+       for_each_online_pgdat(pgdat) {
+               struct per_cpu_nodestat *p;
+
+               p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+
+               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                       if (p->vm_node_stat_diff[i]) {
+                               int v;
+
+                               v = p->vm_node_stat_diff[i];
+                               p->vm_node_stat_diff[i] = 0;
+                               atomic_long_add(v, &pgdat->vm_stat[i]);
+                               global_node_diff[i] += v;
+                       }
+       }
+
+       fold_diff(global_zone_diff, global_node_diff);
 }
 
 /*
@@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
                        int v = pset->vm_stat_diff[i];
                        pset->vm_stat_diff[i] = 0;
                        atomic_long_add(v, &zone->vm_stat[i]);
-                       atomic_long_add(v, &vm_stat[i]);
+                       atomic_long_add(v, &vm_zone_stat[i]);
                }
 }
 #endif
 
 #ifdef CONFIG_NUMA
 /*
- * Determine the per node value of a stat item.
+ * Determine the per node value of a stat item. This function
+ * is called frequently in a NUMA machine, so try to be as
+ * frugal as possible.
  */
-unsigned long node_page_state(int node, enum zone_stat_item item)
+unsigned long sum_zone_node_page_state(int node,
+                                enum zone_stat_item item)
 {
        struct zone *zones = NODE_DATA(node)->node_zones;
        int i;
@@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item)
        return count;
 }
 
+/*
+ * Determine the per node value of a stat item.
+ */
+unsigned long node_page_state(struct pglist_data *pgdat,
+                               enum node_stat_item item)
+{
+       long x = atomic_long_read(&pgdat->vm_stat[item]);
+#ifdef CONFIG_SMP
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
+}
 #endif
 
 #ifdef CONFIG_COMPACTION
@@ -691,33 +921,18 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 const char * const vmstat_text[] = {
        /* enum zone_stat_item counters */
        "nr_free_pages",
-       "nr_alloc_batch",
-       "nr_inactive_anon",
-       "nr_active_anon",
-       "nr_inactive_file",
-       "nr_active_file",
-       "nr_unevictable",
+       "nr_zone_inactive_anon",
+       "nr_zone_active_anon",
+       "nr_zone_inactive_file",
+       "nr_zone_active_file",
+       "nr_zone_unevictable",
+       "nr_zone_write_pending",
        "nr_mlock",
-       "nr_anon_pages",
-       "nr_mapped",
-       "nr_file_pages",
-       "nr_dirty",
-       "nr_writeback",
        "nr_slab_reclaimable",
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_kernel_stack",
-       "nr_unstable",
        "nr_bounce",
-       "nr_vmscan_write",
-       "nr_vmscan_immediate_reclaim",
-       "nr_writeback_temp",
-       "nr_isolated_anon",
-       "nr_isolated_file",
-       "nr_shmem",
-       "nr_dirtied",
-       "nr_written",
-       "nr_pages_scanned",
 #if IS_ENABLED(CONFIG_ZSMALLOC)
        "nr_zspages",
 #endif
@@ -729,13 +944,35 @@ const char * const vmstat_text[] = {
        "numa_local",
        "numa_other",
 #endif
+       "nr_free_cma",
+
+       /* Node-based counters */
+       "nr_inactive_anon",
+       "nr_active_anon",
+       "nr_inactive_file",
+       "nr_active_file",
+       "nr_unevictable",
+       "nr_isolated_anon",
+       "nr_isolated_file",
+       "nr_pages_scanned",
        "workingset_refault",
        "workingset_activate",
        "workingset_nodereclaim",
-       "nr_anon_transparent_hugepages",
+       "nr_anon_pages",
+       "nr_mapped",
+       "nr_file_pages",
+       "nr_dirty",
+       "nr_writeback",
+       "nr_writeback_temp",
+       "nr_shmem",
        "nr_shmem_hugepages",
        "nr_shmem_pmdmapped",
-       "nr_free_cma",
+       "nr_anon_transparent_hugepages",
+       "nr_unstable",
+       "nr_vmscan_write",
+       "nr_vmscan_immediate_reclaim",
+       "nr_dirtied",
+       "nr_written",
 
        /* enum writeback_stat_item counters */
        "nr_dirty_threshold",
@@ -749,6 +986,8 @@ const char * const vmstat_text[] = {
        "pswpout",
 
        TEXTS_FOR_ZONES("pgalloc")
+       TEXTS_FOR_ZONES("allocstall")
+       TEXTS_FOR_ZONES("pgskip")
 
        "pgfree",
        "pgactivate",
@@ -758,11 +997,11 @@ const char * const vmstat_text[] = {
        "pgmajfault",
        "pglazyfreed",
 
-       TEXTS_FOR_ZONES("pgrefill")
-       TEXTS_FOR_ZONES("pgsteal_kswapd")
-       TEXTS_FOR_ZONES("pgsteal_direct")
-       TEXTS_FOR_ZONES("pgscan_kswapd")
-       TEXTS_FOR_ZONES("pgscan_direct")
+       "pgrefill",
+       "pgsteal_kswapd",
+       "pgsteal_direct",
+       "pgscan_kswapd",
+       "pgscan_direct",
        "pgscan_direct_throttle",
 
 #ifdef CONFIG_NUMA
@@ -774,7 +1013,6 @@ const char * const vmstat_text[] = {
        "kswapd_low_wmark_hit_quickly",
        "kswapd_high_wmark_hit_quickly",
        "pageoutrun",
-       "allocstall",
 
        "pgrotated",
 
@@ -1180,17 +1418,41 @@ static const struct file_operations pagetypeinfo_file_ops = {
        .release        = seq_release,
 };
 
+static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
+{
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               struct zone *compare = &pgdat->node_zones[zid];
+
+               if (populated_zone(compare))
+                       return zone == compare;
+       }
+
+       /* The zone must be somewhere! */
+       WARN_ON_ONCE(1);
+       return false;
+}
+
 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                                                        struct zone *zone)
 {
        int i;
        seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+       if (is_zone_first_populated(pgdat, zone)) {
+               seq_printf(m, "\n  per-node stats");
+               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+                       seq_printf(m, "\n      %-12s %lu",
+                               vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+                               node_page_state(pgdat, i));
+               }
+       }
        seq_printf(m,
                   "\n  pages free     %lu"
                   "\n        min      %lu"
                   "\n        low      %lu"
                   "\n        high     %lu"
-                  "\n        scanned  %lu"
+                  "\n   node_scanned  %lu"
                   "\n        spanned  %lu"
                   "\n        present  %lu"
                   "\n        managed  %lu",
@@ -1198,13 +1460,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   min_wmark_pages(zone),
                   low_wmark_pages(zone),
                   high_wmark_pages(zone),
-                  zone_page_state(zone, NR_PAGES_SCANNED),
+                  node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
                   zone->spanned_pages,
                   zone->present_pages,
                   zone->managed_pages);
 
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-               seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
+               seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
                                zone_page_state(zone, i));
 
        seq_printf(m,
@@ -1234,12 +1496,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 #endif
        }
        seq_printf(m,
-                  "\n  all_unreclaimable: %u"
-                  "\n  start_pfn:         %lu"
-                  "\n  inactive_ratio:    %u",
-                  !zone_reclaimable(zone),
+                  "\n  node_unreclaimable:  %u"
+                  "\n  start_pfn:           %lu"
+                  "\n  node_inactive_ratio: %u",
+                  !pgdat_reclaimable(zone->zone_pgdat),
                   zone->zone_start_pfn,
-                  zone->inactive_ratio);
+                  zone->zone_pgdat->inactive_ratio);
        seq_putc(m, '\n');
 }
 
@@ -1287,6 +1549,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
        if (*pos >= ARRAY_SIZE(vmstat_text))
                return NULL;
        stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+                         NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
                          NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
@@ -1301,6 +1564,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
                v[i] = global_page_state(i);
        v += NR_VM_ZONE_STAT_ITEMS;
 
+       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+               v[i] = global_node_page_state(i);
+       v += NR_VM_NODE_STAT_ITEMS;
+
        global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
                            v + NR_DIRTY_THRESHOLD);
        v += NR_VM_WRITEBACK_STAT_ITEMS;
@@ -1325,7 +1592,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
 {
        unsigned long *l = arg;
        unsigned long off = l - (unsigned long *)m->private;
-
        seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
        return 0;
 }
@@ -1390,13 +1656,12 @@ int vmstat_refresh(struct ctl_table *table, int write,
        if (err)
                return err;
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
-               val = atomic_long_read(&vm_stat[i]);
+               val = atomic_long_read(&vm_zone_stat[i]);
                if (val < 0) {
                        switch (i) {
-                       case NR_ALLOC_BATCH:
                        case NR_PAGES_SCANNED:
                                /*
-                                * These are often seen to go negative in
+                                * This is often seen to go negative in
                                 * recent kernels, but not to go permanently
                                 * negative.  Whilst it would be nicer not to
                                 * have exceptions, rooting them out would be
index 577277546d985db41aee898a3c69b1b0488d289b..69551cfae97bd50bece343da57e2d12a79c2b74f 100644 (file)
@@ -16,7 +16,7 @@
 /*
  *             Double CLOCK lists
  *
- * Per zone, two clock lists are maintained for file pages: the
+ * Per node, two clock lists are maintained for file pages: the
  * inactive and the active list.  Freshly faulted pages start out at
  * the head of the inactive list and page reclaim scans pages from the
  * tail.  Pages that are accessed multiple times on the inactive list
  *
  *             Implementation
  *
- * For each zone's file LRU lists, a counter for inactive evictions
- * and activations is maintained (zone->inactive_age).
+ * For each node's file LRU lists, a counter for inactive evictions
+ * and activations is maintained (node->inactive_age).
  *
  * On eviction, a snapshot of this counter (along with some bits to
- * identify the zone) is stored in the now empty page cache radix tree
+ * identify the node) is stored in the now empty page cache radix tree
  * slot of the evicted page.  This is called a shadow entry.
  *
  * On cache misses for which there are shadow entries, an eligible
  */
 
 #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
-                        ZONES_SHIFT + NODES_SHIFT +    \
+                        NODES_SHIFT +  \
                         MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
 
  */
 static unsigned int bucket_order __read_mostly;
 
-static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
 {
        eviction >>= bucket_order;
        eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
-       eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
-       eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
+       eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
        eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
 
        return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
-static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
+static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
                          unsigned long *evictionp)
 {
        unsigned long entry = (unsigned long)shadow;
-       int memcgid, nid, zid;
+       int memcgid, nid;
 
        entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
-       zid = entry & ((1UL << ZONES_SHIFT) - 1);
-       entry >>= ZONES_SHIFT;
        nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
        memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
        entry >>= MEM_CGROUP_ID_SHIFT;
 
        *memcgidp = memcgid;
-       *zonep = NODE_DATA(nid)->node_zones + zid;
+       *pgdat = NODE_DATA(nid);
        *evictionp = entry << bucket_order;
 }
 
@@ -208,7 +205,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
 void *workingset_eviction(struct address_space *mapping, struct page *page)
 {
        struct mem_cgroup *memcg = page_memcg(page);
-       struct zone *zone = page_zone(page);
+       struct pglist_data *pgdat = page_pgdat(page);
        int memcgid = mem_cgroup_id(memcg);
        unsigned long eviction;
        struct lruvec *lruvec;
@@ -218,9 +215,9 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
        VM_BUG_ON_PAGE(page_count(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
 
-       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       lruvec = mem_cgroup_lruvec(pgdat, memcg);
        eviction = atomic_long_inc_return(&lruvec->inactive_age);
-       return pack_shadow(memcgid, zone, eviction);
+       return pack_shadow(memcgid, pgdat, eviction);
 }
 
 /**
@@ -228,7 +225,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
  * @shadow: shadow entry of the evicted page
  *
  * Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the zone it was allocated in.
+ * evicted page in the context of the node it was allocated in.
  *
  * Returns %true if the page should be activated, %false otherwise.
  */
@@ -240,10 +237,10 @@ bool workingset_refault(void *shadow)
        unsigned long eviction;
        struct lruvec *lruvec;
        unsigned long refault;
-       struct zone *zone;
+       struct pglist_data *pgdat;
        int memcgid;
 
-       unpack_shadow(shadow, &memcgid, &zone, &eviction);
+       unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
 
        rcu_read_lock();
        /*
@@ -267,7 +264,7 @@ bool workingset_refault(void *shadow)
                rcu_read_unlock();
                return false;
        }
-       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       lruvec = mem_cgroup_lruvec(pgdat, memcg);
        refault = atomic_long_read(&lruvec->inactive_age);
        active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
        rcu_read_unlock();
@@ -290,10 +287,10 @@ bool workingset_refault(void *shadow)
         */
        refault_distance = (refault - eviction) & EVICTION_MASK;
 
-       inc_zone_state(zone, WORKINGSET_REFAULT);
+       inc_node_state(pgdat, WORKINGSET_REFAULT);
 
        if (refault_distance <= active_file) {
-               inc_zone_state(zone, WORKINGSET_ACTIVATE);
+               inc_node_state(pgdat, WORKINGSET_ACTIVATE);
                return true;
        }
        return false;
@@ -305,9 +302,10 @@ bool workingset_refault(void *shadow)
  */
 void workingset_activation(struct page *page)
 {
+       struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
-       lock_page_memcg(page);
+       rcu_read_lock();
        /*
         * Filter non-memcg pages here, e.g. unmap can call
         * mark_page_accessed() on VDSO pages.
@@ -315,12 +313,13 @@ void workingset_activation(struct page *page)
         * XXX: See workingset_refault() - this should return
         * root_mem_cgroup even for !CONFIG_MEMCG.
         */
-       if (!mem_cgroup_disabled() && !page_memcg(page))
+       memcg = page_memcg_rcu(page);
+       if (!mem_cgroup_disabled() && !memcg)
                goto out;
-       lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+       lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
        atomic_long_inc(&lruvec->inactive_age);
 out:
-       unlock_page_memcg(page);
+       rcu_read_unlock();
 }
 
 /*
@@ -349,12 +348,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
        shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
        local_irq_enable();
 
-       if (memcg_kmem_enabled())
+       if (memcg_kmem_enabled()) {
                pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
                                                     LRU_ALL_FILE);
-       else
-               pages = node_page_state(sc->nid, NR_ACTIVE_FILE) +
-                       node_page_state(sc->nid, NR_INACTIVE_FILE);
+       } else {
+               pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
+                       node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
+       }
 
        /*
         * Active cache pages are limited to 50% of memory, and shadow
@@ -433,7 +433,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
                }
        }
        BUG_ON(node->count);
-       inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+       inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
        if (!__radix_tree_delete_node(&mapping->page_tree, node))
                BUG();
 
index 04176de6df705510b1da24ea6001174cbe1b46d8..b0bc023d25c539e05b5a874b65dcf575a136f271 100644 (file)
@@ -20,6 +20,7 @@
  *     page->freelist(index): links together all component pages of a zspage
  *             For the huge page, this is always 0, so we use this field
  *             to store handle.
+ *     page->units: first object offset in a subpage of zspage
  *
  * Usage of struct page flags:
  *     PG_private: identifies the first component page
  */
 #define ZS_SIZE_CLASS_DELTA    (PAGE_SIZE >> CLASS_BITS)
 
-/*
- * We do not maintain any list for completely empty or full pages
- */
 enum fullness_group {
        ZS_EMPTY,
        ZS_ALMOST_EMPTY,
@@ -467,11 +465,6 @@ static struct zpool_driver zs_zpool_driver = {
 MODULE_ALIAS("zpool-zsmalloc");
 #endif /* CONFIG_ZPOOL */
 
-static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
-{
-       return pages_per_zspage * PAGE_SIZE / size;
-}
-
 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
 
@@ -635,8 +628,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
                freeable = zs_can_compact(class);
                spin_unlock(&class->lock);
 
-               objs_per_zspage = get_maxobj_per_zspage(class->size,
-                               class->pages_per_zspage);
+               objs_per_zspage = class->objs_per_zspage;
                pages_used = obj_allocated / objs_per_zspage *
                                class->pages_per_zspage;
 
@@ -945,8 +937,8 @@ static void unpin_tag(unsigned long handle)
 static void reset_page(struct page *page)
 {
        __ClearPageMovable(page);
-       clear_bit(PG_private, &page->flags);
-       clear_bit(PG_private_2, &page->flags);
+       ClearPagePrivate(page);
+       ClearPagePrivate2(page);
        set_page_private(page, 0);
        page_mapcount_reset(page);
        ClearPageHugeObject(page);
@@ -1014,8 +1006,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
 
        cache_free_zspage(pool, zspage);
 
-       zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
-                       class->size, class->pages_per_zspage));
+       zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
        atomic_long_sub(class->pages_per_zspage,
                                        &pool->pages_allocated);
 }
@@ -1350,7 +1341,7 @@ static void zs_unregister_cpu_notifier(void)
        cpu_notifier_register_done();
 }
 
-static void init_zs_size_classes(void)
+static void __init init_zs_size_classes(void)
 {
        int nr;
 
@@ -1361,16 +1352,14 @@ static void init_zs_size_classes(void)
        zs_size_classes = nr;
 }
 
-static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
+static bool can_merge(struct size_class *prev, int pages_per_zspage,
+                                       int objs_per_zspage)
 {
-       if (prev->pages_per_zspage != pages_per_zspage)
-               return false;
+       if (prev->pages_per_zspage == pages_per_zspage &&
+               prev->objs_per_zspage == objs_per_zspage)
+               return true;
 
-       if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
-               != get_maxobj_per_zspage(size, pages_per_zspage))
-               return false;
-
-       return true;
+       return false;
 }
 
 static bool zspage_full(struct size_class *class, struct zspage *zspage)
@@ -1541,6 +1530,7 @@ static unsigned long obj_malloc(struct size_class *class,
  * zs_malloc - Allocate block of given size from pool.
  * @pool: pool to allocate from
  * @size: size of block to allocate
+ * @gfp: gfp flags when allocating object
  *
  * On success, handle to the allocated object is returned,
  * otherwise 0.
@@ -1592,8 +1582,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
        record_obj(handle, obj);
        atomic_long_add(class->pages_per_zspage,
                                &pool->pages_allocated);
-       zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
-                       class->size, class->pages_per_zspage));
+       zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
 
        /* We completely set up zspage so mark them as movable */
        SetZsPageMovable(pool, zspage);
@@ -1741,10 +1730,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
  * return handle.
  */
 static unsigned long find_alloced_obj(struct size_class *class,
-                                       struct page *page, int index)
+                                       struct page *page, int *obj_idx)
 {
        unsigned long head;
        int offset = 0;
+       int index = *obj_idx;
        unsigned long handle = 0;
        void *addr = kmap_atomic(page);
 
@@ -1765,6 +1755,9 @@ static unsigned long find_alloced_obj(struct size_class *class,
        }
 
        kunmap_atomic(addr);
+
+       *obj_idx = index;
+
        return handle;
 }
 
@@ -1776,7 +1769,7 @@ struct zs_compact_control {
        struct page *d_page;
         /* Starting object index within @s_page which used for live object
          * in the subpage. */
-       int index;
+       int obj_idx;
 };
 
 static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1786,16 +1779,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
        unsigned long handle;
        struct page *s_page = cc->s_page;
        struct page *d_page = cc->d_page;
-       unsigned long index = cc->index;
+       int obj_idx = cc->obj_idx;
        int ret = 0;
 
        while (1) {
-               handle = find_alloced_obj(class, s_page, index);
+               handle = find_alloced_obj(class, s_page, &obj_idx);
                if (!handle) {
                        s_page = get_next_page(s_page);
                        if (!s_page)
                                break;
-                       index = 0;
+                       obj_idx = 0;
                        continue;
                }
 
@@ -1809,7 +1802,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
                used_obj = handle_to_obj(handle);
                free_obj = obj_malloc(class, get_zspage(d_page), handle);
                zs_object_copy(class, free_obj, used_obj);
-               index++;
+               obj_idx++;
                /*
                 * record_obj updates handle's value to free_obj and it will
                 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
@@ -1824,7 +1817,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 
        /* Remember last position in this iteration */
        cc->s_page = s_page;
-       cc->index = index;
+       cc->obj_idx = obj_idx;
 
        return ret;
 }
@@ -2181,8 +2174,7 @@ static int zs_register_migration(struct zs_pool *pool)
 static void zs_unregister_migration(struct zs_pool *pool)
 {
        flush_work(&pool->free_work);
-       if (pool->inode)
-               iput(pool->inode);
+       iput(pool->inode);
 }
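
The dropped check is safe because iput() itself ignores a NULL inode; a hedged sketch of that VFS behaviour (paraphrased, not copied from fs/inode.c):

void iput(struct inode *inode)
{
        if (!inode)
                return;         /* NULL is tolerated, callers need no guard */
        /* ... drop the last reference, possibly evicting the inode ... */
}

So the unconditional call in zs_unregister_migration() behaves identically whether or not the pool inode was ever set up.
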
 
 /*
@@ -2261,8 +2253,7 @@ static unsigned long zs_can_compact(struct size_class *class)
                return 0;
 
        obj_wasted = obj_allocated - obj_used;
-       obj_wasted /= get_maxobj_per_zspage(class->size,
-                       class->pages_per_zspage);
+       obj_wasted /= class->objs_per_zspage;
 
        return obj_wasted * class->pages_per_zspage;
 }
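
Put numerically (hypothetical figures, purely for illustration): with obj_allocated = 100 and obj_used = 62, obj_wasted is 38 stale slots; for a class with objs_per_zspage = 5 and pages_per_zspage = 4, integer division gives 38 / 5 = 7 zspages that could be emptied, so zs_can_compact() reports 7 * 4 = 28 reclaimable pages.
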
@@ -2279,7 +2270,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
                if (!zs_can_compact(class))
                        break;
 
-               cc.index = 0;
+               cc.obj_idx = 0;
                cc.s_page = get_first_page(src_zspage);
 
                while ((dst_zspage = isolate_zspage(class, false))) {
@@ -2398,7 +2389,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
 
 /**
  * zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
+ * @name: pool name to be created
  *
  * This function must be called before anything when using
  * the zsmalloc allocator.
@@ -2438,6 +2429,7 @@ struct zs_pool *zs_create_pool(const char *name)
        for (i = zs_size_classes - 1; i >= 0; i--) {
                int size;
                int pages_per_zspage;
+               int objs_per_zspage;
                struct size_class *class;
                int fullness = 0;
 
@@ -2445,6 +2437,7 @@ struct zs_pool *zs_create_pool(const char *name)
                if (size > ZS_MAX_ALLOC_SIZE)
                        size = ZS_MAX_ALLOC_SIZE;
                pages_per_zspage = get_pages_per_zspage(size);
+               objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
 
                /*
                 * size_class is used for normal zsmalloc operation such
@@ -2456,7 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name)
                 * previous size_class if possible.
                 */
                if (prev_class) {
-                       if (can_merge(prev_class, size, pages_per_zspage)) {
+                       if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
                                pool->size_class[i] = prev_class;
                                continue;
                        }
@@ -2469,8 +2462,7 @@ struct zs_pool *zs_create_pool(const char *name)
                class->size = size;
                class->index = i;
                class->pages_per_zspage = pages_per_zspage;
-               class->objs_per_zspage = class->pages_per_zspage *
-                                               PAGE_SIZE / class->size;
+               class->objs_per_zspage = objs_per_zspage;
                spin_lock_init(&class->lock);
                pool->size_class[i] = class;
                for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
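
With objs_per_zspage now computed once here and cached in struct size_class, can_merge() above reduces to comparing the cached geometry of neighbouring classes. As a hypothetical worked example (assuming PAGE_SIZE = 4096 and that get_pages_per_zspage() picks 4 pages for both sizes): classes of size 3104 and 3120 each hold 16384 / size = 5 objects per zspage, so they collapse into one shared struct size_class, while any pair whose division yields different object counts stays separate.
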
index b1d491c2e7047c5ba7ef1dac4f86db697f8a4c19..fdde1bd3e3062bbf266ef165eb4dce8512f4d9b3 100644 (file)
@@ -608,6 +608,7 @@ static const struct {
        const char *compact;
 } gfp_compact_table[] = {
        { "GFP_TRANSHUGE",              "THP" },
+       { "GFP_TRANSHUGE_LIGHT",        "THL" },
        { "GFP_HIGHUSER_MOVABLE",       "HUM" },
        { "GFP_HIGHUSER",               "HU" },
        { "GFP_USER",                   "U" },