git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'akpm-current/current'
author Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 2 Nov 2015 03:45:18 +0000 (14:45 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 2 Nov 2015 03:45:26 +0000 (14:45 +1100)
386 files changed:
Documentation/features/vm/pmdp_splitting_flush/arch-support.txt [deleted file]
Documentation/filesystems/proc.txt
Documentation/filesystems/vfat.txt
Documentation/kasan.txt
Documentation/kernel-parameters.txt
Documentation/lockup-watchdogs.txt
Documentation/printk-formats.txt
Documentation/sysctl/kernel.txt
Documentation/vm/balance
Documentation/vm/page_migration
Documentation/vm/split_page_table_lock
Documentation/vm/transhuge.txt
Documentation/vm/unevictable-lru.txt
MAINTAINERS
arch/alpha/include/uapi/asm/mman.h
arch/arc/mm/cache.c
arch/arm/include/asm/pgtable-3level.h
arch/arm/lib/uaccess_with_memcpy.c
arch/arm/mm/alignment.c
arch/arm/mm/dma-mapping.c
arch/arm/mm/flush.c
arch/arm/xen/mm.c
arch/arm64/include/asm/pgtable.h
arch/arm64/mm/dma-mapping.c
arch/arm64/mm/flush.c
arch/mips/include/asm/pgtable-bits.h
arch/mips/include/asm/pgtable.h
arch/mips/include/uapi/asm/mman.h
arch/mips/mm/c-r4k.c
arch/mips/mm/cache.c
arch/mips/mm/gup.c
arch/mips/mm/init.c
arch/mips/mm/pgtable-64.c
arch/mips/mm/tlbex.c
arch/parisc/include/uapi/asm/mman.h
arch/powerpc/include/asm/pgtable-ppc64.h
arch/powerpc/include/uapi/asm/mman.h
arch/powerpc/mm/hugepage-hash64.c
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/numa.c
arch/powerpc/mm/pgtable_64.c
arch/powerpc/mm/subpage-prot.c
arch/powerpc/sysdev/fsl_pci.c
arch/s390/include/asm/pgtable.h
arch/s390/mm/gup.c
arch/s390/mm/pgtable.c
arch/sh/kernel/cpu/sh5/unwind.c
arch/sh/kernel/traps_64.c
arch/sh/mm/cache-sh4.c
arch/sh/mm/cache.c
arch/sparc/include/asm/pgtable_64.h
arch/sparc/include/uapi/asm/mman.h
arch/sparc/mm/fault_64.c
arch/sparc/mm/gup.c
arch/tile/include/asm/pgtable.h
arch/tile/include/uapi/asm/mman.h
arch/x86/Kconfig
arch/x86/boot/Makefile
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_types.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/cpu/perf_event_msr.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/vm86_32.c
arch/x86/mm/gup.c
arch/x86/mm/kasan_init_64.c
arch/x86/mm/numa.c
arch/x86/mm/pgtable.c
arch/xtensa/configs/iss_defconfig
arch/xtensa/include/uapi/asm/mman.h
arch/xtensa/mm/tlb.c
block/bio.c
block/blk-core.c
block/blk-ioc.c
block/blk-mq-tag.c
block/blk-mq.c
block/genhd.c
block/ioprio.c
block/scsi_ioctl.c
drivers/block/drbd/drbd_bitmap.c
drivers/block/drbd/drbd_receiver.c
drivers/block/mtip32xx/mtip32xx.c
drivers/block/nbd.c
drivers/block/osdblk.c
drivers/block/paride/pd.c
drivers/block/pktcdvd.c
drivers/block/zram/zram_drv.c
drivers/connector/connector.c
drivers/firewire/core-cdev.c
drivers/gpu/drm/drm_gem.c
drivers/gpu/drm/drm_lock.c
drivers/gpu/drm/i915/i915_gem.c
drivers/ide/ide-atapi.c
drivers/ide/ide-cd.c
drivers/ide/ide-cd_ioctl.c
drivers/ide/ide-devsets.c
drivers/ide/ide-disk.c
drivers/ide/ide-ioctls.c
drivers/ide/ide-park.c
drivers/ide/ide-pm.c
drivers/ide/ide-tape.c
drivers/ide/ide-taskfile.c
drivers/infiniband/core/sa_query.c
drivers/infiniband/hw/qib/qib_init.c
drivers/iommu/amd_iommu.c
drivers/iommu/intel-iommu.c
drivers/md/dm-crypt.c
drivers/md/dm-kcopyd.c
drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c
drivers/media/pci/solo6x10/solo6x10-v4l2.c
drivers/media/pci/tw68/tw68-video.c
drivers/misc/sgi-xp/xpc_uv.c
drivers/misc/vmw_balloon.c
drivers/mtd/mtdcore.c
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
drivers/nvme/host/pci.c
drivers/scsi/scsi_error.c
drivers/scsi/scsi_lib.c
drivers/staging/android/ion/ion_system_heap.c
drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
drivers/staging/rdma/hfi1/init.c
drivers/staging/rdma/ipath/ipath_file_ops.c
drivers/usb/gadget/function/f_mass_storage.c
drivers/usb/host/u132-hcd.c
drivers/video/fbdev/vermilion/vermilion.c
fs/9p/vfs_file.c
fs/btrfs/compression.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/free-space-cache.c
fs/btrfs/volumes.c
fs/buffer.c
fs/cachefiles/internal.h
fs/ceph/addr.c
fs/cifs/file.c
fs/coredump.c
fs/direct-io.c
fs/ext4/fsync.c
fs/ext4/inode.c
fs/ext4/readpage.c
fs/ext4/super.c
fs/fat/cache.c
fs/fat/dir.c
fs/fat/fat.h
fs/fat/file.c
fs/fat/inode.c
fs/fs-writeback.c
fs/fscache/cookie.c
fs/fscache/page.c
fs/hugetlbfs/inode.c
fs/jbd2/transaction.c
fs/jffs2/background.c
fs/jffs2/wbuf.c
fs/logfs/dev_bdev.c
fs/logfs/segment.c
fs/mpage.c
fs/namei.c
fs/nfs/file.c
fs/nilfs2/alloc.c
fs/nilfs2/alloc.h
fs/nilfs2/btree.c
fs/nilfs2/dat.c
fs/nilfs2/inode.c
fs/nilfs2/mdt.c
fs/nilfs2/mdt.h
fs/nilfs2/recovery.c
fs/nilfs2/segment.c
fs/nilfs2/segment.h
fs/nilfs2/sufile.c
fs/nilfs2/super.c
fs/notify/fdinfo.c
fs/notify/inotify/inotify_user.c
fs/ntfs/file.c
fs/ocfs2/alloc.c
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/dlm/dlmconvert.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/file.c
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/localalloc.c
fs/ocfs2/mmap.c
fs/ocfs2/namei.c
fs/ocfs2/namei.h
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_trace.h
fs/ocfs2/quota_global.c
fs/ocfs2/resize.c
fs/ocfs2/suballoc.c
fs/ocfs2/super.c
fs/ocfs2/super.h
fs/proc/array.c
fs/proc/page.c
fs/proc/task_mmu.c
fs/seq_file.c
fs/splice.c
fs/sync.c
fs/xfs/xfs_qm.c
include/asm-generic/dma-mapping-common.h
include/asm-generic/pgtable.h
include/drm/drmP.h
include/linux/bitops.h
include/linux/compaction.h
include/linux/compiler-gcc.h
include/linux/compiler.h
include/linux/cpuset.h
include/linux/crc64_ecma.h [new file with mode: 0644]
include/linux/dma-debug.h
include/linux/dma-mapping.h
include/linux/fs.h
include/linux/gfp.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/hugetlb_cgroup.h
include/linux/kernel.h
include/linux/kexec.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmdebug.h
include/linux/mmzone.h
include/linux/moduleparam.h
include/linux/nmi.h
include/linux/page-flags.h
include/linux/pageblock-flags.h
include/linux/pagemap.h
include/linux/poison.h
include/linux/rbtree.h
include/linux/rmap.h
include/linux/sched.h
include/linux/skbuff.h
include/linux/slab.h
include/linux/string.h
include/linux/swap.h
include/linux/syscalls.h
include/linux/tracehook.h
include/linux/types.h
include/linux/uaccess.h
include/linux/vm_event_item.h
include/linux/vmstat.h
include/linux/zpool.h
include/linux/zsmalloc.h
include/linux/zutil.h
include/net/sock.h
include/trace/events/compaction.h
include/trace/events/gfpflags.h
include/trace/events/huge_memory.h [new file with mode: 0644]
include/trace/events/nilfs2.h [new file with mode: 0644]
include/uapi/asm-generic/mman-common.h
include/uapi/asm-generic/mman.h
include/uapi/asm-generic/unistd.h
ipc/msg.c
ipc/msgutil.c
kernel/audit.c
kernel/cgroup.c
kernel/cpuset.c
kernel/events/uprobes.c
kernel/fork.c
kernel/futex.c
kernel/kexec.c
kernel/kexec_core.c
kernel/kexec_file.c
kernel/locking/lockdep.c
kernel/panic.c
kernel/params.c
kernel/power/snapshot.c
kernel/power/swap.c
kernel/profile.c
kernel/signal.c
kernel/smp.c
kernel/sys.c
kernel/sys_ni.c
kernel/sysctl.c
kernel/watchdog.c
lib/Kconfig
lib/Kconfig.debug
lib/Kconfig.kasan
lib/Makefile
lib/crc64_ecma.c [new file with mode: 0644]
lib/dma-debug.c
lib/dynamic_debug.c
lib/halfmd4.c
lib/idr.c
lib/is_single_threaded.c
lib/kasprintf.c
lib/kobject.c
lib/llist.c
lib/percpu_ida.c
lib/radix-tree.c
lib/test-string_helpers.c
lib/test_kasan.c
lib/test_printf.c [new file with mode: 0644]
lib/vsprintf.c
mm/Kconfig
mm/backing-dev.c
mm/balloon_compaction.c
mm/cma.c
mm/compaction.c
mm/debug.c
mm/dmapool.c
mm/early_ioremap.c
mm/failslab.c
mm/filemap.c
mm/frame_vector.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/hugetlb_cgroup.c
mm/internal.h
mm/kasan/kasan.c
mm/kasan/kasan.h
mm/kasan/report.c
mm/kmemleak.c
mm/ksm.c
mm/list_lru.c
mm/maccess.c
mm/madvise.c
mm/memblock.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/mempool.c
mm/migrate.c
mm/mincore.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/msync.c
mm/nommu.c
mm/oom_kill.c
mm/page_alloc.c
mm/page_idle.c
mm/pagewalk.c
mm/percpu.c
mm/pgtable-generic.c
mm/readahead.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/slab.h
mm/slab_common.c
mm/slub.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/userfaultfd.c
mm/util.c
mm/vmacache.c
mm/vmalloc.c
mm/vmpressure.c
mm/vmscan.c
mm/vmstat.c
mm/zbud.c
mm/zpool.c
mm/zsmalloc.c
mm/zswap.c
net/core/skbuff.c
net/core/sock.c
net/netlink/af_netlink.c
net/openvswitch/flow.c
net/rds/ib_recv.c
net/rxrpc/ar-connection.c
net/sctp/associola.c
scripts/checkpatch.pl
security/integrity/ima/ima_crypto.c
tools/testing/selftests/Makefile
tools/testing/selftests/lib/Makefile [new file with mode: 0644]
tools/testing/selftests/lib/printf.sh [new file with mode: 0644]
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/mlock2-tests.c [new file with mode: 0644]
tools/testing/selftests/vm/on-fault-limit.c [new file with mode: 0644]
tools/testing/selftests/vm/run_vmtests
tools/vm/slabinfo-gnuplot.sh [new file with mode: 0644]
tools/vm/slabinfo.c

diff --git a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt b/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
deleted file mode 100644 (file)
index 26f74b4..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# Feature name:          pmdp_splitting_flush
-#         Kconfig:       __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-#         description:   arch supports the pmdp_splitting_flush() VM API
-#
-    -----------------------
-    |         arch |status|
-    -----------------------
-    |       alpha: | TODO |
-    |         arc: | TODO |
-    |         arm: |  ok  |
-    |       arm64: |  ok  |
-    |       avr32: | TODO |
-    |    blackfin: | TODO |
-    |         c6x: | TODO |
-    |        cris: | TODO |
-    |         frv: | TODO |
-    |       h8300: | TODO |
-    |     hexagon: | TODO |
-    |        ia64: | TODO |
-    |        m32r: | TODO |
-    |        m68k: | TODO |
-    |       metag: | TODO |
-    |  microblaze: | TODO |
-    |        mips: |  ok  |
-    |     mn10300: | TODO |
-    |       nios2: | TODO |
-    |    openrisc: | TODO |
-    |      parisc: | TODO |
-    |     powerpc: |  ok  |
-    |        s390: |  ok  |
-    |       score: | TODO |
-    |          sh: | TODO |
-    |       sparc: | TODO |
-    |        tile: | TODO |
-    |          um: | TODO |
-    |   unicore32: | TODO |
-    |         x86: |  ok  |
-    |      xtensa: | TODO |
-    -----------------------
index 3a9d65c912e780977c12102d7719a0374241b962..55ffd0820feba09105555137efa4eb54abc39be1 100644 (file)
@@ -175,6 +175,7 @@ read the file /proc/PID/status:
   VmLib:      1412 kB
   VmPTE:        20 kb
   VmSwap:        0 kB
+  HugetlbPages:          0 kB
   Threads:        1
   SigQ:   0/28578
   SigPnd: 0000000000000000
@@ -238,6 +239,7 @@ Table 1-2: Contents of the status files (as of 4.1)
  VmPTE                       size of page table entries
  VmPMD                       size of second level page tables
  VmSwap                      size of swap usage (the number of referred swapents)
+ HugetlbPages                size of hugetlb memory portions
  Threads                     number of threads
  SigQ                        number of signals queued/max. number for queue
  SigPnd                      bitmap of pending signals for the thread
@@ -424,6 +426,9 @@ Private_Clean:         0 kB
 Private_Dirty:         0 kB
 Referenced:          892 kB
 Anonymous:             0 kB
+AnonHugePages:         0 kB
+Shared_Hugetlb:        0 kB
+Private_Hugetlb:        0 kB
 Swap:                  0 kB
 SwapPss:               0 kB
 KernelPageSize:        4 kB
@@ -452,6 +457,11 @@ and a page is modified, the file page is replaced by a private anonymous copy.
 "Swap" shows how much would-be-anonymous memory is also used, but out on
 swap.
 "SwapPss" shows proportional swap share of this mapping.
+"AnonHugePages" shows the ammount of memory backed by transparent hugepage.
+"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by
+hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
+reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
+
 "VmFlags" field deserves a separate description. This member represents the kernel
 flags associated with the particular virtual memory area in two letter encoded
 manner. The codes are the following:
@@ -475,7 +485,6 @@ manner. The codes are the following:
     ac  - area is accountable
     nr  - swap space is not reserved for the area
     ht  - area uses huge tlb pages
-    nl  - non-linear mapping
     ar  - architecture specific flag
     dd  - do not include area into core dump
     sd  - soft-dirty flag
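
The proc.txt hunks above add per-mapping hugetlb accounting to /proc/PID/smaps
("Shared_Hugetlb"/"Private_Hugetlb") and a per-process "HugetlbPages" line to
/proc/PID/status. A minimal userspace sketch (plain C, relying only on the
field names introduced by this patch) that totals the new smaps fields for the
current process:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/self/smaps", "r");
        char line[256];
        unsigned long kb, shared = 0, priv = 0;

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                /* field names as added by the patch above */
                if (sscanf(line, "Shared_Hugetlb: %lu kB", &kb) == 1)
                        shared += kb;
                else if (sscanf(line, "Private_Hugetlb: %lu kB", &kb) == 1)
                        priv += kb;
        }
        fclose(f);
        printf("Shared_Hugetlb: %lu kB, Private_Hugetlb: %lu kB\n",
               shared, priv);
        return 0;
}

As the documentation text states, these totals are deliberately not part of
the "RSS"/"PSS" figures.
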
index ce1126aceed8fc739aba64b1ff61ae3ce7cbc90b..223c32171dcc2b562ee0995a708e87ba1829cc8c 100644 (file)
@@ -180,6 +180,16 @@ dos1xfloppy  -- If set, use a fallback default BIOS Parameter Block
 
 <bool>: 0,1,yes,no,true,false
 
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of a file is discarded at umount/evict time
+  when fallocate is used with FALLOC_FL_KEEP_SIZE.
+  So the user should assume that the fallocated region can be
+  discarded at the last close if memory pressure results in eviction
+  of the inode from memory. As a result, anything that depends on the
+  fallocated region should redo the fallocate after reopening the
+  file.
+
 TODO
 ----------------------------------------------------------------------
 * Need to get rid of the raw scanning stuff.  Instead, always use
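
The limitation above means userspace cannot rely on a FALLOC_FL_KEEP_SIZE
preallocation surviving eviction of the inode. A hedged userspace sketch of
the "recheck after reopen" pattern the text recommends (the mount point, file
name and 16 MiB size are made up for illustration):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define PREALLOC_BYTES (16 << 20)       /* arbitrary reservation size */

static int open_and_prealloc(const char *path)
{
        int fd = open(path, O_RDWR | O_CREAT, 0644);

        if (fd < 0)
                return -1;
        /* reserve blocks beyond EOF without changing the file size */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, PREALLOC_BYTES))
                perror("fallocate");
        return fd;
}

int main(void)
{
        int fd = open_and_prealloc("/mnt/vfat/prealloc.dat");

        if (fd < 0)
                return 1;
        close(fd);
        /*
         * The inode may have been evicted in the meantime (memory pressure,
         * umount), discarding the reservation, so redo it on every reopen.
         */
        fd = open_and_prealloc("/mnt/vfat/prealloc.dat");
        if (fd >= 0)
                close(fd);
        return 0;
}
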
index 0d32355a4c348ce18cf4540e61a129b4cf2ac3fb..aa1e0c91e368885ba90e152abd377ce18dd4bdc3 100644 (file)
@@ -1,36 +1,34 @@
-Kernel address sanitizer
-================
+KernelAddressSanitizer (KASAN)
+==============================
 
 0. Overview
 ===========
 
-Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides
+KernelAddressSANitizer (KASAN) is a dynamic memory error detector. It provides
 a fast and comprehensive solution for finding use-after-free and out-of-bounds
 bugs.
 
-KASan uses compile-time instrumentation for checking every memory access,
-therefore you will need a gcc version of 4.9.2 or later. KASan could detect out
-of bounds accesses to stack or global variables, but only if gcc 5.0 or later was
-used to built the kernel.
+KASAN uses compile-time instrumentation for checking every memory access,
+therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
+required for detection of out-of-bounds accesses to stack or global variables.
 
-Currently KASan is supported only for x86_64 architecture and requires that the
-kernel be built with the SLUB allocator.
+Currently KASAN is supported only for x86_64 architecture and requires the
+kernel to be built with the SLUB allocator.
 
 1. Usage
-=========
+========
 
 To enable KASAN configure kernel with:
 
          CONFIG_KASAN = y
 
-and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline/inline
-is compiler instrumentation types. The former produces smaller binary the
-latter is 1.1 - 2 times faster. Inline instrumentation requires a gcc version
-of 5.0 or later.
+and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and
+inline are compiler instrumentation types. The former produces a smaller
+binary while the latter is 1.1 - 2 times faster. Inline instrumentation
+requires a GCC version 5.0 or later.
 
 Currently KASAN works only with the SLUB memory allocator.
-For better bug detection and nicer report, enable CONFIG_STACKTRACE and put
-at least 'slub_debug=U' in the boot cmdline.
+For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
 
 To disable instrumentation for specific files or directories, add a line
 similar to the following to the respective kernel Makefile:
@@ -42,7 +40,7 @@ similar to the following to the respective kernel Makefile:
                 KASAN_SANITIZE := n
 
 1.1 Error reports
-==========
+=================
 
 A typical out of bounds access report looks like this:
 
@@ -119,14 +117,16 @@ Memory state around the buggy address:
  ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ==================================================================
 
-First sections describe slub object where bad access happened.
-See 'SLUB Debug output' section in Documentation/vm/slub.txt for details.
+The header of the report describes what kind of bug happened and what kind of
+access caused it. It's followed by the description of the accessed slub object
+(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and
+the description of the accessed memory page.
 
 In the last section the report shows memory state around the accessed address.
-Reading this part requires some more understanding of how KASAN works.
+Reading this part requires some understanding of how KASAN works.
 
-Each 8 bytes of memory are encoded in one shadow byte as accessible,
-partially accessible, freed or they can be part of a redzone.
+The state of each 8 aligned bytes of memory is encoded in one shadow byte.
+Those 8 bytes can be accessible, partially accessible, freed or be a redzone.
 We use the following encoding for each shadow byte: 0 means that all 8 bytes
 of the corresponding memory region are accessible; number N (1 <= N <= 7) means
 that the first N bytes are accessible, and other (8 - N) bytes are not;
@@ -139,7 +139,7 @@ the accessed address is partially accessible.
 
 
 2. Implementation details
-========================
+=========================
 
 From a high level, our approach to memory error detection is similar to that
 of kmemcheck: use shadow memory to record whether each byte of memory is safe
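
For reference, the kind of bug behind a report like the one quoted above can
be reproduced with a few lines in the style of lib/test_kasan.c (the function
name here is made up; with CONFIG_KASAN=y the out-of-bounds store below is
caught and reported):

#include <linux/kernel.h>
#include <linux/slab.h>

static noinline void kmalloc_oob_demo(void)
{
        size_t size = 123;
        char *ptr = kmalloc(size, GFP_KERNEL);

        if (!ptr)
                return;
        ptr[size] = 'x';        /* one byte past the object: KASAN reports this */
        kfree(ptr);
}
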
index 101573c07788945b4423e3d29dbbed9275ab5cda..f8aae632f02f678000f9de7a8d507a5ea0bb9404 100644 (file)
@@ -1278,6 +1278,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
                        Default: 1024
 
+       hardlockup_all_cpu_backtrace=
+                       [KNL] Should the hard-lockup detector generate
+                       backtraces on all cpus.
+                       Format: <integer>
+
        hashdist=       [KNL,NUMA] Large hashes allocated during boot
                        are distributed across NUMA nodes.  Defaults on
                        for 64-bit NUMA, off otherwise.
index 22dd6af2e4bd42152edbe872b224b85a769e7184..4a6e33e1af61e8ed2562c4f9f8870eefc1bc9f13 100644 (file)
@@ -20,8 +20,9 @@ kernel mode for more than 10 seconds (see "Implementation" below for
 details), without letting other interrupts have a chance to run.
 Similarly to the softlockup case, the current stack trace is displayed
 upon detection and the system will stay locked up unless the default
-behavior is changed, which can be done through a compile time knob,
-"BOOTPARAM_HARDLOCKUP_PANIC", and a kernel parameter, "nmi_watchdog"
+behavior is changed, which can be done through a sysctl,
+'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
+and a kernel parameter, "nmi_watchdog"
 (see "Documentation/kernel-parameters.txt" for details).
 
 The panic option can be used in combination with panic_timeout (this
index 2216eb187c213b4c0c5140a760f9df3098150e41..b784c270105f40e8320cd388cb9a8ef1e2d463f4 100644 (file)
@@ -23,6 +23,10 @@ Example:
 
 Reminder: sizeof() result is of type size_t.
 
+The kernel's printf does not support %n. For obvious reasons, floating
+point formats (%e, %f, %g, %a) are also not recognized. Use of any
+unsupported specifier or length qualifier results in a WARN and early
+return from vsnprintf.
 
 Raw pointer value SHOULD be printed with %p. The kernel supports
 the following extended format specifiers for pointer types:
@@ -119,6 +123,7 @@ Raw buffer as an escaped string:
        If field width is omitted the 1 byte only will be escaped.
 
 Raw buffer as a hex string:
+
        %*ph    00 01 02  ...  3f
        %*phC   00:01:02: ... :3f
        %*phD   00-01-02- ... -3f
@@ -234,6 +239,7 @@ UUID/GUID addresses:
        Passed by reference.
 
 dentry names:
+
        %pd{,2,3,4}
        %pD{,2,3,4}
 
@@ -256,6 +262,8 @@ struct va_format:
                va_list *va;
        };
 
+       Implements a "recursive vsnprintf".
+
        Do not use this feature without some mechanism to verify the
        correctness of the format string and va_list arguments.
 
@@ -284,6 +292,27 @@ bitmap and its derivatives such as cpumask and nodemask:
 
        Passed by reference.
 
+Network device features:
+
+       %pNF    0x000000000000c000
+
+       For printing netdev_features_t.
+
+       Passed by reference.
+
+Command from struct task_struct:
+
+       %pT     ls
+
+       For printing executable name excluding path from struct
+       task_struct.
+
+       Passed by reference.
+
+If you add other %p extensions, please extend lib/test_printf.c with
+one or more test cases, if at all feasible.
+
+
 Thank you for your cooperation and attention.
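
The struct va_format/%pV entry above is what the kernel's "recursive
vsnprintf" wrappers are built on. A minimal sketch of such a wrapper (the
function name and prefix argument are illustrative, not from this patch):

#include <linux/kernel.h>
#include <linux/printk.h>

/* forward a caller-supplied format string and arguments via %pV */
static __printf(2, 3) void my_warn(const char *prefix, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_WARNING "%s: %pV", prefix, &vaf);
        va_end(args);
}

A caller writes my_warn("mydev", "bad value %d\n", v); the caller's format
string and va_list are expanded exactly once, inside vsnprintf, when %pV is
processed.
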
 
 
index 6fccb69c03e747c2eb713a1256676c8fa44f2349..af70d1541d3af5b18834bce320ddd3e37009d29b 100644 (file)
@@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
 - domainname
 - hostname
 - hotplug
+- hardlockup_all_cpu_backtrace
 - hung_task_panic
 - hung_task_check_count
 - hung_task_timeout_secs
@@ -292,6 +293,17 @@ Information Service) or YP (Yellow Pages) domainname. These two
 domain names are in general different. For a detailed discussion
 see the hostname(1) man page.
 
+==============================================================
+hardlockup_all_cpu_backtrace:
+
+This value controls whether or not the hard lockup detector gathers
+further debug information when a hard lockup condition is detected.
+If enabled, arch-specific all-CPU stack dumping will be initiated
+upon detection.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
 ==============================================================
 
 hotplug:
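
Like the other watchdog knobs, the new hardlockup_all_cpu_backtrace sysctl can
be flipped at runtime through procfs. A trivial sketch (C rather than a shell
one-liner, needs root; the path follows from the /proc/sys/kernel entry added
above):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/hardlockup_all_cpu_backtrace", "w");

        if (!f) {
                perror("hardlockup_all_cpu_backtrace");
                return 1;
        }
        fputs("1\n", f);        /* capture all-CPU backtraces on hard lockup */
        fclose(f);
        return 0;
}
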
index c46e68cf93449aadb652bdce92953c53bb00ca22..964595481af683a7a9c344182ca707fd1c2a33fa 100644 (file)
@@ -1,12 +1,14 @@
 Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
 
-Memory balancing is needed for non __GFP_WAIT as well as for non
-__GFP_IO allocations.
+Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as
+well as for non __GFP_IO allocations.
 
-There are two reasons to be requesting non __GFP_WAIT allocations:
-the caller can not sleep (typically intr context), or does not want
-to incur cost overheads of page stealing and possible swap io for
-whatever reasons.
+The first reason why a caller may avoid reclaim is that the caller can not
+sleep due to holding a spinlock or is in interrupt context. The second may
+be that the caller is willing to fail the allocation without incurring the
+overhead of page reclaim. This may happen for opportunistic high-order
+allocation requests that have order-0 fallback options. In such cases,
+the caller may also wish to avoid waking kswapd.
 
 __GFP_IO allocation requests are made to prevent file system deadlocks.
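
With the reworked flags referred to above, "avoid reclaim and avoid waking
kswapd" simply means passing neither __GFP_DIRECT_RECLAIM nor
__GFP_KSWAPD_RECLAIM. A hedged sketch of the opportunistic high-order
allocation with an order-0 fallback that the text describes (the function
name is illustrative):

#include <linux/gfp.h>

static struct page *opportunistic_alloc(unsigned int order)
{
        struct page *page;

        /*
         * Neither __GFP_DIRECT_RECLAIM nor __GFP_KSWAPD_RECLAIM is set:
         * take only what is already free and do not disturb kswapd.
         */
        page = alloc_pages(__GFP_NOWARN, order);
        if (page)
                return page;

        /* order-0 fallback with the usual reclaim behaviour */
        return alloc_pages(GFP_KERNEL, 0);
}
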
 
index 6513fe2d90b8084743bd66ebcb5a5bd73c64482b..fea5c08641705b10507a2413f3c212b723d3a457 100644 (file)
@@ -92,29 +92,26 @@ Steps:
 
 2. Insure that writeback is complete.
 
-3. Prep the new page that we want to move to. It is locked
-   and set to not being uptodate so that all accesses to the new
-   page immediately lock while the move is in progress.
+3. Lock the new page that we want to move to. It is locked so that accesses to
+   this (not yet uptodate) page immediately lock while the move is in progress.
 
-4. The new page is prepped with some settings from the old page so that
-   accesses to the new page will discover a page with the correct settings.
-
-5. All the page table references to the page are converted
-   to migration entries or dropped (nonlinear vmas).
-   This decrease the mapcount of a page. If the resulting
-   mapcount is not zero then we do not migrate the page.
-   All user space processes that attempt to access the page
-   will now wait on the page lock.
+4. All the page table references to the page are converted to migration
+   entries. This decreases the mapcount of a page. If the resulting
+   mapcount is not zero then we do not migrate the page. All user space
+   processes that attempt to access the page will now wait on the page lock.
 
-6. The radix tree lock is taken. This will cause all processes trying
+5. The radix tree lock is taken. This will cause all processes trying
    to access the page via the mapping to block on the radix tree spinlock.
 
-7. The refcount of the page is examined and we back out if references remain
+6. The refcount of the page is examined and we back out if references remain
    otherwise we know that we are the only one referencing this page.
 
-8. The radix tree is checked and if it does not contain the pointer to this
+7. The radix tree is checked and if it does not contain the pointer to this
    page then we back out because someone else modified the radix tree.
 
+8. The new page is prepped with some settings from the old page so that
+   accesses to the new page will discover a page with the correct settings.
+
 9. The radix tree is changed to point to the new page.
 
 10. The reference count of the old page is dropped because the radix tree
index 6dea4fd5c96100d75f9152ffd8ac9be5909acf5f..62842a857dab32477f90ad84f010bacb883d0aa6 100644 (file)
@@ -54,8 +54,8 @@ everything required is done by pgtable_page_ctor() and pgtable_page_dtor(),
 which must be called on PTE table allocation / freeing.
 
 Make sure the architecture doesn't use slab allocator for page table
-allocation: slab uses page->slab_cache and page->first_page for its pages.
-These fields share storage with page->ptl.
+allocation: slab uses page->slab_cache for its pages.
+This field shares storage with page->ptl.
 
 PMD split lock only makes sense if you have more than two page table
 levels.
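
For context, pgtable_page_ctor()/pgtable_page_dtor() mentioned above are
called on the PTE-table page allocation and freeing paths, roughly as in this
sketch of a generic pte_alloc_one()-style helper (not code from this series):

#include <linux/mm.h>
#include <linux/gfp.h>

static pgtable_t pte_alloc_one_sketch(struct mm_struct *mm)
{
        struct page *pte = alloc_page(GFP_KERNEL | __GFP_ZERO);

        if (!pte)
                return NULL;
        /* initialises page->ptl, the split PTE lock, when it is enabled */
        if (!pgtable_page_ctor(pte)) {
                __free_page(pte);
                return NULL;
        }
        return pte;
}

static void pte_free_sketch(struct mm_struct *mm, pgtable_t pte)
{
        pgtable_page_dtor(pte);         /* must pair with the ctor above */
        __free_page(pte);
}
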
index 8143b9e8373db41746330c468dede07acfd96b56..21cf34f3ddb268c5a64478db776d05333027cc54 100644 (file)
@@ -35,10 +35,10 @@ miss is going to run faster.
 
 == Design ==
 
-- "graceful fallback": mm components which don't have transparent
-  hugepage knowledge fall back to breaking a transparent hugepage and
-  working on the regular pages and their respective regular pmd/pte
-  mappings
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking the huge pmd mapping into a table of ptes
+  and, if necessary, splitting the transparent hugepage. Therefore these
+  components can continue working on regular pages or regular pte mappings.
 
 - if a hugepage allocation fails because of memory fragmentation,
   regular pages should be gracefully allocated instead and mixed in
@@ -170,6 +170,16 @@ A lower value leads to gain less thp performance. Value of
 max_ptes_none can waste cpu time very little, you can
 ignore it.
 
+max_ptes_swap specifies how many pages can be brought in from
+swap when collapsing a group of pages into a transparent huge page.
+
+/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
+
+A higher value can cause excessive swap IO and waste
+memory. A lower value can prevent THPs from being
+collapsed, resulting in fewer pages being collapsed into
+THPs, and lower memory access performance.
+
 == Boot parameter ==
 
 You can change the sysfs boot time defaults of Transparent Hugepage
@@ -211,9 +221,18 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
        of pages that should be collapsed into one huge page but failed
        the allocation.
 
-thp_split is incremented every time a huge page is split into base
+thp_split_page is incremented every time a huge page is split into base
        pages. This can happen for a variety of reasons but a common
        reason is that a huge page is old and is being reclaimed.
+       This action implies splitting all PMDs the page is mapped with.
+
+thp_split_page_failed is incremented if the kernel fails to split a huge
+       page. This can happen if the page was pinned by somebody.
+
+thp_split_pmd is incremented every time a PMD is split into a table of PTEs.
+       This can happen, for instance, when an application calls mprotect() or
+       munmap() on part of a huge page. It doesn't split the huge page, only
+       the page table entry.
 
 thp_zero_page_alloc is incremented every time a huge zero page is
        successfully allocated. It includes allocations which where
@@ -264,10 +283,8 @@ is complete, so they won't ever notice the fact the page is huge. But
 if any driver is going to mangle over the page structure of the tail
 page (like for checking page->mapping or other bits that are relevant
 for the head page and not the tail page), it should be updated to jump
-to check head page instead (while serializing properly against
-split_huge_page() to avoid the head and tail pages to disappear from
-under it, see the futex code to see an example of that, hugetlbfs also
-needed special handling in futex code for similar reasons).
+to check the head page instead. Taking a reference on any head/tail page
+would prevent the page from being split by anyone.
 
 NOTE: these aren't new constraints to the GUP API, and they match the
 same constrains that applies to hugetlbfs too, so any driver capable
@@ -302,9 +319,9 @@ unaffected. libhugetlbfs will also work fine as usual.
 == Graceful fallback ==
 
 Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
 pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+by just grepping for "pmd_offset" and adding split_huge_pmd where
 missing after pmd_offset returns the pmd. Thanks to the graceful
 fallback design, with a one liner change, you can avoid to write
 hundred if not thousand of lines of complex code to make your code
@@ -313,7 +330,8 @@ hugepage aware.
 If you're not walking pagetables but you run into a physical hugepage
 but you can't handle it natively in your code, you can split it by
 calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example.
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
 
 Example to make mremap.c transparent hugepage aware with a one liner
 change:
@@ -325,14 +343,14 @@ diff --git a/mm/mremap.c b/mm/mremap.c
                return NULL;
 
        pmd = pmd_offset(pud, addr);
-+      split_huge_page_pmd(vma, addr, pmd);
++      split_huge_pmd(vma, pmd, addr);
        if (pmd_none_or_clear_bad(pmd))
                return NULL;
 
 == Locking in hugepage aware code ==
 
 We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_page_pmd() has a cost.
+split_huge_page() or split_huge_pmd() has a cost.
 
 To make pagetable walks huge pmd aware, all you need to do is to call
 pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
@@ -341,47 +359,80 @@ created from under you by khugepaged (khugepaged collapse_huge_page
 takes the mmap_sem in write mode in addition to the anon_vma lock). If
 pmd_trans_huge returns false, you just fallback in the old code
 paths. If instead pmd_trans_huge returns true, you have to take the
-mm->page_table_lock and re-run pmd_trans_huge. Taking the
-page_table_lock will prevent the huge pmd to be converted into a
-regular pmd from under you (split_huge_page can run in parallel to the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd from being converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
 pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page_table_lock and fallback to the old code as
-before. Otherwise you should run pmd_trans_splitting on the pmd. In
-case pmd_trans_splitting returns true, it means split_huge_page is
-already in the middle of splitting the page. So if pmd_trans_splitting
-returns true it's enough to drop the page_table_lock and call
-wait_split_huge_page and then fallback the old code paths. You are
-guaranteed by the time wait_split_huge_page returns, the pmd isn't
-huge anymore. If pmd_trans_splitting returns false, you can proceed to
-process the huge pmd and the hugepage natively. Once finished you can
-drop the page_table_lock.
-
-== compound_lock, get_user_pages and put_page ==
+should just drop the page table lock and fallback to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
+
+== Refcounts and transparent huge pages ==
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate on the head page's ->_count.
+
+  - ->_count in tail pages is always zero: get_page_unless_zero() never
+    succeeds on tail pages.
+
+  - map/unmap of a page with a PTE entry increments/decrements ->_mapcount
+    on the relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page is accounted in compound_mapcount
+    (stored in the first tail page).
+
+PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
+This additional reference is required to get race-free detection of unmap of
+subpages when we have them mapped with both PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+We set PG_double_map when a PMD of the page gets split for the first time,
+but the page still has a PMD mapping. The additional references go away with
+the last compound_mapcount.
 
 split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the
-page structures. It can do that easily for refcounts taken by huge pmd
-mappings. But the GUI API as created by hugetlbfs (that returns head
-and tail pages if running get_user_pages on an address backed by any
-hugepage), requires the refcount to be accounted on the tail pages and
-not only in the head pages, if we want to be able to run
-split_huge_page while there are gup pins established on any tail
-page. Failure to be able to run split_huge_page if there's any gup pin
-on any tail page, would mean having to split all hugepages upfront in
-get_user_pages which is unacceptable as too many gup users are
-performance critical and they must work natively on hugepages like
-they work natively on hugetlbfs already (hugetlbfs is simpler because
-hugetlbfs pages cannot be split so there wouldn't be requirement of
-accounting the pins on the tail pages for hugetlbfs). If we wouldn't
-account the gup refcounts on the tail pages during gup, we won't know
-anymore which tail page is pinned by gup and which is not while we run
-split_huge_page. But we still have to add the gup pin to the head page
-too, to know when we can free the compound page in case it's never
-split during its lifetime. That requires changing not just
-get_page, but put_page as well so that when put_page runs on a tail
-page (and only on a tail page) it will find its respective head page,
-and then it will decrease the head page refcount in addition to the
-tail page refcount. To obtain a head page reliably and to decrease its
-refcount without race conditions, put_page has to serialize against
-__split_huge_page_refcount using a special per-page lock called
-compound_lock.
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+request to split a pinned huge page: it expects the page count to be equal
+to the sum of the mapcounts of all sub-pages plus one (the split_huge_page
+caller must have a reference on the head page).
+
+split_huge_page uses migration entries to stabilize page->_count and
+page->_mapcount.
+
+We are safe against physical memory scanners too: the only legitimate way
+a scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_count until atomic_add(). This prevents a scanner
+from getting a reference to a tail page up to that point. After the
+atomic_add() we don't care about the ->_count value.  We already know how
+many references we should uncharge from the head page.
+
+For the head page get_page_unless_zero() will succeed and we don't mind. It's
+clear where the reference should go after split: it will stay on the head page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting:
+pmd can be split at any point and never fails.
+
+== Partial unmap and deferred_split_huge_page() ==
+
+Unmapping part of a THP (with munmap() or another way) is not going to free
+memory immediately. Instead, we detect that a subpage of the THP is not in
+use in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up the unused subpages.
+
+Splitting the page right away is not an option due to the locking context in
+the place where we can detect partial unmap. It also might be
+counterproductive since in many cases partial unmap happens during
+exit(2) if a THP crosses a VMA boundary.
+
+Function deferred_split_huge_page() is used to queue a page for splitting.
+The splitting itself will happen when we get memory pressure via shrinker
+interface.
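
The "Locking in hugepage aware code" rules above reduce to a check, lock,
re-check pattern around pmd_lock(). A sketch of a pagetable walker following
those rules (do_huge_pmd() and do_pte_range() are hypothetical callbacks; the
caller is assumed to hold mmap_sem as described above):

#include <linux/mm.h>
#include <linux/huge_mm.h>

/* hypothetical handlers supplied by the user of this walker */
static void do_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr);
static void do_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                         unsigned long addr);

static void walk_pmd_sketch(struct vm_area_struct *vma, pmd_t *pmd,
                            unsigned long addr)
{
        spinlock_t *ptl;

        if (pmd_trans_huge(*pmd)) {
                ptl = pmd_lock(vma->vm_mm, pmd);
                if (pmd_trans_huge(*pmd)) {
                        /* still a huge pmd: handle the whole mapping */
                        do_huge_pmd(vma, pmd, addr);
                        spin_unlock(ptl);
                        return;
                }
                /* split_huge_pmd() ran under us: use the pte path instead */
                spin_unlock(ptl);
        }
        if (pmd_none_or_clear_bad(pmd))
                return;
        do_pte_range(vma, pmd, addr);
}
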
index 32ee3a67dba20e64aa46762e24de99fb8ba825c3..fa3b527086fabdb44f7ebb9ef97ef00923ec63c6 100644 (file)
@@ -531,83 +531,20 @@ map.
 
 try_to_unmap() is always called, by either vmscan for reclaim or for page
 migration, with the argument page locked and isolated from the LRU.  Separate
-functions handle anonymous and mapped file pages, as these types of pages have
-different reverse map mechanisms.
-
- (*) try_to_unmap_anon()
-
-     To unmap anonymous pages, each VMA in the list anchored in the anon_vma
-     must be visited - at least until a VM_LOCKED VMA is encountered.  If the
-     page is being unmapped for migration, VM_LOCKED VMAs do not stop the
-     process because mlocked pages are migratable.  However, for reclaim, if
-     the page is mapped into a VM_LOCKED VMA, the scan stops.
-
-     try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of
-     the mm_struct to which the VMA belongs.  If this is successful, it will
-     mlock the page via mlock_vma_page() - we wouldn't have gotten to
-     try_to_unmap_anon() if the page were already mlocked - and will return
-     SWAP_MLOCK, indicating that the page is unevictable.
-
-     If the mmap semaphore cannot be acquired, we are not sure whether the page
-     is really unevictable or not.  In this case, try_to_unmap_anon() will
-     return SWAP_AGAIN.
-
- (*) try_to_unmap_file() - linear mappings
-
-     Unmapping of a mapped file page works the same as for anonymous mappings,
-     except that the scan visits all VMAs that map the page's index/page offset
-     in the page's mapping's reverse map priority search tree.  It also visits
-     each VMA in the page's mapping's non-linear list, if the list is
-     non-empty.
-
-     As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file
-     page, try_to_unmap_file() will attempt to acquire the associated
-     mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this
-     is successful, and SWAP_AGAIN, if not.
-
- (*) try_to_unmap_file() - non-linear mappings
-
-     If a page's mapping contains a non-empty non-linear mapping VMA list, then
-     try_to_un{map|lock}() must also visit each VMA in that list to determine
-     whether the page is mapped in a VM_LOCKED VMA.  Again, the scan must visit
-     all VMAs in the non-linear list to ensure that the pages is not/should not
-     be mlocked.
-
-     If a VM_LOCKED VMA is found in the list, the scan could terminate.
-     However, there is no easy way to determine whether the page is actually
-     mapped in a given VMA - either for unmapping or testing whether the
-     VM_LOCKED VMA actually pins the page.
-
-     try_to_unmap_file() handles non-linear mappings by scanning a certain
-     number of pages - a "cluster" - in each non-linear VMA associated with the
-     page's mapping, for each file mapped page that vmscan tries to unmap.  If
-     this happens to unmap the page we're trying to unmap, try_to_unmap() will
-     notice this on return (page_mapcount(page) will be 0) and return
-     SWAP_SUCCESS.  Otherwise, it will return SWAP_AGAIN, causing vmscan to
-     recirculate this page.  We take advantage of the cluster scan in
-     try_to_unmap_cluster() as follows:
-
-       For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the
-       mmap semaphore of the associated mm_struct for read without blocking.
-
-       If this attempt is successful and the VMA is VM_LOCKED,
-       try_to_unmap_cluster() will retain the mmap semaphore for the scan;
-       otherwise it drops it here.
-
-       Then, for each page in the cluster, if we're holding the mmap semaphore
-       for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to
-       mlock the page.  This call is a no-op if the page is already locked,
-       but will mlock any pages in the non-linear mapping that happen to be
-       unlocked.
-
-       If one of the pages so mlocked is the page passed in to try_to_unmap(),
-       try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default
-       SWAP_AGAIN.  This will allow vmscan to cull the page, rather than
-       recirculating it on the inactive list.
-
-       Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it
-       returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED
-       VMA, but couldn't be mlocked.
+functions handle anonymous and mapped file and KSM pages, as these types of
+pages have different reverse map lookup mechanisms, with different locking.
+In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(),
+it will call try_to_unmap_one() for every VMA which might contain the page.
+
+When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED
+VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it,
+and return SWAP_MLOCK to indicate that the page is unevictable: and the scan
+stops there.
+
+mlock_vma_page() is called while holding the page table's lock (in addition
+to the page lock, and the rmap lock): to serialize against concurrent mlock or
+munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
+holepunching, and truncation of file pages and their anonymous COWed pages.
 
 
 try_to_munlock() REVERSE MAP SCAN
@@ -623,29 +560,15 @@ all PTEs from the page.  For this purpose, the unevictable/mlock infrastructure
 introduced a variant of try_to_unmap() called try_to_munlock().
 
 try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
-mapped file pages with an additional argument specifying unlock versus unmap
+mapped file and KSM pages with a flag argument specifying unlock versus unmap
 processing.  Again, these functions walk the respective reverse maps looking
-for VM_LOCKED VMAs.  When such a VMA is found for anonymous pages and file
-pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
-attempt to acquire the associated mmap semaphore, mlock the page via
-mlock_vma_page() and return SWAP_MLOCK.  This effectively undoes the
-pre-clearing of the page's PG_mlocked done by munlock_vma_page.
-
-If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap
-semaphore, it will return SWAP_AGAIN.  This will allow shrink_page_list() to
-recycle the page on the inactive list and hope that it has better luck with the
-page next time.
-
-For file pages mapped into non-linear VMAs, the try_to_munlock() logic works
-slightly differently.  On encountering a VM_LOCKED non-linear VMA that might
-map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the
-page.  munlock_vma_page() will just leave the page unlocked and let vmscan deal
-with it - the usual fallback position.
+for VM_LOCKED VMAs.  When such a VMA is found, as in the try_to_unmap() case,
+the functions mlock the page via mlock_vma_page() and return SWAP_MLOCK.  This
+undoes the pre-clearing of the page's PG_mlocked done by munlock_vma_page.
 
 Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
 reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
-However, the scan can terminate when it encounters a VM_LOCKED VMA and can
-successfully acquire the VMA's mmap semaphore for read and mlock the page.
+However, the scan can terminate when it encounters a VM_LOCKED VMA.
 Although try_to_munlock() might be called a great many times when munlocking a
 large region or tearing down a large address space that has been mlocked via
 mlockall(), overall this is a fairly rare event.
@@ -673,11 +596,6 @@ Some examples of these unevictable pages on the LRU lists are:
  (3) mlocked pages that could not be isolated from the LRU and moved to the
      unevictable list in mlock_vma_page().
 
- (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't
-     acquire the VMA's mmap semaphore to test the flags and set PageMlocked.
-     munlock_vma_page() was forced to let the page back on to the normal LRU
-     list for vmscan to handle.
-
 shrink_inactive_list() also diverts any unevictable pages that it finds on the
 inactive lists to the appropriate zone's unevictable list.
 
index 7016beec7f93a157694bd97981722472f6bf815a..c16f3f95db26998179fcc942bd2051290fc69200 100644 (file)
@@ -7509,6 +7509,7 @@ S:        Supported
 F:     Documentation/filesystems/nilfs2.txt
 F:     fs/nilfs2/
 F:     include/linux/nilfs2_fs.h
+F:     include/trace/events/nilfs2.h
 
 NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER
 M:     YOKOTA Hiroshi <yokota@netlab.is.tsukuba.ac.jp>
index 0086b472bc2b4c8e2a588a2a8ab763c3243362f7..d828beb5e69b0a040d1012354bc6c89446bcb35c 100644 (file)
@@ -37,6 +37,9 @@
 
 #define MCL_CURRENT     8192           /* lock all currently mapped pages */
 #define MCL_FUTURE     16384           /* lock all additions to address space */
+#define MCL_ONFAULT    32768           /* lock all pages that are faulted in */
+
+#define MLOCK_ONFAULT  0x01            /* Lock pages in range after they are faulted in, do not prefault */
 
 #define MADV_NORMAL    0               /* no further special treatment */
 #define MADV_RANDOM    1               /* expect random page references */
@@ -44,6 +47,7 @@
 #define MADV_WILLNEED  3               /* will need these pages */
 #define        MADV_SPACEAVAIL 5               /* ensure resources are available */
 #define MADV_DONTNEED  6               /* don't need these pages */
+#define MADV_FREE      7               /* free pages only if memory pressure */
 
 /* common/generic parameters */
 #define MADV_REMOVE    9               /* remove these pages & resources */
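
MCL_ONFAULT/MLOCK_ONFAULT above are consumed by the mlock2() system call and
the mlockall() extension added elsewhere in this series (see the syscall
tables and tools/testing/selftests/vm/mlock2-tests.c in the file list). A
userspace sketch, assuming a libc without an mlock2() wrapper and unistd
headers that define __NR_mlock2:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01      /* value from the uapi headers in this series */
#endif

int main(void)
{
        size_t len = 4UL << 20;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;
        /* lock pages as they are faulted in; nothing is prefaulted here */
        if (syscall(__NR_mlock2, buf, len, MLOCK_ONFAULT)) {
                perror("mlock2");
                return 1;
        }
        buf[0] = 1;     /* faults in, and locks, just the first page */
        return 0;
}
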
index ff7ff6cbb8112408c05a38a2f8e001265d5d3726..b65f797e9ad6723abd7c38bba09e382df52450b4 100644 (file)
@@ -617,7 +617,7 @@ void flush_dcache_page(struct page *page)
         */
        if (!mapping_mapped(mapping)) {
                clear_bit(PG_dc_clean, &page->flags);
-       } else if (page_mapped(page)) {
+       } else if (page_mapcount(page)) {
 
                /* kernel reading from page with U-mapping */
                phys_addr_t paddr = (unsigned long)page_address(page);
@@ -857,7 +857,7 @@ void copy_user_highpage(struct page *to, struct page *from,
         * For !VIPT cache, all of this gets compiled out as
         * addr_not_cache_congruent() is 0
         */
-       if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+       if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
                __flush_dcache_page((unsigned long)kfrom, u_vaddr);
                clean_src_k_mappings = 1;
        }
index a745a2a53853c384f688ebab2cd6d254b87a6017..dc46398bc3a528ccf51fb01cdabd143036df10de 100644 (file)
@@ -88,7 +88,6 @@
 
 #define L_PMD_SECT_VALID       (_AT(pmdval_t, 1) << 0)
 #define L_PMD_SECT_DIRTY       (_AT(pmdval_t, 1) << 55)
-#define L_PMD_SECT_SPLITTING   (_AT(pmdval_t, 1) << 56)
 #define L_PMD_SECT_NONE                (_AT(pmdval_t, 1) << 57)
 #define L_PMD_SECT_RDONLY      (_AT(pteval_t, 1) << 58)
 
@@ -232,13 +231,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)    (pmd_val(pmd) && !pmd_table(pmd))
-#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp);
-#endif
 #endif
 
 #define PMD_BIT_FUNC(fn,op) \
@@ -246,9 +238,9 @@ static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
 
 PMD_BIT_FUNC(wrprotect,        |= L_PMD_SECT_RDONLY);
 PMD_BIT_FUNC(mkold,    &= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
 PMD_BIT_FUNC(mkwrite,   &= ~L_PMD_SECT_RDONLY);
 PMD_BIT_FUNC(mkdirty,   |= L_PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mkclean,   &= ~L_PMD_SECT_DIRTY);
 PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
 
 #define pmd_mkhuge(pmd)                (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
index d72b90905132487257220939a255ac7ed1d3754d..96554afe1b839b250e9aa1e8fcada2428453ac77 100644 (file)
@@ -52,14 +52,13 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
         *
         * Lock the page table for the destination and check
         * to see that it's still huge and whether or not we will
-        * need to fault on write, or if we have a splitting THP.
+        * need to fault on write.
         */
        if (unlikely(pmd_thp_or_huge(*pmd))) {
                ptl = &current->mm->page_table_lock;
                spin_lock(ptl);
                if (unlikely(!pmd_thp_or_huge(*pmd)
-                       || pmd_hugewillfault(*pmd)
-                       || pmd_trans_splitting(*pmd))) {
+                       || pmd_hugewillfault(*pmd))) {
                        spin_unlock(ptl);
                        return 0;
                }
index 00b7f7de28a182c849249a242fb0ecd2d68b09ca..7d5f4c736a16b4c1f514d0c3ce768ede43f4cef1 100644 (file)
@@ -803,7 +803,7 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
                        }
                }
        } else {
-               fault = probe_kernel_address(instrptr, instr);
+               fault = probe_kernel_address((void *)instrptr, instr);
                instr = __mem_to_opcode_arm(instr);
        }
 
index ad4eb2d26e1697fc6a16f47a8805e532e198a693..e62400e5fb99fdbf864af966e718a98decf85e29 100644 (file)
@@ -651,12 +651,12 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 
        if (nommu())
                addr = __alloc_simple_buffer(dev, size, gfp, &page);
-       else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT))
+       else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM))
                addr = __alloc_from_contiguous(dev, size, prot, &page,
                                               caller, want_vaddr);
        else if (is_coherent)
                addr = __alloc_simple_buffer(dev, size, gfp, &page);
-       else if (!(gfp & __GFP_WAIT))
+       else if (!gfpflags_allow_blocking(gfp))
                addr = __alloc_from_pool(size, &page);
        else
                addr = __alloc_remap_buffer(dev, size, gfp, prot, &page,
@@ -1363,7 +1363,7 @@ static void *arm_iommu_alloc_attrs(struct device *dev, size_t size,
        *handle = DMA_ERROR_CODE;
        size = PAGE_ALIGN(size);
 
-       if (!(gfp & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(gfp))
                return __iommu_alloc_atomic(dev, size, handle);
 
        /*
index 1ec8e7590fc6823bf1d1ffe87c1901f645ffcd07..d0ba3551d49a4b05371db7a12c02e151be9d4eae 100644 (file)
@@ -330,7 +330,7 @@ void flush_dcache_page(struct page *page)
        mapping = page_mapping(page);
 
        if (!cache_ops_need_broadcast() &&
-           mapping && !page_mapped(page))
+           mapping && !page_mapcount(page))
                clear_bit(PG_dcache_clean, &page->flags);
        else {
                __flush_dcache_page(mapping, page);
@@ -415,18 +415,3 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
         */
        __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
 }
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       pmd_t pmd = pmd_mksplitting(*pmdp);
-       VM_BUG_ON(address & ~PMD_MASK);
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
-       /* dummy IPI to serialise against fast_gup */
-       kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
index 7c34f7126b046abe9d61637a1716a2ae5139bfe5..c5f9a9e3d1f393daa161ce8349c820462870a942 100644 (file)
@@ -25,7 +25,7 @@
 unsigned long xen_get_swiotlb_free_pages(unsigned int order)
 {
        struct memblock_region *reg;
-       gfp_t flags = __GFP_NOWARN;
+       gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
 
        for_each_memblock(memory, reg) {
                if (reg->base < (phys_addr_t)0xffffffff) {
index f3acf421ded4f55616abd7b68a7dcf83081e6f38..5687caf59dd565183213bc920b390bb0232158fa 100644 (file)
@@ -331,21 +331,15 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)    (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
-#define pmd_trans_splitting(pmd)       pte_special(pmd_pte(pmd))
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-struct vm_area_struct;
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp);
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)         pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_wrprotect(pmd)     pte_pmd(pte_wrprotect(pmd_pte(pmd)))
-#define pmd_mksplitting(pmd)   pte_pmd(pte_mkspecial(pmd_pte(pmd)))
 #define pmd_mkold(pmd)         pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)       pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd)       pte_pmd(pte_mkclean(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)       pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 #define pmd_mknotpresent(pmd)  (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
index 6320361d8d4c703cf4072ba2e47fdf7884c7f779..131a199114b405e8403f05137e560a2b317f4941 100644 (file)
@@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size,
        if (IS_ENABLED(CONFIG_ZONE_DMA) &&
            dev->coherent_dma_mask <= DMA_BIT_MASK(32))
                flags |= GFP_DMA;
-       if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) {
+       if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) {
                struct page *page;
                void *addr;
 
@@ -148,7 +148,7 @@ static void *__dma_alloc(struct device *dev, size_t size,
 
        size = PAGE_ALIGN(size);
 
-       if (!coherent && !(flags & __GFP_WAIT)) {
+       if (!coherent && !gfpflags_allow_blocking(flags)) {
                struct page *page = NULL;
                void *addr = __alloc_from_pool(size, &page, flags);
 
@@ -562,7 +562,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
         */
        gfp |= __GFP_ZERO;
 
-       if (gfp & __GFP_WAIT) {
+       if (gfpflags_allow_blocking(gfp)) {
                struct page **pages;
                pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
 
index c26b804015e80c46e1380d0a1af7f8f439c55405..3d59de89c04249686170077e96c07036a0f304db 100644 (file)
@@ -99,19 +99,3 @@ EXPORT_SYMBOL(flush_dcache_page);
  * Additional functions defined in assembly.
  */
 EXPORT_SYMBOL(flush_icache_range);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       pmd_t pmd = pmd_mksplitting(*pmdp);
-
-       VM_BUG_ON(address & ~PMD_MASK);
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
-       /* dummy IPI to serialise against fast_gup */
-       kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
index ff7ad91c85db325b27dacb8987c9e1b930088855..97b313882678084857129b811cea812a723c6a3a 100644 (file)
 /* Huge TLB page */
 #define _PAGE_HUGE_SHIFT       (_PAGE_MODIFIED_SHIFT + 1)
 #define _PAGE_HUGE             (1 << _PAGE_HUGE_SHIFT)
-#define _PAGE_SPLITTING_SHIFT  (_PAGE_HUGE_SHIFT + 1)
-#define _PAGE_SPLITTING                (1 << _PAGE_SPLITTING_SHIFT)
 #endif /* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
 
 #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
 /* XI - page cannot be executed */
-#ifdef _PAGE_SPLITTING_SHIFT
-#define _PAGE_NO_EXEC_SHIFT    (_PAGE_SPLITTING_SHIFT + 1)
+#ifdef _PAGE_HUGE_SHIFT
+#define _PAGE_NO_EXEC_SHIFT    (_PAGE_HUGE_SHIFT + 1)
 #else
 #define _PAGE_NO_EXEC_SHIFT    (_PAGE_MODIFIED_SHIFT + 1)
 #endif
 
 #if defined(_PAGE_NO_READ_SHIFT)
 #define _PAGE_GLOBAL_SHIFT     (_PAGE_NO_READ_SHIFT + 1)
-#elif defined(_PAGE_SPLITTING_SHIFT)
-#define _PAGE_GLOBAL_SHIFT     (_PAGE_SPLITTING_SHIFT + 1)
+#elif defined(_PAGE_HUGE_SHIFT)
+#define _PAGE_GLOBAL_SHIFT     (_PAGE_HUGE_SHIFT + 1)
 #else
 #define _PAGE_GLOBAL_SHIFT     (_PAGE_MODIFIED_SHIFT + 1)
 #endif
index 8957f15e21ec4c911e8ebe017ea8cb4ea1276ad4..6995b4a02e2359bf6e2e1b8493bcae55443753f1 100644 (file)
@@ -482,27 +482,9 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
        return pmd;
 }
 
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return !!(pmd_val(pmd) & _PAGE_SPLITTING);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-       pmd_val(pmd) |= _PAGE_SPLITTING;
-
-       return pmd;
-}
-
 extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                       pmd_t *pmdp, pmd_t pmd);
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-/* Extern to avoid header file madness */
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                       unsigned long address,
-                                       pmd_t *pmdp);
-
 #define __HAVE_ARCH_PMD_WRITE
 static inline int pmd_write(pmd_t pmd)
 {
index cfcb876cae6bc017e7fbf53e3df33fdc637680f3..a6f8daff8e3b882eaed6b1de683b19424279b5c5 100644 (file)
  */
 #define MCL_CURRENT    1               /* lock all current mappings */
 #define MCL_FUTURE     2               /* lock all future mappings */
+#define MCL_ONFAULT    4               /* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT  0x01            /* Lock pages in range after they are faulted in, do not prefault */
 
 #define MADV_NORMAL    0               /* no further special treatment */
 #define MADV_RANDOM    1               /* expect random page references */
 #define MADV_SEQUENTIAL 2              /* expect sequential page references */
 #define MADV_WILLNEED  3               /* will need these pages */
 #define MADV_DONTNEED  4               /* don't need these pages */
+#define MADV_FREE      5               /* free pages only if memory pressure */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_REMOVE    9               /* remove these pages & resources */
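Alpha's uapi header gains the MCL_ONFAULT/MLOCK_ONFAULT bits for the new mlock2() syscall (wired into the x86 syscall tables further down) and the MADV_FREE advice. A userspace sketch of MADV_FREE, assuming the value 5 added above for alpha; pages in the range may simply be discarded under memory pressure instead of being swapped out:

	#include <sys/mman.h>

	#ifndef MADV_FREE
	#define MADV_FREE 5	/* alpha value from the hunk above; other arches differ */
	#endif

	static void mark_freeable(void *buf, size_t len)
	{
		madvise(buf, len, MADV_FREE);	/* contents may be dropped by reclaim */
	}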
index 5d3a25e1cfaea62cf7859f3408e3d101f9bf4060..caac3d747a909dbd75d9f0935b8a67766795174c 100644 (file)
@@ -587,7 +587,8 @@ static inline void local_r4k_flush_cache_page(void *args)
                 * another ASID than the current one.
                 */
                map_coherent = (cpu_has_dc_aliases &&
-                               page_mapped(page) && !Page_dcache_dirty(page));
+                               page_mapcount(page) &&
+                               !Page_dcache_dirty(page));
                if (map_coherent)
                        vaddr = kmap_coherent(page, addr);
                else
index aab218c36e0d3e2f7669c47343e583e527103169..3f159caf6dbc902d20d2284913aea498c67cd596 100644 (file)
@@ -106,7 +106,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
        unsigned long addr = (unsigned long) page_address(page);
 
        if (pages_do_alias(addr, vmaddr)) {
-               if (page_mapped(page) && !Page_dcache_dirty(page)) {
+               if (page_mapcount(page) && !Page_dcache_dirty(page)) {
                        void *kaddr;
 
                        kaddr = kmap_coherent(page, vmaddr);
index 349995d19c7f2c85ee1eeb83d7882d9558d96e03..1afd87c999b0c22f1ab08508a233126b5d6d9f2b 100644 (file)
@@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
@@ -109,18 +107,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = *pmdp;
 
                next = pmd_addr_end(addr, end);
-               /*
-                * The pmd_trans_splitting() check below explains why
-                * pmdp_splitting_flush has to flush the tlb, to stop
-                * this gup-fast code from running while we set the
-                * splitting bit in the pmd. Returning zero will take
-                * the slow path that will call wait_split_huge_page()
-                * if the pmd is still in splitting state. gup-fast
-                * can't because it has irq disabled and
-                * wait_split_huge_page() would never return as the
-                * tlb flush IPI wouldn't run.
-                */
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_huge(pmd))) {
                        if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
@@ -153,8 +140,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
index 8770e619185eb034b317ce3de837c5185ba05511..7e5fa0938c2174cefe9d5bfb0f04249626b14af1 100644 (file)
@@ -165,7 +165,7 @@ void copy_user_highpage(struct page *to, struct page *from,
 
        vto = kmap_atomic(to);
        if (cpu_has_dc_aliases &&
-           page_mapped(from) && !Page_dcache_dirty(from)) {
+           page_mapcount(from) && !Page_dcache_dirty(from)) {
                vfrom = kmap_coherent(from, vaddr);
                copy_page(vto, vfrom);
                kunmap_coherent();
@@ -187,7 +187,7 @@ void copy_to_user_page(struct vm_area_struct *vma,
        unsigned long len)
 {
        if (cpu_has_dc_aliases &&
-           page_mapped(page) && !Page_dcache_dirty(page)) {
+           page_mapcount(page) && !Page_dcache_dirty(page)) {
                void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
                memcpy(vto, src, len);
                kunmap_coherent();
@@ -205,7 +205,7 @@ void copy_from_user_page(struct vm_area_struct *vma,
        unsigned long len)
 {
        if (cpu_has_dc_aliases &&
-           page_mapped(page) && !Page_dcache_dirty(page)) {
+           page_mapcount(page) && !Page_dcache_dirty(page)) {
                void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
                memcpy(dst, vfrom, len);
                kunmap_coherent();
index e8adc0069d66f17fcc6e27915fa1d5eb8e8258a6..ce4473e7c0d261b04d7bf44fcfc8ddc6414435d9 100644 (file)
@@ -62,20 +62,6 @@ void pmd_init(unsigned long addr, unsigned long pagetable)
 }
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
-                        unsigned long address,
-                        pmd_t *pmdp)
-{
-       if (!pmd_trans_splitting(*pmdp)) {
-               pmd_t pmd = pmd_mksplitting(*pmdp);
-               set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-       }
-}
-
-#endif
-
 pmd_t mk_pmd(struct page *page, pgprot_t prot)
 {
        pmd_t pmd;
index 32e0be27673fefbeca6839929e61a581c8980902..482192cc8f2b88ae89f4cf1495c0dd6055a5bda6 100644 (file)
@@ -240,7 +240,6 @@ static void output_pgtable_bits_defines(void)
        pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
 #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
        pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
-       pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
 #endif
 #ifdef CONFIG_CPU_MIPSR2
        if (cpu_has_rixi) {
index 294d251ca7b2ee6472d6da01cf4e73f84a177cc2..84f6bd365c7ceca2709203c4b93efda2d502866d 100644 (file)
@@ -31,6 +31,9 @@
 
 #define MCL_CURRENT    1               /* lock all current mappings */
 #define MCL_FUTURE     2               /* lock all future mappings */
+#define MCL_ONFAULT    4               /* lock all pages that are faulted in */
+
+#define MLOCK_ONFAULT  0x01            /* Lock pages in range after they are faulted in, do not prefault */
 
 #define MADV_NORMAL     0               /* no further special treatment */
 #define MADV_RANDOM     1               /* expect random page references */
@@ -40,6 +43,7 @@
 #define MADV_SPACEAVAIL 5               /* insure that resources are reserved */
 #define MADV_VPS_PURGE  6               /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7              /* Inherit parents page size */
+#define MADV_FREE      8               /* free pages only if memory pressure */
 
 /* common/generic parameters */
 #define MADV_REMOVE    9               /* remove these pages & resources */
index 3245f2d96d4f59e5140348b8c4dddbe836c5dda6..21d961bbac0e1e284b6f9cdb76cb472c8be48227 100644 (file)
@@ -373,11 +373,6 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 #endif /* __ASSEMBLY__ */
 
-/*
- * THP pages can't be special. So use the _PAGE_SPECIAL
- */
-#define _PAGE_SPLITTING _PAGE_SPECIAL
-
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
@@ -387,9 +382,8 @@ void pgtable_cache_init(void);
 /*
  * set of bits not changed in pmd_modify.
  */
-#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |              \
-                        _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-                        _PAGE_THP_HUGE)
+#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+                        _PAGE_ACCESSED | _PAGE_THP_HUGE)
 
 #ifndef __ASSEMBLY__
 /*
@@ -471,13 +465,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
        return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
 }
 
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       if (pmd_trans_huge(pmd))
-               return pmd_val(pmd) & _PAGE_SPLITTING;
-       return 0;
-}
-
 extern int has_transparent_hugepage(void);
 #else
 static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
@@ -515,9 +502,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 #define pmd_pfn(pmd)           pte_pfn(pmd_pte(pmd))
 #define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)         pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd)         pte_dirty(pmd_pte(pmd))
 #define pmd_mkold(pmd)         pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_wrprotect(pmd)     pte_pmd(pte_wrprotect(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)       pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkclean(pmd)       pte_pmd(pte_mkclean(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)       pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)       pte_pmd(pte_mkwrite(pmd_pte(pmd)))
 
@@ -536,12 +525,6 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
        return pmd;
 }
 
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-       pmd_val(pmd) |= _PAGE_SPLITTING;
-       return pmd;
-}
-
 #define __HAVE_ARCH_PMD_SAME
 static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
@@ -592,10 +575,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
        pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
 }
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
-
 extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
 #define pmdp_collapse_flush pmdp_collapse_flush
index 6ea26df0a73c19ba7461292d2538f8f07908ad36..03c06ba7464f4fc47d7f5b6f0fb0057fa69d8f1c 100644 (file)
@@ -22,6 +22,7 @@
 
 #define MCL_CURRENT     0x2000          /* lock all currently mapped pages */
 #define MCL_FUTURE      0x4000          /* lock all additions to address space */
+#define MCL_ONFAULT    0x8000          /* lock all pages that are faulted in */
 
 #define MAP_POPULATE   0x8000          /* populate (prefault) pagetables */
 #define MAP_NONBLOCK   0x10000         /* do not block on IO */
index 4d87122cf6a725805d3de6bdf46cc44b9d3f91ef..6ffade530dab8dad0b2b9417b5b448dbf9516347 100644 (file)
@@ -39,9 +39,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
                /* If PMD busy, retry the access */
                if (unlikely(old_pmd & _PAGE_BUSY))
                        return 0;
-               /* If PMD is trans splitting retry the access */
-               if (unlikely(old_pmd & _PAGE_SPLITTING))
-                       return 0;
                /* If PMD permissions don't match, take page fault */
                if (unlikely(access & ~old_pmd))
                        return 1;
index 9833fee493ec414be50c241153889d7ac4259402..cd2d82efe1cd15b1fe003b2eceee8bee33e0a072 100644 (file)
@@ -1030,10 +1030,6 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
                        /*
                         * A hugepage collapse is captured by pmd_none, because
                         * it mark the pmd none and do a hpte invalidate.
-                        *
-                        * We don't worry about pmd_trans_splitting here, The
-                        * caller if it needs to handle the splitting case
-                        * should check for that.
                         */
                        if (pmd_none(pmd))
                                return NULL;
@@ -1071,7 +1067,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 {
        unsigned long mask;
        unsigned long pte_end;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        pte_t pte;
        int refs;
 
@@ -1094,7 +1090,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
        head = pte_page(pte);
 
        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
@@ -1116,15 +1111,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
                return 0;
        }
 
-       /*
-        * Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
index b85d44271c3b9a1591e1fc5c3cd75db9501a5133..669a15e7fa76a07ad57c3d2b82712766520a049a 100644 (file)
@@ -80,7 +80,7 @@ static void __init setup_node_to_cpumask_map(void)
                setup_nr_node_ids();
 
        /* allocate the map */
-       for (node = 0; node < nr_node_ids; node++)
+       for_each_node(node)
                alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
 
        /* cpumask_of_node() will now work */
index e92cb2146b1862668ade6f9b8ec8bc94a13a209f..422c59a245619beedcfb07d35663c458c5e9f6ba 100644 (file)
@@ -603,55 +603,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
 }
 
-/*
- * We mark the pmd splitting and invalidate all the hpte
- * entries for this hugepage.
- */
-void pmdp_splitting_flush(struct vm_area_struct *vma,
-                         unsigned long address, pmd_t *pmdp)
-{
-       unsigned long old, tmp;
-
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-#ifdef CONFIG_DEBUG_VM
-       WARN_ON(!pmd_trans_huge(*pmdp));
-       assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
-
-       __asm__ __volatile__(
-       "1:     ldarx   %0,0,%3\n\
-               andi.   %1,%0,%6\n\
-               bne-    1b \n\
-               ori     %1,%0,%4 \n\
-               stdcx.  %1,0,%3 \n\
-               bne-    1b"
-       : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
-       : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
-       : "cc" );
-#else
-       old = pmd_val(*pmdp);
-       *pmdp = __pmd(old | _PAGE_SPLITTING);
-#endif
-       /*
-        * If we didn't had the splitting flag set, go and flush the
-        * HPTE entries.
-        */
-       trace_hugepage_splitting(address, old);
-       if (!(old & _PAGE_SPLITTING)) {
-               /* We need to flush the hpte */
-               if (old & _PAGE_HASHPTE)
-                       hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
-       }
-       /*
-        * This ensures that generic code that rely on IRQ disabling
-        * to prevent a parallel THP split work as expected.
-        */
-       kick_all_cpus_sync();
-}
-
 /*
  * We want to put the pgtable in pmd and use pgtable for tracking
  * the base page size hptes
index fa9fb5b4c66cf8b29fe550b4795fb4a3ec3aeeb7..d5543514c1dfe0d3f53c9ed19bda7e11896dd4b1 100644 (file)
@@ -135,7 +135,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
                                  unsigned long end, struct mm_walk *walk)
 {
        struct vm_area_struct *vma = walk->vma;
-       split_huge_page_pmd(vma, addr, pmd);
+       split_huge_pmd(vma, pmd, addr);
        return 0;
 }
 
index 1c65ef92768dbb553563506373094338ad92e7f9..610f472f91d14c25cef49a118ecc7f1d9eaf73b1 100644 (file)
@@ -1037,7 +1037,7 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs)
                        ret = get_user(regs->nip, &inst);
                        pagefault_enable();
                } else {
-                       ret = probe_kernel_address(regs->nip, inst);
+                       ret = probe_kernel_address((void *)regs->nip, inst);
                }
 
                if (!ret && mcheck_handle_load(regs, inst)) {
index 024f85f947aec50ea93c881e56a73ba3a5591d3c..64ead80912488b476e19a004eaf01924dbdc6b4c 100644 (file)
@@ -286,7 +286,6 @@ static inline int is_module_addr(void *addr)
 
 #define _SEGMENT_ENTRY_DIRTY   0x2000  /* SW segment dirty bit */
 #define _SEGMENT_ENTRY_YOUNG   0x1000  /* SW segment young bit */
-#define _SEGMENT_ENTRY_SPLIT   0x0800  /* THP splitting bit */
 #define _SEGMENT_ENTRY_LARGE   0x0400  /* STE-format control, large page */
 #define _SEGMENT_ENTRY_READ    0x0002  /* SW segment read bit */
 #define _SEGMENT_ENTRY_WRITE   0x0001  /* SW segment write bit */
@@ -318,8 +317,6 @@ static inline int is_module_addr(void *addr)
  * SW-bits: y young, d dirty, r read, w write
  */
 
-#define _SEGMENT_ENTRY_SPLIT_BIT 11    /* THP splitting bit number */
-
 /* Page status table bits for virtualization */
 #define PGSTE_ACC_BITS 0xf000000000000000UL
 #define PGSTE_FP_BIT   0x0800000000000000UL
@@ -523,10 +520,6 @@ static inline int pmd_bad(pmd_t pmd)
        return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
 }
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long addr, pmd_t *pmdp);
-
 #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
@@ -1424,8 +1417,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
        if (pmd_large(pmd)) {
                pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
                        _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
-                       _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT |
-                       _SEGMENT_ENTRY_SOFT_DIRTY;
+                       _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
                pmd_val(pmd) |= massage_pgprot_pmd(newprot);
                if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
                        pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
@@ -1533,12 +1525,6 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 #define __HAVE_ARCH_PGTABLE_WITHDRAW
 extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
-               (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
-}
-
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t entry)
 {
index 12bbf0e8478f84d0fe464029648cf2333c0cfde3..79f09170943cde6eaa17efaa02510eb90004698b 100644 (file)
@@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
        unsigned long mask, result;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        result = write ? 0 : _SEGMENT_ENTRY_PROTECT;
@@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
        refs = 0;
        head = pmd_page(pmd);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
@@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                return 0;
        }
 
-       /*
-        * Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -116,16 +105,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
                pmd = *pmdp;
                barrier();
                next = pmd_addr_end(addr, end);
-               /*
-                * The pmd_trans_splitting() check below explains why
-                * pmdp_splitting_flush() has to serialize with
-                * smp_call_function() against our disabled IRQs, to stop
-                * this gup-fast code from running while we set the
-                * splitting bit in the pmd. Returning zero will take
-                * the slow path that will call wait_split_huge_page()
-                * if the pmd is still in splitting state.
-                */
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        /*
index 54ef3bc01b43c361a47d6a9112b9bb7bfcdc4501..34f3790fe4599b77496ceb9eee11d6216cc66e9f 100644 (file)
@@ -1308,22 +1308,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
        return 1;
 }
 
-static void pmdp_splitting_flush_sync(void *arg)
-{
-       /* Simply deliver the interrupt */
-}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
-                             (unsigned long *) pmdp)) {
-               /* need to serialize against gup-fast (IRQ disabled) */
-               smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
-       }
-}
-
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
 {
index 10aed41757fc8247f9e1c271623907b31192607b..3a4fed406fc615b27d48088c2f35396b788dd779 100644 (file)
@@ -159,7 +159,7 @@ static int lookup_prev_stack_frame(unsigned long fp, unsigned long pc,
 
                        /* Sign extend */
                        regcache[dest] =
-                               ((((s64)(u64)op >> 10) & 0xffff) << 54) >> 54;
+                               sign_extend64((((u64)op >> 10) & 0xffff), 9);
                        break;
                case (0xd0 >> 2): /* addi */
                case (0xd4 >> 2): /* addi.l */
index 112ea11c030d68ef6bcfbd0ec1ab8148b7db327f..d208c27ccc67c0738c38f327175fcb18135ee7c2 100644 (file)
@@ -101,7 +101,7 @@ static int generate_and_check_address(struct pt_regs *regs,
        if (displacement_not_indexed) {
                __s64 displacement;
                displacement = (opcode >> 10) & 0x3ff;
-               displacement = ((displacement << 54) >> 54); /* sign extend */
+               displacement = sign_extend64(displacement, 9);
                addr = (__u64)((__s64)base_address + (displacement << width_shift));
        } else {
                __u64 offset;
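Both sh5 hunks replace the open-coded "<< 54 >> 54" trick with sign_extend64(value, 9), i.e. treating bit 9 of a 10-bit immediate as the sign bit. Sketch of what the helper does (the real definition lives in <linux/bitops.h>):

	static inline long long sign_extend64_sketch(unsigned long long value, int index)
	{
		unsigned char shift = 63 - index;	/* index = position of the sign bit */

		return (long long)(value << shift) >> shift;
	}

sign_extend64(displacement, 9) therefore reproduces ((displacement << 54) >> 54) exactly.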
index 51d8f7f31d1d797392ab2813f4fb3d4d33480598..58aaa4f33b8129b8dd7e8118ec2691478cf79757 100644 (file)
@@ -241,7 +241,7 @@ static void sh4_flush_cache_page(void *args)
                 */
                map_coherent = (current_cpu_data.dcache.n_aliases &&
                        test_bit(PG_dcache_clean, &page->flags) &&
-                       page_mapped(page));
+                       page_mapcount(page));
                if (map_coherent)
                        vaddr = kmap_coherent(page, address);
                else
index f770e3992620e8a1673ee1a2bd47280be55cccd4..e58cfbf4515008c32f670519b0904bb4aafad706 100644 (file)
@@ -59,7 +59,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
                       unsigned long vaddr, void *dst, const void *src,
                       unsigned long len)
 {
-       if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+       if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
            test_bit(PG_dcache_clean, &page->flags)) {
                void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
                memcpy(vto, src, len);
@@ -78,7 +78,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
                         unsigned long vaddr, void *dst, const void *src,
                         unsigned long len)
 {
-       if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+       if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
            test_bit(PG_dcache_clean, &page->flags)) {
                void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
                memcpy(dst, vfrom, len);
@@ -97,7 +97,7 @@ void copy_user_highpage(struct page *to, struct page *from,
 
        vto = kmap_atomic(to);
 
-       if (boot_cpu_data.dcache.n_aliases && page_mapped(from) &&
+       if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) &&
            test_bit(PG_dcache_clean, &from->flags)) {
                vfrom = kmap_coherent(from, vaddr);
                copy_page(vto, vfrom);
@@ -153,7 +153,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
        unsigned long addr = (unsigned long) page_address(page);
 
        if (pages_do_alias(addr, vmaddr)) {
-               if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+               if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
                    test_bit(PG_dcache_clean, &page->flags)) {
                        void *kaddr;
 
index 131d36fcd07a60af83ae1b6e8968e48577df54f0..7a38d6a576c5e2ea718deb632e63c0939e60c011 100644 (file)
@@ -681,13 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd)
        return pte_val(pte) & _PAGE_PMD_HUGE;
 }
 
-static inline unsigned long pmd_trans_splitting(pmd_t pmd)
-{
-       pte_t pte = __pte(pmd_val(pmd));
-
-       return pmd_trans_huge(pmd) && pte_special(pte);
-}
-
 #define has_transparent_hugepage() 1
 
 static inline pmd_t pmd_mkold(pmd_t pmd)
@@ -717,29 +710,29 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
        return __pmd(pte_val(pte));
 }
 
-static inline pmd_t pmd_mkyoung(pmd_t pmd)
+static inline pmd_t pmd_mkclean(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       pte = pte_mkyoung(pte);
+       pte = pte_mkclean(pte);
 
        return __pmd(pte_val(pte));
 }
 
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       pte = pte_mkwrite(pte);
+       pte = pte_mkyoung(pte);
 
        return __pmd(pte_val(pte));
 }
 
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
 {
        pte_t pte = __pte(pmd_val(pmd));
 
-       pte = pte_mkspecial(pte);
+       pte = pte_mkwrite(pte);
 
        return __pmd(pte_val(pte));
 }
index 0b14df33cffa48ab9d0a98c553020af60c00d67d..9765896ecb2c89204465de20bc41afb53561a77e 100644 (file)
@@ -17,6 +17,7 @@
 
 #define MCL_CURRENT     0x2000          /* lock all currently mapped pages */
 #define MCL_FUTURE      0x4000          /* lock all additions to address space */
+#define MCL_ONFAULT    0x8000          /* lock all pages that are faulted in */
 
 #define MAP_POPULATE   0x8000          /* populate (prefault) pagetables */
 #define MAP_NONBLOCK   0x10000         /* do not block on IO */
index dbabe5713a158eec17eb61de90d1cdfd6e974e80..cb841a33da59061d6f435cb8cb7c5f717284f817 100644 (file)
@@ -113,9 +113,6 @@ static unsigned int get_user_insn(unsigned long tpc)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmdp)) {
-               if (pmd_trans_splitting(*pmdp))
-                       goto out_irq_enable;
-
                pa  = pmd_pfn(*pmdp) << PAGE_SHIFT;
                pa += tpc & ~HPAGE_MASK;
 
index 2e5c4fc2daa91efa1dd4325ca001169fc37b4d89..eb3d8e8ebc6b064febae847c92ef5329a02b8ae4 100644 (file)
@@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                        put_page(head);
                        return 0;
                }
-               if (head != page)
-                       get_huge_page_tail(page);
 
                pages[*nr] = page;
                (*nr)++;
@@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                        unsigned long end, int write, struct page **pages,
                        int *nr)
 {
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        if (!(pmd_val(pmd) & _PAGE_VALID))
@@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
        refs = 0;
        head = pmd_page(pmd);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
@@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
                return 0;
        }
 
-       /* Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -126,7 +114,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = *pmdp;
 
                next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        if (!gup_huge_pmd(pmdp, pmd, addr, next,
index 2b05ccbebed9b88623871eef69234a24915bcdbb..96cecf55522ef492f1afbdd143ca80536d63d31d 100644 (file)
@@ -489,16 +489,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define has_transparent_hugepage() 1
 #define pmd_trans_huge pmd_huge_page
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-       return pte_pmd(hv_pte_set_client2(pmd_pte(pmd)));
-}
-
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return hv_pte_get_client2(pmd_pte(pmd));
-}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /*
index 81b8fc348d63d9caea21b2edd5e94cfe999313dd..63ee13faf17d2461ad79255bde0669a862e8d344 100644 (file)
@@ -36,6 +36,7 @@
  */
 #define MCL_CURRENT    1               /* lock all current mappings */
 #define MCL_FUTURE     2               /* lock all future mappings */
+#define MCL_ONFAULT    4               /* lock all pages that are faulted in */
 
 
 #endif /* _ASM_TILE_MMAN_H */
index c22df590e7e7463c71ac80a3f7e795656f1d33c2..0f1ccc3b3d2b06729230b9ce4c44252bc283980b 100644 (file)
@@ -2151,6 +2151,9 @@ config USE_PERCPU_NUMA_NODE_ID
        def_bool y
        depends on NUMA
 
+config HAVE_MEMORYLESS_NODES
+       def_bool NUMA
+
 config ARCH_ENABLE_SPLIT_PMD_PTLOCK
        def_bool y
        depends on X86_64 || X86_PAE
index 0d553e54171bdacfc5c64804b341314080b8c6f6..2ee62dba0373b059664de8d554caa6143ba8d989 100644 (file)
@@ -9,13 +9,13 @@
 # Changed by many, many contributors over the years.
 #
 
+KASAN_SANITIZE := n
+
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
 # The number is the same as you would ordinarily press at bootup.
 
-KASAN_SANITIZE := n
-
 SVGA_MODE      := -DSVGA_MODE=NORMAL_VGA
 
 targets                := vmlinux.bin setup.bin setup.elf bzImage
index caa2c712d1e70c5895d92cf856b91a02d9123083..f17705e1332cc3b81dc9a3a7551ece5d1848d5db 100644 (file)
 373    i386    shutdown                sys_shutdown
 374    i386    userfaultfd             sys_userfaultfd
 375    i386    membarrier              sys_membarrier
+376    i386    mlock2                  sys_mlock2
index 278842fdf1f6393d58ea0ac09abe60ef09e8ce79..314a90bfc09c16ab76c5d1451d7d2a4026be946f 100644 (file)
 322    64      execveat                stub_execveat
 323    common  userfaultfd             sys_userfaultfd
 324    common  membarrier              sys_membarrier
+325    common  mlock2                  sys_mlock2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
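mlock2 lands in both x86 syscall tables (376 on i386, 325 on x86_64). With no libc wrapper at this point, userspace has to invoke it by number; a sketch for x86_64, with MLOCK_ONFAULT taken from the uapi additions earlier in this diff:

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <stddef.h>

	#define MLOCK_ONFAULT 0x01	/* lock pages as they are faulted in, no prefault */

	static long mlock2_onfault(void *addr, size_t len)
	{
		return syscall(325 /* mlock2 on x86_64, per the table above */,
			       addr, len, MLOCK_ONFAULT);
	}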
index 6ec0c8b2e9df5b1d4c7702fd7f1d96c2c24db5d4..9ff592003afda8b9d1d2bf1d3353ae8f04625d60 100644 (file)
@@ -165,11 +165,6 @@ static inline int pmd_large(pmd_t pte)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return pmd_val(pmd) & _PAGE_SPLITTING;
-}
-
 static inline int pmd_trans_huge(pmd_t pmd)
 {
        return pmd_val(pmd) & _PAGE_PSE;
@@ -274,6 +269,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
        return pmd_clear_flags(pmd, _PAGE_ACCESSED);
 }
 
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
 static inline pmd_t pmd_wrprotect(pmd_t pmd)
 {
        return pmd_clear_flags(pmd, _PAGE_RW);
@@ -816,10 +816,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
 
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long addr, pmd_t *pmdp);
-
 #define __HAVE_ARCH_PMD_WRITE
 static inline int pmd_write(pmd_t pmd)
 {
index dd5b0aa9dd2f93a01b554029ebc243aa56ae91d3..116fc4ee586f3750da9bb2d3d9d7867e71fbea09 100644 (file)
@@ -22,7 +22,6 @@
 #define _PAGE_BIT_PAT_LARGE    12      /* On 2MB or 1GB pages */
 #define _PAGE_BIT_SPECIAL      _PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST     _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_SPLITTING    _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
 #define _PAGE_BIT_HIDDEN       _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
 #define _PAGE_BIT_SOFT_DIRTY   _PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
@@ -46,7 +45,6 @@
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL  (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define _PAGE_SPLITTING        (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
 #define __HAVE_ARCH_PTE_SPECIAL
 
 #ifdef CONFIG_KMEMCHECK
index e75907601a41c349e05c8dfe63047dc460ccdf30..3625ac798821366bbdd2680f9db172a081550fea 100644 (file)
@@ -705,8 +705,14 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 
        nid = acpi_get_node(handle);
        if (nid != -1) {
+               if (try_online_node(nid)) {
+                       pr_warn("failed to online node%d for CPU%d, use node%d instead.\n",
+                               nid, cpu, first_node(node_online_map));
+                       nid = first_node(node_online_map);
+               }
                set_apicid_to_node(physid, nid);
                numa_set_node(cpu, nid);
+               set_cpu_numa_mem(cpu, local_memory_node(nid));
        }
 #endif
 }
@@ -733,9 +739,10 @@ int acpi_unmap_cpu(int cpu)
 {
 #ifdef CONFIG_ACPI_NUMA
        set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+       set_cpu_numa_mem(cpu, NUMA_NO_NODE);
 #endif
 
-       per_cpu(x86_cpu_to_apicid, cpu) = -1;
+       per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
        set_cpu_present(cpu, false);
        num_processors--;
 
index f32ac13934f2310c1b61f54884246089c7e06e01..ec863b9a9f780c7507634353d64f9c2f76f1a0e1 100644 (file)
@@ -163,10 +163,9 @@ again:
                goto again;
 
        delta = now - prev;
-       if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
-               delta <<= 32;
-               delta >>= 32; /* sign extend */
-       }
+       if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+               delta = sign_extend64(delta, 31);
+
        local64_add(now - prev, &event->count);
 }
 
index 819ab3f9c9c7cb1476007619b5e6998c528be30f..22db575a2fece3f5c6db9e74aaf2b9585a3f80c3 100644 (file)
@@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
        vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
                              kaslr_offset());
+       VMCOREINFO_PHYS_BASE(phys_base);
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */
index cd99433b8ba17597cbc9e91aba9c40eee7e05e4b..6ba014c61d62d20a078dd260103f23465a47a8cd 100644 (file)
@@ -90,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 again:
        page = NULL;
        /* CMA can be used only in the context which permits sleeping */
-       if (flag & __GFP_WAIT) {
+       if (gfpflags_allow_blocking(flag)) {
                page = dma_alloc_from_contiguous(dev, count, get_order(size));
                if (page && page_to_phys(page) + size > dma_mask) {
                        dma_release_from_contiguous(dev, page, count);
index 892ee2e5ecbce417df506715f7b28d28c403ef91..5ed24ea0e9455558931a414a38045cac44edb2ae 100644 (file)
@@ -155,6 +155,8 @@ static void smp_callin(void)
         */
        phys_id = read_apic_id();
 
+       set_numa_mem(local_memory_node(cpu_to_node(cpuid)));
+
        /*
         * the boot CPU has finished the init stage and is spinning
         * on callin_map until we finish. We are free to set up this
index 5246193519614dbd8d3a602544e8117376df05f7..3a5330213aca4812d6eaf30ff6b9a7fffffd43c3 100644 (file)
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm)
        if (pud_none_or_clear_bad(pud))
                goto out;
        pmd = pmd_offset(pud, 0xA0000);
-       split_huge_page_pmd_mm(mm, 0xA0000, pmd);
+
+       if (pmd_trans_huge(*pmd)) {
+               struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+               split_huge_pmd(vma, pmd, 0xA0000);
+       }
        if (pmd_none_or_clear_bad(pmd))
                goto out;
        pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
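With the splitting state gone, callers that used split_huge_page_pmd_mm()/split_huge_page_pmd() now look up the VMA themselves and call split_huge_pmd(), which demotes only the PMD mapping and leaves the compound page intact; the same substitution appears in the powerpc subpage-prot hunk above. A sketch of the caller pattern, mirroring the vm86 change:

	static void demote_pmd_mapping(struct mm_struct *mm, unsigned long addr, pmd_t *pmd)
	{
		if (pmd_trans_huge(*pmd)) {
			struct vm_area_struct *vma = find_vma(mm, addr);

			if (vma)
				split_huge_pmd(vma, pmd, addr);
		}
	}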
index ae9a37bf13711460892584e67d02880291168f86..f8cb3e8ac250ecc8ae288ec0135f5689ca7cc0b9 100644 (file)
@@ -136,8 +136,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
@@ -158,18 +156,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = *pmdp;
 
                next = pmd_addr_end(addr, end);
-               /*
-                * The pmd_trans_splitting() check below explains why
-                * pmdp_splitting_flush has to flush the tlb, to stop
-                * this gup-fast code from running while we set the
-                * splitting bit in the pmd. Returning zero will take
-                * the slow path that will call wait_split_huge_page()
-                * if the pmd is still in splitting state. gup-fast
-                * can't because it has irq disabled and
-                * wait_split_huge_page() would never return as the
-                * tlb flush IPI wouldn't run.
-                */
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
                        /*
@@ -212,8 +199,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
index 9ce5da27b136beec916e5f1ad19c88ba493bb5f5..d470cf219a2d8f4608445b47ee8c3307d9929bd8 100644 (file)
@@ -126,5 +126,5 @@ void __init kasan_init(void)
        __flush_tlb_all();
        init_task.kasan_depth = 0;
 
-       pr_info("Kernel address sanitizer initialized\n");
+       pr_info("KernelAddressSanitizer initialized\n");
 }
index c3b3f653ed0c6c9112297164c5cf535494f19461..e5a3b35083a69db99a7da169b88880eca551370d 100644 (file)
@@ -22,6 +22,7 @@
 
 int __initdata numa_off;
 nodemask_t numa_nodes_parsed __initdata;
+static nodemask_t numa_nodes_empty __initdata;
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -562,17 +563,16 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
                        end = max(mi->blk[i].end, end);
                }
 
-               if (start >= end)
-                       continue;
-
                /*
                 * Don't confuse VM with a node that doesn't have the
                 * minimum amount of memory:
                 */
-               if (end && (end - start) < NODE_MIN_SIZE)
-                       continue;
-
-               alloc_node_data(nid);
+               if (start < end && (end - start) >= NODE_MIN_SIZE) {
+                       alloc_node_data(nid);
+               } else if (IS_ENABLED(CONFIG_HAVE_MEMORYLESS_NODES)) {
+                       alloc_node_data(nid);
+                       node_set(nid, numa_nodes_empty);
+               }
        }
 
        /* Dump memblock with node info and return. */
@@ -589,16 +589,18 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
  */
 static void __init numa_init_array(void)
 {
-       int rr, i;
+       int i, rr = MAX_NUMNODES;
 
-       rr = first_node(node_online_map);
        for (i = 0; i < nr_cpu_ids; i++) {
-               if (early_cpu_to_node(i) != NUMA_NO_NODE)
-                       continue;
+               /* Search for an onlined node with memory */
+               do {
+                       if (rr != MAX_NUMNODES)
+                               rr = next_node(rr, node_online_map);
+                       if (rr == MAX_NUMNODES)
+                               rr = first_node(node_online_map);
+               } while (node_isset(rr, numa_nodes_empty));
+
                numa_set_node(i, rr);
-               rr = next_node(rr, node_online_map);
-               if (rr == MAX_NUMNODES)
-                       rr = first_node(node_online_map);
        }
 }
 
@@ -646,14 +648,6 @@ static int __init numa_init(int (*init_func)(void))
        if (ret < 0)
                return ret;
 
-       for (i = 0; i < nr_cpu_ids; i++) {
-               int nid = early_cpu_to_node(i);
-
-               if (nid == NUMA_NO_NODE)
-                       continue;
-               if (!node_online(nid))
-                       numa_clear_node(i);
-       }
        numa_init_array();
 
        return 0;
@@ -708,9 +702,12 @@ static __init int find_near_online_node(int node)
 {
        int n, val;
        int min_val = INT_MAX;
-       int best_node = -1;
+       int best_node = NUMA_NO_NODE;
 
        for_each_online_node(n) {
+               if (node_isset(n, numa_nodes_empty))
+                       continue;
+
                val = node_distance(node, n);
 
                if (val < min_val) {
@@ -751,6 +748,22 @@ void __init init_cpu_to_node(void)
                if (!node_online(node))
                        node = find_near_online_node(node);
                numa_set_node(cpu, node);
+               if (node_spanned_pages(node))
+                       set_cpu_numa_mem(cpu, node);
+               if (IS_ENABLED(CONFIG_HAVE_MEMORYLESS_NODES))
+                       node_clear(node, numa_nodes_empty);
+       }
+
+       /* Destroy empty nodes */
+       if (IS_ENABLED(CONFIG_HAVE_MEMORYLESS_NODES)) {
+               int nid;
+               const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+
+               for_each_node_mask(nid, numa_nodes_empty) {
+                       node_set_offline(nid);
+                       memblock_free(__pa(node_data[nid]), nd_size);
+                       node_data[nid] = NULL;
+               }
        }
 }
 
index fb0a9dd1d6e46fc6e6921bf29df9f71e830fdcb8..f52caf9c519b417af64d97dcdbcba7cf1df376ef 100644 (file)
@@ -509,20 +509,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 
        return young;
 }
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
-                         unsigned long address, pmd_t *pmdp)
-{
-       int set;
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
-                               (unsigned long *)pmdp);
-       if (set) {
-               pmd_update(vma->vm_mm, address, pmdp);
-               /* need tlb flush only to serialize against gup-fast */
-               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-       }
-}
 #endif
 
 /**
index f3dfe0d921c2759a2ff201e3b6b7425078338004..44c6764d9146347d713a8d377d612ae7970f9e96 100644 (file)
@@ -169,7 +169,6 @@ CONFIG_FLATMEM_MANUAL=y
 # CONFIG_SPARSEMEM_MANUAL is not set
 CONFIG_FLATMEM=y
 CONFIG_FLAT_NODE_MEM_MAP=y
-CONFIG_PAGEFLAGS_EXTENDED=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
 # CONFIG_PHYS_ADDR_T_64BIT is not set
 CONFIG_ZONE_DMA_FLAG=1
index 201aec0e0446e84d2ee27d1f8bcd5a050c42344c..83c5150b06f9e3aa21b917a0430810bb514871d8 100644 (file)
  */
 #define MCL_CURRENT    1               /* lock all current mappings */
 #define MCL_FUTURE     2               /* lock all future mappings */
+#define MCL_ONFAULT    4               /* lock all pages that are faulted in */
+
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT  0x01            /* Lock pages in range after they are faulted in, do not prefault */
 
 #define MADV_NORMAL    0               /* no further special treatment */
 #define MADV_RANDOM    1               /* expect random page references */
 #define MADV_SEQUENTIAL        2               /* expect sequential page references */
 #define MADV_WILLNEED  3               /* will need these pages */
 #define MADV_DONTNEED  4               /* don't need these pages */
+#define MADV_FREE      5               /* free pages only if memory pressure */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_REMOVE    9               /* remove these pages & resources */
index 5ece856c5725c7cc72d0a0175bf9229330fabec5..35c822286bbe8b194e654cceef50972c260922c6 100644 (file)
@@ -245,7 +245,7 @@ static int check_tlb_entry(unsigned w, unsigned e, bool dtlb)
                                                page_mapcount(p));
                                if (!page_count(p))
                                        rc |= TLB_INSANE;
-                               else if (page_mapped(p))
+                               else if (page_mapcount(p))
                                        rc |= TLB_SUSPICIOUS;
                        } else {
                                rc |= TLB_INSANE;
index ad3f276d74bcb5a21474c49c786dee82f2f1b9f6..4f184d938942dcbbbbedbac3b330f6e4bb39a4a9 100644 (file)
@@ -211,7 +211,7 @@ fallback:
                bvl = mempool_alloc(pool, gfp_mask);
        } else {
                struct biovec_slab *bvs = bvec_slabs + *idx;
-               gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+               gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
 
                /*
                 * Make this allocation restricted and don't dump info on
@@ -221,11 +221,11 @@ fallback:
                __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
 
                /*
-                * Try a slab allocation. If this fails and __GFP_WAIT
+                * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
                 * is set, retry with the 1-entry mempool
                 */
                bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
-               if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
+               if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
                        *idx = BIOVEC_MAX_IDX;
                        goto fallback;
                }
@@ -395,12 +395,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
  *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
  *   backed by the @bs's mempool.
  *
- *   When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
- *   able to allocate a bio. This is due to the mempool guarantees. To make this
- *   work, callers must never allocate more than 1 bio at a time from this pool.
- *   Callers that need to allocate more than 1 bio must always submit the
- *   previously allocated bio for IO before attempting to allocate a new one.
- *   Failure to do so can cause deadlocks under memory pressure.
+ *   When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
+ *   always be able to allocate a bio. This is due to the mempool guarantees.
+ *   To make this work, callers must never allocate more than 1 bio at a time
+ *   from this pool. Callers that need to allocate more than 1 bio must always
+ *   submit the previously allocated bio for IO before attempting to allocate
+ *   a new one. Failure to do so can cause deadlocks under memory pressure.
  *
  *   Note that when running under generic_make_request() (i.e. any block
  *   driver), bios are not submitted until after you return - see the code in
@@ -459,13 +459,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
                 * We solve this, and guarantee forward progress, with a rescuer
                 * workqueue per bio_set. If we go to allocate and there are
                 * bios on current->bio_list, we first try the allocation
-                * without __GFP_WAIT; if that fails, we punt those bios we
-                * would be blocking to the rescuer workqueue before we retry
-                * with the original gfp_flags.
+                * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
+                * bios we would be blocking to the rescuer workqueue before
+                * we retry with the original gfp_flags.
                 */
 
                if (current->bio_list && !bio_list_empty(current->bio_list))
-                       gfp_mask &= ~__GFP_WAIT;
+                       gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
                p = mempool_alloc(bs->bio_pool, gfp_mask);
                if (!p && gfp_mask != saved_gfp) {
index 89eec79658702a7e53712bc52178dae25bddcc22..5dd1f54d793549e50180b0e4840f8667536351ad 100644 (file)
@@ -638,7 +638,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
                if (percpu_ref_tryget_live(&q->q_usage_counter))
                        return 0;
 
-               if (!(gfp & __GFP_WAIT))
+               if (!gfpflags_allow_blocking(gfp))
                        return -EBUSY;
 
                ret = wait_event_interruptible(q->mq_freeze_wq,
@@ -1206,8 +1206,8 @@ rq_starved:
  * @bio: bio to allocate request for (can be %NULL)
  * @gfp_mask: allocation mask
  *
- * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
- * function keeps retrying under memory pressure and fails iff @q is dead.
+ * Get a free request from @q.  If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+ * this function keeps retrying under memory pressure and fails iff @q is dead.
  *
  * Must be called with @q->queue_lock held and,
  * Returns ERR_PTR on failure, with @q->queue_lock held.
@@ -1227,7 +1227,7 @@ retry:
        if (!IS_ERR(rq))
                return rq;
 
-       if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
+       if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
                blk_put_rl(rl);
                return rq;
        }
@@ -1305,11 +1305,11 @@ EXPORT_SYMBOL(blk_get_request);
  * BUG.
  *
  * WARNING: When allocating/cloning a bio-chain, careful consideration should be
- * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
- * anything but the first bio in the chain. Otherwise you risk waiting for IO
- * completion of a bio that hasn't been submitted yet, thus resulting in a
- * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
- * of bio_alloc(), as that avoids the mempool deadlock.
+ * given to how you allocate bios. In particular, you cannot use
+ * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
+ * you risk waiting for IO completion of a bio that hasn't been submitted yet,
+ * thus resulting in a deadlock. Alternatively bios should be allocated using
+ * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
  * If possible a big IO should be split into smaller parts when allocation
  * fails. Partial allocation should not be an error, or you risk a live-lock.
  */
@@ -2038,7 +2038,7 @@ void generic_make_request(struct bio *bio)
        do {
                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-               if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) {
+               if (likely(blk_queue_enter(q, ___GFP_DIRECT_RECLAIM) == 0)) {
 
                        q->make_request_fn(q, bio);
 
index 1a27f45ec776a4e0a951a89de3185dc2ead26a42..381cb50a673c33ed86f6f5d597331137f2881ce3 100644 (file)
@@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task,
 {
        struct io_context *ioc;
 
-       might_sleep_if(gfp_flags & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(gfp_flags));
 
        do {
                task_lock(task);
index 60ac684c8b8c52f26fe8a83d290fbd8f75f76581..a07ca3488d96fb7a96159fbe48c0c9e5e95391ec 100644 (file)
@@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
        if (tag != -1)
                return tag;
 
-       if (!(data->gfp & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(data->gfp))
                return -1;
 
        bs = bt_wait_ptr(bt, hctx);
index 27bf3097532d02b0d43228d26bc696362b86cb28..34e26163b73a434abf27bae5af8e6b21a6596c20 100644 (file)
@@ -244,11 +244,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 
        ctx = blk_mq_get_ctx(q);
        hctx = q->mq_ops->map_queue(q, ctx->cpu);
-       blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+       blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
                        reserved, ctx, hctx);
 
        rq = __blk_mq_alloc_request(&alloc_data, rw);
-       if (!rq && (gfp & __GFP_WAIT)) {
+       if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
                __blk_mq_run_hw_queue(hctx);
                blk_mq_put_ctx(ctx);
 
@@ -1186,7 +1186,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
                ctx = blk_mq_get_ctx(q);
                hctx = q->mq_ops->map_queue(q, ctx->cpu);
                blk_mq_set_alloc_data(&alloc_data, q,
-                               __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+                               __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
                rq = __blk_mq_alloc_request(&alloc_data, rw);
                ctx = alloc_data.ctx;
                hctx = alloc_data.hctx;
index e5cafa51567c9d589147523c8ab7b43504f9d725..ebb41feea35754525761edf815b7e3a994ab0a06 100644 (file)
@@ -852,7 +852,7 @@ static int show_partition(struct seq_file *seqf, void *v)
        char buf[BDEVNAME_SIZE];
 
        /* Don't show non-partitionable removeable devices or empty devices */
-       if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+       if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
                                   (sgp->flags & GENHD_FL_REMOVABLE)))
                return 0;
        if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
index 31666c92b46af29919f42ea3e1093caed7127d71..cc7800e9eb441e2b7737a152f0dbb60182821408 100644 (file)
@@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
                                break;
 
                        do_each_thread(g, p) {
-                               if (!uid_eq(task_uid(p), uid))
+                               if (!uid_eq(task_uid(p), uid) ||
+                                   !task_pid_vnr(p))
                                        continue;
                                ret = set_task_ioprio(p, ioprio);
                                if (ret)
@@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
                                break;
 
                        do_each_thread(g, p) {
-                               if (!uid_eq(task_uid(p), user->uid))
+                               if (!uid_eq(task_uid(p), user->uid) ||
+                                   !task_pid_vnr(p))
                                        continue;
                                tmpio = get_task_ioprio(p);
                                if (tmpio < 0)
index dda653ce7b24cfb959f668bdb4a676900ed7637d..0774799942e06a8d890a5c88e40990cd53a15037 100644 (file)
@@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
        }
 
-       rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
+       rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto error_free_buffer;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
                break;
        }
 
-       if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
+       if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) {
                err = DRIVER_ERROR << 24;
                goto error;
        }
@@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
        struct request *rq;
        int err;
 
-       rq = blk_get_request(q, WRITE, __GFP_WAIT);
+       rq = blk_get_request(q, WRITE, __GFP_RECLAIM);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        blk_rq_set_block_pc(rq);
index d3d73d114a4615e124e89bd6d4196ba5be35f415..9462d27528507d693d8e4efe0e6464597ab1768b 100644 (file)
@@ -1007,7 +1007,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
        bm_set_page_unchanged(b->bm_pages[page_nr]);
 
        if (ctx->flags & BM_AIO_COPY_PAGES) {
-               page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+               page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM);
                copy_highpage(page, b->bm_pages[page_nr]);
                bm_store_page_idx(page, page_nr);
        } else
index c097909c589c240e4b9d2ca44e1a9b7084ea9b15..b4b5680ac6adb1dcdbda2428b08398d38e2c8d4b 100644 (file)
@@ -357,7 +357,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
        }
 
        if (has_payload && data_size) {
-               page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
+               page = drbd_alloc_pages(peer_device, nr_pages,
+                                       gfpflags_allow_blocking(gfp_mask));
                if (!page)
                        goto fail;
        }
index f504232c1ee779079353e7ffd4b2d6831f87e61b..a28a562f7b7f245355db7d536e0558c78cc7e374 100644 (file)
@@ -173,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
 {
        struct request *rq;
 
-       rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true);
+       rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true);
        return blk_mq_rq_to_pdu(rq);
 }
 
index 1b87623381e2b1183b5c9d57c870b7c10924f65e..93b3f99b6865fe721f7124412553cadf3c328e7a 100644 (file)
@@ -444,9 +444,7 @@ static int nbd_thread_recv(struct nbd_device *nbd)
        spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 
        if (signal_pending(current)) {
-               siginfo_t info;
-
-               ret = dequeue_signal_lock(current, &current->blocked, &info);
+               ret = kernel_dequeue_signal(NULL);
                dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
                         task_pid_nr(current), current->comm, ret);
                mutex_lock(&nbd->tx_lock);
@@ -560,11 +558,8 @@ static int nbd_thread_send(void *data)
                                         !list_empty(&nbd->waiting_queue));
 
                if (signal_pending(current)) {
-                       siginfo_t info;
-                       int ret;
+                       int ret = kernel_dequeue_signal(NULL);
 
-                       ret = dequeue_signal_lock(current, &current->blocked,
-                                                 &info);
                        dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
                                 task_pid_nr(current), current->comm, ret);
                        mutex_lock(&nbd->tx_lock);
@@ -592,10 +587,8 @@ static int nbd_thread_send(void *data)
        spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 
        /* Clear maybe pending signals */
-       if (signal_pending(current)) {
-               siginfo_t info;
-               dequeue_signal_lock(current, &current->blocked, &info);
-       }
+       if (signal_pending(current))
+               kernel_dequeue_signal(NULL);
 
        return 0;
 }
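The nbd hunks above (and the usb gadget hunk further down) lean on a new kernel_dequeue_signal() helper whose defining patch sits elsewhere in this tree; its likely shape, folding the old dequeue_signal_lock() boilerplate and tolerating a NULL info pointer, is roughly:

/* Sketch of the assumed helper, not the exact include/linux/signal.h hunk. */
static inline int kernel_dequeue_signal(siginfo_t *info)
{
        struct task_struct *tsk = current;
        siginfo_t __info;
        int ret;

        spin_lock_irq(&tsk->sighand->siglock);
        ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
        spin_unlock_irq(&tsk->sighand->siglock);

        return ret;
}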
index e229425962074cf358ee234b853af452435495ea..1b709a4e3b5ec62eaf9709b366ea11c0d097ffd3 100644 (file)
@@ -271,7 +271,7 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
                        goto err_out;
 
                tmp->bi_bdev = NULL;
-               gfpmask &= ~__GFP_WAIT;
+               gfpmask &= ~__GFP_DIRECT_RECLAIM;
                tmp->bi_next = NULL;
 
                if (!new_chain)
index b9242d78283db3a2bb03524a9bbbfd6923b7dccf..562b5a4ca7b712f6b2b4440375904d8be1ac5200 100644 (file)
@@ -723,7 +723,7 @@ static int pd_special_command(struct pd_unit *disk,
        struct request *rq;
        int err = 0;
 
-       rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT);
+       rq = blk_get_request(disk->gd->queue, READ, __GFP_RECLAIM);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
 
index cd813f9110bfc99314604fcf0ae93e58242ac413..2f477d45d6cfa42d586080db8c293d41406055ae 100644 (file)
@@ -704,14 +704,14 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
        int ret = 0;
 
        rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
-                            WRITE : READ, __GFP_WAIT);
+                            WRITE : READ, __GFP_RECLAIM);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        blk_rq_set_block_pc(rq);
 
        if (cgc->buflen) {
                ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
-                                     __GFP_WAIT);
+                                     __GFP_RECLAIM);
                if (ret)
                        goto out;
        }
index 9fa15bb9d118ee5ad2d9f34e23aab6777d09d105..81a557c33a1f8b4a41b19f39b58d8c0e0e40ae35 100644 (file)
@@ -106,7 +106,7 @@ static void zram_set_obj_size(struct zram_meta *meta,
        meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
 }
 
-static inline int is_partial_io(struct bio_vec *bvec)
+static inline bool is_partial_io(struct bio_vec *bvec)
 {
        return bvec->bv_len != PAGE_SIZE;
 }
@@ -114,25 +114,25 @@ static inline int is_partial_io(struct bio_vec *bvec)
 /*
  * Check if request is within bounds and aligned on zram logical blocks.
  */
-static inline int valid_io_request(struct zram *zram,
+static inline bool valid_io_request(struct zram *zram,
                sector_t start, unsigned int size)
 {
        u64 end, bound;
 
        /* unaligned request */
        if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
-               return 0;
+               return false;
        if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
-               return 0;
+               return false;
 
        end = start + (size >> SECTOR_SHIFT);
        bound = zram->disksize >> SECTOR_SHIFT;
        /* out of range range */
        if (unlikely(start >= bound || end > bound || start > end))
-               return 0;
+               return false;
 
        /* I/O request is valid */
-       return 1;
+       return true;
 }
 
 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
@@ -157,7 +157,7 @@ static inline void update_used_max(struct zram *zram,
        } while (old_max != cur_max);
 }
 
-static int page_zero_filled(void *ptr)
+static bool page_zero_filled(void *ptr)
 {
        unsigned int pos;
        unsigned long *page;
@@ -166,10 +166,10 @@ static int page_zero_filled(void *ptr)
 
        for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
                if (page[pos])
-                       return 0;
+                       return false;
        }
 
-       return 1;
+       return true;
 }
 
 static void handle_zero_page(struct bio_vec *bvec)
@@ -365,6 +365,9 @@ static ssize_t comp_algorithm_store(struct device *dev,
        struct zram *zram = dev_to_zram(dev);
        size_t sz;
 
+       if (!zcomp_available_algorithm(buf))
+               return -EINVAL;
+
        down_write(&zram->init_lock);
        if (init_done(zram)) {
                up_write(&zram->init_lock);
@@ -378,9 +381,6 @@ static ssize_t comp_algorithm_store(struct device *dev,
        if (sz > 0 && zram->compressor[sz - 1] == '\n')
                zram->compressor[sz - 1] = 0x00;
 
-       if (!zcomp_available_algorithm(zram->compressor))
-               len = -EINVAL;
-
        up_write(&zram->init_lock);
        return len;
 }
@@ -726,14 +726,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
        }
 
        alloced_pages = zs_get_total_pages(meta->mem_pool);
+       update_used_max(zram, alloced_pages);
+
        if (zram->limit_pages && alloced_pages > zram->limit_pages) {
                zs_free(meta->mem_pool, handle);
                ret = -ENOMEM;
                goto out;
        }
 
-       update_used_max(zram, alloced_pages);
-
        cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
 
        if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
index 30f522848c7353166af0d9605215205101c0b013..d7373ca69c9947a83dd8a1d67b77c7fc9e77209e 100644 (file)
@@ -124,7 +124,8 @@ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group,
        if (group)
                return netlink_broadcast(dev->nls, skb, portid, group,
                                         gfp_mask);
-       return netlink_unicast(dev->nls, skb, portid, !(gfp_mask&__GFP_WAIT));
+       return netlink_unicast(dev->nls, skb, portid,
+                       !gfpflags_allow_blocking(gfp_mask));
 }
 EXPORT_SYMBOL_GPL(cn_netlink_send_mult);
 
index 2a3973a7c44179457f635196696bc78e756a76e1..36a7c2d89a010e7a567224c89a01a2ebe0c8fd20 100644 (file)
@@ -486,7 +486,7 @@ static int ioctl_get_info(struct client *client, union ioctl_arg *arg)
 static int add_client_resource(struct client *client,
                               struct client_resource *resource, gfp_t gfp_mask)
 {
-       bool preload = !!(gfp_mask & __GFP_WAIT);
+       bool preload = gfpflags_allow_blocking(gfp_mask);
        unsigned long flags;
        int ret;
 
index abeb9af31f9c5e7c02b3f8d64ddf34f01621cfd0..2e10bba4468b0c6b65aee0d07fa279295c52d2b2 100644 (file)
@@ -496,7 +496,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj)
                 * __GFP_DMA32 to be set in mapping_gfp_mask(inode->i_mapping)
                 * so shmem can relocate pages during swapin if required.
                 */
-               BUG_ON((mapping_gfp_mask(mapping) & __GFP_DMA32) &&
+               BUG_ON(mapping_gfp_constraint(mapping, __GFP_DMA32) &&
                                (page_to_pfn(p) >= 0x00100000UL));
        }
 
index 4924d381b6642f51a4b98698994e6b2ab3c0744e..daa2ff12101ba366d82be5b550ba5cf5738a6ff2 100644 (file)
@@ -38,8 +38,6 @@
 #include "drm_legacy.h"
 #include "drm_internal.h"
 
-static int drm_notifier(void *priv);
-
 static int drm_lock_take(struct drm_lock_data *lock_data, unsigned int context);
 
 /**
@@ -118,14 +116,8 @@ int drm_legacy_lock(struct drm_device *dev, void *data,
         * really probably not the correct answer but lets us debug xkb
         * xserver for now */
        if (!file_priv->is_master) {
-               sigemptyset(&dev->sigmask);
-               sigaddset(&dev->sigmask, SIGSTOP);
-               sigaddset(&dev->sigmask, SIGTSTP);
-               sigaddset(&dev->sigmask, SIGTTIN);
-               sigaddset(&dev->sigmask, SIGTTOU);
                dev->sigdata.context = lock->context;
                dev->sigdata.lock = master->lock.hw_lock;
-               block_all_signals(drm_notifier, dev, &dev->sigmask);
        }
 
        if (dev->driver->dma_quiescent && (lock->flags & _DRM_LOCK_QUIESCENT))
@@ -169,7 +161,6 @@ int drm_legacy_unlock(struct drm_device *dev, void *data, struct drm_file *file_
                /* FIXME: Should really bail out here. */
        }
 
-       unblock_all_signals();
        return 0;
 }
 
@@ -287,38 +278,6 @@ int drm_legacy_lock_free(struct drm_lock_data *lock_data, unsigned int context)
        return 0;
 }
 
-/**
- * If we get here, it means that the process has called DRM_IOCTL_LOCK
- * without calling DRM_IOCTL_UNLOCK.
- *
- * If the lock is not held, then let the signal proceed as usual.  If the lock
- * is held, then set the contended flag and keep the signal blocked.
- *
- * \param priv pointer to a drm_device structure.
- * \return one if the signal should be delivered normally, or zero if the
- * signal should be blocked.
- */
-static int drm_notifier(void *priv)
-{
-       struct drm_device *dev = priv;
-       struct drm_hw_lock *lock = dev->sigdata.lock;
-       unsigned int old, new, prev;
-
-       /* Allow signal delivery if lock isn't held */
-       if (!lock || !_DRM_LOCK_IS_HELD(lock->lock)
-           || _DRM_LOCKING_CONTEXT(lock->lock) != dev->sigdata.context)
-               return 1;
-
-       /* Otherwise, set flag to force call to
-          drmUnlock */
-       do {
-               old = lock->lock;
-               new = old | _DRM_LOCK_CONT;
-               prev = cmpxchg(&lock->lock, old, new);
-       } while (prev != old);
-       return 0;
-}
-
 /**
  * This function returns immediately and takes the hw lock
  * with the kernel context if it is free, otherwise it gets the highest priority when and if
index e57061ac02191dd352d71f72ed0599f58c80b45b..5cf4a1998273c3cfcc494c83210c0bc572f35c2e 100644 (file)
@@ -2216,9 +2216,8 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
         * Fail silently without starting the shrinker
         */
        mapping = file_inode(obj->base.filp)->i_mapping;
-       gfp = mapping_gfp_mask(mapping);
-       gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
-       gfp &= ~(__GFP_IO | __GFP_WAIT);
+       gfp = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM));
+       gfp |= __GFP_NORETRY | __GFP_NOWARN;
        sg = st->sgl;
        st->nents = 0;
        for (i = 0; i < page_count; i++) {
index 1362ad80a76c071e9e2abb2b36472e433bc5dae0..05352f490d6088d736016e588c8a79313df30e21 100644 (file)
@@ -92,7 +92,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
        struct request *rq;
        int error;
 
-       rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+       rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_DRV_PRIV;
        rq->special = (char *)pc;
 
index 64a6b827b3dd12210e23008a81af06e9d9d7cb64..ef907fd5ba98a036d98d24478af49ec0a4c5831b 100644 (file)
@@ -441,7 +441,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
                struct request *rq;
                int error;
 
-               rq = blk_get_request(drive->queue, write, __GFP_WAIT);
+               rq = blk_get_request(drive->queue, write, __GFP_RECLAIM);
 
                memcpy(rq->cmd, cmd, BLK_MAX_CDB);
                rq->cmd_type = REQ_TYPE_ATA_PC;
index 066e3903651842fa31c82bc278c8f7434bf6c950..474173eb31bb345c86bf31fcb195a44432ee14ad 100644 (file)
@@ -303,7 +303,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
        struct request *rq;
        int ret;
 
-       rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+       rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_DRV_PRIV;
        rq->cmd_flags = REQ_QUIET;
        ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
index b05a74d78ef560deefd8f022d27f145766e094b4..0dd43b4fcec6353633d12338be557b8e1e13ef14 100644 (file)
@@ -165,7 +165,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
        if (!(setting->flags & DS_SYNC))
                return setting->set(drive, arg);
 
-       rq = blk_get_request(q, READ, __GFP_WAIT);
+       rq = blk_get_request(q, READ, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_DRV_PRIV;
        rq->cmd_len = 5;
        rq->cmd[0] = REQ_DEVSET_EXEC;
index 56b9708894a5e294302e42066711274b6941616c..37a8a907febeb2556277fa78d03f474a5d865d84 100644 (file)
@@ -477,7 +477,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
        if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
                return -EBUSY;
 
-       rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+       rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
 
        drive->mult_req = arg;
index aa2e9b77b20d39a67d3da80fd23338fcb631faf6..d05db2469209bb1dfe1f9757d078b0298e55d180 100644 (file)
@@ -125,7 +125,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
        if (NULL == (void *) arg) {
                struct request *rq;
 
-               rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+               rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
                rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
                err = blk_execute_rq(drive->queue, NULL, rq, 0);
                blk_put_request(rq);
@@ -221,7 +221,7 @@ static int generic_drive_reset(ide_drive_t *drive)
        struct request *rq;
        int ret = 0;
 
-       rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+       rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_DRV_PRIV;
        rq->cmd_len = 1;
        rq->cmd[0] = REQ_DRIVE_RESET;
index c808685204883db93213e80ee9516a298e1f58b1..2d7dca56dd244387a633e2eec30b177dfbd25a70 100644 (file)
@@ -31,7 +31,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
        }
        spin_unlock_irq(&hwif->lock);
 
-       rq = blk_get_request(q, READ, __GFP_WAIT);
+       rq = blk_get_request(q, READ, __GFP_RECLAIM);
        rq->cmd[0] = REQ_PARK_HEADS;
        rq->cmd_len = 1;
        rq->cmd_type = REQ_TYPE_DRV_PRIV;
index 081e43458d50f745671f9c5c4476d760e33b7fc0..e34af488693a62d26fbd7d282232d393a4807f55 100644 (file)
@@ -18,7 +18,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
        }
 
        memset(&rqpm, 0, sizeof(rqpm));
-       rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+       rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND;
        rq->special = &rqpm;
        rqpm.pm_step = IDE_PM_START_SUSPEND;
@@ -88,7 +88,7 @@ int generic_ide_resume(struct device *dev)
        }
 
        memset(&rqpm, 0, sizeof(rqpm));
-       rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+       rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_ATA_PM_RESUME;
        rq->cmd_flags |= REQ_PREEMPT;
        rq->special = &rqpm;
index f5d51d1d09ee480becca86a1cda687ab3919cdb9..12fa04997dcc33a09129389435fb96f00bfe8e87 100644 (file)
@@ -852,7 +852,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
        BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
        BUG_ON(size < 0 || size % tape->blk_size);
 
-       rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+       rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_DRV_PRIV;
        rq->cmd[13] = cmd;
        rq->rq_disk = tape->disk;
@@ -860,7 +860,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 
        if (size) {
                ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
-                                     __GFP_WAIT);
+                                     __GFP_RECLAIM);
                if (ret)
                        goto out_put;
        }
index 0979e126fff1e69ee3b3f8df19e5a443cbae5194..a716693417a308c49186bcfb4e692aa86ef1d99d 100644 (file)
@@ -430,7 +430,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
        int error;
        int rw = !(cmd->tf_flags & IDE_TFLAG_WRITE) ? READ : WRITE;
 
-       rq = blk_get_request(drive->queue, rw, __GFP_WAIT);
+       rq = blk_get_request(drive->queue, rw, __GFP_RECLAIM);
        rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
 
        /*
@@ -441,7 +441,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
         */
        if (nsect) {
                error = blk_rq_map_kern(drive->queue, rq, buf,
-                                       nsect * SECTOR_SIZE, __GFP_WAIT);
+                                       nsect * SECTOR_SIZE, __GFP_RECLAIM);
                if (error)
                        goto put_req;
        }
index dcdaa79e3f0faa0dcd9288a0f14cf9e4e996210c..2aba774f835b9caca8e9e1645d1efd6cf6f08bf9 100644 (file)
@@ -1086,7 +1086,7 @@ static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
 
 static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
 {
-       bool preload = !!(gfp_mask & __GFP_WAIT);
+       bool preload = gfpflags_allow_blocking(gfp_mask);
        unsigned long flags;
        int ret, id;
 
index 7e00470adc30223c183f0e287f7a7d0d9beff944..4ff340fe904f5c3bc9484b1f0cf2c53abd2bd4dd 100644 (file)
@@ -1680,7 +1680,7 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
-       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
 
        egrcnt = rcd->rcvegrcnt;
        egroff = rcd->rcvegr_tid_base;
index 0d533bba4ad18097e447be3dca3fa41e5d9abeab..8b2be1e7714f8bb7aa1d62193d3b3320fab64414 100644 (file)
@@ -2668,7 +2668,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
 
        page = alloc_pages(flag | __GFP_NOWARN,  get_order(size));
        if (!page) {
-               if (!(flag & __GFP_WAIT))
+               if (!gfpflags_allow_blocking(flag))
                        return NULL;
 
                page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
index 7cf80c1a8a1607f2d2ea675b270600bed70c6ad7..f1042daef9ada83e931ae450623ce491ebd55959 100644 (file)
@@ -3647,7 +3647,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size,
                        flags |= GFP_DMA32;
        }
 
-       if (flags & __GFP_WAIT) {
+       if (gfpflags_allow_blocking(flags)) {
                unsigned int count = size >> PAGE_SHIFT;
 
                page = dma_alloc_from_contiguous(dev, count, order);
index 3729b394432c9d66c7c219a8a52ddbf9be987869..917d47e290ae08be08f4c964a3326f1f67acd077 100644 (file)
@@ -994,7 +994,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
        struct bio_vec *bvec;
 
 retry:
-       if (unlikely(gfp_mask & __GFP_WAIT))
+       if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
                mutex_lock(&cc->bio_alloc_lock);
 
        clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
@@ -1010,7 +1010,7 @@ retry:
                if (!page) {
                        crypt_free_buffer_pages(cc, clone);
                        bio_put(clone);
-                       gfp_mask |= __GFP_WAIT;
+                       gfp_mask |= __GFP_DIRECT_RECLAIM;
                        goto retry;
                }
 
@@ -1027,7 +1027,7 @@ retry:
        }
 
 return_clone:
-       if (unlikely(gfp_mask & __GFP_WAIT))
+       if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
                mutex_unlock(&cc->bio_alloc_lock);
 
        return clone;
index 3a7cade5e27d828ffa2df3b9254f9064ec078c84..1452ed9aacb4222e4ee86c28480ddf373ac5e3c7 100644 (file)
@@ -244,7 +244,7 @@ static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
        *pages = NULL;
 
        do {
-               pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY);
+               pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
                if (unlikely(!pl)) {
                        /* Use reserved pages */
                        pl = kc->pages;
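This conversion, like the solo6x10, tw68 and vmlfb hunks below, adds __GFP_KSWAPD_RECLAIM to masks that previously just omitted __GFP_WAIT; under the old flags such allocations still woke kswapd, so the series spells the bit out to keep that behaviour while still refusing to enter direct reclaim. A hedged one-liner showing the intent (function name invented for the example):

static struct page *example_opportunistic_alloc(void)
{
        /* Fail fast (no direct reclaim) but still kick background reclaim,
         * matching what a mask without __GFP_WAIT used to do. */
        return alloc_page(__GFP_NOWARN | __GFP_NORETRY | __GFP_KSWAPD_RECLAIM);
}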
index 1bd2fd47421fc43e02b7ad39c28546e55b3ea69e..4432fd69b7cbf86db946fb9aa6bfa753cb21434c 100644 (file)
@@ -1297,7 +1297,7 @@ static struct solo_enc_dev *solo_enc_alloc(struct solo_dev *solo_dev,
        solo_enc->vidq.ops = &solo_enc_video_qops;
        solo_enc->vidq.mem_ops = &vb2_dma_sg_memops;
        solo_enc->vidq.drv_priv = solo_enc;
-       solo_enc->vidq.gfp_flags = __GFP_DMA32;
+       solo_enc->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        solo_enc->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
        solo_enc->vidq.buf_struct_size = sizeof(struct solo_vb2_buf);
        solo_enc->vidq.lock = &solo_enc->lock;
index 26df903585d7dd02a54077bc24bbba8acda87b40..f7ce493b1feed72c27dd4f9d58e1ebe15bec5708 100644 (file)
@@ -678,7 +678,7 @@ int solo_v4l2_init(struct solo_dev *solo_dev, unsigned nr)
        solo_dev->vidq.mem_ops = &vb2_dma_contig_memops;
        solo_dev->vidq.drv_priv = solo_dev;
        solo_dev->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
-       solo_dev->vidq.gfp_flags = __GFP_DMA32;
+       solo_dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        solo_dev->vidq.buf_struct_size = sizeof(struct solo_vb2_buf);
        solo_dev->vidq.lock = &solo_dev->lock;
        ret = vb2_queue_init(&solo_dev->vidq);
index 4c3293dcddbcde3cd6b37584c6665d870fef1bc8..46642ef9151b644413c3de4e33ee5e9279d2107d 100644 (file)
@@ -979,7 +979,7 @@ int tw68_video_init2(struct tw68_dev *dev, int video_nr)
        dev->vidq.ops = &tw68_video_qops;
        dev->vidq.mem_ops = &vb2_dma_sg_memops;
        dev->vidq.drv_priv = dev;
-       dev->vidq.gfp_flags = __GFP_DMA32;
+       dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        dev->vidq.buf_struct_size = sizeof(struct tw68_buf);
        dev->vidq.lock = &dev->lock;
        dev->vidq.min_buffers_needed = 2;
index 340b44d9e8cf7c634685fd2afe84fddfc136d14b..fa0aafebc672fc7722320da4a4716bf542d88dda 100644 (file)
@@ -238,7 +238,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
 
        mq->mmr_blade = uv_cpu_to_blade_id(cpu);
 
-       nid = cpu_to_node(cpu);
+       nid = cpu_to_mem(cpu);
        page = __alloc_pages_node(nid,
                                      GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
                                      pg_order);
index 89300870fefb97a66291b96d52fca3aeb3714259..1e688bfec56728c3d00ebc353031c26fde29f187 100644 (file)
@@ -75,7 +75,7 @@ MODULE_LICENSE("GPL");
 
 /*
  * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
- * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use
+ * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
  * __GFP_NOWARN, to suppress page allocation failure warnings.
  */
 #define VMW_PAGE_ALLOC_NOSLEEP         (__GFP_HIGHMEM|__GFP_NOWARN)
index b1eea48c501d11cdb58e862ac73a0b29eb015aba..a9a15c22ef24649ff4ef82385a1128dfef8d7818 100644 (file)
@@ -1215,8 +1215,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  */
 void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
 {
-       gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
-                      __GFP_NORETRY | __GFP_NO_KSWAPD;
+       gfp_t flags = __GFP_NOWARN | __GFP_DIRECT_RECLAIM | __GFP_NORETRY;
        size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
        void *kbuf;
 
index 44173be5cbf0d914111304ba2420954f44d5a7e7..f8d7a2f06950139b936dc7d793bb884bb579980d 100644 (file)
@@ -691,7 +691,7 @@ static void *bnx2x_frag_alloc(const struct bnx2x_fastpath *fp, gfp_t gfp_mask)
 {
        if (fp->rx_frag_size) {
                /* GFP_KERNEL allocations are used only during initialization */
-               if (unlikely(gfp_mask & __GFP_WAIT))
+               if (unlikely(gfpflags_allow_blocking(gfp_mask)))
                        return (void *)__get_free_page(gfp_mask);
 
                return netdev_alloc_frag(fp->rx_frag_size);
index c4bb85934aa2692cd6f46a12666ee2b6eff1c2e6..34fae2804e0b4e75a8d6159677a9506aefca343c 100644 (file)
@@ -1025,11 +1025,11 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
        req->special = (void *)0;
 
        if (buffer && bufflen) {
-               ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+               ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_RECLAIM);
                if (ret)
                        goto out;
        } else if (ubuffer && bufflen) {
-               ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+               ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_RECLAIM);
                if (ret)
                        goto out;
                bio = req->bio;
index 66a96cd98b975dcdbd5429cf02f069b52771fa84..984ddcb4786d6074c94a2b0387c25362a4f4c2c4 100644 (file)
@@ -1970,7 +1970,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
        struct request *req;
 
        /*
-        * blk_get_request with GFP_KERNEL (__GFP_WAIT) sleeps until a
+        * blk_get_request with GFP_KERNEL (__GFP_RECLAIM) sleeps until a
         * request becomes available
         */
        req = blk_get_request(sdev->request_queue, READ, GFP_KERNEL);
index 126a48c6431e5a5d9798aed3472916b06ef476c8..dd8ad2a44510cae1b97260b7d64fc65e7770db53 100644 (file)
@@ -222,13 +222,13 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
        int write = (data_direction == DMA_TO_DEVICE);
        int ret = DRIVER_ERROR << 24;
 
-       req = blk_get_request(sdev->request_queue, write, __GFP_WAIT);
+       req = blk_get_request(sdev->request_queue, write, __GFP_RECLAIM);
        if (IS_ERR(req))
                return ret;
        blk_rq_set_block_pc(req);
 
        if (bufflen &&  blk_rq_map_kern(sdev->request_queue, req,
-                                       buffer, bufflen, __GFP_WAIT))
+                                       buffer, bufflen, __GFP_RECLAIM))
                goto out;
 
        req->cmd_len = COMMAND_SIZE(cmd[0]);
index ada724aab3d586ebb81193909a2518a1e68872fb..d4c3e5512dd54dbcf0b3f6d9a927a1e8bbb3fc5a 100644 (file)
@@ -27,7 +27,7 @@
 #include "ion_priv.h"
 
 static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN |
-                                    __GFP_NORETRY) & ~__GFP_WAIT;
+                                    __GFP_NORETRY) & ~__GFP_DIRECT_RECLAIM;
 static gfp_t low_order_gfp_flags  = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN);
 static const unsigned int orders[] = {8, 4, 0};
 static const int num_orders = ARRAY_SIZE(orders);
index 6af733de69caddbf7672b80e75e1827304e7b237..f0b0423a716bd09252b74f5f9d22506fddc7c4dd 100644 (file)
@@ -95,7 +95,7 @@ do {                                                              \
 do {                                                                       \
        LASSERT(!in_interrupt() ||                                          \
                ((size) <= LIBCFS_VMALLOC_SIZE &&                           \
-                ((mask) & __GFP_WAIT) == 0));                              \
+                !gfpflags_allow_blocking(mask)));                          \
 } while (0)
 
 #define LIBCFS_ALLOC_POST(ptr, size)                                       \
index 47a1202fcbdf5117e66a8c78da95c80bbffdf459..8666f3ad24e9960bfb85b6814de6c277ae37b522 100644 (file)
@@ -1560,7 +1560,7 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
-       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
 
        /*
         * The minimum size of the eager buffers is a groups of MTU-sized
index 5d9b9dbd8fc44804fbf7a4256782ecc7d4f7630f..13c3cd11ab92a5c610ef7a5e32795a0b0a0931c9 100644 (file)
@@ -905,7 +905,7 @@ static int ipath_create_user_egr(struct ipath_portdata *pd)
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
-       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
 
        egrcnt = dd->ipath_rcvegrcnt;
        /* TID number offset for this port */
index c69b650b7bba67ef5f1a885027b17f36f44dd3d7..223ccf89d2263fb6483a1a22116a3743c9d7ed8e 100644 (file)
@@ -2345,7 +2345,6 @@ static void fsg_disable(struct usb_function *f)
 
 static void handle_exception(struct fsg_common *common)
 {
-       siginfo_t               info;
        int                     i;
        struct fsg_buffhd       *bh;
        enum fsg_state          old_state;
@@ -2357,8 +2356,7 @@ static void handle_exception(struct fsg_common *common)
         * into a high-priority EXIT exception.
         */
        for (;;) {
-               int sig =
-                       dequeue_signal_lock(current, &current->blocked, &info);
+               int sig = kernel_dequeue_signal(NULL);
                if (!sig)
                        break;
                if (sig != SIGUSR1) {
index 0a94895a358d47e8e51cf32722afbdce0d6104d9..692ccc69345e4a9998246a53b6af817a56a5d435 100644 (file)
@@ -2244,7 +2244,7 @@ static int u132_urb_enqueue(struct usb_hcd *hcd, struct urb *urb,
 {
        struct u132 *u132 = hcd_to_u132(hcd);
        if (irqs_disabled()) {
-               if (__GFP_WAIT & mem_flags) {
+               if (gfpflags_allow_blocking(mem_flags)) {
                        printk(KERN_ERR "invalid context for function that might sleep\n");
                        return -EINVAL;
                }
index 6b70d7f62b2fa43aa9f286d51b6aa8983fd08353..1c1e95a0b8faa04006c857158c79ac11bc904d96 100644 (file)
@@ -99,7 +99,7 @@ static int vmlfb_alloc_vram_area(struct vram_area *va, unsigned max_order,
                 * below the first 16MB.
                 */
 
-               flags = __GFP_DMA | __GFP_HIGH;
+               flags = __GFP_DMA | __GFP_HIGH | __GFP_KSWAPD_RECLAIM;
                va->logical =
                         __get_free_pages(flags, --max_order);
        } while (va->logical == 0 && max_order > min_order);
index f23fd86697ea5ed4234ff96a8dd236b8884e6026..7bf835f85bc822ef1119b639be82619af066d326 100644 (file)
@@ -231,7 +231,8 @@ out_unlock:
        if (res < 0 && fl->fl_type != F_UNLCK) {
                fl_type = fl->fl_type;
                fl->fl_type = F_UNLCK;
-               res = locks_lock_file_wait(filp, fl);
+               /* Even if this fails we want to return the remote error */
+               locks_lock_file_wait(filp, fl);
                fl->fl_type = fl_type;
        }
 out:
index 57ee8ca29b0601060fae924f43b7897f2c4c7c7c..36dfeff2c1f443aade91be4ccbce56bec9317008 100644 (file)
@@ -482,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                        goto next;
                }
 
-               page = __page_cache_alloc(mapping_gfp_mask(mapping) &
-                                                               ~__GFP_FS);
+               page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+                                                                ~__GFP_FS));
                if (!page)
                        break;
 
-               if (add_to_page_cache_lru(page, mapping, pg_index,
-                                                               GFP_NOFS)) {
+               if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
                        page_cache_release(page);
                        goto next;
                }
index 938efe33be809240bc19bdf81d854c623191a3d8..eb90f0f1a12428827ad84a52fcb32f44ee3d289d 100644 (file)
@@ -3316,7 +3316,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 
 static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 {
-       return mapping_gfp_mask(mapping) & ~__GFP_FS;
+       return mapping_gfp_constraint(mapping, ~__GFP_FS);
 }
 
 /* extent-tree.c */
index 1e60d00d4ea7c42104614ede9e203a1f56e6408a..c339d561e59654935378fb8c23593f418aee2c8f 100644 (file)
@@ -2572,7 +2572,7 @@ int open_ctree(struct super_block *sb,
        fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
        fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
        /* readahead state */
-       INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+       INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        spin_lock_init(&fs_info->reada_lock);
 
        fs_info->thread_pool_size = min_t(unsigned long,
index 3915c9473e9445d4aeada81c8fb96af7fb521f2c..032abfbebe76e6e4ba41e25d7c9326079ed71f68 100644 (file)
@@ -594,7 +594,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                clear = 1;
 again:
-       if (!prealloc && (mask & __GFP_WAIT)) {
+       if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Don't care for allocation failure here because we might end
                 * up not needing the pre-allocated extent state at all, which
@@ -718,7 +718,7 @@ search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
-       if (mask & __GFP_WAIT)
+       if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
 }
@@ -850,7 +850,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 
        bits |= EXTENT_FIRST_DELALLOC;
 again:
-       if (!prealloc && (mask & __GFP_WAIT)) {
+       if (!prealloc && gfpflags_allow_blocking(mask)) {
                prealloc = alloc_extent_state(mask);
                BUG_ON(!prealloc);
        }
@@ -1028,7 +1028,7 @@ search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
-       if (mask & __GFP_WAIT)
+       if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
 }
@@ -1076,7 +1076,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        btrfs_debug_check_extent_io_range(tree, start, end);
 
 again:
-       if (!prealloc && (mask & __GFP_WAIT)) {
+       if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Best effort, don't worry if extent state allocation fails
                 * here for the first iteration. We might have a cached state
@@ -1253,7 +1253,7 @@ search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
-       if (mask & __GFP_WAIT)
+       if (gfpflags_allow_blocking(mask))
                cond_resched();
        first_iteration = false;
        goto again;
@@ -4319,7 +4319,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
        u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
 
-       if ((mask & __GFP_WAIT) &&
+       if (gfpflags_allow_blocking(mask) &&
            page->mapping->host->i_size > 16 * 1024 * 1024) {
                u64 len;
                while (start <= end) {
index abe3a66bd3ba6d31f9fb072c6d4d6c6be42c174e..ed05da1b977e59f6ca54ecfe95be81e85bc631ba 100644 (file)
@@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
        }
 
        mapping_set_gfp_mask(inode->i_mapping,
-                       mapping_gfp_mask(inode->i_mapping) &
-                       ~(__GFP_FS | __GFP_HIGHMEM));
+                       mapping_gfp_constraint(inode->i_mapping,
+                       ~(__GFP_FS | __GFP_HIGHMEM)));
 
        return inode;
 }
index 6fc735869c186c35fb79fa66decc7d3519ed2e93..e023919b447064cbc6fad6984065731cbbde68c3 100644 (file)
@@ -156,8 +156,8 @@ static struct btrfs_device *__alloc_device(void)
        spin_lock_init(&dev->reada_lock);
        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
-       INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
-       INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+       INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+       INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 
        return dev;
 }
index 82283abb2795d95bd31d595809e7881ca43942a3..51aff0296ce2435189aaa71b68a18219e0e34578 100644 (file)
@@ -999,7 +999,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
        int ret = 0;            /* Will call free_more_memory() */
        gfp_t gfp_mask;
 
-       gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+       gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
 
        /*
         * XXX: __getblk_slow() can not really deal with failure and
index aecd0859eacbf28d9b3782527c178f8e05000e7c..9c4b737a54df64bb15df446c5c359e594ce5b3a3 100644 (file)
@@ -30,7 +30,7 @@ extern unsigned cachefiles_debug;
 #define CACHEFILES_DEBUG_KLEAVE        2
 #define CACHEFILES_DEBUG_KDEBUG        4
 
-#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC)
 
 /*
  * node records
index 9d23e788d1dfdab235d1edd0f8d1d3e065904e65..b7d218a168fb81c2c028ea38ffccf2a17a1d68d5 100644 (file)
@@ -1283,8 +1283,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                int ret1;
                struct address_space *mapping = inode->i_mapping;
                struct page *page = find_or_create_page(mapping, 0,
-                                               mapping_gfp_mask(mapping) &
-                                               ~__GFP_FS);
+                                               mapping_gfp_constraint(mapping,
+                                               ~__GFP_FS));
                if (!page) {
                        ret = VM_FAULT_OOM;
                        goto out;
@@ -1428,7 +1428,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
                if (i_size_read(inode) == 0)
                        return;
                page = find_or_create_page(mapping, 0,
-                                          mapping_gfp_mask(mapping) & ~__GFP_FS);
+                                          mapping_gfp_constraint(mapping,
+                                          ~__GFP_FS));
                if (!page)
                        return;
                if (PageUptodate(page)) {
index 47c5c97e2dd31c20663f1fa0584da2c3f87722bd..0a2752b79e72cc2b7a083894843a8b3ae1dea23d 100644 (file)
@@ -3380,7 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
        struct page *page, *tpage;
        unsigned int expected_index;
        int rc;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+       gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
 
        INIT_LIST_HEAD(tmplist);
 
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
         * should have access to this page, we're safe to simply set
         * PG_locked without checking it first.
         */
-       __set_page_locked(page);
+       __SetPageLocked(page);
        rc = add_to_page_cache_locked(page, mapping,
                                      page->index, gfp);
 
        /* give up if we can't stick it in the cache */
        if (rc) {
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
                return rc;
        }
 
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
                if (*bytes + PAGE_CACHE_SIZE > rsize)
                        break;
 
-               __set_page_locked(page);
+               __SetPageLocked(page);
                if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
-                       __clear_page_locked(page);
+                       __ClearPageLocked(page);
                        break;
                }
                list_move_tail(&page->lru, tmplist);
index 53d7d46c55c82c58c321895225faaad8e9a2e6e5..b3c153ca435d24fdbdfcb909228b6b4787bb63f2 100644 (file)
@@ -282,23 +282,24 @@ out:
        return ispipe;
 }
 
-static int zap_process(struct task_struct *start, int exit_code)
+static int zap_process(struct task_struct *start, int exit_code, int flags)
 {
        struct task_struct *t;
        int nr = 0;
 
+       /* ignore all signals except SIGKILL, see prepare_signal() */
+       start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
        start->signal->group_exit_code = exit_code;
        start->signal->group_stop_count = 0;
 
-       t = start;
-       do {
+       for_each_thread(start, t) {
                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                if (t != current && t->mm) {
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
                        nr++;
                }
-       } while_each_thread(start, t);
+       }
 
        return nr;
 }
@@ -313,10 +314,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
        spin_lock_irq(&tsk->sighand->siglock);
        if (!signal_group_exit(tsk->signal)) {
                mm->core_state = core_state;
-               nr = zap_process(tsk, exit_code);
                tsk->signal->group_exit_task = tsk;
-               /* ignore all signals except SIGKILL, see prepare_signal() */
-               tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+               nr = zap_process(tsk, exit_code, 0);
                clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
        }
        spin_unlock_irq(&tsk->sighand->siglock);
@@ -362,18 +361,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
                        continue;
                if (g->flags & PF_KTHREAD)
                        continue;
-               p = g;
-               do {
-                       if (p->mm) {
-                               if (unlikely(p->mm == mm)) {
-                                       lock_task_sighand(p, &flags);
-                                       nr += zap_process(p, exit_code);
-                                       p->signal->flags = SIGNAL_GROUP_EXIT;
-                                       unlock_task_sighand(p, &flags);
-                               }
-                               break;
+
+               for_each_thread(g, p) {
+                       if (unlikely(!p->mm))
+                               continue;
+                       if (unlikely(p->mm == mm)) {
+                               lock_task_sighand(p, &flags);
+                               nr += zap_process(p, exit_code,
+                                                       SIGNAL_GROUP_EXIT);
+                               unlock_task_sighand(p, &flags);
                        }
-               } while_each_thread(g, p);
+                       break;
+               }
        }
        rcu_read_unlock();
 done:
index 3ae0e0427191c7849fc70301e58792515f5f181c..18e7554cf94cac57d1eb3dc17ba4001e75cef5f6 100644 (file)
@@ -361,7 +361,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 
        /*
         * bio_alloc() is guaranteed to return a bio when called with
-        * __GFP_WAIT and we request a valid number of vectors.
+        * __GFP_RECLAIM and we request a valid number of vectors.
         */
        bio = bio_alloc(GFP_KERNEL, nr_vecs);
 
index 8850254136ae31fea8c4398a43a21fcb0d21f9cd..7002467bfbace8a24118236a49a9277ecdceaae3 100644 (file)
@@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        }
 
        if (!journal) {
-               ret = generic_file_fsync(file, start, end, datasync);
+               if (test_opt(inode->i_sb, BARRIER))
+                       ret = generic_file_fsync(file, start, end, datasync);
+               else
+                       ret = __generic_file_fsync(file, start, end, datasync);
                if (!ret && !hlist_empty(&inode->i_dentry))
                        ret = ext4_sync_parent(inode);
                goto out;
index e8d620a484f6a86bb684ce432888c329a429e948..7d1aad1d9313155f3780cde3923710fe7f60ea1c 100644 (file)
@@ -3386,7 +3386,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
        int err = 0;
 
        page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
-                                  mapping_gfp_mask(mapping) & ~__GFP_FS);
+                                  mapping_gfp_constraint(mapping, ~__GFP_FS));
        if (!page)
                return -ENOMEM;
 
index d94af71a4e7fcabd1783f28cccb86363473fe3ae..5dc5e95063de2a7e42749a94464f00f7c50be4b8 100644 (file)
@@ -166,7 +166,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
                        page = list_entry(pages->prev, struct page, lru);
                        list_del(&page->lru);
                        if (add_to_page_cache_lru(page, mapping, page->index,
-                                       GFP_KERNEL & mapping_gfp_mask(mapping)))
+                                 mapping_gfp_constraint(mapping, GFP_KERNEL)))
                                goto next_page;
                }
 
index 04d0f1b334096525030674c6d818e124a1bd81f3..753f4e68b820da0dd78fc7a7e3a66e529846ea0b 100644 (file)
@@ -1061,7 +1061,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
                return 0;
        if (journal)
                return jbd2_journal_try_to_free_buffers(journal, page,
-                                                       wait & ~__GFP_WAIT);
+                                               wait & ~__GFP_DIRECT_RECLAIM);
        return try_to_free_buffers(page);
 }
 
index 93fc62232ec21e795efc938edad43a22d55cac52..5d384921524d9722c96cd14c9fdcf0a3aab05851 100644 (file)
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
        return dclus;
 }
 
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-            unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+                          sector_t last_block,
+                          unsigned long *mapped_blocks, sector_t *bmap)
 {
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
+       int cluster, offset;
+
+       cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+       offset  = sector & (sbi->sec_per_clus - 1);
+       cluster = fat_bmap_cluster(inode, cluster);
+       if (cluster < 0)
+               return cluster;
+       else if (cluster) {
+               *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+               *mapped_blocks = sbi->sec_per_clus - offset;
+               if (*mapped_blocks > last_block - sector)
+                       *mapped_blocks = last_block - sector;
+       }
+
+       return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+                        sector_t *last_block, int create)
+{
+       struct super_block *sb = inode->i_sb;
        const unsigned long blocksize = sb->s_blocksize;
        const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+       *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+       if (sector >= *last_block) {
+               if (!create)
+                       return 1;
+
+               /*
+                * ->mmu_private can access on only allocation path.
+                * (caller must hold ->i_mutex)
+                */
+               *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+                       >> blocksize_bits;
+               if (sector >= *last_block)
+                       return 1;
+       }
+
+       return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+            unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+       struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        sector_t last_block;
-       int cluster, offset;
 
        *phys = 0;
        *mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
                return 0;
        }
 
-       last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
-       if (sector >= last_block) {
-               if (!create)
+       if (!from_bmap) {
+               if (is_exceed_eof(inode, sector, &last_block, create))
                        return 0;
-
-               /*
-                * ->mmu_private can access on only allocation path.
-                * (caller must hold ->i_mutex)
-                */
-               last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
-                       >> blocksize_bits;
+       } else {
+               last_block = inode->i_blocks >>
+                               (inode->i_sb->s_blocksize_bits - 9);
                if (sector >= last_block)
                        return 0;
        }
 
-       cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
-       offset  = sector & (sbi->sec_per_clus - 1);
-       cluster = fat_bmap_cluster(inode, cluster);
-       if (cluster < 0)
-               return cluster;
-       else if (cluster) {
-               *phys = fat_clus_to_blknr(sbi, cluster) + offset;
-               *mapped_blocks = sbi->sec_per_clus - offset;
-               if (*mapped_blocks > last_block - sector)
-                       *mapped_blocks = last_block - sector;
-       }
-       return 0;
+       return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+                                     phys);
 }
index 4afc4d9d2e4127debe45cb51b5f1fc17d16fee4d..4c71c8c764263a2d5e18ac87917d12f0ac54c135 100644 (file)
@@ -91,7 +91,7 @@ next:
 
        *bh = NULL;
        iblock = *pos >> sb->s_blocksize_bits;
-       err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+       err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
        if (err || !phys)
                return -1;      /* beyond EOF or error */
 
index be5e15323bab45b99e4693b43cb543c9aa85c586..4307cd4f8da096e8061b68128bf32c79ffa69daa 100644 (file)
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
 extern void fat_cache_inval_inode(struct inode *inode);
 extern int fat_get_cluster(struct inode *inode, int cluster,
                           int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+                                 sector_t last_block,
+                                 unsigned long *mapped_blocks, sector_t *bmap);
 extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-                   unsigned long *mapped_blocks, int create);
+                   unsigned long *mapped_blocks, int create, bool from_bmap);
 
 /* fat/dir.c */
 extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
 {
        return hash_32(logstart, FAT_HASH_BITS);
 }
+extern int fat_add_cluster(struct inode *inode);
 
 /* fat/misc.c */
 extern __printf(3, 4) __cold
index a08f1039909a76e6427cf9e64ddf16c30a1716c9..43d3475da83a79c8857e0861bf298df38a9d7ba7 100644 (file)
 #include <linux/backing-dev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
+#include <linux/falloc.h>
 #include "fat.h"
 
+static long fat_fallocate(struct file *file, int mode,
+                         loff_t offset, loff_t len);
+
 static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
 {
        u32 attr;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
 #endif
        .fsync          = fat_file_fsync,
        .splice_read    = generic_file_splice_read,
+       .fallocate      = fat_fallocate,
 };
 
 static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
        return err;
 }
 
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from the sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set,
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+                         loff_t offset, loff_t len)
+{
+       int nr_cluster; /* Number of clusters to be allocated */
+       loff_t mm_bytes; /* Number of bytes to be allocated for file */
+       loff_t ondisksize; /* block aligned on-disk size in bytes */
+       struct inode *inode = file->f_mapping->host;
+       struct super_block *sb = inode->i_sb;
+       struct msdos_sb_info *sbi = MSDOS_SB(sb);
+       int err = 0;
+
+       /* No support for hole punch or other fallocate flags. */
+       if (mode & ~FALLOC_FL_KEEP_SIZE)
+               return -EOPNOTSUPP;
+
+       /* No support for dir */
+       if (!S_ISREG(inode->i_mode))
+               return -EOPNOTSUPP;
+
+       mutex_lock(&inode->i_mutex);
+       if (mode & FALLOC_FL_KEEP_SIZE) {
+               ondisksize = inode->i_blocks << 9;
+               if ((offset + len) <= ondisksize)
+                       goto error;
+
+               /* First compute the number of clusters to be allocated */
+               mm_bytes = offset + len - ondisksize;
+               nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+                       sbi->cluster_bits;
+
+               /* Start the allocation. We are not zeroing out the clusters */
+               while (nr_cluster-- > 0) {
+                       err = fat_add_cluster(inode);
+                       if (err)
+                               goto error;
+               }
+       } else {
+               if ((offset + len) <= i_size_read(inode))
+                       goto error;
+
+               /* This is just an expanding truncate */
+               err = fat_cont_expand(inode, (offset + len));
+       }
+
+error:
+       mutex_unlock(&inode->i_mutex);
+       return err;
+}
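
    As a usage sketch (userspace, not part of this patch), the two branches above
    correspond to fallocate(2) with and without FALLOC_FL_KEEP_SIZE on a file that
    lives on a FAT mount; the path is hypothetical:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/fat/testfile", O_RDWR | O_CREAT, 0644);
            if (fd < 0)
                    return 1;

            /* Preallocate 1 MiB of clusters without changing i_size
             * (handled by the FALLOC_FL_KEEP_SIZE branch above). */
            if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
                    perror("fallocate(KEEP_SIZE)");

            /* Without the flag this becomes an expanding, zeroing truncate. */
            if (fallocate(fd, 0, 0, 1 << 20) < 0)
                    perror("fallocate");

            close(fd);
            return 0;
    }
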
+
 /* Free all clusters after the skip'th cluster. */
 static int fat_free(struct inode *inode, int skip)
 {
index 509411dd3698959c0b8c387dcf0f487b5da12abe..d04c87da425535c834805a3fc27bd99ed5ebe1ec 100644 (file)
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
 },
 };
 
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
 {
        int err, cluster;
 
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        unsigned long mapped_blocks;
-       sector_t phys;
+       sector_t phys, last_block;
        int err, offset;
 
-       err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+       err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
        if (err)
                return err;
        if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
                return -EIO;
        }
 
+       last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
        offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
-       if (!offset) {
+       /*
+        * Allocate a new cluster only when both of the following hold:
+        * 1) there are no more available blocks, and
+        * 2) the block is not part of an already fallocated region.
+        */
+       if (!offset && !(iblock < last_block)) {
                /* TODO: multiple cluster allocation would be desirable. */
                err = fat_add_cluster(inode);
                if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
        *max_blocks = min(mapped_blocks, *max_blocks);
        MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
 
-       err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+       err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
        if (err)
                return err;
 
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
        return ret;
 }
 
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+               struct buffer_head *bh_result, int create)
+{
+       struct super_block *sb = inode->i_sb;
+       unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+       int err;
+       sector_t bmap;
+       unsigned long mapped_blocks;
+
+       BUG_ON(create != 0);
+
+       err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+       if (err)
+               return err;
+
+       if (bmap) {
+               map_bh(bh_result, sb, bmap);
+               max_blocks = min(mapped_blocks, max_blocks);
+       }
+
+       bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+       return 0;
+}
+
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
        sector_t blocknr;
 
        /* fat_get_cluster() assumes the requested blocknr isn't truncated. */
        down_read(&MSDOS_I(mapping->host)->truncate_lock);
-       blocknr = generic_block_bmap(mapping, block, fat_get_block);
+       blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
        up_read(&MSDOS_I(mapping->host)->truncate_lock);
 
        return blocknr;
@@ -553,13 +584,43 @@ out:
 
 EXPORT_SYMBOL_GPL(fat_build_inode);
 
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+       /* Release unwritten fallocated blocks on inode eviction. */
+       if ((inode->i_blocks << 9) >
+                       round_up(MSDOS_I(inode)->mmu_private,
+                               MSDOS_SB(inode->i_sb)->cluster_size)) {
+               int err;
+
+               fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+               /* Fallocate results in updating the i_start/i_logstart
+                * for the zero byte file. So, make it return to the
+                * original state during evict and commit it to avoid
+                * any corruption on the next access to the cluster
+                * chain for the file.
+                */
+               err = __fat_write_inode(inode, inode_needs_sync(inode));
+               if (err) {
+                       fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+                                       "update on disk inode for unused "
+                                       "fallocated blocks, inode could be "
+                                       "corrupted. Please run fsck");
+               }
+
+       }
+}
+
 static void fat_evict_inode(struct inode *inode)
 {
        truncate_inode_pages_final(&inode->i_data);
        if (!inode->i_nlink) {
                inode->i_size = 0;
                fat_truncate_blocks(inode, 0);
-       }
+       } else
+               fat_free_eofblocks(inode);
+
        invalidate_inode_buffers(inode);
        clear_inode(inode);
        fat_cache_inval_inode(inode);
index 7378169e90be6ed485ac48d0cf633c8e37c4c3d2..206a68b1db1ab1b1f5a6ed6a611723f957e1e58b 100644 (file)
@@ -2149,7 +2149,12 @@ static void wait_sb_inodes(struct super_block *sb)
                iput(old_inode);
                old_inode = inode;
 
-               filemap_fdatawait(mapping);
+               /*
+                * We keep the error status of individual mapping so that
+                * applications can catch the writeback error using fsync(2).
+                * See filemap_fdatawait_keep_errors() for details.
+                */
+               filemap_fdatawait_keep_errors(mapping);
 
                cond_resched();
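
    The point of filemap_fdatawait_keep_errors() above is that a sync-wide
    writeback wait no longer clears a mapping's error bits, so an application
    that later calls fsync(2) on the same file still sees the failure. A minimal
    userspace illustration (the path is hypothetical):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/data/logfile", O_WRONLY | O_CREAT | O_APPEND, 0644);
            if (fd < 0)
                    return 1;

            if (write(fd, "record\n", 7) != 7)
                    perror("write");

            /* If writeback of this mapping failed (even when triggered by a
             * system-wide sync elsewhere), the error is preserved and
             * reported here. */
            if (fsync(fd) < 0)
                    fprintf(stderr, "fsync failed: %s\n", strerror(errno));

            close(fd);
            return 0;
    }
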
 
index d403c69bee0829fba9187114bc318dd810e8d179..4304072161aa08c14d24291bf24eb2481c567874 100644 (file)
@@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
 
        /* radix tree insertion won't use the preallocation pool unless it's
         * told it may not wait */
-       INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
+       INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 
        switch (cookie->def->type) {
        case FSCACHE_COOKIE_TYPE_INDEX:
index 483bbc613bf04528cbc66d5d54a5b54b6a8dc85b..79483b3d8c6f273f9a8e8a5c0c997efa949d48ff 100644 (file)
@@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
 
 /*
  * decide whether a page can be released, possibly by cancelling a store to it
- * - we're allowed to sleep if __GFP_WAIT is flagged
+ * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
  */
 bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
                                  struct page *page,
@@ -122,7 +122,7 @@ page_busy:
         * allocator as the work threads writing to the cache may all end up
         * sleeping on memory allocation, so we may need to impose a timeout
         * too. */
-       if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
+       if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) {
                fscache_stat(&fscache_n_store_vmscan_busy);
                return false;
        }
@@ -132,7 +132,7 @@ page_busy:
                _debug("fscache writeout timeout page: %p{%lx}",
                        page, page->index);
 
-       gfp &= ~__GFP_WAIT;
+       gfp &= ~__GFP_DIRECT_RECLAIM;
        goto try_again;
 }
 EXPORT_SYMBOL(__fscache_maybe_release_page);
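
    These fscache conversions follow the series-wide split of __GFP_WAIT: a
    caller that may enter direct reclaim (and hence sleep) now carries
    __GFP_DIRECT_RECLAIM, and code that only needs to know "may I block?" can use
    gfpflags_allow_blocking(), as the NFS hunk further below does. A standalone
    sketch of that check (the bit values below are illustrative stand-ins, not
    the real kernel constants):

    #include <stdbool.h>
    #include <stdio.h>

    #define __GFP_DIRECT_RECLAIM  0x400u
    #define __GFP_KSWAPD_RECLAIM  0x800u
    #define __GFP_RECLAIM         (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)

    typedef unsigned int gfp_t;

    /* Mirrors the idea of gfpflags_allow_blocking(): only direct reclaim
     * implies that the allocation may sleep. */
    static bool allow_blocking(gfp_t gfp)
    {
            return (gfp & __GFP_DIRECT_RECLAIM) != 0;
    }

    int main(void)
    {
            gfp_t atomic_like = __GFP_KSWAPD_RECLAIM;  /* e.g. an atomic context */
            gfp_t nofs_like   = __GFP_RECLAIM;         /* e.g. a sleepable context */

            printf("atomic-like may block: %d\n", allow_blocking(atomic_like));
            printf("sleepable   may block: %d\n", allow_blocking(nofs_like));
            return 0;
    }
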
index 316adb968b6588faca5d64cf57005f48e067bb08..f25b72f4b2a7fc6958b2cbf6b2bce7021ce918bf 100644 (file)
@@ -324,11 +324,44 @@ static void remove_huge_page(struct page *page)
        delete_from_page_cache(page);
 }
 
+static inline void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+       struct vm_area_struct *vma;
+
+       /*
+        * end == 0 indicates that the entire range after
+        * start should be unmapped.
+        */
+       vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+               unsigned long v_offset;
+
+               /*
+                * Can the expression below overflow on 32-bit arches?
+                * No, because the interval tree returns us only those vmas
+                * which overlap the truncated area starting at pgoff,
+                * and no vma on a 32-bit arch can span beyond the 4GB.
+                */
+               if (vma->vm_pgoff < start)
+                       v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+               else
+                       v_offset = 0;
+
+               if (end) {
+                       end = ((end - start) << PAGE_SHIFT) +
+                              vma->vm_start + v_offset;
+                       if (end > vma->vm_end)
+                               end = vma->vm_end;
+               } else
+                       end = vma->vm_end;
+
+               unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
+       }
+}
 
 /*
  * remove_inode_hugepages handles two distinct cases: truncation and hole
  * punch.  There are subtle differences in operation for each case.
-
  * truncation is indicated by end of range being LLONG_MAX
  *     In this case, we first scan the range and release found pages.
  *     After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -381,12 +414,25 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];
                        u32 hash;
+                       bool rsv_on_error;
 
                        hash = hugetlb_fault_mutex_hash(h, current->mm,
                                                        &pseudo_vma,
                                                        mapping, next, 0);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
+                       /*
+                        * If page is mapped, it was faulted in after being
+                        * unmapped in caller.  Unmap (again) now after taking
+                        * the fault mutex.  The mutex will prevent faults
+                        * until we finish removing the page.
+                        */
+                       if (page_mapped(page)) {
+                               hugetlb_vmdelete_list(&mapping->i_mmap,
+                                       next * pages_per_huge_page(h),
+                                       (next + 1) * pages_per_huge_page(h));
+                       }
+
                        lock_page(page);
                        if (page->index >= end) {
                                unlock_page(page);
@@ -396,31 +442,23 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                        }
 
                        /*
-                        * If page is mapped, it was faulted in after being
-                        * unmapped.  Do nothing in this race case.  In the
-                        * normal case page is not mapped.
+                        * We must free the huge page and remove from page
+                        * cache (remove_huge_page) BEFORE removing the
+                        * region/reserve map (hugetlb_unreserve_pages).
+                        * In rare out of memory conditions, removal of the
+                        * region/reserve map could fail.  Before free'ing
+                        * the page, note PagePrivate which is used in case
+                        * of error.
                         */
-                       if (!page_mapped(page)) {
-                               bool rsv_on_error = !PagePrivate(page);
-                               /*
-                                * We must free the huge page and remove
-                                * from page cache (remove_huge_page) BEFORE
-                                * removing the region/reserve map
-                                * (hugetlb_unreserve_pages).  In rare out
-                                * of memory conditions, removal of the
-                                * region/reserve map could fail.  Before
-                                * free'ing the page, note PagePrivate which
-                                * is used in case of error.
-                                */
-                               remove_huge_page(page);
-                               freed++;
-                               if (!truncate_op) {
-                                       if (unlikely(hugetlb_unreserve_pages(
-                                                       inode, next,
-                                                       next + 1, 1)))
-                                               hugetlb_fix_reserve_counts(
-                                                       inode, rsv_on_error);
-                               }
+                       rsv_on_error = !PagePrivate(page);
+                       remove_huge_page(page);
+                       freed++;
+                       if (!truncate_op) {
+                               if (unlikely(hugetlb_unreserve_pages(inode,
+                                                               next, next + 1,
+                                                               1)))
+                                       hugetlb_fix_reserve_counts(inode,
+                                                               rsv_on_error);
                        }
 
                        if (page->index > next)
@@ -450,41 +488,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
        clear_inode(inode);
 }
 
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
-       struct vm_area_struct *vma;
-
-       /*
-        * end == 0 indicates that the entire range after
-        * start should be unmapped.
-        */
-       vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
-               unsigned long v_offset;
-
-               /*
-                * Can the expression below overflow on 32-bit arches?
-                * No, because the interval tree returns us only those vmas
-                * which overlap the truncated area starting at pgoff,
-                * and no vma on a 32-bit arch can span beyond the 4GB.
-                */
-               if (vma->vm_pgoff < start)
-                       v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
-               else
-                       v_offset = 0;
-
-               if (end) {
-                       end = ((end - start) << PAGE_SHIFT) +
-                              vma->vm_start + v_offset;
-                       if (end > vma->vm_end)
-                               end = vma->vm_end;
-               } else
-                       end = vma->vm_end;
-
-               unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
-       }
-}
-
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
        pgoff_t pgoff;
@@ -507,6 +510,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
        struct hstate *h = hstate_inode(inode);
        loff_t hpage_size = huge_page_size(h);
+       unsigned long hpage_shift = huge_page_shift(h);
        loff_t hole_start, hole_end;
 
        /*
@@ -518,8 +522,30 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
        if (hole_end > hole_start) {
                struct address_space *mapping = inode->i_mapping;
+               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(hugetlb_falloc_waitq);
+               /*
+                * Page faults on the area to be hole punched must be stopped
+                * during the operation.  Initialize struct and have
+                * inode->i_private point to it.
+                */
+               struct hugetlb_falloc hugetlb_falloc = {
+                       .waitq = &hugetlb_falloc_waitq,
+                       .start = hole_start >> hpage_shift,
+                       .end = hole_end >> hpage_shift
+               };
 
                mutex_lock(&inode->i_mutex);
+
+               /*
+                * inode->i_private will be checked in the page fault path.
+                * The locking assures that all writes to the structure are
+                * complete before assigning to i_private.  A fault on another
+                * CPU will see the fully initialized structure.
+                */
+               spin_lock(&inode->i_lock);
+               inode->i_private = &hugetlb_falloc;
+               spin_unlock(&inode->i_lock);
+
                i_mmap_lock_write(mapping);
                if (!RB_EMPTY_ROOT(&mapping->i_mmap))
                        hugetlb_vmdelete_list(&mapping->i_mmap,
@@ -527,6 +553,12 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                                                hole_end  >> PAGE_SHIFT);
                i_mmap_unlock_write(mapping);
                remove_inode_hugepages(inode, hole_start, hole_end);
+
+               spin_lock(&inode->i_lock);
+               inode->i_private = NULL;
+               wake_up_all(&hugetlb_falloc_waitq);
+               spin_unlock(&inode->i_lock);
+
                mutex_unlock(&inode->i_mutex);
        }
 
@@ -647,9 +679,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
                i_size_write(inode, offset + len);
        inode->i_ctime = CURRENT_TIME;
-       spin_lock(&inode->i_lock);
-       inode->i_private = NULL;
-       spin_unlock(&inode->i_lock);
 out:
        mutex_unlock(&inode->i_mutex);
        return error;
index 6b8338ec246454444d0dce734b9dee36b15c9f30..89463eee67914643a02ce711463e62fac0b83c8d 100644 (file)
@@ -1937,8 +1937,8 @@ out:
  * @journal: journal for operation
  * @page: to try and free
  * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS are set, we wait for commit
+ * code to release the buffers.
  *
  *
  * For all the buffers on this page,
index bb9cebc9ca8acb7e5e321e7b6fc8484174fd5fce..e5c1783ab64a050a7460bfe80b2afc966f0c8902 100644 (file)
@@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c)
        siginitset(&hupmask, sigmask(SIGHUP));
        allow_signal(SIGKILL);
        allow_signal(SIGSTOP);
-       allow_signal(SIGCONT);
        allow_signal(SIGHUP);
 
        c->gc_task = current;
@@ -121,20 +120,18 @@ static int jffs2_garbage_collect_thread(void *_c)
                /* Put_super will send a SIGKILL and then wait on the sem.
                 */
                while (signal_pending(current) || freezing(current)) {
-                       siginfo_t info;
                        unsigned long signr;
 
                        if (try_to_freeze())
                                goto again;
 
-                       signr = dequeue_signal_lock(current, &current->blocked, &info);
+                       signr = kernel_dequeue_signal(NULL);
 
                        switch(signr) {
                        case SIGSTOP:
                                jffs2_dbg(1, "%s(): SIGSTOP received\n",
                                          __func__);
-                               set_current_state(TASK_STOPPED);
-                               schedule();
+                               kernel_signal_stop();
                                break;
 
                        case SIGKILL:
index 63f31c0733c51e5e1e8cc0b51425dc07bb59cc3e..f3a4857ff0718794b967836e796b8f9c9e345ae7 100644 (file)
@@ -1264,7 +1264,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
        if ((c->flash_size % c->sector_size) != 0) {
                c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
                pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
-       };
+       }
 
        c->wbuf_ofs = 0xFFFFFFFF;
        c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
index a7fdbd86847453ac524b261351894ef1c275fe0a..a709d80c8ebcc783fe21b34b8f77424edb0b94ea 100644 (file)
@@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
        unsigned int max_pages;
        int i;
 
-       max_pages = min(nr_pages, BIO_MAX_PAGES);
+       max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
 
        bio = bio_alloc(GFP_NOFS, max_pages);
        BUG_ON(!bio);
@@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
        unsigned int max_pages;
        int i;
 
-       max_pages = min(nr_pages, BIO_MAX_PAGES);
+       max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
 
        bio = bio_alloc(GFP_NOFS, max_pages);
        BUG_ON(!bio);
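
    The two min() to min_t() changes above avoid a type mismatch between the
    size_t nr_pages and the int-valued BIO_MAX_PAGES by casting both operands to
    one type first. A trivial userspace analogue of the macro (simplified, not
    the kernel's exact definition):

    #include <stdio.h>

    #define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

    int main(void)
    {
            size_t nr_pages = 300;
            int bio_max_pages = 256;   /* stand-in for BIO_MAX_PAGES */

            /* Both operands are compared as size_t, so no signedness warning. */
            printf("%zu\n", min_t(size_t, nr_pages, bio_max_pages));  /* 256 */
            return 0;
    }
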
index 7f9b096d8d572e0753ee84d060c4018fa6978a40..6de0fbfc6c00a237b90c0d5bc6b4dc67659a1507 100644 (file)
@@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
        filler_t *filler = super->s_devops->readpage;
        struct page *page;
 
-       BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+       BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
        if (use_filler)
                page = read_cache_page(mapping, index, filler, sb);
        else {
index 09abba7653aa8db8189d05d7c2094b77ef1998a9..1480d3a180370fe3922a7724e613d09b896f9d00 100644 (file)
@@ -361,7 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
        sector_t last_block_in_bio = 0;
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+       gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
 
        map_bh.b_state = 0;
        map_bh.b_size = 0;
@@ -397,7 +397,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
        sector_t last_block_in_bio = 0;
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping);
+       gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
 
        map_bh.b_state = 0;
        map_bh.b_size = 0;
index 2b729d253715ba183e912fae98cfec1e68665e24..174ef4f106cd2ac9696db8cf153f40dd7ef0ecaa 100644 (file)
@@ -4678,7 +4678,7 @@ EXPORT_SYMBOL(__page_symlink);
 int page_symlink(struct inode *inode, const char *symname, int len)
 {
        return __page_symlink(inode, symname, len,
-                       !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+                       !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
 }
 EXPORT_SYMBOL(page_symlink);
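
    mapping_gfp_constraint(), used throughout these hunks, simply intersects the
    mapping's allowed GFP mask with a caller-supplied constraint mask. A sketch
    of the helper's shape (illustrative bit values, not the kernel definition):

    #include <stdio.h>

    typedef unsigned int gfp_t;

    #define __GFP_IO    0x40u
    #define __GFP_FS    0x80u
    #define GFP_KERNEL  (0xc00u | __GFP_IO | __GFP_FS)

    /* Shape of mapping_gfp_constraint(): AND the mapping's base mask with the
     * constraint, e.g. (mapping, ~__GFP_FS) strips __GFP_FS, while
     * (mapping, GFP_KERNEL) clamps the mask to GFP_KERNEL. */
    static gfp_t constrain(gfp_t mapping_mask, gfp_t constraint)
    {
            return mapping_mask & constraint;
    }

    int main(void)
    {
            gfp_t mask = GFP_KERNEL;

            /* Drop __GFP_FS so page cache allocations can't recurse into the fs. */
            printf("0x%x\n", constrain(mask, ~__GFP_FS));
            return 0;
    }
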
 
index 37f639d50af580396bf016a2fb40c2b427c7b1dd..93e236429c5d785a1711d643d0b4676dfe4396cf 100644 (file)
@@ -473,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
        dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
        /* Always try to initiate a 'commit' if relevant, but only
-        * wait for it if __GFP_WAIT is set.  Even then, only wait 1
-        * second and only if the 'bdi' is not congested.
+        * wait for it if the caller allows blocking.  Even then,
+        * only wait 1 second and only if the 'bdi' is not congested.
         * Waiting indefinitely can cause deadlocks when the NFS
         * server is on this machine, when a new TCP connection is
         * needed and in other rare cases.  There is no particular
@@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
        if (mapping) {
                struct nfs_server *nfss = NFS_SERVER(mapping->host);
                nfs_commit_inode(mapping->host, 0);
-               if ((gfp & __GFP_WAIT) &&
+               if (gfpflags_allow_blocking(gfp) &&
                    !bdi_write_congested(&nfss->backing_dev_info)) {
                        wait_on_page_bit_killable_timeout(page, PG_private,
                                                          HZ);
index 8df0f3b7839b5ee979ee48d63323dde21b637ce2..2ccbf5531554b74612420c6bd1ff6cc8ca7d6463 100644 (file)
@@ -133,38 +133,38 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
 
 /**
  * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
  * @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
  */
 static unsigned long
-nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
-                              const struct nilfs_palloc_group_desc *desc)
+nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
+                              spinlock_t *lock)
 {
        unsigned long nfree;
 
-       spin_lock(nilfs_mdt_bgl_lock(inode, group));
+       spin_lock(lock);
        nfree = le32_to_cpu(desc->pg_nfrees);
-       spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+       spin_unlock(lock);
        return nfree;
 }
 
 /**
  * nilfs_palloc_group_desc_add_entries - adjust count of free entries
- * @inode: inode of metadata file using this allocator
- * @group: group number
  * @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
  * @n: delta to be added
  */
-static void
-nilfs_palloc_group_desc_add_entries(struct inode *inode,
-                                   unsigned long group,
-                                   struct nilfs_palloc_group_desc *desc,
-                                   u32 n)
+static u32
+nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
+                                   spinlock_t *lock, u32 n)
 {
-       spin_lock(nilfs_mdt_bgl_lock(inode, group));
+       u32 nfree;
+
+       spin_lock(lock);
        le32_add_cpu(&desc->pg_nfrees, n);
-       spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+       nfree = le32_to_cpu(desc->pg_nfrees);
+       spin_unlock(lock);
+       return nfree;
 }
 
 /**
@@ -239,6 +239,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
        return ret;
 }
 
+/**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+                                    struct nilfs_bh_assoc *prev,
+                                    spinlock_t *lock)
+{
+       spin_lock(lock);
+       if (prev->bh && blkoff == prev->blkoff) {
+               brelse(prev->bh);
+               prev->bh = NULL;
+       }
+       spin_unlock(lock);
+       return nilfs_mdt_delete_block(inode, blkoff);
+}
+
 /**
  * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
  * @inode: inode of metadata file using this allocator
@@ -277,6 +297,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
                                      &cache->prev_bitmap, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+                                           unsigned long group)
+{
+       struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+       return nilfs_palloc_delete_block(inode,
+                                        nilfs_palloc_bitmap_blkoff(inode,
+                                                                   group),
+                                        &cache->prev_bitmap, &cache->lock);
+}
+
 /**
  * nilfs_palloc_get_entry_block - get buffer head of an entry block
  * @inode: inode of metadata file using this allocator
@@ -295,6 +331,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
                                      &cache->prev_entry, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+       struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+       return nilfs_palloc_delete_block(inode,
+                                        nilfs_palloc_entry_blkoff(inode, nr),
+                                        &cache->prev_entry, &cache->lock);
+}
+
 /**
  * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
  * @inode: inode of metadata file using this allocator
@@ -332,51 +382,40 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 
 /**
  * nilfs_palloc_find_available_slot - find available slot in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @target: offset number of an entry in the group (start point)
  * @bitmap: bitmap of the group
+ * @target: offset number of an entry in the group (start point)
  * @bsize: size in bits
+ * @lock: spin lock protecting @bitmap
  */
-static int nilfs_palloc_find_available_slot(struct inode *inode,
-                                           unsigned long group,
+static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
                                            unsigned long target,
-                                           unsigned char *bitmap,
-                                           int bsize)
-{
-       int curr, pos, end, i;
-
-       if (target > 0) {
-               end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
-               if (end > bsize)
-                       end = bsize;
-               pos = nilfs_find_next_zero_bit(bitmap, end, target);
-               if (pos < end &&
-                   !nilfs_set_bit_atomic(
-                           nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
-                       return pos;
-       } else
-               end = 0;
-
-       for (i = 0, curr = end;
-            i < bsize;
-            i += BITS_PER_LONG, curr += BITS_PER_LONG) {
-               /* wrap around */
-               if (curr >= bsize)
-                       curr = 0;
-               while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
-                      != ~0UL) {
-                       end = curr + BITS_PER_LONG;
-                       if (end > bsize)
-                               end = bsize;
-                       pos = nilfs_find_next_zero_bit(bitmap, end, curr);
-                       if ((pos < end) &&
-                           !nilfs_set_bit_atomic(
-                                   nilfs_mdt_bgl_lock(inode, group), pos,
-                                   bitmap))
+                                           unsigned bsize,
+                                           spinlock_t *lock)
+{
+       int pos, end = bsize;
+
+       if (likely(target < bsize)) {
+               pos = target;
+               do {
+                       pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+                       if (pos >= end)
+                               break;
+                       if (!nilfs_set_bit_atomic(lock, pos, bitmap))
                                return pos;
-               }
+               } while (++pos < end);
+
+               end = target;
+       }
+
+       /* wrap around */
+       for (pos = 0; pos < end; pos++) {
+               pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+               if (pos >= end)
+                       break;
+               if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+                       return pos;
        }
+
        return -ENOSPC;
 }
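
    The rewritten nilfs_palloc_find_available_slot() above is a two-pass scan:
    search from the target hint up to the end of the bitmap, then wrap around and
    search from the start up to the hint. A standalone sketch of the same
    pattern, with simple byte-array helpers standing in for the nilfs_*_bit
    wrappers (all names here are hypothetical):

    #include <limits.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in for nilfs_find_next_zero_bit(): first clear bit in [start, size). */
    static size_t next_zero_bit(const unsigned char *bm, size_t size, size_t start)
    {
            for (size_t pos = start; pos < size; pos++)
                    if (!(bm[pos / CHAR_BIT] & (1u << (pos % CHAR_BIT))))
                            return pos;
            return size;
    }

    /* Stand-in for nilfs_set_bit_atomic(): returns the bit's previous value. */
    static int test_and_set(unsigned char *bm, size_t pos)
    {
            unsigned char mask = 1u << (pos % CHAR_BIT);
            int old = !!(bm[pos / CHAR_BIT] & mask);

            bm[pos / CHAR_BIT] |= mask;
            return old;
    }

    /* Two-pass search: [target, bsize) first, then wrap to [0, target). */
    static long find_available_slot(unsigned char *bm, size_t target, size_t bsize)
    {
            size_t end = bsize;

            if (target < bsize) {
                    for (size_t pos = target; pos < end; pos++) {
                            pos = next_zero_bit(bm, end, pos);
                            if (pos >= end)
                                    break;
                            if (!test_and_set(bm, pos))
                                    return (long)pos;
                    }
                    end = target;   /* second pass stops at the original hint */
            }

            for (size_t pos = 0; pos < end; pos++) {
                    pos = next_zero_bit(bm, end, pos);
                    if (pos >= end)
                            break;
                    if (!test_and_set(bm, pos))
                            return (long)pos;
            }
            return -1;              /* the kernel code returns -ENOSPC here */
    }

    int main(void)
    {
            unsigned char bitmap[2] = { 0xff, 0x0f };   /* bits 0..11 in use */

            printf("%ld\n", find_available_slot(bitmap, 5, 16));  /* prints 12 */
            return 0;
    }
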
 
@@ -475,15 +514,15 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
        void *desc_kaddr, *bitmap_kaddr;
        unsigned long group, maxgroup, ngroups;
        unsigned long group_offset, maxgroup_offset;
-       unsigned long n, entries_per_group, groups_per_desc_block;
+       unsigned long n, entries_per_group;
        unsigned long i, j;
+       spinlock_t *lock;
        int pos, ret;
 
        ngroups = nilfs_palloc_groups_count(inode);
        maxgroup = ngroups - 1;
        group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
        entries_per_group = nilfs_palloc_entries_per_group(inode);
-       groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
 
        for (i = 0; i < ngroups; i += n) {
                if (group >= ngroups) {
@@ -501,8 +540,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
                n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
                                                           maxgroup);
                for (j = 0; j < n; j++, desc++, group++) {
-                       if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
-                           > 0) {
+                       lock = nilfs_mdt_bgl_lock(inode, group);
+                       if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
                                ret = nilfs_palloc_get_bitmap_block(
                                        inode, group, 1, &bitmap_bh);
                                if (ret < 0)
@@ -510,12 +549,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
                                bitmap_kaddr = kmap(bitmap_bh->b_page);
                                bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
                                pos = nilfs_palloc_find_available_slot(
-                                       inode, group, group_offset, bitmap,
-                                       entries_per_group);
+                                       bitmap, group_offset,
+                                       entries_per_group, lock);
                                if (pos >= 0) {
                                        /* found a free entry */
                                        nilfs_palloc_group_desc_add_entries(
-                                               inode, group, desc, -1);
+                                               desc, lock, -1);
                                        req->pr_entry_nr =
                                                entries_per_group * group + pos;
                                        kunmap(desc_bh->b_page);
@@ -573,6 +612,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
        unsigned long group, group_offset;
        unsigned char *bitmap;
        void *desc_kaddr, *bitmap_kaddr;
+       spinlock_t *lock;
 
        group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
        desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -580,13 +620,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
                                                 req->pr_desc_bh, desc_kaddr);
        bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
        bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+       lock = nilfs_mdt_bgl_lock(inode, group);
 
-       if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
-                                   group_offset, bitmap))
-               printk(KERN_WARNING "%s: entry number %llu already freed\n",
-                      __func__, (unsigned long long)req->pr_entry_nr);
+       if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+               nilfs_warning(inode->i_sb, __func__,
+                             "entry number %llu already freed: ino=%lu\n",
+                             (unsigned long long)req->pr_entry_nr,
+                             (unsigned long)inode->i_ino);
        else
-               nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+               nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
        kunmap(req->pr_bitmap_bh->b_page);
        kunmap(req->pr_desc_bh->b_page);
@@ -611,6 +653,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
        void *desc_kaddr, *bitmap_kaddr;
        unsigned char *bitmap;
        unsigned long group, group_offset;
+       spinlock_t *lock;
 
        group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
        desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -618,12 +661,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
                                                 req->pr_desc_bh, desc_kaddr);
        bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
        bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
-       if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
-                                   group_offset, bitmap))
-               printk(KERN_WARNING "%s: entry number %llu already freed\n",
-                      __func__, (unsigned long long)req->pr_entry_nr);
+       lock = nilfs_mdt_bgl_lock(inode, group);
+
+       if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+               nilfs_warning(inode->i_sb, __func__,
+                             "entry number %llu already freed: ino=%lu\n",
+                             (unsigned long long)req->pr_entry_nr,
+                             (unsigned long)inode->i_ino);
        else
-               nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+               nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
        kunmap(req->pr_bitmap_bh->b_page);
        kunmap(req->pr_desc_bh->b_page);
@@ -679,22 +725,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
        req->pr_desc_bh = NULL;
 }
 
-/**
- * nilfs_palloc_group_is_in - judge if an entry is in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @nr: serial number of the entry (e.g. inode number)
- */
-static int
-nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
-{
-       __u64 first, last;
-
-       first = group * nilfs_palloc_entries_per_group(inode);
-       last = first + nilfs_palloc_entries_per_group(inode) - 1;
-       return (nr >= first) && (nr <= last);
-}
-
 /**
  * nilfs_palloc_freev - deallocate a set of persistent objects
  * @inode: inode of metadata file using this allocator
@@ -708,9 +738,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
        unsigned char *bitmap;
        void *desc_kaddr, *bitmap_kaddr;
        unsigned long group, group_offset;
-       int i, j, n, ret;
+       __u64 group_min_nr, last_nrs[8];
+       const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+       const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+       unsigned entry_start, end, pos;
+       spinlock_t *lock;
+       int i, j, k, ret;
+       u32 nfree;
 
        for (i = 0; i < nitems; i = j) {
+               int change_group = false;
+               int nempties = 0, n = 0;
+
                group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
                ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
                if (ret < 0)
@@ -721,38 +760,89 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
                        brelse(desc_bh);
                        return ret;
                }
-               desc_kaddr = kmap(desc_bh->b_page);
-               desc = nilfs_palloc_block_get_group_desc(
-                       inode, group, desc_bh, desc_kaddr);
+
+               /* Get the first entry number of the group */
+               group_min_nr = (__u64)group * epg;
+
                bitmap_kaddr = kmap(bitmap_bh->b_page);
                bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
-               for (j = i, n = 0;
-                    (j < nitems) && nilfs_palloc_group_is_in(inode, group,
-                                                             entry_nrs[j]);
-                    j++) {
-                       nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
-                       if (!nilfs_clear_bit_atomic(
-                                   nilfs_mdt_bgl_lock(inode, group),
-                                   group_offset, bitmap)) {
-                               printk(KERN_WARNING
-                                      "%s: entry number %llu already freed\n",
-                                      __func__,
-                                      (unsigned long long)entry_nrs[j]);
+               lock = nilfs_mdt_bgl_lock(inode, group);
+
+               j = i;
+               entry_start = rounddown(group_offset, epb);
+               do {
+                       if (!nilfs_clear_bit_atomic(lock, group_offset,
+                                                   bitmap)) {
+                               nilfs_warning(inode->i_sb, __func__,
+                                             "entry number %llu already freed: ino=%lu\n",
+                                             (unsigned long long)entry_nrs[j],
+                                             (unsigned long)inode->i_ino);
                        } else {
                                n++;
                        }
-               }
-               nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+
+                       j++;
+                       if (j >= nitems || entry_nrs[j] < group_min_nr ||
+                           entry_nrs[j] >= group_min_nr + epg) {
+                               change_group = true;
+                       } else {
+                               group_offset = entry_nrs[j] - group_min_nr;
+                               if (group_offset >= entry_start &&
+                                   group_offset < entry_start + epb) {
+                                       /* This entry is in the same block */
+                                       continue;
+                               }
+                       }
+
+                       /* Test if the entry block is empty or not */
+                       end = entry_start + epb;
+                       pos = nilfs_find_next_bit(bitmap, end, entry_start);
+                       if (pos >= end) {
+                               last_nrs[nempties++] = entry_nrs[j - 1];
+                               if (nempties >= ARRAY_SIZE(last_nrs))
+                                       break;
+                       }
+
+                       if (change_group)
+                               break;
+
+                       /* Go on to the next entry block */
+                       entry_start = rounddown(group_offset, epb);
+               } while (true);
 
                kunmap(bitmap_bh->b_page);
-               kunmap(desc_bh->b_page);
+               mark_buffer_dirty(bitmap_bh);
+               brelse(bitmap_bh);
 
+               for (k = 0; k < nempties; k++) {
+                       ret = nilfs_palloc_delete_entry_block(inode,
+                                                             last_nrs[k]);
+                       if (ret && ret != -ENOENT) {
+                               nilfs_warning(inode->i_sb, __func__,
+                                             "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+                                             (unsigned long long)last_nrs[k],
+                                             (unsigned long)inode->i_ino, ret);
+                       }
+               }
+
+               desc_kaddr = kmap_atomic(desc_bh->b_page);
+               desc = nilfs_palloc_block_get_group_desc(
+                       inode, group, desc_bh, desc_kaddr);
+               nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
+               kunmap_atomic(desc_kaddr);
                mark_buffer_dirty(desc_bh);
-               mark_buffer_dirty(bitmap_bh);
                nilfs_mdt_mark_dirty(inode);
-
-               brelse(bitmap_bh);
                brelse(desc_bh);
+
+               if (nfree == nilfs_palloc_entries_per_group(inode)) {
+                       ret = nilfs_palloc_delete_bitmap_block(inode, group);
+                       if (ret && ret != -ENOENT) {
+                               nilfs_warning(inode->i_sb, __func__,
+                                             "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+                                             group,
+                                             (unsigned long)inode->i_ino, ret);
+                       }
+               }
        }
        return 0;
 }
index 4bd6451b570398b25fa935fc2e080a24a546829a..6e6f49aa53df38360450a12dab1e840b682c59a1 100644 (file)
@@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
 #define nilfs_set_bit_atomic           ext2_set_bit_atomic
 #define nilfs_clear_bit_atomic         ext2_clear_bit_atomic
 #define nilfs_find_next_zero_bit       find_next_zero_bit_le
+#define nilfs_find_next_bit            find_next_bit_le
 
 /**
  * struct nilfs_bh_assoc - block offset and buffer head association
index 919fd5bb14a842e94b80ae8d839cac1918bb1cc2..3a3821b00486b531e72ff931be80b3627a0ddca1 100644 (file)
@@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
                              int level, __u64 *keyp, __u64 *ptrp)
 {
        struct nilfs_btree_node *node, *right;
-       __u64 newkey;
-       __u64 newptr;
        int nchildren, n, move, ncblk;
 
        node = nilfs_btree_get_nonroot_node(path, level);
@@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
        if (!buffer_dirty(path[level].bp_sib_bh))
                mark_buffer_dirty(path[level].bp_sib_bh);
 
-       newkey = nilfs_btree_node_get_key(right, 0);
-       newptr = path[level].bp_newreq.bpr_ptr;
-
        if (move) {
                path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
                nilfs_btree_node_insert(right, path[level].bp_index,
@@ -1856,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
                                   __u64 key, __u64 ptr,
                                   const __u64 *keys, const __u64 *ptrs, int n)
 {
-       struct buffer_head *bh;
+       struct buffer_head *bh = NULL;
        union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
        struct nilfs_bmap_stats stats;
        int ret;
index 0d5fada9119136e793b73cf9d008bae29e43fcf8..7dc23f100e579df17aec0499a7af609e9c0dc5af 100644 (file)
@@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
 int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
 {
        struct nilfs_dat_entry *entry;
-       __u64 start;
        sector_t blocknr;
        void *kaddr;
        int ret;
@@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
        kaddr = kmap_atomic(req->pr_entry_bh->b_page);
        entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
                                             req->pr_entry_bh, kaddr);
-       start = le64_to_cpu(entry->de_start);
        blocknr = le64_to_cpu(entry->de_blocknr);
        kunmap_atomic(kaddr);
 
index 4a73d6dffabf696198f9ce892b2f93d7008e3266..ac2f64943ff4c257f3fe2cd8d32de581de601cc6 100644 (file)
@@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
                goto failed;
 
        mapping_set_gfp_mask(inode->i_mapping,
-                            mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+                          mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
 
        root = NILFS_I(dir)->i_root;
        ii = NILFS_I(inode);
@@ -522,7 +522,7 @@ static int __nilfs_read_inode(struct super_block *sb,
        up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
        nilfs_set_inode_flags(inode);
        mapping_set_gfp_mask(inode->i_mapping,
-                            mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+                          mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
        return 0;
 
  failed_unmap:
index dee34d9902813fe1dc1d1e1703445ca8ac013284..1125f40233ffd9c849acc0f13d432f520de95e0a 100644 (file)
@@ -33,6 +33,7 @@
 #include "page.h"
 #include "mdt.h"
 
+#include <trace/events/nilfs2.h>
 
 #define NILFS_MDT_MAX_RA_BLOCKS                (16 - 1)
 
@@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
        set_buffer_uptodate(bh);
        mark_buffer_dirty(bh);
        nilfs_mdt_mark_dirty(inode);
+
+       trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
+
        return 0;
 }
 
@@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
        get_bh(bh);
        submit_bh(mode, bh);
        ret = 0;
+
+       trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
  out:
        get_bh(bh);
        *out_bh = bh;
index fe529a87a208d4db026efbe98de71608a1622a6c..03246cac33384e2cc9a0d49bfe8beaba74118873 100644 (file)
@@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
 }
 
 /* Default GFP flags using highmem */
-#define NILFS_MDT_GFP      (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+#define NILFS_MDT_GFP      (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
 
 int nilfs_mdt_get_block(struct inode *, unsigned long, int,
                        void (*init_block)(struct inode *,
index ff00a0b7acb927ee18a9dba898aac5cf55b2def5..9b4f205d11736dc4d109828260055c2af34c0e9d 100644 (file)
@@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
                                 struct nilfs_recovery_info *ri)
 {
        struct buffer_head *bh_sum = NULL;
-       struct nilfs_segment_summary *sum;
+       struct nilfs_segment_summary *sum = NULL;
        sector_t pseg_start;
        sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
        unsigned long nsalvaged_blocks = 0;
@@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
                            struct nilfs_recovery_info *ri)
 {
        struct buffer_head *bh_sum = NULL;
-       struct nilfs_segment_summary *sum;
+       struct nilfs_segment_summary *sum = NULL;
        sector_t pseg_start, pseg_end, sr_pseg_start = 0;
        sector_t seg_start, seg_end; /* range of full segment (block number) */
        sector_t b, end;
index c6abbad9b8e3833d9cbbac60f648e786b600836f..3b65adaae7e47b9732669c4db8a563cfeeec4d87 100644 (file)
@@ -77,6 +77,36 @@ enum {
        NILFS_ST_DONE,
 };
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/nilfs2.h>
+
+/*
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set() and nilfs_sc_cstage_get() are
+ * wrapper functions for the stage count (nilfs_sc_info->sc_stage.scnt). Users
+ * of the variable must go through them because every transition of the stage
+ * count must emit a trace event (trace_nilfs2_collection_stage_transition).
+ *
+ * nilfs_sc_cstage_get() isn't required for that purpose because it doesn't
+ * produce tracepoint events. It is provided just to make the intention
+ * clear.
+ */
+static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
+{
+       sci->sc_stage.scnt++;
+       trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt)
+{
+       sci->sc_stage.scnt = next_scnt;
+       trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci)
+{
+       return sci->sc_stage.scnt;
+}
+
 /* State flags of collection */
 #define NILFS_CF_NODE          0x0001  /* Collecting node blocks */
 #define NILFS_CF_IFILE_STARTED 0x0002  /* IFILE stage has started */
@@ -184,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb,
 {
        struct the_nilfs *nilfs;
        int ret = nilfs_prepare_segment_lock(ti);
+       struct nilfs_transaction_info *trace_ti;
 
        if (unlikely(ret < 0))
                return ret;
-       if (ret > 0)
+       if (ret > 0) {
+               trace_ti = current->journal_info;
+
+               trace_nilfs2_transaction_transition(sb, trace_ti,
+                                   trace_ti->ti_count, trace_ti->ti_flags,
+                                   TRACE_NILFS2_TRANSACTION_BEGIN);
                return 0;
+       }
 
        sb_start_intwrite(sb);
 
@@ -199,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb,
                ret = -ENOSPC;
                goto failed;
        }
+
+       trace_ti = current->journal_info;
+       trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count,
+                                           trace_ti->ti_flags,
+                                           TRACE_NILFS2_TRANSACTION_BEGIN);
        return 0;
 
  failed:
@@ -231,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb)
        ti->ti_flags |= NILFS_TI_COMMIT;
        if (ti->ti_count > 0) {
                ti->ti_count--;
+               trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+                           ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
                return 0;
        }
        if (nilfs->ns_writer) {
@@ -242,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb)
                        nilfs_segctor_do_flush(sci, 0);
        }
        up_read(&nilfs->ns_segctor_sem);
+       trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+                           ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
+
        current->journal_info = ti->ti_save;
 
        if (ti->ti_flags & NILFS_TI_SYNC)
@@ -260,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb)
        BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
        if (ti->ti_count > 0) {
                ti->ti_count--;
+               trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+                           ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
                return;
        }
        up_read(&nilfs->ns_segctor_sem);
 
+       trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+                   ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
+
        current->journal_info = ti->ti_save;
        if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
                kmem_cache_free(nilfs_transaction_cachep, ti);
@@ -309,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
        current->journal_info = ti;
 
        for (;;) {
+               trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+                           ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK);
+
                down_write(&nilfs->ns_segctor_sem);
                if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
                        break;
@@ -320,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
        }
        if (gcflag)
                ti->ti_flags |= NILFS_TI_GC;
+
+       trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+                           ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK);
 }
 
 static void nilfs_transaction_unlock(struct super_block *sb)
@@ -332,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb)
 
        up_write(&nilfs->ns_segctor_sem);
        current->journal_info = ti->ti_save;
+
+       trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+                           ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK);
 }
 
 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
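A hedged sketch of the event sequence these hooks produce for a nested transaction (example_nested_update() is hypothetical; the three-argument form nilfs_transaction_begin(sb, ti, vacancy_check) is assumed): the inner begin/commit pair only adjusts ti_count, yet both still emit BEGIN/COMMIT events, so a tracer sees the full nesting depth.

static int example_nested_update(struct super_block *sb)
{
	int err = nilfs_transaction_begin(sb, NULL, 0);	/* BEGIN event, ti_count 0 */

	if (err)
		return err;
	err = nilfs_transaction_begin(sb, NULL, 0);	/* BEGIN event, ti_count 1 */
	if (!err)
		nilfs_transaction_commit(sb);	/* COMMIT event, ti_count 1 -> 0 */
	return nilfs_transaction_commit(sb);	/* COMMIT event, outermost commit */
}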
@@ -1062,7 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
        size_t ndone;
        int err = 0;
 
-       switch (sci->sc_stage.scnt) {
+       switch (nilfs_sc_cstage_get(sci)) {
        case NILFS_ST_INIT:
                /* Pre-processes */
                sci->sc_stage.flags = 0;
@@ -1071,7 +1132,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                        sci->sc_nblk_inc = 0;
                        sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
                        if (mode == SC_LSEG_DSYNC) {
-                               sci->sc_stage.scnt = NILFS_ST_DSYNC;
+                               nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC);
                                goto dsync_mode;
                        }
                }
@@ -1079,10 +1140,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                sci->sc_stage.dirty_file_ptr = NULL;
                sci->sc_stage.gc_inode_ptr = NULL;
                if (mode == SC_FLUSH_DAT) {
-                       sci->sc_stage.scnt = NILFS_ST_DAT;
+                       nilfs_sc_cstage_set(sci, NILFS_ST_DAT);
                        goto dat_stage;
                }
-               sci->sc_stage.scnt++;  /* Fall through */
+               nilfs_sc_cstage_inc(sci);  /* Fall through */
        case NILFS_ST_GC:
                if (nilfs_doing_gc()) {
                        head = &sci->sc_gc_inodes;
@@ -1103,7 +1164,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                        }
                        sci->sc_stage.gc_inode_ptr = NULL;
                }
-               sci->sc_stage.scnt++;  /* Fall through */
+               nilfs_sc_cstage_inc(sci);  /* Fall through */
        case NILFS_ST_FILE:
                head = &sci->sc_dirty_files;
                ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
@@ -1125,10 +1186,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                }
                sci->sc_stage.dirty_file_ptr = NULL;
                if (mode == SC_FLUSH_FILE) {
-                       sci->sc_stage.scnt = NILFS_ST_DONE;
+                       nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
                        return 0;
                }
-               sci->sc_stage.scnt++;
+               nilfs_sc_cstage_inc(sci);
                sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
                /* Fall through */
        case NILFS_ST_IFILE:
@@ -1136,7 +1197,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                                              &nilfs_sc_file_ops);
                if (unlikely(err))
                        break;
-               sci->sc_stage.scnt++;
+               nilfs_sc_cstage_inc(sci);
                /* Creating a checkpoint */
                err = nilfs_segctor_create_checkpoint(sci);
                if (unlikely(err))
@@ -1147,7 +1208,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                                              &nilfs_sc_file_ops);
                if (unlikely(err))
                        break;
-               sci->sc_stage.scnt++;  /* Fall through */
+               nilfs_sc_cstage_inc(sci);  /* Fall through */
        case NILFS_ST_SUFILE:
                err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
                                         sci->sc_nfreesegs, &ndone);
@@ -1163,7 +1224,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                                              &nilfs_sc_file_ops);
                if (unlikely(err))
                        break;
-               sci->sc_stage.scnt++;  /* Fall through */
+               nilfs_sc_cstage_inc(sci);  /* Fall through */
        case NILFS_ST_DAT:
  dat_stage:
                err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
@@ -1171,10 +1232,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                if (unlikely(err))
                        break;
                if (mode == SC_FLUSH_DAT) {
-                       sci->sc_stage.scnt = NILFS_ST_DONE;
+                       nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
                        return 0;
                }
-               sci->sc_stage.scnt++;  /* Fall through */
+               nilfs_sc_cstage_inc(sci);  /* Fall through */
        case NILFS_ST_SR:
                if (mode == SC_LSEG_SR) {
                        /* Appending a super root */
@@ -1184,7 +1245,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                }
                /* End of a logical segment */
                sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
-               sci->sc_stage.scnt = NILFS_ST_DONE;
+               nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
                return 0;
        case NILFS_ST_DSYNC:
  dsync_mode:
@@ -1197,7 +1258,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                if (unlikely(err))
                        break;
                sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
-               sci->sc_stage.scnt = NILFS_ST_DONE;
+               nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
                return 0;
        case NILFS_ST_DONE:
                return 0;
@@ -1442,7 +1503,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
                        goto failed;
 
                /* The current segment is filled up */
-               if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+               if (mode != SC_LSEG_SR ||
+                   nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE)
                        break;
 
                nilfs_clear_logs(&sci->sc_segbufs);
@@ -1946,7 +2008,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
        struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
        int err;
 
-       sci->sc_stage.scnt = NILFS_ST_INIT;
+       nilfs_sc_cstage_set(sci, NILFS_ST_INIT);
        sci->sc_cno = nilfs->ns_cno;
 
        err = nilfs_segctor_collect_dirty_files(sci, nilfs);
@@ -1974,7 +2036,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                        goto failed;
 
                /* Avoid empty segment */
-               if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+               if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE &&
                    nilfs_segbuf_empty(sci->sc_curseg)) {
                        nilfs_segctor_abort_construction(sci, nilfs, 1);
                        goto out;
@@ -1988,7 +2050,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                        nilfs_segctor_fill_in_file_bmap(sci);
 
                if (mode == SC_LSEG_SR &&
-                   sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
+                   nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
                        err = nilfs_segctor_fill_in_checkpoint(sci);
                        if (unlikely(err))
                                goto failed_to_write;
@@ -2007,7 +2069,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                if (unlikely(err))
                        goto failed_to_write;
 
-               if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+               if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
                    nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
                        /*
                         * At this point, we avoid double buffering
@@ -2020,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                        if (err)
                                goto failed_to_write;
                }
-       } while (sci->sc_stage.scnt != NILFS_ST_DONE);
+       } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE);
 
  out:
        nilfs_segctor_drop_written_files(sci, nilfs);
@@ -2430,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
 static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
 {
        int mode = 0;
-       int err;
 
        spin_lock(&sci->sc_state_lock);
        mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
@@ -2438,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
        spin_unlock(&sci->sc_state_lock);
 
        if (mode) {
-               err = nilfs_segctor_do_construct(sci, mode);
+               nilfs_segctor_do_construct(sci, mode);
 
                spin_lock(&sci->sc_state_lock);
                sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
index a48d6de1e02cc276019fb150fee8e0bc6c41bcbe..0408b9b2814b2242f6df96e2e42ba12b94fcd911 100644 (file)
@@ -67,7 +67,8 @@ struct nilfs_recovery_info {
 
 /**
  * struct nilfs_cstage - Context of collection stage
- * @scnt: Stage count
+ * @scnt: Stage count, must be accessed via wrappers:
+ *        nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get()
  * @flags: State flags
  * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
  * @gc_inode_ptr: Pointer on the list of gc-inodes
index 2a869c35c3622386ac0fdc774681df65c59f7093..52821ffc11f46d75e3ea170e19c48dd719a1f80d 100644 (file)
@@ -30,6 +30,8 @@
 #include "mdt.h"
 #include "sufile.h"
 
+#include <trace/events/nilfs2.h>
+
 /**
  * struct nilfs_sufile_info - on-memory private data of sufile
  * @mi: on-memory private data of metadata file
@@ -317,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
        size_t susz = NILFS_MDT(sufile)->mi_entry_size;
        __u64 segnum, maxsegnum, last_alloc;
        void *kaddr;
-       unsigned long nsegments, ncleansegs, nsus, cnt;
+       unsigned long nsegments, nsus, cnt;
        int ret, j;
 
        down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -327,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
                goto out_sem;
        kaddr = kmap_atomic(header_bh->b_page);
        header = kaddr + bh_offset(header_bh);
-       ncleansegs = le64_to_cpu(header->sh_ncleansegs);
        last_alloc = le64_to_cpu(header->sh_last_alloc);
        kunmap_atomic(kaddr);
 
@@ -358,6 +359,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
                                break; /* never happens */
                        }
                }
+               trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
                ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
                                                           &su_bh);
                if (ret < 0)
@@ -388,6 +390,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
                        nilfs_mdt_mark_dirty(sufile);
                        brelse(su_bh);
                        *segnump = segnum;
+
+                       trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
                        goto out_header;
                }
 
@@ -490,6 +495,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
        NILFS_SUI(sufile)->ncleansegs++;
 
        nilfs_mdt_mark_dirty(sufile);
+
+       trace_nilfs2_segment_usage_freed(sufile, segnum);
 }
 
 /**
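A hedged userspace sketch (not part of the patch) for consuming the nilfs2 tracepoints added above, assuming the trace system is named nilfs2 and tracefs is mounted at /sys/kernel/tracing (older setups may use /sys/kernel/debug/tracing):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* enable all nilfs2:* events; read the output from .../trace_pipe */
	int fd = open("/sys/kernel/tracing/events/nilfs2/enable", O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "1", 1);
	close(fd);
	return 0;
}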
index f47585bfeb0169eee1198d01493e1d54705dabba..354013ea22ec212001ef2c300fdf282ad39c611d 100644 (file)
@@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
        struct nilfs_super_block *nsbp;
        sector_t blocknr, newblocknr;
        unsigned long offset;
-       int sb2i = -1;  /* array index of the secondary superblock */
+       int sb2i;  /* array index of the secondary superblock */
        int ret = 0;
 
        /* nilfs->ns_sem must be locked by the caller. */
@@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
        } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
                sb2i = 0;
                blocknr = nilfs->ns_sbh[0]->b_blocknr;
+       } else {
+               sb2i = -1;
+               blocknr = 0;
        }
        if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
                goto out;  /* super block location is unchanged */
@@ -1405,14 +1408,10 @@ static void nilfs_destroy_cachep(void)
         */
        rcu_barrier();
 
-       if (nilfs_inode_cachep)
-               kmem_cache_destroy(nilfs_inode_cachep);
-       if (nilfs_transaction_cachep)
-               kmem_cache_destroy(nilfs_transaction_cachep);
-       if (nilfs_segbuf_cachep)
-               kmem_cache_destroy(nilfs_segbuf_cachep);
-       if (nilfs_btree_path_cache)
-               kmem_cache_destroy(nilfs_btree_path_cache);
+       kmem_cache_destroy(nilfs_inode_cachep);
+       kmem_cache_destroy(nilfs_transaction_cachep);
+       kmem_cache_destroy(nilfs_segbuf_cachep);
+       kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
 static int __init nilfs_init_cachep(void)
index 6b6f0d472ae816e3cd726796caccf346b86bc461..fd98e5100cabedaa2a71ecef4bd8997334ae528e 100644 (file)
@@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
        inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
        inode = igrab(mark->inode);
        if (inode) {
+               /*
+                * IN_ALL_EVENTS represents all of the mask bits
+                * that we expose to userspace.  There is at
+                * least one bit (FS_EVENT_ON_CHILD) which is
+                * used only internally to the kernel.
+                */
+               u32 mask = mark->mask & IN_ALL_EVENTS;
                seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
                           inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
-                          mark->mask, mark->ignored_mask);
+                          mask, mark->ignored_mask);
                show_mark_fhandle(m, inode);
                seq_putc(m, '\n');
                iput(inode);
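A hedged userspace sketch of the visible effect: the mask printed in /proc/<pid>/fdinfo/<fd> for an inotify descriptor now contains only IN_ALL_EVENTS bits, i.e. bits userspace could have requested itself.

#include <sys/inotify.h>
#include <stdio.h>

int main(void)
{
	char path[64], line[256];
	int fd = inotify_init1(0);
	FILE *f;

	inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
	f = fopen(path, "r");
	while (f && fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* look for the "inotify wd:... mask:..." line */
	return 0;
}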
index 5b1e2a497e5114c26e556830f9f891c36130ffc7..b8d08d0d0a4dbe061b9336dd7f861d32e13c8618 100644 (file)
@@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
        int ret;
        unsigned flags = 0;
 
-       /* don't allow invalid bits: we don't want flags set */
+       /*
+        * We share a lot of code with fs/dnotify.  We also share
+        * the bit layout between inotify's IN_* and the fsnotify
+        * FS_*.  This check ensures that only the inotify IN_*
+        * bits get passed in and set in watches/events.
+        */
+       if (unlikely(mask & ~ALL_INOTIFY_BITS))
+               return -EINVAL;
+       /*
+        * Require at least one valid bit set in the mask.
+        * Without _something_ set, we would have no events to
+        * watch for.
+        */
        if (unlikely(!(mask & ALL_INOTIFY_BITS)))
                return -EINVAL;
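A hedged userspace sketch of the new behaviour (0x00010000 is assumed to lie outside ALL_INOTIFY_BITS in the 2015 UAPI): a mask containing bits outside the valid IN_* set is now rejected with EINVAL instead of being silently ignored.

#include <sys/inotify.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	int fd = inotify_init1(0);
	/* 0x00010000 is assumed not to be a valid IN_* bit */
	int wd = inotify_add_watch(fd, "/tmp", IN_CREATE | 0x00010000);

	if (wd < 0 && errno == EINVAL)
		printf("invalid mask bits rejected\n");
	return 0;
}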
 
index 262561fea923aa2315cffe91af91d12b399ded8c..9d383e5eff0ea5519bffb34528b5aeccbe6f3831 100644 (file)
@@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
                                }
                        }
                        err = add_to_page_cache_lru(*cached_page, mapping,
-                                       index,
-                                       GFP_KERNEL & mapping_gfp_mask(mapping));
+                                  index,
+                                  mapping_gfp_constraint(mapping, GFP_KERNEL));
                        if (unlikely(err)) {
                                if (err == -EEXIST)
                                        continue;
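For context, mapping_gfp_constraint() is assumed to be the pagemap helper that intersects the caller's gfp flags with the mapping's own mask; a rough sketch (assuming <linux/pagemap.h> context, not the authoritative definition):

static inline gfp_t example_gfp_constraint(struct address_space *mapping,
					   gfp_t gfp_mask)
{
	/* keep only the flags allowed by both the caller and the mapping */
	return mapping_gfp_mask(mapping) & gfp_mask;
}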
index 86181d6526dc55de22b8e118660fba86546ae340..c2cb51d4f6e9af431c39bfe5a43f79e361666907 100644 (file)
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
        struct ocfs2_extent_block *eb;
        u32 range;
 
-       /*
-        * In normal tree rotation process, we will never touch the
-        * tree branch above subtree_index and ocfs2_extend_rotate_transaction
-        * doesn't reserve the credits for them either.
-        *
-        * But we do have a special case here which will update the rightmost
-        * records for all the bh in the path.
-        * So we have to allocate extra credits and access them.
-        */
-       ret = ocfs2_extend_trans(handle, subtree_index);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
-
        ret = ocfs2_journal_access_path(et->et_ci, handle, path);
        if (ret) {
                mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
                     right_path->p_node[subtree_root].bh->b_blocknr,
                     right_path->p_tree_depth);
 
-               ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+               ret = ocfs2_extend_rotate_transaction(handle, 0,
                                                      orig_credits, left_path);
                if (ret) {
                        mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
 
-
        ret = ocfs2_et_sanity_check(et);
        if (ret)
                goto out;
-       /*
-        * There's two ways we handle this depending on
-        * whether path is the only existing one.
-        */
-       ret = ocfs2_extend_rotate_transaction(handle, 0,
-                                             handle->h_buffer_credits,
-                                             path);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
-       }
 
        ret = ocfs2_journal_access_path(et->et_ci, handle, path);
        if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
                 */
                if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
                    le16_to_cpu(el->l_next_free_rec) == 1) {
+                       /* extend credit for ocfs2_remove_rightmost_path */
+                       ret = ocfs2_extend_rotate_transaction(handle, 0,
+                                       handle->h_buffer_credits,
+                                       right_path);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
 
                        ret = ocfs2_remove_rightmost_path(handle, et,
                                                          right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
        BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
 
        if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+               /* extend credit for ocfs2_remove_rightmost_path */
+               ret = ocfs2_extend_rotate_transaction(handle, 0,
+                               handle->h_buffer_credits,
+                               path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
                /*
                 * The merge code will need to create an empty
                 * extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
                 */
                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
+               /* extend credit for ocfs2_remove_rightmost_path */
+               ret = ocfs2_extend_rotate_transaction(handle, 0,
+                                       handle->h_buffer_credits,
+                                       path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
                /* The merge left us with an empty extent, remove it. */
                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
                if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
                        goto out;
                }
 
+               /* extend credit for ocfs2_remove_rightmost_path */
+               ret = ocfs2_extend_rotate_transaction(handle, 0,
+                               handle->h_buffer_credits,
+                               path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
                /*
                 * Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
                }
 
                if (ctxt->c_split_covers_rec) {
+                       /* extend credit for ocfs2_remove_rightmost_path */
+                       ret = ocfs2_extend_rotate_transaction(handle, 0,
+                                       handle->h_buffer_credits,
+                                       path);
+                       if (ret) {
+                               mlog_errno(ret);
+                               ret = 0;
+                               goto out;
+                       }
+
                        /*
                         * The merge may have left an empty extent in
                         * our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
        struct ocfs2_extent_block *eb;
 
        if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+               /* extend credit for ocfs2_remove_rightmost_path */
+               ret = ocfs2_extend_rotate_transaction(handle, 0,
+                               handle->h_buffer_credits,
+                               path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
                if (ret) {
                        mlog_errno(ret);
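A minimal sketch of the pattern these ocfs2 hunks apply (example_remove_rightmost() is hypothetical; the parameter lists of the two real helpers are assumed from the hunks above): the journal handle is topped up with credits immediately before the call that may dirty extra buffers, instead of inside the callee.

static int example_remove_rightmost(handle_t *handle,
				    struct ocfs2_extent_tree *et,
				    struct ocfs2_path *path,
				    struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	/* extend credits first, then do the work that consumes them */
	int ret = ocfs2_extend_rotate_transaction(handle, 0,
						  handle->h_buffer_credits, path);

	if (ret) {
		mlog_errno(ret);
		return ret;
	}
	return ocfs2_remove_rightmost_path(handle, et, path, dealloc);
}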
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 
                ocfs2_journal_dirty(handle, tl_bh);
 
-               /* TODO: Perhaps we can calculate the bulk of the
-                * credits up front rather than extending like
-                * this. */
-               status = ocfs2_extend_trans(handle,
-                                           OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
-
                rec = tl->tl_recs[i];
                start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
                                                    le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
                                goto bail;
                        }
                }
+
+               status = ocfs2_extend_trans(handle,
+                               OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
                i--;
        }
 
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
                goto out_mutex;
        }
 
-       handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+       handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
                if (cancel)
                        cancel_delayed_work(&osb->osb_truncate_log_wq);
 
-               queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+               queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
                                   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
        }
 }
@@ -6254,7 +6277,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
 
        if (tl_inode) {
                cancel_delayed_work(&osb->osb_truncate_log_wq);
-               flush_workqueue(ocfs2_wq);
+               flush_workqueue(osb->ocfs2_wq);
 
                status = ocfs2_flush_truncate_log(osb);
                if (status < 0)
index 64b11d90eca688fbce4e81b0e503c0e9ffd44197..4bb992145385b7f37ea58d198c2bb12a71941510 100644 (file)
@@ -499,152 +499,6 @@ bail:
        return status;
 }
 
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- *  "So what we do is to permit the ->get_blocks function to populate
- *   bh.b_size with the size of IO which is permitted at this offset and
- *   this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- *                                     fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
-                                    struct buffer_head *bh_result, int create)
-{
-       int ret;
-       u32 cpos = 0;
-       int alloc_locked = 0;
-       u64 p_blkno, inode_blocks, contig_blocks;
-       unsigned int ext_flags;
-       unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
-       unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
-       unsigned long len = bh_result->b_size;
-       unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
-       cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
-       /* This function won't even be called if the request isn't all
-        * nicely aligned and of the right size, so there's no need
-        * for us to check any of that. */
-
-       inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
-       down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-       /* This figures out the size of the next contiguous block, and
-        * our logical offset */
-       ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
-                                         &contig_blocks, &ext_flags);
-       up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
-       if (ret) {
-               mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
-                    (unsigned long long)iblock);
-               ret = -EIO;
-               goto bail;
-       }
-
-       /* We should already CoW the refcounted extent in case of create. */
-       BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
-       /* allocate blocks if no p_blkno is found, and create == 1 */
-       if (!p_blkno && create) {
-               ret = ocfs2_inode_lock(inode, NULL, 1);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto bail;
-               }
-
-               alloc_locked = 1;
-
-               down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
-               /* fill hole, allocate blocks can't be larger than the size
-                * of the hole */
-               clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
-               contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
-                               contig_blocks);
-               if (clusters_to_alloc > contig_clusters)
-                       clusters_to_alloc = contig_clusters;
-
-               /* allocate extent and insert them into the extent tree */
-               ret = ocfs2_extend_allocation(inode, cpos,
-                               clusters_to_alloc, 0);
-               if (ret < 0) {
-                       up_write(&OCFS2_I(inode)->ip_alloc_sem);
-                       mlog_errno(ret);
-                       goto bail;
-               }
-
-               ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
-                               &contig_blocks, &ext_flags);
-               if (ret < 0) {
-                       up_write(&OCFS2_I(inode)->ip_alloc_sem);
-                       mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
-                                       (unsigned long long)iblock);
-                       ret = -EIO;
-                       goto bail;
-               }
-               up_write(&OCFS2_I(inode)->ip_alloc_sem);
-       }
-
-       /*
-        * get_more_blocks() expects us to describe a hole by clearing
-        * the mapped bit on bh_result().
-        *
-        * Consider an unwritten extent as a hole.
-        */
-       if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
-               map_bh(bh_result, inode->i_sb, p_blkno);
-       else
-               clear_buffer_mapped(bh_result);
-
-       /* make sure we don't map more than max_blocks blocks here as
-          that's all the kernel will handle at this point. */
-       if (max_blocks < contig_blocks)
-               contig_blocks = max_blocks;
-       bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
-       if (alloc_locked)
-               ocfs2_inode_unlock(inode, 1);
-       return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
- * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static void ocfs2_dio_end_io(struct kiocb *iocb,
-                            loff_t offset,
-                            ssize_t bytes,
-                            void *private)
-{
-       struct inode *inode = file_inode(iocb->ki_filp);
-       int level;
-
-       /* this io's submitter should not have unlocked this before we could */
-       BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
-       if (ocfs2_iocb_is_unaligned_aio(iocb)) {
-               ocfs2_iocb_clear_unaligned_aio(iocb);
-
-               mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
-       }
-
-       /* Let rw unlock to be done later to protect append direct io write */
-       if (offset + bytes <= i_size_read(inode)) {
-               ocfs2_iocb_clear_rw_locked(iocb);
-
-               level = ocfs2_iocb_rw_locked_level(iocb);
-               ocfs2_rw_unlock(inode, level);
-       }
-}
-
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
 {
        if (!page_has_buffers(page))
@@ -652,361 +506,6 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
        return try_to_free_buffers(page);
 }
 
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
-               struct inode *inode, loff_t offset)
-{
-       int ret = 0;
-       u32 v_cpos = 0;
-       u32 p_cpos = 0;
-       unsigned int num_clusters = 0;
-       unsigned int ext_flags = 0;
-
-       v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-       ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
-                       &num_clusters, &ext_flags);
-       if (ret < 0) {
-               mlog_errno(ret);
-               return ret;
-       }
-
-       if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
-               return 1;
-
-       return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
-               struct inode *inode, loff_t offset,
-               u64 zero_len, int cluster_align)
-{
-       u32 p_cpos = 0;
-       u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
-       unsigned int num_clusters = 0;
-       unsigned int ext_flags = 0;
-       int ret = 0;
-
-       if (offset <= i_size_read(inode) || cluster_align)
-               return 0;
-
-       ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
-                       &ext_flags);
-       if (ret < 0) {
-               mlog_errno(ret);
-               return ret;
-       }
-
-       if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-               u64 s = i_size_read(inode);
-               sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
-                       (do_div(s, osb->s_clustersize) >> 9);
-
-               ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
-                               zero_len >> 9, GFP_NOFS, false);
-               if (ret < 0)
-                       mlog_errno(ret);
-       }
-
-       return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
-               struct inode *inode, loff_t offset)
-{
-       u64 zero_start, zero_len, total_zero_len;
-       u32 p_cpos = 0, clusters_to_add;
-       u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
-       unsigned int num_clusters = 0;
-       unsigned int ext_flags = 0;
-       u32 size_div, offset_div;
-       int ret = 0;
-
-       {
-               u64 o = offset;
-               u64 s = i_size_read(inode);
-
-               offset_div = do_div(o, osb->s_clustersize);
-               size_div = do_div(s, osb->s_clustersize);
-       }
-
-       if (offset <= i_size_read(inode))
-               return 0;
-
-       clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
-               ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
-       total_zero_len = offset - i_size_read(inode);
-       if (clusters_to_add)
-               total_zero_len -= offset_div;
-
-       /* Allocate clusters to fill out holes, and this is only needed
-        * when we add more than one clusters. Otherwise the cluster will
-        * be allocated during direct IO */
-       if (clusters_to_add > 1) {
-               ret = ocfs2_extend_allocation(inode,
-                               OCFS2_I(inode)->ip_clusters,
-                               clusters_to_add - 1, 0);
-               if (ret) {
-                       mlog_errno(ret);
-                       goto out;
-               }
-       }
-
-       while (total_zero_len) {
-               ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
-                               &ext_flags);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out;
-               }
-
-               zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
-                       size_div;
-               zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
-                       size_div;
-               zero_len = min(total_zero_len, zero_len);
-
-               if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-                       ret = blkdev_issue_zeroout(osb->sb->s_bdev,
-                                       zero_start >> 9, zero_len >> 9,
-                                       GFP_NOFS, false);
-                       if (ret < 0) {
-                               mlog_errno(ret);
-                               goto out;
-                       }
-               }
-
-               total_zero_len -= zero_len;
-               v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
-               /* Only at first iteration can be cluster not aligned.
-                * So set size_div to 0 for the rest */
-               size_div = 0;
-       }
-
-out:
-       return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
-               struct iov_iter *iter,
-               loff_t offset)
-{
-       ssize_t ret = 0;
-       ssize_t written = 0;
-       bool orphaned = false;
-       int is_overwrite = 0;
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file_inode(file)->i_mapping->host;
-       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       struct buffer_head *di_bh = NULL;
-       size_t count = iter->count;
-       journal_t *journal = osb->journal->j_journal;
-       u64 zero_len_head, zero_len_tail;
-       int cluster_align_head, cluster_align_tail;
-       loff_t final_size = offset + count;
-       int append_write = offset >= i_size_read(inode) ? 1 : 0;
-       unsigned int num_clusters = 0;
-       unsigned int ext_flags = 0;
-
-       {
-               u64 o = offset;
-               u64 s = i_size_read(inode);
-
-               zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
-               cluster_align_head = !zero_len_head;
-
-               zero_len_tail = osb->s_clustersize -
-                       do_div(s, osb->s_clustersize);
-               if ((offset - i_size_read(inode)) < zero_len_tail)
-                       zero_len_tail = offset - i_size_read(inode);
-               cluster_align_tail = !zero_len_tail;
-       }
-
-       /*
-        * when final_size > inode->i_size, inode->i_size will be
-        * updated after direct write, so add the inode to orphan
-        * dir first.
-        */
-       if (final_size > i_size_read(inode)) {
-               ret = ocfs2_add_inode_to_orphan(osb, inode);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out;
-               }
-               orphaned = true;
-       }
-
-       if (append_write) {
-               ret = ocfs2_inode_lock(inode, NULL, 1);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto clean_orphan;
-               }
-
-               /* zeroing out the previously allocated cluster tail
-                * that but not zeroed */
-               if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-                       down_read(&OCFS2_I(inode)->ip_alloc_sem);
-                       ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
-                                       zero_len_tail, cluster_align_tail);
-                       up_read(&OCFS2_I(inode)->ip_alloc_sem);
-               } else {
-                       down_write(&OCFS2_I(inode)->ip_alloc_sem);
-                       ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
-                                       offset);
-                       up_write(&OCFS2_I(inode)->ip_alloc_sem);
-               }
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       ocfs2_inode_unlock(inode, 1);
-                       goto clean_orphan;
-               }
-
-               is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
-               if (is_overwrite < 0) {
-                       mlog_errno(is_overwrite);
-                       ocfs2_inode_unlock(inode, 1);
-                       goto clean_orphan;
-               }
-
-               ocfs2_inode_unlock(inode, 1);
-       }
-
-       written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-                                      offset, ocfs2_direct_IO_get_blocks,
-                                      ocfs2_dio_end_io, NULL, 0);
-       /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
-       if ((written < 0) && (written != -EIOCBQUEUED)) {
-               loff_t i_size = i_size_read(inode);
-
-               if (offset + count > i_size) {
-                       ret = ocfs2_inode_lock(inode, &di_bh, 1);
-                       if (ret < 0) {
-                               mlog_errno(ret);
-                               goto clean_orphan;
-                       }
-
-                       if (i_size == i_size_read(inode)) {
-                               ret = ocfs2_truncate_file(inode, di_bh,
-                                               i_size);
-                               if (ret < 0) {
-                                       if (ret != -ENOSPC)
-                                               mlog_errno(ret);
-
-                                       ocfs2_inode_unlock(inode, 1);
-                                       brelse(di_bh);
-                                       di_bh = NULL;
-                                       goto clean_orphan;
-                               }
-                       }
-
-                       ocfs2_inode_unlock(inode, 1);
-                       brelse(di_bh);
-                       di_bh = NULL;
-
-                       ret = jbd2_journal_force_commit(journal);
-                       if (ret < 0)
-                               mlog_errno(ret);
-               }
-       } else if (written > 0 && append_write && !is_overwrite &&
-                       !cluster_align_head) {
-               /* zeroing out the allocated cluster head */
-               u32 p_cpos = 0;
-               u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
-               ret = ocfs2_inode_lock(inode, NULL, 0);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto clean_orphan;
-               }
-
-               ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
-                               &num_clusters, &ext_flags);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       ocfs2_inode_unlock(inode, 0);
-                       goto clean_orphan;
-               }
-
-               BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
-               ret = blkdev_issue_zeroout(osb->sb->s_bdev,
-                               (u64)p_cpos << (osb->s_clustersize_bits - 9),
-                               zero_len_head >> 9, GFP_NOFS, false);
-               if (ret < 0)
-                       mlog_errno(ret);
-
-               ocfs2_inode_unlock(inode, 0);
-       }
-
-clean_orphan:
-       if (orphaned) {
-               int tmp_ret;
-               int update_isize = written > 0 ? 1 : 0;
-               loff_t end = update_isize ? offset + written : 0;
-
-               tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
-               if (tmp_ret < 0) {
-                       ret = tmp_ret;
-                       mlog_errno(ret);
-                       goto out;
-               }
-
-               tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
-                               update_isize, end);
-               if (tmp_ret < 0) {
-                       ret = tmp_ret;
-                       mlog_errno(ret);
-                       brelse(di_bh);
-                       goto out;
-               }
-
-               ocfs2_inode_unlock(inode, 1);
-               brelse(di_bh);
-
-               tmp_ret = jbd2_journal_force_commit(journal);
-               if (tmp_ret < 0) {
-                       ret = tmp_ret;
-                       mlog_errno(tmp_ret);
-               }
-       }
-
-out:
-       if (ret >= 0)
-               ret = written;
-       return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-                              loff_t offset)
-{
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file_inode(file)->i_mapping->host;
-       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       int full_coherency = !(osb->s_mount_opt &
-                       OCFS2_MOUNT_COHERENCY_BUFFERED);
-
-       /*
-        * Fallback to buffered I/O if we see an inode without
-        * extents.
-        */
-       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-               return 0;
-
-       /* Fallback to buffered I/O if we are appending and
-        * concurrent O_DIRECT writes are allowed.
-        */
-       if (i_size_read(inode) <= offset && !full_coherency)
-               return 0;
-
-       if (iov_iter_rw(iter) == READ)
-               return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-                                           iter, offset,
-                                           ocfs2_direct_IO_get_blocks,
-                                           ocfs2_dio_end_io, NULL, 0);
-       else
-               return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
                                            u32 cpos,
                                            unsigned int *start,
@@ -1193,6 +692,13 @@ next_bh:
 
 #define OCFS2_MAX_CLUSTERS_PER_PAGE    (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
 
+struct ocfs2_unwritten_extent {
+       struct list_head        ue_node;
+       struct list_head        ue_ip_node;
+       u32                     ue_cpos;
+       u32                     ue_phys;
+};
+
 /*
  * Describe the state of a single cluster to be written to.
  */
@@ -1204,7 +710,7 @@ struct ocfs2_write_cluster_desc {
         * filled.
         */
        unsigned        c_new;
-       unsigned        c_unwritten;
+       unsigned        c_clear_unwritten;
        unsigned        c_needs_zero;
 };
 
@@ -1216,6 +722,9 @@ struct ocfs2_write_ctxt {
        /* First cluster allocated in a nonsparse extend */
        u32                             w_first_new_cpos;
 
+       /* Type of caller. Must be one of buffer, mmap, direct.  */
+       ocfs2_write_type_t              w_type;
+
        struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
        /*
@@ -1264,6 +773,8 @@ struct ocfs2_write_ctxt {
        struct buffer_head              *w_di_bh;
 
        struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+       struct list_head                w_unwritten_list;
 };
 
 void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1302,8 +813,25 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
        ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
 }
 
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+                                struct list_head *head)
+{
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+       struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+
+       list_for_each_entry_safe(ue, tmp, head, ue_node) {
+               list_del(&ue->ue_node);
+               spin_lock(&oi->ip_lock);
+               list_del(&ue->ue_ip_node);
+               spin_unlock(&oi->ip_lock);
+               kfree(ue);
+       }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+                                 struct ocfs2_write_ctxt *wc)
 {
+       ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
        ocfs2_unlock_pages(wc);
        brelse(wc->w_di_bh);
        kfree(wc);
@@ -1311,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
 
 static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
                                  struct ocfs2_super *osb, loff_t pos,
-                                 unsigned len, struct buffer_head *di_bh)
+                                 unsigned len, ocfs2_write_type_t type,
+                                 struct buffer_head *di_bh)
 {
        u32 cend;
        struct ocfs2_write_ctxt *wc;
@@ -1326,6 +855,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
        wc->w_clen = cend - wc->w_cpos + 1;
        get_bh(di_bh);
        wc->w_di_bh = di_bh;
+       wc->w_type = type;
 
        if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
                wc->w_large_pages = 1;
@@ -1333,6 +863,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
                wc->w_large_pages = 0;
 
        ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+       INIT_LIST_HEAD(&wc->w_unwritten_list);
 
        *wcp = wc;
 
@@ -1393,12 +924,13 @@ static void ocfs2_write_failure(struct inode *inode,
                to = user_pos + user_len;
        struct page *tmppage;
 
-       ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+       if (wc->w_target_page)
+               ocfs2_zero_new_buffers(wc->w_target_page, from, to);
 
        for(i = 0; i < wc->w_num_pages; i++) {
                tmppage = wc->w_pages[i];
 
-               if (page_has_buffers(tmppage)) {
+               if (tmppage && page_has_buffers(tmppage)) {
                        if (ocfs2_should_order_data(inode))
                                ocfs2_jbd2_file_inode(wc->w_handle, inode);
 
@@ -1528,11 +1060,13 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
                wc->w_num_pages = 1;
                start = target_index;
        }
+       end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
 
        for(i = 0; i < wc->w_num_pages; i++) {
                index = start + i;
 
-               if (index == target_index && mmap_page) {
+               if (index >= target_index && index <= end_index &&
+                   wc->w_type == OCFS2_WRITE_MMAP) {
                        /*
                         * ocfs2_pagemkwrite() is a little different
                         * and wants us to directly use the page
@@ -1551,6 +1085,11 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
                        page_cache_get(mmap_page);
                        wc->w_pages[i] = mmap_page;
                        wc->w_target_locked = true;
+               } else if (index >= target_index && index <= end_index &&
+                          wc->w_type == OCFS2_WRITE_DIRECT) {
+                       /* Direct write has no mapping page. */
+                       wc->w_pages[i] = NULL;
+                       continue;
                } else {
                        wc->w_pages[i] = find_or_create_page(mapping, index,
                                                             GFP_NOFS);
@@ -1575,19 +1114,20 @@ out:
  * Prepare a single cluster for write one cluster into the file.
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
-                              u32 phys, unsigned int unwritten,
+                              u32 *phys, unsigned int new,
+                              unsigned int clear_unwritten,
                               unsigned int should_zero,
                               struct ocfs2_alloc_context *data_ac,
                               struct ocfs2_alloc_context *meta_ac,
                               struct ocfs2_write_ctxt *wc, u32 cpos,
                               loff_t user_pos, unsigned user_len)
 {
-       int ret, i, new;
-       u64 v_blkno, p_blkno;
+       int ret, i;
+       u64 p_blkno;
        struct inode *inode = mapping->host;
        struct ocfs2_extent_tree et;
+       int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 
-       new = phys == 0 ? 1 : 0;
        if (new) {
                u32 tmp_pos;
 
@@ -1597,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
                 */
                tmp_pos = cpos;
                ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
-                                          &tmp_pos, 1, 0, wc->w_di_bh,
-                                          wc->w_handle, data_ac,
-                                          meta_ac, NULL);
+                                          &tmp_pos, 1, !clear_unwritten,
+                                          wc->w_di_bh, wc->w_handle,
+                                          data_ac, meta_ac, NULL);
                /*
                 * This shouldn't happen because we must have already
                 * calculated the correct meta data allocation required. The
@@ -1616,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
                        mlog_errno(ret);
                        goto out;
                }
-       } else if (unwritten) {
+       } else if (clear_unwritten) {
                ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
                                              wc->w_di_bh);
                ret = ocfs2_mark_extent_written(inode, &et,
-                                               wc->w_handle, cpos, 1, phys,
+                                               wc->w_handle, cpos, 1, *phys,
                                                meta_ac, &wc->w_dealloc);
                if (ret < 0) {
                        mlog_errno(ret);
@@ -1628,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
                }
        }
 
-       if (should_zero)
-               v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
-       else
-               v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
        /*
         * The only reason this should fail is due to an inability to
         * find the extent added.
         */
-       ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
-                                         NULL);
+       ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
        if (ret < 0) {
                mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
-                           "at logical block %llu",
-                           (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                           (unsigned long long)v_blkno);
+                           "at logical cluster %u",
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
                goto out;
        }
 
-       BUG_ON(p_blkno == 0);
+       BUG_ON(*phys == 0);
+
+       p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+       if (!should_zero)
+               p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
 
        for(i = 0; i < wc->w_num_pages; i++) {
                int tmpret;
 
+               /* This is the direct io target page. */
+               if (wc->w_pages[i] == NULL) {
+                       p_blkno++;
+                       continue;
+               }
+
                tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
                                                      wc->w_pages[i], cpos,
                                                      user_pos, user_len,
@@ -1698,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
                if ((cluster_off + local_len) > osb->s_clustersize)
                        local_len = osb->s_clustersize - cluster_off;
 
-               ret = ocfs2_write_cluster(mapping, desc->c_phys,
-                                         desc->c_unwritten,
+               ret = ocfs2_write_cluster(mapping, &desc->c_phys,
+                                         desc->c_new,
+                                         desc->c_clear_unwritten,
                                          desc->c_needs_zero,
                                          data_ac, meta_ac,
                                          wc, desc->c_cpos, pos, local_len);
@@ -1769,6 +1313,66 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
        }
 }
 
+/*
+ * Check whether this extent is already marked UNWRITTEN by a direct I/O
+ * write. If so, we neither need to do the zeroing work nor clear the
+ * UNWRITTEN flag here, since the direct I/O path will clear it itself.
+ * If this is a new extent allocated by direct I/O, record it in the
+ * ip_unwritten_list.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+                                struct ocfs2_write_ctxt *wc,
+                                struct ocfs2_write_cluster_desc *desc)
+{
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+       struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
+       int ret = 0;
+
+       if (!desc->c_needs_zero)
+               return 0;
+
+retry:
+       spin_lock(&oi->ip_lock);
+       /* No need to zero, no matter whether this is a buffered or a direct
+        * write: whoever added this cluster to the list is already zeroing it
+        * and will clear UNWRITTEN once all cluster io has finished. */
+       list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+               if (desc->c_cpos == ue->ue_cpos) {
+                       BUG_ON(desc->c_new);
+                       desc->c_needs_zero = 0;
+                       desc->c_clear_unwritten = 0;
+                       goto unlock;
+               }
+       }
+
+       if (wc->w_type != OCFS2_WRITE_DIRECT)
+               goto unlock;
+
+       if (new == NULL) {
+               spin_unlock(&oi->ip_lock);
+               new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
+                            GFP_NOFS);
+               if (new == NULL) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               goto retry;
+       }
+       /* This direct write will do the zeroing. */
+       new->ue_cpos = desc->c_cpos;
+       new->ue_phys = desc->c_phys;
+       desc->c_clear_unwritten = 0;
+       list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+       list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+       new = NULL;
+unlock:
+       spin_unlock(&oi->ip_lock);
+out:
+       if (new)
+               kfree(new);
+       return ret;
+}
+
 /*
  * Populate each single-cluster write descriptor in the write context
  * with information about the i/o to be done.
@@ -1844,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
                if (phys == 0) {
                        desc->c_new = 1;
                        desc->c_needs_zero = 1;
+                       desc->c_clear_unwritten = 1;
                        *clusters_to_alloc = *clusters_to_alloc + 1;
                }
 
                if (ext_flags & OCFS2_EXT_UNWRITTEN) {
-                       desc->c_unwritten = 1;
+                       desc->c_clear_unwritten = 1;
                        desc->c_needs_zero = 1;
                }
 
+               ret = ocfs2_unwritten_check(inode, wc, desc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
                num_clusters--;
        }
 
@@ -2014,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
        if (ret)
                mlog_errno(ret);
 
-       wc->w_first_new_cpos =
-               ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+       /* There is no wc when this is called from the direct io path. */
+       if (wc)
+               wc->w_first_new_cpos =
+                       ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
 
        return ret;
 }
@@ -2069,9 +1682,8 @@ out:
        return ret;
 }
 
-int ocfs2_write_begin_nolock(struct file *filp,
-                            struct address_space *mapping,
-                            loff_t pos, unsigned len, unsigned flags,
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+                            loff_t pos, unsigned len, ocfs2_write_type_t type,
                             struct page **pagep, void **fsdata,
                             struct buffer_head *di_bh, struct page *mmap_page)
 {
@@ -2088,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
        int try_free = 1, ret1;
 
 try_again:
-       ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+       ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
        if (ret) {
                mlog_errno(ret);
                return ret;
@@ -2107,14 +1719,17 @@ try_again:
                }
        }
 
-       if (ocfs2_sparse_alloc(osb))
-               ret = ocfs2_zero_tail(inode, di_bh, pos);
-       else
-               ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
-                                                  wc);
-       if (ret) {
-               mlog_errno(ret);
-               goto out;
+       /* Direct io changes i_size later, so do not zero the tail here. */
+       if (type != OCFS2_WRITE_DIRECT) {
+               if (ocfs2_sparse_alloc(osb))
+                       ret = ocfs2_zero_tail(inode, di_bh, pos);
+               else
+                       ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+                                                          len, wc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
        }
 
        ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2145,7 +1760,7 @@ try_again:
                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
                        (long long)i_size_read(inode),
                        le32_to_cpu(di->i_clusters),
-                       pos, len, flags, mmap_page,
+                       pos, len, type, mmap_page,
                        clusters_to_alloc, extents_to_split);
 
        /*
@@ -2175,17 +1790,17 @@ try_again:
 
                credits = ocfs2_calc_extend_credits(inode->i_sb,
                                                    &di->id2.i_list);
-
-       }
+       } else if (type == OCFS2_WRITE_DIRECT)
+               /* direct write need not start a trans if no extents are allocated. */
+               goto success;
 
        /*
         * We have to zero sparse allocated clusters, unwritten extent clusters,
         * and non-sparse clusters we just extended.  For non-sparse writes,
         * we know zeros will only be needed in the first and/or last cluster.
         */
-       if (clusters_to_alloc || extents_to_split ||
-           (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
-                           wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+       if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+                          wc->w_desc[wc->w_clen - 1].c_needs_zero))
                cluster_of_pages = 1;
        else
                cluster_of_pages = 0;
@@ -2252,7 +1867,8 @@ try_again:
                ocfs2_free_alloc_context(meta_ac);
 
 success:
-       *pagep = wc->w_target_page;
+       if (pagep)
+               *pagep = wc->w_target_page;
        *fsdata = wc;
        return 0;
 out_quota:
@@ -2263,7 +1879,7 @@ out_commit:
        ocfs2_commit_trans(osb, handle);
 
 out:
-       ocfs2_free_write_ctxt(wc);
+       ocfs2_free_write_ctxt(inode, wc);
 
        if (data_ac) {
                ocfs2_free_alloc_context(data_ac);
@@ -2315,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
         */
        down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-       ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
-                                      fsdata, di_bh, NULL);
+       ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+                                      pagep, fsdata, di_bh, NULL);
        if (ret) {
                mlog_errno(ret);
                goto out_fail;
@@ -2373,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
        handle_t *handle = wc->w_handle;
        struct page *tmppage;
 
-       ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
-                       OCFS2_JOURNAL_ACCESS_WRITE);
-       if (ret) {
-               copied = ret;
-               mlog_errno(ret);
-               goto out;
+       BUG_ON(!list_empty(&wc->w_unwritten_list));
+
+       if (handle) {
+               ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+                               wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret) {
+                       copied = ret;
+                       mlog_errno(ret);
+                       goto out;
+               }
        }
 
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2386,18 +2006,23 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                goto out_write_size;
        }
 
-       if (unlikely(copied < len)) {
+       if (unlikely(copied < len) && wc->w_target_page) {
                if (!PageUptodate(wc->w_target_page))
                        copied = 0;
 
                ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
                                       start+len);
        }
-       flush_dcache_page(wc->w_target_page);
+       if (wc->w_target_page)
+               flush_dcache_page(wc->w_target_page);
 
        for(i = 0; i < wc->w_num_pages; i++) {
                tmppage = wc->w_pages[i];
 
+               /* This is the direct io target page. */
+               if (tmppage == NULL)
+                       continue;
+
                if (tmppage == wc->w_target_page) {
                        from = wc->w_target_from;
                        to = wc->w_target_to;
@@ -2416,25 +2041,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                }
 
                if (page_has_buffers(tmppage)) {
-                       if (ocfs2_should_order_data(inode))
-                               ocfs2_jbd2_file_inode(wc->w_handle, inode);
+                       if (handle && ocfs2_should_order_data(inode))
+                               ocfs2_jbd2_file_inode(handle, inode);
                        block_commit_write(tmppage, from, to);
                }
        }
 
 out_write_size:
-       pos += copied;
-       if (pos > i_size_read(inode)) {
-               i_size_write(inode, pos);
-               mark_inode_dirty(inode);
-       }
-       inode->i_blocks = ocfs2_inode_sector_count(inode);
-       di->i_size = cpu_to_le64((u64)i_size_read(inode));
-       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-       di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-       di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-       ocfs2_update_inode_fsync_trans(handle, inode, 1);
-       ocfs2_journal_dirty(handle, wc->w_di_bh);
+       /* Direct io does not update i_size here. */
+       if (wc->w_type != OCFS2_WRITE_DIRECT) {
+               pos += copied;
+               if (pos > i_size_read(inode)) {
+                       i_size_write(inode, pos);
+                       mark_inode_dirty(inode);
+               }
+               inode->i_blocks = ocfs2_inode_sector_count(inode);
+               di->i_size = cpu_to_le64((u64)i_size_read(inode));
+               inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+               di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+               di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+               ocfs2_update_inode_fsync_trans(handle, inode, 1);
+       }
+       if (handle)
+               ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 out:
        /* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2444,7 +2073,8 @@ out:
         */
        ocfs2_unlock_pages(wc);
 
-       ocfs2_commit_trans(osb, handle);
+       if (handle)
+               ocfs2_commit_trans(osb, handle);
 
        ocfs2_run_deallocs(osb, &wc->w_dealloc);
 
@@ -2469,6 +2099,340 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
        return ret;
 }
 
+struct ocfs2_dio_write_ctxt {
+       struct list_head        dw_zero_list;
+       unsigned                dw_zero_count;
+       int                     dw_orphaned;
+       pid_t                   dw_writer_pid;
+};
+
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+       struct ocfs2_dio_write_ctxt *dwc = NULL;
+
+       if (bh->b_private)
+               return bh->b_private;
+
+       dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
+       if (dwc == NULL)
+               return NULL;
+       INIT_LIST_HEAD(&dwc->dw_zero_list);
+       dwc->dw_zero_count = 0;
+       dwc->dw_orphaned = 0;
+       dwc->dw_writer_pid = task_pid_nr(current);
+       bh->b_private = dwc;
+       *alloc = 1;
+
+       return dwc;
+}
+
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+                                    struct ocfs2_dio_write_ctxt *dwc)
+{
+       ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+       kfree(dwc);
+}
+
+/*
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ *  "So what we do is to permit the ->get_blocks function to populate
+ *   bh.b_size with the size of IO which is permitted at this offset and
+ *   this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * called like this: dio->get_blocks(dio->inode, fs_startblk,
+ *                                     fs_count, map_bh, dio->rw == WRITE);
+ */
+static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+                              struct buffer_head *bh_result, int create)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_write_ctxt *wc;
+       struct ocfs2_write_cluster_desc *desc = NULL;
+       struct ocfs2_dio_write_ctxt *dwc = NULL;
+       struct buffer_head *di_bh = NULL;
+       u64 p_blkno;
+       loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+       unsigned len, total_len = bh_result->b_size;
+       int ret = 0, first_get_block = 0;
+
+       len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+       len = min(total_len, len);
+
+       mlog(0, "get block of %lu at %llu:%u req %u\n",
+                       inode->i_ino, pos, len, total_len);
+
+       /* This is the fast path for re-write. */
+       ret = ocfs2_get_block(inode, iblock, bh_result, create);
+
+       if (buffer_mapped(bh_result) &&
+           !buffer_new(bh_result) &&
+           ret == 0)
+               goto out;
+
+       /* Clear state set by ocfs2_get_block. */
+       bh_result->b_state = 0;
+
+       dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+       if (unlikely(dwc == NULL)) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+           ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+           !dwc->dw_orphaned) {
+               /*
+                * When we are going to alloc extents beyond the file size, add
+                * the inode to the orphan dir first, so we can reclaim that
+                * space if the system crashes during the write.
+                */
+               ret = ocfs2_add_inode_to_orphan(osb, inode);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               dwc->dw_orphaned = 1;
+       }
+
+       ret = ocfs2_inode_lock(inode, &di_bh, 1);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (first_get_block) {
+               if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+                       ret = ocfs2_zero_tail(inode, di_bh, pos);
+               else
+                       ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+                                                          total_len, NULL);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto unlock;
+               }
+       }
+
+       ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+                                      OCFS2_WRITE_DIRECT, NULL,
+                                      (void **)&wc, di_bh, NULL);
+       if (ret) {
+               mlog_errno(ret);
+               goto unlock;
+       }
+
+       desc = &wc->w_desc[0];
+
+       p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+       BUG_ON(p_blkno == 0);
+       p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+       map_bh(bh_result, inode->i_sb, p_blkno);
+       bh_result->b_size = len;
+       if (desc->c_needs_zero)
+               set_buffer_new(bh_result);
+
+       /* end_io may sleep, which must not happen in irq context, so defer
+        * completion to the dio work queue. */
+       set_buffer_defer_completion(bh_result);
+
+       if (!list_empty(&wc->w_unwritten_list)) {
+               struct ocfs2_unwritten_extent *ue = NULL;
+
+               ue = list_first_entry(&wc->w_unwritten_list,
+                                     struct ocfs2_unwritten_extent,
+                                     ue_node);
+               BUG_ON(ue->ue_cpos != desc->c_cpos);
+               /* The physical address may be 0, fill it. */
+               ue->ue_phys = desc->c_phys;
+
+               list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+               dwc->dw_zero_count++;
+       }
+
+       ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+       BUG_ON(ret != len);
+       ret = 0;
+unlock:
+       ocfs2_inode_unlock(inode, 1);
+       brelse(di_bh);
+out:
+       if (ret < 0)
+               ret = -EIO;
+       return ret;
+}
+
+static void ocfs2_dio_end_io_write(struct inode *inode,
+                                  struct ocfs2_dio_write_ctxt *dwc,
+                                  loff_t offset,
+                                  ssize_t bytes)
+{
+       struct ocfs2_cached_dealloc_ctxt dealloc;
+       struct ocfs2_extent_tree et;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_unwritten_extent *ue = NULL;
+       struct buffer_head *di_bh = NULL;
+       struct ocfs2_dinode *di;
+       struct ocfs2_alloc_context *data_ac = NULL;
+       struct ocfs2_alloc_context *meta_ac = NULL;
+       handle_t *handle = NULL;
+       loff_t end = offset + bytes;
+       int ret = 0, credits = 0, locked = 0;
+
+       ocfs2_init_dealloc_ctxt(&dealloc);
+
+       /* We clear unwritten extents, delete the orphan entry and update
+        * i_size here. If none of these are needed, we can skip all of this. */
+       if (list_empty(&dwc->dw_zero_list) &&
+           end <= i_size_read(inode) &&
+           !dwc->dw_orphaned)
+               goto out;
+
+       ret = ocfs2_inode_lock(inode, &di_bh, 1);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /* ocfs2_file_write_iter already holds i_mutex, so we need not take it
+        * again if we are running in that context. */
+       if (dwc->dw_writer_pid != task_pid_nr(current)) {
+               mutex_lock(&inode->i_mutex);
+               locked = 1;
+       }
+
+       /* Delete the orphan entry before acquiring i_mutex. */
+       if (dwc->dw_orphaned) {
+               BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+               end = end > i_size_read(inode) ? end : 0;
+
+               ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+                               !!end, end);
+               if (ret < 0)
+                       mlog_errno(ret);
+       }
+
+       di = (struct ocfs2_dinode *)di_bh;
+
+       ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+       ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+                                   &data_ac, &meta_ac);
+
+       credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+       handle = ocfs2_start_trans(osb, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               goto unlock;
+       }
+       ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto commit;
+       }
+
+       list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+               ret = ocfs2_mark_extent_written(inode, &et, handle,
+                                               ue->ue_cpos, 1,
+                                               ue->ue_phys,
+                                               meta_ac, &dealloc);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       break;
+               }
+       }
+
+       if (end > i_size_read(inode)) {
+               ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+               if (ret < 0)
+                       mlog_errno(ret);
+       }
+commit:
+       ocfs2_commit_trans(osb, handle);
+unlock:
+       ocfs2_inode_unlock(inode, 1);
+       brelse(di_bh);
+out:
+       ocfs2_run_deallocs(osb, &dealloc);
+       if (locked)
+               mutex_unlock(&inode->i_mutex);
+       ocfs2_dio_free_write_ctx(inode, dwc);
+       if (data_ac)
+               ocfs2_free_alloc_context(data_ac);
+       if (meta_ac)
+               ocfs2_free_alloc_context(meta_ac);
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
+ * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ */
+static void ocfs2_dio_end_io(struct kiocb *iocb,
+                            loff_t offset,
+                            ssize_t bytes,
+                            void *private)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       int level;
+
+       /* this io's submitter should not have unlocked this before we could */
+       BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+       if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+               ocfs2_iocb_clear_unaligned_aio(iocb);
+
+               mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+       }
+
+       if (private)
+               ocfs2_dio_end_io_write(inode, private, offset, bytes);
+
+       ocfs2_iocb_clear_rw_locked(iocb);
+
+       level = ocfs2_iocb_rw_locked_level(iocb);
+       ocfs2_rw_unlock(inode, level);
+}
+
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+                              loff_t offset)
+{
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file)->i_mapping->host;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       loff_t end = offset + iter->count;
+       get_block_t *get_block;
+
+       /*
+        * Fall back to buffered I/O if we see an inode without
+        * extents.
+        */
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               return 0;
+
+       /* Fall back to buffered I/O if we do not support append dio. */
+       if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+               return 0;
+
+       if (iov_iter_rw(iter) == READ)
+               get_block = ocfs2_get_block;
+       else
+               get_block = ocfs2_dio_get_block;
+
+       return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+                                   iter, offset, get_block,
+                                   ocfs2_dio_end_io, NULL, 0);
+}
+
 const struct address_space_operations ocfs2_aops = {
        .readpage               = ocfs2_readpage,
        .readpages              = ocfs2_readpages,
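Editor's note (not part of the patch): the comment block below is a best-effort sketch of how the new direct-io pieces in this file fit together, inferred only from the hunks shown above.

/*
 * Reader's sketch of the new direct write path:
 *
 *   ocfs2_direct_IO()
 *     -> __blockdev_direct_IO(..., ocfs2_dio_get_block, ocfs2_dio_end_io, ...)
 *
 *   ocfs2_dio_get_block()                          called per buffer_head
 *     -> ocfs2_write_begin_nolock(..., OCFS2_WRITE_DIRECT, ...)
 *     -> unwritten extents are spliced onto dwc->dw_zero_list, where the
 *        dwc lives in bh_result->b_private (ocfs2_dio_alloc_write_ctx())
 *     -> ocfs2_write_end_nolock(), which skips the i_size update for
 *        OCFS2_WRITE_DIRECT
 *
 *   ocfs2_dio_end_io() -> ocfs2_dio_end_io_write()
 *     -> ocfs2_mark_extent_written() for each extent on dw_zero_list
 *     -> orphan removal and the i_size update for append writes
 */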
index 24e496d6bdcdba9036dbc67d5f118aa9d10f431a..d06b80f58f83d162ffdeb757a80e5b047fe13911 100644 (file)
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                           loff_t pos, unsigned len, unsigned copied,
                           struct page *page, void *fsdata);
 
-int ocfs2_write_begin_nolock(struct file *filp,
-                            struct address_space *mapping,
-                            loff_t pos, unsigned len, unsigned flags,
+typedef enum {
+       OCFS2_WRITE_BUFFER = 0,
+       OCFS2_WRITE_DIRECT,
+       OCFS2_WRITE_MMAP,
+} ocfs2_write_type_t;
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+                            loff_t pos, unsigned len, ocfs2_write_type_t type,
                             struct page **pagep, void **fsdata,
                             struct buffer_head *di_bh, struct page *mmap_page);
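Editor's note (not part of the patch): a short illustration of how each ocfs2_write_type_t value is chosen, collected from the call sites elsewhere in this commit.

/*
 *   ocfs2_write_begin()      buffered write  -> OCFS2_WRITE_BUFFER
 *   __ocfs2_page_mkwrite()   mmap fault      -> OCFS2_WRITE_MMAP
 *   ocfs2_dio_get_block()    direct io       -> OCFS2_WRITE_DIRECT
 *
 * e.g. the buffered path:
 *   ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
 *                                  pagep, fsdata, di_bh, NULL);
 */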
 
index e404386bd93e8535a32152fcc5936ede9ac5fa41..709fbbd44c65366ce1e31aebce0904b5966d53a6 100644 (file)
@@ -219,7 +219,8 @@ struct o2hb_region {
        unsigned                hr_unclean_stop:1,
                                hr_aborted_start:1,
                                hr_item_pinned:1,
-                               hr_item_dropped:1;
+                               hr_item_dropped:1,
+                               hr_node_deleted:1;
 
        /* protected by the hr_callback_sem */
        struct task_struct      *hr_task;
@@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data)
        set_user_nice(current, MIN_NICE);
 
        /* Pin node */
-       o2nm_depend_this_node();
+       ret = o2nm_depend_this_node();
+       if (ret) {
+               mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+               reg->hr_node_deleted = 1;
+               wake_up(&o2hb_steady_queue);
+               return 0;
+       }
 
        while (!kthread_should_stop() &&
               !reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@ -1789,7 +1796,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
        spin_unlock(&o2hb_live_lock);
 
        ret = wait_event_interruptible(o2hb_steady_queue,
-                               atomic_read(&reg->hr_steady_iterations) == 0);
+                               atomic_read(&reg->hr_steady_iterations) == 0 ||
+                               reg->hr_node_deleted);
        if (ret) {
                atomic_set(&reg->hr_steady_iterations, 0);
                reg->hr_aborted_start = 1;
@@ -1800,6 +1808,11 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
                goto out3;
        }
 
+       if (reg->hr_node_deleted) {
+               ret = -EINVAL;
+               goto out3;
+       }
+
        /* Ok, we were woken.  Make sure it wasn't by drop_item() */
        spin_lock(&o2hb_live_lock);
        hb_task = reg->hr_task;
index e36d63ff17830bf321a1809d1b81a87827290037..f90931335c6b28af89cc0537b8d3fec48a089300 100644 (file)
@@ -262,6 +262,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
                                  struct dlm_lock *lock, int flags, int type)
 {
        enum dlm_status status;
+       u8 old_owner = res->owner;
 
        mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
             lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +288,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
                status = DLM_DENIED;
                goto bail;
        }
+
+       if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+               mlog(0, "last convert request returned DLM_RECOVERING, but "
+                    "owner has already queued and sent ast to me. res %.*s, "
+                    "(cookie=%u:%llu, type=%d, conv=%d)\n",
+                    res->lockname.len, res->lockname.name,
+                    dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                    dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+                    lock->ml.type, lock->ml.convert_type);
+               status = DLM_NORMAL;
+               goto bail;
+       }
+
        res->state |= DLM_LOCK_RES_IN_PROGRESS;
        /* move lock to local convert queue */
        /* do not alter lock refcount.  switching lists. */
@@ -316,11 +330,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
        spin_lock(&res->spinlock);
        res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
        lock->convert_pending = 0;
-       /* if it failed, move it back to granted queue */
+       /* if it failed, move it back to granted queue.
+        * if the master returned DLM_NORMAL and then went down before sending
+        * the ast, the lock may have already been moved to the granted queue;
+        * reset to DLM_RECOVERING and retry the convert */
        if (status != DLM_NORMAL) {
                if (status != DLM_NOTQUEUED)
                        dlm_error(status);
                dlm_revert_pending_convert(res, lock);
+       } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+                       (old_owner != res->owner)) {
+               mlog(0, "res %.*s is in recovering or has been recovered.\n",
+                               res->lockname.len, res->lockname.name);
+               status = DLM_RECOVERING;
        }
 bail:
        spin_unlock(&res->spinlock);
index 58eaa5c0d387051301a089f0d234d4b69bf18d51..a43f9efef903458af777fd7c22b4eb87b3bff62a 100644 (file)
@@ -2064,7 +2064,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
                        dlm_lock_get(lock);
                        if (lock->convert_pending) {
                                /* move converting lock back to granted */
-                               BUG_ON(i != DLM_CONVERTING_LIST);
                                mlog(0, "node died with convert pending "
                                     "on %.*s. move back to granted list.\n",
                                     res->lockname.len, res->lockname.name);
index 0e5b4515f92e7a875a6f396db0d5648c9c157ad1..05346fb8d5fd0ee51296dd8681bb563d0c99f156 100644 (file)
@@ -1373,44 +1373,6 @@ out:
        return ret;
 }
 
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
-                                      size_t count)
-{
-       int ret = 0;
-       unsigned int extent_flags;
-       u32 cpos, clusters, extent_len, phys_cpos;
-       struct super_block *sb = inode->i_sb;
-
-       cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
-       clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
-       while (clusters) {
-               ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
-                                        &extent_flags);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto out;
-               }
-
-               if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
-                       ret = 1;
-                       break;
-               }
-
-               if (extent_len > clusters)
-                       extent_len = clusters;
-
-               clusters -= extent_len;
-               cpos += extent_len;
-       }
-out:
-       return ret;
-}
-
 static int ocfs2_write_remove_suid(struct inode *inode)
 {
        int ret;
@@ -2121,18 +2083,12 @@ out:
 
 static int ocfs2_prepare_inode_for_write(struct file *file,
                                         loff_t pos,
-                                        size_t count,
-                                        int appending,
-                                        int *direct_io,
-                                        int *has_refcount)
+                                        size_t count)
 {
        int ret = 0, meta_level = 0;
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = d_inode(dentry);
        loff_t end;
-       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       int full_coherency = !(osb->s_mount_opt &
-               OCFS2_MOUNT_COHERENCY_BUFFERED);
 
        /*
         * We start with a read level meta lock and only jump to an ex
@@ -2181,10 +2137,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                                                               pos,
                                                               count,
                                                               &meta_level);
-                       if (has_refcount)
-                               *has_refcount = 1;
-                       if (direct_io)
-                               *direct_io = 0;
                }
 
                if (ret < 0) {
@@ -2192,67 +2144,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                        goto out_unlock;
                }
 
-               /*
-                * Skip the O_DIRECT checks if we don't need
-                * them.
-                */
-               if (!direct_io || !(*direct_io))
-                       break;
-
-               /*
-                * There's no sane way to do direct writes to an inode
-                * with inline data.
-                */
-               if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-                       *direct_io = 0;
-                       break;
-               }
-
-               /*
-                * Allowing concurrent direct writes means
-                * i_size changes wouldn't be synchronized, so
-                * one node could wind up truncating another
-                * nodes writes.
-                */
-               if (end > i_size_read(inode) && !full_coherency) {
-                       *direct_io = 0;
-                       break;
-               }
-
-               /*
-                * Fallback to old way if the feature bit is not set.
-                */
-               if (end > i_size_read(inode) &&
-                               !ocfs2_supports_append_dio(osb)) {
-                       *direct_io = 0;
-                       break;
-               }
-
-               /*
-                * We don't fill holes during direct io, so
-                * check for them here. If any are found, the
-                * caller will have to retake some cluster
-                * locks and initiate the io as buffered.
-                */
-               ret = ocfs2_check_range_for_holes(inode, pos, count);
-               if (ret == 1) {
-                       /*
-                        * Fallback to old way if the feature bit is not set.
-                        * Otherwise try dio first and then complete the rest
-                        * request through buffer io.
-                        */
-                       if (!ocfs2_supports_append_dio(osb))
-                               *direct_io = 0;
-                       ret = 0;
-               } else if (ret < 0)
-                       mlog_errno(ret);
                break;
        }
 
 out_unlock:
        trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
-                                           pos, appending, count,
-                                           direct_io, has_refcount);
+                                           pos, count);
 
        if (meta_level >= 0)
                ocfs2_inode_unlock(inode, meta_level);
@@ -2264,18 +2161,16 @@ out:
 static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                                    struct iov_iter *from)
 {
-       int direct_io, appending, rw_level;
-       int can_do_direct, has_refcount = 0;
+       int direct_io, rw_level;
        ssize_t written = 0;
        ssize_t ret;
-       size_t count = iov_iter_count(from), orig_count;
+       size_t count = iov_iter_count(from);
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int full_coherency = !(osb->s_mount_opt &
                               OCFS2_MOUNT_COHERENCY_BUFFERED);
        int unaligned_dio = 0;
-       int dropped_dio = 0;
        int append_write = ((iocb->ki_pos + count) >=
                        i_size_read(inode) ? 1 : 0);
 
@@ -2288,12 +2183,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
        if (count == 0)
                return 0;
 
-       appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
        direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
 
        mutex_lock(&inode->i_mutex);
 
-relock:
        /*
         * Concurrent O_DIRECT writes are allowed with
         * mount_option "coherency=buffered".
@@ -2326,7 +2219,6 @@ relock:
                ocfs2_inode_unlock(inode, 1);
        }
 
-       orig_count = iov_iter_count(from);
        ret = generic_write_checks(iocb, from);
        if (ret <= 0) {
                if (ret)
@@ -2335,9 +2227,7 @@ relock:
        }
        count = ret;
 
-       can_do_direct = direct_io;
-       ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
-                                           &can_do_direct, &has_refcount);
+       ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -2346,22 +2236,6 @@ relock:
        if (direct_io && !is_sync_kiocb(iocb))
                unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
 
-       /*
-        * We can't complete the direct I/O as requested, fall back to
-        * buffered I/O.
-        */
-       if (direct_io && !can_do_direct) {
-               ocfs2_rw_unlock(inode, rw_level);
-
-               rw_level = -1;
-
-               direct_io = 0;
-               iocb->ki_flags &= ~IOCB_DIRECT;
-               iov_iter_reexpand(from, orig_count);
-               dropped_dio = 1;
-               goto relock;
-       }
-
        if (unaligned_dio) {
                /*
                 * Wait on previous unaligned aio to complete before
@@ -2397,7 +2271,7 @@ relock:
                goto no_sync;
 
        if (((file->f_flags & O_DSYNC) && !direct_io) ||
-           IS_SYNC(inode) || dropped_dio) {
+           IS_SYNC(inode)) {
                ret = filemap_fdatawrite_range(file->f_mapping,
                                               iocb->ki_pos - written,
                                               iocb->ki_pos - 1);
index 8f87e05ee25d3824524c7f6e040a5f43d87c723d..0fd9ebdd3ed85cf8c754e7f110a29658eb7daa9f 100644 (file)
@@ -1125,6 +1125,9 @@ static void ocfs2_clear_inode(struct inode *inode)
        mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
                        "Clear inode of %llu, inode has io markers\n",
                        (unsigned long long)oi->ip_blkno);
+       mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+                       "Clear inode of %llu, inode has unwritten extents\n",
+                       (unsigned long long)oi->ip_blkno);
 
        ocfs2_extent_map_trunc(inode, 0);
 
index ca3431ee7f2493fb999cdb6ced2908bdb87fbe17..0c22ddd4b0dd8de3f9492dff2b1dc7eed6406159 100644 (file)
@@ -57,6 +57,9 @@ struct ocfs2_inode_info
        u32                             ip_flags; /* see below */
        u32                             ip_attr; /* inode attributes */
 
+       /* Record unwritten extents during direct io. */
+       struct list_head                ip_unwritten_list;
+
        /* protected by recovery_lock. */
        struct inode                    *ip_next_orphan;
 
@@ -112,6 +115,8 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_OPEN_DIRECT                0x00000020
 /* Tell the inode wipe code it's not in orphan dir */
 #define OCFS2_INODE_SKIP_ORPHAN_DIR     0x00000040
+/* Entry in orphan dir with 'dio-' prefix */
+#define OCFS2_INODE_DIO_ORPHAN_ENTRY   0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
index ff82b28462a65c85cf6bff850856a77859ad987e..b414add66d488c100e150312c9a6dbe1f5dfd8cb 100644 (file)
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
        /* At this point, we know that no more recovery threads can be
         * launched, so wait for any recovery completion work to
         * complete. */
-       flush_workqueue(ocfs2_wq);
+       flush_workqueue(osb->ocfs2_wq);
 
        /*
         * Now that recovery is shut down, and the osb is about to be
@@ -1327,7 +1327,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 
        spin_lock(&journal->j_lock);
        list_add_tail(&item->lri_list, &journal->j_la_cleanups);
-       queue_work(ocfs2_wq, &journal->j_recovery_work);
+       queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
        spin_unlock(&journal->j_lock);
 }
 
@@ -1972,7 +1972,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
        mutex_lock(&os->os_lock);
        ocfs2_queue_orphan_scan(osb);
        if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
-               queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+               queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
                                      ocfs2_orphan_scan_timeout());
        mutex_unlock(&os->os_lock);
 }
@@ -2012,7 +2012,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
                atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
        else {
                atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
-               queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+               queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
                                   ocfs2_orphan_scan_timeout());
        }
 }
@@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv {
        struct dir_context      ctx;
        struct inode            *head;
        struct ocfs2_super      *osb;
+       enum ocfs2_orphan_reco_type orphan_reco_type;
 };
 
 static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
@@ -2036,12 +2037,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
        if (name_len == 2 && !strncmp("..", name, 2))
                return 0;
 
+       /* do not include dio entries when doing an orphan scan */
+       if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
+                       (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+                       OCFS2_DIO_ORPHAN_PREFIX_LEN)))
+               return 0;
+
        /* Skip bad inodes so that recovery can continue */
        iter = ocfs2_iget(p->osb, ino,
                          OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
        if (IS_ERR(iter))
                return 0;
 
+       if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+                       OCFS2_DIO_ORPHAN_PREFIX_LEN))
+               OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
+
        /* Skip inodes which are already added to recover list, since dio may
         * happen concurrently with unlink/rename */
        if (OCFS2_I(iter)->ip_next_orphan) {
@@ -2060,14 +2071,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
 
 static int ocfs2_queue_orphans(struct ocfs2_super *osb,
                               int slot,
-                              struct inode **head)
+                              struct inode **head,
+                              enum ocfs2_orphan_reco_type orphan_reco_type)
 {
        int status;
        struct inode *orphan_dir_inode = NULL;
        struct ocfs2_orphan_filldir_priv priv = {
                .ctx.actor = ocfs2_orphan_filldir,
                .osb = osb,
-               .head = *head
+               .head = *head,
+               .orphan_reco_type = orphan_reco_type
        };
 
        orphan_dir_inode = ocfs2_get_system_file_inode(osb,
@@ -2170,7 +2183,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
        trace_ocfs2_recover_orphans(slot);
 
        ocfs2_mark_recovering_orphan_dir(osb, slot);
-       ret = ocfs2_queue_orphans(osb, slot, &inode);
+       ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
        ocfs2_clear_recovering_orphan_dir(osb, slot);
 
        /* Error here should be noted, but we want to continue with as
@@ -2186,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                iter = oi->ip_next_orphan;
                oi->ip_next_orphan = NULL;
 
-               mutex_lock(&inode->i_mutex);
-               ret = ocfs2_rw_lock(inode, 1);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       goto next;
-               }
-               /*
-                * We need to take and drop the inode lock to
-                * force read inode from disk.
-                */
-               ret = ocfs2_inode_lock(inode, &di_bh, 1);
-               if (ret) {
-                       mlog_errno(ret);
-                       goto unlock_rw;
-               }
+               if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
+                       mutex_lock(&inode->i_mutex);
+                       ret = ocfs2_rw_lock(inode, 1);
+                       if (ret < 0) {
+                               mlog_errno(ret);
+                               goto unlock_mutex;
+                       }
+                       /*
+                        * We need to take and drop the inode lock to
+                        * force read inode from disk.
+                        */
+                       ret = ocfs2_inode_lock(inode, &di_bh, 1);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto unlock_rw;
+                       }
 
-               di = (struct ocfs2_dinode *)di_bh->b_data;
+                       di = (struct ocfs2_dinode *)di_bh->b_data;
 
-               if (inode->i_nlink == 0) {
+                       if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
+                               ret = ocfs2_truncate_file(inode, di_bh,
+                                               i_size_read(inode));
+                               if (ret < 0) {
+                                       if (ret != -ENOSPC)
+                                               mlog_errno(ret);
+                                       goto unlock_inode;
+                               }
+
+                               ret = ocfs2_del_inode_from_orphan(osb, inode,
+                                               di_bh, 0, 0);
+                               if (ret)
+                                       mlog_errno(ret);
+                       }
+unlock_inode:
+                       ocfs2_inode_unlock(inode, 1);
+                       brelse(di_bh);
+                       di_bh = NULL;
+unlock_rw:
+                       ocfs2_rw_unlock(inode, 1);
+unlock_mutex:
+                       mutex_unlock(&inode->i_mutex);
+
+                       /* clear dio flag in ocfs2_inode_info */
+                       oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
+               } else {
                        spin_lock(&oi->ip_lock);
                        /* Set the proper information to get us going into
                         * ocfs2_delete_inode. */
@@ -2212,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                        spin_unlock(&oi->ip_lock);
                }
 
-               if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
-                               (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
-                       ret = ocfs2_truncate_file(inode, di_bh,
-                                       i_size_read(inode));
-                       if (ret < 0) {
-                               if (ret != -ENOSPC)
-                                       mlog_errno(ret);
-                               goto unlock_inode;
-                       }
-
-                       ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
-                       if (ret)
-                               mlog_errno(ret);
-               } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-unlock_inode:
-               ocfs2_inode_unlock(inode, 1);
-               brelse(di_bh);
-               di_bh = NULL;
-unlock_rw:
-               ocfs2_rw_unlock(inode, 1);
-next:
-               mutex_unlock(&inode->i_mutex);
                iput(inode);
                inode = iter;
        }
index 0a4457fb0711b7c9f65cc3da788d482342d0a69d..3e193127326882b9abc0806b585a56158eebf9a5 100644 (file)
@@ -387,7 +387,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
        struct ocfs2_dinode *alloc = NULL;
 
        cancel_delayed_work(&osb->la_enable_wq);
-       flush_workqueue(ocfs2_wq);
+       flush_workqueue(osb->ocfs2_wq);
 
        if (osb->local_alloc_state == OCFS2_LA_UNUSED)
                goto out;
@@ -1087,7 +1087,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
                } else {
                        osb->local_alloc_state = OCFS2_LA_DISABLED;
                }
-               queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+               queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
                                   OCFS2_LA_ENABLE_INTERVAL);
                goto out_unlock;
        }
index 9581d190f6e12346e70226c93458df0a1791abe0..a88707a0f4da57b1309876837e9bf4470ee53dea 100644 (file)
@@ -104,8 +104,8 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
        if (page->index == last_index)
                len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
 
-       ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
-                                      &fsdata, di_bh, page);
+       ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+                                      &locked_page, &fsdata, di_bh, page);
        if (ret) {
                if (ret != -ENOSPC)
                        mlog_errno(ret);
index b7dfac226b1e2dc517877c9402832abf4cba3530..3b48ac25d8a7c450ac23360b5f7d28e9fafcc263 100644 (file)
@@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
 /* An orphan dir name is an 8 byte value, printed as a hex string */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
-#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
-#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
 
 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
                                   unsigned int flags)
@@ -657,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                return status;
        }
 
-       return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+       status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
                                    parent_fe_bh, handle, inode_ac,
                                    fe_blkno, suballoc_loc, suballoc_bit);
+       if (status < 0) {
+               u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
+               int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
+                               inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
+               if (tmp)
+                       mlog_errno(tmp);
+       }
+
+       return status;
 }
 
 static int ocfs2_mkdir(struct inode *dir,
index e173329eb83057ed6ce74c769c3dd3f769e08b24..1155918d6784fe59dcdefa948b2db0d5f53e11f3 100644 (file)
@@ -26,6 +26,9 @@
 #ifndef OCFS2_NAMEI_H
 #define OCFS2_NAMEI_H
 
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
+
 extern const struct inode_operations ocfs2_dir_iops;
 
 struct dentry *ocfs2_get_parent(struct dentry *child);
index 7a0126267847664e7a61d803278a2a15457ae37c..6cf6538a065160ec815d3c92549029dc14698265 100644 (file)
@@ -464,6 +464,14 @@ struct ocfs2_super
        struct ocfs2_refcount_tree *osb_ref_tree_lru;
 
        struct mutex system_file_mutex;
+
+       /*
+        * OCFS2 needs to schedule several different types of work which
+        * require cluster locking, disk I/O, recovery waits, etc. Since these
+        * types of work tend to be heavy we avoid using the kernel events
+        * workqueue and schedule on our own.
+        */
+       struct workqueue_struct *ocfs2_wq;
 };
 
 #define OCFS2_SB(sb)       ((struct ocfs2_super *)(sb)->s_fs_info)
index 6cb019b7c6a83c4ec449baff12652e84b5fca06a..09d0c89a9daf963df206e45fc48cf8339d85064b 100644 (file)
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
 
 TRACE_EVENT(ocfs2_prepare_inode_for_write,
        TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
-                int appending, unsigned long count,
-                int *direct_io, int *has_refcount),
-       TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+                unsigned long count),
+       TP_ARGS(ino, saved_pos, count),
        TP_STRUCT__entry(
                __field(unsigned long long, ino)
                __field(unsigned long long, saved_pos)
-               __field(int, appending)
                __field(unsigned long, count)
-               __field(int, direct_io)
-               __field(int, has_refcount)
        ),
        TP_fast_assign(
                __entry->ino = ino;
                __entry->saved_pos = saved_pos;
-               __entry->appending = appending;
                __entry->count = count;
-               __entry->direct_io = direct_io ? *direct_io : -1;
-               __entry->has_refcount = has_refcount ? *has_refcount : -1;
        ),
-       TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
-                 __entry->saved_pos, __entry->appending, __entry->count,
-                 __entry->direct_io, __entry->has_refcount)
+       TP_printk("%llu %llu %lu", __entry->ino,
+                 __entry->saved_pos, __entry->count)
 );
 
 DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
index c93d6722088753901c258151de82dc707e6eb71e..44df24b62feffd3aec42ea5cc8293c3e46d86aee 100644 (file)
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
                dqgrab(dquot);
                /* First entry on list -> queue work */
                if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
-                       queue_work(ocfs2_wq, &osb->dquot_drop_work);
+                       queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
                goto out;
        }
        status = ocfs2_lock_global_qf(oinfo, 1);
index d5da6f624142812ab63e83f5ab1b90e05151fed3..10ad87ba02e03e2e7548c9e9c39ca548b700da92 100644 (file)
@@ -187,7 +187,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
        for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
                blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
                cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
-               if (cluster > clusters)
+               if (cluster >= clusters)
                        break;
 
                ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
index d83d2602cf2b0aa8c7eee8a83f159ed54ccdb0c1..fc6d25f6d4442c79dd376da24f18c3e3355232ed 100644 (file)
@@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
        status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
                                    res, &bits_left);
        if (!status) {
-               hint = ocfs2_group_from_res(res);
+               if (ocfs2_is_cluster_bitmap(ac->ac_inode))
+                       hint = res->sr_bg_blkno;
+               else
+                       hint = ocfs2_group_from_res(res);
                goto set_hint;
        }
        if (status < 0 && status != -ENOSPC) {
index 2de4c8a9340c267a16381faacbd0ce66c18d123f..427248de9da59d32600e995079b654f9f3fb2af9 100644 (file)
@@ -79,12 +79,6 @@ static struct kmem_cache *ocfs2_inode_cachep;
 struct kmem_cache *ocfs2_dquot_cachep;
 struct kmem_cache *ocfs2_qf_chunk_cachep;
 
-/* OCFS2 needs to schedule several different types of work which
- * require cluster locking, disk I/O, recovery waits, etc. Since these
- * types of work tend to be heavy we avoid using the kernel events
- * workqueue and schedule on our own. */
-struct workqueue_struct *ocfs2_wq = NULL;
-
 static struct dentry *ocfs2_debugfs_root;
 
 MODULE_AUTHOR("Oracle");
@@ -1612,33 +1606,25 @@ static int __init ocfs2_init(void)
        if (status < 0)
                goto out2;
 
-       ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
-       if (!ocfs2_wq) {
-               status = -ENOMEM;
-               goto out3;
-       }
-
        ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
        if (!ocfs2_debugfs_root) {
                status = -ENOMEM;
                mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
-               goto out4;
+               goto out3;
        }
 
        ocfs2_set_locking_protocol();
 
        status = register_quota_format(&ocfs2_quota_format);
        if (status < 0)
-               goto out4;
+               goto out3;
        status = register_filesystem(&ocfs2_fs_type);
        if (!status)
                return 0;
 
        unregister_quota_format(&ocfs2_quota_format);
-out4:
-       destroy_workqueue(ocfs2_wq);
-       debugfs_remove(ocfs2_debugfs_root);
 out3:
+       debugfs_remove(ocfs2_debugfs_root);
        ocfs2_free_mem_caches();
 out2:
        exit_ocfs2_uptodate_cache();
@@ -1649,11 +1635,6 @@ out1:
 
 static void __exit ocfs2_exit(void)
 {
-       if (ocfs2_wq) {
-               flush_workqueue(ocfs2_wq);
-               destroy_workqueue(ocfs2_wq);
-       }
-
        unregister_quota_format(&ocfs2_quota_format);
 
        debugfs_remove(ocfs2_debugfs_root);
@@ -1744,6 +1725,7 @@ static void ocfs2_inode_init_once(void *data)
        spin_lock_init(&oi->ip_lock);
        ocfs2_extent_map_init(&oi->vfs_inode);
        INIT_LIST_HEAD(&oi->ip_io_markers);
+       INIT_LIST_HEAD(&oi->ip_unwritten_list);
        oi->ip_dir_start_lookup = 0;
        mutex_init(&oi->ip_unaligned_aio);
        init_rwsem(&oi->ip_alloc_sem);
@@ -2348,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
        }
        cleancache_init_shared_fs(sb);
 
+       osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+       if (!osb->ocfs2_wq) {
+               status = -ENOMEM;
+               mlog_errno(status);
+       }
+
 bail:
        return status;
 }
@@ -2535,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 {
        /* This function assumes that the caller has the main osb resource */
 
+       /* ocfs2_initialize_super() has already created this workqueue */
+       if (osb->ocfs2_wq) {
+               flush_workqueue(osb->ocfs2_wq);
+               destroy_workqueue(osb->ocfs2_wq);
+       }
+
        ocfs2_free_slot_info(osb);
 
        kfree(osb->osb_orphan_wipes);
index b477d0b1c7b6ce4caaf06a388f9044983e424a2b..b023e4f3d740b758138e2603f5b81f3e7a3c1672 100644 (file)
@@ -26,8 +26,6 @@
 #ifndef OCFS2_SUPER_H
 #define OCFS2_SUPER_H
 
-extern struct workqueue_struct *ocfs2_wq;
-
 int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
                                  int node_num);
 
index eed2050db9be9c7795acd2153f976d4742e2fe82..d73291f5f0fcbfb0cd2cff2bb1b628a72f754f6e 100644 (file)
 static inline void task_name(struct seq_file *m, struct task_struct *p)
 {
        char *buf;
+       size_t size;
        char tcomm[sizeof(p->comm)];
+       int ret;
 
        get_task_comm(tcomm, p);
 
        seq_puts(m, "Name:\t");
-       buf = m->buf + m->count;
 
-       /* Ignore error for now */
-       buf += string_escape_str(tcomm, buf, m->size - m->count,
-                                ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+       size = seq_get_buf(m, &buf);
+       ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+       seq_commit(m, ret < size ? ret : -1);
 
-       m->count = buf - m->buf;
        seq_putc(m, '\n');
 }
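
The task_name() rewrite above (and the matching seq_escape()/seq_hex_dump() conversions in fs/seq_file.c further down) follows one pattern: borrow the remaining output space with seq_get_buf(), let a formatting helper fill it, then hand the byte count back through seq_commit(), passing -1 on truncation so seq_read() retries with a larger buffer. A minimal sketch of that pattern, assuming a hypothetical show helper:

#include <linux/seq_file.h>
#include <linux/string_helpers.h>

/* Sketch only: emit an escaped string into a seq_file buffer. */
static void my_seq_puts_escaped(struct seq_file *m, const char *s)
{
	char *buf;
	size_t size = seq_get_buf(m, &buf);	/* space left in m->buf */
	int ret;

	ret = string_escape_str(s, buf, size,
				ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
	/* ret >= size means truncation: mark the buffer overflowed */
	seq_commit(m, ret < size ? ret : -1);
	seq_putc(m, '\n');
}
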
 
index 93484034a03d04c38cc5ff7779fb95e7611fbd09..b2855eea54050655818a424b850bc4d94d5d8f47 100644 (file)
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
         * pseudo flags for the well known (anonymous) memory mapped pages
         *
         * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
-        * simple test in page_mapped() is not enough.
+        * simple test in page_mapcount() is not enough.
         */
-       if (!PageSlab(page) && page_mapped(page))
+       if (!PageSlab(page) && page_mapcount(page))
                u |= 1 << KPF_MMAP;
        if (PageAnon(page))
                u |= 1 << KPF_ANON;
index b029d426c55892544afcd3bf2b8a5965f6e0e5ee..9ca699b05e78906167519fa17ccb3acdbde510ec 100644 (file)
@@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                ptes >> 10,
                pmds >> 10,
                swap << (PAGE_SHIFT-10));
+       hugetlb_report_usage(m, mm);
 }
 
 unsigned long task_vsize(struct mm_struct *mm)
@@ -446,14 +447,17 @@ struct mem_size_stats {
        unsigned long anonymous;
        unsigned long anonymous_thp;
        unsigned long swap;
+       unsigned long shared_hugetlb;
+       unsigned long private_hugetlb;
        u64 pss;
        u64 swap_pss;
 };
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
-               unsigned long size, bool young, bool dirty)
+               bool compound, bool young, bool dirty)
 {
-       int mapcount;
+       int i, nr = compound ? HPAGE_PMD_NR : 1;
+       unsigned long size = nr * PAGE_SIZE;
 
        if (PageAnon(page))
                mss->anonymous += size;
@@ -462,23 +466,37 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
        /* Accumulate the size in pages that have been accessed. */
        if (young || page_is_young(page) || PageReferenced(page))
                mss->referenced += size;
-       mapcount = page_mapcount(page);
-       if (mapcount >= 2) {
-               u64 pss_delta;
 
-               if (dirty || PageDirty(page))
-                       mss->shared_dirty += size;
-               else
-                       mss->shared_clean += size;
-               pss_delta = (u64)size << PSS_SHIFT;
-               do_div(pss_delta, mapcount);
-               mss->pss += pss_delta;
-       } else {
+       /*
+        * page_count(page) == 1 guarantees the page is mapped exactly once.
+        * If any subpage of the compound page is mapped with a PTE, it would
+        * elevate page_count().
+        */
+       if (page_count(page) == 1) {
                if (dirty || PageDirty(page))
                        mss->private_dirty += size;
                else
                        mss->private_clean += size;
                mss->pss += (u64)size << PSS_SHIFT;
+               return;
+       }
+
+       for (i = 0; i < nr; i++, page++) {
+               int mapcount = page_mapcount(page);
+
+               if (mapcount >= 2) {
+                       if (dirty || PageDirty(page))
+                               mss->shared_dirty += PAGE_SIZE;
+                       else
+                               mss->shared_clean += PAGE_SIZE;
+                       mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+               } else {
+                       if (dirty || PageDirty(page))
+                               mss->private_dirty += PAGE_SIZE;
+                       else
+                               mss->private_clean += PAGE_SIZE;
+                       mss->pss += PAGE_SIZE << PSS_SHIFT;
+               }
        }
 }
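
For a sense of the new per-subpage accounting: consider a 2 MB THP (512 subpages) whose page_count() is elevated because 256 of its subpages are also PTE-mapped by a second process (illustrative numbers, not from the patch). The loop above charges those 256 subpages at PAGE_SIZE/2 each and the other 256 at full PAGE_SIZE, so PSS = 256 * 2 KB + 256 * 4 KB = 1536 KB, rather than dividing the whole huge page by a single head-page mapcount.
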
 
@@ -513,7 +531,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 
        if (!page)
                return;
-       smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+       smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -529,8 +548,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
        if (IS_ERR_OR_NULL(page))
                return;
        mss->anonymous_thp += HPAGE_PMD_SIZE;
-       smaps_account(mss, page, HPAGE_PMD_SIZE,
-                       pmd_young(*pmd), pmd_dirty(*pmd));
+       smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -546,7 +564,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                smaps_pmd_entry(pmd, addr, walk);
                spin_unlock(ptl);
                return 0;
@@ -625,12 +643,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
        seq_putc(m, '\n');
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+                                unsigned long addr, unsigned long end,
+                                struct mm_walk *walk)
+{
+       struct mem_size_stats *mss = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       struct page *page = NULL;
+
+       if (pte_present(*pte)) {
+               page = vm_normal_page(vma, addr, *pte);
+       } else if (is_swap_pte(*pte)) {
+               swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+               if (is_migration_entry(swpent))
+                       page = migration_entry_to_page(swpent);
+       }
+       if (page) {
+               int mapcount = page_mapcount(page);
+
+               if (mapcount >= 2)
+                       mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+               else
+                       mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+       }
+       return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
        struct vm_area_struct *vma = v;
        struct mem_size_stats mss;
        struct mm_walk smaps_walk = {
                .pmd_entry = smaps_pte_range,
+#ifdef CONFIG_HUGETLB_PAGE
+               .hugetlb_entry = smaps_hugetlb_range,
+#endif
                .mm = vma->vm_mm,
                .private = &mss,
        };
@@ -652,6 +702,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
                   "Referenced:     %8lu kB\n"
                   "Anonymous:      %8lu kB\n"
                   "AnonHugePages:  %8lu kB\n"
+                  "Shared_Hugetlb: %8lu kB\n"
+                  "Private_Hugetlb: %8lu kB\n"
                   "Swap:           %8lu kB\n"
                   "SwapPss:        %8lu kB\n"
                   "KernelPageSize: %8lu kB\n"
@@ -667,6 +719,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
                   mss.referenced >> 10,
                   mss.anonymous >> 10,
                   mss.anonymous_thp >> 10,
+                  mss.shared_hugetlb >> 10,
+                  mss.private_hugetlb >> 10,
                   mss.swap >> 10,
                   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
                   vma_kernel_pagesize(vma) >> 10,
@@ -753,19 +807,27 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
        pte_t ptent = *pte;
 
        if (pte_present(ptent)) {
+               ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
                ptent = pte_wrprotect(ptent);
                ptent = pte_clear_soft_dirty(ptent);
+               ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
        } else if (is_swap_pte(ptent)) {
                ptent = pte_swp_clear_soft_dirty(ptent);
+               set_pte_at(vma->vm_mm, addr, pte, ptent);
        }
-
-       set_pte_at(vma->vm_mm, addr, pte, ptent);
 }
+#else
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+               unsigned long addr, pte_t *pte)
+{
+}
+#endif
 
+#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmdp)
 {
-       pmd_t pmd = *pmdp;
+       pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
 
        pmd = pmd_wrprotect(pmd);
        pmd = pmd_clear_soft_dirty(pmd);
@@ -775,14 +837,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 
        set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 }
-
 #else
-
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
-               unsigned long addr, pte_t *pte)
-{
-}
-
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmdp)
 {
@@ -798,7 +853,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
        spinlock_t *ptl;
        struct page *page;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
                        clear_soft_dirty_pmd(vma, addr, pmd);
                        goto out;
@@ -1072,7 +1127,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
        int err = 0;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
                u64 flags = 0, frame = 0;
                pmd_t pmd = *pmdp;
 
@@ -1404,7 +1459,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
        pte_t *orig_pte;
        pte_t *pte;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                pte_t huge_pte = *(pte_t *)pmd;
                struct page *page;
 
index 225586e141cac6e21a35b75e74201355f7f3f6d1..e85664b7c7d963522fd7efc938a3d96a27edc651 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/cred.h>
 #include <linux/mm.h>
 #include <linux/printk.h>
+#include <linux/string_helpers.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -25,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m)
 static void *seq_buf_alloc(unsigned long size)
 {
        void *buf;
+       gfp_t gfp = GFP_KERNEL;
 
        /*
-        * __GFP_NORETRY to avoid oom-killings with high-order allocations -
-        * it's better to fall back to vmalloc() than to kill things.
+        * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
+        * it's better to fall back to vmalloc() than to kill things.  For small
+        * allocations, just use GFP_KERNEL which will oom kill, thus no need
+        * for vmalloc fallback.
         */
-       buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+       if (size > PAGE_SIZE)
+               gfp |= __GFP_NORETRY | __GFP_NOWARN;
+       buf = kmalloc(size, gfp);
        if (!buf && size > PAGE_SIZE)
                buf = vmalloc(size);
        return buf;
@@ -377,26 +383,12 @@ EXPORT_SYMBOL(seq_release);
  */
 void seq_escape(struct seq_file *m, const char *s, const char *esc)
 {
-       char *end = m->buf + m->size;
-       char *p;
-       char c;
+       char *buf;
+       size_t size = seq_get_buf(m, &buf);
+       int ret;
 
-       for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
-               if (!strchr(esc, c)) {
-                       *p++ = c;
-                       continue;
-               }
-               if (p + 3 < end) {
-                       *p++ = '\\';
-                       *p++ = '0' + ((c & 0300) >> 6);
-                       *p++ = '0' + ((c & 070) >> 3);
-                       *p++ = '0' + (c & 07);
-                       continue;
-               }
-               seq_set_overflow(m);
-               return;
-       }
-       m->count = p - m->buf;
+       ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
+       seq_commit(m, ret < size ? ret : -1);
 }
 EXPORT_SYMBOL(seq_escape);
 
@@ -773,6 +765,8 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
 {
        const u8 *ptr = buf;
        int i, linelen, remaining = len;
+       char *buffer;
+       size_t size;
        int ret;
 
        if (rowsize != 16 && rowsize != 32)
@@ -794,15 +788,12 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
                        break;
                }
 
+               size = seq_get_buf(m, &buffer);
                ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
-                                        m->buf + m->count, m->size - m->count,
-                                        ascii);
-               if (ret >= m->size - m->count) {
-                       seq_set_overflow(m);
-               } else {
-                       m->count += ret;
-                       seq_putc(m, '\n');
-               }
+                                        buffer, size, ascii);
+               seq_commit(m, ret < size ? ret : -1);
+
+               seq_putc(m, '\n');
        }
 }
 EXPORT_SYMBOL(seq_hex_dump);
index 5fc1e50a7f30c4258c018f560709c54fda005b40..801c21cd77fe4f1d344aed6fa7b28369972d82b2 100644 (file)
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                                break;
 
                        error = add_to_page_cache_lru(page, mapping, index,
-                                       GFP_KERNEL & mapping_gfp_mask(mapping));
+                                  mapping_gfp_constraint(mapping, GFP_KERNEL));
                        if (unlikely(error)) {
                                page_cache_release(page);
                                if (error == -EEXIST)
index fbc98ee620448dbf1f99697231e003ab86dc0893..4ec430ae2b0daa7ed9b0814f00abae5e1c63fa8f 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
 
 static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 {
-       filemap_fdatawait(bdev->bd_inode->i_mapping);
+       /*
+        * We keep the error status of individual mapping so that
+        * applications can catch the writeback error using fsync(2).
+        * See filemap_fdatawait_keep_errors() for details.
+        */
+       filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
 /*
index 7af7648c06c63bd63ec21b150cc2713914f28c08..532ab79d38fe376c14a5463a97195b59a61d8f84 100644 (file)
@@ -525,7 +525,7 @@ xfs_qm_shrink_scan(
        unsigned long           freed;
        int                     error;
 
-       if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+       if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
                return 0;
 
        INIT_LIST_HEAD(&isol.buffers);
index b1bc954eccf37438213d6744fe69ff1bc4d71365..0f3e16b1ea64b454f500d949cf192ea20eeba732 100644 (file)
@@ -260,7 +260,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
                return NULL;
 
        cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, flag);
        return cpu_addr;
 }
 
index 14b0ff32fb9f16c6ce30e0e54c3f3b4885216699..63abda1ac06dbf74793130ca2859b6cc553e6cfc 100644 (file)
@@ -207,11 +207,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
-#endif
-
 #ifndef pmdp_collapse_flush
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
@@ -619,10 +614,6 @@ static inline int pmd_trans_huge(pmd_t pmd)
 {
        return 0;
 }
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-       return 0;
-}
 #ifndef __HAVE_ARCH_PMD_WRITE
 static inline int pmd_write(pmd_t pmd)
 {
index 4d3b842f4319586fef51493cfebae68397d2b0b4..0b921ae06cd83585e1d1cf2adb6baf665f203013 100644 (file)
@@ -834,7 +834,6 @@ struct drm_device {
 
        struct drm_sg_mem *sg;  /**< Scatter gather memory */
        unsigned int num_crtcs;                  /**< Number of CRTCs on this device */
-       sigset_t sigmask;
 
        struct {
                int context;
index e63553386ae715d387fadd39c4e53650ce3e226e..2b8ed123ad36b26bc26956293452937d3ac1b90e 100644 (file)
@@ -164,6 +164,8 @@ static inline __u8 ror8(__u8 word, unsigned int shift)
  * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit
  * @value: value to sign extend
  * @index: 0 based bit index (0<=index<32) to sign bit
+ *
+ * This is safe to use for 16- and 8-bit types as well.
  */
 static inline __s32 sign_extend32(__u32 value, int index)
 {
@@ -171,6 +173,17 @@ static inline __s32 sign_extend32(__u32 value, int index)
        return (__s32)(value << shift) >> shift;
 }
 
+/**
+ * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit
+ * @value: value to sign extend
+ * @index: 0 based bit index (0<=index<64) to sign bit
+ */
+static inline __s64 sign_extend64(__u64 value, int index)
+{
+       __u8 shift = 63 - index;
+       return (__s64)(value << shift) >> shift;
+}
+
 static inline unsigned fls_long(unsigned long l)
 {
        if (sizeof(l) == 4)
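
A small usage sketch for the new sign_extend64() helper, assuming a made-up 48-bit hardware field whose sign bit is bit 47:

#include <linux/bitops.h>

/* raw holds a 48-bit two's-complement value in its low bits */
static inline s64 decode_48bit_field(u64 raw)
{
	return sign_extend64(raw, 47);	/* bit 47 is the sign bit */
}
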
index aa8f61cf3a19fa3a28214f2a504e031e9e7d3e68..4cd4ddf64cc7f9dfeffcd71492b24b6f57e1e4a3 100644 (file)
@@ -15,7 +15,8 @@
 /* For more detailed tracepoint output */
 #define COMPACT_NO_SUITABLE_PAGE       5
 #define COMPACT_NOT_SUITABLE_ZONE      6
-/* When adding new state, please change compaction_status_string, too */
+#define COMPACT_CONTENDED              7
+/* When adding new states, please adjust include/trace/events/compaction.h */
 
 /* Used to signal whether compaction detected need_sched() or lock contention */
 /* No contention detected */
index 8efb40e61d6e48021d68f93635eea8d3ab3e8c0b..22ab246feed34c104038d3f94e1401ea9a587f8f 100644 (file)
 
 #if GCC_VERSION >= 40600
 /*
- * Tell the optimizer that something else uses this function or variable.
+ * When used with Link Time Optimization, gcc can optimize away C functions or
+ * variables which are referenced only from assembly code.  __visible tells the
+ * optimizer that something else uses this function or variable, thus preventing
+ * this.
  */
 #define __visible      __attribute__((externally_visible))
 #endif
 
+
+#if GCC_VERSION >= 40900 && !defined(__CHECKER__)
+/*
+ * __assume_aligned(n, k): Tell the optimizer that the returned
+ * pointer can be assumed to be k modulo n. The second argument is
+ * optional (default 0), so we use a variadic macro to make the
+ * shorthand.
+ *
+ * Beware: Do not apply this to functions which may return
+ * ERR_PTRs. Also, it is probably unwise to apply it to functions
+ * returning extra information in the low bits (but in that case the
+ * compiler should see some alignment anyway, when the return value is
+ * massaged by 'flags = ptr & 3; ptr &= ~3;').
+ */
+#define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
+#endif
+
 /*
  * GCC 'asm goto' miscompiles certain code sequences:
  *
index 8807e4f1b0e6b1878c845a7301f7aded28b4707b..f108e5222dad0ea50ae685173eaceb486f6eab94 100644 (file)
@@ -433,6 +433,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 #define __visible
 #endif
 
+/*
+ * Assume alignment of return value.
+ */
+#ifndef __assume_aligned
+#define __assume_aligned(a, ...)
+#endif
+
+
 /* Are two types/vars the same type (ignoring qualifiers)? */
 #ifndef __same_type
 # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
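
A hedged sketch of where __assume_aligned() could be applied; the slot pool and helper below are invented for illustration, and the attribute only records an alignment guarantee the function already provides:

#include <linux/compiler.h>

static char my_slots[8][64] __aligned(64);	/* hypothetical 64-byte slots */

static void *my_get_slot(unsigned int i) __assume_aligned(64);

static void *my_get_slot(unsigned int i)
{
	/* every slot starts on a 64-byte boundary, so the hint is accurate */
	return my_slots[i & 7];
}
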
index 1b357997cac5c95fabca2ec4e31988e1f57075d8..85a868ccb4931d374a1ee9fb4e4036bb84399561 100644 (file)
@@ -93,7 +93,7 @@ extern int current_cpuset_is_being_rebound(void);
 
 extern void rebuild_sched_domains(void);
 
-extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+extern void cpuset_print_current_mems_allowed(void);
 
 /*
  * read_mems_allowed_begin is required when making decisions involving
@@ -104,6 +104,9 @@ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
  */
 static inline unsigned int read_mems_allowed_begin(void)
 {
+       if (!cpusets_enabled())
+               return 0;
+
        return read_seqcount_begin(&current->mems_allowed_seq);
 }
 
@@ -115,6 +118,9 @@ static inline unsigned int read_mems_allowed_begin(void)
  */
 static inline bool read_mems_allowed_retry(unsigned int seq)
 {
+       if (!cpusets_enabled())
+               return false;
+
        return read_seqcount_retry(&current->mems_allowed_seq, seq);
 }
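
Both helpers are meant to wrap an allocation decision that samples current->mems_allowed; with the cpusets_enabled() short-circuit the loop costs nothing when no cpusets exist. A minimal sketch of the canonical retry loop (the wrapper function is hypothetical):

#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/topology.h>

static struct page *my_alloc_page_mems(gfp_t gfp)
{
	struct page *page;
	unsigned int cookie;

	do {
		cookie = read_mems_allowed_begin();
		page = __alloc_pages_nodemask(gfp, 0,
				node_zonelist(numa_node_id(), gfp), NULL);
		/* retry only if mems_allowed changed and the attempt failed */
	} while (!page && read_mems_allowed_retry(cookie));

	return page;
}
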
 
@@ -219,7 +225,7 @@ static inline void rebuild_sched_domains(void)
        partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_print_task_mems_allowed(struct task_struct *p)
+static inline void cpuset_print_current_mems_allowed(void)
 {
 }
 
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644 (file)
index 0000000..bba7a4d
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL           0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Returns the initial CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata:     pointer to the data to compute checksum for.
+ * @nbytes:    number of bytes in data buffer.
+ * @seed:      CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
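
A short usage sketch for the new interface; the frame buffer and helper name are illustrative:

#include <linux/crc64_ecma.h>

static u64 my_frame_crc(const u8 *frame, u32 len)
{
	u64 seed = crc64_ecma_seed();		/* default initial value */

	return crc64_ecma(frame, len, seed);
}
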
index fe8cb610deac70f3027edb7923d4bd99f525b7c6..e5f539dd56bfa3dfbd21836e80d7b32188502593 100644 (file)
@@ -51,7 +51,8 @@ extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
                               int nelems, int dir);
 
 extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
-                                    dma_addr_t dma_addr, void *virt);
+                                    dma_addr_t dma_addr, void *virt,
+                                    gfp_t flags);
 
 extern void debug_dma_free_coherent(struct device *dev, size_t size,
                                    void *virt, dma_addr_t addr);
@@ -132,7 +133,8 @@ static inline void debug_dma_unmap_sg(struct device *dev,
 }
 
 static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
-                                           dma_addr_t dma_addr, void *virt)
+                                           dma_addr_t dma_addr, void *virt,
+                                           gfp_t flags)
 {
 }
 
index ac07ff090919309c4f51d0a29cfeb9f9a7d673e4..2e551e2d2d03a7d78c80633637672a483ae42199 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _LINUX_DMA_MAPPING_H
 #define _LINUX_DMA_MAPPING_H
 
+#include <linux/sizes.h>
 #include <linux/string.h>
 #include <linux/device.h>
 #include <linux/err.h>
@@ -145,7 +146,9 @@ static inline void arch_teardown_dma_ops(struct device *dev) { }
 
 static inline unsigned int dma_get_max_seg_size(struct device *dev)
 {
-       return dev->dma_parms ? dev->dma_parms->max_segment_size : 65536;
+       if (dev->dma_parms && dev->dma_parms->max_segment_size)
+               return dev->dma_parms->max_segment_size;
+       return SZ_64K;
 }
 
 static inline unsigned int dma_set_max_seg_size(struct device *dev,
@@ -154,14 +157,15 @@ static inline unsigned int dma_set_max_seg_size(struct device *dev,
        if (dev->dma_parms) {
                dev->dma_parms->max_segment_size = size;
                return 0;
-       } else
-               return -EIO;
+       }
+       return -EIO;
 }
 
 static inline unsigned long dma_get_seg_boundary(struct device *dev)
 {
-       return dev->dma_parms ?
-               dev->dma_parms->segment_boundary_mask : 0xffffffff;
+       if (dev->dma_parms && dev->dma_parms->segment_boundary_mask)
+               return dev->dma_parms->segment_boundary_mask;
+       return DMA_BIT_MASK(32);
 }
 
 static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
@@ -169,8 +173,8 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
        if (dev->dma_parms) {
                dev->dma_parms->segment_boundary_mask = mask;
                return 0;
-       } else
-               return -EIO;
+       }
+       return -EIO;
 }
 
 #ifndef dma_max_pfn
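
The reworked getters now fall back to SZ_64K and DMA_BIT_MASK(32) when dev->dma_parms is missing or zero-filled; the setters still return -EIO unless the driver supplies the structure. A hedged probe-time sketch (device and limit are illustrative):

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/sizes.h>

static int my_setup_dma_limits(struct device *dev,
			       struct device_dma_parameters *parms)
{
	dev->dma_parms = parms;			/* required before the setters */
	return dma_set_max_seg_size(dev, SZ_1M);
}
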
index f2325998cd20cd551d445072bbcd0fb15c17732c..f78dd76f682817dbdbc1d357f8ffd8126857d3c2 100644 (file)
@@ -2410,6 +2410,7 @@ extern int write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
+extern void filemap_fdatawait_keep_errors(struct address_space *);
 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                   loff_t lend);
 extern int filemap_write_and_wait(struct address_space *mapping);
index f92cbd2f44507adda1333d60adc5fb4686a3f029..a7265dd30221987c26f5dbbc56918c1da2350ec4 100644 (file)
@@ -14,7 +14,7 @@ struct vm_area_struct;
 #define ___GFP_HIGHMEM         0x02u
 #define ___GFP_DMA32           0x04u
 #define ___GFP_MOVABLE         0x08u
-#define ___GFP_WAIT            0x10u
+#define ___GFP_RECLAIMABLE     0x10u
 #define ___GFP_HIGH            0x20u
 #define ___GFP_IO              0x40u
 #define ___GFP_FS              0x80u
@@ -29,18 +29,17 @@ struct vm_area_struct;
 #define ___GFP_NOMEMALLOC      0x10000u
 #define ___GFP_HARDWALL                0x20000u
 #define ___GFP_THISNODE                0x40000u
-#define ___GFP_RECLAIMABLE     0x80000u
+#define ___GFP_ATOMIC          0x80000u
 #define ___GFP_NOACCOUNT       0x100000u
 #define ___GFP_NOTRACK         0x200000u
-#define ___GFP_NO_KSWAPD       0x400000u
+#define ___GFP_DIRECT_RECLAIM  0x400000u
 #define ___GFP_OTHER_NODE      0x800000u
 #define ___GFP_WRITE           0x1000000u
+#define ___GFP_KSWAPD_RECLAIM  0x2000000u
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
- * GFP bitmasks..
- *
- * Zone modifiers (see linux/mmzone.h - low three bits)
+ * Physical address zone modifiers (see linux/mmzone.h - low four bits)
  *
  * Do not put any conditional on these. If necessary modify the definitions
  * without the underscores and use them consistently. The definitions here may
@@ -50,116 +49,230 @@ struct vm_area_struct;
 #define __GFP_HIGHMEM  ((__force gfp_t)___GFP_HIGHMEM)
 #define __GFP_DMA32    ((__force gfp_t)___GFP_DMA32)
 #define __GFP_MOVABLE  ((__force gfp_t)___GFP_MOVABLE)  /* Page is movable */
+#define __GFP_MOVABLE  ((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
 #define GFP_ZONEMASK   (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
+
+/*
+ * Page mobility and placement hints
+ *
+ * These flags provide hints about how mobile the page is. Pages with similar
+ * mobility are placed within the same pageblocks to minimise problems due
+ * to external fragmentation.
+ *
+ * __GFP_MOVABLE (also a zone modifier) indicates that the page can be
+ *   moved by page migration during memory compaction or can be reclaimed.
+ *
+ * __GFP_RECLAIMABLE is used for slab allocations that specify
+ *   SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
+ *
+ * __GFP_WRITE indicates the caller intends to dirty the page. Where possible,
+ *   these pages will be spread between local zones to avoid all the dirty
+ *   pages being in one zone (fair zone allocation policy).
+ *
+ * __GFP_HARDWALL enforces the cpuset memory allocation policy.
+ *
+ * __GFP_THISNODE forces the allocation to be satisfied from the requested
+ *   node with no fallbacks or placement policy enforcements.
+ */
+#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
+#define __GFP_WRITE    ((__force gfp_t)___GFP_WRITE)
+#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
+#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
+
 /*
- * Action modifiers - doesn't change the zoning
+ * Watermark modifiers -- controls access to emergency reserves
+ *
+ * __GFP_HIGH indicates that the caller is high-priority and that granting
+ *   the request is necessary before the system can make forward progress.
+ *   For example, creating an IO context to clean pages.
+ *
+ * __GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
+ *   high priority. Users are typically interrupt handlers. This may be
+ *   used in conjunction with __GFP_HIGH.
+ *
+ * __GFP_MEMALLOC allows access to all memory. This should only be used when
+ *   the caller guarantees the allocation will allow more memory to be freed
+ *   very shortly e.g. process exiting or swapping. Users either should
+ *   be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
+ *
+ * __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
+ *   This takes precedence over the __GFP_MEMALLOC flag if both are set.
+ *
+ * __GFP_NOACCOUNT ignores the accounting for kmemcg limit enforcement.
+ */
+#define __GFP_ATOMIC   ((__force gfp_t)___GFP_ATOMIC)
+#define __GFP_HIGH     ((__force gfp_t)___GFP_HIGH)
+#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)
+#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
+#define __GFP_NOACCOUNT        ((__force gfp_t)___GFP_NOACCOUNT)
+
+/*
+ * Reclaim modifiers
+ *
+ * __GFP_IO can start physical IO.
+ *
+ * __GFP_FS can call down to the low-level FS. Clearing the flag avoids the
+ *   allocator recursing into the filesystem which might already be holding
+ *   locks.
+ *
+ * __GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
+ *   This flag can be cleared to avoid unnecessary delays when a fallback
+ *   option is available.
+ *
+ * __GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
+ *   the low watermark is reached and have it reclaim pages until the high
+ *   watermark is reached. A caller may wish to clear this flag when fallback
+ *   options are available and the reclaim is likely to disrupt the system. The
+ *   canonical example is THP allocation where a fallback is cheap but
+ *   reclaim/compaction may cause indirect stalls.
+ *
+ * __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
  *
  * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt
- * _might_ fail.  This depends upon the particular VM implementation.
+ *   _might_ fail.  This depends upon the particular VM implementation.
  *
  * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
- * cannot handle allocation failures. New users should be evaluated carefully
- * (and the flag should be used only when there is no reasonable failure policy)
- * but it is definitely preferable to use the flag rather than opencode endless
- * loop around allocator.
+ *   cannot handle allocation failures. New users should be evaluated carefully
+ *   (and the flag should be used only when there is no reasonable failure
+ *   policy) but it is definitely preferable to use the flag rather than
+ *   opencode endless loop around allocator.
  *
  * __GFP_NORETRY: The VM implementation must not retry indefinitely and will
- * return NULL when direct reclaim and memory compaction have failed to allow
- * the allocation to succeed.  The OOM killer is not called with the current
- * implementation.
- *
- * __GFP_MOVABLE: Flag that this page will be movable by the page migration
- * mechanism or reclaimed
+ *   return NULL when direct reclaim and memory compaction have failed to allow
+ *   the allocation to succeed.  The OOM killer is not called with the current
+ *   implementation.
  */
-#define __GFP_WAIT     ((__force gfp_t)___GFP_WAIT)    /* Can wait and reschedule? */
-#define __GFP_HIGH     ((__force gfp_t)___GFP_HIGH)    /* Should access emergency pools? */
-#define __GFP_IO       ((__force gfp_t)___GFP_IO)      /* Can start physical IO? */
-#define __GFP_FS       ((__force gfp_t)___GFP_FS)      /* Can call down to low-level FS? */
-#define __GFP_COLD     ((__force gfp_t)___GFP_COLD)    /* Cache-cold page required */
-#define __GFP_NOWARN   ((__force gfp_t)___GFP_NOWARN)  /* Suppress page allocation failure warning */
-#define __GFP_REPEAT   ((__force gfp_t)___GFP_REPEAT)  /* See above */
-#define __GFP_NOFAIL   ((__force gfp_t)___GFP_NOFAIL)  /* See above */
-#define __GFP_NORETRY  ((__force gfp_t)___GFP_NORETRY) /* See above */
-#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
-#define __GFP_COMP     ((__force gfp_t)___GFP_COMP)    /* Add compound page metadata */
-#define __GFP_ZERO     ((__force gfp_t)___GFP_ZERO)    /* Return zeroed page on success */
-#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves.
-                                                        * This takes precedence over the
-                                                        * __GFP_MEMALLOC flag if both are
-                                                        * set
-                                                        */
-#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
-#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
-#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
-#define __GFP_NOACCOUNT        ((__force gfp_t)___GFP_NOACCOUNT) /* Don't account to kmemcg */
-#define __GFP_NOTRACK  ((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
-
-#define __GFP_NO_KSWAPD        ((__force gfp_t)___GFP_NO_KSWAPD)
-#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
-#define __GFP_WRITE    ((__force gfp_t)___GFP_WRITE)   /* Allocator intends to dirty page */
+#define __GFP_IO       ((__force gfp_t)___GFP_IO)
+#define __GFP_FS       ((__force gfp_t)___GFP_FS)
+#define __GFP_DIRECT_RECLAIM   ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
+#define __GFP_KSWAPD_RECLAIM   ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
+#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
+#define __GFP_REPEAT   ((__force gfp_t)___GFP_REPEAT)
+#define __GFP_NOFAIL   ((__force gfp_t)___GFP_NOFAIL)
+#define __GFP_NORETRY  ((__force gfp_t)___GFP_NORETRY)
 
 /*
- * This may seem redundant, but it's a way of annotating false positives vs.
- * allocations that simply cannot be supported (e.g. page tables).
+ * Action modifiers
+ *
+ * __GFP_COLD indicates that the caller does not expect the page to be used in
+ *   the near future. Where possible, a cache-cold page will be returned.
+ *
+ * __GFP_NOWARN suppresses allocation failure reports.
+ *
+ * __GFP_COMP adds compound page metadata.
+ *
+ * __GFP_ZERO returns a zeroed page on success.
+ *
+ * __GFP_NOTRACK avoids tracking with kmemcheck.
+ *
+ * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of
+ *   distinguishing in the source between false positives and allocations that
+ *   cannot be supported (e.g. page tables).
+ *
+ * __GFP_OTHER_NODE is for allocations that are on a remote node but that
+ *   should not be accounted for as a remote allocation in vmstat. A
+ *   typical user would be khugepaged collapsing a huge page on a remote
+ *   node.
  */
+#define __GFP_COLD     ((__force gfp_t)___GFP_COLD)
+#define __GFP_NOWARN   ((__force gfp_t)___GFP_NOWARN)
+#define __GFP_COMP     ((__force gfp_t)___GFP_COMP)
+#define __GFP_ZERO     ((__force gfp_t)___GFP_ZERO)
+#define __GFP_NOTRACK  ((__force gfp_t)___GFP_NOTRACK)
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
+#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE)
 
-#define __GFP_BITS_SHIFT 25    /* Room for N __GFP_FOO bits */
+/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 26
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
-/* This equals 0, but use constants in case they ever change */
-#define GFP_NOWAIT     (GFP_ATOMIC & ~__GFP_HIGH)
-/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
-#define GFP_ATOMIC     (__GFP_HIGH)
-#define GFP_NOIO       (__GFP_WAIT)
-#define GFP_NOFS       (__GFP_WAIT | __GFP_IO)
-#define GFP_KERNEL     (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_TEMPORARY  (__GFP_WAIT | __GFP_IO | __GFP_FS | \
+/*
+ * Commonly used GFP flag combinations. It is recommended
+ * that subsystems start with one of these combinations and then set/clear
+ * __GFP_FOO flags as necessary.
+ *
+ * GFP_ATOMIC users cannot sleep and need the allocation to succeed. A lower
+ *   watermark is applied to allow access to "atomic reserves".
+ *
+ * GFP_KERNEL is typical for kernel-internal allocations. The caller requires
+ *   ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
+ *
+ * GFP_NOWAIT is for kernel allocations that should not stall for direct
+ *   reclaim, start physical IO or use any filesystem callback.
+ *
+ * GFP_NOIO will use direct reclaim to discard clean pages or slab pages
+ *   that do not require the starting of any physical IO.
+ *
+ * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+ *
+ * GFP_USER is for userspace allocations that also need to be directly
+ *   accessible by the kernel or hardware. It is typically used by hardware
+ *   for buffers that are mapped to userspace (e.g. graphics) that hardware
+ *   still must DMA to. cpuset limits are enforced for these allocations.
+ *
+ * GFP_DMA exists for historical reasons and should be avoided where possible.
+ *   The flag indicates that the caller requires that the lowest zone be
+ *   used (ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
+ *   it would require careful auditing as some users really require it and
+ *   others use the flag to avoid lowmem reserves in ZONE_DMA and treat the
+ *   lowest zone as a type of emergency reserve.
+ *
+ * GFP_DMA32 is similar to GFP_DMA except that the caller requires a 32-bit
+ *   address.
+ *
+ * GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
+ *   do not need to be directly accessible by the kernel but that cannot
+ *   move once in use. An example may be a hardware allocation that maps
+ *   data directly into userspace but has no addressing limitations.
+ *
+ * GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
+ *   need direct access to but can use kmap() when access is required. They
+ *   are expected to be movable via page reclaim or page migration. Typically,
+ *   pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
+ *
+ * GFP_TRANSHUGE is used for THP allocations. They are compound allocations
+ *   that will fail quickly if memory is not available and will not wake
+ *   kswapd on failure.
+ */
+#define GFP_ATOMIC     (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_KERNEL     (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+#define GFP_NOWAIT     (__GFP_KSWAPD_RECLAIM)
+#define GFP_NOIO       (__GFP_RECLAIM)
+#define GFP_NOFS       (__GFP_RECLAIM | __GFP_IO)
+#define GFP_TEMPORARY  (__GFP_RECLAIM | __GFP_IO | __GFP_FS | \
                         __GFP_RECLAIMABLE)
-#define GFP_USER       (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_USER       (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_DMA                __GFP_DMA
+#define GFP_DMA32      __GFP_DMA32
 #define GFP_HIGHUSER   (GFP_USER | __GFP_HIGHMEM)
 #define GFP_HIGHUSER_MOVABLE   (GFP_HIGHUSER | __GFP_MOVABLE)
-#define GFP_IOFS       (__GFP_IO | __GFP_FS)
-#define GFP_TRANSHUGE  (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-                        __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
-                        __GFP_NO_KSWAPD)
+#define GFP_IOFS       (__GFP_IO | __GFP_FS | __GFP_KSWAPD_RECLAIM)
+#define GFP_TRANSHUGE  ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+                        __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \
+                        ~__GFP_KSWAPD_RECLAIM)
 
-/* This mask makes up all the page movable related flags */
+/* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
+#define GFP_MOVABLE_SHIFT 3
 
-/* Control page allocator reclaim behavior */
-#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
-                       __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
-                       __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
-
-/* Control slab gfp mask during early boot */
-#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
-
-/* Control allocation constraints */
-#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
-
-/* Do not use these with a slab allocator */
-#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-
-/* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
-   platforms, used as appropriate on others */
-
-#define GFP_DMA                __GFP_DMA
-
-/* 4GB DMA on some platforms */
-#define GFP_DMA32      __GFP_DMA32
-
-/* Convert GFP flags to their corresponding migrate type */
 static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
 {
-       WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
+       VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
+       BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
+       BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
 
        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;
 
        /* Group based on mobility */
-       return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
-               ((gfp_flags & __GFP_RECLAIMABLE) != 0);
+       return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
+}
+#undef GFP_MOVABLE_MASK
+#undef GFP_MOVABLE_SHIFT
+
+static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
+{
+       return gfp_flags & __GFP_DIRECT_RECLAIM;
 }
 
 #ifdef CONFIG_HIGHMEM
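
With __GFP_WAIT gone, code that used to test for it asks gfpflags_allow_blocking() instead, and opportunistic allocations drop reclaim by clearing the new __GFP_DIRECT_RECLAIM/__GFP_KSWAPD_RECLAIM bits rather than passing __GFP_NO_KSWAPD. A hedged sketch of both idioms (helper names are invented):

#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *my_buf_alloc(size_t size, gfp_t gfp)
{
	void *p = kmalloc(size, gfp);

	/* only fall back to vmalloc if the context may sleep */
	if (!p && gfpflags_allow_blocking(gfp))
		p = vmalloc(size);
	return p;
}

static struct page *my_try_highorder(unsigned int order)
{
	/* optimistic attempt: no kswapd wakeup, no direct reclaim */
	gfp_t gfp = (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN) &
		    ~__GFP_RECLAIM;

	return alloc_pages(gfp, order);
}
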
index ecb080d6ff42077513f03b95537dc108bded9e07..f7c3f13f3a9c08e6d39f84d749242b6e35b78f9e 100644 (file)
@@ -19,13 +19,16 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                          unsigned long addr,
                                          pmd_t *pmd,
                                          unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+                       struct vm_area_struct *vma,
+                       pmd_t *pmd, unsigned long addr);
 extern int zap_huge_pmd(struct mmu_gather *tlb,
                        struct vm_area_struct *vma,
                        pmd_t *pmd, unsigned long addr);
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned char *vec);
-extern int move_huge_pmd(struct vm_area_struct *vma,
+extern bool move_huge_pmd(struct vm_area_struct *vma,
                         struct vm_area_struct *new_vma,
                         unsigned long old_addr,
                         unsigned long new_addr, unsigned long old_end,
@@ -48,16 +51,11 @@ enum transparent_hugepage_flag {
 #endif
 };
 
-enum page_check_address_pmd_flag {
-       PAGE_CHECK_ADDRESS_PMD_FLAG,
-       PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG,
-       PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG,
-};
 extern pmd_t *page_check_address_pmd(struct page *page,
                                     struct mm_struct *mm,
                                     unsigned long address,
-                                    enum page_check_address_pmd_flag flag,
                                     spinlock_t **ptl);
+extern int pmd_freeable(pmd_t pmd);
 
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
@@ -95,30 +93,27 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 #endif /* CONFIG_DEBUG_VM */
 
 extern unsigned long transparent_hugepage_flags;
-extern int split_huge_page_to_list(struct page *page, struct list_head *list);
+
+extern void prep_transhuge_page(struct page *page);
+extern void free_transhuge_page(struct page *page);
+
+int split_huge_page_to_list(struct page *page, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {
        return split_huge_page_to_list(page, NULL);
 }
-extern void __split_huge_page_pmd(struct vm_area_struct *vma,
-               unsigned long address, pmd_t *pmd);
-#define split_huge_page_pmd(__vma, __address, __pmd)                   \
+void deferred_split_huge_page(struct page *page);
+
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long address);
+
+#define split_huge_pmd(__vma, __pmd, __address)                                \
        do {                                                            \
                pmd_t *____pmd = (__pmd);                               \
-               if (unlikely(pmd_trans_huge(*____pmd)))                 \
-                       __split_huge_page_pmd(__vma, __address,         \
-                                       ____pmd);                       \
+               if (pmd_trans_huge(*____pmd))                           \
+                       __split_huge_pmd(__vma, __pmd, __address);      \
        }  while (0)
-#define wait_split_huge_page(__anon_vma, __pmd)                                \
-       do {                                                            \
-               pmd_t *____pmd = (__pmd);                               \
-               anon_vma_lock_write(__anon_vma);                        \
-               anon_vma_unlock_write(__anon_vma);                      \
-               BUG_ON(pmd_trans_splitting(*____pmd) ||                 \
-                      pmd_trans_huge(*____pmd));                       \
-       } while (0)
-extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd);
+
 #if HPAGE_PMD_ORDER >= MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
@@ -128,17 +123,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                    unsigned long start,
                                    unsigned long end,
                                    long adjust_next);
-extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl);
 /* mmap_sem must be held on entry */
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
 {
        VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
        if (pmd_trans_huge(*pmd))
                return __pmd_trans_huge_lock(pmd, vma, ptl);
        else
-               return 0;
+               return false;
 }
 static inline int hpage_nr_pages(struct page *page)
 {
@@ -183,11 +178,8 @@ static inline int split_huge_page(struct page *page)
 {
        return 0;
 }
-#define split_huge_page_pmd(__vma, __address, __pmd)   \
-       do { } while (0)
-#define wait_split_huge_page(__anon_vma, __pmd)        \
-       do { } while (0)
-#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
+static inline void deferred_split_huge_page(struct page *page) {}
+#define split_huge_pmd(__vma, __pmd, __address)        \
        do { } while (0)
 static inline int hugepage_madvise(struct vm_area_struct *vma,
                                   unsigned long *vm_flags, int advice)
@@ -201,10 +193,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         long adjust_next)
 {
 }
-static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
 {
-       return 0;
+       return false;
 }
 
 static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
index 5e35379f58a53d09cf50bd693d0e8639c7b79709..4be35b9460c640286ef77efadad42b52678e11e1 100644 (file)
@@ -42,6 +42,16 @@ struct resv_map {
 extern struct resv_map *resv_map_alloc(void);
 void resv_map_release(struct kref *ref);
 
+/*
+ * hugetlb_falloc is used to prevent page faults during fallocate hole punch
+ * operations.  During hole punch, inode->i_private points to this struct.
+ */
+struct hugetlb_falloc {
+       wait_queue_head_t *waitq;       /* Page faults waiting on hole punch */
+       pgoff_t start;                  /* Start of fallocate hole */
+       pgoff_t end;                    /* End of fallocate hole */
+};
+
 extern spinlock_t hugetlb_lock;
 extern int hugetlb_max_hstate __read_mostly;
 #define for_each_hstate(h) \
@@ -483,6 +493,17 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 #define hugepages_supported() (HPAGE_SHIFT != 0)
 #endif
 
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);
+
+static inline void hugetlb_count_add(long l, struct mm_struct *mm)
+{
+       atomic_long_add(l, &mm->hugetlb_usage);
+}
+
+static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
+{
+       atomic_long_sub(l, &mm->hugetlb_usage);
+}
 #else  /* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page(v, a, r) NULL
@@ -519,6 +540,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 {
        return &mm->page_table_lock;
 }
+
+static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
+{
+}
+
+static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
+{
+}
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static inline spinlock_t *huge_pte_lock(struct hstate *h,
index 7edd305152983af1ab6aee93f470dd99289046e8..24154c26d469c60984020b5a0441fcb6dde3fcb0 100644 (file)
@@ -32,7 +32,7 @@ static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
 
        if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
                return NULL;
-       return (struct hugetlb_cgroup *)page[2].lru.next;
+       return (struct hugetlb_cgroup *)page[2].private;
 }
 
 static inline
@@ -42,7 +42,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
 
        if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
                return -1;
-       page[2].lru.next = (void *)h_cg;
+       page[2].private = (unsigned long)h_cg;
        return 0;
 }
 
index 5582410727cbf5cdb70bb1cba30729d7ebf4aa26..2c13f747ac2e8727b090f26f085f8cc79972c6b3 100644 (file)
@@ -413,6 +413,8 @@ extern __printf(2, 3)
 char *kasprintf(gfp_t gfp, const char *fmt, ...);
 extern __printf(2, 0)
 char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
+extern __printf(2, 0)
+const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
 
 extern __scanf(2, 3)
 int sscanf(const char *, const char *, ...);
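
kvasprintf_const() pairs with kfree_const(): when the format needs no expansion it may avoid allocating (e.g. by reusing a string that already lives in .rodata), so the result must not be passed to plain kfree(). A hedged sketch (helper name is invented):

#include <linux/kernel.h>
#include <linux/string.h>	/* kfree_const() */

static const char *my_format_name(gfp_t gfp, const char *fmt, ...)
{
	va_list ap;
	const char *name;

	va_start(ap, fmt);
	name = kvasprintf_const(gfp, fmt, ap);
	va_end(ap);

	return name;	/* release with kfree_const(), not kfree() */
}
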
index d140b1e9faa71791264d6439bd8429810fff3ddd..2da38f093391eb387dac8a11c17160d000d467ec 100644 (file)
@@ -269,6 +269,8 @@ unsigned long paddr_vmcoreinfo_note(void);
        vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
 #define VMCOREINFO_CONFIG(name) \
        vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+       vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
 
 extern struct kimage *kexec_image;
 extern struct kimage *kexec_crash_image;
index c518eb5892603fdd89cdb6d674f1f633f0451367..24daf8fc4d7c71e4c36ed0063849c01a3b31d022 100644 (file)
@@ -89,10 +89,6 @@ int memblock_add_range(struct memblock_type *type,
                       phys_addr_t base, phys_addr_t size,
                       int nid, unsigned long flags);
 
-int memblock_remove_range(struct memblock_type *type,
-                         phys_addr_t base,
-                         phys_addr_t size);
-
 void __next_mem_range(u64 *idx, int nid, ulong flags,
                      struct memblock_type *type_a,
                      struct memblock_type *type_b, phys_addr_t *out_start,
index 27251ed428f7db8adaf54c58b7f9e41deda9048d..ffc5460ed9e55b54f473bbacd90b62f30f93e978 100644 (file)
@@ -294,15 +294,16 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
-                         gfp_t gfp_mask, struct mem_cgroup **memcgp);
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound);
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             bool lrucare);
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+                             bool lrucare, bool compound);
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+               bool compound);
 void mem_cgroup_uncharge(struct page *page);
 void mem_cgroup_uncharge_list(struct list_head *page_list);
 
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-                       bool lrucare);
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
 
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
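/*
 * Illustrative sketch, not part of the patch: the typical caller pattern for
 * the reworked charge API, which now takes an explicit 'compound' argument
 * instead of inspecting the page.  The function name and error handling are
 * assumptions for illustration.
 */
static int example_charge_new_page(struct page *page, struct mm_struct *mm,
                                   gfp_t gfp)
{
        struct mem_cgroup *memcg;

        if (mem_cgroup_try_charge(page, mm, gfp, &memcg, false))
                return -ENOMEM;
        /*
         * ... map the page; if that fails, undo with
         * mem_cgroup_cancel_charge(page, memcg, false) ...
         */
        mem_cgroup_commit_charge(page, memcg, false, false);
        return 0;
}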
@@ -384,7 +385,7 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
        return mz->lru_size[lru];
 }
 
-static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 {
        unsigned long inactive_ratio;
        unsigned long inactive;
@@ -403,24 +404,26 @@ static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
        return inactive * inactive_ratio < active;
 }
 
+void mem_cgroup_handle_over_high(void);
+
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
                                struct task_struct *p);
 
 static inline void mem_cgroup_oom_enable(void)
 {
-       WARN_ON(current->memcg_oom.may_oom);
-       current->memcg_oom.may_oom = 1;
+       WARN_ON(current->memcg_may_oom);
+       current->memcg_may_oom = 1;
 }
 
 static inline void mem_cgroup_oom_disable(void)
 {
-       WARN_ON(!current->memcg_oom.may_oom);
-       current->memcg_oom.may_oom = 0;
+       WARN_ON(!current->memcg_may_oom);
+       current->memcg_may_oom = 0;
 }
 
 static inline bool task_in_memcg_oom(struct task_struct *p)
 {
-       return p->memcg_oom.memcg;
+       return p->memcg_in_oom;
 }
 
 bool mem_cgroup_oom_synchronize(bool wait);
@@ -512,7 +515,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root,
 
 static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask,
-                                       struct mem_cgroup **memcgp)
+                                       struct mem_cgroup **memcgp,
+                                       bool compound)
 {
        *memcgp = NULL;
        return 0;
@@ -520,12 +524,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 
 static inline void mem_cgroup_commit_charge(struct page *page,
                                            struct mem_cgroup *memcg,
-                                           bool lrucare)
+                                           bool lrucare, bool compound)
 {
 }
 
 static inline void mem_cgroup_cancel_charge(struct page *page,
-                                           struct mem_cgroup *memcg)
+                                           struct mem_cgroup *memcg,
+                                           bool compound)
 {
 }
 
@@ -537,9 +542,7 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
-static inline void mem_cgroup_migrate(struct page *oldpage,
-                                     struct page *newpage,
-                                     bool lrucare)
+static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
 {
 }
 
@@ -585,10 +588,10 @@ static inline bool mem_cgroup_disabled(void)
        return true;
 }
 
-static inline int
+static inline bool
 mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 {
-       return 1;
+       return true;
 }
 
 static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
@@ -622,6 +625,10 @@ static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 {
 }
 
+static inline void mem_cgroup_handle_over_high(void)
+{
+}
+
 static inline void mem_cgroup_oom_enable(void)
 {
 }
@@ -748,11 +755,10 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
  * conditions, but because they are pretty simple, they are expected to be
  * fast.
  */
-bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
-                                       int order);
-void __memcg_kmem_commit_charge(struct page *page,
-                                      struct mem_cgroup *memcg, int order);
-void __memcg_kmem_uncharge_pages(struct page *page, int order);
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+                             struct mem_cgroup *memcg);
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+void __memcg_kmem_uncharge(struct page *page, int order);
 
 /*
 * helper for accessing a memcg's index. It will be used as an index in the
@@ -767,77 +773,42 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
 
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
-
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                     unsigned long nr_pages);
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
-
-/**
- * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
- * @gfp: the gfp allocation flags.
- * @memcg: a pointer to the memcg this was charged against.
- * @order: allocation order.
- *
- * returns true if the memcg where the current task belongs can hold this
- * allocation.
- *
- * We return true automatically if this allocation is not to be accounted to
- * any memcg.
- */
-static inline bool
-memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+static inline bool __memcg_kmem_bypass(gfp_t gfp)
 {
        if (!memcg_kmem_enabled())
                return true;
-
        if (gfp & __GFP_NOACCOUNT)
                return true;
-       /*
-        * __GFP_NOFAIL allocations will move on even if charging is not
-        * possible. Therefore we don't even try, and have this allocation
-        * unaccounted. We could in theory charge it forcibly, but we hope
-        * those allocations are rare, and won't be worth the trouble.
-        */
-       if (gfp & __GFP_NOFAIL)
-               return true;
        if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
                return true;
-
-       /* If the test is dying, just let it go. */
-       if (unlikely(fatal_signal_pending(current)))
-               return true;
-
-       return __memcg_kmem_newpage_charge(gfp, memcg, order);
+       return false;
 }
 
 /**
- * memcg_kmem_uncharge_pages: uncharge pages from memcg
- * @page: pointer to struct page being freed
- * @order: allocation order.
+ * memcg_kmem_charge: charge a kmem page
+ * @page: page to charge
+ * @gfp: reclaim mode
+ * @order: allocation order
+ *
+ * Returns 0 on success, an error code on failure.
  */
-static inline void
-memcg_kmem_uncharge_pages(struct page *page, int order)
+static __always_inline int memcg_kmem_charge(struct page *page,
+                                            gfp_t gfp, int order)
 {
-       if (memcg_kmem_enabled())
-               __memcg_kmem_uncharge_pages(page, order);
+       if (__memcg_kmem_bypass(gfp))
+               return 0;
+       return __memcg_kmem_charge(page, gfp, order);
 }
 
 /**
- * memcg_kmem_commit_charge: embeds correct memcg in a page
- * @page: pointer to struct page recently allocated
- * @memcg: the memcg structure we charged against
- * @order: allocation order.
- *
- * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
- * failure of the allocation. if @page is NULL, this function will revert the
- * charges. Otherwise, it will commit @page to @memcg.
+ * memcg_kmem_uncharge: uncharge a kmem page
+ * @page: page to uncharge
+ * @order: allocation order
  */
-static inline void
-memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
 {
-       if (memcg_kmem_enabled() && memcg)
-               __memcg_kmem_commit_charge(page, memcg, order);
+       if (memcg_kmem_enabled())
+               __memcg_kmem_uncharge(page, order);
 }
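/*
 * Illustrative sketch, not part of the patch: with the rework, kmem page
 * accounting becomes a post-allocation charge paired with an uncharge on
 * free.  The wrapper names are hypothetical.
 */
static struct page *example_alloc_kmem_pages(gfp_t gfp, unsigned int order)
{
        struct page *page = alloc_pages(gfp, order);

        if (page && memcg_kmem_charge(page, gfp, order)) {
                __free_pages(page, order);
                return NULL;
        }
        return page;
}

static void example_free_kmem_pages(struct page *page, unsigned int order)
{
        memcg_kmem_uncharge(page, order);
        __free_pages(page, order);
}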
 
 /**
@@ -850,17 +821,8 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
 static __always_inline struct kmem_cache *
 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
-       if (!memcg_kmem_enabled())
+       if (__memcg_kmem_bypass(gfp))
                return cachep;
-       if (gfp & __GFP_NOACCOUNT)
-               return cachep;
-       if (gfp & __GFP_NOFAIL)
-               return cachep;
-       if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
-               return cachep;
-       if (unlikely(fatal_signal_pending(current)))
-               return cachep;
-
        return __memcg_kmem_get_cache(cachep);
 }
 
@@ -869,13 +831,6 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
        if (memcg_kmem_enabled())
                __memcg_kmem_put_cache(cachep);
 }
-
-static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
-{
-       if (!memcg_kmem_enabled())
-               return NULL;
-       return __mem_cgroup_from_kmem(ptr);
-}
 #else
 #define for_each_memcg_cache_index(_idx)       \
        for (; NULL; )
@@ -890,18 +845,12 @@ static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
        return false;
 }
 
-static inline bool
-memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 {
-       return true;
+       return 0;
 }
 
-static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
-{
-}
-
-static inline void
-memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+static inline void memcg_kmem_uncharge(struct page *page, int order)
 {
 }
 
@@ -927,11 +876,5 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 }
-
-static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
-{
-       return NULL;
-}
 #endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
-
index 80001de019ba33d86b90b9922b39722270cb0449..30ef3b535444f111f9be160c6c38ed45671f441b 100644 (file)
@@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY    0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND  0x00040000      /* Cannot expand with mremap() */
+#define VM_LOCKONFAULT 0x00080000      /* Lock the pages covered when they are faulted in */
 #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
 #define VM_NORESERVE   0x00200000      /* should the VM suppress accounting */
 #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
@@ -202,6 +203,9 @@ extern unsigned int kobjsize(const void *objp);
 /* This mask defines which mm->def_flags a process can inherit its parent */
 #define VM_INIT_DEF_MASK       VM_NOHUGEPAGE
 
+/* This mask is used to clear all the VMA flags used by mlock */
+#define VM_LOCKED_CLEAR_MASK   (~(VM_LOCKED | VM_LOCKONFAULT))
+
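/*
 * Illustrative sketch, not part of the patch: munlock-style code can now
 * strip both mlock-related flags in one assignment.
 */
static inline void example_vma_clear_mlock(struct vm_area_struct *vma)
{
        vma->vm_flags &= VM_LOCKED_CLEAR_MASK;  /* clears VM_LOCKED and VM_LOCKONFAULT */
}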
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
@@ -391,79 +395,17 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 
 extern void kvfree(const void *addr);
 
-static inline void compound_lock(struct page *page)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       bit_spin_lock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline void compound_unlock(struct page *page)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       VM_BUG_ON_PAGE(PageSlab(page), page);
-       bit_spin_unlock(PG_compound_lock, &page->flags);
-#endif
-}
-
-static inline unsigned long compound_lock_irqsave(struct page *page)
+static inline atomic_t *compound_mapcount_ptr(struct page *page)
 {
-       unsigned long uninitialized_var(flags);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       local_irq_save(flags);
-       compound_lock(page);
-#endif
-       return flags;
+       return &page[1].compound_mapcount;
 }
 
-static inline void compound_unlock_irqrestore(struct page *page,
-                                             unsigned long flags)
+static inline int compound_mapcount(struct page *page)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       compound_unlock(page);
-       local_irq_restore(flags);
-#endif
-}
-
-static inline struct page *compound_head_by_tail(struct page *tail)
-{
-       struct page *head = tail->first_page;
-
-       /*
-        * page->first_page may be a dangling pointer to an old
-        * compound page, so recheck that it is still a tail
-        * page before returning.
-        */
-       smp_rmb();
-       if (likely(PageTail(tail)))
-               return head;
-       return tail;
-}
-
-/*
- * Since either compound page could be dismantled asynchronously in THP
- * or we access asynchronously arbitrary positioned struct page, there
- * would be tail flag race. To handle this race, we should call
- * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
- */
-static inline struct page *compound_head(struct page *page)
-{
-       if (unlikely(PageTail(page)))
-               return compound_head_by_tail(page);
-       return page;
-}
-
-/*
- * If we access compound page synchronously such as access to
- * allocated page, there is no need to handle tail flag race, so we can
- * check tail flag directly without any synchronization primitive.
- */
-static inline struct page *compound_head_fast(struct page *page)
-{
-       if (unlikely(PageTail(page)))
-               return page->first_page;
-       return page;
+       if (!PageCompound(page))
+               return 0;
+       page = compound_head(page);
+       return atomic_read(compound_mapcount_ptr(page)) + 1;
 }
 
 /*
@@ -478,8 +420,17 @@ static inline void page_mapcount_reset(struct page *page)
 
 static inline int page_mapcount(struct page *page)
 {
+       int ret;
        VM_BUG_ON_PAGE(PageSlab(page), page);
-       return atomic_read(&page->_mapcount) + 1;
+
+       ret = atomic_read(&page->_mapcount) + 1;
+       if (PageCompound(page)) {
+               page = compound_head(page);
+               ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+               if (PageDoubleMap(page))
+                       ret--;
+       }
+       return ret;
 }
 
 static inline int page_count(struct page *page)
@@ -487,44 +438,9 @@ static inline int page_count(struct page *page)
        return atomic_read(&compound_head(page)->_count);
 }
 
-static inline bool __compound_tail_refcounted(struct page *page)
-{
-       return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
-}
-
-/*
- * This takes a head page as parameter and tells if the
- * tail page reference counting can be skipped.
- *
- * For this to be safe, PageSlab and PageHeadHuge must remain true on
- * any given page where they return true here, until all tail pins
- * have been released.
- */
-static inline bool compound_tail_refcounted(struct page *page)
-{
-       VM_BUG_ON_PAGE(!PageHead(page), page);
-       return __compound_tail_refcounted(page);
-}
-
-static inline void get_huge_page_tail(struct page *page)
-{
-       /*
-        * __split_huge_page_refcount() cannot run from under us.
-        */
-       VM_BUG_ON_PAGE(!PageTail(page), page);
-       VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
-       VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
-       if (compound_tail_refcounted(page->first_page))
-               atomic_inc(&page->_mapcount);
-}
-
-extern bool __get_page_tail(struct page *page);
-
 static inline void get_page(struct page *page)
 {
-       if (unlikely(PageTail(page)))
-               if (likely(__get_page_tail(page)))
-                       return;
+       page = compound_head(page);
        /*
         * Getting a normal page or the head of a compound page
         * requires to already have an elevated page->_count.
@@ -537,13 +453,7 @@ static inline struct page *virt_to_head_page(const void *x)
 {
        struct page *page = virt_to_page(x);
 
-       /*
-        * We don't need to worry about synchronization of tail flag
-        * when we call virt_to_head_page() since it is only called for
-        * already allocated page and this page won't be freed until
-        * this virt_to_head_page() is finished. So use _fast variant.
-        */
-       return compound_head_fast(page);
+       return compound_head(page);
 }
 
 /*
@@ -555,7 +465,15 @@ static inline void init_page_count(struct page *page)
        atomic_set(&page->_count, 1);
 }
 
-void put_page(struct page *page);
+void __put_page(struct page *page);
+
+static inline void put_page(struct page *page)
+{
+       page = compound_head(page);
+       if (put_page_testzero(page))
+               __put_page(page);
+}
+
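/*
 * Illustrative sketch, not part of the patch: with tail-page refcounting
 * gone, pinning any sub-page of a compound page simply elevates the head
 * page's _count.  The helper name is hypothetical.
 */
static inline void example_pin_and_release(struct page *subpage)
{
        get_page(subpage);      /* takes a reference on compound_head(subpage) */
        /* ... use the memory behind subpage ... */
        put_page(subpage);      /* drops it; may free the whole compound page */
}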
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
@@ -564,32 +482,51 @@ int split_free_page(struct page *page);
 /*
  * Compound pages have a destructor function.  Provide a
  * prototype for that function and accessor functions.
- * These are _only_ valid on the head of a PG_compound page.
+ * These are _only_ valid on the head of a compound page.
  */
+typedef void compound_page_dtor(struct page *);
+
+/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
+enum compound_dtor_id {
+       NULL_COMPOUND_DTOR,
+       COMPOUND_PAGE_DTOR,
+#ifdef CONFIG_HUGETLB_PAGE
+       HUGETLB_PAGE_DTOR,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       TRANSHUGE_PAGE_DTOR,
+#endif
+       NR_COMPOUND_DTORS,
+};
+extern compound_page_dtor * const compound_page_dtors[];
 
 static inline void set_compound_page_dtor(struct page *page,
-                                               compound_page_dtor *dtor)
+               enum compound_dtor_id compound_dtor)
 {
-       page[1].compound_dtor = dtor;
+       VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
+       page[1].compound_dtor = compound_dtor;
 }
 
 static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
 {
-       return page[1].compound_dtor;
+       VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
+       return compound_page_dtors[page[1].compound_dtor];
 }
 
-static inline int compound_order(struct page *page)
+static inline unsigned int compound_order(struct page *page)
 {
        if (!PageHead(page))
                return 0;
        return page[1].compound_order;
 }
 
-static inline void set_compound_order(struct page *page, unsigned long order)
+static inline void set_compound_order(struct page *page, unsigned int order)
 {
        page[1].compound_order = order;
 }
 
+void free_compound_page(struct page *page);
+
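/*
 * Illustrative sketch, not part of the patch: destructors are now selected
 * by enum id and looked up in compound_page_dtors[] on the free path.  The
 * surrounding function is hypothetical.
 */
static void example_prep_and_destroy(struct page *head, unsigned int order)
{
        set_compound_order(head, order);
        set_compound_page_dtor(head, COMPOUND_PAGE_DTOR);
        /* ... later, when the last reference is dropped ... */
        get_compound_page_dtor(head)(head);     /* resolves to free_compound_page() */
}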
 #ifdef CONFIG_MMU
 /*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
@@ -1006,10 +943,21 @@ static inline pgoff_t page_file_index(struct page *page)
 
 /*
  * Return true if this page is mapped into pagetables.
+ * For compound pages it returns true if any sub-page of the compound page is mapped.
  */
-static inline int page_mapped(struct page *page)
-{
-       return atomic_read(&(page)->_mapcount) >= 0;
+static inline bool page_mapped(struct page *page)
+{
+       int i;
+       if (likely(!PageCompound(page)))
+               return atomic_read(&page->_mapcount) >= 0;
+       page = compound_head(page);
+       if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+               return true;
+       for (i = 0; i < hpage_nr_pages(page); i++) {
+               if (atomic_read(&page[i]._mapcount) >= 0)
+                       return true;
+       }
+       return false;
 }
 
 /*
@@ -1568,8 +1516,7 @@ static inline bool ptlock_init(struct page *page)
         * with 0. Make sure nobody took it in use in between.
         *
         * It can happen if an arch tries to use slab for page table allocation:
-        * slab code uses page->slab_cache and page->first_page (for tail
-        * pages), which share storage with page->ptl.
+        * slab code uses page->slab_cache, which shares storage with page->ptl.
         */
        VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
        if (!ptlock_alloc(page))
@@ -1606,8 +1553,10 @@ static inline void pgtable_init(void)
 
 static inline bool pgtable_page_ctor(struct page *page)
 {
+       if (!ptlock_init(page))
+               return false;
        inc_zone_page_state(page, NR_PAGETABLE);
-       return ptlock_init(page);
+       return true;
 }
 
 static inline void pgtable_page_dtor(struct page *page)
@@ -1837,7 +1786,8 @@ extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
 
 extern __printf(3, 4)
-void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
+void warn_alloc_failed(gfp_t gfp_mask, unsigned int order,
+               const char *fmt, ...);
 
 extern void setup_per_cpu_pageset(void);
 
@@ -2036,8 +1986,6 @@ void page_cache_async_readahead(struct address_space *mapping,
                                pgoff_t offset,
                                unsigned long size);
 
-unsigned long max_sane_readahead(unsigned long nr);
-
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
 
@@ -2137,6 +2085,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_NUMA      0x200   /* force NUMA hinting page fault */
 #define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
 #define FOLL_TRIED     0x800   /* a retry, previous pass started an IO */
+#define FOLL_MLOCK     0x1000  /* lock present pages */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                        void *data);
index 3d6baa7d4534c68918dd7181e82674ade0b8ed98..5e37e918c614b817b5f592e78284e696082b5809 100644 (file)
@@ -28,8 +28,6 @@ struct mem_cgroup;
                IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
 #define ALLOC_SPLIT_PTLOCKS    (SPINLOCK_SIZE > BITS_PER_LONG/8)
 
-typedef void compound_page_dtor(struct page *);
-
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -56,6 +54,8 @@ struct page {
                                                 * see PAGE_MAPPING_ANON below.
                                                 */
                void *s_mem;                    /* slab first object */
+               atomic_t compound_mapcount;     /* first tail page */
+               /* page_deferred_list().next     -- second tail page */
        };
 
        /* Second double word */
@@ -63,6 +63,7 @@ struct page {
                union {
                        pgoff_t index;          /* Our offset within mapping. */
                        void *freelist;         /* sl[aou]b first free object */
+                       /* page_deferred_list().prev    -- second tail page */
                };
 
                union {
@@ -83,20 +84,9 @@ struct page {
 
                                union {
                                        /*
-                                        * Count of ptes mapped in
-                                        * mms, to show when page is
-                                        * mapped & limit reverse map
-                                        * searches.
-                                        *
-                                        * Used also for tail pages
-                                        * refcounting instead of
-                                        * _count. Tail pages cannot
-                                        * be mapped and keeping the
-                                        * tail page _count zero at
-                                        * all times guarantees
-                                        * get_page_unless_zero() will
-                                        * never succeed on tail
-                                        * pages.
+                                        * Count of ptes mapped in mms, to show
+                                        * when page is mapped & limit reverse
+                                        * map searches.
                                         */
                                        atomic_t _mapcount;
 
@@ -113,7 +103,13 @@ struct page {
                };
        };
 
-       /* Third double word block */
+       /*
+        * Third double word block
+        *
+        * WARNING: bit 0 of the first word encodes PageTail(). That means
+        * the other users of this storage space MUST NOT set bit 0, to avoid
+        * collisions and false-positive PageTail() results.
+        */
        union {
                struct list_head lru;   /* Pageout list, eg. active_list
                                         * protected by zone->lru_lock !
@@ -131,18 +127,37 @@ struct page {
 #endif
                };
 
-               struct slab *slab_page; /* slab fields */
                struct rcu_head rcu_head;       /* Used by SLAB
                                                 * when destroying via RCU
                                                 */
-               /* First tail page of compound page */
+               /* Tail pages of compound page */
                struct {
-                       compound_page_dtor *compound_dtor;
-                       unsigned long compound_order;
+                       unsigned long compound_head; /* If bit zero is set */
+
+                       /* First tail page only */
+#ifdef CONFIG_64BIT
+                       /*
+                        * On 64 bit systems we have enough space in struct page
+                        * to encode compound_dtor and compound_order with
+                        * unsigned int. It can help the compiler generate better
+                        * or smaller code on some architectures.
+                        */
+                       unsigned int compound_dtor;
+                       unsigned int compound_order;
+#else
+                       unsigned short int compound_dtor;
+                       unsigned short int compound_order;
+#endif
                };
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
-               pgtable_t pmd_huge_pte; /* protected by page->ptl */
+               struct {
+                       unsigned long __pad;    /* do not overlay pmd_huge_pte
+                                                * with compound_head to avoid
+                                                * possible bit 0 collision.
+                                                */
+                       pgtable_t pmd_huge_pte; /* protected by page->ptl */
+               };
 #endif
        };
 
@@ -163,7 +178,6 @@ struct page {
 #endif
 #endif
                struct kmem_cache *slab_cache;  /* SL[AU]B: Pointer to slab */
-               struct page *first_page;        /* Compound tail pages */
        };
 
 #ifdef CONFIG_MEMCG
@@ -486,6 +500,9 @@ struct mm_struct {
        /* address of the bounds directory */
        void __user *bd_addr;
 #endif
+#ifdef CONFIG_HUGETLB_PAGE
+       atomic_long_t hugetlb_usage;
+#endif
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
index 877ef226f90fb3b945831c7e8f9edbabba1e9775..c447d8055e50e34033036ce0277379ac7abd55c8 100644 (file)
@@ -55,4 +55,10 @@ void dump_mm(const struct mm_struct *mm);
 #define VIRTUAL_BUG_ON(cond) do { } while (0)
 #endif
 
+#ifdef CONFIG_DEBUG_VM_PGFLAGS
+#define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page)
+#else
+#define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond)
+#endif
+
 #endif
index d943477372928c13a586cbcd78472a7e29bf65c9..e23a9e704536278dad66bc5e5d1f9f798036b8be 100644 (file)
 
 enum {
        MIGRATE_UNMOVABLE,
-       MIGRATE_RECLAIMABLE,
        MIGRATE_MOVABLE,
+       MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
-       MIGRATE_RESERVE = MIGRATE_PCPTYPES,
+       MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
 #ifdef CONFIG_CMA
        /*
         * MIGRATE_CMA migration type is designed to mimic the way
@@ -334,13 +334,16 @@ struct zone {
        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long watermark[NR_WMARK];
 
+       unsigned long nr_reserved_highatomic;
+
        /*
-        * We don't know if the memory that we're going to allocate will be freeable
-        * or/and it will be released eventually, so to avoid totally wasting several
-        * GB of ram we must reserve some of the lower zone memory (otherwise we risk
-        * to run OOM on the lower zones despite there's tons of freeable ram
-        * on the higher zones). This array is recalculated at runtime if the
-        * sysctl_lowmem_reserve_ratio sysctl changes.
+        * We don't know if the memory that we're going to allocate will be
+        * freeable or/and it will be released eventually, so to avoid totally
+        * wasting several GB of ram we must reserve some of the lower zone
+        * memory (otherwise we risk to run OOM on the lower zones despite
+        * there being tons of freeable ram on the higher zones).  This array is
+        * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
+        * changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];
 
@@ -429,12 +432,6 @@ struct zone {
 
        const char              *name;
 
-       /*
-        * Number of MIGRATE_RESERVE page block. To maintain for just
-        * optimization. Protected by zone->lock.
-        */
-       int                     nr_migrate_reserve_block;
-
 #ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblock. It is used to solve incorrect
@@ -589,75 +586,8 @@ static inline bool zone_is_empty(struct zone *zone)
  * [1] : No fallback (__GFP_THISNODE)
  */
 #define MAX_ZONELISTS 2
-
-
-/*
- * We cache key information from each zonelist for smaller cache
- * footprint when scanning for free pages in get_page_from_freelist().
- *
- * 1) The BITMAP fullzones tracks which zones in a zonelist have come
- *    up short of free memory since the last time (last_fullzone_zap)
- *    we zero'd fullzones.
- * 2) The array z_to_n[] maps each zone in the zonelist to its node
- *    id, so that we can efficiently evaluate whether that node is
- *    set in the current tasks mems_allowed.
- *
- * Both fullzones and z_to_n[] are one-to-one with the zonelist,
- * indexed by a zones offset in the zonelist zones[] array.
- *
- * The get_page_from_freelist() routine does two scans.  During the
- * first scan, we skip zones whose corresponding bit in 'fullzones'
- * is set or whose corresponding node in current->mems_allowed (which
- * comes from cpusets) is not set.  During the second scan, we bypass
- * this zonelist_cache, to ensure we look methodically at each zone.
- *
- * Once per second, we zero out (zap) fullzones, forcing us to
- * reconsider nodes that might have regained more free memory.
- * The field last_full_zap is the time we last zapped fullzones.
- *
- * This mechanism reduces the amount of time we waste repeatedly
- * reexaming zones for free memory when they just came up low on
- * memory momentarilly ago.
- *
- * The zonelist_cache struct members logically belong in struct
- * zonelist.  However, the mempolicy zonelists constructed for
- * MPOL_BIND are intentionally variable length (and usually much
- * shorter).  A general purpose mechanism for handling structs with
- * multiple variable length members is more mechanism than we want
- * here.  We resort to some special case hackery instead.
- *
- * The MPOL_BIND zonelists don't need this zonelist_cache (in good
- * part because they are shorter), so we put the fixed length stuff
- * at the front of the zonelist struct, ending in a variable length
- * zones[], as is needed by MPOL_BIND.
- *
- * Then we put the optional zonelist cache on the end of the zonelist
- * struct.  This optional stuff is found by a 'zlcache_ptr' pointer in
- * the fixed length portion at the front of the struct.  This pointer
- * both enables us to find the zonelist cache, and in the case of
- * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL)
- * to know that the zonelist cache is not there.
- *
- * The end result is that struct zonelists come in two flavors:
- *  1) The full, fixed length version, shown below, and
- *  2) The custom zonelists for MPOL_BIND.
- * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
- *
- * Even though there may be multiple CPU cores on a node modifying
- * fullzones or last_full_zap in the same zonelist_cache at the same
- * time, we don't lock it.  This is just hint data - if it is wrong now
- * and then, the allocator will still function, perhaps a bit slower.
- */
-
-
-struct zonelist_cache {
-       unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];          /* zone->nid */
-       DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);      /* zone full? */
-       unsigned long last_full_zap;            /* when last zap'd (jiffies) */
-};
 #else
 #define MAX_ZONELISTS 1
-struct zonelist_cache;
 #endif
 
 /*
@@ -675,9 +605,6 @@ struct zoneref {
  * allocation, the other zones are fallback zones, in decreasing
  * priority.
  *
- * If zlcache_ptr is not NULL, then it is just the address of zlcache,
- * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
- * *
  * To speed the reading of the zonelist, the zonerefs contain the zone index
  * of the entry being read. Helper functions to access information given
  * a struct zoneref are
@@ -687,11 +614,7 @@ struct zoneref {
  * zonelist_node_idx() - Return the index of the node for an entry
  */
 struct zonelist {
-       struct zonelist_cache *zlcache_ptr;                  // NULL or &zlcache
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
-#ifdef CONFIG_NUMA
-       struct zonelist_cache zlcache;                       // optional ...
-#endif
 };
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -817,14 +740,13 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int classzone_idx, int alloc_flags);
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-               unsigned long mark, int classzone_idx, int alloc_flags);
+               unsigned long mark, int classzone_idx);
 enum memmap_context {
        MEMMAP_EARLY,
        MEMMAP_HOTPLUG,
 };
 extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
-                                    unsigned long size,
-                                    enum memmap_context context);
+                                    unsigned long size);
 
 extern void lruvec_init(struct lruvec *lruvec);
 
index c12f2147c350593fb827b7b29aebb33e8f9f0b4a..52666d90ca9452251c539383ecc7b038ba9afbea 100644 (file)
@@ -386,6 +386,7 @@ extern int param_get_ullong(char *buffer, const struct kernel_param *kp);
 extern const struct kernel_param_ops param_ops_charp;
 extern int param_set_charp(const char *val, const struct kernel_param *kp);
 extern int param_get_charp(char *buffer, const struct kernel_param *kp);
+extern void param_free_charp(void *arg);
 #define param_check_charp(name, p) __param_check(name, p, char *)
 
 /* We used to allow int as well as bool.  We're taking that away! */
index 78488e099ce7a4263dff9f9a08fa2e39f60f0e49..7ec5b86735f33d512a27086635ac15a221039101 100644 (file)
@@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long *watchdog_cpumask_bits;
 extern int sysctl_softlockup_all_cpu_backtrace;
+extern int sysctl_hardlockup_all_cpu_backtrace;
 struct ctl_table;
 extern int proc_watchdog(struct ctl_table *, int ,
                         void __user *, size_t *, loff_t *);
index 416509e26d6d16bfa0f75ef793b7e32e6b5fb090..72356fbc3f2dff03e32e70f531e6da3b26d471d8 100644 (file)
@@ -86,12 +86,7 @@ enum pageflags {
        PG_private,             /* If pagecache, has fs-private data */
        PG_private_2,           /* If pagecache, has fs aux data */
        PG_writeback,           /* Page is under writeback */
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
        PG_head,                /* A head page */
-       PG_tail,                /* A tail page */
-#else
-       PG_compound,            /* A compound page */
-#endif
        PG_swapcache,           /* Swap page: swp_entry_t in private */
        PG_mappedtodisk,        /* Has blocks allocated on-disk */
        PG_reclaim,             /* To be reclaimed asap */
@@ -106,9 +101,6 @@ enum pageflags {
 #ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,            /* hardware poisoned page. Don't touch */
 #endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       PG_compound_lock,
-#endif
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
        PG_young,
        PG_idle,
@@ -134,53 +126,108 @@ enum pageflags {
 
        /* SLOB */
        PG_slob_free = PG_private,
+
+       /* Compound pages. Stored in first tail page's flags */
+       PG_double_map = PG_private_2,
 };
 
 #ifndef __GENERATING_BOUNDS_H
 
+struct page;   /* forward declaration */
+
+static inline struct page *compound_head(struct page *page)
+{
+       unsigned long head = READ_ONCE(page->compound_head);
+
+       if (unlikely(head & 1))
+               return (struct page *) (head - 1);
+       return page;
+}
+
+static inline int PageTail(struct page *page)
+{
+       return READ_ONCE(page->compound_head) & 1;
+}
+
+static inline int PageCompound(struct page *page)
+{
+       return test_bit(PG_head, &page->flags) || PageTail(page);
+}
+
+/*
+ * Page flags policies wrt compound pages
+ *
+ * PF_ANY:
+ *     the page flag is relevant for small, head and tail pages.
+ *
+ * PF_HEAD:
+ *     for compound pages all operations related to the page flag are
+ *     applied to the head page.
+ *
+ * PF_NO_TAIL:
+ *     modifications of the page flag must be done on small or head pages,
+ *     checks can be done on tail pages too.
+ *
+ * PF_NO_COMPOUND:
+ *     the page flag is not relevant for compound pages.
+ */
+#define PF_ANY(page, enforce)  page
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({                                   \
+               VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);     \
+               compound_head(page);})
+#define PF_NO_COMPOUND(page, enforce) ({                               \
+               VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
+               page;})
+
 /*
  * Macros to create function definitions for page flags
  */
-#define TESTPAGEFLAG(uname, lname)                                     \
-static inline int Page##uname(const struct page *page)                 \
-                       { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy)                             \
+static inline int Page##uname(struct page *page)                       \
+       { return test_bit(PG_##lname, &policy(page, 0)->flags); }
 
-#define SETPAGEFLAG(uname, lname)                                      \
+#define SETPAGEFLAG(uname, lname, policy)                              \
 static inline void SetPage##uname(struct page *page)                   \
-                       { set_bit(PG_##lname, &page->flags); }
+       { set_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define CLEARPAGEFLAG(uname, lname)                                    \
+#define CLEARPAGEFLAG(uname, lname, policy)                            \
 static inline void ClearPage##uname(struct page *page)                 \
-                       { clear_bit(PG_##lname, &page->flags); }
+       { clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define __SETPAGEFLAG(uname, lname)                                    \
+#define __SETPAGEFLAG(uname, lname, policy)                            \
 static inline void __SetPage##uname(struct page *page)                 \
-                       { __set_bit(PG_##lname, &page->flags); }
+       { __set_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define __CLEARPAGEFLAG(uname, lname)                                  \
+#define __CLEARPAGEFLAG(uname, lname, policy)                          \
 static inline void __ClearPage##uname(struct page *page)               \
-                       { __clear_bit(PG_##lname, &page->flags); }
+       { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define TESTSETFLAG(uname, lname)                                      \
+#define TESTSETFLAG(uname, lname, policy)                              \
 static inline int TestSetPage##uname(struct page *page)                        \
-               { return test_and_set_bit(PG_##lname, &page->flags); }
+       { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define TESTCLEARFLAG(uname, lname)                                    \
+#define TESTCLEARFLAG(uname, lname, policy)                            \
 static inline int TestClearPage##uname(struct page *page)              \
-               { return test_and_clear_bit(PG_##lname, &page->flags); }
+       { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define __TESTCLEARFLAG(uname, lname)                                  \
+#define __TESTCLEARFLAG(uname, lname, policy)                          \
 static inline int __TestClearPage##uname(struct page *page)            \
-               { return __test_and_clear_bit(PG_##lname, &page->flags); }
+       { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
 
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)              \
-       SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy)                                 \
+       TESTPAGEFLAG(uname, lname, policy)                              \
+       SETPAGEFLAG(uname, lname, policy)                               \
+       CLEARPAGEFLAG(uname, lname, policy)
 
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)            \
-       __SETPAGEFLAG(uname, lname)  __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy)                               \
+       TESTPAGEFLAG(uname, lname, policy)                              \
+       __SETPAGEFLAG(uname, lname, policy)                             \
+       __CLEARPAGEFLAG(uname, lname, policy)
 
-#define TESTSCFLAG(uname, lname)                                       \
-       TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy)                               \
+       TESTSETFLAG(uname, lname, policy)                               \
+       TESTCLEARFLAG(uname, lname, policy)
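/*
 * Illustrative sketch, not part of the patch: with the PF_HEAD policy,
 * PAGEFLAG(Dirty, dirty, PF_HEAD) expands to roughly the following, so flag
 * operations on a tail page are transparently redirected to its head.  The
 * "_example" names avoid clashing with the generated ones.
 */
static inline int PageDirty_example(struct page *page)
        { return test_bit(PG_dirty, &compound_head(page)->flags); }
static inline void SetPageDirty_example(struct page *page)
        { set_bit(PG_dirty, &compound_head(page)->flags); }
static inline void ClearPageDirty_example(struct page *page)
        { clear_bit(PG_dirty, &compound_head(page)->flags); }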
 
 #define TESTPAGEFLAG_FALSE(uname)                                      \
 static inline int Page##uname(const struct page *page) { return 0; }
@@ -209,84 +256,96 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
 #define TESTSCFLAG_FALSE(uname)                                                \
        TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
 
-struct page;   /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
-       __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
-       TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked)             /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)    /* Xen */
-PAGEFLAG(SavePinned, savepinned);                      /* Xen */
-PAGEFLAG(Foreign, foreign);                            /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
-       __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+       TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+       __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+       __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+       TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND)        /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
+       TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND);
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND);
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+       __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+       __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+       __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
 
 /*
  * Private page markings that may be used by the filesystem that owns the page
  * for its own purposes.
  * - PG_private and PG_private_2 cause releasepage() and co to be invoked
  */
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
-       __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+       __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+       TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
 
 /*
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
  */
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+       TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
 
 /* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+       TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+       TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
 
 #ifdef CONFIG_HIGHMEM
 /*
  * Must use a macro here due to header dependency issues. page_zone() is not
  * available at this point.
  */
-#define PageHighMem(__p) is_highmem(page_zone(__p))
+#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
 #else
 PAGEFLAG_FALSE(HighMem)
 #endif
 
 #ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
 #else
 PAGEFLAG_FALSE(SwapCache)
 #endif
 
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
-       TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+       __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+       TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
 
 #ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
-       TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+       __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+       TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
+       __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL)
 #else
 PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
        TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
 #else
 PAGEFLAG_FALSE(Uncached)
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define __PG_HWPOISON (1UL << PG_hwpoison)
 #else
 PAGEFLAG_FALSE(HWPoison)
@@ -294,10 +353,10 @@ PAGEFLAG_FALSE(HWPoison)
 #endif
 
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
-TESTPAGEFLAG(Young, young)
-SETPAGEFLAG(Young, young)
-TESTCLEARFLAG(Young, young)
-PAGEFLAG(Idle, idle)
+TESTPAGEFLAG(Young, young, PF_ANY)
+SETPAGEFLAG(Young, young, PF_ANY)
+TESTCLEARFLAG(Young, young, PF_ANY)
+PAGEFLAG(Idle, idle, PF_ANY)
 #endif
 
 /*
@@ -322,6 +381,7 @@ PAGEFLAG(Idle, idle)
 
 static inline int PageAnon(struct page *page)
 {
+       page = compound_head(page);
        return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
 }
 
@@ -334,6 +394,7 @@ static inline int PageAnon(struct page *page)
  */
 static inline int PageKsm(struct page *page)
 {
+       page = compound_head(page);
        return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
                                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
 }
@@ -345,8 +406,9 @@ u64 stable_page_flags(struct page *page);
 
 static inline int PageUptodate(struct page *page)
 {
-       int ret = test_bit(PG_uptodate, &(page)->flags);
-
+       int ret;
+       page = compound_head(page);
+       ret = test_bit(PG_uptodate, &(page)->flags);
        /*
         * Must ensure that the data we read out of the page is loaded
         * _after_ we've loaded page->flags to check for PageUptodate.
@@ -363,22 +425,24 @@ static inline int PageUptodate(struct page *page)
 
 static inline void __SetPageUptodate(struct page *page)
 {
+       VM_BUG_ON_PAGE(PageTail(page), page);
        smp_wmb();
-       __set_bit(PG_uptodate, &(page)->flags);
+       __set_bit(PG_uptodate, &page->flags);
 }
 
 static inline void SetPageUptodate(struct page *page)
 {
+       VM_BUG_ON_PAGE(PageTail(page), page);
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
         * so that all previous stores issued in order to bring the page
         * uptodate are actually visible before PageUptodate becomes true.
         */
        smp_wmb();
-       set_bit(PG_uptodate, &(page)->flags);
+       set_bit(PG_uptodate, &page->flags);
 }
 
-CLEARPAGEFLAG(Uptodate, uptodate)
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
 
 int test_clear_page_writeback(struct page *page);
 int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -398,85 +462,27 @@ static inline void set_page_writeback_keepwrite(struct page *page)
        test_set_page_writeback_keepwrite(page);
 }
 
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
-/*
- * System with lots of page flags available. This allows separate
- * flags for PageHead() and PageTail() checks of compound pages so that bit
- * tests can be used in performance sensitive paths. PageCompound is
- * generally not used in hot code paths except arch/powerpc/mm/init_64.c
- * and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages
- * and avoid handling those in real mode.
- */
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-__PAGEFLAG(Tail, tail)
-
-static inline int PageCompound(struct page *page)
-{
-       return page->flags & ((1L << PG_head) | (1L << PG_tail));
-
-}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline void ClearPageCompound(struct page *page)
-{
-       BUG_ON(!PageHead(page));
-       ClearPageHead(page);
-}
-#endif
-
-#define PG_head_mask ((1L << PG_head))
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
 
-#else
-/*
- * Reduce page flag use as much as possible by overlapping
- * compound page flags with the flags used for page cache pages. Possible
- * because PageCompound is always set for compound pages and not for
- * pages on the LRU and/or pagecache.
- */
-TESTPAGEFLAG(Compound, compound)
-__SETPAGEFLAG(Head, compound)  __CLEARPAGEFLAG(Head, compound)
-
-/*
- * PG_reclaim is used in combination with PG_compound to mark the
- * head and tail of a compound page. This saves one page flag
- * but makes it impossible to use compound pages for the page cache.
- * The PG_reclaim bit would have to be used for reclaim or readahead
- * if compound pages enter the page cache.
- *
- * PG_compound & PG_reclaim    => Tail page
- * PG_compound & ~PG_reclaim   => Head page
- */
-#define PG_head_mask ((1L << PG_compound))
-#define PG_head_tail_mask ((1L << PG_compound) | (1L << PG_reclaim))
-
-static inline int PageHead(struct page *page)
-{
-       return ((page->flags & PG_head_tail_mask) == PG_head_mask);
-}
-
-static inline int PageTail(struct page *page)
+static inline void set_compound_head(struct page *page, struct page *head)
 {
-       return ((page->flags & PG_head_tail_mask) == PG_head_tail_mask);
+       WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
 }
 
-static inline void __SetPageTail(struct page *page)
+static inline void clear_compound_head(struct page *page)
 {
-       page->flags |= PG_head_tail_mask;
-}
-
-static inline void __ClearPageTail(struct page *page)
-{
-       page->flags &= ~PG_head_tail_mask;
+       WRITE_ONCE(page->compound_head, 0);
 }
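/*
 * Illustrative sketch, not part of the patch: for a tail page,
 * ->compound_head holds the head page pointer with bit 0 set, which is what
 * PageTail() and compound_head() test.  The helper is hypothetical.
 */
static inline void example_tail_encoding(struct page *tail, struct page *head)
{
        set_compound_head(tail, head);          /* stores (unsigned long)head + 1 */
        VM_BUG_ON_PAGE(!PageTail(tail), tail);
        VM_BUG_ON_PAGE(compound_head(tail) != head, tail);
        clear_compound_head(tail);              /* bit 0 clear: a plain page again */
}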
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline void ClearPageCompound(struct page *page)
 {
-       BUG_ON((page->flags & PG_head_tail_mask) != (1 << PG_compound));
-       clear_bit(PG_compound, &page->flags);
+       BUG_ON(!PageHead(page));
+       ClearPageHead(page);
 }
 #endif
 
-#endif /* !PAGEFLAGS_EXTENDED */
+#define PG_head_mask ((1L << PG_head))
 
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
@@ -528,22 +534,44 @@ static inline int PageTransTail(struct page *page)
        return PageTail(page);
 }
 
-#else
-
-static inline int PageTransHuge(struct page *page)
+/*
+ * PageDoubleMap indicates that the compound page is mapped with PTEs as well
+ * as PMDs.
+ *
+ * This is required to optimize rmap operations for THP: we can postpone
+ * per-small-page mapcount accounting (and its overhead from atomic operations)
+ * until the first PMD split.
+ *
+ * For such a page, PageDoubleMap means ->_mapcount in all sub-pages is offset
+ * up by one. This reference goes away with the last compound_mapcount.
+ *
+ * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap().
+ */
+static inline int PageDoubleMap(struct page *page)
 {
-       return 0;
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+       return test_bit(PG_double_map, &page[1].flags);
 }
 
-static inline int PageTransCompound(struct page *page)
+static inline int TestSetPageDoubleMap(struct page *page)
 {
-       return 0;
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+       return test_and_set_bit(PG_double_map, &page[1].flags);
 }
 
-static inline int PageTransTail(struct page *page)
+static inline int TestClearPageDoubleMap(struct page *page)
 {
-       return 0;
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+       return test_and_clear_bit(PG_double_map, &page[1].flags);
 }
+
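/*
 * Illustrative sketch, not part of the patch: the first PMD split could use
 * the test-and-set form to perform the one-time mapcount adjustment the
 * comment above describes.  The helper name and loop are assumptions.
 */
static inline void example_first_pmd_split(struct page *head, int nr_subpages)
{
        int i;

        if (!TestSetPageDoubleMap(head)) {
                /* first PTE mapping of this THP: offset sub-page mapcounts */
                for (i = 0; i < nr_subpages; i++)
                        atomic_inc(&head[i]._mapcount);
        }
}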
+#else
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
+TESTPAGEFLAG_FALSE(DoubleMap)
+       TESTSETFLAG_FALSE(DoubleMap)
+       TESTCLEARFLAG_FALSE(DoubleMap)
 #endif
 
 /*
@@ -627,12 +655,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
 #define __PG_MLOCKED           0
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define __PG_COMPOUND_LOCK             (1 << PG_compound_lock)
-#else
-#define __PG_COMPOUND_LOCK             0
-#endif
-
 /*
  * Flags checked when a page is freed.  Pages being freed should not have
  * these flags set.  If they are, there is a problem.
@@ -642,8 +664,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
         1 << PG_private | 1 << PG_private_2 | \
         1 << PG_writeback | 1 << PG_reserved | \
         1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
-        1 << PG_unevictable | __PG_MLOCKED | \
-        __PG_COMPOUND_LOCK)
+        1 << PG_unevictable | __PG_MLOCKED)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
@@ -670,6 +691,10 @@ static inline int page_has_private(struct page *page)
        return !!(page->flags & PAGE_FLAGS_PRIVATE);
 }
 
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
 #endif /* !__GENERATING_BOUNDS_H */
 
 #endif /* PAGE_FLAGS_H */
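
For context on the new helpers above: set_compound_head() stores the head page pointer with bit 0 set, so a tail page can be recognised and resolved without a dedicated page flag. A minimal sketch of the decoding side; my_compound_head() is a hypothetical name used here only to illustrate the encoding this patch relies on, not the exact in-tree implementation:

	static inline struct page *my_compound_head(struct page *page)
	{
		unsigned long head = READ_ONCE(page->compound_head);

		if (head & 1)				/* bit 0 set => tail page */
			return (struct page *)(head - 1);	/* strip the tag to reach the head */
		return page;				/* head page or order-0 page */
	}
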
index 2baeee12f48ed1c29ba0a3fa1fea12294d4b60c2..e942558b3585f99e6d705bbe1c81c904c3f52658 100644 (file)
@@ -44,7 +44,7 @@ enum pageblock_bits {
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
 /* Huge page sizes are variable */
-extern int pageblock_order;
+extern unsigned int pageblock_order;
 
 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
index a6c78e00ea9684a784938ed39229c2018ffd8e75..4d08b6c33557250edda8e949ff64f5f2fd1ffdc1 100644 (file)
@@ -69,6 +69,13 @@ static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
        return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
 }
 
+/* Restricts the given gfp_mask to what the mapping allows. */
+static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
+               gfp_t gfp_mask)
+{
+       return mapping_gfp_mask(mapping) & gfp_mask;
+}
+
 /*
  * This is non-atomic.  Only to be used before the mapping is activated.
  * Probably needs a barrier...
@@ -387,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
  */
 static inline pgoff_t page_to_pgoff(struct page *page)
 {
+       pgoff_t pgoff;
+
        if (unlikely(PageHeadHuge(page)))
                return page->index << compound_order(page);
-       else
+
+       if (likely(!PageTransTail(page)))
                return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+       /*
+        *  We don't initialize ->index for tail pages: calculate based on
+        *  head page
+        */
+       pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+       pgoff += page - compound_head(page);
+       return pgoff;
 }
 
 /*
@@ -426,18 +444,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                                unsigned int flags);
 extern void unlock_page(struct page *page);
 
-static inline void __set_page_locked(struct page *page)
-{
-       __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
-       __clear_bit(PG_locked, &page->flags);
-}
-
 static inline int trylock_page(struct page *page)
 {
+       page = compound_head(page);
        return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
 }
 
@@ -490,9 +499,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
 
 static inline int wait_on_page_locked_killable(struct page *page)
 {
-       if (PageLocked(page))
-               return wait_on_page_bit_killable(page, PG_locked);
-       return 0;
+       if (!PageLocked(page))
+               return 0;
+       return wait_on_page_bit_killable(compound_head(page), PG_locked);
 }
 
 extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -511,7 +520,7 @@ static inline void wake_up_page(struct page *page, int bit)
 static inline void wait_on_page_locked(struct page *page)
 {
        if (PageLocked(page))
-               wait_on_page_bit(page, PG_locked);
+               wait_on_page_bit(compound_head(page), PG_locked);
 }
 
 /* 
@@ -657,17 +666,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
  */
 static inline int add_to_page_cache(struct page *page,
                struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
 {
        int error;
 
-       __set_page_locked(page);
+       __SetPageLocked(page);
        error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
        if (unlikely(error))
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
        return error;
 }
 
index 317e16de09e508ed64b87dae6c5006b1efcb129d..deabe23b426d90c65bb0cdd10b69e69c66a05ba9 100644 (file)
  * Magic number "tsta" to indicate a static timer initializer
  * for the object debugging code.
  */
-#define TIMER_ENTRY_STATIC     ((void *) 0x74737461)
+#define TIMER_ENTRY_STATIC     ((void *) 0x300 + POISON_POINTER_DELTA)
 
 /********** mm/debug-pagealloc.c **********/
 #define PAGE_POISON 0xaa
 
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING   ((void *) 0x400 + POISON_POINTER_DELTA)
+
 /********** mm/slab.c **********/
 /*
  * Magic nums for obj red zoning.
@@ -73,6 +77,9 @@
 #define MUTEX_DEBUG_INIT       0x11
 #define MUTEX_DEBUG_FREE       0x22
 
+/********** lib/dma_debug.c **********/
+#define DMA_ALLOC_POISON       0xee
+
 /********** lib/flex_array.c **********/
 #define FLEX_ARRAY_FREE        0x6c    /* for use-after-free poisoning */
 
index 830c4992088d5806125577723c6a7c1272161689..a5aa7ae671f42d64727ebc2df1ba2950cebae66f 100644 (file)
@@ -101,13 +101,21 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent
        })
 
 /**
- * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
- * given type safe against removal of rb_node entry
+ * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
+ * given type allowing the backing memory of @pos to be invalidated
  *
  * @pos:       the 'type *' to use as a loop cursor.
  * @n:         another 'type *' to use as temporary storage
  * @root:      'rb_root *' of the rbtree.
  * @field:     the name of the rb_node field within 'type'.
+ *
+ * rbtree_postorder_for_each_entry_safe() provides a similar guarantee to
+ * list_for_each_entry_safe() and allows the iteration to continue
+ * independently of changes to @pos made by the body of the loop.
+ *
+ * Note, however, that it cannot handle other modifications that re-order the
+ * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
+ * rb_erase() may rebalance the tree, causing us to miss some nodes.
  */
 #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
        for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
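
The canonical use of this iterator is tearing down an entire tree while freeing each entry, which is exactly the case the updated comment describes. A small sketch; struct foo, its 'node' member, and the existing struct rb_root 'root' are hypothetical:

	struct foo {
		struct rb_node node;
		/* ... payload ... */
	};
	struct foo *pos, *n;

	/* Post-order visits children before their parent, and the _safe form
	 * tolerates kfree(pos) inside the loop body.
	 */
	rbtree_postorder_for_each_entry_safe(pos, n, &root, node)
		kfree(pos);
	root = RB_ROOT;
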
index 29446aeef36e553aa361774d39c0852517c87405..853f4f3c6742baa90f2b240ea4dc704320f4bd75 100644 (file)
@@ -85,6 +85,7 @@ enum ttu_flags {
        TTU_UNMAP = 1,                  /* unmap mode */
        TTU_MIGRATION = 2,              /* migration mode */
        TTU_MUNLOCK = 4,                /* munlock mode */
+       TTU_FREE = 8,                   /* free mode */
 
        TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
        TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
@@ -161,25 +162,31 @@ static inline void anon_vma_merge(struct vm_area_struct *vma,
 
 struct anon_vma *page_get_anon_vma(struct page *page);
 
+/* bitflags for do_page_add_anon_rmap() */
+#define RMAP_EXCLUSIVE 0x01
+#define RMAP_COMPOUND 0x02
+
 /*
  * rmap interfaces called when adding or removing pte of page
  */
 void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+               unsigned long, bool);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
                           unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+               unsigned long, bool);
 void page_add_file_rmap(struct page *);
-void page_remove_rmap(struct page *);
+void page_remove_rmap(struct page *, bool);
 
 void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
                            unsigned long);
 void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
                                unsigned long);
 
-static inline void page_dup_rmap(struct page *page)
+static inline void page_dup_rmap(struct page *page, bool compound)
 {
-       atomic_inc(&page->_mapcount);
+       atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
 }
 
 /*
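
The rmap prototypes above grow a 'compound' argument: callers pass true when adding or removing a PMD-level (compound) mapping and false for a small page. The uprobes hunk later in this patch shows the small-page form; it is repeated here as a readability sketch:

	/* New anonymous small page: compound == false. */
	page_add_new_anon_rmap(kpage, vma, addr, false);
	/* ... */
	/* Tearing down the old mapping, again as a small page. */
	page_remove_rmap(page, false);
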
index 4effb1025fbb1555bc9c3ce6f80d98db004271a2..4069febaa34af9e93be8bb98e807db04a67c4380 100644 (file)
@@ -384,6 +384,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
                                  void __user *buffer,
                                  size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
+extern unsigned int  hardlockup_panic;
 void lockup_detector_init(void);
 #else
 static inline void touch_softlockup_watchdog(void)
@@ -1460,7 +1461,9 @@ struct task_struct {
        unsigned sched_reset_on_fork:1;
        unsigned sched_contributes_to_load:1;
        unsigned sched_migrated:1;
-
+#ifdef CONFIG_MEMCG
+       unsigned memcg_may_oom:1;
+#endif
 #ifdef CONFIG_MEMCG_KMEM
        unsigned memcg_kmem_skip_account:1;
 #endif
@@ -1567,9 +1570,7 @@ struct task_struct {
 
        unsigned long sas_ss_sp;
        size_t sas_ss_size;
-       int (*notifier)(void *priv);
-       void *notifier_data;
-       sigset_t *notifier_mask;
+
        struct callback_head *task_works;
 
        struct audit_context *audit_context;
@@ -1791,12 +1792,12 @@ struct task_struct {
        unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
 #ifdef CONFIG_MEMCG
-       struct memcg_oom_info {
-               struct mem_cgroup *memcg;
-               gfp_t gfp_mask;
-               int order;
-               unsigned int may_oom:1;
-       } memcg_oom;
+       struct mem_cgroup *memcg_in_oom;
+       gfp_t memcg_oom_gfp_mask;
+       int memcg_oom_order;
+
+       /* number of pages to reclaim on returning to userland */
+       unsigned int memcg_nr_pages_over_high;
 #endif
 #ifdef CONFIG_UPROBES
        struct uprobe_task *utask;
@@ -2461,21 +2462,29 @@ extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
 
-static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+static inline int kernel_dequeue_signal(siginfo_t *info)
 {
-       unsigned long flags;
+       struct task_struct *tsk = current;
+       siginfo_t __info;
        int ret;
 
-       spin_lock_irqsave(&tsk->sighand->siglock, flags);
-       ret = dequeue_signal(tsk, mask, info);
-       spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+       spin_lock_irq(&tsk->sighand->siglock);
+       ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
+       spin_unlock_irq(&tsk->sighand->siglock);
 
        return ret;
 }
 
-extern void block_all_signals(int (*notifier)(void *priv), void *priv,
-                             sigset_t *mask);
-extern void unblock_all_signals(void);
+static inline void kernel_signal_stop(void)
+{
+       spin_lock_irq(&current->sighand->siglock);
+       if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+               __set_current_state(TASK_STOPPED);
+       spin_unlock_irq(&current->sighand->siglock);
+
+       schedule();
+}
+
 extern void release_task(struct task_struct * p);
 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
 extern int force_sigsegv(int, struct task_struct *);
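
The new helpers replace the removed dequeue_signal_lock()/block_all_signals() interfaces. A hedged sketch of a kernel thread draining a fatal signal with kernel_dequeue_signal(); the surrounding loop is hypothetical:

	siginfo_t info;

	allow_signal(SIGKILL);
	while (!kthread_should_stop()) {
		/* ... do work ... */
		if (signal_pending(current)) {
			/* Dequeues against current->blocked under siglock. */
			if (kernel_dequeue_signal(&info) == SIGKILL)
				break;
		}
	}
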
index 24f4dfd94c517b3b387682509180dee161e0912d..4355129fff91b6f188136af2a499d6100f2e5bfd 100644 (file)
@@ -1224,7 +1224,7 @@ static inline int skb_cloned(const struct sk_buff *skb)
 
 static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
 {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
 
        if (skb_cloned(skb))
                return pskb_expand_head(skb, 0, 0, pri);
@@ -1308,7 +1308,7 @@ static inline int skb_shared(const struct sk_buff *skb)
  */
 static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
 {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, pri);
 
@@ -1344,7 +1344,7 @@ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
 static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
                                          gfp_t pri)
 {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
        if (skb_cloned(skb)) {
                struct sk_buff *nskb = skb_copy(skb, pri);
 
index 7e37d448ed910854e1876682ac1038cebee93cd1..96940772bb92715b10028fde03ead671e6c3df56 100644 (file)
@@ -111,7 +111,7 @@ struct mem_cgroup;
  * struct kmem_cache related prototypes
  */
 void __init kmem_cache_init(void);
-int slab_is_available(void);
+bool slab_is_available(void);
 
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
                        unsigned long,
@@ -157,6 +157,24 @@ size_t ksize(const void *);
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 #endif
 
+/*
+ * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
+ * Intended for arches that get misalignment faults even for 64 bit integer
+ * aligned buffers.
+ */
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
+#endif
+
+/*
+ * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned
+ * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN
+ * aligned pointers.
+ */
+#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
+#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
+#define __assume_page_alignment __assume_aligned(PAGE_SIZE)
+
 /*
  * Kmalloc array related definitions
  */
@@ -286,8 +304,8 @@ static __always_inline int kmalloc_index(size_t size)
 }
 #endif /* !CONFIG_SLOB */
 
-void *__kmalloc(size_t size, gfp_t flags);
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment;
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment;
 void kmem_cache_free(struct kmem_cache *, void *);
 
 /*
@@ -301,8 +319,8 @@ void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
 bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
 
 #ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node);
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
 #else
 static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
@@ -316,12 +334,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
 #endif
 
 #ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment;
 
 #ifdef CONFIG_NUMA
 extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
                                           gfp_t gfpflags,
-                                          int node, size_t size);
+                                          int node, size_t size) __assume_slab_alignment;
 #else
 static __always_inline void *
 kmem_cache_alloc_node_trace(struct kmem_cache *s,
@@ -354,10 +372,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
 }
 #endif /* CONFIG_TRACING */
 
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
 
 #ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
 #else
 static __always_inline void *
 kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
@@ -482,15 +500,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
        return __kmalloc_node(size, flags, node);
 }
 
-/*
- * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
- * Intended for arches that get misalignment faults even for 64 bit integer
- * aligned buffers.
- */
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
-
 struct memcg_cache_array {
        struct rcu_head rcu;
        struct kmem_cache *entries[0];
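
The new __assume_kmalloc_alignment/__assume_slab_alignment/__assume_page_alignment annotations tell the compiler what alignment the allocators' return values have, so callers can be optimised (for example, copies without runtime alignment fix-ups). A sketch of what the underlying macro presumably expands to; the exact definition lives in the compiler headers and is not shown in this hunk, so treat this as an assumption modelled on GCC's assume_aligned attribute:

	#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9))
	#define __assume_aligned(a, ...) \
		__attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
	#else
	#define __assume_aligned(a, ...)
	#endif
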
index 9ef7795e65e40c5dfbee53726909fbcb2ce341b0..cc77d6477fc23e95e8fe9f344e680b2d654c71ef 100644 (file)
@@ -121,6 +121,7 @@ extern void kfree_const(const void *x);
 extern char *kstrdup(const char *s, gfp_t gfp);
 extern const char *kstrdup_const(const char *s, gfp_t gfp);
 extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
 extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
 
 extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
index 7ba7dccaf0e7e1291b3489c9ece30318cb44fc6f..d08feef3d047a69847efa6c92c223de8f314a0ab 100644 (file)
@@ -154,7 +154,7 @@ enum {
        SWP_SCANNING    = (1 << 10),    /* refcount in scan_swap_map */
 };
 
-#define SWAP_CLUSTER_MAX 32UL
+#define SWAP_CLUSTER_MAX 256UL
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
 /*
@@ -308,6 +308,7 @@ extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
 extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
@@ -539,7 +540,8 @@ static inline int swp_swapcount(swp_entry_t entry)
        return 0;
 }
 
-#define reuse_swap_page(page)  (page_mapcount(page) == 1)
+#define reuse_swap_page(page) \
+       (!PageTransCompound(page) && page_mapcount(page) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
index a460e2ef28437237d2b4bc2d09486b794290f21d..a156b82dd14cbad0b6f2867d07fc883fbc685c1e 100644 (file)
@@ -887,4 +887,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
 
 asmlinkage long sys_membarrier(int cmd, int flags);
 
+asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
+
 #endif
index 84d497297c5f44af0bcc1ace0a0584092d77ab92..26c152122a424dc337764357db5bb297a40074fd 100644 (file)
@@ -50,6 +50,7 @@
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/task_work.h>
+#include <linux/memcontrol.h>
 struct linux_binprm;
 
 /*
@@ -188,6 +189,8 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
        smp_mb__after_atomic();
        if (unlikely(current->task_works))
                task_work_run();
+
+       mem_cgroup_handle_over_high();
 }
 
 #endif /* <linux/tracehook.h> */
index c314989d91585fa028c8b287aca521dd298abf07..70d8500bddf15e9c68bc5149b29c487e28e5f201 100644 (file)
@@ -205,11 +205,25 @@ struct ustat {
  * struct callback_head - callback structure for use with RCU and task_work
  * @next: next update requests in a list
  * @func: actual update function to call after the grace period.
+ *
+ * The struct is aligned to the size of a pointer. On most architectures this
+ * happens naturally due to ABI requirements, but some architectures (like
+ * CRIS) have a weird ABI and we need to ask for it explicitly.
+ *
+ * The alignment is required to guarantee that bits 0 and 1 of @next will be
+ * clear under normal conditions -- as long as we use call_rcu(),
+ * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue the callback.
+ *
+ * This guarantee is important for a few reasons:
+ *  - future call_rcu_lazy() will make use of the lower bits in the pointer;
+ *  - the structure shares storage space in struct page with @compound_head,
+ *    which encodes PageTail() in bit 0. The guarantee is needed to avoid
+ *    false-positive PageTail().
  */
 struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *head);
-};
+} __attribute__((aligned(sizeof(void *))));
 #define rcu_head callback_head
 
 typedef void (*rcu_callback_t)(struct rcu_head *head);
index d6f2c2c5b04337ee21aadf5a5454d6c9a6ac9260..558129af828a7eb97ad64b1531ac2a4e3f71174d 100644 (file)
@@ -75,36 +75,6 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 
 #endif         /* ARCH_HAS_NOCACHE_UACCESS */
 
-/**
- * probe_kernel_address(): safely attempt to read from a location
- * @addr: address to read from - its type is type typeof(retval)*
- * @retval: read into this variable
- *
- * Safely read from address @addr into variable @revtal.  If a kernel fault
- * happens, handle that and return -EFAULT.
- * We ensure that the __get_user() is executed in atomic context so that
- * do_page_fault() doesn't attempt to take mmap_sem.  This makes
- * probe_kernel_address() suitable for use within regions where the caller
- * already holds mmap_sem, or other locks which nest inside mmap_sem.
- * This must be a macro because __get_user() needs to know the types of the
- * args.
- *
- * We don't include enough header files to be able to do the set_fs().  We
- * require that the probe_kernel_address() caller will do that.
- */
-#define probe_kernel_address(addr, retval)             \
-       ({                                              \
-               long ret;                               \
-               mm_segment_t old_fs = get_fs();         \
-                                                       \
-               set_fs(KERNEL_DS);                      \
-               pagefault_disable();                    \
-               ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval));            \
-               pagefault_enable();                     \
-               set_fs(old_fs);                         \
-               ret;                                    \
-       })
-
 /*
  * probe_kernel_read(): safely attempt to read from a location
  * @dst: pointer to the buffer that shall take the data
@@ -131,4 +101,14 @@ extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size
 
 extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
 
+/**
+ * probe_kernel_address(): safely attempt to read from a location
+ * @addr: address to read from
+ * @retval: read into this variable
+ *
+ * Returns 0 on success, or -EFAULT.
+ */
+#define probe_kernel_address(addr, retval)             \
+       probe_kernel_read(&retval, addr, sizeof(retval))
+
 #endif         /* __LINUX_UACCESS_H__ */
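
probe_kernel_address() is now a thin wrapper around probe_kernel_read(), but keeps the same calling convention. A minimal usage sketch; 'addr' is a possibly invalid kernel pointer supplied by the hypothetical caller:

	unsigned long word;

	if (probe_kernel_address((unsigned long *)addr, word))
		return -EFAULT;		/* faulted while reading */
	/* 'word' now holds the value at addr and can be used safely. */
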
index 9246d32dc9734374d893a5357e60fdbdd0c296e3..67c1dbd19c6df356012227cb7d3c0e283c49f833 100644 (file)
 #endif
 
 #ifdef CONFIG_HIGHMEM
-#define HIGHMEM_ZONE(xx) , xx##_HIGH
+#define HIGHMEM_ZONE(xx) xx##_HIGH,
 #else
 #define HIGHMEM_ZONE(xx)
 #endif
 
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL HIGHMEM_ZONE(xx) , xx##_MOVABLE
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                FOR_ALL_ZONES(PGALLOC),
                PGFREE, PGACTIVATE, PGDEACTIVATE,
                PGFAULT, PGMAJFAULT,
+               PGLAZYFREED,
                FOR_ALL_ZONES(PGREFILL),
                FOR_ALL_ZONES(PGSTEAL_KSWAPD),
                FOR_ALL_ZONES(PGSTEAL_DIRECT),
@@ -68,7 +69,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                THP_FAULT_FALLBACK,
                THP_COLLAPSE_ALLOC,
                THP_COLLAPSE_ALLOC_FAILED,
-               THP_SPLIT,
+               THP_SPLIT_PAGE,
+               THP_SPLIT_PAGE_FAILED,
+               THP_SPLIT_PMD,
                THP_ZERO_PAGE_ALLOC,
                THP_ZERO_PAGE_ALLOC_FAILED,
 #endif
index 82e7db7f7100f9e141de4a90196bffa24814ecc8..49dfe40b3673b6c80544dc1cb27a558218d92264 100644 (file)
@@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 }
 
 #ifdef CONFIG_NUMA
-/*
- * Determine the per node value of a stat item. This function
- * is called frequently in a NUMA machine, so try to be as
- * frugal as possible.
- */
-static inline unsigned long node_page_state(int node,
-                                enum zone_stat_item item)
-{
-       struct zone *zones = NODE_DATA(node)->node_zones;
-
-       return
-#ifdef CONFIG_ZONE_DMA
-               zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
-               zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
-               zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
-               zone_page_state(&zones[ZONE_NORMAL], item) +
-               zone_page_state(&zones[ZONE_MOVABLE], item);
-}
 
+extern unsigned long node_page_state(int node, enum zone_stat_item item);
 extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
 
 #else
index 42f8ec9924523aa1436ca72ef606ef6030e360c8..2e97b7707dffcb9f3067e4c574e3c73700469146 100644 (file)
@@ -38,10 +38,10 @@ enum zpool_mapmode {
 
 bool zpool_has_pool(char *type);
 
-struct zpool *zpool_create_pool(char *type, char *name,
+struct zpool *zpool_create_pool(const char *type, const char *name,
                        gfp_t gfp, const struct zpool_ops *ops);
 
-char *zpool_get_type(struct zpool *pool);
+const char *zpool_get_type(struct zpool *pool);
 
 void zpool_destroy_pool(struct zpool *pool);
 
@@ -83,7 +83,9 @@ struct zpool_driver {
        atomic_t refcount;
        struct list_head list;
 
-       void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
+       void *(*create)(const char *name,
+                       gfp_t gfp,
+                       const struct zpool_ops *ops,
                        struct zpool *zpool);
        void (*destroy)(void *pool);
 
index 6398dfae53f103200a38b39c2a00cf89c4d12448..34eb16098a333317a3185a9c9109cb000cf5b41f 100644 (file)
@@ -41,7 +41,7 @@ struct zs_pool_stats {
 
 struct zs_pool;
 
-struct zs_pool *zs_create_pool(char *name, gfp_t flags);
+struct zs_pool *zs_create_pool(const char *name, gfp_t flags);
 void zs_destroy_pool(struct zs_pool *pool);
 
 unsigned long zs_malloc(struct zs_pool *pool, size_t size);
index 6adfa9a6ffe963dd89dcf895f65c576d1c7a7c97..663689521759d3d02682fe890ca50afde6bafb27 100644 (file)
@@ -68,10 +68,10 @@ typedef uLong (*check_func) (uLong check, const Byte *buf,
    An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
    much faster. Usage example:
 
-     uLong adler = adler32(0L, NULL, 0);
+     uLong adler = zlib_adler32(0L, NULL, 0);
 
      while (read_buffer(buffer, length) != EOF) {
-       adler = adler32(adler, buffer, length);
+       adler = zlib_adler32(adler, buffer, length);
      }
      if (adler != original_adler) error();
 */
index aeed5c95f3caedcdb4c10668c67764d8557e9369..59a71965b47682edadc1b37b69d112c8b347a95a 100644 (file)
@@ -2054,7 +2054,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
  */
 static inline struct page_frag *sk_page_frag(struct sock *sk)
 {
-       if (sk->sk_allocation & __GFP_WAIT)
+       if (gfpflags_allow_blocking(sk->sk_allocation))
                return &current->task_frag;
 
        return &sk->sk_frag;
index 9a6a3fe0fb51adcf0648119388ca9627a57eaf7b..c92d1e1cbad9171cdce14eaa7fc0a9400783388a 100644 (file)
@@ -9,6 +9,62 @@
 #include <linux/tracepoint.h>
 #include <trace/events/gfpflags.h>
 
+#define COMPACTION_STATUS                                      \
+       EM( COMPACT_DEFERRED,           "deferred")             \
+       EM( COMPACT_SKIPPED,            "skipped")              \
+       EM( COMPACT_CONTINUE,           "continue")             \
+       EM( COMPACT_PARTIAL,            "partial")              \
+       EM( COMPACT_COMPLETE,           "complete")             \
+       EM( COMPACT_NO_SUITABLE_PAGE,   "no_suitable_page")     \
+       EM( COMPACT_NOT_SUITABLE_ZONE,  "not_suitable_zone")    \
+       EMe(COMPACT_CONTENDED,          "contended")
+
+#ifdef CONFIG_ZONE_DMA
+#define IFDEF_ZONE_DMA(X) X
+#else
+#define IFDEF_ZONE_DMA(X)
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+#define IFDEF_ZONE_DMA32(X) X
+#else
+#define IFDEF_ZONE_DMA32(X)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define IFDEF_ZONE_HIGHMEM(X) X
+#else
+#define IFDEF_ZONE_HIGHMEM(X)
+#endif
+
+#define ZONE_TYPE                                              \
+       IFDEF_ZONE_DMA(         EM (ZONE_DMA,    "DMA"))        \
+       IFDEF_ZONE_DMA32(       EM (ZONE_DMA32,  "DMA32"))      \
+                               EM (ZONE_NORMAL, "Normal")      \
+       IFDEF_ZONE_HIGHMEM(     EM (ZONE_HIGHMEM,"HighMem"))    \
+                               EMe(ZONE_MOVABLE,"Movable")
+
+/*
+ * First define the enums in the above macros to be exported to userspace
+ * via TRACE_DEFINE_ENUM().
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)       TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)      TRACE_DEFINE_ENUM(a);
+
+COMPACTION_STATUS
+ZONE_TYPE
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)       {a, b},
+#define EMe(a, b)      {a, b}
+
 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
 
        TP_PROTO(
@@ -161,7 +217,7 @@ TRACE_EVENT(mm_compaction_end,
                __entry->free_pfn,
                __entry->zone_end,
                __entry->sync ? "sync" : "async",
-               compaction_status_string[__entry->status])
+               __print_symbolic(__entry->status, COMPACTION_STATUS))
 );
 
 TRACE_EVENT(mm_compaction_try_to_compact_pages,
@@ -201,23 +257,23 @@ DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
 
        TP_STRUCT__entry(
                __field(int, nid)
-               __field(char *, name)
+               __field(enum zone_type, idx)
                __field(int, order)
                __field(int, ret)
        ),
 
        TP_fast_assign(
                __entry->nid = zone_to_nid(zone);
-               __entry->name = (char *)zone->name;
+               __entry->idx = zone_idx(zone);
                __entry->order = order;
                __entry->ret = ret;
        ),
 
        TP_printk("node=%d zone=%-8s order=%d ret=%s",
                __entry->nid,
-               __entry->name,
+               __print_symbolic(__entry->idx, ZONE_TYPE),
                __entry->order,
-               compaction_status_string[__entry->ret])
+               __print_symbolic(__entry->ret, COMPACTION_STATUS))
 );
 
 DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished,
@@ -247,7 +303,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
 
        TP_STRUCT__entry(
                __field(int, nid)
-               __field(char *, name)
+               __field(enum zone_type, idx)
                __field(int, order)
                __field(unsigned int, considered)
                __field(unsigned int, defer_shift)
@@ -256,7 +312,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
 
        TP_fast_assign(
                __entry->nid = zone_to_nid(zone);
-               __entry->name = (char *)zone->name;
+               __entry->idx = zone_idx(zone);
                __entry->order = order;
                __entry->considered = zone->compact_considered;
                __entry->defer_shift = zone->compact_defer_shift;
@@ -265,7 +321,7 @@ DECLARE_EVENT_CLASS(mm_compaction_defer_template,
 
        TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu",
                __entry->nid,
-               __entry->name,
+               __print_symbolic(__entry->idx, ZONE_TYPE),
                __entry->order,
                __entry->order_failed,
                __entry->considered,
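
The EM()/EMe() idiom above is evaluated twice: first to emit TRACE_DEFINE_ENUM() for each value so the enum names reach userspace, then redefined to build the {value, string} pairs consumed by __print_symbolic(). A stripped-down sketch of the pattern with a hypothetical FOO_STATUS list:

	#define FOO_STATUS                              \
		EM( FOO_OK,     "ok")                   \
		EMe(FOO_FAILED, "failed")

	/* Pass 1: export the enum values. */
	#undef EM
	#undef EMe
	#define EM(a, b)        TRACE_DEFINE_ENUM(a);
	#define EMe(a, b)       TRACE_DEFINE_ENUM(a);
	FOO_STATUS

	/* Pass 2: build the mapping table used at print time. */
	#undef EM
	#undef EMe
	#define EM(a, b)        {a, b},
	#define EMe(a, b)       {a, b}

	/* ... later, inside TP_printk(): */
	__print_symbolic(__entry->status, FOO_STATUS)
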
index d6fd8e5b14b76c41bfd532c3fa86255e4e92b0f3..dde6bf092c8ac356d1502b7d444e73ce8b8c143a 100644 (file)
@@ -20,7 +20,7 @@
        {(unsigned long)GFP_ATOMIC,             "GFP_ATOMIC"},          \
        {(unsigned long)GFP_NOIO,               "GFP_NOIO"},            \
        {(unsigned long)__GFP_HIGH,             "GFP_HIGH"},            \
-       {(unsigned long)__GFP_WAIT,             "GFP_WAIT"},            \
+       {(unsigned long)__GFP_ATOMIC,           "GFP_ATOMIC"},          \
        {(unsigned long)__GFP_IO,               "GFP_IO"},              \
        {(unsigned long)__GFP_COLD,             "GFP_COLD"},            \
        {(unsigned long)__GFP_NOWARN,           "GFP_NOWARN"},          \
@@ -36,7 +36,8 @@
        {(unsigned long)__GFP_RECLAIMABLE,      "GFP_RECLAIMABLE"},     \
        {(unsigned long)__GFP_MOVABLE,          "GFP_MOVABLE"},         \
        {(unsigned long)__GFP_NOTRACK,          "GFP_NOTRACK"},         \
-       {(unsigned long)__GFP_NO_KSWAPD,        "GFP_NO_KSWAPD"},       \
+       {(unsigned long)__GFP_DIRECT_RECLAIM,   "GFP_DIRECT_RECLAIM"},  \
+       {(unsigned long)__GFP_KSWAPD_RECLAIM,   "GFP_KSWAPD_RECLAIM"},  \
        {(unsigned long)__GFP_OTHER_NODE,       "GFP_OTHER_NODE"}       \
        ) : "GFP_NOWAIT"
 
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
new file mode 100644 (file)
index 0000000..11c59ca
--- /dev/null
@@ -0,0 +1,166 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM huge_memory
+
+#if !defined(__HUGE_MEMORY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HUGE_MEMORY_H
+
+#include <linux/tracepoint.h>
+
+#include <trace/events/gfpflags.h>
+
+#define SCAN_STATUS                                                    \
+       EM( SCAN_FAIL,                  "failed")                       \
+       EM( SCAN_SUCCEED,               "succeeded")                    \
+       EM( SCAN_PMD_NULL,              "pmd_null")                     \
+       EM( SCAN_EXCEED_NONE_PTE,       "exceed_none_pte")              \
+       EM( SCAN_PTE_NON_PRESENT,       "pte_non_present")              \
+       EM( SCAN_PAGE_RO,               "no_writable_page")             \
+       EM( SCAN_NO_REFERENCED_PAGE,    "no_referenced_page")           \
+       EM( SCAN_PAGE_NULL,             "page_null")                    \
+       EM( SCAN_SCAN_ABORT,            "scan_aborted")                 \
+       EM( SCAN_PAGE_COUNT,            "not_suitable_page_count")      \
+       EM( SCAN_PAGE_LRU,              "page_not_in_lru")              \
+       EM( SCAN_PAGE_LOCK,             "page_locked")                  \
+       EM( SCAN_PAGE_ANON,             "page_not_anon")                \
+       EM( SCAN_PAGE_COMPOUND,         "page_compound")                \
+       EM( SCAN_ANY_PROCESS,           "no_process_for_page")          \
+       EM( SCAN_VMA_NULL,              "vma_null")                     \
+       EM( SCAN_VMA_CHECK,             "vma_check_failed")             \
+       EM( SCAN_ADDRESS_RANGE,         "not_suitable_address_range")   \
+       EM( SCAN_SWAP_CACHE_PAGE,       "page_swap_cache")              \
+       EM( SCAN_DEL_PAGE_LRU,          "could_not_delete_page_from_lru")\
+       EM( SCAN_ALLOC_HUGE_PAGE_FAIL,  "alloc_huge_page_failed")       \
+       EM( SCAN_CGROUP_CHARGE_FAIL,    "cgroup_charge_failed")         \
+       EMe( SCAN_EXCEED_SWAP_PTE,      "exceed_swap_pte")
+
+#undef EM
+#undef EMe
+#define EM(a, b)       TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)      TRACE_DEFINE_ENUM(a);
+
+SCAN_STATUS
+
+#undef EM
+#undef EMe
+#define EM(a, b)       {a, b},
+#define EMe(a, b)      {a, b}
+
+TRACE_EVENT(mm_khugepaged_scan_pmd,
+
+       TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable,
+                bool referenced, int none_or_zero, int status, int unmapped),
+
+       TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status, unmapped),
+
+       TP_STRUCT__entry(
+               __field(struct mm_struct *, mm)
+               __field(unsigned long, pfn)
+               __field(bool, writable)
+               __field(bool, referenced)
+               __field(int, none_or_zero)
+               __field(int, status)
+               __field(int, unmapped)
+       ),
+
+       TP_fast_assign(
+               __entry->mm = mm;
+               __entry->pfn = pfn;
+               __entry->writable = writable;
+               __entry->referenced = referenced;
+               __entry->none_or_zero = none_or_zero;
+               __entry->status = status;
+               __entry->unmapped = unmapped;
+       ),
+
+       TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
+               __entry->mm,
+               __entry->pfn,
+               __entry->writable,
+               __entry->referenced,
+               __entry->none_or_zero,
+               __print_symbolic(__entry->status, SCAN_STATUS),
+               __entry->unmapped)
+);
+
+TRACE_EVENT(mm_collapse_huge_page,
+
+       TP_PROTO(struct mm_struct *mm, int isolated, int status),
+
+       TP_ARGS(mm, isolated, status),
+
+       TP_STRUCT__entry(
+               __field(struct mm_struct *, mm)
+               __field(int, isolated)
+               __field(int, status)
+       ),
+
+       TP_fast_assign(
+               __entry->mm = mm;
+               __entry->isolated = isolated;
+               __entry->status = status;
+       ),
+
+       TP_printk("mm=%p, isolated=%d, status=%s",
+               __entry->mm,
+               __entry->isolated,
+               __print_symbolic(__entry->status, SCAN_STATUS))
+);
+
+TRACE_EVENT(mm_collapse_huge_page_isolate,
+
+       TP_PROTO(unsigned long pfn, int none_or_zero,
+                bool referenced, bool  writable, int status),
+
+       TP_ARGS(pfn, none_or_zero, referenced, writable, status),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, pfn)
+               __field(int, none_or_zero)
+               __field(bool, referenced)
+               __field(bool, writable)
+               __field(int, status)
+       ),
+
+       TP_fast_assign(
+               __entry->pfn = pfn;
+               __entry->none_or_zero = none_or_zero;
+               __entry->referenced = referenced;
+               __entry->writable = writable;
+               __entry->status = status;
+       ),
+
+       TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s",
+               __entry->pfn,
+               __entry->none_or_zero,
+               __entry->referenced,
+               __entry->writable,
+               __print_symbolic(__entry->status, SCAN_STATUS))
+);
+
+TRACE_EVENT(mm_collapse_huge_page_swapin,
+
+       TP_PROTO(struct mm_struct *mm, int swapped_in, int ret),
+
+       TP_ARGS(mm, swapped_in, ret),
+
+       TP_STRUCT__entry(
+               __field(struct mm_struct *, mm)
+               __field(int, swapped_in)
+               __field(int, ret)
+       ),
+
+       TP_fast_assign(
+               __entry->mm = mm;
+               __entry->swapped_in = swapped_in;
+               __entry->ret = ret;
+       ),
+
+       TP_printk("mm=%p, swapped_in=%d, ret=%d",
+               __entry->mm,
+               __entry->swapped_in,
+               __entry->ret)
+);
+
+#endif /* __HUGE_MEMORY_H */
+#include <trace/define_trace.h>
+
diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h
new file mode 100644 (file)
index 0000000..c780581
--- /dev/null
@@ -0,0 +1,224 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nilfs2
+
+#if !defined(_TRACE_NILFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NILFS2_H
+
+#include <linux/tracepoint.h>
+
+struct nilfs_sc_info;
+
+#define show_collection_stage(type)                                    \
+       __print_symbolic(type,                                          \
+       { NILFS_ST_INIT, "ST_INIT" },                                   \
+       { NILFS_ST_GC, "ST_GC" },                                       \
+       { NILFS_ST_FILE, "ST_FILE" },                                   \
+       { NILFS_ST_IFILE, "ST_IFILE" },                                 \
+       { NILFS_ST_CPFILE, "ST_CPFILE" },                               \
+       { NILFS_ST_SUFILE, "ST_SUFILE" },                               \
+       { NILFS_ST_DAT, "ST_DAT" },                                     \
+       { NILFS_ST_SR, "ST_SR" },                                       \
+       { NILFS_ST_DSYNC, "ST_DSYNC" },                                 \
+       { NILFS_ST_DONE, "ST_DONE"})
+
+TRACE_EVENT(nilfs2_collection_stage_transition,
+
+           TP_PROTO(struct nilfs_sc_info *sci),
+
+           TP_ARGS(sci),
+
+           TP_STRUCT__entry(
+                   __field(void *, sci)
+                   __field(int, stage)
+           ),
+
+           TP_fast_assign(
+                       __entry->sci = sci;
+                       __entry->stage = sci->sc_stage.scnt;
+                   ),
+
+           TP_printk("sci = %p stage = %s",
+                     __entry->sci,
+                     show_collection_stage(__entry->stage))
+);
+
+#ifndef TRACE_HEADER_MULTI_READ
+enum nilfs2_transaction_transition_state {
+       TRACE_NILFS2_TRANSACTION_BEGIN,
+       TRACE_NILFS2_TRANSACTION_COMMIT,
+       TRACE_NILFS2_TRANSACTION_ABORT,
+       TRACE_NILFS2_TRANSACTION_TRYLOCK,
+       TRACE_NILFS2_TRANSACTION_LOCK,
+       TRACE_NILFS2_TRANSACTION_UNLOCK,
+};
+#endif
+
+#define show_transaction_state(type)                                   \
+       __print_symbolic(type,                                          \
+                        { TRACE_NILFS2_TRANSACTION_BEGIN, "BEGIN" },   \
+                        { TRACE_NILFS2_TRANSACTION_COMMIT, "COMMIT" }, \
+                        { TRACE_NILFS2_TRANSACTION_ABORT, "ABORT" },   \
+                        { TRACE_NILFS2_TRANSACTION_TRYLOCK, "TRYLOCK" }, \
+                        { TRACE_NILFS2_TRANSACTION_LOCK, "LOCK" },     \
+                        { TRACE_NILFS2_TRANSACTION_UNLOCK, "UNLOCK" })
+
+TRACE_EVENT(nilfs2_transaction_transition,
+           TP_PROTO(struct super_block *sb,
+                    struct nilfs_transaction_info *ti,
+                    int count,
+                    unsigned int flags,
+                    enum nilfs2_transaction_transition_state state),
+
+           TP_ARGS(sb, ti, count, flags, state),
+
+           TP_STRUCT__entry(
+                   __field(void *, sb)
+                   __field(void *, ti)
+                   __field(int, count)
+                   __field(unsigned int, flags)
+                   __field(int, state)
+           ),
+
+           TP_fast_assign(
+                   __entry->sb = sb;
+                   __entry->ti = ti;
+                   __entry->count = count;
+                   __entry->flags = flags;
+                   __entry->state = state;
+                   ),
+
+           TP_printk("sb = %p ti = %p count = %d flags = %x state = %s",
+                     __entry->sb,
+                     __entry->ti,
+                     __entry->count,
+                     __entry->flags,
+                     show_transaction_state(__entry->state))
+);
+
+TRACE_EVENT(nilfs2_segment_usage_check,
+           TP_PROTO(struct inode *sufile,
+                    __u64 segnum,
+                    unsigned long cnt),
+
+           TP_ARGS(sufile, segnum, cnt),
+
+           TP_STRUCT__entry(
+                   __field(struct inode *, sufile)
+                   __field(__u64, segnum)
+                   __field(unsigned long, cnt)
+           ),
+
+           TP_fast_assign(
+                   __entry->sufile = sufile;
+                   __entry->segnum = segnum;
+                   __entry->cnt = cnt;
+                   ),
+
+           TP_printk("sufile = %p segnum = %llu cnt = %lu",
+                     __entry->sufile,
+                     __entry->segnum,
+                     __entry->cnt)
+);
+
+TRACE_EVENT(nilfs2_segment_usage_allocated,
+           TP_PROTO(struct inode *sufile,
+                    __u64 segnum),
+
+           TP_ARGS(sufile, segnum),
+
+           TP_STRUCT__entry(
+                   __field(struct inode *, sufile)
+                   __field(__u64, segnum)
+           ),
+
+           TP_fast_assign(
+                   __entry->sufile = sufile;
+                   __entry->segnum = segnum;
+                   ),
+
+           TP_printk("sufile = %p segnum = %llu",
+                     __entry->sufile,
+                     __entry->segnum)
+);
+
+TRACE_EVENT(nilfs2_segment_usage_freed,
+           TP_PROTO(struct inode *sufile,
+                    __u64 segnum),
+
+           TP_ARGS(sufile, segnum),
+
+           TP_STRUCT__entry(
+                   __field(struct inode *, sufile)
+                   __field(__u64, segnum)
+           ),
+
+           TP_fast_assign(
+                   __entry->sufile = sufile;
+                   __entry->segnum = segnum;
+                   ),
+
+           TP_printk("sufile = %p segnum = %llu",
+                     __entry->sufile,
+                     __entry->segnum)
+);
+
+TRACE_EVENT(nilfs2_mdt_insert_new_block,
+           TP_PROTO(struct inode *inode,
+                    unsigned long ino,
+                    unsigned long block),
+
+           TP_ARGS(inode, ino, block),
+
+           TP_STRUCT__entry(
+                   __field(struct inode *, inode)
+                   __field(unsigned long, ino)
+                   __field(unsigned long, block)
+           ),
+
+           TP_fast_assign(
+                   __entry->inode = inode;
+                   __entry->ino = ino;
+                   __entry->block = block;
+                   ),
+
+           TP_printk("inode = %p ino = %lu block = %lu",
+                     __entry->inode,
+                     __entry->ino,
+                     __entry->block)
+);
+
+TRACE_EVENT(nilfs2_mdt_submit_block,
+           TP_PROTO(struct inode *inode,
+                    unsigned long ino,
+                    unsigned long blkoff,
+                    int mode),
+
+           TP_ARGS(inode, ino, blkoff, mode),
+
+           TP_STRUCT__entry(
+                   __field(struct inode *, inode)
+                   __field(unsigned long, ino)
+                   __field(unsigned long, blkoff)
+                   __field(int, mode)
+           ),
+
+           TP_fast_assign(
+                   __entry->inode = inode;
+                   __entry->ino = ino;
+                   __entry->blkoff = blkoff;
+                   __entry->mode = mode;
+                   ),
+
+           TP_printk("inode = %p ino = %lu blkoff = %lu mode = %x",
+                     __entry->inode,
+                     __entry->ino,
+                     __entry->blkoff,
+                     __entry->mode)
+);
+
+#endif /* _TRACE_NILFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE nilfs2
+#include <trace/define_trace.h>
index ddc3b36f1046bd6605ecdd769e4f70e59404e16b..0e821e3c3d45d1679ef4ba3fe9820469b6f4ca1e 100644 (file)
 # define MAP_UNINITIALIZED 0x0         /* Don't support this flag */
 #endif
 
+/*
+ * Flags for mlock
+ */
+#define MLOCK_ONFAULT  0x01            /* Lock pages in range after they are faulted in, do not prefault */
+
 #define MS_ASYNC       1               /* sync memory asynchronously */
 #define MS_INVALIDATE  2               /* invalidate the caches */
 #define MS_SYNC                4               /* synchronous memory sync */
@@ -34,6 +39,7 @@
 #define MADV_SEQUENTIAL        2               /* expect sequential page references */
 #define MADV_WILLNEED  3               /* will need these pages */
 #define MADV_DONTNEED  4               /* don't need these pages */
+#define MADV_FREE      5               /* free pages only if memory pressure */
 
 /* common parameters: try to keep these consistent across architectures */
 #define MADV_REMOVE    9               /* remove these pages & resources */
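
MADV_FREE (value 5 above) lets userspace mark anonymous memory as disposable: the kernel may reclaim the pages lazily under memory pressure instead of swapping them, and a later write to a page cancels the deferred free for that page; otherwise its contents may be discarded. A hedged userspace sketch; 'buf' and 'len' describe a hypothetical anonymous mapping:

	#include <sys/mman.h>

	if (madvise(buf, len, MADV_FREE) != 0)
		perror("madvise(MADV_FREE)");
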
index e9fe6fd2a07450b36b607b321c056089e7bd59dc..7162cd4cca737b5e92e873e31f2c2a3bf50e3029 100644 (file)
@@ -17,5 +17,6 @@
 
 #define MCL_CURRENT    1               /* lock all current mappings */
 #define MCL_FUTURE     2               /* lock all future mappings */
+#define MCL_ONFAULT    4               /* lock all pages that are faulted in */
 
 #endif /* __ASM_GENERIC_MMAN_H */
index ee124009e12adb073ec17221bc570da60cf5eaea..1324b0292ec28ed7ed61f1dc875d1fd84503a6d1 100644 (file)
@@ -713,9 +713,11 @@ __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
 __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
 #define __NR_membarrier 283
 __SYSCALL(__NR_membarrier, sys_membarrier)
+#define __NR_mlock2 284
+__SYSCALL(__NR_mlock2, sys_mlock2)
 
 #undef __NR_syscalls
-#define __NR_syscalls 284
+#define __NR_syscalls 285
 
 /*
  * All syscalls below here should go away really,
index 1471db9a7e6112b3316ae887b50c6d8d1352f171..59559a215401a3763cfe15f15b8a16b3a22f4a52 100644 (file)
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -37,6 +37,7 @@
 #include <linux/rwsem.h>
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
 
 #include <asm/current.h>
 #include <linux/uaccess.h>
@@ -675,7 +676,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
 
                ipc_unlock_object(&msq->q_perm);
                rcu_read_unlock();
-               schedule();
+               freezable_schedule();
 
                rcu_read_lock();
                ipc_lock_object(&msq->q_perm);
@@ -917,7 +918,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
 
                ipc_unlock_object(&msq->q_perm);
                rcu_read_unlock();
-               schedule();
+               freezable_schedule();
 
                /* Lockless receive, part 1:
                 * Disable preemption.  We don't hold a reference to the queue
index 71f448e5e927aed0ccd8f5af24a928e82cfe616f..ed81aafd239263c7523db5f7ae2983770fea06cb 100644 (file)
@@ -123,7 +123,6 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
        size_t len = src->m_ts;
        size_t alen;
 
-       WARN_ON(dst == NULL);
        if (src->m_ts > dst->m_ts)
                return ERR_PTR(-EINVAL);
 
index 8a056a32ded7d2b4560612af7d84ac317f7a17b3..5ffcbd354a520b88781ed2d66c7839a7aaa7f86d 100644 (file)
@@ -1371,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
        if (unlikely(audit_filter_type(type)))
                return NULL;
 
-       if (gfp_mask & __GFP_WAIT) {
+       if (gfp_mask & __GFP_DIRECT_RECLAIM) {
                if (audit_pid && audit_pid == current->pid)
-                       gfp_mask &= ~__GFP_WAIT;
+                       gfp_mask &= ~__GFP_DIRECT_RECLAIM;
                else
                        reserve = 0;
        }
 
        while (audit_backlog_limit
               && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
-               if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+               if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
                        long sleep_time;
 
                        sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
index b9d0cce3f9ce54937fea988b531d0cc7bf52f692..f1603c153890d2b9dbd37a5c687fd297c6137f24 100644 (file)
@@ -299,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 
        idr_preload(gfp_mask);
        spin_lock_bh(&cgroup_idr_lock);
-       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
+       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
        spin_unlock_bh(&cgroup_idr_lock);
        idr_preload_end();
        return ret;
index c02d677c541c68067f76f0053864ed176ba39ccc..c9ea63ff70a7433d02a7791aed66a681c5058227 100644 (file)
@@ -2602,22 +2602,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 }
 
 /**
- * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @tsk: pointer to task_struct of some task.
+ * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
  *
- * Description: Prints @task's name, cpuset name, and cached copy of its
+ * Description: Prints current's name, cpuset name, and a cached copy of its
  * mems_allowed to the kernel log.
  */
-void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+void cpuset_print_current_mems_allowed(void)
 {
        struct cgroup *cgrp;
 
        rcu_read_lock();
 
-       cgrp = task_cs(tsk)->css.cgroup;
-       pr_info("%s cpuset=", tsk->comm);
+       cgrp = task_cs(current)->css.cgroup;
+       pr_info("%s cpuset=", current->comm);
        pr_cont_cgroup_name(cgrp);
-       pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
+       pr_cont(" mems_allowed=%*pbl\n",
+               nodemask_pr_args(&current->mems_allowed));
 
        rcu_read_unlock();
 }
index 4e5e9798aa0c0d426962642b69985b9eb09021d6..51373997f479e06883898dbc8c6b6bde7dbb98d3 100644 (file)
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        const unsigned long mmun_end   = addr + PAGE_SIZE;
        struct mem_cgroup *memcg;
 
-       err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+       err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
+                       false);
        if (err)
                return err;
 
@@ -175,8 +176,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
                goto unlock;
 
        get_page(kpage);
-       page_add_new_anon_rmap(kpage, vma, addr);
-       mem_cgroup_commit_charge(kpage, memcg, false);
+       page_add_new_anon_rmap(kpage, vma, addr, false);
+       mem_cgroup_commit_charge(kpage, memcg, false, false);
        lru_cache_add_active_or_unevictable(kpage, vma);
 
        if (!PageAnon(page)) {
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        ptep_clear_flush_notify(vma, addr, ptep);
        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
-       page_remove_rmap(page);
+       page_remove_rmap(page, false);
        if (!page_mapped(page))
                try_to_free_swap(page);
        pte_unmap_unlock(ptep, ptl);
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
        err = 0;
  unlock:
-       mem_cgroup_cancel_charge(kpage, memcg);
+       mem_cgroup_cancel_charge(kpage, memcg, false);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        unlock_page(page);
        return err;
index 825ecc32454d23f4e60216bedfb2de31fe504699..f97f2c449f5cf556ea6c54cb4aec6e894dd8bab5 100644 (file)
@@ -455,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
+               tmp->vm_flags &=
+                       ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
                tmp->vm_next = tmp->vm_prev = NULL;
                tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                file = tmp->vm_file;
index 684d7549825a4300ced2002a3fbec0a5698a18d1..470c06c3299a7feb5cc0598508715e453274c677 100644 (file)
@@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
-       struct page *page, *page_head;
+       struct page *page;
        int err, ro = 0;
 
        /*
@@ -519,46 +519,9 @@ again:
        else
                err = 0;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       page_head = page;
-       if (unlikely(PageTail(page))) {
-               put_page(page);
-               /* serialize against __split_huge_page_splitting() */
-               local_irq_disable();
-               if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
-                       page_head = compound_head(page);
-                       /*
-                        * page_head is valid pointer but we must pin
-                        * it before taking the PG_lock and/or
-                        * PG_compound_lock. The moment we re-enable
-                        * irqs __split_huge_page_splitting() can
-                        * return and the head page can be freed from
-                        * under us. We can't take the PG_lock and/or
-                        * PG_compound_lock on a page that could be
-                        * freed from under us.
-                        */
-                       if (page != page_head) {
-                               get_page(page_head);
-                               put_page(page);
-                       }
-                       local_irq_enable();
-               } else {
-                       local_irq_enable();
-                       goto again;
-               }
-       }
-#else
-       page_head = compound_head(page);
-       if (page != page_head) {
-               get_page(page_head);
-               put_page(page);
-       }
-#endif
-
-       lock_page(page_head);
-
+       lock_page(page);
        /*
-        * If page_head->mapping is NULL, then it cannot be a PageAnon
+        * If page->mapping is NULL, then it cannot be a PageAnon
         * page; but it might be the ZERO_PAGE or in the gate area or
         * in a special mapping (all cases which we are happy to fail);
         * or it may have been a good file page when get_user_pages_fast
@@ -570,12 +533,12 @@ again:
         *
         * The case we do have to guard against is when memory pressure made
         * shmem_writepage move it from filecache to swapcache beneath us:
-        * an unlikely race, but we do need to retry for page_head->mapping.
+        * an unlikely race, but we do need to retry for page->mapping.
         */
-       if (!page_head->mapping) {
-               int shmem_swizzled = PageSwapCache(page_head);
-               unlock_page(page_head);
-               put_page(page_head);
+       if (!page->mapping) {
+               int shmem_swizzled = PageSwapCache(page);
+               unlock_page(page);
+               put_page(page);
                if (shmem_swizzled)
                        goto again;
                return -EFAULT;
@@ -588,7 +551,7 @@ again:
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
         */
-       if (PageAnon(page_head)) {
+       if (PageAnon(page)) {
                /*
                 * A RO anonymous page will never change and thus doesn't make
                 * sense for futex operations.
@@ -603,15 +566,15 @@ again:
                key->private.address = address;
        } else {
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-               key->shared.inode = page_head->mapping->host;
+               key->shared.inode = page->mapping->host;
                key->shared.pgoff = basepage_index(page);
        }
 
        get_futex_key_refs(key); /* implies MB (B) */
 
 out:
-       unlock_page(page_head);
-       put_page(page_head);
+       unlock_page(page);
+       put_page(page);
        return err;
 }
 
index 4c5edc357923a1b6198c9f8122b90b73b9a5e38f..d873b64fbddcdd9e1666738f2b6192c6c5dc0029 100644 (file)
@@ -6,6 +6,8 @@
  * Version 2.  See the file COPYING for more details.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
index bd9f8a03cefa4ef05c08d54a357910286487afd8..11b64a63c0f88817b80a2c35117d70bcfe446fa1 100644 (file)
@@ -6,7 +6,7 @@
  * Version 2.  See the file COPYING for more details.
  */
 
-#define pr_fmt(fmt)    "kexec: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/capability.h>
 #include <linux/mm.h>
@@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void)
 
        crash_notes = __alloc_percpu(size, align);
        if (!crash_notes) {
-               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+               pr_warn("Memory allocation for saving cpu register states failed\n");
                return -ENOMEM;
        }
        return 0;
index 6a9a3f2a0e8e3e7977483b0bfbb00dc19bc21496..b70ada0028d251d7171f13b0de7e9e47e1e6086c 100644 (file)
@@ -9,6 +9,8 @@
  * Version 2.  See the file COPYING for more details.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
index 4e49cc4c9952ca82eff8a2b5e5e61765d48ea96f..deae3907ac1eec585bbe71a44f6dc57ad024784a 100644 (file)
@@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
                return;
 
        /* no reclaim without waiting on it */
-       if (!(gfp_mask & __GFP_WAIT))
+       if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
                return;
 
        /* this guy won't enter reclaim */
index 04e91ff7560b3a35445f006a5be6e1d786bde245..4579dbb7ed872a7e1c119fdbbf98d9dc8cd67a87 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/sysrq.h>
 #include <linux/init.h>
 #include <linux/nmi.h>
+#include <linux/console.h>
 
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
@@ -147,6 +148,15 @@ void panic(const char *fmt, ...)
 
        bust_spinlocks(0);
 
+       /*
+        * We may have ended up stopping the CPU holding the lock (in
+        * smp_send_stop()) while still having some valuable data in the console
+        * buffer.  Try to acquire the lock then release it regardless of the
+        * result.  The release will also print the buffers out.
+        */
+       console_trylock();
+       console_unlock();
+
        if (!panic_blink)
                panic_blink = no_blink;
 
index ed1e0a1cffa7c7b78d750df0b72770a7769817b7..a6d6149c0fe60df1ca38d9a66acef281b78ee79d 100644 (file)
@@ -326,10 +326,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_get_charp);
 
-static void param_free_charp(void *arg)
+void param_free_charp(void *arg)
 {
        maybe_kfree_parameter(*((char **)arg));
 }
+EXPORT_SYMBOL(param_free_charp);
 
 const struct kernel_param_ops param_ops_charp = {
        .set = param_set_charp,
index 5235dd4e1e2f68a97fa6836d98854d2e8e46724e..3a970604308ff5365f431e8190d1b82f7ac306d1 100644 (file)
@@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
        while (to_alloc-- > 0) {
                struct page *page;
 
-               page = alloc_image_page(__GFP_HIGHMEM);
+               page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
                memory_bm_set_bit(bm, page_to_pfn(page));
        }
        return nr_highmem;
index b2066fb5b10f7639af4a391fe9786429cff90ac4..12cd989dadf639c3276ca228fef1431284c862ec 100644 (file)
@@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
        struct bio *bio;
        int error = 0;
 
-       bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+       bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
        bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
        bio->bi_bdev = hib_resume_bdev;
 
@@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
                return -ENOSPC;
 
        if (hb) {
-               src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+               src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
                                              __GFP_NORETRY);
                if (src) {
                        copy_page(src, buf);
@@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
                        ret = hib_wait_io(hb); /* Free pages */
                        if (ret)
                                return ret;
-                       src = (void *)__get_free_page(__GFP_WAIT |
+                       src = (void *)__get_free_page(__GFP_RECLAIM |
                                                      __GFP_NOWARN |
                                                      __GFP_NORETRY);
                        if (src) {
@@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
        nr_threads = num_online_cpus() - 1;
        nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
 
-       page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+       page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
        if (!page) {
                printk(KERN_ERR "PM: Failed to allocate LZO page\n");
                ret = -ENOMEM;
@@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
                last = tmp;
 
                tmp->map = (struct swap_map_page *)
-                          __get_free_page(__GFP_WAIT | __GFP_HIGH);
+                          __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
                if (!tmp->map) {
                        release_swap_reader(handle);
                        return -ENOMEM;
@@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
        for (i = 0; i < read_pages; i++) {
                page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
-                                                 __GFP_WAIT | __GFP_HIGH :
-                                                 __GFP_WAIT | __GFP_NOWARN |
-                                                 __GFP_NORETRY);
+                                                 __GFP_RECLAIM | __GFP_HIGH :
+                                                 __GFP_RECLAIM | __GFP_NOWARN |
+                                                 __GFP_NORETRY);
 
                if (!page[i]) {
                        if (i < LZO_CMP_PAGES) {
index 99513e1160e518d322f6d0ce0f346e6da9fcbbf0..e02d1e783eb4fc93c0a34d3951e46ea3f1454e4a 100644 (file)
@@ -336,7 +336,7 @@ static int profile_cpu_callback(struct notifier_block *info,
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-               node = cpu_to_mem(cpu);
+               node = cpu_to_node(cpu);
                per_cpu(cpu_profile_flip, cpu) = 0;
                if (!per_cpu(cpu_profile_hits, cpu)[1]) {
                        page = __alloc_pages_node(node,
index 0f6bbbe77b46c092d0de31e0c9eec8a0f17e6791..c0b01fe24bbd3fae555b2e479e86f9199f3a7c3b 100644 (file)
@@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig)
        return !tsk->ptrace;
 }
 
-/*
- * Notify the system that a driver wants to block all signals for this
- * process, and wants to be notified if any signals at all were to be
- * sent/acted upon.  If the notifier routine returns non-zero, then the
- * signal will be acted upon after all.  If the notifier routine returns 0,
- * then the signal will be blocked.  Only one block per process is
- * allowed.  priv is a pointer to private data that the notifier routine
- * can use to determine if the signal should be blocked or not.
- */
-void
-block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&current->sighand->siglock, flags);
-       current->notifier_mask = mask;
-       current->notifier_data = priv;
-       current->notifier = notifier;
-       spin_unlock_irqrestore(&current->sighand->siglock, flags);
-}
-
-/* Notify the system that blocking has ended. */
-
-void
-unblock_all_signals(void)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&current->sighand->siglock, flags);
-       current->notifier = NULL;
-       current->notifier_data = NULL;
-       recalc_sigpending();
-       spin_unlock_irqrestore(&current->sighand->siglock, flags);
-}
-
 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 {
        struct sigqueue *q, *first = NULL;
@@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 {
        int sig = next_signal(pending, mask);
 
-       if (sig) {
-               if (current->notifier) {
-                       if (sigismember(current->notifier_mask, sig)) {
-                               if (!(current->notifier)(current->notifier_data)) {
-                                       clear_thread_flag(TIF_SIGPENDING);
-                                       return 0;
-                               }
-                       }
-               }
-
+       if (sig)
                collect_signal(sig, pending, info);
-       }
-
        return sig;
 }
 
@@ -834,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
        sigset_t flush;
 
        if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
-               if (signal->flags & SIGNAL_GROUP_COREDUMP)
+               if (!(signal->flags & SIGNAL_GROUP_EXIT))
                        return sig == SIGKILL;
                /*
                 * The process is in the middle of dying, nothing to do.
@@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig);
 EXPORT_SYMBOL(send_sig);
 EXPORT_SYMBOL(send_sig_info);
 EXPORT_SYMBOL(sigprocmask);
-EXPORT_SYMBOL(block_all_signals);
-EXPORT_SYMBOL(unblock_all_signals);
-
 
 /*
  * System call entry points.
index 07854477c16447ff87df874b4f117bcafadd4f33..d903c02223afbaa2776b2610f00ae3def7de442e 100644 (file)
@@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
        cpumask_var_t cpus;
        int cpu, ret;
 
-       might_sleep_if(gfp_flags & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(gfp_flags));
 
        if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
                preempt_disable();
index fa2f2f671a5cadb271367b13f8b119a21c8bf6b7..6af9212ab5aab7cef491eb4e40e7c9277c1f2ac0 100644 (file)
@@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
                                goto out_unlock;        /* No processes for this user */
                }
                do_each_thread(g, p) {
-                       if (uid_eq(task_uid(p), uid))
+                       if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
                                error = set_one_prio(p, niceval, error);
                } while_each_thread(g, p);
                if (!uid_eq(uid, cred->uid))
@@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
                                goto out_unlock;        /* No processes for this user */
                }
                do_each_thread(g, p) {
-                       if (uid_eq(task_uid(p), uid)) {
+                       if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
                                niceval = nice_to_rlimit(task_nice(p));
                                if (niceval > retval)
                                        retval = niceval;
index a02decf155832fa03117ce6e6ebf89e6a0b2809f..0623787ec67af5fb95f505b55c1577a2f56ce0f4 100644 (file)
@@ -194,6 +194,7 @@ cond_syscall(sys_mlock);
 cond_syscall(sys_munlock);
 cond_syscall(sys_mlockall);
 cond_syscall(sys_munlockall);
+cond_syscall(sys_mlock2);
 cond_syscall(sys_mincore);
 cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
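
The cond_syscall() entry keeps the new mlock2 syscall optional on
configurations that do not provide it. From userspace the call takes an
extra flags word; a hedged sketch, assuming the installed uapi/libc headers
export SYS_mlock2 and MLOCK_ONFAULT (the fallback value below is an
assumption taken from this series' mman.h additions):

	#include <sys/syscall.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef MLOCK_ONFAULT
	#define MLOCK_ONFAULT 0x01	/* assumed value, see the uapi mman.h changes in this series */
	#endif

	/* sketch: lock a range but let its pages fault in lazily (VM_LOCKONFAULT) */
	static int lock_on_fault(void *addr, size_t len)
	{
		return syscall(SYS_mlock2, addr, len, MLOCK_ONFAULT);
	}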
index 96c856b040819e30f5e9d4dad4ac396569f0eba0..dc6858d6639ed022d65129bdbb869ff7bcc05789 100644 (file)
@@ -888,6 +888,17 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+       {
+               .procname       = "hardlockup_panic",
+               .data           = &hardlockup_panic,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif
 #ifdef CONFIG_SMP
        {
                .procname       = "softlockup_all_cpu_backtrace",
@@ -898,6 +909,15 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
+       {
+               .procname       = "hardlockup_all_cpu_backtrace",
+               .data           = &sysctl_hardlockup_all_cpu_backtrace,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
 #endif /* CONFIG_SMP */
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
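
The two new kern_table entries surface as /proc/sys/kernel/hardlockup_panic
and /proc/sys/kernel/hardlockup_all_cpu_backtrace. A small userspace sketch
for enabling the all-CPU backtrace at runtime (the helper name is
illustrative; the proc paths follow from the procnames above):

	#include <fcntl.h>
	#include <unistd.h>

	/* sketch: ask for a backtrace of every CPU when a hard lockup is detected */
	static int enable_hardlockup_backtrace(void)
	{
		int fd = open("/proc/sys/kernel/hardlockup_all_cpu_backtrace", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, "1\n", 2) != 2) {
			close(fd);
			return -1;
		}
		return close(fd);
	}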
index 64ed1c37bd1fdc6c2874f797e987b8bd4c4c5308..0a23125369f14d4614131f407aa7f1ec87984fb1 100644 (file)
@@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
 
 #ifdef CONFIG_SMP
 int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
 #else
 #define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
 #endif
 static struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -110,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn;
  * Should we panic when a soft-lockup or hard-lockup occurs:
  */
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic =
+unsigned int __read_mostly hardlockup_panic =
                        CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
 /*
  * We may not want to enable hard lockup detection by default in all cases,
  * for example when running the kernel as a guest on a hypervisor. In these
@@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
        return 1;
 }
 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+       sysctl_hardlockup_all_cpu_backtrace =
+               !!simple_strtol(str, NULL, 0);
+       return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 #endif
 
 /*
@@ -263,15 +273,15 @@ void touch_softlockup_watchdog_sync(void)
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 /* watchdog detector functions */
-static int is_hardlockup(void)
+static bool is_hardlockup(void)
 {
        unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
 
        if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
-               return 1;
+               return true;
 
        __this_cpu_write(hrtimer_interrupts_saved, hrint);
-       return 0;
+       return false;
 }
 #endif
 
@@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
         */
        if (is_hardlockup()) {
                int this_cpu = smp_processor_id();
+               struct pt_regs *regs = get_irq_regs();
 
                /* only print hardlockups once */
                if (__this_cpu_read(hard_watchdog_warn) == true)
                        return;
 
-               if (hardlockup_panic)
-                       panic("Watchdog detected hard LOCKUP on cpu %d",
-                             this_cpu);
+               pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+               print_modules();
+               print_irqtrace_events(current);
+               if (regs)
+                       show_regs(regs);
                else
-                       WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
-                            this_cpu);
+                       dump_stack();
+
+               /*
+                * Perform all-CPU dump only once to avoid multiple hardlockups
+                * generating interleaving traces
+                */
+               if (sysctl_hardlockup_all_cpu_backtrace &&
+                               !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+                       trigger_allbutself_cpu_backtrace();
+
+               if (hardlockup_panic)
+                       panic("Hard LOCKUP");
 
                __this_cpu_write(hard_watchdog_warn, true);
                return;
@@ -347,6 +370,9 @@ static void watchdog_interrupt_count(void)
 static int watchdog_nmi_enable(unsigned int cpu);
 static void watchdog_nmi_disable(unsigned int cpu);
 
+static int watchdog_enable_all_cpus(void);
+static void watchdog_disable_all_cpus(void);
+
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
@@ -651,6 +677,12 @@ static struct smp_hotplug_thread watchdog_threads = {
 
 /*
  * park all watchdog threads that are specified in 'watchdog_cpumask'
+ *
+ * This function returns an error if kthread_park() of a watchdog thread
+ * fails. In this situation, the watchdog threads of some CPUs can already
+ * be parked and the watchdog threads of other CPUs can still be runnable.
+ * Callers are expected to handle this special condition as appropriate in
+ * their context.
  */
 static int watchdog_park_threads(void)
 {
@@ -662,10 +694,6 @@ static int watchdog_park_threads(void)
                if (ret)
                        break;
        }
-       if (ret) {
-               for_each_watchdog_cpu(cpu)
-                       kthread_unpark(per_cpu(softlockup_watchdog, cpu));
-       }
        put_online_cpus();
 
        return ret;
@@ -704,6 +732,11 @@ int lockup_detector_suspend(void)
 
        if (ret == 0)
                watchdog_suspended++;
+       else {
+               watchdog_disable_all_cpus();
+               pr_err("Failed to suspend lockup detectors, disabled\n");
+               watchdog_enabled = 0;
+       }
 
        mutex_unlock(&watchdog_proc_mutex);
 
@@ -728,10 +761,17 @@ void lockup_detector_resume(void)
        mutex_unlock(&watchdog_proc_mutex);
 }
 
-static void update_watchdog_all_cpus(void)
+static int update_watchdog_all_cpus(void)
 {
-       watchdog_park_threads();
+       int ret;
+
+       ret = watchdog_park_threads();
+       if (ret)
+               return ret;
+
        watchdog_unpark_threads();
+
+       return 0;
 }
 
 static int watchdog_enable_all_cpus(void)
@@ -750,15 +790,20 @@ static int watchdog_enable_all_cpus(void)
                 * Enable/disable the lockup detectors or
                 * change the sample period 'on the fly'.
                 */
-               update_watchdog_all_cpus();
+               err = update_watchdog_all_cpus();
+
+               if (err) {
+                       watchdog_disable_all_cpus();
+                       pr_err("Failed to update lockup detectors, disabled\n");
+               }
        }
 
+       if (err)
+               watchdog_enabled = 0;
+
        return err;
 }
 
-/* prepare/enable/disable routines */
-/* sysctl functions */
-#ifdef CONFIG_SYSCTL
 static void watchdog_disable_all_cpus(void)
 {
        if (watchdog_running) {
@@ -767,6 +812,8 @@ static void watchdog_disable_all_cpus(void)
        }
 }
 
+#ifdef CONFIG_SYSCTL
+
 /*
  * Update the run state of the lockup detectors.
  */
@@ -849,12 +896,13 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
                } while (cmpxchg(&watchdog_enabled, old, new) != old);
 
                /*
-                * Update the run state of the lockup detectors.
-                * Restore 'watchdog_enabled' on failure.
+                * Update the run state of the lockup detectors. There is _no_
+                * need to check the value returned by proc_watchdog_update()
+                * and to restore the previous value of 'watchdog_enabled' as
+                * both lockup detectors are disabled if proc_watchdog_update()
+                * returns an error.
                 */
                err = proc_watchdog_update();
-               if (err)
-                       watchdog_enabled = old;
        }
 out:
        mutex_unlock(&watchdog_proc_mutex);
@@ -914,13 +962,14 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
                goto out;
 
        /*
-        * Update the sample period.
-        * Restore 'watchdog_thresh' on failure.
+        * Update the sample period. Restore on failure.
         */
        set_sample_period();
        err = proc_watchdog_update();
-       if (err)
+       if (err) {
                watchdog_thresh = old;
+               set_sample_period();
+       }
 out:
        mutex_unlock(&watchdog_proc_mutex);
        return err;
index f0df318104e7272ef97641421c938069a047a85e..f7e64c7748cb9e73e8b2767355171f271cc54559 100644 (file)
@@ -185,6 +185,13 @@ config CRC8
          when they need to do cyclic redundancy check according CRC8
          algorithm. Module will be called crc8.
 
+config CRC64_ECMA
+       tristate "CRC64 ECMA function"
+       help
+         This option provides CRC64 ECMA function. Drivers may select this
+         when they need to do cyclic redundancy check according to the CRC64
+         ECMA algorithm.
+
 config AUDIT_GENERIC
        bool
        depends on AUDIT && !AUDIT_ARCH
index 565783733cd013166b1edc367ba3bc919215d2bd..526105c18566c16b0bdd6f0dc0d958b262724e0d 100644 (file)
@@ -580,6 +580,14 @@ config DEBUG_VM_RB
 
          If unsure, say N.
 
+config DEBUG_VM_PGFLAGS
+       bool "Debug page-flags operations"
+       depends on DEBUG_VM
+       help
+         Enables extra validation on page flags operations.
+
+         If unsure, say N.
+
 config DEBUG_VIRTUAL
        bool "Debug VM translations"
        depends on DEBUG_KERNEL && X86
@@ -1695,6 +1703,9 @@ config TEST_STRING_HELPERS
 config TEST_KSTRTOX
        tristate "Test kstrto*() family of functions at runtime"
 
+config TEST_PRINTF
+       tristate "Test printf() family of functions at runtime"
+
 config TEST_RHASHTABLE
        tristate "Perform selftest on resizable hash table"
        default n
@@ -1762,6 +1773,16 @@ config DMA_API_DEBUG
 
          If unsure, say N.
 
+config DMA_API_DEBUG_POISON
+       bool "Poison coherent DMA buffers"
+       depends on DMA_API_DEBUG && EXPERT
+       help
+         Poison DMA buffers returned by dma_alloc_coherent unless __GFP_ZERO
+         is explicitly specified, to catch drivers depending on zeroed buffers
+         without passing the correct flags.
+
+         Only say Y if you're prepared for almost everything to break.
+
 config TEST_LKM
        tristate "Test module loading with 'hello world' module"
        default n
index 39f24d6721e5a29d8016811d4f5332fca66c2ec2..0fee5acd5aa09e2bd65ade1200cae5e48bacfa22 100644 (file)
@@ -15,8 +15,7 @@ config KASAN
          global variables requires gcc 5.0 or later.
          This feature consumes about 1/8 of available memory and brings about
          ~x3 performance slowdown.
-         For better error detection enable CONFIG_STACKTRACE,
-         and add slub_debug=U to boot cmdline.
+         For better error detection enable CONFIG_STACKTRACE.
 
 choice
        prompt "Instrumentation type"
index 8de3b012eac77ed2c14160022d0cc6b9f75773c6..8498a5c9815a3c0273d99fd45c3d10bdf42c3375 100644 (file)
@@ -42,6 +42,7 @@ obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
 obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
 obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
 obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
+obj-$(CONFIG_TEST_PRINTF) += test_printf.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
@@ -82,6 +83,7 @@ obj-$(CONFIG_CRC32)   += crc32.o
 obj-$(CONFIG_CRC7)     += crc7.o
 obj-$(CONFIG_LIBCRC32C)        += libcrc32c.o
 obj-$(CONFIG_CRC8)     += crc8.o
+obj-$(CONFIG_CRC64_ECMA)       += crc64_ecma.o
 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
 
 obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644 (file)
index 0000000..41629ea
--- /dev/null
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK                        0xFF
+#define CRC64_TABLE_SIZE               256
+
+
+struct crc64_table {
+       u64 seed;
+       u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+       CRC64_DEFAULT_INITVAL,
+       {
+               0x0000000000000000ULL,
+               0xb32e4cbe03a75f6fULL,
+               0xf4843657a840a05bULL,
+               0x47aa7ae9abe7ff34ULL,
+               0x7bd0c384ff8f5e33ULL,
+               0xc8fe8f3afc28015cULL,
+               0x8f54f5d357cffe68ULL,
+               0x3c7ab96d5468a107ULL,
+               0xf7a18709ff1ebc66ULL,
+               0x448fcbb7fcb9e309ULL,
+               0x0325b15e575e1c3dULL,
+               0xb00bfde054f94352ULL,
+               0x8c71448d0091e255ULL,
+               0x3f5f08330336bd3aULL,
+               0x78f572daa8d1420eULL,
+               0xcbdb3e64ab761d61ULL,
+               0x7d9ba13851336649ULL,
+               0xceb5ed8652943926ULL,
+               0x891f976ff973c612ULL,
+               0x3a31dbd1fad4997dULL,
+               0x064b62bcaebc387aULL,
+               0xb5652e02ad1b6715ULL,
+               0xf2cf54eb06fc9821ULL,
+               0x41e11855055bc74eULL,
+               0x8a3a2631ae2dda2fULL,
+               0x39146a8fad8a8540ULL,
+               0x7ebe1066066d7a74ULL,
+               0xcd905cd805ca251bULL,
+               0xf1eae5b551a2841cULL,
+               0x42c4a90b5205db73ULL,
+               0x056ed3e2f9e22447ULL,
+               0xb6409f5cfa457b28ULL,
+               0xfb374270a266cc92ULL,
+               0x48190ecea1c193fdULL,
+               0x0fb374270a266cc9ULL,
+               0xbc9d3899098133a6ULL,
+               0x80e781f45de992a1ULL,
+               0x33c9cd4a5e4ecdceULL,
+               0x7463b7a3f5a932faULL,
+               0xc74dfb1df60e6d95ULL,
+               0x0c96c5795d7870f4ULL,
+               0xbfb889c75edf2f9bULL,
+               0xf812f32ef538d0afULL,
+               0x4b3cbf90f69f8fc0ULL,
+               0x774606fda2f72ec7ULL,
+               0xc4684a43a15071a8ULL,
+               0x83c230aa0ab78e9cULL,
+               0x30ec7c140910d1f3ULL,
+               0x86ace348f355aadbULL,
+               0x3582aff6f0f2f5b4ULL,
+               0x7228d51f5b150a80ULL,
+               0xc10699a158b255efULL,
+               0xfd7c20cc0cdaf4e8ULL,
+               0x4e526c720f7dab87ULL,
+               0x09f8169ba49a54b3ULL,
+               0xbad65a25a73d0bdcULL,
+               0x710d64410c4b16bdULL,
+               0xc22328ff0fec49d2ULL,
+               0x85895216a40bb6e6ULL,
+               0x36a71ea8a7ace989ULL,
+               0x0adda7c5f3c4488eULL,
+               0xb9f3eb7bf06317e1ULL,
+               0xfe5991925b84e8d5ULL,
+               0x4d77dd2c5823b7baULL,
+               0x64b62bcaebc387a1ULL,
+               0xd7986774e864d8ceULL,
+               0x90321d9d438327faULL,
+               0x231c512340247895ULL,
+               0x1f66e84e144cd992ULL,
+               0xac48a4f017eb86fdULL,
+               0xebe2de19bc0c79c9ULL,
+               0x58cc92a7bfab26a6ULL,
+               0x9317acc314dd3bc7ULL,
+               0x2039e07d177a64a8ULL,
+               0x67939a94bc9d9b9cULL,
+               0xd4bdd62abf3ac4f3ULL,
+               0xe8c76f47eb5265f4ULL,
+               0x5be923f9e8f53a9bULL,
+               0x1c4359104312c5afULL,
+               0xaf6d15ae40b59ac0ULL,
+               0x192d8af2baf0e1e8ULL,
+               0xaa03c64cb957be87ULL,
+               0xeda9bca512b041b3ULL,
+               0x5e87f01b11171edcULL,
+               0x62fd4976457fbfdbULL,
+               0xd1d305c846d8e0b4ULL,
+               0x96797f21ed3f1f80ULL,
+               0x2557339fee9840efULL,
+               0xee8c0dfb45ee5d8eULL,
+               0x5da24145464902e1ULL,
+               0x1a083bacedaefdd5ULL,
+               0xa9267712ee09a2baULL,
+               0x955cce7fba6103bdULL,
+               0x267282c1b9c65cd2ULL,
+               0x61d8f8281221a3e6ULL,
+               0xd2f6b4961186fc89ULL,
+               0x9f8169ba49a54b33ULL,
+               0x2caf25044a02145cULL,
+               0x6b055fede1e5eb68ULL,
+               0xd82b1353e242b407ULL,
+               0xe451aa3eb62a1500ULL,
+               0x577fe680b58d4a6fULL,
+               0x10d59c691e6ab55bULL,
+               0xa3fbd0d71dcdea34ULL,
+               0x6820eeb3b6bbf755ULL,
+               0xdb0ea20db51ca83aULL,
+               0x9ca4d8e41efb570eULL,
+               0x2f8a945a1d5c0861ULL,
+               0x13f02d374934a966ULL,
+               0xa0de61894a93f609ULL,
+               0xe7741b60e174093dULL,
+               0x545a57dee2d35652ULL,
+               0xe21ac88218962d7aULL,
+               0x5134843c1b317215ULL,
+               0x169efed5b0d68d21ULL,
+               0xa5b0b26bb371d24eULL,
+               0x99ca0b06e7197349ULL,
+               0x2ae447b8e4be2c26ULL,
+               0x6d4e3d514f59d312ULL,
+               0xde6071ef4cfe8c7dULL,
+               0x15bb4f8be788911cULL,
+               0xa6950335e42fce73ULL,
+               0xe13f79dc4fc83147ULL,
+               0x521135624c6f6e28ULL,
+               0x6e6b8c0f1807cf2fULL,
+               0xdd45c0b11ba09040ULL,
+               0x9aefba58b0476f74ULL,
+               0x29c1f6e6b3e0301bULL,
+               0xc96c5795d7870f42ULL,
+               0x7a421b2bd420502dULL,
+               0x3de861c27fc7af19ULL,
+               0x8ec62d7c7c60f076ULL,
+               0xb2bc941128085171ULL,
+               0x0192d8af2baf0e1eULL,
+               0x4638a2468048f12aULL,
+               0xf516eef883efae45ULL,
+               0x3ecdd09c2899b324ULL,
+               0x8de39c222b3eec4bULL,
+               0xca49e6cb80d9137fULL,
+               0x7967aa75837e4c10ULL,
+               0x451d1318d716ed17ULL,
+               0xf6335fa6d4b1b278ULL,
+               0xb199254f7f564d4cULL,
+               0x02b769f17cf11223ULL,
+               0xb4f7f6ad86b4690bULL,
+               0x07d9ba1385133664ULL,
+               0x4073c0fa2ef4c950ULL,
+               0xf35d8c442d53963fULL,
+               0xcf273529793b3738ULL,
+               0x7c0979977a9c6857ULL,
+               0x3ba3037ed17b9763ULL,
+               0x888d4fc0d2dcc80cULL,
+               0x435671a479aad56dULL,
+               0xf0783d1a7a0d8a02ULL,
+               0xb7d247f3d1ea7536ULL,
+               0x04fc0b4dd24d2a59ULL,
+               0x3886b22086258b5eULL,
+               0x8ba8fe9e8582d431ULL,
+               0xcc0284772e652b05ULL,
+               0x7f2cc8c92dc2746aULL,
+               0x325b15e575e1c3d0ULL,
+               0x8175595b76469cbfULL,
+               0xc6df23b2dda1638bULL,
+               0x75f16f0cde063ce4ULL,
+               0x498bd6618a6e9de3ULL,
+               0xfaa59adf89c9c28cULL,
+               0xbd0fe036222e3db8ULL,
+               0x0e21ac88218962d7ULL,
+               0xc5fa92ec8aff7fb6ULL,
+               0x76d4de52895820d9ULL,
+               0x317ea4bb22bfdfedULL,
+               0x8250e80521188082ULL,
+               0xbe2a516875702185ULL,
+               0x0d041dd676d77eeaULL,
+               0x4aae673fdd3081deULL,
+               0xf9802b81de97deb1ULL,
+               0x4fc0b4dd24d2a599ULL,
+               0xfceef8632775faf6ULL,
+               0xbb44828a8c9205c2ULL,
+               0x086ace348f355aadULL,
+               0x34107759db5dfbaaULL,
+               0x873e3be7d8faa4c5ULL,
+               0xc094410e731d5bf1ULL,
+               0x73ba0db070ba049eULL,
+               0xb86133d4dbcc19ffULL,
+               0x0b4f7f6ad86b4690ULL,
+               0x4ce50583738cb9a4ULL,
+               0xffcb493d702be6cbULL,
+               0xc3b1f050244347ccULL,
+               0x709fbcee27e418a3ULL,
+               0x3735c6078c03e797ULL,
+               0x841b8ab98fa4b8f8ULL,
+               0xadda7c5f3c4488e3ULL,
+               0x1ef430e13fe3d78cULL,
+               0x595e4a08940428b8ULL,
+               0xea7006b697a377d7ULL,
+               0xd60abfdbc3cbd6d0ULL,
+               0x6524f365c06c89bfULL,
+               0x228e898c6b8b768bULL,
+               0x91a0c532682c29e4ULL,
+               0x5a7bfb56c35a3485ULL,
+               0xe955b7e8c0fd6beaULL,
+               0xaeffcd016b1a94deULL,
+               0x1dd181bf68bdcbb1ULL,
+               0x21ab38d23cd56ab6ULL,
+               0x9285746c3f7235d9ULL,
+               0xd52f0e859495caedULL,
+               0x6601423b97329582ULL,
+               0xd041dd676d77eeaaULL,
+               0x636f91d96ed0b1c5ULL,
+               0x24c5eb30c5374ef1ULL,
+               0x97eba78ec690119eULL,
+               0xab911ee392f8b099ULL,
+               0x18bf525d915feff6ULL,
+               0x5f1528b43ab810c2ULL,
+               0xec3b640a391f4fadULL,
+               0x27e05a6e926952ccULL,
+               0x94ce16d091ce0da3ULL,
+               0xd3646c393a29f297ULL,
+               0x604a2087398eadf8ULL,
+               0x5c3099ea6de60cffULL,
+               0xef1ed5546e415390ULL,
+               0xa8b4afbdc5a6aca4ULL,
+               0x1b9ae303c601f3cbULL,
+               0x56ed3e2f9e224471ULL,
+               0xe5c372919d851b1eULL,
+               0xa26908783662e42aULL,
+               0x114744c635c5bb45ULL,
+               0x2d3dfdab61ad1a42ULL,
+               0x9e13b115620a452dULL,
+               0xd9b9cbfcc9edba19ULL,
+               0x6a978742ca4ae576ULL,
+               0xa14cb926613cf817ULL,
+               0x1262f598629ba778ULL,
+               0x55c88f71c97c584cULL,
+               0xe6e6c3cfcadb0723ULL,
+               0xda9c7aa29eb3a624ULL,
+               0x69b2361c9d14f94bULL,
+               0x2e184cf536f3067fULL,
+               0x9d36004b35545910ULL,
+               0x2b769f17cf112238ULL,
+               0x9858d3a9ccb67d57ULL,
+               0xdff2a94067518263ULL,
+               0x6cdce5fe64f6dd0cULL,
+               0x50a65c93309e7c0bULL,
+               0xe388102d33392364ULL,
+               0xa4226ac498dedc50ULL,
+               0x170c267a9b79833fULL,
+               0xdcd7181e300f9e5eULL,
+               0x6ff954a033a8c131ULL,
+               0x28532e49984f3e05ULL,
+               0x9b7d62f79be8616aULL,
+               0xa707db9acf80c06dULL,
+               0x14299724cc279f02ULL,
+               0x5383edcd67c06036ULL,
+               0xe0ada17364673f59ULL
+       }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+       return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+       unsigned int i;
+       u64 crc = seed;
+
+       for (i = 0; i < nbytes; i++)
+               crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+                       (crc >> 8);
+
+       return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
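
A minimal usage sketch for the new helpers; the buffer names are
illustrative, and the seed-in/CRC-out form lets callers checksum scattered
data in chunks:

	#include <linux/crc64_ecma.h>

	/* sketch: accumulate one CRC64-ECMA value over two buffers */
	static u64 checksum_two_chunks(const u8 *a, u32 alen, const u8 *b, u32 blen)
	{
		u64 crc = crc64_ecma_seed();

		crc = crc64_ecma(a, alen, crc);
		return crc64_ecma(b, blen, crc);
	}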
index fcb65d2a0b947e85e335599229bd3a44a1c425d7..af6262b4e02c62c11bbf6295abc22c38ca4bc5a1 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
+#include <linux/poison.h>
 #include <linux/slab.h>
 
 #include <asm/sections.h>
@@ -1249,6 +1250,14 @@ static void check_sync(struct device *dev,
                                dir2name[entry->direction],
                                dir2name[ref->direction]);
 
+       if (ref->sg_call_ents && ref->type == dma_debug_sg &&
+           ref->sg_call_ents != entry->sg_call_ents) {
+               err_printk(ref->dev, entry, "DMA-API: device driver syncs "
+                          "DMA sg list with different entry count "
+                          "[map count=%d] [sync count=%d]\n",
+                          entry->sg_call_ents, ref->sg_call_ents);
+       }
+
 out:
        put_hash_bucket(bucket, &flags);
 }
@@ -1439,7 +1448,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 EXPORT_SYMBOL(debug_dma_unmap_sg);
 
 void debug_dma_alloc_coherent(struct device *dev, size_t size,
-                             dma_addr_t dma_addr, void *virt)
+                             dma_addr_t dma_addr, void *virt, gfp_t flags)
 {
        struct dma_debug_entry *entry;
 
@@ -1449,6 +1458,9 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
        if (unlikely(virt == NULL))
                return;
 
+       if (IS_ENABLED(CONFIG_DMA_API_DEBUG_POISON) && !(flags & __GFP_ZERO))
+               memset(virt, DMA_ALLOC_POISON, size);
+
        entry = dma_entry_alloc();
        if (!entry)
                return;
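
With CONFIG_DMA_API_DEBUG_POISON enabled, debug_dma_alloc_coherent() now
poisons any coherent buffer that was not requested with __GFP_ZERO, so a
driver that silently assumed zeroed memory fails loudly. A hedged
caller-side sketch (the function name is illustrative):

	#include <linux/dma-mapping.h>

	/* sketch: only the __GFP_ZERO request is guaranteed to come back
	 * zero-filled once the poisoning option is enabled */
	static void *alloc_zeroed_ring(struct device *dev, size_t size, dma_addr_t *dma)
	{
		return dma_alloc_coherent(dev, size, dma, GFP_KERNEL | __GFP_ZERO);
	}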
index e491e02eff549fbe8c2e245c3f475b908818938f..e3952e9c8ec04256e656ada5ca39ccdd62843093 100644 (file)
@@ -42,7 +42,7 @@ extern struct _ddebug __stop___verbose[];
 
 struct ddebug_table {
        struct list_head link;
-       char *mod_name;
+       const char *mod_name;
        unsigned int num_ddebugs;
        struct _ddebug *ddebugs;
 };
@@ -841,12 +841,12 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
                             const char *name)
 {
        struct ddebug_table *dt;
-       char *new_name;
+       const char *new_name;
 
        dt = kzalloc(sizeof(*dt), GFP_KERNEL);
        if (dt == NULL)
                return -ENOMEM;
-       new_name = kstrdup(name, GFP_KERNEL);
+       new_name = kstrdup_const(name, GFP_KERNEL);
        if (new_name == NULL) {
                kfree(dt);
                return -ENOMEM;
@@ -907,7 +907,7 @@ int ddebug_dyndbg_module_param_cb(char *param, char *val, const char *module)
 static void ddebug_table_free(struct ddebug_table *dt)
 {
        list_del_init(&dt->link);
-       kfree(dt->mod_name);
+       kfree_const(dt->mod_name);
        kfree(dt);
 }
 
index a8fe6274a13cfe4265f8f60a7726ba8f1def6edb..137e861d96902241455d8d4ffb01d5b79a4ef2e3 100644 (file)
@@ -1,6 +1,7 @@
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <linux/cryptohash.h>
+#include <linux/bitops.h>
 
 /* F, G and H are basic MD4 functions: selection, majority, parity */
 #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
@@ -14,7 +15,7 @@
  * Rotation is separate from addition to prevent recomputation
  */
 #define ROUND(f, a, b, c, d, x, s)     \
-       (a += f(b, c, d) + x, a = (a << s) | (a >> (32 - s)))
+       (a += f(b, c, d) + x, a = rol32(a, s))
 #define K1 0
 #define K2 013240474631UL
 #define K3 015666365641UL
index 5335c43adf46af55bb90717aa63afe33a6a30e9c..6098336df2672e09f0d022aa5c1a1c8252aa73c7 100644 (file)
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -399,7 +399,7 @@ void idr_preload(gfp_t gfp_mask)
         * allocation guarantee.  Disallow usage from those contexts.
         */
        WARN_ON_ONCE(in_interrupt());
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(gfp_mask));
 
        preempt_disable();
 
@@ -453,7 +453,7 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask)
        struct idr_layer *pa[MAX_IDR_LEVEL + 1];
        int id;
 
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(gfp_mask));
 
        /* sanity checks */
        if (WARN_ON_ONCE(start < 0))
index bd2bea963364c757e90db1e47ce557a94d053c0b..391fd23976a2c00fc9c08cfe6e2a2ab44953c5cf 100644 (file)
@@ -36,8 +36,7 @@ bool current_is_single_threaded(void)
                if (unlikely(p == task->group_leader))
                        continue;
 
-               t = p;
-               do {
+               for_each_thread(p, t) {
                        if (unlikely(t->mm == mm))
                                goto found;
                        if (likely(t->mm))
@@ -48,7 +47,7 @@ bool current_is_single_threaded(void)
                         * forked before exiting.
                         */
                        smp_rmb();
-               } while_each_thread(p, t);
+               }
        }
        ret = true;
 found:
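
The replacement above swaps the open-coded do { } while_each_thread() loop
for for_each_thread(). A minimal sketch of the new iteration idiom under RCU
(the callback name is illustrative):

	#include <linux/sched.h>
	#include <linux/rcupdate.h>

	/* sketch: visit every thread in the same thread group as p */
	static void visit_threads(struct task_struct *p,
				  void (*fn)(struct task_struct *t))
	{
		struct task_struct *t;

		rcu_read_lock();
		for_each_thread(p, t)
			fn(t);
		rcu_read_unlock();
	}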
index 32f12150fc4f4a82aace1ff42afc17969f8692a3..f194e6e593e19db22ad6b7e50aeee0e7c46eafa6 100644 (file)
@@ -31,6 +31,22 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 }
 EXPORT_SYMBOL(kvasprintf);
 
+/*
+ * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt
+ * (or the sole vararg) points to rodata, we will then save a memory
+ * allocation and string copy. In any case, the return value should be
+ * freed using kfree_const().
+ */
+const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap)
+{
+       if (!strchr(fmt, '%'))
+               return kstrdup_const(fmt, gfp);
+       if (!strcmp(fmt, "%s"))
+               return kstrdup_const(va_arg(ap, const char*), gfp);
+       return kvasprintf(gfp, fmt, ap);
+}
+EXPORT_SYMBOL(kvasprintf_const);
+
 char *kasprintf(gfp_t gfp, const char *fmt, ...)
 {
        va_list ap;
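
kvasprintf_const() may hand back a pointer into .rodata, so the result must
be released with kfree_const() rather than kfree(), as the comment above
notes. A hedged sketch of the intended pairing (the wrapper name is
illustrative):

	#include <linux/kernel.h>
	#include <linux/string.h>

	/* sketch: format a possibly-constant name; release it with kfree_const() */
	static const char *fmt_name(gfp_t gfp, const char *fmt, ...)
	{
		va_list ap;
		const char *name;

		va_start(ap, fmt);
		name = kvasprintf_const(gfp, fmt, ap);
		va_end(ap);
		return name;
	}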
index 0554077462669074d4df0fd01e2d087d7f997837..7cbccd2b4c72042595484e32c2e11906948414b7 100644 (file)
@@ -257,18 +257,32 @@ static int kobject_add_internal(struct kobject *kobj)
 int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
                                  va_list vargs)
 {
-       char *s;
+       const char *s;
 
        if (kobj->name && !fmt)
                return 0;
 
-       s = kvasprintf(GFP_KERNEL, fmt, vargs);
+       s = kvasprintf_const(GFP_KERNEL, fmt, vargs);
        if (!s)
                return -ENOMEM;
 
-       /* ewww... some of these buggers have '/' in the name ... */
-       strreplace(s, '/', '!');
-       kfree(kobj->name);
+       /*
+        * ewww... some of these buggers have '/' in the name ... If
+        * that's the case, we need to make sure we have an actual
+        * allocated copy to modify, since kvasprintf_const may have
+        * returned something from .rodata.
+        */
+       if (strchr(s, '/')) {
+               char *t;
+
+               t = kstrdup(s, GFP_KERNEL);
+               kfree_const(s);
+               if (!t)
+                       return -ENOMEM;
+               strreplace(t, '/', '!');
+               s = t;
+       }
+       kfree_const(kobj->name);
        kobj->name = s;
 
        return 0;
@@ -466,7 +480,7 @@ int kobject_rename(struct kobject *kobj, const char *new_name)
        envp[0] = devpath_string;
        envp[1] = NULL;
 
-       name = dup_name = kstrdup(new_name, GFP_KERNEL);
+       name = dup_name = kstrdup_const(new_name, GFP_KERNEL);
        if (!name) {
                error = -ENOMEM;
                goto out;
@@ -486,7 +500,7 @@ int kobject_rename(struct kobject *kobj, const char *new_name)
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);
 
 out:
-       kfree(dup_name);
+       kfree_const(dup_name);
        kfree(devpath_string);
        kfree(devpath);
        kobject_put(kobj);
@@ -634,7 +648,7 @@ static void kobject_cleanup(struct kobject *kobj)
        /* free name if we allocated it */
        if (name) {
                pr_debug("kobject: '%s': free name\n", name);
-               kfree(name);
+               kfree_const(name);
        }
 }
 
index 0b0e9779d6753b1a3982551afdbb685e0b6a209e..ae5872b1df0c669fc8365ce1754d2a128651a890 100644 (file)
@@ -66,12 +66,12 @@ struct llist_node *llist_del_first(struct llist_head *head)
 {
        struct llist_node *entry, *old_entry, *next;
 
-       entry = head->first;
+       entry = smp_load_acquire(&head->first);
        for (;;) {
                if (entry == NULL)
                        return NULL;
                old_entry = entry;
-               next = entry->next;
+               next = READ_ONCE(entry->next);
                entry = cmpxchg(&head->first, old_entry, next);
                if (entry == old_entry)
                        break;
index f75715131f2094cc3b76d2d1f14ba31e1ccf3744..6d40944960de77bff090f41a0f94f0f85e8522a4 100644 (file)
@@ -135,7 +135,7 @@ static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags)
  * TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, of course).
  *
  * @gfp indicates whether or not to wait until a free id is available (it's not
- * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep
+ * used for internal memory allocations); thus if passed __GFP_RECLAIM we may sleep
  * however long it takes until another thread frees an id (same semantics as a
  * mempool).
  *
index f9ebe1c82060ec330ac7ae7a9d1678f18058cf65..fcf5d98574ce46871dca087d2c803dbfb67c0b81 100644 (file)
@@ -188,7 +188,7 @@ radix_tree_node_alloc(struct radix_tree_root *root)
         * preloading in the interrupt anyway as all the allocations have to
         * be atomic. So just do normal allocation when in interrupt.
         */
-       if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
+       if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
                struct radix_tree_preload *rtp;
 
                /*
@@ -249,7 +249,7 @@ radix_tree_node_free(struct radix_tree_node *node)
  * with preemption not disabled.
  *
  * To make use of this facility, the radix tree must be initialised without
- * __GFP_WAIT being passed to INIT_RADIX_TREE().
+ * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
  */
 static int __radix_tree_preload(gfp_t gfp_mask)
 {
@@ -286,12 +286,12 @@ out:
  * with preemption not disabled.
  *
  * To make use of this facility, the radix tree must be initialised without
- * __GFP_WAIT being passed to INIT_RADIX_TREE().
+ * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
  */
 int radix_tree_preload(gfp_t gfp_mask)
 {
        /* Warn on non-sensical use... */
-       WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT));
+       WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
        return __radix_tree_preload(gfp_mask);
 }
 EXPORT_SYMBOL(radix_tree_preload);
@@ -303,7 +303,7 @@ EXPORT_SYMBOL(radix_tree_preload);
  */
 int radix_tree_maybe_preload(gfp_t gfp_mask)
 {
-       if (gfp_mask & __GFP_WAIT)
+       if (gfpflags_allow_blocking(gfp_mask))
                return __radix_tree_preload(gfp_mask);
        /* Preloading doesn't help anything with this gfp mask, skip it */
        preempt_disable();
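
gfpflags_allow_blocking() replaces direct tests of __GFP_WAIT as the way to
ask whether an allocation context may sleep, as the conversions above show.
A small sketch of the usual pattern in an allocation helper (the function
and cache names are illustrative):

	#include <linux/gfp.h>
	#include <linux/kernel.h>
	#include <linux/slab.h>

	/* sketch: warn if a blocking allocation is attempted from atomic context */
	static void *cache_alloc_checked(struct kmem_cache *cache, gfp_t gfp_mask)
	{
		might_sleep_if(gfpflags_allow_blocking(gfp_mask));
		return kmem_cache_alloc(cache, gfp_mask);
	}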
index 8e376efd88a40bde12b819861fedff4b679792ab..98866a770770c8bba0acf7bdbacea4699d9f14bf 100644 (file)
@@ -326,6 +326,39 @@ out:
        kfree(out_test);
 }
 
+#define string_get_size_maxbuf 16
+#define test_string_get_size_one(size, blk_size, units, exp_result)            \
+       do {                                                                   \
+               BUILD_BUG_ON(sizeof(exp_result) >= string_get_size_maxbuf);    \
+               __test_string_get_size((size), (blk_size), (units),            \
+                                      (exp_result));                          \
+       } while (0)
+
+
+static __init void __test_string_get_size(const u64 size, const u64 blk_size,
+                                         const enum string_size_units units,
+                                         const char *exp_result)
+{
+       char buf[string_get_size_maxbuf];
+
+       string_get_size(size, blk_size, units, buf, sizeof(buf));
+       if (!memcmp(buf, exp_result, strlen(exp_result) + 1))
+               return;
+
+       buf[sizeof(buf) - 1] = '\0';
+       pr_warn("Test 'test_string_get_size_one' failed!\n");
+       pr_warn("string_get_size(size = %llu, blk_size = %llu, units = %d\n",
+               size, blk_size, units);
+       pr_warn("expected: '%s', got '%s'\n", exp_result, buf);
+}
+
+static __init void test_string_get_size(void)
+{
+       test_string_get_size_one(16384, 512, STRING_UNITS_2, "8.00 MiB");
+       test_string_get_size_one(8192, 4096, STRING_UNITS_10, "32.7 MB");
+       test_string_get_size_one(1, 512, STRING_UNITS_10, "512 B");
+}
+
 static int __init test_string_helpers_init(void)
 {
        unsigned int i;
@@ -344,6 +377,9 @@ static int __init test_string_helpers_init(void)
        for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++)
                test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1);
 
+       /* Test string_get_size() */
+       test_string_get_size();
+
        return -EINVAL;
 }
 module_init(test_string_helpers_init);
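
The expected strings in the new tests come straight from string_get_size();
a minimal sketch of the call being exercised (the helper name is
illustrative):

	#include <linux/string_helpers.h>
	#include <linux/printk.h>

	/* sketch: pretty-print a capacity given in 512-byte sectors,
	 * e.g. 16384 sectors prints as "8.00 MiB" per the test above */
	static void show_capacity(u64 sectors)
	{
		char buf[16];

		string_get_size(sectors, 512, STRING_UNITS_2, buf, sizeof(buf));
		pr_info("capacity: %s\n", buf);
	}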
index c1efb1b610179013baf5d662f40f739a5f9abc60..c32f3b0048dc826cdad5d04a3c4d15a7ecbe9645 100644 (file)
@@ -138,6 +138,71 @@ static noinline void __init kmalloc_oob_16(void)
        kfree(ptr2);
 }
 
+static noinline void __init kmalloc_oob_memset_2(void)
+{
+       char *ptr;
+       size_t size = 8;
+
+       pr_info("out-of-bounds in memset2\n");
+       ptr = kmalloc(size, GFP_KERNEL);
+       if (!ptr) {
+               pr_err("Allocation failed\n");
+               return;
+       }
+
+       memset(ptr+7, 0, 2);
+       kfree(ptr);
+}
+
+static noinline void __init kmalloc_oob_memset_4(void)
+{
+       char *ptr;
+       size_t size = 8;
+
+       pr_info("out-of-bounds in memset4\n");
+       ptr = kmalloc(size, GFP_KERNEL);
+       if (!ptr) {
+               pr_err("Allocation failed\n");
+               return;
+       }
+
+       memset(ptr+5, 0, 4);
+       kfree(ptr);
+}
+
+
+static noinline void __init kmalloc_oob_memset_8(void)
+{
+       char *ptr;
+       size_t size = 8;
+
+       pr_info("out-of-bounds in memset8\n");
+       ptr = kmalloc(size, GFP_KERNEL);
+       if (!ptr) {
+               pr_err("Allocation failed\n");
+               return;
+       }
+
+       memset(ptr+1, 0, 8);
+       kfree(ptr);
+}
+
+static noinline void __init kmalloc_oob_memset_16(void)
+{
+       char *ptr;
+       size_t size = 16;
+
+       pr_info("out-of-bounds in memset16\n");
+       ptr = kmalloc(size, GFP_KERNEL);
+       if (!ptr) {
+               pr_err("Allocation failed\n");
+               return;
+       }
+
+       memset(ptr+1, 0, 16);
+       kfree(ptr);
+}
+
 static noinline void __init kmalloc_oob_in_memset(void)
 {
        char *ptr;
@@ -264,6 +329,10 @@ static int __init kmalloc_tests_init(void)
        kmalloc_oob_krealloc_less();
        kmalloc_oob_16();
        kmalloc_oob_in_memset();
+       kmalloc_oob_memset_2();
+       kmalloc_oob_memset_4();
+       kmalloc_oob_memset_8();
+       kmalloc_oob_memset_16();
        kmalloc_uaf();
        kmalloc_uaf_memset();
        kmalloc_uaf2();
diff --git a/lib/test_printf.c b/lib/test_printf.c
new file mode 100644 (file)
index 0000000..c5a666a
--- /dev/null
@@ -0,0 +1,362 @@
+/*
+ * Test cases for printf facility.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <linux/socket.h>
+#include <linux/in.h>
+
+#define BUF_SIZE 256
+#define FILL_CHAR '$'
+
+#define PTR1 ((void*)0x01234567)
+#define PTR2 ((void*)(long)(int)0xfedcba98)
+
+#if BITS_PER_LONG == 64
+#define PTR1_ZEROES "000000000"
+#define PTR1_SPACES "         "
+#define PTR1_STR "1234567"
+#define PTR2_STR "fffffffffedcba98"
+#define PTR_WIDTH 16
+#else
+#define PTR1_ZEROES "0"
+#define PTR1_SPACES " "
+#define PTR1_STR "1234567"
+#define PTR2_STR "fedcba98"
+#define PTR_WIDTH 8
+#endif
+#define PTR_WIDTH_STR stringify(PTR_WIDTH)
+
+static unsigned total_tests __initdata;
+static unsigned failed_tests __initdata;
+static char *test_buffer __initdata;
+
+static int __printf(4, 0) __init
+do_test(int bufsize, const char *expect, int elen,
+       const char *fmt, va_list ap)
+{
+       va_list aq;
+       int ret, written;
+
+       total_tests++;
+
+       memset(test_buffer, FILL_CHAR, BUF_SIZE);
+       va_copy(aq, ap);
+       ret = vsnprintf(test_buffer, bufsize, fmt, aq);
+       va_end(aq);
+
+       if (ret != elen) {
+               pr_warn("vsnprintf(buf, %d, \"%s\", ...) returned %d, expected %d\n",
+                       bufsize, fmt, ret, elen);
+               return 1;
+       }
+
+       if (!bufsize) {
+               if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE)) {
+                       pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n",
+                               fmt);
+                       return 1;
+               }
+               return 0;
+       }
+
+       written = min(bufsize-1, elen);
+       if (test_buffer[written]) {
+               pr_warn("vsnprintf(buf, %d, \"%s\", ...) did not nul-terminate buffer\n",
+                       bufsize, fmt);
+               return 1;
+       }
+
+       if (memcmp(test_buffer, expect, written)) {
+               pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n",
+                       bufsize, fmt, test_buffer, written, expect);
+               return 1;
+       }
+       return 0;
+}
+
+static void __printf(3, 4) __init
+__test(const char *expect, int elen, const char *fmt, ...)
+{
+       va_list ap;
+       int rand;
+       char *p;
+
+       BUG_ON(elen >= BUF_SIZE);
+
+       va_start(ap, fmt);
+
+       /*
+        * Every fmt+args is subjected to four tests: Three where we
+        * tell vsnprintf varying buffer sizes (plenty, not quite
+        * enough and 0), and then we also test that kvasprintf would
+        * be able to print it as expected.
+        */
+       failed_tests += do_test(BUF_SIZE, expect, elen, fmt, ap);
+       rand = 1 + prandom_u32_max(elen+1);
+       /* Since elen < BUF_SIZE, we have 1 <= rand <= BUF_SIZE. */
+       failed_tests += do_test(rand, expect, elen, fmt, ap);
+       failed_tests += do_test(0, expect, elen, fmt, ap);
+
+       p = kvasprintf(GFP_KERNEL, fmt, ap);
+       if (p) {
+               if (memcmp(p, expect, elen+1)) {
+                       pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n",
+                               fmt, p, expect);
+                       failed_tests++;
+               }
+               kfree(p);
+       }
+       va_end(ap);
+}
+
+#define test(expect, fmt, ...)                                 \
+       __test(expect, strlen(expect), fmt, ##__VA_ARGS__)
+
+static void __init
+test_basic(void)
+{
+       /* Work around annoying "warning: zero-length gnu_printf format string". */
+       char nul = '\0';
+
+       test("", &nul);
+       test("100%", "100%%");
+       test("xxx%yyy", "xxx%cyyy", '%');
+       __test("xxx\0yyy", 7, "xxx%cyyy", '\0');
+}
+
+static void __init
+test_number(void)
+{
+       test("0x1234abcd  ", "%#-12x", 0x1234abcd);
+       test("  0x1234abcd", "%#12x", 0x1234abcd);
+       test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234);
+}
+
+static void __init
+test_string(void)
+{
+       test("", "%s%.0s", "", "123");
+       test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456");
+       test("1  |  2|3  |  4|5  ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5");
+       /*
+        * POSIX and C99 say that a missing precision should be
+        * treated as a precision of 0. However, the kernel's printf
+        * implementation treats this case as if the . wasn't
+        * present. Let's add a test case documenting the current
+        * behaviour; should anyone ever feel the need to follow the
+        * standards more closely, this can be revisited.
+        */
+       test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c");
+       test("a  |   |   ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c");
+}
+
+static void __init
+plain(void)
+{
+       test(PTR1_ZEROES PTR1_STR " " PTR2_STR, "%p %p", PTR1, PTR2);
+       /*
+        * The field width is overloaded for some %p extensions to
+        * pass another piece of information. For plain pointers, the
+        * behaviour is slightly odd: one can pass neither the 0
+        * flag nor a precision to %p without gcc complaining, and if
+        * one explicitly gives a field width, the number is no longer
+        * zero-padded.
+        */
+       test("|" PTR1_STR PTR1_SPACES "  |  " PTR1_SPACES PTR1_STR "|",
+            "|%-*p|%*p|", PTR_WIDTH+2, PTR1, PTR_WIDTH+2, PTR1);
+       test("|" PTR2_STR "  |  " PTR2_STR "|",
+            "|%-*p|%*p|", PTR_WIDTH+2, PTR2, PTR_WIDTH+2, PTR2);
+
+       /*
+        * Unrecognized %p extensions are treated as plain %p, but the
+        * alphanumeric suffix is ignored (that is, does not occur in
+        * the output.)
+        */
+       test("|"PTR1_ZEROES PTR1_STR"|", "|%p0y|", PTR1);
+       test("|"PTR2_STR"|", "|%p0y|", PTR2);
+}
+
+static void __init
+symbol_ptr(void)
+{
+}
+
+static void __init
+kernel_ptr(void)
+{
+}
+
+static void __init
+struct_resource(void)
+{
+}
+
+static void __init
+addr(void)
+{
+}
+
+static void __init
+escaped_str(void)
+{
+}
+
+static void __init
+hex_string(void)
+{
+       const char buf[3] = {0xc0, 0xff, 0xee};
+
+       test("c0 ff ee|c0:ff:ee|c0-ff-ee|c0ffee",
+            "%3ph|%3phC|%3phD|%3phN", buf, buf, buf, buf);
+       test("c0 ff ee|c0:ff:ee|c0-ff-ee|c0ffee",
+            "%*ph|%*phC|%*phD|%*phN", 3, buf, 3, buf, 3, buf, 3, buf);
+}
+
+static void __init
+mac(void)
+{
+       const u8 addr[6] = {0x2d, 0x48, 0xd6, 0xfc, 0x7a, 0x05};
+
+       test("2d:48:d6:fc:7a:05", "%pM", addr);
+       test("05:7a:fc:d6:48:2d", "%pMR", addr);
+       test("2d-48-d6-fc-7a-05", "%pMF", addr);
+       test("2d48d6fc7a05", "%pm", addr);
+       test("057afcd6482d", "%pmR", addr);
+}
+
+static void __init
+ip4(void)
+{
+       struct sockaddr_in sa;
+
+       sa.sin_family = AF_INET;
+       sa.sin_port = cpu_to_be16(12345);
+       sa.sin_addr.s_addr = cpu_to_be32(0x7f000001);
+
+       test("127.000.000.001|127.0.0.1", "%pi4|%pI4", &sa.sin_addr, &sa.sin_addr);
+       test("127.000.000.001|127.0.0.1", "%piS|%pIS", &sa, &sa);
+       sa.sin_addr.s_addr = cpu_to_be32(0x01020304);
+       test("001.002.003.004:12345|1.2.3.4:12345", "%piSp|%pISp", &sa, &sa);
+}
+
+static void __init
+ip6(void)
+{
+}
+
+static void __init
+ip(void)
+{
+       ip4();
+       ip6();
+}
+
+static void __init
+uuid(void)
+{
+       const char uuid[16] = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+                              0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf};
+
+       test("00010203-0405-0607-0809-0a0b0c0d0e0f", "%pUb", uuid);
+       test("00010203-0405-0607-0809-0A0B0C0D0E0F", "%pUB", uuid);
+       test("03020100-0504-0706-0809-0a0b0c0d0e0f", "%pUl", uuid);
+       test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid);
+}
+
+static void __init
+dentry(void)
+{
+}
+
+static void __init
+struct_va_format(void)
+{
+}
+
+static void __init
+struct_clk(void)
+{
+}
+
+static void __init
+bitmap(void)
+{
+       DECLARE_BITMAP(bits, 20);
+       const int primes[] = {2,3,5,7,11,13,17,19};
+       int i;
+
+       bitmap_zero(bits, 20);
+       test("00000|00000", "%20pb|%*pb", bits, 20, bits);
+       test("|", "%20pbl|%*pbl", bits, 20, bits);
+
+       for (i = 0; i < ARRAY_SIZE(primes); ++i)
+               set_bit(primes[i], bits);
+       test("a28ac|a28ac", "%20pb|%*pb", bits, 20, bits);
+       test("2-3,5,7,11,13,17,19|2-3,5,7,11,13,17,19", "%20pbl|%*pbl", bits, 20, bits);
+
+       bitmap_fill(bits, 20);
+       test("fffff|fffff", "%20pb|%*pb", bits, 20, bits);
+       test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits);
+}
+
+static void __init
+netdev_features(void)
+{
+}
+
+static void __init
+test_pointer(void)
+{
+       plain();
+       symbol_ptr();
+       kernel_ptr();
+       struct_resource();
+       addr();
+       escaped_str();
+       hex_string();
+       mac();
+       ip();
+       uuid();
+       dentry();
+       struct_va_format();
+       struct_clk();
+       bitmap();
+       netdev_features();
+}
+
+static int __init
+test_printf_init(void)
+{
+       test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL);
+       if (!test_buffer)
+               return -ENOMEM;
+
+       test_basic();
+       test_number();
+       test_string();
+       test_pointer();
+
+       kfree(test_buffer);
+
+       if (failed_tests == 0)
+               pr_info("all %u tests passed\n", total_tests);
+       else
+               pr_warn("failed %u out of %u tests\n", failed_tests, total_tests);
+
+       return failed_tests ? -EINVAL : 0;
+}
+
+module_init(test_printf_init);
+
+MODULE_AUTHOR("Rasmus Villemoes <linux@rasmusvillemoes.dk>");
+MODULE_LICENSE("GPL");
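
The do_test() harness above leans on the C99 vsnprintf() contract: the return value is the length the full output would have had, whatever fits is NUL-terminated, and a zero-sized buffer is never written. A minimal userspace check of that contract using plain snprintf(), with nothing kernel-specific assumed:

#include <assert.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[8];
        int len;

        /* Return value is the length the untruncated output would have had. */
        len = snprintf(buf, sizeof(buf), "%s", "hello world");
        assert(len == (int)strlen("hello world"));
        /* The output itself is truncated and NUL-terminated. */
        assert(strcmp(buf, "hello w") == 0);
        /* A zero size must leave the buffer completely untouched. */
        memset(buf, '$', sizeof(buf));
        len = snprintf(buf, 0, "%d", 42);
        assert(len == 2 && buf[0] == '$');
        return 0;
}
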
index 95cd63b43b99fa83798fb68dadfbd95a8c0bc57c..f9cee8e1233c0fe0f626fe2680ca4b9ff9d3153e 100644
@@ -1449,6 +1449,8 @@ int kptr_restrict __read_mostly;
  *        (legacy clock framework) of the clock
  * - 'Cr' For a clock, it prints the current rate of the clock
  *
+ * ** Please also update Documentation/printk-formats.txt when making changes **
+ *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
  * pointer to the real address.
@@ -1457,7 +1459,7 @@ static noinline_for_stack
 char *pointer(const char *fmt, char *buf, char *end, void *ptr,
              struct printf_spec spec)
 {
-       int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
+       const int default_width = 2 * sizeof(void *);
 
        if (!ptr && *fmt != 'K') {
                /*
@@ -1769,14 +1771,14 @@ qualifier:
 
        case 'n':
                /*
-                * Since %n poses a greater security risk than utility, treat
-                * it as an invalid format specifier. Warn about its use so
-                * that new instances don't get added.
+                * Since %n poses a greater security risk than
+                * utility, treat it as any other invalid or
+                * unsupported format specifier.
                 */
-               WARN_ONCE(1, "Please remove ignored %%n in '%s'\n", fmt);
                /* Fall-through */
 
        default:
+               WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt);
                spec->type = FORMAT_TYPE_INVALID;
                return fmt - start;
        }
@@ -1811,41 +1813,16 @@ qualifier:
  * @fmt: The format string to use
  * @args: Arguments for the format string
  *
- * This function follows C99 vsnprintf, but has some extensions:
- * %pS output the name of a text symbol with offset
- * %ps output the name of a text symbol without offset
- * %pF output the name of a function pointer with its offset
- * %pf output the name of a function pointer without its offset
- * %pB output the name of a backtrace symbol with its offset
- * %pR output the address range in a struct resource with decoded flags
- * %pr output the address range in a struct resource with raw flags
- * %pb output the bitmap with field width as the number of bits
- * %pbl output the bitmap as range list with field width as the number of bits
- * %pM output a 6-byte MAC address with colons
- * %pMR output a 6-byte MAC address with colons in reversed order
- * %pMF output a 6-byte MAC address with dashes
- * %pm output a 6-byte MAC address without colons
- * %pmR output a 6-byte MAC address without colons in reversed order
- * %pI4 print an IPv4 address without leading zeros
- * %pi4 print an IPv4 address with leading zeros
- * %pI6 print an IPv6 address with colons
- * %pi6 print an IPv6 address without colons
- * %pI6c print an IPv6 address as specified by RFC 5952
- * %pIS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address
- * %piS depending on sa_family of 'struct sockaddr *' print IPv4/IPv6 address
- * %pU[bBlL] print a UUID/GUID in big or little endian using lower or upper
- *   case.
- * %*pE[achnops] print an escaped buffer
- * %*ph[CDN] a variable-length hex string with a separator (supports up to 64
- *           bytes of the input)
- * %pC output the name (Common Clock Framework) or address (legacy clock
- *     framework) of a clock
- * %pCn output the name (Common Clock Framework) or address (legacy clock
- *      framework) of a clock
- * %pCr output the current rate of a clock
- * %n is ignored
+ * This function generally follows C99 vsnprintf, but has some
+ * extensions and a few limitations:
+ *
+ * %n is unsupported
+ * %p* is handled by pointer()
  *
- * ** Please update Documentation/printk-formats.txt when making changes **
+ * See pointer() or Documentation/printk-formats.txt for a more
+ * extensive description.
+ *
+ * ** Please update the documentation in both places when making changes **
  *
  * The return value is the number of characters which would
  * be generated for the given input, excluding the trailing
@@ -1944,10 +1921,15 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
                        break;
 
                case FORMAT_TYPE_INVALID:
-                       if (str < end)
-                               *str = '%';
-                       ++str;
-                       break;
+                       /*
+                        * Presumably the arguments passed gcc's type
+                        * checking, but there is no safe or sane way
+                        * for us to continue parsing the format and
+                        * fetching from the va_list; the remaining
+                        * specifiers and arguments would be out of
+                        * sync.
+                        */
+                       goto out;
 
                default:
                        switch (spec.type) {
@@ -1992,6 +1974,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
                }
        }
 
+out:
        if (size > 0) {
                if (str < end)
                        *str = '\0';
@@ -2189,9 +2172,10 @@ do {                                                                     \
 
                switch (spec.type) {
                case FORMAT_TYPE_NONE:
-               case FORMAT_TYPE_INVALID:
                case FORMAT_TYPE_PERCENT_CHAR:
                        break;
+               case FORMAT_TYPE_INVALID:
+                       goto out;
 
                case FORMAT_TYPE_WIDTH:
                case FORMAT_TYPE_PRECISION:
@@ -2253,6 +2237,7 @@ do {                                                                      \
                }
        }
 
+out:
        return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
 #undef save_arg
 }
@@ -2286,7 +2271,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
        char *str, *end;
        const char *args = (const char *)bin_buf;
 
-       if (WARN_ON_ONCE((int) size < 0))
+       if (WARN_ON_ONCE(size > INT_MAX))
                return 0;
 
        str = buf;
@@ -2375,12 +2360,14 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
                        break;
 
                case FORMAT_TYPE_PERCENT_CHAR:
-               case FORMAT_TYPE_INVALID:
                        if (str < end)
                                *str = '%';
                        ++str;
                        break;
 
+               case FORMAT_TYPE_INVALID:
+                       goto out;
+
                default: {
                        unsigned long long num;
 
@@ -2423,6 +2410,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
                } /* switch(spec.type) */
        } /* while(*fmt) */
 
+out:
        if (size > 0) {
                if (str < end)
                        *str = '\0';
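
The FORMAT_TYPE_INVALID hunks change the policy from "emit a literal '%' and keep going" to "stop at the first unsupported specifier", because once a specifier cannot be parsed there is no reliable way to know how many va_list arguments it was meant to consume. The toy userspace formatter below, written only to illustrate that policy (it understands just %d and %s, and every name in it is made up), bails out the same way:

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

/* Toy formatter: like the patched vsnprintf() it stops at the first
 * unknown conversion instead of guessing, since the remaining arguments
 * would be out of sync with the remaining specifiers. */
static void toy_printf(char *buf, size_t size, const char *fmt, ...)
{
        size_t off = 0;
        va_list ap;

        va_start(ap, fmt);
        while (*fmt && off + 1 < size) {
                if (*fmt != '%') {
                        buf[off++] = *fmt++;
                        continue;
                }
                fmt++;
                if (*fmt == 'd')
                        off += snprintf(buf + off, size - off, "%d",
                                        va_arg(ap, int));
                else if (*fmt == 's')
                        off += snprintf(buf + off, size - off, "%s",
                                        va_arg(ap, const char *));
                else
                        break;  /* unknown specifier: stop formatting here */
                fmt++;
        }
        va_end(ap);
        if (size)
                buf[off < size ? off : size - 1] = '\0';
}

int main(void)
{
        char buf[64];

        toy_printf(buf, sizeof(buf), "pid=%d %q name=%s", 42, "ignored");
        /* Output stops right before the unsupported %q: "pid=42 " */
        return !!strcmp(buf, "pid=42 ");
}
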
index 0d9fdcd01e479d87a45cb487db54e8c1fff27883..97a4e06b15c00dfd7f8b8bc0f1e1e4610b769938 100644
@@ -200,18 +200,6 @@ config MEMORY_HOTREMOVE
        depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
        depends on MIGRATION
 
-#
-# If we have space for more page flags then we can enable additional
-# optimizations and functionality.
-#
-# Regular Sparsemem takes page flag bits for the sectionid if it does not
-# use a virtual memmap. Disable extended page flags for 32 bit platforms
-# that require the use of a sectionid in the page flags.
-#
-config PAGEFLAGS_EXTENDED
-       def_bool y
-       depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
-
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
index 619984fc07ec32792349c7fe8aec7c8b56e3d2a3..8ed2ffd963c53b910f91e1b60b04c56385a3129f 100644
@@ -637,7 +637,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 {
        struct bdi_writeback *wb;
 
-       might_sleep_if(gfp & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(gfp));
 
        if (!memcg_css->parent)
                return &bdi->wb;
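
gfpflags_allow_blocking() replaces open-coded __GFP_WAIT tests here and in several later hunks (dmapool, filemap). The standalone demo below assumes the helper reduces to a test of the direct-reclaim bit, i.e. "may this allocation sleep to reclaim memory?"; the flag values are made up for the demo and the real definition lives in gfp.h:

#include <stdbool.h>
#include <stdio.h>

/* Made-up flag bits, for the demo only. */
#define DEMO_GFP_DIRECT_RECLAIM 0x400u
#define DEMO_GFP_KSWAPD_RECLAIM 0x800u
#define DEMO_GFP_KERNEL (DEMO_GFP_DIRECT_RECLAIM | DEMO_GFP_KSWAPD_RECLAIM)
#define DEMO_GFP_ATOMIC (DEMO_GFP_KSWAPD_RECLAIM)

/* Assumed shape of the helper: blocking is allowed iff direct reclaim is. */
static bool allow_blocking(unsigned int gfp_flags)
{
        return gfp_flags & DEMO_GFP_DIRECT_RECLAIM;
}

int main(void)
{
        printf("GFP_KERNEL-like mask may block: %d\n",
               allow_blocking(DEMO_GFP_KERNEL));        /* prints 1 */
        printf("GFP_ATOMIC-like mask may block: %d\n",
               allow_blocking(DEMO_GFP_ATOMIC));        /* prints 0 */
        return 0;
}
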
index fcad8322ef36781c5c59e92ddee26caeb4c3c5f2..d3116be5a00fa51646b5a0b45683a138a4ed3f7c 100644
@@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage,
        struct balloon_dev_info *balloon = balloon_page_device(page);
        int rc = -EAGAIN;
 
-       /*
-        * Block others from accessing the 'newpage' when we get around to
-        * establishing additional references. We should be the only one
-        * holding a reference to the 'newpage' at this point.
-        */
-       BUG_ON(!trylock_page(newpage));
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
 
        if (WARN_ON(!__is_movable_balloon_page(page))) {
                dump_page(page, "not movable balloon page");
-               unlock_page(newpage);
                return rc;
        }
 
        if (balloon && balloon->migratepage)
                rc = balloon->migratepage(balloon, newpage, page, mode);
 
-       unlock_page(newpage);
        return rc;
 }
 #endif /* CONFIG_BALLOON_COMPACTION */
index 4eb56badf37e60e63a5a1badd093d1934a21ad35..ea506eb18cd6b2cff00606a6c1e1387b48099ffa 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -363,7 +363,9 @@ err:
  */
 struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
 {
-       unsigned long mask, offset, pfn, start = 0;
+       unsigned long mask, offset;
+       unsigned long pfn = -1;
+       unsigned long start = 0;
        unsigned long bitmap_maxno, bitmap_no, bitmap_count;
        struct page *page = NULL;
        int ret;
@@ -418,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
                start = bitmap_no + mask + 1;
        }
 
-       trace_cma_alloc(page ? pfn : -1UL, page, count, align);
+       trace_cma_alloc(pfn, page, count, align);
 
        pr_debug("%s(): returned %p\n", __func__, page);
        return page;
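
The cma_alloc() change gives pfn a -1 sentinel up front, so it is visibly initialised on every path and the tracepoint no longer needs the "page ? pfn : -1UL" guard. A tiny standalone illustration of the same pattern, with a made-up helper name:

#include <stdio.h>

/* find_first_even() is a made-up stand-in for the bitmap search loop. */
static long find_first_even(const long *v, int n)
{
        long found = -1;        /* sentinel, mirroring "pfn = -1" above */
        int i;

        for (i = 0; i < n; i++) {
                if (v[i] % 2 == 0) {
                        found = v[i];
                        break;
                }
        }
        return found;           /* well defined even when nothing matched */
}

int main(void)
{
        const long odd_only[] = { 1, 3, 5 };

        printf("%ld\n", find_first_even(odd_only, 3));  /* prints -1 */
        return 0;
}
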
index c5c627aae9962daf9c64d4f482c075e4cd96422a..de3e1e71cd9f265d9df971e98ed931fbdc8611a8 100644
@@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 #endif
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
-#ifdef CONFIG_TRACEPOINTS
-static const char *const compaction_status_string[] = {
-       "deferred",
-       "skipped",
-       "continue",
-       "partial",
-       "complete",
-       "no_suitable_page",
-       "not_suitable_zone",
-};
-#endif
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/compaction.h>
@@ -1197,6 +1186,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
        return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
 
+/*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+static inline bool is_via_compact_memory(int order)
+{
+       return order == -1;
+}
+
 static int __compact_finished(struct zone *zone, struct compact_control *cc,
                            const int migratetype)
 {
@@ -1204,7 +1202,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
        unsigned long watermark;
 
        if (cc->contended || fatal_signal_pending(current))
-               return COMPACT_PARTIAL;
+               return COMPACT_CONTENDED;
 
        /* Compaction run completes if the migrate and free scanner meet */
        if (compact_scanners_met(cc)) {
@@ -1223,11 +1221,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
                return COMPACT_COMPLETE;
        }
 
-       /*
-        * order == -1 is expected when compacting via
-        * /proc/sys/vm/compact_memory
-        */
-       if (cc->order == -1)
+       if (is_via_compact_memory(cc->order))
                return COMPACT_CONTINUE;
 
        /* Compaction run is not finished if the watermark is not met */
@@ -1290,11 +1284,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
        int fragindex;
        unsigned long watermark;
 
-       /*
-        * order == -1 is expected when compacting via
-        * /proc/sys/vm/compact_memory
-        */
-       if (order == -1)
+       if (is_via_compact_memory(order))
                return COMPACT_CONTINUE;
 
        watermark = low_wmark_pages(zone);
@@ -1403,7 +1393,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
                switch (isolate_migratepages(zone, cc)) {
                case ISOLATE_ABORT:
-                       ret = COMPACT_PARTIAL;
+                       ret = COMPACT_CONTENDED;
                        putback_movable_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
                        goto out;
@@ -1434,7 +1424,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                         * and we want compact_finished() to detect it
                         */
                        if (err == -ENOMEM && !compact_scanners_met(cc)) {
-                               ret = COMPACT_PARTIAL;
+                               ret = COMPACT_CONTENDED;
                                goto out;
                        }
                }
@@ -1487,6 +1477,9 @@ out:
        trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync, ret);
 
+       if (ret == COMPACT_CONTENDED)
+               ret = COMPACT_PARTIAL;
+
        return ret;
 }
 
@@ -1658,10 +1651,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
                 * this makes sure we compact the whole zone regardless of
                 * cached scanner positions.
                 */
-               if (cc->order == -1)
+               if (is_via_compact_memory(cc->order))
                        __reset_isolation_suitable(zone);
 
-               if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+               if (is_via_compact_memory(cc->order) ||
+                               !compaction_deferred(zone, cc->order))
                        compact_zone(zone, cc);
 
                if (cc->order > 0) {
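
is_via_compact_memory() merely names the order == -1 convention used when compaction is requested through /proc/sys/vm/compact_memory rather than by an allocation of a specific order. A minimal userspace way to trigger that path (requires root and CONFIG_COMPACTION; error handling kept to the bare minimum):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Writing any value requests full compaction of all zones,
         * which is exactly the order == -1 case named above. */
        int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}
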
index 6c1b3ea61bfddfe4f042a6ef067e53e34f82792b..836276586185f2360a2ad10b981744347c00a79c 100644
@@ -25,12 +25,7 @@ static const struct trace_print_flags pageflag_names[] = {
        {1UL << PG_private,             "private"       },
        {1UL << PG_private_2,           "private_2"     },
        {1UL << PG_writeback,           "writeback"     },
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
        {1UL << PG_head,                "head"          },
-       {1UL << PG_tail,                "tail"          },
-#else
-       {1UL << PG_compound,            "compound"      },
-#endif
        {1UL << PG_swapcache,           "swapcache"     },
        {1UL << PG_mappedtodisk,        "mappedtodisk"  },
        {1UL << PG_reclaim,             "reclaim"       },
@@ -45,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_MEMORY_FAILURE
        {1UL << PG_hwpoison,            "hwpoison"      },
 #endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       {1UL << PG_compound_lock,       "compound_lock" },
-#endif
 #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
        {1UL << PG_young,               "young"         },
        {1UL << PG_idle,                "idle"          },
@@ -87,9 +79,12 @@ static void dump_flags(unsigned long flags,
 void dump_page_badflags(struct page *page, const char *reason,
                unsigned long badflags)
 {
-       pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+       pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
                  page, atomic_read(&page->_count), page_mapcount(page),
                  page->mapping, page->index);
+       if (PageCompound(page))
+               pr_cont(" compound_mapcount: %d", compound_mapcount(page));
+       pr_cont("\n");
        BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
        dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
        if (reason)
@@ -125,6 +120,7 @@ static const struct trace_print_flags vmaflags_names[] = {
        {VM_GROWSDOWN,                  "growsdown"     },
        {VM_PFNMAP,                     "pfnmap"        },
        {VM_DENYWRITE,                  "denywrite"     },
+       {VM_LOCKONFAULT,                "lockonfault"   },
        {VM_LOCKED,                     "locked"        },
        {VM_IO,                         "io"            },
        {VM_SEQ_READ,                   "seqread"       },
index 312a716fa14c2ef0d2780832bc378c05a3d08d16..57312b5d6e12aaf2da3f6acd38a19bbe28959077 100644
@@ -326,7 +326,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
        size_t offset;
        void *retval;
 
-       might_sleep_if(mem_flags & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(mem_flags));
 
        spin_lock_irqsave(&pool->lock, flags);
        list_for_each_entry(page, &pool->page_list, page_list) {
index 17ae14b5aefa2e5302a90e5b748a719e22a22f3b..6d5717bd7197ba0428c32941df27d90da04264b5 100644
@@ -126,7 +126,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
        /*
         * Mappings have to be page-aligned
         */
-       offset = phys_addr & ~PAGE_MASK;
+       offset = offset_in_page(phys_addr);
        phys_addr &= PAGE_MASK;
        size = PAGE_ALIGN(last_addr + 1) - phys_addr;
 
@@ -189,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
        if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
                return;
 
-       offset = virt_addr & ~PAGE_MASK;
+       offset = offset_in_page(virt_addr);
        nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
 
        idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
@@ -234,7 +234,7 @@ void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
        char *p;
 
        while (size) {
-               slop = src & ~PAGE_MASK;
+               slop = offset_in_page(src);
                clen = size;
                if (clen > MAX_MAP_CHUNK - slop)
                        clen = MAX_MAP_CHUNK - slop;
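
The three hunks above only swap the open-coded "addr & ~PAGE_MASK" for offset_in_page(), which spells out the intent; the computed value is unchanged. A standalone check of that identity, with PAGE_SIZE hard-coded to 4096 purely for the demo:

#include <assert.h>

/* Demo definitions: with PAGE_SIZE a power of two, ~PAGE_MASK is the
 * low-bit mask, so (addr & ~PAGE_MASK) is the offset within the page. */
#define PAGE_SIZE       4096UL
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define offset_in_page(p)       ((unsigned long)(p) & ~PAGE_MASK)

int main(void)
{
        assert(offset_in_page(0x12345678UL) == 0x678);
        assert(offset_in_page(0x12345000UL) == 0);
        assert(offset_in_page(PAGE_SIZE - 1) == PAGE_SIZE - 1);
        return 0;
}
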
index 98fb490311eb94386aebd2f4ceb77c729f4fa01e..79171b4a58269986491198403a322d6c2a7dc814 100644
@@ -3,11 +3,11 @@
 
 static struct {
        struct fault_attr attr;
-       bool ignore_gfp_wait;
+       bool ignore_gfp_reclaim;
        bool cache_filter;
 } failslab = {
        .attr = FAULT_ATTR_INITIALIZER,
-       .ignore_gfp_wait = true,
+       .ignore_gfp_reclaim = true,
        .cache_filter = false,
 };
 
@@ -16,7 +16,7 @@ bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
        if (gfpflags & __GFP_NOFAIL)
                return false;
 
-        if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+       if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
                return false;
 
        if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
@@ -42,7 +42,7 @@ static int __init failslab_debugfs_init(void)
                return PTR_ERR(dir);
 
        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &failslab.ignore_gfp_wait))
+                               &failslab.ignore_gfp_reclaim))
                goto fail;
        if (!debugfs_create_bool("cache-filter", mode, dir,
                                &failslab.cache_filter))
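
Only the internal field is renamed (ignore_gfp_wait becomes ignore_gfp_reclaim, now keyed on __GFP_RECLAIM); the user-visible debugfs knob keeps its old "ignore-gfp-wait" name, as the unchanged debugfs_create_bool() string shows. A hedged sketch of driving those knobs from userspace; the paths assume debugfs mounted at /sys/kernel/debug, CONFIG_FAILSLAB=y, and the generic fault-injection attributes described in Documentation/fault-injection/fault-injection.txt:

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        /* Fail 10% of eligible slab allocations, forever, but keep
         * ignoring allocations that may block (the default shown above). */
        write_knob("/sys/kernel/debug/failslab/probability", "10");
        write_knob("/sys/kernel/debug/failslab/times", "-1");
        write_knob("/sys/kernel/debug/failslab/ignore-gfp-wait", "1");
        return 0;
}
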
index 327910c2400c6ce36f440383147fdc768cf14692..834cd142530744d3175d09f88611ddeb30c8664a 100644
@@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
                __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
-       BUG_ON(page_mapped(page));
+       VM_BUG_ON_PAGE(page_mapped(page), page);
 
        /*
         * At this point page must be either written or cleaned by truncate.
@@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
-/**
- * filemap_fdatawait_range - wait for writeback to complete
- * @mapping:           address space structure to wait for
- * @start_byte:                offset in bytes where the range starts
- * @end_byte:          offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
-                           loff_t end_byte)
+static int __filemap_fdatawait_range(struct address_space *mapping,
+                                    loff_t start_byte, loff_t end_byte)
 {
        pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
        pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
        struct pagevec pvec;
        int nr_pages;
-       int ret2, ret = 0;
+       int ret = 0;
 
        if (end_byte < start_byte)
                goto out;
@@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                cond_resched();
        }
 out:
+       return ret;
+}
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping:           address space structure to wait for
+ * @start_byte:                offset in bytes where the range starts
+ * @end_byte:          offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.  Check error status of
+ * the address space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+                           loff_t end_byte)
+{
+       int ret, ret2;
+
+       ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
        ret2 = filemap_check_errors(mapping);
        if (!ret)
                ret = ret2;
@@ -382,12 +396,39 @@ out:
 }
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
+/**
+ * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them.  Unlike filemap_fdatawait(), this function
+ * does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves.  Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+void filemap_fdatawait_keep_errors(struct address_space *mapping)
+{
+       loff_t i_size = i_size_read(mapping->host);
+
+       if (i_size == 0)
+               return;
+
+       __filemap_fdatawait_range(mapping, 0, i_size - 1);
+}
+
 /**
  * filemap_fdatawait - wait for all under-writeback pages to complete
  * @mapping: address space structure to wait for
  *
  * Walk the list of under-writeback pages of the given address space
- * and wait for all of them.
+ * and wait for all of them.  Check error status of the address space
+ * and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
  */
 int filemap_fdatawait(struct address_space *mapping)
 {
@@ -510,7 +551,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                        __inc_zone_page_state(new, NR_SHMEM);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
                mem_cgroup_end_page_stat(memcg);
-               mem_cgroup_migrate(old, new, true);
+               mem_cgroup_replace_page(old, new);
                radix_tree_preload_end();
                if (freepage)
                        freepage(old);
@@ -577,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page,
 
        if (!huge) {
                error = mem_cgroup_try_charge(page, current->mm,
-                                             gfp_mask, &memcg);
+                                             gfp_mask, &memcg, false);
                if (error)
                        return error;
        }
@@ -585,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page,
        error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
        if (error) {
                if (!huge)
-                       mem_cgroup_cancel_charge(page, memcg);
+                       mem_cgroup_cancel_charge(page, memcg, false);
                return error;
        }
 
@@ -604,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page,
                __inc_zone_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
-               mem_cgroup_commit_charge(page, memcg, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
        trace_mm_filemap_add_to_page_cache(page);
        return 0;
 err_insert:
@@ -612,7 +653,7 @@ err_insert:
        /* Leave page->index set: truncation relies upon it */
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, false);
        page_cache_release(page);
        return error;
 }
@@ -641,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
        void *shadow = NULL;
        int ret;
 
-       __set_page_locked(page);
+       __SetPageLocked(page);
        ret = __add_to_page_cache_locked(page, mapping, offset,
                                         gfp_mask, &shadow);
        if (unlikely(ret))
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
        else {
                /*
                 * The page might have been evicted from cache only
@@ -768,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
  */
 void unlock_page(struct page *page)
 {
+       page = compound_head(page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        clear_bit_unlock(PG_locked, &page->flags);
        smp_mb__after_atomic();
@@ -832,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio);
  */
 void __lock_page(struct page *page)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+       struct page *page_head = compound_head(page);
+       DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
 
-       __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+       __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
                                                        TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
 
 int __lock_page_killable(struct page *page)
 {
-       DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+       struct page *page_head = compound_head(page);
+       DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
 
-       return __wait_on_bit_lock(page_waitqueue(page), &wait,
+       return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
                                        bit_wait_io, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -1681,7 +1725,7 @@ no_cached_page:
                        goto out;
                }
                error = add_to_page_cache_lru(page, mapping, index,
-                                       GFP_KERNEL & mapping_gfp_mask(mapping));
+                               mapping_gfp_constraint(mapping, GFP_KERNEL));
                if (error) {
                        page_cache_release(page);
                        if (error == -EEXIST) {
@@ -1783,7 +1827,7 @@ static int page_cache_read(struct file *file, pgoff_t offset)
                        return -ENOMEM;
 
                ret = add_to_page_cache_lru(page, mapping, offset,
-                               GFP_KERNEL & mapping_gfp_mask(mapping));
+                               mapping_gfp_constraint(mapping, GFP_KERNEL));
                if (ret == 0)
                        ret = mapping->a_ops->readpage(file, page);
                else if (ret == -EEXIST)
@@ -1807,7 +1851,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
                                   struct file *file,
                                   pgoff_t offset)
 {
-       unsigned long ra_pages;
        struct address_space *mapping = file->f_mapping;
 
        /* If we don't want any read-ahead, don't bother */
@@ -1836,10 +1879,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
        /*
         * mmap read-around
         */
-       ra_pages = max_sane_readahead(ra->ra_pages);
-       ra->start = max_t(long, 0, offset - ra_pages / 2);
-       ra->size = ra_pages;
-       ra->async_size = ra_pages / 4;
+       ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+       ra->size = ra->ra_pages;
+       ra->async_size = ra->ra_pages / 4;
        ra_submit(ra, mapping, file);
 }
 
@@ -2674,7 +2716,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
  * page is known to the local caching routines.
  *
  * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
  *
  */
 int try_to_release_page(struct page *page, gfp_t gfp_mask)
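
The do_sync_mmap_readahead() hunk drops the max_sane_readahead() clamp and centres the read-around window directly on the faulting offset using the file's own ra_pages, keeping a quarter of the window as the async tail. The arithmetic, pulled out into a standalone snippet with illustrative numbers:

#include <stdio.h>

int main(void)
{
        long ra_pages = 32;     /* e.g. a 128KB window with 4KB pages */
        long offset = 100;      /* index of the faulting page */
        long start = offset - ra_pages / 2;

        if (start < 0)          /* the max_t(long, 0, ...) clamp above */
                start = 0;
        printf("start=%ld size=%ld async_size=%ld\n",
               start, ra_pages, ra_pages / 4);  /* start=84 size=32 async_size=8 */
        return 0;
}
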
index cdabcb93c6a61965c7de7ce7f2ae4419437da013..7cf2b7163222b1d304ba1a4dde0604c5f29a3bbd 100644
@@ -7,7 +7,7 @@
 #include <linux/pagemap.h>
 #include <linux/sched.h>
 
-/*
+/**
  * get_vaddr_frames() - map virtual addresses to pfns
  * @start:     starting user address
  * @nr_frames: number of pages / pfns from start to map
index a798293fc6486bac215ecb58ed071263a5f775f0..e95b0cb6ed8173e80ecf3ae7d8798b763b46f965 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -116,8 +116,21 @@ retry:
                }
        }
 
+       if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+               int ret;
+               get_page(page);
+               pte_unmap_unlock(ptep, ptl);
+               lock_page(page);
+               ret = split_huge_page(page);
+               unlock_page(page);
+               put_page(page);
+               if (ret)
+                       return ERR_PTR(ret);
+               goto retry;
+       }
+
        if (flags & FOLL_GET)
-               get_page_foll(page);
+               get_page(page);
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !PageDirty(page))
@@ -129,7 +142,11 @@ retry:
                 */
                mark_page_accessed(page);
        }
-       if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /* Do not mlock pte-mapped THP */
+               if (PageTransCompound(page))
+                       goto out;
+
                /*
                 * The preliminary mapping check is mainly to avoid the
                 * pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +237,38 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
        }
        if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
                return no_page_table(vma, flags);
-       if (pmd_trans_huge(*pmd)) {
-               if (flags & FOLL_SPLIT) {
-                       split_huge_page_pmd(vma, address, pmd);
-                       return follow_page_pte(vma, address, pmd, flags);
-               }
-               ptl = pmd_lock(mm, pmd);
-               if (likely(pmd_trans_huge(*pmd))) {
-                       if (unlikely(pmd_trans_splitting(*pmd))) {
-                               spin_unlock(ptl);
-                               wait_split_huge_page(vma->anon_vma, pmd);
-                       } else {
-                               page = follow_trans_huge_pmd(vma, address,
-                                                            pmd, flags);
-                               spin_unlock(ptl);
-                               *page_mask = HPAGE_PMD_NR - 1;
-                               return page;
-                       }
-               } else
+       if (likely(!pmd_trans_huge(*pmd)))
+               return follow_page_pte(vma, address, pmd, flags);
+
+       ptl = pmd_lock(mm, pmd);
+       if (unlikely(!pmd_trans_huge(*pmd))) {
+               spin_unlock(ptl);
+               return follow_page_pte(vma, address, pmd, flags);
+       }
+       if (flags & FOLL_SPLIT) {
+               int ret;
+               page = pmd_page(*pmd);
+               if (is_huge_zero_page(page)) {
+                       spin_unlock(ptl);
+                       ret = 0;
+                       split_huge_pmd(vma, pmd, address);
+               } else {
+                       get_page(page);
                        spin_unlock(ptl);
+                       lock_page(page);
+                       ret = split_huge_page(page);
+                       unlock_page(page);
+                       put_page(page);
+               }
+
+               return ret ? ERR_PTR(ret) :
+                       follow_page_pte(vma, address, pmd, flags);
        }
-       return follow_page_pte(vma, address, pmd, flags);
+
+       page = follow_trans_huge_pmd(vma, address, pmd, flags);
+       spin_unlock(ptl);
+       *page_mask = HPAGE_PMD_NR - 1;
+       return page;
 }
 
 static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -299,6 +327,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
        unsigned int fault_flags = 0;
        int ret;
 
+       /* mlock all present pages, but do not fault in new pages */
+       if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+               return -ENOENT;
        /* For mm_populate(), just skip the stack guard page. */
        if ((*flags & FOLL_POPULATE) &&
                        (stack_guard_page_start(vma, address) ||
@@ -890,7 +921,9 @@ long populate_vma_page_range(struct vm_area_struct *vma,
        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
 
-       gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+       gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+       if (vma->vm_flags & VM_LOCKONFAULT)
+               gup_flags &= ~FOLL_POPULATE;
        /*
         * We want to touch writable mappings with a write fault in order
         * to break COW, except for shared mappings because these don't COW
@@ -1030,9 +1063,6 @@ struct page *get_dump_page(unsigned long addr)
  *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
  *      pages containing page tables.
  *
- *  *) THP splits will broadcast an IPI, this can be achieved by overriding
- *      pmdp_splitting_flush.
- *
  *  *) ptes can be read atomically by the architecture.
  *
  *  *) access_ok is sufficient to validate userspace address ranges.
@@ -1060,7 +1090,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                 * for an example see gup_get_pte in arch/x86/mm/gup.c
                 */
                pte_t pte = READ_ONCE(*ptep);
-               struct page *page;
+               struct page *head, *page;
 
                /*
                 * Similar to the PMD case below, NUMA hinting must take slow
@@ -1072,15 +1102,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
+               head = compound_head(page);
 
-               if (!page_cache_get_speculative(page))
+               if (!page_cache_get_speculative(head))
                        goto pte_unmap;
 
                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-                       put_page(page);
+                       put_page(head);
                        goto pte_unmap;
                }
 
+               VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
 
@@ -1113,7 +1145,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        if (write && !pmd_write(orig))
@@ -1122,7 +1154,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
        refs = 0;
        head = pmd_page(orig);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1143,24 +1174,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                return 0;
        }
 
-       /*
-        * Any tail pages need their mapcount reference taken before we
-        * return. (This allows the THP code to bump their ref count when
-        * they are split into base pages).
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
 {
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        int refs;
 
        if (write && !pud_write(orig))
@@ -1169,7 +1189,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
        refs = 0;
        head = pud_page(orig);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1190,12 +1209,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                return 0;
        }
 
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -1204,7 +1217,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                        struct page **pages, int *nr)
 {
        int refs;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
 
        if (write && !pgd_write(orig))
                return 0;
@@ -1212,7 +1225,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
        refs = 0;
        head = pgd_page(orig);
        page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
@@ -1233,12 +1245,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                return 0;
        }
 
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
-
        return 1;
 }
 
@@ -1253,7 +1259,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                pmd_t pmd = READ_ONCE(*pmdp);
 
                next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
 
                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
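
The FOLL_MLOCK/VM_LOCKONFAULT handling above is the kernel side of lock-on-fault: only pages that are already present are mlocked during population, and the rest are locked as they fault in. From userspace this corresponds to mlock2() with MLOCK_ONFAULT; the example assumes a libc that exposes the wrapper and flag (glibc 2.27 or later), while older systems would need syscall(__NR_mlock2, ...) and the flag from <linux/mman.h>:

#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 20;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;
        /* Nothing is faulted in yet: pages get mlocked lazily on first touch. */
        if (mlock2(buf, len, MLOCK_ONFAULT))
                return 1;
        buf[0] = 1;     /* this page is now both present and mlocked */
        munlock(buf, len);
        munmap(buf, len);
        return 0;
}
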
index 440be97ad2bb0fcb0b3e8d83aaaa9ebc0696a008..4b3420ade697b04659e6032ca602c7ac77149ad7 100644
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
+#include <linux/swapops.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
 
+enum scan_result {
+       SCAN_FAIL,
+       SCAN_SUCCEED,
+       SCAN_PMD_NULL,
+       SCAN_EXCEED_NONE_PTE,
+       SCAN_PTE_NON_PRESENT,
+       SCAN_PAGE_RO,
+       SCAN_NO_REFERENCED_PAGE,
+       SCAN_PAGE_NULL,
+       SCAN_SCAN_ABORT,
+       SCAN_PAGE_COUNT,
+       SCAN_PAGE_LRU,
+       SCAN_PAGE_LOCK,
+       SCAN_PAGE_ANON,
+       SCAN_PAGE_COMPOUND,
+       SCAN_ANY_PROCESS,
+       SCAN_VMA_NULL,
+       SCAN_VMA_CHECK,
+       SCAN_ADDRESS_RANGE,
+       SCAN_SWAP_CACHE_PAGE,
+       SCAN_DEL_PAGE_LRU,
+       SCAN_ALLOC_HUGE_PAGE_FAIL,
+       SCAN_CGROUP_CHARGE_FAIL,
+       SCAN_EXCEED_SWAP_PTE
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
 /*
  * By default, transparent hugepage support is disabled so as not to risk
  * increasing the memory footprint of applications without a guaranteed
@@ -67,6 +97,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  * fault.
  */
 static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
@@ -106,6 +137,10 @@ static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
 {
@@ -116,7 +151,7 @@ static void set_recommended_min_free_kbytes(void)
        for_each_populated_zone(zone)
                nr_zones++;
 
-       /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+       /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
        recommended_min = pageblock_nr_pages * nr_zones * 2;
 
        /*
@@ -553,6 +588,33 @@ static struct kobj_attribute khugepaged_max_ptes_none_attr =
        __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
               khugepaged_max_ptes_none_store);
 
+static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+                                            struct kobj_attribute *attr,
+                                            char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+}
+
+static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+                                             struct kobj_attribute *attr,
+                                             const char *buf, size_t count)
+{
+       int err;
+       unsigned long max_ptes_swap;
+
+       err  = kstrtoul(buf, 10, &max_ptes_swap);
+       if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+               return -EINVAL;
+
+       khugepaged_max_ptes_swap = max_ptes_swap;
+
+       return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+       __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+              khugepaged_max_ptes_swap_store);
+
 static struct attribute *khugepaged_attr[] = {
        &khugepaged_defrag_attr.attr,
        &khugepaged_max_ptes_none_attr.attr,
@@ -561,6 +623,7 @@ static struct attribute *khugepaged_attr[] = {
        &full_scans_attr.attr,
        &scan_sleep_millisecs_attr.attr,
        &alloc_sleep_millisecs_attr.attr,
+       &khugepaged_max_ptes_swap_attr.attr,
        NULL,
 };
 
@@ -638,6 +701,9 @@ static int __init hugepage_init(void)
        err = register_shrinker(&huge_zero_page_shrinker);
        if (err)
                goto err_hzp_shrinker;
+       err = register_shrinker(&deferred_split_shrinker);
+       if (err)
+               goto err_split_shrinker;
 
        /*
         * By default disable transparent hugepages on smaller systems,
@@ -655,6 +721,8 @@ static int __init hugepage_init(void)
 
        return 0;
 err_khugepaged:
+       unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
        unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
        khugepaged_slab_exit();
@@ -711,6 +779,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
        return entry;
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+       /*
+        * ->lru in the tail pages is occupied by compound_head.
+        * Let's use ->mapping + ->index in the second tail page as list_head.
+        */
+       return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+       /*
+        * we use page->mapping and page->index in the second tail page
+        * as list_head: assuming THP order >= 2
+        */
+       BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+       INIT_LIST_HEAD(page_deferred_list(page));
+       set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmd,
@@ -724,7 +813,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+       if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -732,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable)) {
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, true);
                put_page(page);
                return VM_FAULT_OOM;
        }
@@ -748,7 +837,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
        ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_none(*pmd))) {
                spin_unlock(ptl);
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, true);
                put_page(page);
                pte_free(mm, pgtable);
        } else {
@@ -759,7 +848,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                        int ret;
 
                        spin_unlock(ptl);
-                       mem_cgroup_cancel_charge(page, memcg);
+                       mem_cgroup_cancel_charge(page, memcg, true);
                        put_page(page);
                        pte_free(mm, pgtable);
                        ret = handle_userfault(vma, address, flags,
@@ -770,8 +859,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               page_add_new_anon_rmap(page, vma, haddr);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, haddr, true);
+               mem_cgroup_commit_charge(page, memcg, false, true);
                lru_cache_add_active_or_unevictable(page, vma);
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                set_pmd_at(mm, haddr, pmd, entry);
@@ -786,7 +875,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 {
-       return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+       return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
 }
 
 /* Caller must hold page table lock. */
@@ -865,6 +954,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
+       prep_transhuge_page(page);
        return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
                                            flags);
 }
@@ -956,19 +1046,10 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                goto out_unlock;
        }
 
-       if (unlikely(pmd_trans_splitting(pmd))) {
-               /* split huge page running from under us */
-               spin_unlock(src_ptl);
-               spin_unlock(dst_ptl);
-               pte_free(dst_mm, pgtable);
-
-               wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
-               goto out;
-       }
        src_page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
        get_page(src_page);
-       page_dup_rmap(src_page);
+       page_dup_rmap(src_page, true);
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1008,37 +1089,6 @@ unlock:
        spin_unlock(ptl);
 }
 
-/*
- * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
- * during copy_user_huge_page()'s copy_page_rep(): in the case when
- * the source page gets split and a tail freed before copy completes.
- * Called under pmd_lock of checked pmd, so safe from splitting itself.
- */
-static void get_user_huge_page(struct page *page)
-{
-       if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
-               struct page *endpage = page + HPAGE_PMD_NR;
-
-               atomic_add(HPAGE_PMD_NR, &page->_count);
-               while (++page < endpage)
-                       get_huge_page_tail(page);
-       } else {
-               get_page(page);
-       }
-}
-
-static void put_user_huge_page(struct page *page)
-{
-       if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
-               struct page *endpage = page + HPAGE_PMD_NR;
-
-               while (page < endpage)
-                       put_page(page++);
-       } else {
-               put_page(page);
-       }
-}
-
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address,
@@ -1068,13 +1118,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                               vma, address, page_to_nid(page));
                if (unlikely(!pages[i] ||
                             mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
-                                                  &memcg))) {
+                                                  &memcg, false))) {
                        if (pages[i])
                                put_page(pages[i]);
                        while (--i >= 0) {
                                memcg = (void *)page_private(pages[i]);
                                set_page_private(pages[i], 0);
-                               mem_cgroup_cancel_charge(pages[i], memcg);
+                               mem_cgroup_cancel_charge(pages[i], memcg,
+                                               false);
                                put_page(pages[i]);
                        }
                        kfree(pages);
@@ -1112,8 +1163,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
-               page_add_new_anon_rmap(pages[i], vma, haddr);
-               mem_cgroup_commit_charge(pages[i], memcg, false);
+               page_add_new_anon_rmap(pages[i], vma, haddr, false);
+               mem_cgroup_commit_charge(pages[i], memcg, false, false);
                lru_cache_add_active_or_unevictable(pages[i], vma);
                pte = pte_offset_map(&_pmd, haddr);
                VM_BUG_ON(!pte_none(*pte));
@@ -1124,7 +1175,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
-       page_remove_rmap(page);
+       page_remove_rmap(page, true);
        spin_unlock(ptl);
 
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1141,7 +1192,7 @@ out_free_pages:
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
-               mem_cgroup_cancel_charge(pages[i], memcg);
+               mem_cgroup_cancel_charge(pages[i], memcg, false);
                put_page(pages[i]);
        }
        kfree(pages);
@@ -1171,7 +1222,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        page = pmd_page(orig_pmd);
        VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-       if (page_mapcount(page) == 1) {
+       /*
+        * We can only reuse the page if nobody else maps the huge page or
+        * any of its sub-pages. We could check page_mapcount() on each
+        * sub-page, but that's expensive.
+        * The cheaper way is to check that page_count() equals 1: every
+        * mapcount takes a page reference, so this way we can guarantee
+        * that the PMD is the only mapping.
+        * This can give a false negative if somebody pinned the page, but
+        * that's fine.
+        */
+       if (page_mapcount(page) == 1 && page_count(page) == 1) {
                pmd_t entry;
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1180,7 +1241,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                ret |= VM_FAULT_WRITE;
                goto out_unlock;
        }
-       get_user_huge_page(page);
+       get_page(page);
        spin_unlock(ptl);
 alloc:
        if (transparent_hugepage_enabled(vma) &&
@@ -1190,30 +1251,33 @@ alloc:
        } else
                new_page = NULL;
 
-       if (unlikely(!new_page)) {
+       if (likely(new_page)) {
+               prep_transhuge_page(new_page);
+       } else {
                if (!page) {
-                       split_huge_page_pmd(vma, address, pmd);
+                       split_huge_pmd(vma, pmd, address);
                        ret |= VM_FAULT_FALLBACK;
                } else {
                        ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                        pmd, orig_pmd, page, haddr);
                        if (ret & VM_FAULT_OOM) {
-                               split_huge_page(page);
+                               split_huge_pmd(vma, pmd, address);
                                ret |= VM_FAULT_FALLBACK;
                        }
-                       put_user_huge_page(page);
+                       put_page(page);
                }
                count_vm_event(THP_FAULT_FALLBACK);
                goto out;
        }
 
-       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp,
+                                       &memcg, true))) {
                put_page(new_page);
                if (page) {
-                       split_huge_page(page);
-                       put_user_huge_page(page);
+                       split_huge_pmd(vma, pmd, address);
+                       put_page(page);
                } else
-                       split_huge_page_pmd(vma, address, pmd);
+                       split_huge_pmd(vma, pmd, address);
                ret |= VM_FAULT_FALLBACK;
                count_vm_event(THP_FAULT_FALLBACK);
                goto out;
@@ -1233,10 +1297,10 @@ alloc:
 
        spin_lock(ptl);
        if (page)
-               put_user_huge_page(page);
+               put_page(page);
        if (unlikely(!pmd_same(*pmd, orig_pmd))) {
                spin_unlock(ptl);
-               mem_cgroup_cancel_charge(new_page, memcg);
+               mem_cgroup_cancel_charge(new_page, memcg, true);
                put_page(new_page);
                goto out_mn;
        } else {
@@ -1244,8 +1308,8 @@ alloc:
                entry = mk_huge_pmd(new_page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-               page_add_new_anon_rmap(new_page, vma, haddr);
-               mem_cgroup_commit_charge(new_page, memcg, false);
+               page_add_new_anon_rmap(new_page, vma, haddr, true);
+               mem_cgroup_commit_charge(new_page, memcg, false, true);
                lru_cache_add_active_or_unevictable(new_page, vma);
                set_pmd_at(mm, haddr, pmd, entry);
                update_mmu_cache_pmd(vma, address, pmd);
@@ -1254,7 +1318,7 @@ alloc:
                        put_huge_zero_page();
                } else {
                        VM_BUG_ON_PAGE(!PageHead(page), page);
-                       page_remove_rmap(page);
+                       page_remove_rmap(page, true);
                        put_page(page);
                }
                ret |= VM_FAULT_WRITE;
@@ -1307,8 +1371,21 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                          pmd, _pmd,  1))
                        update_mmu_cache_pmd(vma, addr, pmd);
        }
-       if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
-               if (page->mapping && trylock_page(page)) {
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /*
+                * We don't mlock() pte-mapped THPs. This way we can avoid
+                * leaking mlocked pages into non-VM_LOCKED VMAs.
+                *
+                * In most cases the pmd is the only mapping of the page as we
+                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                * writable private mappings in populate_vma_page_range().
+                *
+                * The only scenario in which the page is shared here is when
+                * we are mlocking a read-only mapping shared over fork(). We
+                * skip mlocking such pages.
+                */
+               if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+                               page->mapping && trylock_page(page)) {
                        lru_add_drain();
                        if (page->mapping)
                                mlock_vma_page(page);
@@ -1318,7 +1395,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        VM_BUG_ON_PAGE(!PageCompound(page), page);
        if (flags & FOLL_GET)
-               get_page_foll(page);
+               get_page(page);
 
 out:
        return page;
@@ -1453,13 +1530,47 @@ out:
        return 0;
 }
 
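+/*
+ * MADV_FREE for a huge pmd: mark the pmd old and clean so that reclaim
+ * can discard the THP lazily instead of swapping it out.
+ */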
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                pmd_t *pmd, unsigned long addr)
+{
+       spinlock_t *ptl;
+       struct mm_struct *mm = tlb->mm;
+       int ret = 1;
+
+       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+               struct page *page;
+               pmd_t orig_pmd;
+
+               if (is_huge_zero_pmd(*pmd))
+                       goto out;
+
+               orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+
+               /* No hugepage in swapcache */
+               page = pmd_page(orig_pmd);
+               VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+               orig_pmd = pmd_mkold(orig_pmd);
+               orig_pmd = pmd_mkclean(orig_pmd);
+
+               set_pmd_at(mm, addr, pmd, orig_pmd);
+               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+out:
+               spin_unlock(ptl);
+               ret = 0;
+       }
+
+       return ret;
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
 {
        pmd_t orig_pmd;
        spinlock_t *ptl;
 
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+       if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
@@ -1481,7 +1592,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                put_huge_zero_page();
        } else {
                struct page *page = pmd_page(orig_pmd);
-               page_remove_rmap(page);
+               page_remove_rmap(page, true);
                VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
                add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1493,13 +1604,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        return 1;
 }
 
-int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                  unsigned long old_addr,
                  unsigned long new_addr, unsigned long old_end,
                  pmd_t *old_pmd, pmd_t *new_pmd)
 {
        spinlock_t *old_ptl, *new_ptl;
-       int ret = 0;
        pmd_t pmd;
 
        struct mm_struct *mm = vma->vm_mm;
@@ -1508,7 +1618,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
            (new_addr & ~HPAGE_PMD_MASK) ||
            old_end - old_addr < HPAGE_PMD_SIZE ||
            (new_vma->vm_flags & VM_NOHUGEPAGE))
-               goto out;
+               return false;
 
        /*
         * The destination pmd shouldn't be established, free_pgtables()
@@ -1516,15 +1626,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
         */
        if (WARN_ON(!pmd_none(*new_pmd))) {
                VM_BUG_ON(pmd_trans_huge(*new_pmd));
-               goto out;
+               return false;
        }
 
        /*
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_sem prevents deadlock.
         */
-       ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
-       if (ret == 1) {
+       if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -1540,9 +1649,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                if (new_ptl != old_ptl)
                        spin_unlock(new_ptl);
                spin_unlock(old_ptl);
+               return true;
        }
-out:
-       return ret;
+       return false;
 }
 
 /*
@@ -1558,7 +1667,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        spinlock_t *ptl;
        int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
                pmd_t entry;
                bool preserve_write = prot_numa && pmd_write(*pmd);
                ret = 1;
@@ -1589,29 +1698,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 }
 
 /*
- * Returns 1 if a given pmd maps a stable (not under splitting) thp.
- * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+ * Returns true if a given pmd maps a thp, false otherwise.
  *
- * Note that if it returns 1, this routine returns without unlocking page
- * table locks. So callers must unlock them.
+ * Note that if it returns true, this routine returns without unlocking the
+ * page table lock. So callers must unlock it.
  */
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
 {
        *ptl = pmd_lock(vma->vm_mm, pmd);
-       if (likely(pmd_trans_huge(*pmd))) {
-               if (unlikely(pmd_trans_splitting(*pmd))) {
-                       spin_unlock(*ptl);
-                       wait_split_huge_page(vma->anon_vma, pmd);
-                       return -1;
-               } else {
-                       /* Thp mapped by 'pmd' is stable, so we can
-                        * handle it as it is. */
-                       return 1;
-               }
-       }
+       if (likely(pmd_trans_huge(*pmd)))
+               return true;
        spin_unlock(*ptl);
-       return 0;
+       return false;
 }
 
 /*
@@ -1625,7 +1724,6 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
 pmd_t *page_check_address_pmd(struct page *page,
                              struct mm_struct *mm,
                              unsigned long address,
-                             enum page_check_address_pmd_flag flag,
                              spinlock_t **ptl)
 {
        pgd_t *pgd;
@@ -1648,349 +1746,13 @@ pmd_t *page_check_address_pmd(struct page *page,
                goto unlock;
        if (pmd_page(*pmd) != page)
                goto unlock;
-       /*
-        * split_vma() may create temporary aliased mappings. There is
-        * no risk as long as all huge pmd are found and have their
-        * splitting bit set before __split_huge_page_refcount
-        * runs. Finding the same huge pmd more than once during the
-        * same rmap walk is not a problem.
-        */
-       if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
-           pmd_trans_splitting(*pmd))
-               goto unlock;
-       if (pmd_trans_huge(*pmd)) {
-               VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
-                         !pmd_trans_splitting(*pmd));
+       if (pmd_trans_huge(*pmd))
                return pmd;
-       }
 unlock:
        spin_unlock(*ptl);
        return NULL;
 }
 
-static int __split_huge_page_splitting(struct page *page,
-                                      struct vm_area_struct *vma,
-                                      unsigned long address)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
-       pmd_t *pmd;
-       int ret = 0;
-       /* For mmu_notifiers */
-       const unsigned long mmun_start = address;
-       const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
-
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       pmd = page_check_address_pmd(page, mm, address,
-                       PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
-       if (pmd) {
-               /*
-                * We can't temporarily set the pmd to null in order
-                * to split it, the pmd must remain marked huge at all
-                * times or the VM won't take the pmd_trans_huge paths
-                * and it won't wait on the anon_vma->root->rwsem to
-                * serialize against split_huge_page*.
-                */
-               pmdp_splitting_flush(vma, address, pmd);
-
-               ret = 1;
-               spin_unlock(ptl);
-       }
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-
-       return ret;
-}
-
-static void __split_huge_page_refcount(struct page *page,
-                                      struct list_head *list)
-{
-       int i;
-       struct zone *zone = page_zone(page);
-       struct lruvec *lruvec;
-       int tail_count = 0;
-
-       /* prevent PageLRU to go away from under us, and freeze lru stats */
-       spin_lock_irq(&zone->lru_lock);
-       lruvec = mem_cgroup_page_lruvec(page, zone);
-
-       compound_lock(page);
-       /* complete memcg works before add pages to LRU */
-       mem_cgroup_split_huge_fixup(page);
-
-       for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
-               struct page *page_tail = page + i;
-
-               /* tail_page->_mapcount cannot change */
-               BUG_ON(page_mapcount(page_tail) < 0);
-               tail_count += page_mapcount(page_tail);
-               /* check for overflow */
-               BUG_ON(tail_count < 0);
-               BUG_ON(atomic_read(&page_tail->_count) != 0);
-               /*
-                * tail_page->_count is zero and not changing from
-                * under us. But get_page_unless_zero() may be running
-                * from under us on the tail_page. If we used
-                * atomic_set() below instead of atomic_add(), we
-                * would then run atomic_set() concurrently with
-                * get_page_unless_zero(), and atomic_set() is
-                * implemented in C not using locked ops. spin_unlock
-                * on x86 sometime uses locked ops because of PPro
-                * errata 66, 92, so unless somebody can guarantee
-                * atomic_set() here would be safe on all archs (and
-                * not only on x86), it's safer to use atomic_add().
-                */
-               atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
-                          &page_tail->_count);
-
-               /* after clearing PageTail the gup refcount can be released */
-               smp_mb__after_atomic();
-
-               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-               page_tail->flags |= (page->flags &
-                                    ((1L << PG_referenced) |
-                                     (1L << PG_swapbacked) |
-                                     (1L << PG_mlocked) |
-                                     (1L << PG_uptodate) |
-                                     (1L << PG_active) |
-                                     (1L << PG_unevictable)));
-               page_tail->flags |= (1L << PG_dirty);
-
-               /* clear PageTail before overwriting first_page */
-               smp_wmb();
-
-               if (page_is_young(page))
-                       set_page_young(page_tail);
-               if (page_is_idle(page))
-                       set_page_idle(page_tail);
-
-               /*
-                * __split_huge_page_splitting() already set the
-                * splitting bit in all pmd that could map this
-                * hugepage, that will ensure no CPU can alter the
-                * mapcount on the head page. The mapcount is only
-                * accounted in the head page and it has to be
-                * transferred to all tail pages in the below code. So
-                * for this code to be safe, the split the mapcount
-                * can't change. But that doesn't mean userland can't
-                * keep changing and reading the page contents while
-                * we transfer the mapcount, so the pmd splitting
-                * status is achieved setting a reserved bit in the
-                * pmd, not by clearing the present bit.
-               */
-               page_tail->_mapcount = page->_mapcount;
-
-               BUG_ON(page_tail->mapping);
-               page_tail->mapping = page->mapping;
-
-               page_tail->index = page->index + i;
-               page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-
-               BUG_ON(!PageAnon(page_tail));
-               BUG_ON(!PageUptodate(page_tail));
-               BUG_ON(!PageDirty(page_tail));
-               BUG_ON(!PageSwapBacked(page_tail));
-
-               lru_add_page_tail(page, page_tail, lruvec, list);
-       }
-       atomic_sub(tail_count, &page->_count);
-       BUG_ON(atomic_read(&page->_count) <= 0);
-
-       __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-
-       ClearPageCompound(page);
-       compound_unlock(page);
-       spin_unlock_irq(&zone->lru_lock);
-
-       for (i = 1; i < HPAGE_PMD_NR; i++) {
-               struct page *page_tail = page + i;
-               BUG_ON(page_count(page_tail) <= 0);
-               /*
-                * Tail pages may be freed if there wasn't any mapping
-                * like if add_to_swap() is running on a lru page that
-                * had its mapping zapped. And freeing these pages
-                * requires taking the lru_lock so we do the put_page
-                * of the tail pages after the split is complete.
-                */
-               put_page(page_tail);
-       }
-
-       /*
-        * Only the head page (now become a regular page) is required
-        * to be pinned by the caller.
-        */
-       BUG_ON(page_count(page) <= 0);
-}
-
-static int __split_huge_page_map(struct page *page,
-                                struct vm_area_struct *vma,
-                                unsigned long address)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
-       pmd_t *pmd, _pmd;
-       int ret = 0, i;
-       pgtable_t pgtable;
-       unsigned long haddr;
-
-       pmd = page_check_address_pmd(page, mm, address,
-                       PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
-       if (pmd) {
-               pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-               pmd_populate(mm, &_pmd, pgtable);
-               if (pmd_write(*pmd))
-                       BUG_ON(page_mapcount(page) != 1);
-
-               haddr = address;
-               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-                       pte_t *pte, entry;
-                       BUG_ON(PageCompound(page+i));
-                       /*
-                        * Note that NUMA hinting access restrictions are not
-                        * transferred to avoid any possibility of altering
-                        * permissions across VMAs.
-                        */
-                       entry = mk_pte(page + i, vma->vm_page_prot);
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-                       if (!pmd_write(*pmd))
-                               entry = pte_wrprotect(entry);
-                       if (!pmd_young(*pmd))
-                               entry = pte_mkold(entry);
-                       pte = pte_offset_map(&_pmd, haddr);
-                       BUG_ON(!pte_none(*pte));
-                       set_pte_at(mm, haddr, pte, entry);
-                       pte_unmap(pte);
-               }
-
-               smp_wmb(); /* make pte visible before pmd */
-               /*
-                * Up to this point the pmd is present and huge and
-                * userland has the whole access to the hugepage
-                * during the split (which happens in place). If we
-                * overwrite the pmd with the not-huge version
-                * pointing to the pte here (which of course we could
-                * if all CPUs were bug free), userland could trigger
-                * a small page size TLB miss on the small sized TLB
-                * while the hugepage TLB entry is still established
-                * in the huge TLB. Some CPU doesn't like that. See
-                * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
-                * Erratum 383 on page 93. Intel should be safe but is
-                * also warns that it's only safe if the permission
-                * and cache attributes of the two entries loaded in
-                * the two TLB is identical (which should be the case
-                * here). But it is generally safer to never allow
-                * small and huge TLB entries for the same virtual
-                * address to be loaded simultaneously. So instead of
-                * doing "pmd_populate(); flush_pmd_tlb_range();" we first
-                * mark the current pmd notpresent (atomically because
-                * here the pmd_trans_huge and pmd_trans_splitting
-                * must remain set at all times on the pmd until the
-                * split is complete for this pmd), then we flush the
-                * SMP TLB and finally we write the non-huge version
-                * of the pmd entry with pmd_populate.
-                */
-               pmdp_invalidate(vma, address, pmd);
-               pmd_populate(mm, pmd, pgtable);
-               ret = 1;
-               spin_unlock(ptl);
-       }
-
-       return ret;
-}
-
-/* must be called with anon_vma->root->rwsem held */
-static void __split_huge_page(struct page *page,
-                             struct anon_vma *anon_vma,
-                             struct list_head *list)
-{
-       int mapcount, mapcount2;
-       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-       struct anon_vma_chain *avc;
-
-       BUG_ON(!PageHead(page));
-       BUG_ON(PageTail(page));
-
-       mapcount = 0;
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-               struct vm_area_struct *vma = avc->vma;
-               unsigned long addr = vma_address(page, vma);
-               BUG_ON(is_vma_temporary_stack(vma));
-               mapcount += __split_huge_page_splitting(page, vma, addr);
-       }
-       /*
-        * It is critical that new vmas are added to the tail of the
-        * anon_vma list. This guarantes that if copy_huge_pmd() runs
-        * and establishes a child pmd before
-        * __split_huge_page_splitting() freezes the parent pmd (so if
-        * we fail to prevent copy_huge_pmd() from running until the
-        * whole __split_huge_page() is complete), we will still see
-        * the newly established pmd of the child later during the
-        * walk, to be able to set it as pmd_trans_splitting too.
-        */
-       if (mapcount != page_mapcount(page)) {
-               pr_err("mapcount %d page_mapcount %d\n",
-                       mapcount, page_mapcount(page));
-               BUG();
-       }
-
-       __split_huge_page_refcount(page, list);
-
-       mapcount2 = 0;
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-               struct vm_area_struct *vma = avc->vma;
-               unsigned long addr = vma_address(page, vma);
-               BUG_ON(is_vma_temporary_stack(vma));
-               mapcount2 += __split_huge_page_map(page, vma, addr);
-       }
-       if (mapcount != mapcount2) {
-               pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
-                       mapcount, mapcount2, page_mapcount(page));
-               BUG();
-       }
-}
-
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return 1.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
-{
-       struct anon_vma *anon_vma;
-       int ret = 1;
-
-       BUG_ON(is_huge_zero_page(page));
-       BUG_ON(!PageAnon(page));
-
-       /*
-        * The caller does not necessarily hold an mmap_sem that would prevent
-        * the anon_vma disappearing so we first we take a reference to it
-        * and then lock the anon_vma for write. This is similar to
-        * page_lock_anon_vma_read except the write lock is taken to serialise
-        * against parallel split or collapse operations.
-        */
-       anon_vma = page_get_anon_vma(page);
-       if (!anon_vma)
-               goto out;
-       anon_vma_lock_write(anon_vma);
-
-       ret = 0;
-       if (!PageCompound(page))
-               goto out_unlock;
-
-       BUG_ON(!PageSwapBacked(page));
-       __split_huge_page(page, anon_vma, list);
-       count_vm_event(THP_SPLIT);
-
-       BUG_ON(PageCompound(page));
-out_unlock:
-       anon_vma_unlock_write(anon_vma);
-       put_anon_vma(anon_vma);
-out:
-       return ret;
-}
-
 #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
@@ -2199,26 +1961,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pte_t *pte)
 {
-       struct page *page;
+       struct page *page = NULL;
        pte_t *_pte;
-       int none_or_zero = 0;
+       int none_or_zero = 0, result = 0;
        bool referenced = false, writable = false;
+
        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none)
+                           ++none_or_zero <= khugepaged_max_ptes_none) {
                                continue;
-                       else
+                       } else {
+                               result = SCAN_EXCEED_NONE_PTE;
                                goto out;
+                       }
                }
-               if (!pte_present(pteval))
+               if (!pte_present(pteval)) {
+                       result = SCAN_PTE_NON_PRESENT;
                        goto out;
+               }
                page = vm_normal_page(vma, address, pteval);
-               if (unlikely(!page))
+               if (unlikely(!page)) {
+                       result = SCAN_PAGE_NULL;
                        goto out;
+               }
 
                VM_BUG_ON_PAGE(PageCompound(page), page);
                VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -2230,8 +1999,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 * is needed to serialize against split_huge_page
                 * when invoked from the VM.
                 */
-               if (!trylock_page(page))
+               if (!trylock_page(page)) {
+                       result = SCAN_PAGE_LOCK;
                        goto out;
+               }
 
                /*
                 * cannot use mapcount: can't collapse if there's a gup pin.
@@ -2240,6 +2011,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 */
                if (page_count(page) != 1 + !!PageSwapCache(page)) {
                        unlock_page(page);
+                       result = SCAN_PAGE_COUNT;
                        goto out;
                }
                if (pte_write(pteval)) {
@@ -2247,6 +2019,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                } else {
                        if (PageSwapCache(page) && !reuse_swap_page(page)) {
                                unlock_page(page);
+                               result = SCAN_SWAP_CACHE_PAGE;
                                goto out;
                        }
                        /*
@@ -2261,6 +2034,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 */
                if (isolate_lru_page(page)) {
                        unlock_page(page);
+                       result = SCAN_DEL_PAGE_LRU;
                        goto out;
                }
                /* 0 stands for page_is_file_cache(page) == false */
@@ -2274,10 +2048,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
-       if (likely(referenced && writable))
-               return 1;
+       if (likely(writable)) {
+               if (likely(referenced)) {
+                       result = SCAN_SUCCEED;
+                       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                                                           referenced, writable, result);
+                       return 1;
+               }
+       } else {
+               result = SCAN_PAGE_RO;
+       }
+
 out:
        release_pte_pages(pte, _pte);
+       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                                           referenced, writable, result);
        return 0;
 }
 
@@ -2322,7 +2107,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
                         * superfluous.
                         */
                        pte_clear(vma->vm_mm, address, _pte);
-                       page_remove_rmap(src_page);
+                       page_remove_rmap(src_page, false);
                        spin_unlock(ptl);
                        free_page_and_swap_cache(src_page);
                }
@@ -2433,6 +2218,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
                return NULL;
        }
 
+       prep_transhuge_page(*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return *hpage;
 }
@@ -2444,8 +2230,12 @@ static int khugepaged_find_target_node(void)
 
 static inline struct page *alloc_hugepage(int defrag)
 {
-       return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-                          HPAGE_PMD_ORDER);
+       struct page *page;
+
+       page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+       if (page)
+               prep_transhuge_page(page);
+       return page;
 }
 
 static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2496,7 +2286,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
        if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
            (vma->vm_flags & VM_NOHUGEPAGE))
                return false;
-
        if (!vma->anon_vma || vma->vm_ops)
                return false;
        if (is_vma_temporary_stack(vma))
@@ -2505,6 +2294,44 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
        return true;
 }
 
+/*
+ * Bring missing pages in from swap, to complete THP collapse.
+ * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ *
+ * Called and returns without pte mapped or spinlocks held,
+ * but with mmap_sem held to protect against vma changes.
+ */
+static void __collapse_huge_page_swapin(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       unsigned long address, pmd_t *pmd)
+{
+       unsigned long _address;
+       pte_t *pte, pteval;
+       int swapped_in = 0, ret = 0;
+
+       pte = pte_offset_map(pmd, address);
+       for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+            pte++, _address += PAGE_SIZE) {
+               pteval = *pte;
+               if (!is_swap_pte(pteval))
+                       continue;
+               swapped_in++;
+               ret = do_swap_page(mm, vma, _address, pte, pmd,
+                                  FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+                                  pteval);
+               if (ret & VM_FAULT_ERROR) {
+                       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+                       return;
+               }
+               /* pte is unmapped now; map it again */
+               pte = pte_offset_map(pmd, _address);
+       }
+       pte--;
+       pte_unmap(pte);
+       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+}
+
 static void collapse_huge_page(struct mm_struct *mm,
                                   unsigned long address,
                                   struct page **hpage,
@@ -2516,7 +2343,7 @@ static void collapse_huge_page(struct mm_struct *mm,
        pgtable_t pgtable;
        struct page *new_page;
        spinlock_t *pmd_ptl, *pte_ptl;
-       int isolated;
+       int isolated = 0, result = 0;
        unsigned long hstart, hend;
        struct mem_cgroup *memcg;
        unsigned long mmun_start;       /* For mmu_notifiers */
@@ -2531,12 +2358,15 @@ static void collapse_huge_page(struct mm_struct *mm,
 
        /* release the mmap_sem read lock. */
        new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
-       if (!new_page)
-               return;
+       if (!new_page) {
+               result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+               goto out_nolock;
+       }
 
-       if (unlikely(mem_cgroup_try_charge(new_page, mm,
-                                          gfp, &memcg)))
-               return;
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+               result = SCAN_CGROUP_CHARGE_FAIL;
+               goto out_nolock;
+       }
 
        /*
         * Prevent all access to pagetables with the exception of
@@ -2544,21 +2374,33 @@ static void collapse_huge_page(struct mm_struct *mm,
         * handled by the anon_vma lock + PG_lock.
         */
        down_write(&mm->mmap_sem);
-       if (unlikely(khugepaged_test_exit(mm)))
+       if (unlikely(khugepaged_test_exit(mm))) {
+               result = SCAN_ANY_PROCESS;
                goto out;
+       }
 
        vma = find_vma(mm, address);
-       if (!vma)
+       if (!vma) {
+               result = SCAN_VMA_NULL;
                goto out;
+       }
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
-       if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+       if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+               result = SCAN_ADDRESS_RANGE;
                goto out;
-       if (!hugepage_vma_check(vma))
+       }
+       if (!hugepage_vma_check(vma)) {
+               result = SCAN_VMA_CHECK;
                goto out;
+       }
        pmd = mm_find_pmd(mm, address);
-       if (!pmd)
+       if (!pmd) {
+               result = SCAN_PMD_NULL;
                goto out;
+       }
+
+       __collapse_huge_page_swapin(mm, vma, address, pmd);
 
        anon_vma_lock_write(vma->anon_vma);
 
@@ -2595,6 +2437,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
+               result = SCAN_FAIL;
                goto out;
        }
 
@@ -2621,8 +2464,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
-       page_add_new_anon_rmap(new_page, vma, address);
-       mem_cgroup_commit_charge(new_page, memcg, false);
+       page_add_new_anon_rmap(new_page, vma, address, true);
+       mem_cgroup_commit_charge(new_page, memcg, false, true);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
@@ -2632,12 +2475,14 @@ static void collapse_huge_page(struct mm_struct *mm,
        *hpage = NULL;
 
        khugepaged_pages_collapsed++;
+       result = SCAN_SUCCEED;
 out_up_write:
        up_write(&mm->mmap_sem);
+out_nolock:
+       trace_mm_collapse_huge_page(mm, isolated, result);
        return;
-
 out:
-       mem_cgroup_cancel_charge(new_page, memcg);
+       mem_cgroup_cancel_charge(new_page, memcg, true);
        goto out_up_write;
 }
 
@@ -2648,39 +2493,62 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
        pmd_t *pmd;
        pte_t *pte, *_pte;
-       int ret = 0, none_or_zero = 0;
-       struct page *page;
+       int ret = 0, none_or_zero = 0, result = 0;
+       struct page *page = NULL;
        unsigned long _address;
        spinlock_t *ptl;
-       int node = NUMA_NO_NODE;
+       int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false, referenced = false;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
        pmd = mm_find_pmd(mm, address);
-       if (!pmd)
+       if (!pmd) {
+               result = SCAN_PMD_NULL;
                goto out;
+       }
 
        memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
+               if (is_swap_pte(pteval)) {
+                       if (++unmapped <= khugepaged_max_ptes_swap) {
+                               continue;
+                       } else {
+                               result = SCAN_EXCEED_SWAP_PTE;
+                               goto out_unmap;
+                       }
+               }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none)
+                           ++none_or_zero <= khugepaged_max_ptes_none) {
                                continue;
-                       else
+                       } else {
+                               result = SCAN_EXCEED_NONE_PTE;
                                goto out_unmap;
+                       }
                }
-               if (!pte_present(pteval))
+               if (!pte_present(pteval)) {
+                       result = SCAN_PTE_NON_PRESENT;
                        goto out_unmap;
+               }
                if (pte_write(pteval))
                        writable = true;
 
                page = vm_normal_page(vma, _address, pteval);
-               if (unlikely(!page))
+               if (unlikely(!page)) {
+                       result = SCAN_PAGE_NULL;
+                       goto out_unmap;
+               }
+
+               /* TODO: teach khugepaged to collapse THP mapped with pte */
+               if (PageCompound(page)) {
+                       result = SCAN_PAGE_COMPOUND;
                        goto out_unmap;
+               }
+
                /*
                 * Record which node the original page is from and save this
                 * information to khugepaged_node_load[].
@@ -2688,26 +2556,48 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 * hit record.
                 */
                node = page_to_nid(page);
-               if (khugepaged_scan_abort(node))
+               if (khugepaged_scan_abort(node)) {
+                       result = SCAN_SCAN_ABORT;
                        goto out_unmap;
+               }
                khugepaged_node_load[node]++;
-               VM_BUG_ON_PAGE(PageCompound(page), page);
-               if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+               if (!PageLRU(page)) {
+                       result = SCAN_SCAN_ABORT;
+                       goto out_unmap;
+               }
+               if (PageLocked(page)) {
+                       result = SCAN_PAGE_LOCK;
+                       goto out_unmap;
+               }
+               if (!PageAnon(page)) {
+                       result = SCAN_PAGE_ANON;
                        goto out_unmap;
+               }
+
                /*
                 * cannot use mapcount: can't collapse if there's a gup pin.
                 * The page must only be referenced by the scanned process
                 * and page swap cache.
                 */
-               if (page_count(page) != 1 + !!PageSwapCache(page))
+               if (page_count(page) != 1 + !!PageSwapCache(page)) {
+                       result = SCAN_PAGE_COUNT;
                        goto out_unmap;
+               }
                if (pte_young(pteval) ||
                    page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
-       if (referenced && writable)
-               ret = 1;
+       if (writable) {
+               if (referenced) {
+                       result = SCAN_SUCCEED;
+                       ret = 1;
+               } else {
+                       result = SCAN_NO_REFERENCED_PAGE;
+               }
+       } else {
+               result = SCAN_PAGE_RO;
+       }
 out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (ret) {
@@ -2716,6 +2606,8 @@ out_unmap:
                collapse_huge_page(mm, address, hpage, vma, node);
        }
 out:
+       trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+                                    none_or_zero, result, unmapped);
        return ret;
 }
 
@@ -2941,8 +2833,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
        pmd_t _pmd;
        int i;
 
-       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
@@ -2961,66 +2853,123 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
        put_huge_zero_page();
 }
 
-void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
-               pmd_t *pmd)
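+/*
+ * Split a huge pmd in place: replace it with a page table of ptes mapping
+ * the same sub-pages (DAX and huge zero pmds are handled specially).
+ * With @freeze, migration entries are installed instead of present ptes,
+ * so the mapping cannot be used while the compound page is being split.
+ */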
+static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long haddr, bool freeze)
 {
-       spinlock_t *ptl;
-       struct page *page = NULL;
        struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & HPAGE_PMD_MASK;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
+       struct page *page;
+       pgtable_t pgtable;
+       pmd_t _pmd;
+       bool young, write;
+       int i;
 
-       BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+       VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+       VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+       VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+       VM_BUG_ON(!pmd_trans_huge(*pmd));
+
+       count_vm_event(THP_SPLIT_PMD);
 
-       mmun_start = haddr;
-       mmun_end   = haddr + HPAGE_PMD_SIZE;
-again:
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       ptl = pmd_lock(mm, pmd);
-       if (unlikely(!pmd_trans_huge(*pmd)))
-               goto unlock;
        if (vma_is_dax(vma)) {
                pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
                if (is_huge_zero_pmd(_pmd))
                        put_huge_zero_page();
+               return;
        } else if (is_huge_zero_pmd(*pmd)) {
-               __split_huge_zero_page_pmd(vma, haddr, pmd);
-       } else {
-               page = pmd_page(*pmd);
-               VM_BUG_ON_PAGE(!page_count(page), page);
-               get_page(page);
+               return __split_huge_zero_page_pmd(vma, haddr, pmd);
        }
- unlock:
-       spin_unlock(ptl);
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
-       if (!page)
-               return;
+       page = pmd_page(*pmd);
+       VM_BUG_ON_PAGE(!page_count(page), page);
+       atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+       write = pmd_write(*pmd);
+       young = pmd_young(*pmd);
 
-       split_huge_page(page);
-       put_page(page);
+       /* leave pmd empty until pte is filled */
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+
+       pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+       pmd_populate(mm, &_pmd, pgtable);
+
+       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+               pte_t entry, *pte;
+               /*
+                * Note that NUMA hinting access restrictions are not
+                * transferred to avoid any possibility of altering
+                * permissions across VMAs.
+                */
+               if (freeze) {
+                       swp_entry_t swp_entry;
+                       swp_entry = make_migration_entry(page + i, write);
+                       entry = swp_entry_to_pte(swp_entry);
+               } else {
+                       entry = mk_pte(page + i, vma->vm_page_prot);
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       if (!write)
+                               entry = pte_wrprotect(entry);
+                       if (!young)
+                               entry = pte_mkold(entry);
+               }
+               pte = pte_offset_map(&_pmd, haddr);
+               BUG_ON(!pte_none(*pte));
+               set_pte_at(mm, haddr, pte, entry);
+               atomic_inc(&page[i]._mapcount);
+               pte_unmap(pte);
+       }
 
        /*
-        * We don't always have down_write of mmap_sem here: a racing
-        * do_huge_pmd_wp_page() might have copied-on-write to another
-        * huge page before our split_huge_page() got the anon_vma lock.
+        * Set PG_double_map before dropping compound_mapcount to avoid
+        * false-negative page_mapped().
         */
-       if (unlikely(pmd_trans_huge(*pmd)))
-               goto again;
+       if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+               for (i = 0; i < HPAGE_PMD_NR; i++)
+                       atomic_inc(&page[i]._mapcount);
+       }
+
+       if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+               /* Last compound_mapcount is gone. */
+               __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+               if (TestClearPageDoubleMap(page)) {
+                       /* No need for the mapcount reference anymore */
+                       for (i = 0; i < HPAGE_PMD_NR; i++)
+                               atomic_dec(&page[i]._mapcount);
+               }
+       }
+
+       smp_wmb(); /* make pte visible before pmd */
+       pmd_populate(mm, pmd, pgtable);
 }
 
-void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd)
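+/*
+ * Split the huge pmd mapping @address.  If the page was mlocked, munlock
+ * it afterwards, as pte-mapped THPs are not kept mlocked.
+ */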
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long address)
 {
-       struct vm_area_struct *vma;
+       spinlock_t *ptl;
+       struct mm_struct *mm = vma->vm_mm;
+       struct page *page = NULL;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
 
-       vma = find_vma(mm, address);
-       BUG_ON(vma == NULL);
-       split_huge_page_pmd(vma, address, pmd);
+       mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       ptl = pmd_lock(mm, pmd);
+       if (unlikely(!pmd_trans_huge(*pmd)))
+               goto out;
+       page = pmd_page(*pmd);
+       __split_huge_pmd_locked(vma, pmd, haddr, false);
+       if (PageMlocked(page))
+               get_page(page);
+       else
+               page = NULL;
+out:
+       spin_unlock(ptl);
+       mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       if (page) {
+               lock_page(page);
+               munlock_vma_page(page);
+               unlock_page(page);
+               put_page(page);
+       }
 }
 
-static void split_huge_page_address(struct mm_struct *mm,
+static void split_huge_pmd_address(struct vm_area_struct *vma,
                                    unsigned long address)
 {
        pgd_t *pgd;
@@ -3029,7 +2978,7 @@ static void split_huge_page_address(struct mm_struct *mm,
 
        VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
 
-       pgd = pgd_offset(mm, address);
+       pgd = pgd_offset(vma->vm_mm, address);
        if (!pgd_present(*pgd))
                return;
 
@@ -3038,13 +2987,13 @@ static void split_huge_page_address(struct mm_struct *mm,
                return;
 
        pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
+       if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
         * materialize from under us.
         */
-       split_huge_page_pmd_mm(mm, address, pmd);
+       split_huge_pmd(vma, pmd, address);
 }
 
 void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3060,7 +3009,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        if (start & ~HPAGE_PMD_MASK &&
            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_page_address(vma->vm_mm, start);
+               split_huge_pmd_address(vma, start);
 
        /*
         * If the new end address isn't hpage aligned and it could
@@ -3070,7 +3019,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        if (end & ~HPAGE_PMD_MASK &&
            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_page_address(vma->vm_mm, end);
+               split_huge_pmd_address(vma, end);
 
        /*
         * If we're also updating the vma->vm_next->vm_start, if the new
@@ -3084,6 +3033,415 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
                if (nstart & ~HPAGE_PMD_MASK &&
                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-                       split_huge_page_address(next->vm_mm, nstart);
+                       split_huge_pmd_address(next, nstart);
+       }
+}
+
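+/*
+ * Unmap @page from @vma for the duration of the split: a huge pmd is
+ * split with migration entries installed (freeze == true), and present
+ * ptes of a pte-mapped THP are replaced with migration entries too.
+ */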
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+{
+       spinlock_t *ptl;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       int i;
+
+       pgd = pgd_offset(vma->vm_mm, address);
+       if (!pgd_present(*pgd))
+               return;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+       pmd = pmd_offset(pud, address);
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       if (!pmd_present(*pmd)) {
+               spin_unlock(ptl);
+               return;
+       }
+       if (pmd_trans_huge(*pmd)) {
+               if (page == pmd_page(*pmd))
+                       __split_huge_pmd_locked(vma, pmd, address, true);
+               spin_unlock(ptl);
+               return;
+       }
+       spin_unlock(ptl);
+
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+               pte_t entry, swp_pte;
+               swp_entry_t swp_entry;
+
+               if (!pte_present(pte[i]))
+                       continue;
+               if (page_to_pfn(page) != pte_pfn(pte[i]))
+                       continue;
+               flush_cache_page(vma, address, page_to_pfn(page));
+               entry = ptep_clear_flush(vma, address, pte + i);
+               swp_entry = make_migration_entry(page, pte_write(entry));
+               swp_pte = swp_entry_to_pte(swp_entry);
+               if (pte_soft_dirty(entry))
+                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+               set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+       }
+       pte_unmap_unlock(pte, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+                       pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long haddr;
+
+               haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
+               freeze_page_vma(avc->vma, page, haddr);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
+       }
+}
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+{
+       spinlock_t *ptl;
+       pmd_t *pmd;
+       pte_t *pte, entry;
+       swp_entry_t swp_entry;
+       int i;
+
+       pmd = mm_find_pmd(vma->vm_mm, address);
+       if (!pmd)
+               return;
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+               if (!page_mapped(page))
+                       continue;
+               if (!is_swap_pte(pte[i]))
+                       continue;
+
+               swp_entry = pte_to_swp_entry(pte[i]);
+               if (!is_migration_entry(swp_entry))
+                       continue;
+               if (migration_entry_to_page(swp_entry) != page)
+                       continue;
+
+               entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+               entry = pte_mkdirty(entry);
+               if (is_write_migration_entry(swp_entry))
+                       entry = maybe_mkwrite(entry, vma);
+
+               flush_dcache_page(page);
+               set_pte_at(vma->vm_mm, address, pte + i, entry);
+
+               /* No need to invalidate - it was non-present before */
+               update_mmu_cache(vma, address, pte + i);
+       }
+       pte_unmap_unlock(pte, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+                       pgoff, pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long address = __vma_address(page, avc->vma);
+
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+               unfreeze_page_vma(avc->vma, page, address);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
        }
 }
+
+static int total_mapcount(struct page *page)
+{
+       int i, ret;
+
+       ret = compound_mapcount(page);
+       for (i = 0; i < HPAGE_PMD_NR; i++)
+               ret += atomic_read(&page[i]._mapcount) + 1;
+
+       if (PageDoubleMap(page))
+               ret -= HPAGE_PMD_NR;
+
+       return ret;
+}
+
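+
For reference, total_mapcount() above folds three conventions together: compound_mapcount() for the PMD mapping, per-subpage _mapcount values stored with a -1 bias, and a PageDoubleMap() correction, assuming the convention that PageDoubleMap() means the PMD mapping has also been counted in every subpage's _mapcount. A minimal userspace model of that arithmetic, with illustrative names that are not kernel API:

#include <stdio.h>

#define HPAGE_PMD_NR 512        /* assumed: 2MB THP with 4KB base pages */

/* subpage[] holds the biased _mapcount values (-1 means "not mapped"). */
static int model_total_mapcount(int compound_mapcount,
                                const int subpage[HPAGE_PMD_NR],
                                int double_map)
{
        int i, ret = compound_mapcount;

        for (i = 0; i < HPAGE_PMD_NR; i++)
                ret += subpage[i] + 1;          /* undo the -1 bias */

        if (double_map)                         /* PageDoubleMap(): PMD mapping */
                ret -= HPAGE_PMD_NR;            /* was counted once per subpage */

        return ret;
}

int main(void)
{
        int sub[HPAGE_PMD_NR], i;

        /* Mapped by one PMD only: subpage counts stay at the -1 bias. */
        for (i = 0; i < HPAGE_PMD_NR; i++)
                sub[i] = -1;
        printf("PMD only:      %d\n", model_total_mapcount(1, sub, 0));  /* 1 */

        /* Also PTE-mapped once per subpage, with the PMD mapping mirrored
         * into each subpage count (the DoubleMap case). */
        for (i = 0; i < HPAGE_PMD_NR; i++)
                sub[i] = 1;
        printf("PMD + 512 PTE: %d\n", model_total_mapcount(1, sub, 1));  /* 513 */
        return 0;
}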
+static int __split_huge_page_tail(struct page *head, int tail,
+               struct lruvec *lruvec, struct list_head *list)
+{
+       int mapcount;
+       struct page *page_tail = head + tail;
+
+       mapcount = atomic_read(&page_tail->_mapcount) + 1;
+       VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+       /*
+        * tail_page->_count is zero and not changing from under us. But
+        * get_page_unless_zero() may be running from under us on the
+        * tail_page. If we used atomic_set() below instead of atomic_add(), we
+        * would then run atomic_set() concurrently with
+        * get_page_unless_zero(), and atomic_set() is implemented in C not
+        * using locked ops. spin_unlock on x86 sometimes uses locked ops
+        * because of PPro errata 66, 92, so unless somebody can guarantee
+        * atomic_set() here would be safe on all archs (and not only on x86),
+        * it's safer to use atomic_add().
+        */
+       atomic_add(mapcount + 1, &page_tail->_count);
+
+       /* after clearing PageTail the gup refcount can be released */
+       smp_mb__after_atomic();
+
+       page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+       page_tail->flags |= (head->flags &
+                       ((1L << PG_referenced) |
+                        (1L << PG_swapbacked) |
+                        (1L << PG_mlocked) |
+                        (1L << PG_uptodate) |
+                        (1L << PG_active) |
+                        (1L << PG_locked) |
+                        (1L << PG_unevictable)));
+       page_tail->flags |= (1L << PG_dirty);
+
+       clear_compound_head(page_tail);
+
+       if (page_is_young(head))
+               set_page_young(page_tail);
+       if (page_is_idle(head))
+               set_page_idle(page_tail);
+
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+
+       page_tail->index = head->index + tail;
+       page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+       lru_add_page_tail(head, page_tail, lruvec, list);
+
+       return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+       struct page *head = compound_head(page);
+       struct zone *zone = page_zone(head);
+       struct lruvec *lruvec;
+       int i, tail_mapcount;
+
+       /* prevent PageLRU from going away under us, and freeze lru stats */
+       spin_lock_irq(&zone->lru_lock);
+       lruvec = mem_cgroup_page_lruvec(head, zone);
+
+       spin_lock(&split_queue_lock);
+       if (!list_empty(page_deferred_list(head))) {
+               split_queue_len--;
+               list_del(page_deferred_list(head));
+       }
+       spin_unlock(&split_queue_lock);
+
+       /* complete memcg work before adding pages to the LRU */
+       mem_cgroup_split_huge_fixup(head);
+
+       tail_mapcount = 0;
+       for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+               tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+       atomic_sub(tail_mapcount, &head->_count);
+
+       ClearPageCompound(head);
+       spin_unlock_irq(&zone->lru_lock);
+
+       unfreeze_page(page_anon_vma(head), head);
+
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               struct page *subpage = head + i;
+               if (subpage == page)
+                       continue;
+               unlock_page(subpage);
+
+               /*
+                * Subpages may be freed if there wasn't any mapping,
+                * e.g. if add_to_swap() is running on an LRU page that
+                * had its mapping zapped. Freeing these pages requires
+                * taking the lru_lock, so we do the put_page of the
+                * tail pages after the split is complete.
+                */
+               put_page(subpage);
+       }
+}
+
+/*
+ * This function splits a huge page into normal pages. @page can point to any
+ * subpage of the huge page to split. Splitting doesn't change the position of @page.
+ *
+ * The caller must hold the only pin on the @page, otherwise the split fails with -EBUSY.
+ * The huge page must be locked.
+ *
+ * If @list is NULL, the tail pages will be added to the LRU list; otherwise to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * The GUP pin and PG_locked are transferred to @page. The remaining subpages
+ * can be freed if they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+       struct page *head = compound_head(page);
+       struct anon_vma *anon_vma;
+       int count, mapcount, ret;
+
+       VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+       VM_BUG_ON_PAGE(!PageAnon(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+       VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+       /*
+        * The caller does not necessarily hold an mmap_sem that would prevent
+        * the anon_vma disappearing, so we first take a reference to it
+        * and then lock the anon_vma for write. This is similar to
+        * page_lock_anon_vma_read except the write lock is taken to serialise
+        * against parallel split or collapse operations.
+        */
+       anon_vma = page_get_anon_vma(head);
+       if (!anon_vma) {
+               ret = -EBUSY;
+               goto out;
+       }
+       anon_vma_lock_write(anon_vma);
+
+       /*
+        * Racy check whether we can split the page, before freeze_page()
+        * splits the PMDs.
+        */
+       if (total_mapcount(head) != page_count(head) - 1) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+
+       freeze_page(anon_vma, head);
+       VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+       count = page_count(head);
+       mapcount = total_mapcount(head);
+       if (mapcount == count - 1) {
+               __split_huge_page(page, list);
+               ret = 0;
+       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+               pr_alert("total_mapcount: %d, page_count(): %d\n",
+                               mapcount, count);
+               if (PageTail(page))
+                       dump_page(head, NULL);
+               dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+               BUG();
+       } else {
+               unfreeze_page(anon_vma, head);
+               ret = -EBUSY;
+       }
+
+out_unlock:
+       anon_vma_unlock_write(anon_vma);
+       put_anon_vma(anon_vma);
+out:
+       count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+       return ret;
+}
+
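+
As a hedged sketch of the calling convention documented above (hold the only pin, hold the page lock, tolerate -EBUSY), a caller would look roughly like the following; the helper name is illustrative, not part of this patch:

/* Sketch only: assumes the caller already holds its own reference on @page. */
static int try_to_split(struct page *page, struct list_head *list)
{
        int ret;

        lock_page(page);                        /* the huge page must be locked */
        ret = split_huge_page_to_list(page, list);
        unlock_page(page);                      /* PG_locked stays on @page     */

        return ret;                             /* 0, or -EBUSY if still pinned */
}

deferred_split_scan() further down follows the same pattern via split_huge_page().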
+void free_transhuge_page(struct page *page)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (!list_empty(page_deferred_list(page))) {
+               split_queue_len--;
+               list_del(page_deferred_list(page));
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+       unsigned long flags;
+
+       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (list_empty(page_deferred_list(page))) {
+               list_add_tail(page_deferred_list(page), &split_queue);
+               split_queue_len++;
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+               struct shrink_control *sc)
+{
+       /*
+        * Splitting a page from split_queue will free at least one page,
+        * at most HPAGE_PMD_NR - 1. We don't track the exact number,
+        * so use HPAGE_PMD_NR / 2 as a ballpark.
+        */
+       return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
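+
A quick worked example of that ballpark, assuming HPAGE_PMD_NR = 512 (2MB THP on 4KB pages); the queue length is made up for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long split_queue_len = 10;     /* illustrative */
        unsigned long hpage_pmd_nr = 512;

        /* Same formula as deferred_split_count() above. */
        printf("%lu objects reported to reclaim\n",
               split_queue_len * hpage_pmd_nr / 2);     /* prints 2560 */
        return 0;
}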
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+               struct shrink_control *sc)
+{
+       unsigned long flags;
+       LIST_HEAD(list), *pos, *next;
+       struct page *page;
+       int split = 0;
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_init(&split_queue, &list);
+
+       /* Take pin on all head pages to avoid freeing them under us */
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               page = compound_head(page);
+               /* race with put_compound_page() */
+               if (!get_page_unless_zero(page)) {
+                       list_del_init(page_deferred_list(page));
+                       split_queue_len--;
+               }
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               lock_page(page);
+               /* split_huge_page() removes page from list on success */
+               if (!split_huge_page(page))
+                       split++;
+               unlock_page(page);
+               put_page(page);
+       }
+
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_tail(&list, &split_queue);
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+
+       return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+       .count_objects = deferred_split_count,
+       .scan_objects = deferred_split_scan,
+       .seeks = DEFAULT_SEEKS,
+};
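+
The struct above only takes effect once it is registered with the shrinker infrastructure; that presumably happens in an init path elsewhere in this patch, roughly like the following sketch (the function name is illustrative):

static int __init deferred_split_shrinker_init(void)   /* illustrative name */
{
        /*
         * After this, memory reclaim calls deferred_split_count() and
         * deferred_split_scan() to shrink the deferred-split queue.
         */
        return register_shrinker(&deferred_split_shrinker);
}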
index 9cc773483624e4cbb1592ddde74f9c8faa21ef87..371aa737722add1bc38a8117772424dee1f1e001 100644 (file)
@@ -994,23 +994,22 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 
 #if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
 static void destroy_compound_gigantic_page(struct page *page,
-                                       unsigned long order)
+                                       unsigned int order)
 {
        int i;
        int nr_pages = 1 << order;
        struct page *p = page + 1;
 
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
-               __ClearPageTail(p);
+               clear_compound_head(p);
                set_page_refcounted(p);
-               p->first_page = NULL;
        }
 
        set_compound_order(page, 0);
        __ClearPageHead(page);
 }
 
-static void free_gigantic_page(struct page *page, unsigned order)
+static void free_gigantic_page(struct page *page, unsigned int order)
 {
        free_contig_range(page_to_pfn(page), 1 << order);
 }
@@ -1054,7 +1053,7 @@ static bool zone_spans_last_pfn(const struct zone *zone,
        return zone_spans_pfn(zone, last_pfn);
 }
 
-static struct page *alloc_gigantic_page(int nid, unsigned order)
+static struct page *alloc_gigantic_page(int nid, unsigned int order)
 {
        unsigned long nr_pages = 1 << order;
        unsigned long ret, pfn, flags;
@@ -1090,7 +1089,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order)
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+static void prep_compound_gigantic_page(struct page *page, unsigned int order);
 
 static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
 {
@@ -1123,9 +1122,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
 static inline bool gigantic_page_supported(void) { return true; }
 #else
 static inline bool gigantic_page_supported(void) { return false; }
-static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
-                                               unsigned long order) { }
+                                               unsigned int order) { }
 static inline int alloc_fresh_gigantic_page(struct hstate *h,
                                        nodemask_t *nodes_allowed) { return 0; }
 #endif
@@ -1146,7 +1145,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                1 << PG_writeback);
        }
        VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-       set_compound_page_dtor(page, NULL);
+       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
        set_page_refcounted(page);
        if (hstate_is_gigantic(h)) {
                destroy_compound_gigantic_page(page, huge_page_order(h));
@@ -1242,7 +1241,7 @@ void free_huge_page(struct page *page)
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
        INIT_LIST_HEAD(&page->lru);
-       set_compound_page_dtor(page, free_huge_page);
+       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        spin_lock(&hugetlb_lock);
        set_hugetlb_cgroup(page, NULL);
        h->nr_huge_pages++;
@@ -1251,7 +1250,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
        put_page(page); /* free it into the hugepage allocator */
 }
 
-static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned int order)
 {
        int i;
        int nr_pages = 1 << order;
@@ -1259,8 +1258,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 
        /* we rely on prep_new_huge_page to set the destructor */
        set_compound_order(page, order);
-       __SetPageHead(page);
        __ClearPageReserved(page);
+       __SetPageHead(page);
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                /*
                 * For gigantic hugepages allocated through bootmem at
@@ -1276,10 +1275,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
                 */
                __ClearPageReserved(p);
                set_page_count(p, 0);
-               p->first_page = page;
-               /* Make sure p->first_page is always valid for PageTail() */
-               smp_wmb();
-               __SetPageTail(p);
+               set_compound_head(p, page);
        }
 }
 
@@ -1294,7 +1290,7 @@ int PageHuge(struct page *page)
                return 0;
 
        page = compound_head(page);
-       return get_compound_page_dtor(page) == free_huge_page;
+       return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
@@ -1437,7 +1433,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
                dissolve_free_huge_page(pfn_to_page(pfn));
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+/*
+ * There are 3 ways this can get called:
+ * 1. With vma+addr: we use the VMA's memory policy
+ * 2. With !vma, but nid=NUMA_NO_NODE:  We try to allocate a huge
+ *    page from any node, and let the buddy allocator itself figure
+ *    it out.
+ * 3. With !vma, but nid!=NUMA_NO_NODE.  We allocate a huge page
+ *    strictly from 'nid'
+ */
+static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+       int order = huge_page_order(h);
+       gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
+       unsigned int cpuset_mems_cookie;
+
+       /*
+        * We need a VMA to get a memory policy.  If we do not
+        * have one, we use the 'nid' argument.
+        *
+        * The mempolicy stuff below has some non-inlined bits
+        * and calls ->vm_ops.  That makes it hard to optimize at
+        * compile-time, even when NUMA is off and the code effectively
+        * does nothing.  This check helps the compiler optimize it all out.
+        */
+       if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
+               /*
+                * If a specific node was explicitly requested,
+                * make sure to get memory from that node.
+                */
+               if (nid != NUMA_NO_NODE)
+                       gfp |= __GFP_THISNODE;
+               /*
+                * Make sure to call something that can handle
+                * nid=NUMA_NO_NODE
+                */
+               return alloc_pages_node(nid, gfp, order);
+       }
+
+       /*
+        * OK, so we have a VMA.  Fetch the mempolicy and try to
+        * allocate a huge page with it.  We will only reach this
+        * when CONFIG_NUMA=y.
+        */
+       do {
+               struct page *page;
+               struct mempolicy *mpol;
+               struct zonelist *zl;
+               nodemask_t *nodemask;
+
+               cpuset_mems_cookie = read_mems_allowed_begin();
+               zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
+               mpol_cond_put(mpol);
+               page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
+               if (page)
+                       return page;
+       } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+       return NULL;
+}
+
+/*
+ * There are two ways to allocate a huge page:
+ * 1. When you have a VMA and an address (like a fault)
+ * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
+ *
+ * 'vma' and 'addr' are only for (1).  'nid' is always NUMA_NO_NODE in
+ * that case, which signifies that the allocation should be done with
+ * respect to the VMA's memory policy.
+ *
+ * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
+ * implies that memory policies will not be taken into account.
+ */
+static struct page *__alloc_buddy_huge_page(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr, int nid)
 {
        struct page *page;
        unsigned int r_nid;
@@ -1445,6 +1516,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
        if (hstate_is_gigantic(h))
                return NULL;
 
+       if (vma || addr) {
+               VM_WARN_ON_ONCE(!addr || addr == -1);
+               VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
+       }
        /*
         * Assume we will successfully allocate the surplus page to
         * prevent racing processes from causing the surplus to exceed
@@ -1478,20 +1553,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
        }
        spin_unlock(&hugetlb_lock);
 
-       if (nid == NUMA_NO_NODE)
-               page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
-                                  __GFP_REPEAT|__GFP_NOWARN,
-                                  huge_page_order(h));
-       else
-               page = __alloc_pages_node(nid,
-                       htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-                       __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
+       page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
 
        spin_lock(&hugetlb_lock);
        if (page) {
                INIT_LIST_HEAD(&page->lru);
                r_nid = page_to_nid(page);
-               set_compound_page_dtor(page, free_huge_page);
+               set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
                set_hugetlb_cgroup(page, NULL);
                /*
                 * We incremented the global counters already
@@ -1509,6 +1577,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
        return page;
 }
 
+/*
+ * Allocate a huge page from 'nid'.  Note, 'nid' may be
+ * NUMA_NO_NODE, which means the page may be allocated
+ * from any node.
+ */
+static
+struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
+{
+       unsigned long addr = -1;
+
+       return __alloc_buddy_huge_page(h, NULL, addr, nid);
+}
+
+/*
+ * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */
+static
+struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr)
+{
+       return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+}
+
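+
A hedged sketch of how the two wrappers above are meant to be used, mirroring the call sites further down in this hunk (the function name and surrounding locking are illustrative only):

/* Sketch of the two entry points, matching the callers changed below. */
static struct page *alloc_surplus_example(struct hstate *h,
                struct vm_area_struct *vma, unsigned long address, int nid)
{
        if (!vma)
                /* e.g. sysfs nr_hugepages or soft-offline: only 'nid' matters */
                return __alloc_buddy_huge_page_no_mpol(h, nid);

        /* fault path: obey the VMA's memory policy */
        return __alloc_buddy_huge_page_with_mpol(h, vma, address);
}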
 /*
  * This allocation function is useful in the context where vma is irrelevant.
  * E.g. soft-offlining uses this function because it only cares physical
@@ -1524,7 +1615,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
        spin_unlock(&hugetlb_lock);
 
        if (!page)
-               page = alloc_buddy_huge_page(h, nid);
+               page = __alloc_buddy_huge_page_no_mpol(h, nid);
 
        return page;
 }
@@ -1554,7 +1645,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
-               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
                if (!page) {
                        alloc_ok = false;
                        break;
@@ -1787,7 +1878,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
                spin_unlock(&hugetlb_lock);
-               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
 
@@ -1872,7 +1963,8 @@ found:
        return 1;
 }
 
-static void __init prep_compound_huge_page(struct page *page, int order)
+static void __init prep_compound_huge_page(struct page *page,
+               unsigned int order)
 {
        if (unlikely(order > (MAX_ORDER - 1)))
                prep_compound_gigantic_page(page, order);
@@ -2376,7 +2468,7 @@ struct node_hstate {
        struct kobject          *hugepages_kobj;
        struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
 };
-struct node_hstate node_hstates[MAX_NUMNODES];
+static struct node_hstate node_hstates[MAX_NUMNODES];
 
 /*
  * A subset of global hstate attributes for node devices
@@ -2583,7 +2675,7 @@ static int __init hugetlb_init(void)
 module_init(hugetlb_init);
 
 /* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_add_hstate(unsigned order)
+void __init hugetlb_add_hstate(unsigned int order)
 {
        struct hstate *h;
        unsigned long i;
@@ -2790,6 +2882,12 @@ void hugetlb_show_meminfo(void)
                                1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
 }
 
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+       seq_printf(m, "HugetlbPages:\t%8lu kB\n",
+                  atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+}
+
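+
A worked example of the pages-to-kB conversion above, assuming 4KB base pages (PAGE_SHIFT = 12) and an illustrative counter value:

#include <stdio.h>

int main(void)
{
        unsigned long hugetlb_usage = 512;      /* base pages, illustrative */
        int page_shift = 12;                    /* 4KB pages                */

        /* Same conversion as hugetlb_report_usage(). */
        printf("HugetlbPages:\t%8lu kB\n",
               hugetlb_usage << (page_shift - 10));     /* 2048 kB */
        return 0;
}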
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
@@ -3023,8 +3121,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        get_page(ptepage);
-                       page_dup_rmap(ptepage);
+                       page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
+                       hugetlb_count_add(pages_per_huge_page(h), dst);
                }
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
@@ -3105,7 +3204,8 @@ again:
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
 
-               page_remove_rmap(page);
+               hugetlb_count_sub(pages_per_huge_page(h), mm);
+               page_remove_rmap(page, true);
                force_flush = !__tlb_remove_page(tlb, page);
                if (force_flush) {
                        address += sz;
@@ -3334,7 +3434,7 @@ retry_avoidcopy:
                mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
-               page_remove_rmap(old_page);
+               page_remove_rmap(old_page, true);
                hugepage_add_new_anon_rmap(new_page, vma, address);
                /* Make the old page be freed below */
                new_page = old_page;
@@ -3504,11 +3604,12 @@ retry:
                ClearPagePrivate(page);
                hugepage_add_new_anon_rmap(page, vma, address);
        } else
-               page_dup_rmap(page);
+               page_dup_rmap(page, true);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);
 
+       hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
@@ -3574,6 +3675,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *pagecache_page = NULL;
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping;
+       struct inode *inode = file_inode(vma->vm_file);
        int need_wait_lock = 0;
 
        address &= huge_page_mask(h);
@@ -3596,6 +3698,44 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        mapping = vma->vm_file->f_mapping;
        idx = vma_hugecache_offset(h, vma, address);
 
+       /*
+        * Page faults could race with fallocate hole punch.  If a page
+        * is faulted between unmap and deallocation, it will still remain
+        * in the punched hole.  During hole punch operations, a hugetlb_falloc
+        * structure will be pointed to by i_private.  If this fault is for
+        * a page in a hole being punched, wait for the operation to finish
+        * before proceeding.
+        *
+        * Even with this strategy, it is still possible for a page fault to
+        * race with hole punch.  In this case, remove_inode_hugepages() will
+        * unmap the page and then remove it.  Checking i_private as below should
+        * catch most of these races as we want to minimize unmapping a page
+        * multiple times.
+        */
+       if (unlikely(inode->i_private)) {
+               struct hugetlb_falloc *hugetlb_falloc;
+
+               spin_lock(&inode->i_lock);
+               hugetlb_falloc = inode->i_private;
+               if (hugetlb_falloc && hugetlb_falloc->waitq &&
+                   idx >= hugetlb_falloc->start &&
+                   idx <= hugetlb_falloc->end) {
+                       wait_queue_head_t *hugetlb_falloc_waitq;
+                       DEFINE_WAIT(hugetlb_fault_wait);
+
+                       hugetlb_falloc_waitq = hugetlb_falloc->waitq;
+                       prepare_to_wait(hugetlb_falloc_waitq,
+                                       &hugetlb_fault_wait,
+                                       TASK_UNINTERRUPTIBLE);
+                       spin_unlock(&inode->i_lock);
+                       schedule();
+
+                       spin_lock(&inode->i_lock);
+                       finish_wait(hugetlb_falloc_waitq, &hugetlb_fault_wait);
+               }
+               spin_unlock(&inode->i_lock);
+       }
+
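+
The hugetlb_falloc structure is introduced elsewhere in this series; judging only from the fields dereferenced above, it presumably looks roughly like the following sketch, with field types inferred rather than quoted:

/* Assumed shape, inferred from the accesses in hugetlb_fault() above. */
struct hugetlb_falloc {
        wait_queue_head_t *waitq;       /* faulters wait here during the punch */
        pgoff_t start;                  /* first page index being punched      */
        pgoff_t end;                    /* last page index being punched       */
};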
        /*
         * Serialize hugepage allocation and instantiation, so that we don't
         * get spurious allocation failures if two CPUs race to instantiate
@@ -3783,7 +3923,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 same_page:
                if (pages) {
                        pages[i] = mem_map_offset(page, pfn_offset);
-                       get_page_foll(pages[i]);
+                       get_page(pages[i]);
                }
 
                if (vmas)
@@ -4028,8 +4168,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
        unsigned long s_end = sbase + PUD_SIZE;
 
        /* Allow segments to share if only one is marked locked */
-       unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
-       unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+       unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+       unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
 
        /*
         * match the virtual addresses, permission and the alignment of the
index 6e0057439a469b2cf38e9e9aee1f40a17dd08d59..6a4426372698ecb78d10d8bd32a03841861502ac 100644 (file)
@@ -384,7 +384,7 @@ void __init hugetlb_cgroup_file_init(void)
                /*
                 * Add cgroup control files only if the huge page consists
                 * of more than two normal pages. This is because we use
-                * page[2].lru.next for storing cgroup details.
+                * page[2].private for storing cgroup details.
                 */
                if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
                        __hugetlb_cgroup_file_init(hstate_index(h));
index bc0fa9a69e463771ca1de684c686e96644bc30dd..dbe0436c81b6a670ba013b76d3bf323b2fadc13f 100644 (file)
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
+
+extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pte_t *page_table, pmd_t *pmd,
+                       unsigned int flags, pte_t orig_pte);
+
+/*
+ * The set of flags that only affect watermark checking and reclaim
+ * behaviour. This is used by the MM to obey the caller constraints
+ * about IO, FS and watermark checking while ignoring placement
+ * hints such as HIGHMEM usage.
+ */
+#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
+                       __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
+                       __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
+
+/* The GFP flags allowed during early boot */
+#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
+
+/* Control allocation cpuset and node placement constraints */
+#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
+
+/* Do not use these with a slab allocator */
+#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
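+
A hedged sketch of how masks like these are typically applied to a caller-supplied gfp_t; the helper name is illustrative and not part of this patch:

/* Sketch: sanitize a caller-supplied mask the way internal users do. */
static inline gfp_t example_sanitize_gfp(gfp_t gfp_mask)
{
        /* A slab allocation carrying any of these bits would be a caller bug. */
        BUG_ON(gfp_mask & GFP_SLAB_BUG_MASK);

        /* Keep only the bits that control reclaim/watermark behaviour. */
        return gfp_mask & GFP_RECLAIM_MASK;
}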
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
@@ -47,50 +71,6 @@ static inline void set_page_refcounted(struct page *page)
        set_page_count(page, 1);
 }
 
-static inline void __get_page_tail_foll(struct page *page,
-                                       bool get_page_head)
-{
-       /*
-        * If we're getting a tail page, the elevated page->_count is
-        * required only in the head page and we will elevate the head
-        * page->_count and tail page->_mapcount.
-        *
-        * We elevate page_tail->_mapcount for tail pages to force
-        * page_tail->_count to be zero at all times to avoid getting
-        * false positives from get_page_unless_zero() with
-        * speculative page access (like in
-        * page_cache_get_speculative()) on tail pages.
-        */
-       VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
-       if (get_page_head)
-               atomic_inc(&page->first_page->_count);
-       get_huge_page_tail(page);
-}
-
-/*
- * This is meant to be called as the FOLL_GET operation of
- * follow_page() and it must be called while holding the proper PT
- * lock while the pte (or pmd_trans_huge) is still mapping the page.
- */
-static inline void get_page_foll(struct page *page)
-{
-       if (unlikely(PageTail(page)))
-               /*
-                * This is safe only because
-                * __split_huge_page_refcount() can't run under
-                * get_page_foll() because we hold the proper PT lock.
-                */
-               __get_page_tail_foll(page, true);
-       else {
-               /*
-                * Getting a normal page or the head of a compound page
-                * requires to already have an elevated page->_count.
-                */
-               VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
-               atomic_inc(&page->_count);
-       }
-}
-
 extern unsigned long highest_memmap_pfn;
 
 /*
@@ -129,6 +109,7 @@ struct alloc_context {
        int classzone_idx;
        int migratetype;
        enum zone_type high_zoneidx;
+       bool spread_dirty_pages;
 };
 
 /*
@@ -157,7 +138,7 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
 extern int __isolate_free_page(struct page *page, unsigned int order);
 extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
                                        unsigned int order);
-extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_page(struct page *page, unsigned int order);
 #ifdef CONFIG_MEMORY_FAILURE
 extern bool is_free_buddy_page(struct page *page);
 #endif
@@ -215,7 +196,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
  * page cannot be allocated or merged in parallel. Alternatively, it must
  * handle invalid values gracefully, and use page_order_unsafe() below.
  */
-static inline unsigned long page_order(struct page *page)
+static inline unsigned int page_order(struct page *page)
 {
        /* PageBuddy() must be checked by the caller */
        return page_private(page);
@@ -271,29 +252,45 @@ extern unsigned int munlock_vma_page(struct page *page);
 extern void clear_page_mlock(struct page *page);
 
 /*
- * mlock_migrate_page - called only from migrate_page_copy() to
- * migrate the Mlocked page flag; update statistics.
+ * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
+ * (because that path does not go through the full migration-pte procedure):
+ * migrate the Mlocked page flag and update statistics.
  */
 static inline void mlock_migrate_page(struct page *newpage, struct page *page)
 {
        if (TestClearPageMlocked(page)) {
-               unsigned long flags;
                int nr_pages = hpage_nr_pages(page);
 
-               local_irq_save(flags);
+               /* Holding pmd lock, no change in irq context: __mod is safe */
                __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
                SetPageMlocked(newpage);
                __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
-               local_irq_restore(flags);
        }
 }
 
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long vma_address(struct page *page,
-                                struct vm_area_struct *vma);
-#endif
+/*
+ * At what user virtual address is page expected in @vma?
+ */
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+       pgoff_t pgoff = page_to_pgoff(page);
+       return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+       unsigned long address = __vma_address(page, vma);
+
+       /* page should be within @vma mapping range */
+       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+
+       return address;
+}
+
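+
A worked example of the __vma_address() arithmetic above, with made-up numbers and 4KB pages (PAGE_SHIFT = 12):

#include <stdio.h>

int main(void)
{
        unsigned long vm_start   = 0x700000000000UL; /* vma->vm_start       */
        unsigned long vm_pgoff   = 0x100;            /* vma->vm_pgoff       */
        unsigned long page_pgoff = 0x105;            /* page_to_pgoff(page) */
        int page_shift = 12;

        /* Same formula as __vma_address(). */
        unsigned long addr = vm_start + ((page_pgoff - vm_pgoff) << page_shift);

        printf("expected address: %#lx\n", addr);    /* 0x700000005000 */
        return 0;
}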
 #else /* !CONFIG_MMU */
 static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
index 8da211411b57f1b0db116d8d67401bae30628ec8..d41b21bce6a030a0ea0356d34e9a495f6dfbb5b0 100644 (file)
@@ -4,7 +4,7 @@
  * Copyright (c) 2014 Samsung Electronics Co., Ltd.
  * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
  *
- * Some of code borrowed from https://github.com/xairy/linux by
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
  *        Andrey Konovalov <adech.fo@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -86,6 +86,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr)
                if (memory_is_poisoned_1(addr + 1))
                        return true;
 
+               /*
+                * If a single shadow byte covers the 2-byte access, we don't
+                * need to do anything more. Otherwise, test the first
+                * shadow byte.
+                */
                if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
                        return false;
 
@@ -103,6 +108,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr)
                if (memory_is_poisoned_1(addr + 3))
                        return true;
 
+               /*
+                * If a single shadow byte covers the 4-byte access, we don't
+                * need to do anything more. Otherwise, test the first
+                * shadow byte.
+                */
                if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
                        return false;
 
@@ -120,7 +130,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr)
                if (memory_is_poisoned_1(addr + 7))
                        return true;
 
-               if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
+               /*
+                * If a single shadow byte covers the 8-byte access, we don't
+                * need to do anything more. Otherwise, test the first
+                * shadow byte.
+                */
+               if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
                        return false;
 
                return unlikely(*(u8 *)shadow_addr);
@@ -139,7 +154,12 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr)
                if (unlikely(shadow_first_bytes))
                        return true;
 
-               if (likely(IS_ALIGNED(addr, 8)))
+               /*
+                * If two shadow bytes cover the 16-byte access, we don't
+                * need to do anything more. Otherwise, test the last
+                * shadow byte.
+                */
+               if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
                        return false;
 
                return memory_is_poisoned_1(addr + 15);
@@ -203,7 +223,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
                s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
 
                if (unlikely(ret != (unsigned long)last_shadow ||
-                       ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+                       ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
                        return true;
        }
        return false;
@@ -235,18 +255,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
 static __always_inline void check_memory_region(unsigned long addr,
                                                size_t size, bool write)
 {
-       struct kasan_access_info info;
-
        if (unlikely(size == 0))
                return;
 
        if (unlikely((void *)addr <
                kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
-               info.access_addr = (void *)addr;
-               info.access_size = size;
-               info.is_write = write;
-               info.ip = _RET_IP_;
-               kasan_report_user_access(&info);
+               kasan_report(addr, size, write, _RET_IP_);
                return;
        }
 
@@ -524,7 +538,7 @@ static int kasan_mem_notifier(struct notifier_block *nb,
 
 static int __init kasan_memhotplug_init(void)
 {
-       pr_err("WARNING: KASan doesn't support memory hot-add\n");
+       pr_err("WARNING: KASAN doesn't support memory hot-add\n");
        pr_err("Memory hot-add will be disabled\n");
 
        hotplug_memory_notifier(kasan_mem_notifier, 0);
index c242adf6bc8578846e1cc54ad0a931b052668381..4f6c62e5c21edd2d800484b2f18025e45ce034fb 100644 (file)
@@ -54,16 +54,13 @@ struct kasan_global {
 #endif
 };
 
-void kasan_report_error(struct kasan_access_info *info);
-void kasan_report_user_access(struct kasan_access_info *info);
-
 static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 {
        return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
                << KASAN_SHADOW_SCALE_SHIFT);
 }
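 
For orientation, a small userspace model of the shadow mapping that kasan_shadow_to_mem() above inverts; the scale shift and the offset are the conventional generic and x86-64 values and are assumptions here, not quoted from this patch:

#include <stdio.h>

int main(void)
{
        unsigned long scale_shift   = 3;                     /* 1 shadow byte per 8 bytes */
        unsigned long shadow_offset = 0xdffffc0000000000UL;  /* x86-64 KASAN offset       */
        unsigned long addr          = 0xffff880012345678UL;  /* some kernel address       */

        /* Forward map (what kasan_mem_to_shadow() computes)... */
        unsigned long shadow = (addr >> scale_shift) + shadow_offset;
        /* ...and the inverse shown above in kasan_shadow_to_mem(). */
        unsigned long back = (shadow - shadow_offset) << scale_shift;

        printf("shadow = %#lx, back = %#lx\n", shadow, back);
        /* 'back' is addr rounded down to an 8-byte boundary. */
        return 0;
}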
 
-static inline bool kasan_enabled(void)
+static inline bool kasan_report_enabled(void)
 {
        return !current->kasan_depth;
 }
index e07c94fbd0ac5a141ecf95ab7d39d046fea13e67..12f222d0224b93120ed7131a80172666e78276d4 100644 (file)
@@ -4,7 +4,7 @@
  * Copyright (c) 2014 Samsung Electronics Co., Ltd.
  * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
  *
- * Some of code borrowed from https://github.com/xairy/linux by
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
  *        Andrey Konovalov <adech.fo@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -22,6 +22,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/kasan.h>
+#include <linux/module.h>
 
 #include <asm/sections.h>
 
@@ -48,34 +49,49 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
 
 static void print_error_description(struct kasan_access_info *info)
 {
-       const char *bug_type = "unknown crash";
-       u8 shadow_val;
+       const char *bug_type = "unknown-crash";
+       u8 *shadow_addr;
 
        info->first_bad_addr = find_first_bad_addr(info->access_addr,
                                                info->access_size);
 
-       shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+       shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
 
-       switch (shadow_val) {
-       case KASAN_FREE_PAGE:
-       case KASAN_KMALLOC_FREE:
-               bug_type = "use after free";
+       /*
+        * If the shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE), we can
+        * look at the next shadow byte to determine the type of the bad access.
+        */
+       if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
+               shadow_addr++;
+
+       switch (*shadow_addr) {
+       case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+               /*
+                * In theory it's still possible to see these shadow values
+                * due to a data race in the kernel code.
+                */
+               bug_type = "out-of-bounds";
                break;
        case KASAN_PAGE_REDZONE:
        case KASAN_KMALLOC_REDZONE:
+               bug_type = "slab-out-of-bounds";
+               break;
        case KASAN_GLOBAL_REDZONE:
-       case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
-               bug_type = "out of bounds access";
+               bug_type = "global-out-of-bounds";
                break;
        case KASAN_STACK_LEFT:
        case KASAN_STACK_MID:
        case KASAN_STACK_RIGHT:
        case KASAN_STACK_PARTIAL:
-               bug_type = "out of bounds on stack";
+               bug_type = "stack-out-of-bounds";
+               break;
+       case KASAN_FREE_PAGE:
+       case KASAN_KMALLOC_FREE:
+               bug_type = "use-after-free";
                break;
        }
 
-       pr_err("BUG: KASan: %s in %pS at addr %p\n",
+       pr_err("BUG: KASAN: %s in %pS at addr %p\n",
                bug_type, (void *)info->ip,
                info->access_addr);
        pr_err("%s of size %zu by task %s/%d\n",
@@ -85,9 +101,11 @@ static void print_error_description(struct kasan_access_info *info)
 
 static inline bool kernel_or_module_addr(const void *addr)
 {
-       return (addr >= (void *)_stext && addr < (void *)_end)
-               || (addr >= (void *)MODULES_VADDR
-                       && addr < (void *)MODULES_END);
+       if (addr >= (void *)_stext && addr < (void *)_end)
+               return true;
+       if (is_module_address((unsigned long)addr))
+               return true;
+       return false;
 }
 
 static inline bool init_task_stack_addr(const void *addr)
@@ -161,15 +179,19 @@ static void print_shadow_for_address(const void *addr)
        for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
                const void *kaddr = kasan_shadow_to_mem(shadow_row);
                char buffer[4 + (BITS_PER_LONG/8)*2];
+               char shadow_buf[SHADOW_BYTES_PER_ROW];
 
                snprintf(buffer, sizeof(buffer),
                        (i == 0) ? ">%p: " : " %p: ", kaddr);
-
-               kasan_disable_current();
+               /*
+                * We should not pass a shadow pointer to a generic
+                * function, because generic functions may try to
+                * access the kasan mapping for the passed address.
+                */
+               memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
                print_hex_dump(KERN_ERR, buffer,
                        DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
-                       shadow_row, SHADOW_BYTES_PER_ROW, 0);
-               kasan_enable_current();
+                       shadow_buf, SHADOW_BYTES_PER_ROW, 0);
 
                if (row_is_guilty(shadow_row, shadow))
                        pr_err("%*c\n",
@@ -182,37 +204,43 @@ static void print_shadow_for_address(const void *addr)
 
 static DEFINE_SPINLOCK(report_lock);
 
-void kasan_report_error(struct kasan_access_info *info)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&report_lock, flags);
-       pr_err("================================="
-               "=================================\n");
-       print_error_description(info);
-       print_address_description(info);
-       print_shadow_for_address(info->first_bad_addr);
-       pr_err("================================="
-               "=================================\n");
-       spin_unlock_irqrestore(&report_lock, flags);
-}
-
-void kasan_report_user_access(struct kasan_access_info *info)
+static void kasan_report_error(struct kasan_access_info *info)
 {
        unsigned long flags;
+       const char *bug_type;
 
+       /*
+        * Make sure we don't end up in a loop.
+        */
+       kasan_disable_current();
        spin_lock_irqsave(&report_lock, flags);
        pr_err("================================="
                "=================================\n");
-       pr_err("BUG: KASan: user-memory-access on address %p\n",
-               info->access_addr);
-       pr_err("%s of size %zu by task %s/%d\n",
-               info->is_write ? "Write" : "Read",
-               info->access_size, current->comm, task_pid_nr(current));
-       dump_stack();
+       if (info->access_addr <
+                       kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
+               if ((unsigned long)info->access_addr < PAGE_SIZE)
+                       bug_type = "null-ptr-deref";
+               else if ((unsigned long)info->access_addr < TASK_SIZE)
+                       bug_type = "user-memory-access";
+               else
+                       bug_type = "wild-memory-access";
+               pr_err("BUG: KASAN: %s on address %p\n",
+                       bug_type, info->access_addr);
+               pr_err("%s of size %zu by task %s/%d\n",
+                       info->is_write ? "Write" : "Read",
+                       info->access_size, current->comm,
+                       task_pid_nr(current));
+               dump_stack();
+       } else {
+               print_error_description(info);
+               print_address_description(info);
+               print_shadow_for_address(info->first_bad_addr);
+       }
        pr_err("================================="
                "=================================\n");
+       add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
        spin_unlock_irqrestore(&report_lock, flags);
+       kasan_enable_current();
 }
 
 void kasan_report(unsigned long addr, size_t size,
@@ -220,13 +248,14 @@ void kasan_report(unsigned long addr, size_t size,
 {
        struct kasan_access_info info;
 
-       if (likely(!kasan_enabled()))
+       if (likely(!kasan_report_enabled()))
                return;
 
        info.access_addr = (void *)addr;
        info.access_size = size;
        info.is_write = is_write;
        info.ip = ip;
+
        kasan_report_error(&info);
 }
 
index 77191eccdc6f6c372e84e2f49750f7f95e56c324..19423a45d7d7d96de3b403c60806d14b37788331 100644 (file)
@@ -479,7 +479,7 @@ static void put_object(struct kmemleak_object *object)
 static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
 {
        unsigned long flags;
-       struct kmemleak_object *object = NULL;
+       struct kmemleak_object *object;
 
        rcu_read_lock();
        read_lock_irqsave(&kmemleak_lock, flags);
index 7ee101eaacdfe9eb82061585bb820df243a3650b..659e2b5119c043671cf7a0e19afcf1c68a78a3c0 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -349,6 +349,24 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
        return atomic_read(&mm->mm_users) == 0;
 }
 
+/*
+ * If the mm isn't the one associated with the current ksm_scan.mm_slot,
+ * ksm_exit() will not do its down_write();up_write() dance, so a
+ * ksm_test_exit() check run inside an mm->mmap_sem critical section
+ * will not prevent exit_mmap() from running under us. So, whenever we
+ * may be working with an "mm" that isn't guaranteed to be associated
+ * with the current ksm_scan.mm_slot, ksm_get_mm() is needed instead of
+ * the ksm_test_exit() run inside the mmap_sem. Returns true if mm_users
+ * was incremented, or false if we failed to take the mm because it was
+ * freed from under us. If it returns true, the caller must call mmput()
+ * after it finishes using the mm.
+ */
+static __always_inline bool ksm_get_mm(struct mm_struct *mm)
+{
+       return likely(atomic_inc_not_zero(&mm->mm_users));
+}
+
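+
A hedged sketch of the usage pattern this comment prescribes, mirrored by break_cow() and get_mergeable_page() below; the function name is illustrative:

static void example_walk_mm(struct mm_struct *mm)       /* illustrative */
{
        /* Pin the mm before taking its mmap_sem; bail out if it is exiting. */
        if (!ksm_get_mm(mm))
                return;

        down_read(&mm->mmap_sem);
        /* ... look up VMAs / pages of 'mm' here ... */
        up_read(&mm->mmap_sem);

        mmput(mm);      /* drop the reference taken by ksm_get_mm() */
}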
 /*
  * We use break_ksm to break COW on a ksm page: it's a stripped down
  *
@@ -412,8 +430,6 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
                unsigned long addr)
 {
        struct vm_area_struct *vma;
-       if (ksm_test_exit(mm))
-               return NULL;
        vma = find_vma(mm, addr);
        if (!vma || vma->vm_start > addr)
                return NULL;
@@ -434,25 +450,21 @@ static void break_cow(struct rmap_item *rmap_item)
         */
        put_anon_vma(rmap_item->anon_vma);
 
+       /*
+        * The "mm" of the unstable tree rmap_item isn't necessarily
+        * associated with the current ksm_scan.mm_slot; it could be
+        * any random mm. So we need ksm_get_mm() here to prevent
+        * exit_mmap() (run from the final mmput()) from tearing the
+        * mm down under us.
+        */
+       if (!ksm_get_mm(mm))
+               return;
+
        down_read(&mm->mmap_sem);
        vma = find_mergeable_vma(mm, addr);
        if (vma)
                break_ksm(vma, addr);
        up_read(&mm->mmap_sem);
-}
-
-static struct page *page_trans_compound_anon(struct page *page)
-{
-       if (PageTransCompound(page)) {
-               struct page *head = compound_head(page);
-               /*
-                * head may actually be splitted and freed from under
-                * us but it's ok here.
-                */
-               if (PageAnon(head))
-                       return head;
-       }
-       return NULL;
+       mmput(mm);
 }
 
 static struct page *get_mergeable_page(struct rmap_item *rmap_item)
@@ -462,6 +474,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
        struct vm_area_struct *vma;
        struct page *page;
 
+       /*
+        * The "mm" of the unstable tree rmap_item isn't necessarily
+        * associated with the current ksm_scan.mm_slot; it could be
+        * any random mm. So we need ksm_get_mm() here to prevent
+        * exit_mmap() (run from the final mmput()) from tearing the
+        * mm down under us.
+        */
+       if (!ksm_get_mm(mm))
+               return NULL;
+
        down_read(&mm->mmap_sem);
        vma = find_mergeable_vma(mm, addr);
        if (!vma)
@@ -470,7 +491,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
        page = follow_page(vma, addr, FOLL_GET);
        if (IS_ERR_OR_NULL(page))
                goto out;
-       if (PageAnon(page) || page_trans_compound_anon(page)) {
+       if (PageAnon(page)) {
                flush_anon_page(vma, page, addr);
                flush_dcache_page(page);
        } else {
@@ -478,6 +499,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 out:           page = NULL;
        }
        up_read(&mm->mmap_sem);
+       mmput(mm);
        return page;
 }
 
@@ -625,7 +647,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                unlock_page(page);
                put_page(page);
 
-               if (stable_node->hlist.first)
+               if (!hlist_empty(&stable_node->hlist))
                        ksm_pages_sharing--;
                else
                        ksm_pages_shared--;
@@ -957,13 +979,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
        }
 
        get_page(kpage);
-       page_add_anon_rmap(kpage, vma, addr);
+       page_add_anon_rmap(kpage, vma, addr, false);
 
        flush_cache_page(vma, addr, pte_pfn(*ptep));
        ptep_clear_flush_notify(vma, addr, ptep);
        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
 
-       page_remove_rmap(page);
+       page_remove_rmap(page, false);
        if (!page_mapped(page))
                try_to_free_swap(page);
        put_page(page);
@@ -976,33 +998,6 @@ out:
        return err;
 }
 
-static int page_trans_compound_anon_split(struct page *page)
-{
-       int ret = 0;
-       struct page *transhuge_head = page_trans_compound_anon(page);
-       if (transhuge_head) {
-               /* Get the reference on the head to split it. */
-               if (get_page_unless_zero(transhuge_head)) {
-                       /*
-                        * Recheck we got the reference while the head
-                        * was still anonymous.
-                        */
-                       if (PageAnon(transhuge_head))
-                               ret = split_huge_page(transhuge_head);
-                       else
-                               /*
-                                * Retry later if split_huge_page run
-                                * from under us.
-                                */
-                               ret = 1;
-                       put_page(transhuge_head);
-               } else
-                       /* Retry later if split_huge_page run from under us. */
-                       ret = 1;
-       }
-       return ret;
-}
-
 /*
  * try_to_merge_one_page - take two pages and merge them into one
  * @vma: the vma that holds the pte pointing to page
@@ -1021,11 +1016,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
        if (page == kpage)                      /* ksm page forked */
                return 0;
 
-       if (!(vma->vm_flags & VM_MERGEABLE))
-               goto out;
-       if (PageTransCompound(page) && page_trans_compound_anon_split(page))
-               goto out;
-       BUG_ON(PageTransCompound(page));
        if (!PageAnon(page))
                goto out;
 
@@ -1038,6 +1028,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
         */
        if (!trylock_page(page))
                goto out;
+
+       if (PageTransCompound(page)) {
+               err = split_huge_page(page);
+               if (err)
+                       goto out_unlock;
+       }
+
        /*
         * If this anonymous page is mapped only here, its pte may need
         * to be write-protected.  If it's mapped elsewhere, all of its
@@ -1053,6 +1050,18 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
                         */
                        set_page_stable_node(page, NULL);
                        mark_page_accessed(page);
+                       /*
+                        * A stable page can be shared by several
+                        * processes. After CoW or zapping has happened
+                        * in every process but the last one, that last
+                        * process owns the page, yet its page table
+                        * entry may not have the dirty bit set. In that
+                        * case MADV_FREE could wrongly discard the page.
+                        * To prevent this, we mark the stable page
+                        * dirty.
+                        */
+                       if (!PageDirty(page))
+                               SetPageDirty(page);
                        err = 0;
                } else if (pages_identical(page, kpage))
                        err = replace_page(vma, page, kpage, orig_pte);
@@ -1068,6 +1077,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
                }
        }
 
+out_unlock:
        unlock_page(page);
 out:
        return err;
@@ -1086,11 +1096,21 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
        struct vm_area_struct *vma;
        int err = -EFAULT;
 
+       /*
+        * The "mm" of the unstable tree rmap_item isn't necessarily
+        * associated with the current ksm_scan.mm_slot; it could be
+        * any random mm. So we need ksm_get_mm() here to prevent
+        * exit_mmap(), run from mmput(), from executing under us.
+        * Otherwise rmap_item->anon_vma could point to an anon_vma
+        * that has already been freed (i.e. get_anon_vma() below
+        * would run too late).
+        */
+       if (!ksm_get_mm(mm))
+               return err;
+
        down_read(&mm->mmap_sem);
-       if (ksm_test_exit(mm))
-               goto out;
-       vma = find_vma(mm, rmap_item->address);
-       if (!vma || vma->vm_start > rmap_item->address)
+       vma = find_mergeable_vma(mm, rmap_item->address);
+       if (!vma)
                goto out;
 
        err = try_to_merge_one_page(vma, page, kpage);
@@ -1105,6 +1125,7 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
        get_anon_vma(vma->anon_vma);
 out:
        up_read(&mm->mmap_sem);
+       mmput(mm);
        return err;
 }
 
@@ -1178,7 +1199,18 @@ again:
                stable_node = rb_entry(*new, struct stable_node, node);
                tree_page = get_ksm_page(stable_node, false);
                if (!tree_page)
-                       return NULL;
+                       /*
+                        * If we walked over a stale stable_node,
+                        * get_ksm_page() will call rb_erase() and it
+                        * may rebalance the tree from under us. So
+                        * restart the search from scratch. Returning
+                        * NULL would be safe too, but we'd generate
+                        * false negative insertions just because some
+                        * stable_node was stale which would waste CPU
+                        * by doing the preparation work twice at the
+                        * next KSM pass.
+                        */
+                       goto again;
 
                ret = memcmp_pages(page, tree_page);
                put_page(tree_page);
@@ -1254,12 +1286,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
        unsigned long kpfn;
        struct rb_root *root;
        struct rb_node **new;
-       struct rb_node *parent = NULL;
+       struct rb_node *parent;
        struct stable_node *stable_node;
 
        kpfn = page_to_pfn(kpage);
        nid = get_kpfn_nid(kpfn);
        root = root_stable_tree + nid;
+again:
+       parent = NULL;
        new = &root->rb_node;
 
        while (*new) {
@@ -1270,7 +1304,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
                stable_node = rb_entry(*new, struct stable_node, node);
                tree_page = get_ksm_page(stable_node, false);
                if (!tree_page)
-                       return NULL;
+                       /*
+                        * If we walked over a stale stable_node,
+                        * get_ksm_page() will call rb_erase() and it
+                        * may rebalance the tree from under us. So
+                        * restart the search from scratch. Returning
+                        * NULL would be safe too, but we'd generate
+                        * false negative insertions just because some
+                        * stable_node was stale which would waste CPU
+                        * by doing the preparation work twice at the
+                        * next KSM pass.
+                        */
+                       goto again;
 
                ret = memcmp_pages(kpage, tree_page);
                put_page(tree_page);
@@ -1340,7 +1385,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
                cond_resched();
                tree_rmap_item = rb_entry(*new, struct rmap_item, node);
                tree_page = get_mergeable_page(tree_rmap_item);
-               if (IS_ERR_OR_NULL(tree_page))
+               if (!tree_page)
                        return NULL;
 
                /*
@@ -1620,8 +1665,7 @@ next_mm:
                                cond_resched();
                                continue;
                        }
-                       if (PageAnon(*page) ||
-                           page_trans_compound_anon(*page)) {
+                       if (PageAnon(*page)) {
                                flush_anon_page(vma, *page, ksm_scan.address);
                                flush_dcache_page(*page);
                                rmap_item = get_next_rmap_item(slot,
@@ -1884,7 +1928,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 
                SetPageDirty(new_page);
                __SetPageUptodate(new_page);
-               __set_page_locked(new_page);
+               __SetPageLocked(new_page);
        }
 
        return new_page;
@@ -1914,9 +1958,11 @@ again:
                struct anon_vma_chain *vmac;
                struct vm_area_struct *vma;
 
+               cond_resched();
                anon_vma_lock_read(anon_vma);
                anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
                                               0, ULONG_MAX) {
+                       cond_resched();
                        vma = vmac->vma;
                        if (rmap_item->address < vma->vm_start ||
                            rmap_item->address >= vma->vm_end)
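
A minimal sketch, not part of the diff above, of the pinning rule the new ksm_get_mm() comments describe: any mm that is not the current ksm_scan.mm_slot must be pinned before its mmap_sem is taken, and the pin dropped with mmput() afterwards. The wrapper name example_visit_foreign_mm() and the addr argument are illustrative; the helpers it calls are the ones shown in the hunks above.

static void example_visit_foreign_mm(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;

        if (!ksm_get_mm(mm))            /* mm_users already dropped to zero */
                return;
        down_read(&mm->mmap_sem);       /* exit_mmap() can no longer run under us */
        vma = find_mergeable_vma(mm, addr);
        if (vma)
                break_ksm(vma, addr);
        up_read(&mm->mmap_sem);
        mmput(mm);                      /* drop the pin; may free the mm */
}
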
index e1da19fac1b3629f7aeff4815a64681f9ca9bd60..afc71ea9a381f853faf65a05edc7f411a378d7e5 100644 (file)
@@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru)
 #ifdef CONFIG_MEMCG_KMEM
 static inline bool list_lru_memcg_aware(struct list_lru *lru)
 {
+       /*
+        * This needs node 0 to always be present, even on
+        * systems supporting sparse NUMA node ids.
+        */
        return !!lru->node[0].memcg_lrus;
 }
 
@@ -59,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
        return &nlru->lru;
 }
 
+static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+       struct page *page;
+
+       if (!memcg_kmem_enabled())
+               return NULL;
+       page = virt_to_head_page(ptr);
+       return page->mem_cgroup;
+}
+
 static inline struct list_lru_one *
 list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
 {
@@ -377,16 +391,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
 {
        int i;
 
-       for (i = 0; i < nr_node_ids; i++) {
-               if (!memcg_aware)
-                       lru->node[i].memcg_lrus = NULL;
-               else if (memcg_init_list_lru_node(&lru->node[i]))
+       if (!memcg_aware)
+               return 0;
+
+       for_each_node(i) {
+               if (memcg_init_list_lru_node(&lru->node[i]))
                        goto fail;
        }
        return 0;
 fail:
-       for (i = i - 1; i >= 0; i--)
+       for (i = i - 1; i >= 0; i--) {
+               if (!lru->node[i].memcg_lrus)
+                       continue;
                memcg_destroy_list_lru_node(&lru->node[i]);
+       }
        return -ENOMEM;
 }
 
@@ -397,7 +415,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
        if (!list_lru_memcg_aware(lru))
                return;
 
-       for (i = 0; i < nr_node_ids; i++)
+       for_each_node(i)
                memcg_destroy_list_lru_node(&lru->node[i]);
 }
 
@@ -409,16 +427,20 @@ static int memcg_update_list_lru(struct list_lru *lru,
        if (!list_lru_memcg_aware(lru))
                return 0;
 
-       for (i = 0; i < nr_node_ids; i++) {
+       for_each_node(i) {
                if (memcg_update_list_lru_node(&lru->node[i],
                                               old_size, new_size))
                        goto fail;
        }
        return 0;
 fail:
-       for (i = i - 1; i >= 0; i--)
+       for (i = i - 1; i >= 0; i--) {
+               if (!lru->node[i].memcg_lrus)
+                       continue;
+
                memcg_cancel_update_list_lru_node(&lru->node[i],
                                                  old_size, new_size);
+       }
        return -ENOMEM;
 }
 
@@ -430,7 +452,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru,
        if (!list_lru_memcg_aware(lru))
                return;
 
-       for (i = 0; i < nr_node_ids; i++)
+       for_each_node(i)
                memcg_cancel_update_list_lru_node(&lru->node[i],
                                                  old_size, new_size);
 }
@@ -485,7 +507,7 @@ static void memcg_drain_list_lru(struct list_lru *lru,
        if (!list_lru_memcg_aware(lru))
                return;
 
-       for (i = 0; i < nr_node_ids; i++)
+       for_each_node(i)
                memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
 }
 
@@ -522,7 +544,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
        if (!lru->node)
                goto out;
 
-       for (i = 0; i < nr_node_ids; i++) {
+       for_each_node(i) {
                spin_lock_init(&lru->node[i].lock);
                if (key)
                        lockdep_set_class(&lru->node[i].lock, key);
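
Illustration only, not part of the patch: on a machine whose possible nodes are {0, 2}, nr_node_ids is 3, so a plain index loop also visits node 1, which the memcg-aware setup above never initialises. for_each_node() skips such holes, which is why the error paths now tolerate NULL memcg_lrus. The helper name below is made up for the example.

static int example_count_initialised_nodes(struct list_lru *lru)
{
        int nid, nr = 0;

        for_each_node(nid)
                if (lru->node[nid].memcg_lrus)  /* only possible nodes were set up */
                        nr++;
        return nr;
}
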
index 34fe24759ed1f32cb58cd38d2245ae7b82dc7f37..d159b1c96e484d902f6edb34ebb5a83a3977bd57 100644 (file)
  *
  * Safely read from address @src to the buffer at @dst.  If a kernel fault
  * happens, handle that and return -EFAULT.
+ *
+ * We ensure that the copy_from_user is executed in atomic context so that
+ * do_page_fault() doesn't attempt to take mmap_sem.  This makes
+ * probe_kernel_read() suitable for use within regions where the caller
+ * already holds mmap_sem, or other locks which nest inside mmap_sem.
  */
 
 long __weak probe_kernel_read(void *dst, const void *src, size_t size)
@@ -99,5 +104,5 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
        pagefault_enable();
        set_fs(old_fs);
 
-       return ret < 0 ? ret : src - unsafe_addr;
+       return ret ? -EFAULT : src - unsafe_addr;
 }
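
A short sketch of the guarantee documented above, assuming a caller that already holds mmap_sem; the function name and the probed address are illustrative. Because probe_kernel_read() performs the copy with pagefaults disabled, a faulting address simply yields -EFAULT instead of recursing into mmap_sem.

static bool example_pointer_is_readable(struct mm_struct *mm, const void *addr)
{
        unsigned long val;
        bool readable;

        down_read(&mm->mmap_sem);               /* caller's existing lock */
        readable = !probe_kernel_read(&val, addr, sizeof(val));
        up_read(&mm->mmap_sem);

        return readable;
}
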
index c889fcbb530e98d8779ef75750e1fde08bf786cf..5db5464312854586a5987fc3d13fc4a21487dfc9 100644 (file)
@@ -20,6 +20,9 @@
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior)
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
+       case MADV_FREE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
@@ -256,6 +260,166 @@ static long madvise_willneed(struct vm_area_struct *vma,
        return 0;
 }
 
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+                               unsigned long end, struct mm_walk *walk)
+
+{
+       struct mmu_gather *tlb = walk->private;
+       struct mm_struct *mm = tlb->mm;
+       struct vm_area_struct *vma = walk->vma;
+       spinlock_t *ptl;
+       pte_t *pte, ptent;
+       struct page *page;
+       swp_entry_t entry;
+       unsigned long next;
+       int nr_swap = 0;
+
+       next = pmd_addr_end(addr, end);
+       if (pmd_trans_huge(*pmd)) {
+               if (next - addr != HPAGE_PMD_SIZE)
+                       split_huge_pmd(vma, pmd, addr);
+               else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr))
+                       goto next;
+               /* fall through */
+       }
+
+       if (pmd_trans_unstable(pmd))
+               return 0;
+
+       pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       arch_enter_lazy_mmu_mode();
+       for (; addr != end; pte++, addr += PAGE_SIZE) {
+               ptent = *pte;
+
+               if (pte_none(ptent))
+                       continue;
+               /*
+                * If the pte holds a swap entry, just clear the page
+                * table entry: swap-in would be more expensive than
+                * (page allocation + zeroing).
+                */
+               if (!pte_present(ptent)) {
+                       entry = pte_to_swp_entry(ptent);
+                       if (non_swap_entry(entry))
+                               continue;
+                       nr_swap--;
+                       free_swap_and_cache(entry);
+                       pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                       continue;
+               }
+
+               page = vm_normal_page(vma, addr, ptent);
+               if (!page)
+                       continue;
+
+               if (PageSwapCache(page) || PageDirty(page)) {
+                       if (!trylock_page(page))
+                               continue;
+                       /*
+                        * If the page is shared with others, we can't
+                        * clear its PG_dirty bit.
+                        */
+                       if (page_count(page) != 1 + !!PageSwapCache(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (PageSwapCache(page) && !try_to_free_swap(page)) {
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       ClearPageDirty(page);
+                       unlock_page(page);
+               }
+
+               /*
+                * Some architectures (e.g. PPC) don't update the TLB
+                * with set_pte_at() and tlb_remove_tlb_entry(), so for
+                * portability, re-install the pte as old|clean after
+                * clearing it.
+                */
+               ptent = ptep_get_and_clear_full(mm, addr, pte,
+                                               tlb->fullmm);
+               ptent = pte_mkold(ptent);
+               ptent = pte_mkclean(ptent);
+               set_pte_at(mm, addr, pte, ptent);
+               if (PageActive(page))
+                       deactivate_page(page);
+               tlb_remove_tlb_entry(tlb, pte, addr);
+       }
+
+       if (nr_swap) {
+               if (current->mm == mm)
+                       sync_mm_rss(mm);
+
+               add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+       }
+
+       arch_leave_lazy_mmu_mode();
+       pte_unmap_unlock(pte - 1, ptl);
+next:
+       cond_resched();
+       return 0;
+}
+
+static void madvise_free_page_range(struct mmu_gather *tlb,
+                            struct vm_area_struct *vma,
+                            unsigned long addr, unsigned long end)
+{
+       struct mm_walk free_walk = {
+               .pmd_entry = madvise_free_pte_range,
+               .mm = vma->vm_mm,
+               .private = tlb,
+       };
+
+       BUG_ON(addr >= end);
+       tlb_start_vma(tlb, vma);
+       walk_page_range(addr, end, &free_walk);
+       tlb_end_vma(tlb, vma);
+}
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+                       unsigned long start_addr, unsigned long end_addr)
+{
+       unsigned long start, end;
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_gather tlb;
+
+       if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+               return -EINVAL;
+
+       /* MADV_FREE works for only anon vma at the moment */
+       if (!vma_is_anonymous(vma))
+               return -EINVAL;
+
+       start = max(vma->vm_start, start_addr);
+       if (start >= vma->vm_end)
+               return -EINVAL;
+       end = min(vma->vm_end, end_addr);
+       if (end <= vma->vm_start)
+               return -EINVAL;
+
+       lru_add_drain();
+       tlb_gather_mmu(&tlb, mm, start, end);
+       update_hiwater_rss(mm);
+
+       mmu_notifier_invalidate_range_start(mm, start, end);
+       madvise_free_page_range(&tlb, vma, start, end);
+       mmu_notifier_invalidate_range_end(mm, start, end);
+       tlb_finish_mmu(&tlb, start, end);
+
+       return 0;
+}
+
+static long madvise_free(struct vm_area_struct *vma,
+                            struct vm_area_struct **prev,
+                            unsigned long start, unsigned long end)
+{
+       *prev = vma;
+       return madvise_free_single_vma(vma, start, end);
+}
+
 /*
  * Application no longer needs these pages.  If the pages are dirty,
  * it's OK to just throw them away.  The app will be more careful about
@@ -379,6 +543,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
+       case MADV_FREE:
+               /*
+                * XXX: In this implementation, MADV_FREE works like
+                * MADV_DONTNEED on a swapless system or when swap is full.
+                */
+               if (get_nr_swap_pages() > 0)
+                       return madvise_free(vma, prev, start, end);
+               /* passthrough */
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
@@ -398,6 +570,7 @@ madvise_behavior_valid(int behavior)
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
+       case MADV_FREE:
 #ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
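
A userspace sketch of the semantics the new MADV_FREE code implements, assuming libc headers that expose the MADV_FREE constant and a page-aligned buffer; the helper name is illustrative. Unlike MADV_DONTNEED, freed pages keep their contents until memory pressure actually reclaims them.

#include <stddef.h>
#include <sys/mman.h>

/* Tell the kernel a scratch buffer's contents are disposable. */
static void release_scratch(void *buf, size_t len)
{
#ifdef MADV_FREE
        if (madvise(buf, len, MADV_FREE) == 0)
                return;                         /* reclaimed lazily under pressure */
#endif
        madvise(buf, len, MADV_DONTNEED);       /* fallback: discard immediately */
}
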
index 1c7b647e58971ee77e0892bb0edbad5b40ec50fd..d300f1329814ba538b271fafabb0351d3182d639 100644 (file)
@@ -706,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
        return 0;
 }
 
-int __init_memblock memblock_remove_range(struct memblock_type *type,
+static int __init_memblock memblock_remove_range(struct memblock_type *type,
                                          phys_addr_t base, phys_addr_t size)
 {
        int start_rgn, end_rgn;
index b732edfddb767025185f27c8879903591c2b0c82..48735e7c617b3d9454b35ac5ea6a4f9ff5ef472c 100644 (file)
@@ -62,6 +62,7 @@
 #include <linux/oom.h>
 #include <linux/lockdep.h>
 #include <linux/file.h>
+#include <linux/tracehook.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -695,7 +696,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
                                         struct page *page,
-                                        int nr_pages)
+                                        bool compound, int nr_pages)
 {
        /*
         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
@@ -708,9 +709,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
                                nr_pages);
 
-       if (PageTransHuge(page))
+       if (compound) {
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
                                nr_pages);
+       }
 
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
@@ -1661,7 +1664,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-       if (!current->memcg_oom.may_oom)
+       if (!current->memcg_may_oom)
                return;
        /*
         * We are in the middle of the charge context here, so we
@@ -1678,9 +1681,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
         * and when we know whether the fault was overall successful.
         */
        css_get(&memcg->css);
-       current->memcg_oom.memcg = memcg;
-       current->memcg_oom.gfp_mask = mask;
-       current->memcg_oom.order = order;
+       current->memcg_in_oom = memcg;
+       current->memcg_oom_gfp_mask = mask;
+       current->memcg_oom_order = order;
 }
 
 /**
@@ -1702,7 +1705,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  */
 bool mem_cgroup_oom_synchronize(bool handle)
 {
-       struct mem_cgroup *memcg = current->memcg_oom.memcg;
+       struct mem_cgroup *memcg = current->memcg_in_oom;
        struct oom_wait_info owait;
        bool locked;
 
@@ -1730,8 +1733,8 @@ bool mem_cgroup_oom_synchronize(bool handle)
        if (locked && !memcg->oom_kill_disable) {
                mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
-               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-                                        current->memcg_oom.order);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+                                        current->memcg_oom_order);
        } else {
                schedule();
                mem_cgroup_unmark_under_oom(memcg);
@@ -1748,7 +1751,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
                memcg_oom_recover(memcg);
        }
 cleanup:
-       current->memcg_oom.memcg = NULL;
+       current->memcg_in_oom = NULL;
        css_put(&memcg->css);
        return true;
 }
@@ -1972,6 +1975,31 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
        return NOTIFY_OK;
 }
 
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+       unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       struct mem_cgroup *memcg, *pos;
+
+       if (likely(!nr_pages))
+               return;
+
+       pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+       do {
+               if (page_counter_read(&pos->memory) <= pos->high)
+                       continue;
+               mem_cgroup_events(pos, MEMCG_HIGH, 1);
+               try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+       } while ((pos = parent_mem_cgroup(pos)));
+
+       css_put(&memcg->css);
+       current->memcg_nr_pages_over_high = 0;
+}
+
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
 {
@@ -1982,13 +2010,12 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;
-       int ret = 0;
 
        if (mem_cgroup_is_root(memcg))
-               goto done;
+               return 0;
 retry:
        if (consume_stock(memcg, nr_pages))
-               goto done;
+               return 0;
 
        if (!do_swap_account ||
            !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
@@ -2016,12 +2043,12 @@ retry:
        if (unlikely(test_thread_flag(TIF_MEMDIE) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
-               goto bypass;
+               goto force;
 
        if (unlikely(task_in_memcg_oom(current)))
                goto nomem;
 
-       if (!(gfp_mask & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(gfp_mask))
                goto nomem;
 
        mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
@@ -2062,38 +2089,54 @@ retry:
                goto retry;
 
        if (gfp_mask & __GFP_NOFAIL)
-               goto bypass;
+               goto force;
 
        if (fatal_signal_pending(current))
-               goto bypass;
+               goto force;
 
        mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
 
-       mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
+       mem_cgroup_oom(mem_over_limit, gfp_mask,
+                      get_order(nr_pages * PAGE_SIZE));
 nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
                return -ENOMEM;
-bypass:
-       return -EINTR;
+force:
+       /*
+        * The allocation either can't fail or will lead to more memory
+        * being freed very soon.  Allow memory usage to go over the limit
+        * temporarily by force charging it.
+        */
+       page_counter_charge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_charge(&memcg->memsw, nr_pages);
+       css_get_many(&memcg->css, nr_pages);
+
+       return 0;
 
 done_restock:
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
-       if (!(gfp_mask & __GFP_WAIT))
-               goto done;
+
        /*
-        * If the hierarchy is above the normal consumption range,
-        * make the charging task trim their excess contribution.
+        * If the hierarchy is above the normal consumption range, schedule
+        * reclaim on returning to userland.  We could perform reclaim here
+        * if __GFP_WAIT is set, but let's always punt for simplicity and so
+        * that GFP_KERNEL can consistently be used during reclaim.  @memcg
+        * is not recorded as it most likely matches current's and won't
+        * change in the meantime.  As the high limit is checked again before
+        * reclaim, the cost of a mismatch is negligible.
         */
        do {
-               if (page_counter_read(&memcg->memory) <= memcg->high)
-                       continue;
-               mem_cgroup_events(memcg, MEMCG_HIGH, 1);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+               if (page_counter_read(&memcg->memory) > memcg->high) {
+                       current->memcg_nr_pages_over_high += nr_pages;
+                       set_notify_resume(current);
+                       break;
+               }
        } while ((memcg = parent_mem_cgroup(memcg)));
-done:
-       return ret;
+
+       return 0;
 }
 
 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2174,55 +2217,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                     unsigned long nr_pages)
-{
-       struct page_counter *counter;
-       int ret = 0;
-
-       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
-       if (ret < 0)
-               return ret;
-
-       ret = try_charge(memcg, gfp, nr_pages);
-       if (ret == -EINTR)  {
-               /*
-                * try_charge() chose to bypass to root due to OOM kill or
-                * fatal signal.  Since our only options are to either fail
-                * the allocation or charge it to this cgroup, do it as a
-                * temporary condition. But we can't fail. From a kmem/slab
-                * perspective, the cache has already been selected, by
-                * mem_cgroup_kmem_get_cache(), so it is too late to change
-                * our minds.
-                *
-                * This condition will only trigger if the task entered
-                * memcg_charge_kmem in a sane state, but was OOM-killed
-                * during try_charge() above. Tasks that were already dying
-                * when the allocation triggers should have been already
-                * directed to the root cgroup in memcontrol.h
-                */
-               page_counter_charge(&memcg->memory, nr_pages);
-               if (do_swap_account)
-                       page_counter_charge(&memcg->memsw, nr_pages);
-               css_get_many(&memcg->css, nr_pages);
-               ret = 0;
-       } else if (ret)
-               page_counter_uncharge(&memcg->kmem, nr_pages);
-
-       return ret;
-}
-
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       if (do_swap_account)
-               page_counter_uncharge(&memcg->memsw, nr_pages);
-
-       page_counter_uncharge(&memcg->kmem, nr_pages);
-
-       css_put_many(&memcg->css, nr_pages);
-}
-
 static int memcg_alloc_cache_id(void)
 {
        int id, size;
@@ -2384,85 +2378,59 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep)
                css_put(&cachep->memcg_params.memcg->css);
 }
 
-/*
- * We need to verify if the allocation against current->mm->owner's memcg is
- * possible for the given order. But the page is not allocated yet, so we'll
- * need a further commit step to do the final arrangements.
- *
- * It is possible for the task to switch cgroups in this mean time, so at
- * commit time, we can't rely on task conversion any longer.  We'll then use
- * the handle argument to return to the caller which cgroup we should commit
- * against. We could also return the memcg directly and avoid the pointer
- * passing, but a boolean return value gives better semantics considering
- * the compiled-out case as well.
- *
- * Returning true means the allocation is possible.
- */
-bool
-__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+                             struct mem_cgroup *memcg)
 {
-       struct mem_cgroup *memcg;
-       int ret;
+       unsigned int nr_pages = 1 << order;
+       struct page_counter *counter;
+       int ret = 0;
 
-       *_memcg = NULL;
+       if (!memcg_kmem_is_active(memcg))
+               return 0;
 
-       memcg = get_mem_cgroup_from_mm(current->mm);
+       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
+       if (ret)
+               return ret;
 
-       if (!memcg_kmem_is_active(memcg)) {
-               css_put(&memcg->css);
-               return true;
+       ret = try_charge(memcg, gfp, nr_pages);
+       if (ret) {
+               page_counter_uncharge(&memcg->kmem, nr_pages);
+               return ret;
        }
 
-       ret = memcg_charge_kmem(memcg, gfp, 1 << order);
-       if (!ret)
-               *_memcg = memcg;
+       page->mem_cgroup = memcg;
 
-       css_put(&memcg->css);
-       return (ret == 0);
+       return 0;
 }
 
-void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 {
-       VM_BUG_ON(mem_cgroup_is_root(memcg));
+       struct mem_cgroup *memcg;
+       int ret;
 
-       /* The page allocation failed. Revert */
-       if (!page) {
-               memcg_uncharge_kmem(memcg, 1 << order);
-               return;
-       }
-       page->mem_cgroup = memcg;
+       memcg = get_mem_cgroup_from_mm(current->mm);
+       ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+       css_put(&memcg->css);
+       return ret;
 }
 
-void __memcg_kmem_uncharge_pages(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
 {
        struct mem_cgroup *memcg = page->mem_cgroup;
+       unsigned int nr_pages = 1 << order;
 
        if (!memcg)
                return;
 
        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
 
-       memcg_uncharge_kmem(memcg, 1 << order);
-       page->mem_cgroup = NULL;
-}
-
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
-{
-       struct mem_cgroup *memcg = NULL;
-       struct kmem_cache *cachep;
-       struct page *page;
-
-       page = virt_to_head_page(ptr);
-       if (PageSlab(page)) {
-               cachep = page->slab_cache;
-               if (!is_root_cache(cachep))
-                       memcg = cachep->memcg_params.memcg;
-       } else
-               /* page allocated by alloc_kmem_pages */
-               memcg = page->mem_cgroup;
+       page_counter_uncharge(&memcg->kmem, nr_pages);
+       page_counter_uncharge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_uncharge(&memcg->memsw, nr_pages);
 
-       return memcg;
+       page->mem_cgroup = NULL;
+       css_put_many(&memcg->css, nr_pages);
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
@@ -2470,9 +2438,7 @@ struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
 
 /*
  * Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compound_lock.
- * charge/uncharge will be never happen and move_account() is done under
- * compound_lock(), so we don't have to take care of races.
+ * zone->lru_lock and migration entries setup in all page mappings.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
@@ -4400,28 +4366,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
 {
        int ret;
 
-       /* Try a single bulk charge without reclaim first */
-       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+       /* Try a single bulk charge without reclaim first, kswapd may wake */
+       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
        if (!ret) {
                mc.precharge += count;
                return ret;
        }
-       if (ret == -EINTR) {
-               cancel_charge(root_mem_cgroup, count);
-               return ret;
-       }
 
        /* Try charges one by one with reclaim */
        while (count--) {
                ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
-               /*
-                * In case of failure, any residual charges against
-                * mc.to will be dropped by mem_cgroup_clear_mc()
-                * later on.  However, cancel any charges that are
-                * bypassed to root right away or they'll be lost.
-                */
-               if (ret == -EINTR)
-                       cancel_charge(root_mem_cgroup, 1);
                if (ret)
                        return ret;
                mc.precharge++;
@@ -4547,39 +4501,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  * @from: mem_cgroup which the page is moved from.
  * @to:        mem_cgroup which the page is moved to. @from != @to.
  *
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
+ * The caller must make sure the page is not on LRU (isolate_page() is useful.)
  *
  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
  * from old cgroup.
  */
 static int mem_cgroup_move_account(struct page *page,
-                                  unsigned int nr_pages,
+                                  bool compound,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
 {
        unsigned long flags;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
        int ret;
        bool anon;
 
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
-       /*
-        * The page is isolated from LRU. So, collapse function
-        * will not handle this page. But page splitting can happen.
-        * Do this check under compound_page_lock(). The caller should
-        * hold it.
-        */
-       ret = -EBUSY;
-       if (nr_pages > 1 && !PageTransHuge(page))
-               goto out;
+       VM_BUG_ON(compound && !PageTransHuge(page));
 
        /*
-        * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
-        * of its source page while we change it: page migration takes
-        * both pages off the LRU, but page cache replacement doesn't.
+        * Prevent mem_cgroup_replace_page() from looking at
+        * page->mem_cgroup of its source page while we change it.
         */
+       ret = -EBUSY;
        if (!trylock_page(page))
                goto out;
 
@@ -4634,9 +4579,9 @@ static int mem_cgroup_move_account(struct page *page,
        ret = 0;
 
        local_irq_disable();
-       mem_cgroup_charge_statistics(to, page, nr_pages);
+       mem_cgroup_charge_statistics(to, page, compound, nr_pages);
        memcg_check_events(to, page);
-       mem_cgroup_charge_statistics(from, page, -nr_pages);
+       mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
        memcg_check_events(from, page);
        local_irq_enable();
 out_unlock:
@@ -4726,7 +4671,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
@@ -4910,17 +4855,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
        union mc_target target;
        struct page *page;
 
-       /*
-        * We don't take compound_lock() here but no race with splitting thp
-        * happens because:
-        *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
-        *    under splitting, which means there's no concurrent thp split,
-        *  - if another thread runs into split_huge_page() just after we
-        *    entered this if-block, the thread must wait for page table lock
-        *    to be unlocked in __split_huge_page_splitting(), where the main
-        *    part of thp split is not executed yet.
-        */
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (mc.precharge < HPAGE_PMD_NR) {
                        spin_unlock(ptl);
                        return 0;
@@ -4929,7 +4864,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                if (target_type == MC_TARGET_PAGE) {
                        page = target.page;
                        if (!isolate_lru_page(page)) {
-                               if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+                               if (!mem_cgroup_move_account(page, true,
                                                             mc.from, mc.to)) {
                                        mc.precharge -= HPAGE_PMD_NR;
                                        mc.moved_charge += HPAGE_PMD_NR;
@@ -4958,7 +4893,8 @@ retry:
                        page = target.page;
                        if (isolate_lru_page(page))
                                goto put;
-                       if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+                       if (!mem_cgroup_move_account(page, false,
+                                               mc.from, mc.to)) {
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
@@ -5085,7 +5021,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 static u64 memory_current_read(struct cgroup_subsys_state *css,
                               struct cftype *cft)
 {
-       return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+       return page_counter_read(&mem_cgroup_from_css(css)->memory);
 }
 
 static int memory_low_show(struct seq_file *m, void *v)
@@ -5197,6 +5133,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 static struct cftype memory_files[] = {
        {
                .name = "current",
+               .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_current_read,
        },
        {
@@ -5296,10 +5233,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
  * with mem_cgroup_cancel_charge() in case page instantiation fails.
  */
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
-                         gfp_t gfp_mask, struct mem_cgroup **memcgp)
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound)
 {
        struct mem_cgroup *memcg = NULL;
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
        int ret = 0;
 
        if (mem_cgroup_disabled())
@@ -5329,22 +5267,12 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                }
        }
 
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
-
        if (!memcg)
                memcg = get_mem_cgroup_from_mm(mm);
 
        ret = try_charge(memcg, gfp_mask, nr_pages);
 
        css_put(&memcg->css);
-
-       if (ret == -EINTR) {
-               memcg = root_mem_cgroup;
-               ret = 0;
-       }
 out:
        *memcgp = memcg;
        return ret;
@@ -5367,9 +5295,9 @@ out:
  * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
  */
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             bool lrucare)
+                             bool lrucare, bool compound)
 {
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
 
        VM_BUG_ON_PAGE(!page->mapping, page);
        VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5386,13 +5314,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
 
        commit_charge(page, memcg, lrucare);
 
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
-
        local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, page, nr_pages);
+       mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
        memcg_check_events(memcg, page);
        local_irq_enable();
 
@@ -5414,9 +5337,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
  *
  * Cancel a charge transaction started by mem_cgroup_try_charge().
  */
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+               bool compound)
 {
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
 
        if (mem_cgroup_disabled())
                return;
@@ -5428,11 +5352,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
        if (!memcg)
                return;
 
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
-
        cancel_charge(memcg, nr_pages);
 }
 
@@ -5559,7 +5478,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
 }
 
 /**
- * mem_cgroup_migrate - migrate a charge to another page
+ * mem_cgroup_replace_page - migrate a charge to another page
  * @oldpage: currently charged page
  * @newpage: page to transfer the charge to
  * @lrucare: either or both pages might be on the LRU already
@@ -5568,16 +5487,13 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
  *
  * Both pages must be locked, @newpage->mapping must be set up.
  */
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-                       bool lrucare)
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
 {
        struct mem_cgroup *memcg;
        int isolated;
 
        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
                       newpage);
@@ -5589,25 +5505,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
        if (newpage->mem_cgroup)
                return;
 
-       /*
-        * Swapcache readahead pages can get migrated before being
-        * charged, and migration from compaction can happen to an
-        * uncharged page when the PFN walker finds a page that
-        * reclaim just put back on the LRU but has not released yet.
-        */
+       /* Swapcache readahead pages can get replaced before being charged */
        memcg = oldpage->mem_cgroup;
        if (!memcg)
                return;
 
-       if (lrucare)
-               lock_page_lru(oldpage, &isolated);
-
+       lock_page_lru(oldpage, &isolated);
        oldpage->mem_cgroup = NULL;
+       unlock_page_lru(oldpage, isolated);
 
-       if (lrucare)
-               unlock_page_lru(oldpage, isolated);
-
-       commit_charge(newpage, memcg, lrucare);
+       commit_charge(newpage, memcg, true);
 }
 
 /*
@@ -5690,7 +5597,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
-       mem_cgroup_charge_statistics(memcg, page, -1);
+       mem_cgroup_charge_statistics(memcg, page, false, -1);
        memcg_check_events(memcg, page);
 }
 
index 95882692e747c2a534488190287e5954fba35d39..a2c987df80ebb8357556da359e50c9746b4e7721 100644 (file)
@@ -56,6 +56,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/mm_inline.h>
 #include <linux/kfifo.h>
+#include <linux/ratelimit.h>
 #include "internal.h"
 #include "ras/ras_event.h"
 
@@ -775,8 +776,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 #define lru            (1UL << PG_lru)
 #define swapbacked     (1UL << PG_swapbacked)
 #define head           (1UL << PG_head)
-#define tail           (1UL << PG_tail)
-#define compound       (1UL << PG_compound)
 #define slab           (1UL << PG_slab)
 #define reserved       (1UL << PG_reserved)
 
@@ -799,12 +798,7 @@ static struct page_state {
         */
        { slab,         slab,           MF_MSG_SLAB,    me_kernel },
 
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
        { head,         head,           MF_MSG_HUGE,            me_huge_page },
-       { tail,         tail,           MF_MSG_HUGE,            me_huge_page },
-#else
-       { compound,     compound,       MF_MSG_HUGE,            me_huge_page },
-#endif
 
        { sc|dirty,     sc|dirty,       MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
        { sc|dirty,     sc,             MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
@@ -1155,7 +1149,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        }
 
        if (!PageHuge(p) && PageTransHuge(hpage)) {
+               lock_page(hpage);
                if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+                       unlock_page(hpage);
                        if (!PageAnon(hpage))
                                pr_err("MCE: %#lx: non anonymous thp\n", pfn);
                        else
@@ -1165,6 +1161,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                        put_hwpoison_page(p);
                        return -EBUSY;
                }
+               unlock_page(hpage);
                VM_BUG_ON_PAGE(!page_count(p), p);
                hpage = compound_head(p);
        }
@@ -1172,7 +1169,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        /*
         * We ignore non-LRU pages for good reasons.
         * - PG_locked is only well defined for LRU pages and a few others
-        * - to avoid races with __set_page_locked()
+        * - to avoid races with __SetPageLocked()
         * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
@@ -1403,6 +1400,12 @@ static int __init memory_failure_init(void)
 }
 core_initcall(memory_failure_init);
 
+#define unpoison_pr_info(fmt, pfn, rs)                 \
+({                                                     \
+       if (__ratelimit(rs))                            \
+               pr_info(fmt, pfn);                      \
+})
+
 /**
  * unpoison_memory - Unpoison a previously poisoned page
  * @pfn: Page number of the to be unpoisoned page
@@ -1421,6 +1424,8 @@ int unpoison_memory(unsigned long pfn)
        struct page *p;
        int freeit = 0;
        unsigned int nr_pages;
+       static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                       DEFAULT_RATELIMIT_BURST);
 
        if (!pfn_valid(pfn))
                return -ENXIO;
@@ -1429,23 +1434,26 @@ int unpoison_memory(unsigned long pfn)
        page = compound_head(p);
 
        if (!PageHWPoison(p)) {
-               pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
+               unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
+                                pfn, &unpoison_rs);
                return 0;
        }
 
        if (page_count(page) > 1) {
-               pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
+               unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
+                                pfn, &unpoison_rs);
                return 0;
        }
 
        if (page_mapped(page)) {
-               pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
+               unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
+                                pfn, &unpoison_rs);
                return 0;
        }
 
        if (page_mapping(page)) {
-               pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
-                       pfn);
+               unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+                                pfn, &unpoison_rs);
                return 0;
        }
 
@@ -1455,7 +1463,8 @@ int unpoison_memory(unsigned long pfn)
         * In such case, we yield to memory_failure() and make unpoison fail.
         */
        if (!PageHuge(page) && PageTransHuge(page)) {
-               pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
+               unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
+                                pfn, &unpoison_rs);
                return 0;
        }
 
@@ -1469,12 +1478,14 @@ int unpoison_memory(unsigned long pfn)
                 * to the end.
                 */
                if (PageHuge(page)) {
-                       pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+                       unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
+                                        pfn, &unpoison_rs);
                        return 0;
                }
                if (TestClearPageHWPoison(p))
                        num_poisoned_pages_dec();
-               pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
+               unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
+                                pfn, &unpoison_rs);
                return 0;
        }
 
@@ -1486,7 +1497,8 @@ int unpoison_memory(unsigned long pfn)
         * the free buddy page pool.
         */
        if (TestClearPageHWPoison(page)) {
-               pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
+               unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
+                                pfn, &unpoison_rs);
                num_poisoned_pages_sub(nr_pages);
                freeit = 1;
                if (PageHuge(page))
@@ -1742,7 +1754,10 @@ int soft_offline_page(struct page *page, int flags)
                return -EBUSY;
        }
        if (!PageHuge(page) && PageTransHuge(hpage)) {
-               if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+               lock_page(page);
+               ret = split_huge_page(hpage);
+               unlock_page(page);
+               if (unlikely(ret)) {
                        pr_info("soft offline: %#lx: failed to split THP\n",
                                pfn);
                        if (flags & MF_COUNT_INCREASED)
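
The unpoison_pr_info() macro introduced above is an instance of the generic printk ratelimit pattern; a minimal sketch of that pattern follows, with the function name and message text purely illustrative.

static void example_report_event(unsigned long pfn)
{
        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);

        if (__ratelimit(&rs))                   /* at most "burst" prints per interval */
                pr_info("example: event on page %#lx\n", pfn);
}
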
index deb679c31f2ab897cafebf72643aec4f66233308..7f3b9f2769ad80b6dee13896e624267b7ab2eeda 100644 (file)
@@ -566,7 +566,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        spinlock_t *ptl;
        pgtable_t new = pte_alloc_one(mm, address);
-       int wait_split_huge_page;
        if (!new)
                return -ENOMEM;
 
@@ -586,18 +585,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
        ptl = pmd_lock(mm, pmd);
-       wait_split_huge_page = 0;
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                atomic_long_inc(&mm->nr_ptes);
                pmd_populate(mm, pmd, new);
                new = NULL;
-       } else if (unlikely(pmd_trans_splitting(*pmd)))
-               wait_split_huge_page = 1;
+       }
        spin_unlock(ptl);
        if (new)
                pte_free(mm, new);
-       if (wait_split_huge_page)
-               wait_split_huge_page(vma->anon_vma, pmd);
        return 0;
 }
 
@@ -613,8 +608,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                pmd_populate_kernel(&init_mm, pmd, new);
                new = NULL;
-       } else
-               VM_BUG_ON(pmd_trans_splitting(*pmd));
+       }
        spin_unlock(&init_mm.page_table_lock);
        if (new)
                pte_free_kernel(&init_mm, new);
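
Note: with pmd_trans_splitting() gone there is no transient "PMD is being split" state, so __pte_alloc() has nothing to wait for and __pte_alloc_kernel() loses the corresponding VM_BUG_ON(). Stripped of the diff markers, the populate section of __pte_alloc() now reads:

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		atomic_long_inc(&mm->nr_ptes);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
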
@@ -873,7 +867,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        page = vm_normal_page(vma, addr, pte);
        if (page) {
                get_page(page);
-               page_dup_rmap(page);
+               page_dup_rmap(page, false);
                if (PageAnon(page))
                        rss[MM_ANONPAGES]++;
                else
@@ -1125,7 +1119,7 @@ again:
                                        mark_page_accessed(page);
                                rss[MM_FILEPAGES]--;
                        }
-                       page_remove_rmap(page);
+                       page_remove_rmap(page, false);
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
                        if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1204,7 +1198,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                        BUG();
                                }
 #endif
-                               split_huge_page_pmd(vma, addr, pmd);
+                               split_huge_pmd(vma, pmd, addr);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
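
Note: split_huge_page_pmd() becomes split_huge_pmd() here and in the mprotect/mremap/mempolicy hunks below. Under the reworked refcounting the two operations are distinct, as this sketch of the calling conventions used in these hunks shows:

	/* PMD-level split: keeps the compound page, just remaps it with PTEs;
	 * no page lock needed. */
	split_huge_pmd(vma, pmd, addr);

	/* Page-level split: breaks up the compound page itself; the caller
	 * must hold the page lock, 0 is returned on success. */
	lock_page(page);
	err = split_huge_page(page);
	unlock_page(page);
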
@@ -2083,7 +2077,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                cow_user_page(new_page, old_page, address, vma);
        }
 
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
                goto oom_free_new;
 
        __SetPageUptodate(new_page);
@@ -2113,8 +2107,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, address, page_table);
-               page_add_new_anon_rmap(new_page, vma, address);
-               mem_cgroup_commit_charge(new_page, memcg, false);
+               page_add_new_anon_rmap(new_page, vma, address, false);
+               mem_cgroup_commit_charge(new_page, memcg, false, false);
                lru_cache_add_active_or_unevictable(new_page, vma);
                /*
                 * We call the notify macro here because, when using secondary
@@ -2146,14 +2140,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
-                       page_remove_rmap(old_page);
+                       page_remove_rmap(old_page, false);
                }
 
                /* Free the old page.. */
                new_page = old_page;
                page_copied = 1;
        } else {
-               mem_cgroup_cancel_charge(new_page, memcg);
+               mem_cgroup_cancel_charge(new_page, memcg, false);
        }
 
        if (new_page)
@@ -2168,7 +2162,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                        lock_page(old_page);    /* LRU manipulation */
-                       munlock_vma_page(old_page);
+                       if (PageMlocked(old_page))
+                               munlock_vma_page(old_page);
                        unlock_page(old_page);
                }
                page_cache_release(old_page);
@@ -2443,7 +2438,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
  * We return with the mmap_sem locked or unlocked in the same cases
  * as does filemap_fault().
  */
-static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
                unsigned int flags, pte_t orig_pte)
 {
@@ -2528,7 +2523,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_page;
        }
 
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
                ret = VM_FAULT_OOM;
                goto out_page;
        }
@@ -2562,7 +2557,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
-               exclusive = 1;
+               exclusive = RMAP_EXCLUSIVE;
        }
        flush_icache_page(vma, page);
        if (pte_swp_soft_dirty(orig_pte))
@@ -2570,10 +2565,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        set_pte_at(mm, address, page_table, pte);
        if (page == swapcache) {
                do_page_add_anon_rmap(page, vma, address, exclusive);
-               mem_cgroup_commit_charge(page, memcg, true);
+               mem_cgroup_commit_charge(page, memcg, true, false);
        } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, address);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, address, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        }
 
@@ -2608,7 +2603,7 @@ unlock:
 out:
        return ret;
 out_nomap:
-       mem_cgroup_cancel_charge(page, memcg);
+       mem_cgroup_cancel_charge(page, memcg, false);
        pte_unmap_unlock(page_table, ptl);
 out_page:
        unlock_page(page);
@@ -2702,7 +2697,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!page)
                goto oom;
 
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
                goto oom_free_page;
 
        /*
@@ -2723,15 +2718,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(page_table, ptl);
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, false);
                page_cache_release(page);
                return handle_userfault(vma, address, flags,
                                        VM_UFFD_MISSING);
        }
 
        inc_mm_counter_fast(mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, vma, address);
-       mem_cgroup_commit_charge(page, memcg, false);
+       page_add_new_anon_rmap(page, vma, address, false);
+       mem_cgroup_commit_charge(page, memcg, false, false);
        lru_cache_add_active_or_unevictable(page, vma);
 setpte:
        set_pte_at(mm, address, page_table, entry);
@@ -2742,7 +2737,7 @@ unlock:
        pte_unmap_unlock(page_table, ptl);
        return 0;
 release:
-       mem_cgroup_cancel_charge(page, memcg);
+       mem_cgroup_cancel_charge(page, memcg, false);
        page_cache_release(page);
        goto unlock;
 oom_free_page:
@@ -2818,7 +2813,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (anon) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-               page_add_new_anon_rmap(page, vma, address);
+               page_add_new_anon_rmap(page, vma, address, false);
        } else {
                inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
                page_add_file_rmap(page);
@@ -2993,7 +2988,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!new_page)
                return VM_FAULT_OOM;
 
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
+       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
                page_cache_release(new_page);
                return VM_FAULT_OOM;
        }
@@ -3022,7 +3017,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                goto uncharge_out;
        }
        do_set_pte(vma, address, new_page, pte, true, true);
-       mem_cgroup_commit_charge(new_page, memcg, false);
+       mem_cgroup_commit_charge(new_page, memcg, false, false);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pte_unmap_unlock(pte, ptl);
        if (fault_page) {
@@ -3037,7 +3032,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
        return ret;
 uncharge_out:
-       mem_cgroup_cancel_charge(new_page, memcg);
+       mem_cgroup_cancel_charge(new_page, memcg, false);
        page_cache_release(new_page);
        return ret;
 }
@@ -3089,7 +3084,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
         * release semantics to prevent the compiler from undoing this copying.
         */
-       mapping = fault_page->mapping;
+       mapping = page_rmapping(fault_page);
        unlock_page(fault_page);
        if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
                /*
@@ -3191,6 +3186,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                return 0;
        }
 
+       /* TODO: handle PTE-mapped THP */
+       if (PageCompound(page)) {
+               pte_unmap_unlock(ptep, ptl);
+               return 0;
+       }
+
        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
@@ -3366,14 +3367,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                if (pmd_trans_huge(orig_pmd)) {
                        unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
-                       /*
-                        * If the pmd is splitting, return and retry the
-                        * the fault.  Alternative: wait until the split
-                        * is done, and goto retry.
-                        */
-                       if (pmd_trans_splitting(orig_pmd))
-                               return 0;
-
                        if (pmd_protnone(orig_pmd))
                                return do_huge_pmd_numa_page(mm, vma, address,
                                                             orig_pmd, pmd);
index 0780d118d26e70d5cfc3a3e67ceca4915684a003..67d488ab495e57b9018484932e135078c787903c 100644 (file)
@@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone,
                        unsigned long start_pfn, unsigned long num_pages)
 {
        if (!zone_is_initialized(zone))
-               return init_currently_empty_zone(zone, start_pfn, num_pages,
-                                                MEMMAP_HOTPLUG);
+               return init_currently_empty_zone(zone, start_pfn, num_pages);
+
        return 0;
 }
 
index 87a177917cb2e60a13b09e6a53836ccd9f9275bf..4a02de4e173ff3d05b94bb3b482035e629a938dd 100644 (file)
@@ -489,14 +489,31 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
        struct page *page;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
-       int nid;
+       int nid, ret;
        pte_t *pte;
        spinlock_t *ptl;
 
-       split_huge_page_pmd(vma, addr, pmd);
-       if (pmd_trans_unstable(pmd))
-               return 0;
+       if (pmd_trans_huge(*pmd)) {
+               ptl = pmd_lock(walk->mm, pmd);
+               if (pmd_trans_huge(*pmd)) {
+                       page = pmd_page(*pmd);
+                       if (is_huge_zero_page(page)) {
+                               spin_unlock(ptl);
+                               split_huge_pmd(vma, pmd, addr);
+                       } else {
+                               get_page(page);
+                               spin_unlock(ptl);
+                               lock_page(page);
+                               ret = split_huge_page(page);
+                               unlock_page(page);
+                               put_page(page);
+                               if (ret)
+                                       return 0;
+                       }
+               }
+       }
 
+retry:
        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                if (!pte_present(*pte))
@@ -513,6 +530,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
                nid = page_to_nid(page);
                if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
                        continue;
+               if (PageTail(page) && PageAnon(page)) {
+                       get_page(page);
+                       pte_unmap_unlock(pte, ptl);
+                       lock_page(page);
+                       ret = split_huge_page(page);
+                       unlock_page(page);
+                       put_page(page);
+                       /* Failed to split -- skip. */
+                       if (ret) {
+                               pte = pte_offset_map_lock(walk->mm, pmd,
+                                               addr, &ptl);
+                               continue;
+                       }
+                       goto retry;
+               }
 
                if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                        migrate_page_add(page, qp->pagelist, flags);
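
Note: taken together, the new THP handling in queue_pages_pte_range() is: a huge zero page is handled by splitting only the PMD; a real THP is pinned, locked and split with split_huge_page(); and any anonymous tail page met during the PTE scan is split as well, restarting the scan via the retry label. Assembled from the '+' lines above, the leading check is:

	if (pmd_trans_huge(*pmd)) {
		ptl = pmd_lock(walk->mm, pmd);
		if (pmd_trans_huge(*pmd)) {
			page = pmd_page(*pmd);
			if (is_huge_zero_page(page)) {
				spin_unlock(ptl);
				split_huge_pmd(vma, pmd, addr);
			} else {
				get_page(page);
				spin_unlock(ptl);
				lock_page(page);
				ret = split_huge_page(page);
				unlock_page(page);
				put_page(page);
				if (ret)
					return 0;
			}
		}
	}
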
index 4c533bc51d733989f12c9f5e25cad713f647baea..004d42b1dfaf928ab174e057696afa580447f3d9 100644 (file)
@@ -320,13 +320,13 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
        gfp_t gfp_temp;
 
        VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
        gfp_mask |= __GFP_NOMEMALLOC;   /* don't allocate emergency reserves */
        gfp_mask |= __GFP_NORETRY;      /* don't loop in __alloc_pages */
        gfp_mask |= __GFP_NOWARN;       /* failures are OK */
 
-       gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
+       gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
 
 repeat_alloc:
 
@@ -349,7 +349,7 @@ repeat_alloc:
        }
 
        /*
-        * We use gfp mask w/o __GFP_WAIT or IO for the first round.  If
+        * We use gfp mask w/o direct reclaim or IO for the first round.  If
         * alloc failed with that and @pool was empty, retry immediately.
         */
        if (gfp_temp != gfp_mask) {
@@ -358,8 +358,8 @@ repeat_alloc:
                goto repeat_alloc;
        }
 
-       /* We must not sleep if !__GFP_WAIT */
-       if (!(gfp_mask & __GFP_WAIT)) {
+       /* We must not sleep if !__GFP_DIRECT_RECLAIM */
+       if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
                spin_unlock_irqrestore(&pool->lock, flags);
                return NULL;
        }
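
Note: the mempool hunks follow the gfp-flag rework. Whether mempool_alloc() may sleep, and therefore may wait for an element to be returned instead of failing, is now keyed off __GFP_DIRECT_RECLAIM rather than the old __GFP_WAIT. A caller-side sketch (illustrative, not part of this patch):

	void *elem;

	/* Direct reclaim allowed: may sleep and waits for a freed element
	 * rather than returning NULL. */
	elem = mempool_alloc(pool, GFP_NOIO);

	/* No direct reclaim: must not sleep, can return NULL. */
	elem = mempool_alloc(pool, GFP_NOWAIT);
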
index 842ecd7aaf7fa6ac1371f6137dc155c91851505c..1ae0113559c980d1da49174e777dfb454dcd0d9d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Memory Migration functionality - linux/mm/migration.c
+ * Memory Migration functionality - linux/mm/migrate.c
  *
  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
  *
@@ -30,7 +30,7 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
-#include <linux/memcontrol.h>
+#include <linux/backing-dev.h>
 #include <linux/syscalls.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
@@ -165,12 +165,15 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                if (PageAnon(new))
                        hugepage_add_anon_rmap(new, vma, addr);
                else
-                       page_dup_rmap(new);
+                       page_dup_rmap(new, false);
        } else if (PageAnon(new))
-               page_add_anon_rmap(new, vma, addr);
+               page_add_anon_rmap(new, vma, addr, false);
        else
                page_add_file_rmap(new);
 
+       if (vma->vm_flags & VM_LOCKED)
+               mlock_vma_page(new);
+
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, addr, ptep);
 unlock:
@@ -311,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping,
                struct buffer_head *head, enum migrate_mode mode,
                int extra_count)
 {
+       struct zone *oldzone, *newzone;
+       int dirty;
        int expected_count = 1 + extra_count;
        void **pslot;
 
@@ -318,9 +323,20 @@ int migrate_page_move_mapping(struct address_space *mapping,
                /* Anonymous page without mapping */
                if (page_count(page) != expected_count)
                        return -EAGAIN;
+
+               /* No turning back from here */
+               set_page_memcg(newpage, page_memcg(page));
+               newpage->index = page->index;
+               newpage->mapping = page->mapping;
+               if (PageSwapBacked(page))
+                       SetPageSwapBacked(newpage);
+
                return MIGRATEPAGE_SUCCESS;
        }
 
+       oldzone = page_zone(page);
+       newzone = page_zone(newpage);
+
        spin_lock_irq(&mapping->tree_lock);
 
        pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -353,14 +369,28 @@ int migrate_page_move_mapping(struct address_space *mapping,
        }
 
        /*
-        * Now we know that no one else is looking at the page.
+        * Now we know that no one else is looking at the page:
+        * no turning back from here.
         */
+       set_page_memcg(newpage, page_memcg(page));
+       newpage->index = page->index;
+       newpage->mapping = page->mapping;
+       if (PageSwapBacked(page))
+               SetPageSwapBacked(newpage);
+
        get_page(newpage);      /* add cache reference */
        if (PageSwapCache(page)) {
                SetPageSwapCache(newpage);
                set_page_private(newpage, page_private(page));
        }
 
+       /* Move dirty while page refs frozen and newpage not yet exposed */
+       dirty = PageDirty(page);
+       if (dirty) {
+               ClearPageDirty(page);
+               SetPageDirty(newpage);
+       }
+
        radix_tree_replace_slot(pslot, newpage);
 
        /*
@@ -370,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping,
         */
        page_unfreeze_refs(page, expected_count - 1);
 
+       spin_unlock(&mapping->tree_lock);
+       /* Leave irq disabled to prevent preemption while updating stats */
+
        /*
         * If moved to a different zone then also account
         * the page for that zone. Other VM counters will be
@@ -380,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping,
         * via NR_FILE_PAGES and NR_ANON_PAGES if they
         * are mapped to swap space.
         */
-       __dec_zone_page_state(page, NR_FILE_PAGES);
-       __inc_zone_page_state(newpage, NR_FILE_PAGES);
-       if (!PageSwapCache(page) && PageSwapBacked(page)) {
-               __dec_zone_page_state(page, NR_SHMEM);
-               __inc_zone_page_state(newpage, NR_SHMEM);
+       if (newzone != oldzone) {
+               __dec_zone_state(oldzone, NR_FILE_PAGES);
+               __inc_zone_state(newzone, NR_FILE_PAGES);
+               if (PageSwapBacked(page) && !PageSwapCache(page)) {
+                       __dec_zone_state(oldzone, NR_SHMEM);
+                       __inc_zone_state(newzone, NR_SHMEM);
+               }
+               if (dirty && mapping_cap_account_dirty(mapping)) {
+                       __dec_zone_state(oldzone, NR_FILE_DIRTY);
+                       __inc_zone_state(newzone, NR_FILE_DIRTY);
+               }
        }
-       spin_unlock_irq(&mapping->tree_lock);
+       local_irq_enable();
 
        return MIGRATEPAGE_SUCCESS;
 }
@@ -401,12 +440,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
        int expected_count;
        void **pslot;
 
-       if (!mapping) {
-               if (page_count(page) != 1)
-                       return -EAGAIN;
-               return MIGRATEPAGE_SUCCESS;
-       }
-
        spin_lock_irq(&mapping->tree_lock);
 
        pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -424,6 +457,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
                return -EAGAIN;
        }
 
+       set_page_memcg(newpage, page_memcg(page));
+       newpage->index = page->index;
+       newpage->mapping = page->mapping;
        get_page(newpage);
 
        radix_tree_replace_slot(pslot, newpage);
@@ -510,20 +546,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
        if (PageMappedToDisk(page))
                SetPageMappedToDisk(newpage);
 
-       if (PageDirty(page)) {
-               clear_page_dirty_for_io(page);
-               /*
-                * Want to mark the page and the radix tree as dirty, and
-                * redo the accounting that clear_page_dirty_for_io undid,
-                * but we can't use set_page_dirty because that function
-                * is actually a signal that all of the page has become dirty.
-                * Whereas only part of our page may be dirty.
-                */
-               if (PageSwapBacked(page))
-                       SetPageDirty(newpage);
-               else
-                       __set_page_dirty_nobuffers(newpage);
-       }
+       /* Move dirty on pages not done by migrate_page_move_mapping() */
+       if (PageDirty(page))
+               SetPageDirty(newpage);
 
        if (page_is_young(page))
                set_page_young(newpage);
@@ -537,7 +562,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
        cpupid = page_cpupid_xchg_last(page, -1);
        page_cpupid_xchg_last(newpage, cpupid);
 
-       mlock_migrate_page(newpage, page);
        ksm_migrate_page(newpage, page);
        /*
         * Please do not reorder this without considering how mm/ksm.c's
@@ -721,33 +745,13 @@ static int fallback_migrate_page(struct address_space *mapping,
  *  MIGRATEPAGE_SUCCESS - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-                               int page_was_mapped, enum migrate_mode mode)
+                               enum migrate_mode mode)
 {
        struct address_space *mapping;
        int rc;
 
-       /*
-        * Block others from accessing the page when we get around to
-        * establishing additional references. We are the only one
-        * holding a reference to the new page at this point.
-        */
-       if (!trylock_page(newpage))
-               BUG();
-
-       /* Prepare mapping for the new page.*/
-       newpage->index = page->index;
-       newpage->mapping = page->mapping;
-       if (PageSwapBacked(page))
-               SetPageSwapBacked(newpage);
-
-       /*
-        * Indirectly called below, migrate_page_copy() copies PG_dirty and thus
-        * needs newpage's memcg set to transfer memcg dirty page accounting.
-        * So perform memcg migration in two steps:
-        * 1. set newpage->mem_cgroup (here)
-        * 2. clear page->mem_cgroup (below)
-        */
-       set_page_memcg(newpage, page_memcg(page));
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
 
        mapping = page_mapping(page);
        if (!mapping)
@@ -759,23 +763,19 @@ static int move_to_new_page(struct page *newpage, struct page *page,
                 * space which also has its own migratepage callback. This
                 * is the most common path for page migration.
                 */
-               rc = mapping->a_ops->migratepage(mapping,
-                                               newpage, page, mode);
+               rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
        else
                rc = fallback_migrate_page(mapping, newpage, page, mode);
 
-       if (rc != MIGRATEPAGE_SUCCESS) {
-               set_page_memcg(newpage, NULL);
-               newpage->mapping = NULL;
-       } else {
+       /*
+        * When successful, old pagecache page->mapping must be cleared before
+        * page is freed; but stats require that PageAnon be left as PageAnon.
+        */
+       if (rc == MIGRATEPAGE_SUCCESS) {
                set_page_memcg(page, NULL);
-               if (page_was_mapped)
-                       remove_migration_ptes(page, newpage);
-               page->mapping = NULL;
+               if (!PageAnon(page))
+                       page->mapping = NULL;
        }
-
-       unlock_page(newpage);
-
        return rc;
 }
 
@@ -824,6 +824,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                        goto out_unlock;
                wait_on_page_writeback(page);
        }
+
        /*
         * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
         * we cannot notice that anon_vma is freed while we migrates a page.
@@ -831,34 +832,26 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
         * of migration. File cache pages are no problem because of page_lock()
         * File Caches may use write_page() or lock_page() in migration, then,
         * just care Anon page here.
+        *
+        * Only page_get_anon_vma() understands the subtleties of
+        * getting a hold on an anon_vma from outside one of its mms.
+        * But if we cannot get anon_vma, then we won't need it anyway,
+        * because that implies that the anon page is no longer mapped
+        * (and cannot be remapped so long as we hold the page lock).
         */
-       if (PageAnon(page) && !PageKsm(page)) {
-               /*
-                * Only page_lock_anon_vma_read() understands the subtleties of
-                * getting a hold on an anon_vma from outside one of its mms.
-                */
+       if (PageAnon(page) && !PageKsm(page))
                anon_vma = page_get_anon_vma(page);
-               if (anon_vma) {
-                       /*
-                        * Anon page
-                        */
-               } else if (PageSwapCache(page)) {
-                       /*
-                        * We cannot be sure that the anon_vma of an unmapped
-                        * swapcache page is safe to use because we don't
-                        * know in advance if the VMA that this page belonged
-                        * to still exists. If the VMA and others sharing the
-                        * data have been freed, then the anon_vma could
-                        * already be invalid.
-                        *
-                        * To avoid this possibility, swapcache pages get
-                        * migrated but are not remapped when migration
-                        * completes
-                        */
-               } else {
-                       goto out_unlock;
-               }
-       }
+
+       /*
+        * Block others from accessing the new page when we get around to
+        * establishing additional references. We are usually the only one
+        * holding a reference to newpage at this point. We used to have a BUG
+        * here if trylock_page(newpage) fails, but would like to allow for
+        * cases where there might be a race with the previous use of newpage.
+        * This is much like races on refcount of oldpage: just don't BUG().
+        */
+       if (unlikely(!trylock_page(newpage)))
+               goto out_unlock;
 
        if (unlikely(isolated_balloon_page(page))) {
                /*
@@ -869,7 +862,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                 * the page migration right away (proteced by page lock).
                 */
                rc = balloon_page_migrate(newpage, page, mode);
-               goto out_unlock;
+               goto out_unlock_both;
        }
 
        /*
@@ -888,30 +881,30 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                VM_BUG_ON_PAGE(PageAnon(page), page);
                if (page_has_private(page)) {
                        try_to_free_buffers(page);
-                       goto out_unlock;
+                       goto out_unlock_both;
                }
-               goto skip_unmap;
-       }
-
-       /* Establish migration ptes or remove ptes */
-       if (page_mapped(page)) {
+       } else if (page_mapped(page)) {
+               /* Establish migration ptes */
+               VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
+                               page);
                try_to_unmap(page,
                        TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
                page_was_mapped = 1;
        }
 
-skip_unmap:
        if (!page_mapped(page))
-               rc = move_to_new_page(newpage, page, page_was_mapped, mode);
+               rc = move_to_new_page(newpage, page, mode);
 
-       if (rc && page_was_mapped)
-               remove_migration_ptes(page, page);
+       if (page_was_mapped)
+               remove_migration_ptes(page,
+                       rc == MIGRATEPAGE_SUCCESS ? newpage : page);
 
+out_unlock_both:
+       unlock_page(newpage);
+out_unlock:
        /* Drop an anon_vma reference if we took one */
        if (anon_vma)
                put_anon_vma(anon_vma);
-
-out_unlock:
        unlock_page(page);
 out:
        return rc;
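
Note: locking in the migration path is reorganized. __unmap_and_move() now takes the lock on newpage itself with a non-fatal trylock, move_to_new_page() asserts that both pages are locked, and remove_migration_ptes() is always run, pointing the PTEs at newpage on success or back at the old page on failure. Roughly, the resulting flow (simplified sketch assembled from the hunks above, error paths omitted):

	lock_page(page);
	if (trylock_page(newpage)) {
		if (page_mapped(page)) {
			try_to_unmap(page,
				TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
			page_was_mapped = 1;
		}
		if (!page_mapped(page))
			rc = move_to_new_page(newpage, page, mode);
		if (page_was_mapped)
			remove_migration_ptes(page,
				rc == MIGRATEPAGE_SUCCESS ? newpage : page);
		unlock_page(newpage);
	}
	unlock_page(page);
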
@@ -937,10 +930,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
                                   int force, enum migrate_mode mode,
                                   enum migrate_reason reason)
 {
-       int rc = 0;
+       int rc = MIGRATEPAGE_SUCCESS;
        int *result = NULL;
-       struct page *newpage = get_new_page(page, private, &result);
+       struct page *newpage;
 
+       newpage = get_new_page(page, private, &result);
        if (!newpage)
                return -ENOMEM;
 
@@ -949,11 +943,17 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
                goto out;
        }
 
-       if (unlikely(PageTransHuge(page)))
-               if (unlikely(split_huge_page(page)))
+       if (unlikely(PageTransHuge(page))) {
+               lock_page(page);
+               rc = split_huge_page(page);
+               unlock_page(page);
+               if (rc)
                        goto out;
+       }
 
        rc = __unmap_and_move(page, newpage, force, mode);
+       if (rc == MIGRATEPAGE_SUCCESS)
+               put_new_page = NULL;
 
 out:
        if (rc != -EAGAIN) {
@@ -980,10 +980,9 @@ out:
         * it.  Otherwise, putback_lru_page() will drop the reference grabbed
         * during isolation.
         */
-       if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
-               ClearPageSwapBacked(newpage);
+       if (put_new_page)
                put_new_page(newpage, private);
-       else if (unlikely(__is_movable_balloon_page(newpage))) {
+       else if (unlikely(__is_movable_balloon_page(newpage))) {
                /* drop our reference, page already in the balloon */
                put_page(newpage);
        } else
@@ -1021,7 +1020,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
                                struct page *hpage, int force,
                                enum migrate_mode mode)
 {
-       int rc = 0;
+       int rc = -EAGAIN;
        int *result = NULL;
        int page_was_mapped = 0;
        struct page *new_hpage;
@@ -1043,8 +1042,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        if (!new_hpage)
                return -ENOMEM;
 
-       rc = -EAGAIN;
-
        if (!trylock_page(hpage)) {
                if (!force || mode != MIGRATE_SYNC)
                        goto out;
@@ -1054,6 +1051,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        if (PageAnon(hpage))
                anon_vma = page_get_anon_vma(hpage);
 
+       if (unlikely(!trylock_page(new_hpage)))
+               goto put_anon;
+
        if (page_mapped(hpage)) {
                try_to_unmap(hpage,
                        TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
@@ -1061,16 +1061,22 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        }
 
        if (!page_mapped(hpage))
-               rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
+               rc = move_to_new_page(new_hpage, hpage, mode);
+
+       if (page_was_mapped)
+               remove_migration_ptes(hpage,
+                       rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
 
-       if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
-               remove_migration_ptes(hpage, hpage);
+       unlock_page(new_hpage);
 
+put_anon:
        if (anon_vma)
                put_anon_vma(anon_vma);
 
-       if (rc == MIGRATEPAGE_SUCCESS)
+       if (rc == MIGRATEPAGE_SUCCESS) {
                hugetlb_cgroup_migrate(hpage, new_hpage);
+               put_new_page = NULL;
+       }
 
        unlock_page(hpage);
 out:
@@ -1082,7 +1088,7 @@ out:
         * it.  Otherwise, put_page() will drop the reference grabbed during
         * isolation.
         */
-       if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+       if (put_new_page)
                put_new_page(new_hpage, private);
        else
                putback_active_hugepage(new_hpage);
@@ -1112,7 +1118,7 @@ out:
  *
  * The function returns after 10 attempts or if no pages are movable any more
  * because the list has become empty or no retryable pages exist any more.
- * The caller should call putback_lru_pages() to return pages to the LRU
+ * The caller should call putback_movable_pages() to return pages to the LRU
  * or free list only if ret != 0.
  *
  * Returns the number of pages that were not migrated, or an error code.
@@ -1169,7 +1175,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
                        }
                }
        }
-       rc = nr_failed + retry;
+       nr_failed += retry;
+       rc = nr_failed;
 out:
        if (nr_succeeded)
                count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
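
Note: the comment fix above matters for callers. migrate_pages() returns the number of pages that were not migrated (now nr_failed including the pages still pending retry) or a negative errno, and it does not put the source pages back itself. A typical call-site shape (illustrative; alloc_target_page and private stand in for the caller's new_page_t callback and its argument):

	err = migrate_pages(&pagelist, alloc_target_page, NULL, private,
			    MIGRATE_SYNC, MR_SYSCALL);
	if (err) {
		/* Pages left on the list were not migrated. */
		putback_movable_pages(&pagelist);
	}
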
@@ -1575,7 +1582,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
                                         (GFP_HIGHUSER_MOVABLE |
                                          __GFP_THISNODE | __GFP_NOMEMALLOC |
                                          __GFP_NORETRY | __GFP_NOWARN) &
-                                        ~GFP_IOFS, 0);
+                                        ~(__GFP_IO | __GFP_FS), 0);
 
        return newpage;
 }
@@ -1749,10 +1756,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                goto out_dropref;
 
        new_page = alloc_pages_node(node,
-               (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+               (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
                HPAGE_PMD_ORDER);
        if (!new_page)
                goto out_fail;
+       prep_transhuge_page(new_page);
 
        isolated = numamigrate_isolate_page(pgdat, page);
        if (!isolated) {
@@ -1764,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                flush_tlb_range(vma, mmun_start, mmun_end);
 
        /* Prepare a page as a migration target */
-       __set_page_locked(new_page);
+       __SetPageLocked(new_page);
        SetPageSwapBacked(new_page);
 
        /* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1786,7 +1794,6 @@ fail_putback:
                        SetPageActive(page);
                if (TestClearPageUnevictable(new_page))
                        SetPageUnevictable(page);
-               mlock_migrate_page(page, new_page);
 
                unlock_page(new_page);
                put_page(new_page);             /* Free it */
@@ -1813,7 +1820,7 @@ fail_putback:
         * guarantee the copy is visible before the pagetable update.
         */
        flush_cache_range(vma, mmun_start, mmun_end);
-       page_add_anon_rmap(new_page, vma, mmun_start);
+       page_add_anon_rmap(new_page, vma, mmun_start, true);
        pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
        set_pmd_at(mm, mmun_start, pmd, entry);
        flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1824,13 +1831,14 @@ fail_putback:
                flush_tlb_range(vma, mmun_start, mmun_end);
                mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
                update_mmu_cache_pmd(vma, address, &entry);
-               page_remove_rmap(new_page);
+               page_remove_rmap(new_page, true);
                goto fail_putback;
        }
 
-       mem_cgroup_migrate(page, new_page, false);
-
-       page_remove_rmap(page);
+       mlock_migrate_page(new_page, page);
+       set_page_memcg(new_page, page_memcg(page));
+       set_page_memcg(page, NULL);
+       page_remove_rmap(page, true);
 
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
index be25efde64a4098c11eb126922a0d20b04bbadce..2a565ed8bb4907398a0d3d2619fd4939df777970 100644 (file)
@@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        unsigned char *vec = walk->private;
        int nr = (end - addr) >> PAGE_SHIFT;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                memset(vec, 1, nr);
                spin_unlock(ptl);
                goto out;
@@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 
        /* This also avoids any overflows on PAGE_CACHE_ALIGN */
        pages = len >> PAGE_SHIFT;
-       pages += (len & ~PAGE_MASK) != 0;
+       pages += (offset_in_page(len)) != 0;
 
        if (!access_ok(VERIFY_WRITE, vec, pages))
                return -EFAULT;
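
Note: the open-coded `& ~PAGE_MASK` alignment tests in mincore() and throughout the mmap hunks below become offset_in_page(). The helper is not defined in these hunks; it is assumed to be the usual one-liner from <linux/mm.h>:

	#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
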
index 25936680064fd433d4cf693c88c3195bdab4bb98..0147b57f9704398f8f75dbb6d9c24ca5efedc6ce 100644 (file)
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
        /* Serialize with page migration */
        BUG_ON(!PageLocked(page));
 
+       VM_BUG_ON_PAGE(PageTail(page), page);
+       VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
        if (!TestSetPageMlocked(page)) {
                mod_zone_page_state(page_zone(page), NR_MLOCK,
                                    hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
        /* For try_to_munlock() and to serialize with page migration */
        BUG_ON(!PageLocked(page));
 
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
        /*
         * Serialize with any parallel __split_huge_page_refcount() which
         * might otherwise copy PageMlocked to part of the tail pages before
@@ -422,7 +427,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
 void munlock_vma_pages_range(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
 {
-       vma->vm_flags &= ~VM_LOCKED;
+       vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
 
        while (start < end) {
                struct page *page = NULL;
@@ -444,7 +449,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                                &page_mask);
 
                if (page && !IS_ERR(page)) {
-                       if (PageTransHuge(page)) {
+                       if (PageTransTail(page)) {
+                               VM_BUG_ON_PAGE(PageMlocked(page), page);
+                               put_page(page); /* follow_page_mask() */
+                       } else if (PageTransHuge(page)) {
                                lock_page(page);
                                /*
                                 * Any THP page found by follow_page_mask() may
@@ -477,8 +485,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                                goto next;
                        }
                }
-               /* It's a bug to munlock in the middle of a THP page */
-               VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
                page_increm = 1 + page_mask;
                start += page_increm * PAGE_SIZE;
 next:
@@ -506,7 +512,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
        if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
-               goto out;       /* don't set VM_LOCKED,  don't count */
+               /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
+               goto out;
 
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
@@ -554,13 +561,14 @@ out:
        return ret;
 }
 
-static int do_mlock(unsigned long start, size_t len, int on)
+static int apply_vma_lock_flags(unsigned long start, size_t len,
+                               vm_flags_t flags)
 {
        unsigned long nstart, end, tmp;
        struct vm_area_struct * vma, * prev;
        int error;
 
-       VM_BUG_ON(start & ~PAGE_MASK);
+       VM_BUG_ON(offset_in_page(start));
        VM_BUG_ON(len != PAGE_ALIGN(len));
        end = start + len;
        if (end < start)
@@ -576,14 +584,11 @@ static int do_mlock(unsigned long start, size_t len, int on)
                prev = vma;
 
        for (nstart = start ; ; ) {
-               vm_flags_t newflags;
+               vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
 
-               /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
-
-               newflags = vma->vm_flags & ~VM_LOCKED;
-               if (on)
-                       newflags |= VM_LOCKED;
+               newflags |= flags;
 
+               /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
@@ -605,7 +610,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
        return error;
 }
 
-SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
 {
        unsigned long locked;
        unsigned long lock_limit;
@@ -616,7 +621,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 
        lru_add_drain_all();    /* flush pagevec */
 
-       len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+       len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;
 
        lock_limit = rlimit(RLIMIT_MEMLOCK);
@@ -629,7 +634,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 
        /* check against resource limits */
        if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
-               error = do_mlock(start, len, 1);
+               error = apply_vma_lock_flags(start, len, flags);
 
        up_write(&current->mm->mmap_sem);
        if (error)
@@ -641,37 +646,75 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
        return 0;
 }
 
+SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
+{
+       return do_mlock(start, len, VM_LOCKED);
+}
+
+SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
+{
+       vm_flags_t vm_flags = VM_LOCKED;
+
+       if (flags & ~MLOCK_ONFAULT)
+               return -EINVAL;
+
+       if (flags & MLOCK_ONFAULT)
+               vm_flags |= VM_LOCKONFAULT;
+
+       return do_mlock(start, len, vm_flags);
+}
+
 SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
 {
        int ret;
 
-       len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
+       len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;
 
        down_write(&current->mm->mmap_sem);
-       ret = do_mlock(start, len, 0);
+       ret = apply_vma_lock_flags(start, len, 0);
        up_write(&current->mm->mmap_sem);
 
        return ret;
 }
 
-static int do_mlockall(int flags)
+/*
+ * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
+ * and translate into the appropriate modifications to mm->def_flags and/or the
+ * flags for all current VMAs.
+ *
+ * There are a couple of subtleties with this.  If mlockall() is called multiple
+ * times with different flags, the values do not necessarily stack.  If mlockall
+ * is called once including the MCL_FUTURE flag and then a second time without
+ * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
+ */
+static int apply_mlockall_flags(int flags)
 {
        struct vm_area_struct * vma, * prev = NULL;
+       vm_flags_t to_add = 0;
 
-       if (flags & MCL_FUTURE)
+       current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
+       if (flags & MCL_FUTURE) {
                current->mm->def_flags |= VM_LOCKED;
-       else
-               current->mm->def_flags &= ~VM_LOCKED;
-       if (flags == MCL_FUTURE)
-               goto out;
+
+               if (flags & MCL_ONFAULT)
+                       current->mm->def_flags |= VM_LOCKONFAULT;
+
+               if (!(flags & MCL_CURRENT))
+                       goto out;
+       }
+
+       if (flags & MCL_CURRENT) {
+               to_add |= VM_LOCKED;
+               if (flags & MCL_ONFAULT)
+                       to_add |= VM_LOCKONFAULT;
+       }
 
        for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
                vm_flags_t newflags;
 
-               newflags = vma->vm_flags & ~VM_LOCKED;
-               if (flags & MCL_CURRENT)
-                       newflags |= VM_LOCKED;
+               newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+               newflags |= to_add;
 
                /* Ignore errors */
                mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -684,14 +727,13 @@ out:
 SYSCALL_DEFINE1(mlockall, int, flags)
 {
        unsigned long lock_limit;
-       int ret = -EINVAL;
+       int ret;
 
-       if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
-               goto out;
+       if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
+               return -EINVAL;
 
-       ret = -EPERM;
        if (!can_do_mlock())
-               goto out;
+               return -EPERM;
 
        if (flags & MCL_CURRENT)
                lru_add_drain_all();    /* flush pagevec */
@@ -704,11 +746,11 @@ SYSCALL_DEFINE1(mlockall, int, flags)
 
        if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
            capable(CAP_IPC_LOCK))
-               ret = do_mlockall(flags);
+               ret = apply_mlockall_flags(flags);
        up_write(&current->mm->mmap_sem);
        if (!ret && (flags & MCL_CURRENT))
                mm_populate(0, TASK_SIZE);
-out:
+
        return ret;
 }
 
@@ -717,7 +759,7 @@ SYSCALL_DEFINE0(munlockall)
        int ret;
 
        down_write(&current->mm->mmap_sem);
-       ret = do_mlockall(0);
+       ret = apply_mlockall_flags(0);
        up_write(&current->mm->mmap_sem);
        return ret;
 }
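
Note on the userspace side of this file: mlock2() is a new syscall taking a flags argument (only MLOCK_ONFAULT is accepted), and mlockall() grows MCL_ONFAULT, meant to be combined with MCL_CURRENT and/or MCL_FUTURE. A hedged usage sketch; the MLOCK_ONFAULT/MCL_ONFAULT values and the use of the raw syscall (no libc wrapper assumed at this point) are assumptions:

	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef MLOCK_ONFAULT
	#define MLOCK_ONFAULT	0x01	/* assumed uapi value */
	#endif
	#ifndef MCL_ONFAULT
	#define MCL_ONFAULT	4	/* assumed uapi value, arch-dependent */
	#endif

	/* Reserve the range now, but only lock pages as they are faulted in. */
	syscall(__NR_mlock2, addr, length, MLOCK_ONFAULT);

	/* Same lock-on-fault policy for current and future mappings. */
	mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT);
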
index 79bcc9f92e482de9047c3927e068ff392db5c1bc..2ce04a649f6b4977e54b76a29be9d5bac5e71dab 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1302,7 +1302,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
         * that it represents a valid section of the address space.
         */
        addr = get_unmapped_area(file, addr, len, pgoff, flags);
-       if (addr & ~PAGE_MASK)
+       if (offset_in_page(addr))
                return addr;
 
        /* Do simple checking here so the lower-level routines won't have
@@ -1412,13 +1412,13 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, fd, unsigned long, pgoff)
 {
        struct file *file = NULL;
-       unsigned long retval = -EBADF;
+       unsigned long retval;
 
        if (!(flags & MAP_ANONYMOUS)) {
                audit_mmap_fd(fd, flags);
                file = fget(fd);
                if (!file)
-                       goto out;
+                       return -EBADF;
                if (is_file_hugepages(file))
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
                retval = -EINVAL;
@@ -1453,7 +1453,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 out_fput:
        if (file)
                fput(file);
-out:
        return retval;
 }
 
@@ -1473,7 +1472,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
 
        if (copy_from_user(&a, arg, sizeof(a)))
                return -EFAULT;
-       if (a.offset & ~PAGE_MASK)
+       if (offset_in_page(a.offset))
                return -EINVAL;
 
        return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
@@ -1562,7 +1561,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        }
 
        /* Clear old maps */
-       error = -ENOMEM;
        while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
                              &rb_parent)) {
                if (do_munmap(mm, addr, len))
@@ -1663,7 +1661,7 @@ out:
                                        vma == get_gate_vma(current->mm)))
                        mm->locked_vm += (len >> PAGE_SHIFT);
                else
-                       vma->vm_flags &= ~VM_LOCKED;
+                       vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
        }
 
        if (file)
@@ -1989,7 +1987,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
         * can happen with large stack limits and large mmap()
         * allocations.
         */
-       if (addr & ~PAGE_MASK) {
+       if (offset_in_page(addr)) {
                VM_BUG_ON(addr != -ENOMEM);
                info.flags = 0;
                info.low_limit = TASK_UNMAPPED_BASE;
@@ -2025,7 +2023,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 
        if (addr > TASK_SIZE - len)
                return -ENOMEM;
-       if (addr & ~PAGE_MASK)
+       if (offset_in_page(addr))
                return -EINVAL;
 
        addr = arch_rebalance_pgtables(addr, len);
@@ -2047,7 +2045,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
                return vma;
 
        rb_node = mm->mm_rb.rb_node;
-       vma = NULL;
 
        while (rb_node) {
                struct vm_area_struct *tmp;
@@ -2139,10 +2136,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
        if (security_vm_enough_memory_mm(mm, grow))
                return -ENOMEM;
 
-       /* Ok, everything looks good - let it rip */
-       if (vma->vm_flags & VM_LOCKED)
-               mm->locked_vm += grow;
-       vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
        return 0;
 }
 
@@ -2153,6 +2146,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
  */
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
+       struct mm_struct *mm = vma->vm_mm;
        int error;
 
        if (!(vma->vm_flags & VM_GROWSUP))
@@ -2202,15 +2196,19 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
                                 * So, we reuse mm->page_table_lock to guard
                                 * against concurrent vma expansions.
                                 */
-                               spin_lock(&vma->vm_mm->page_table_lock);
+                               spin_lock(&mm->page_table_lock);
+                               if (vma->vm_flags & VM_LOCKED)
+                                       mm->locked_vm += grow;
+                               vm_stat_account(mm, vma->vm_flags,
+                                               vma->vm_file, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
                                anon_vma_interval_tree_post_update_vma(vma);
                                if (vma->vm_next)
                                        vma_gap_update(vma->vm_next);
                                else
-                                       vma->vm_mm->highest_vm_end = address;
-                               spin_unlock(&vma->vm_mm->page_table_lock);
+                                       mm->highest_vm_end = address;
+                               spin_unlock(&mm->page_table_lock);
 
                                perf_event_mmap(vma);
                        }
@@ -2218,7 +2216,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
        }
        vma_unlock_anon_vma(vma);
        khugepaged_enter_vma_merge(vma, vma->vm_flags);
-       validate_mm(vma->vm_mm);
+       validate_mm(mm);
        return error;
 }
 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2229,6 +2227,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 int expand_downwards(struct vm_area_struct *vma,
                                   unsigned long address)
 {
+       struct mm_struct *mm = vma->vm_mm;
        int error;
 
        /*
@@ -2273,13 +2272,17 @@ int expand_downwards(struct vm_area_struct *vma,
                                 * So, we reuse mm->page_table_lock to guard
                                 * against concurrent vma expansions.
                                 */
-                               spin_lock(&vma->vm_mm->page_table_lock);
+                               spin_lock(&mm->page_table_lock);
+                               if (vma->vm_flags & VM_LOCKED)
+                                       mm->locked_vm += grow;
+                               vm_stat_account(mm, vma->vm_flags,
+                                               vma->vm_file, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
                                anon_vma_interval_tree_post_update_vma(vma);
                                vma_gap_update(vma);
-                               spin_unlock(&vma->vm_mm->page_table_lock);
+                               spin_unlock(&mm->page_table_lock);
 
                                perf_event_mmap(vma);
                        }
@@ -2287,7 +2290,7 @@ int expand_downwards(struct vm_area_struct *vma,
        }
        vma_unlock_anon_vma(vma);
        khugepaged_enter_vma_merge(vma, vma->vm_flags);
-       validate_mm(vma->vm_mm);
+       validate_mm(mm);
        return error;
 }
 
@@ -2536,7 +2539,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
        unsigned long end;
        struct vm_area_struct *vma, *prev, *last;
 
-       if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+       if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
                return -EINVAL;
 
        len = PAGE_ALIGN(len);
@@ -2734,7 +2737,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
        flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 
        error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
-       if (error & ~PAGE_MASK)
+       if (offset_in_page(error))
                return error;
 
        error = mlock_future_check(mm, mm->def_flags, len);
@@ -3049,8 +3052,8 @@ static int special_mapping_fault(struct vm_area_struct *vma,
 static struct vm_area_struct *__install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
-       unsigned long vm_flags, const struct vm_operations_struct *ops,
-       void *priv)
+       unsigned long vm_flags, void *priv,
+       const struct vm_operations_struct *ops)
 {
        int ret;
        struct vm_area_struct *vma;
@@ -3099,8 +3102,8 @@ struct vm_area_struct *_install_special_mapping(
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, const struct vm_special_mapping *spec)
 {
-       return __install_special_mapping(mm, addr, len, vm_flags,
-                                        &special_mapping_vmops, (void *)spec);
+       return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
+                                       &special_mapping_vmops);
 }
 
 int install_special_mapping(struct mm_struct *mm,
@@ -3108,8 +3111,8 @@ int install_special_mapping(struct mm_struct *mm,
                            unsigned long vm_flags, struct page **pages)
 {
        struct vm_area_struct *vma = __install_special_mapping(
-               mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
-               (void *)pages);
+               mm, addr, len, vm_flags, (void *)pages,
+               &legacy_special_mapping_vmops);
 
        return PTR_ERR_OR_ZERO(vma);
 }
index ef5be8eaab001792b469fac1bd5b43cb139d1b0b..9c1445dc8a4c7415d9b785eda3b83d85510f85a4 100644 (file)
@@ -160,7 +160,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 
                if (pmd_trans_huge(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE)
-                               split_huge_page_pmd(vma, addr, pmd);
+                               split_huge_pmd(vma, pmd, addr);
                        else {
                                int nr_ptes = change_huge_pmd(vma, pmd, addr,
                                                newprot, prot_numa);
index 5a71cce8c6ea8cd679dad306bd3ee655ab370a47..9acf51a4f6828d54fdbf076d764298fef817ea2b 100644 (file)
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                if (!new_pmd)
                        break;
                if (pmd_trans_huge(*old_pmd)) {
-                       int err = 0;
                        if (extent == HPAGE_PMD_SIZE) {
+                               bool moved;
                                VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
                                              vma);
                                /* See comment in move_ptes() */
                                if (need_rmap_locks)
                                        anon_vma_lock_write(vma->anon_vma);
-                               err = move_huge_pmd(vma, new_vma, old_addr,
+                               moved = move_huge_pmd(vma, new_vma, old_addr,
                                                    new_addr, old_end,
                                                    old_pmd, new_pmd);
                                if (need_rmap_locks)
                                        anon_vma_unlock_write(vma->anon_vma);
+                               if (moved) {
+                                       need_flush = true;
+                                       continue;
+                               }
                        }
-                       if (err > 0) {
-                               need_flush = true;
-                               continue;
-                       } else if (!err) {
-                               split_huge_page_pmd(vma, old_addr, old_pmd);
-                       }
+                       split_huge_pmd(vma, old_pmd, old_addr);
                        VM_BUG_ON(pmd_trans_huge(*old_pmd));
                }
                if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
@@ -401,7 +400,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        unsigned long charged = 0;
        unsigned long map_flags;
 
-       if (new_addr & ~PAGE_MASK)
+       if (offset_in_page(new_addr))
                goto out;
 
        if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
@@ -435,11 +434,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
                                ((addr - vma->vm_start) >> PAGE_SHIFT),
                                map_flags);
-       if (ret & ~PAGE_MASK)
+       if (offset_in_page(ret))
                goto out1;
 
        ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
-       if (!(ret & ~PAGE_MASK))
+       if (!(offset_in_page(ret)))
                goto out;
 out1:
        vm_unacct_memory(charged);
@@ -484,7 +483,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
                return ret;
 
-       if (addr & ~PAGE_MASK)
+       if (offset_in_page(addr))
                return ret;
 
        old_len = PAGE_ALIGN(old_len);
@@ -566,7 +565,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                                        vma->vm_pgoff +
                                        ((addr - vma->vm_start) >> PAGE_SHIFT),
                                        map_flags);
-               if (new_addr & ~PAGE_MASK) {
+               if (offset_in_page(new_addr)) {
                        ret = new_addr;
                        goto out;
                }
@@ -574,7 +573,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
        }
 out:
-       if (ret & ~PAGE_MASK) {
+       if (offset_in_page(ret)) {
                vm_unacct_memory(charged);
                locked = 0;
        }
index bb04d53ae8529597d3b0ca6636c49d5243762875..24e612fefa04dc13eae85af00164d674c0b91940 100644 (file)
@@ -38,7 +38,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 
        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
                goto out;
-       if (start & ~PAGE_MASK)
+       if (offset_in_page(start))
                goto out;
        if ((flags & MS_ASYNC) && (flags & MS_SYNC))
                goto out;
index ab14a2014dea76b62e77b0176d810037e1c76788..92be862c859bdce2b149e86760522ccb6d230524 100644 (file)
@@ -578,16 +578,16 @@ static noinline void validate_nommu_regions(void)
                return;
 
        last = rb_entry(lastp, struct vm_region, vm_rb);
-       BUG_ON(unlikely(last->vm_end <= last->vm_start));
-       BUG_ON(unlikely(last->vm_top < last->vm_end));
+       BUG_ON(last->vm_end <= last->vm_start);
+       BUG_ON(last->vm_top < last->vm_end);
 
        while ((p = rb_next(lastp))) {
                region = rb_entry(p, struct vm_region, vm_rb);
                last = rb_entry(lastp, struct vm_region, vm_rb);
 
-               BUG_ON(unlikely(region->vm_end <= region->vm_start));
-               BUG_ON(unlikely(region->vm_top < region->vm_end));
-               BUG_ON(unlikely(region->vm_start < last->vm_top));
+               BUG_ON(region->vm_end <= region->vm_start);
+               BUG_ON(region->vm_top < region->vm_end);
+               BUG_ON(region->vm_start < last->vm_top);
 
                lastp = p;
        }
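
The unlikely() wrappers removed above were redundant rather than functional: with the generic definition, BUG_ON() already annotates its condition as unlikely, so wrapping the condition a second time changes nothing. Roughly, assuming the asm-generic form (architecture-specific overrides exist):

	/* asm-generic/bug.h style definition (sketch) */
	#define BUG_ON(condition)	do { if (unlikely(condition)) BUG(); } while (0)
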
@@ -1497,7 +1497,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
 
        if (copy_from_user(&a, arg, sizeof(a)))
                return -EFAULT;
-       if (a.offset & ~PAGE_MASK)
+       if (offset_in_page(a.offset))
                return -EINVAL;
 
        return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
@@ -1653,9 +1653,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
                        goto erase_whole_vma;
                if (start < vma->vm_start || end > vma->vm_end)
                        return -EINVAL;
-               if (start & ~PAGE_MASK)
+               if (offset_in_page(start))
                        return -EINVAL;
-               if (end != vma->vm_end && end & ~PAGE_MASK)
+               if (end != vma->vm_end && offset_in_page(end))
                        return -EINVAL;
                if (start != vma->vm_start && end != vma->vm_end) {
                        ret = split_vma(mm, vma, start, 1);
@@ -1736,7 +1736,7 @@ static unsigned long do_mremap(unsigned long addr,
        if (old_len == 0 || new_len == 0)
                return (unsigned long) -EINVAL;
 
-       if (addr & ~PAGE_MASK)
+       if (offset_in_page(addr))
                return -EINVAL;
 
        if (flags & MREMAP_FIXED && new_addr != addr)
index 1ecc0bcaecc518458765347f8b0fa5d5eed46f75..d13a33918fa23e685bc629bc23f58d97729beb62 100644 (file)
@@ -118,6 +118,15 @@ found:
        return t;
 }
 
+/*
+ * order == -1 means the oom kill was requested via sysrq; any other value
+ * of oc->order is only used for display purposes.
+ */
+static inline bool is_sysrq_oom(struct oom_control *oc)
+{
+       return oc->order == -1;
+}
+
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
                struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -265,7 +274,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
         * Don't allow any other task to have access to the reserves.
         */
        if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-               if (oc->order != -1)
+               if (!is_sysrq_oom(oc))
                        return OOM_SCAN_ABORT;
        }
        if (!task->mm)
@@ -278,7 +287,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
        if (oom_task_origin(task))
                return OOM_SCAN_SELECT;
 
-       if (task_will_free_mem(task) && oc->order != -1)
+       if (task_will_free_mem(task) && !is_sysrq_oom(oc))
                return OOM_SCAN_ABORT;
 
        return OOM_SCAN_OK;
@@ -377,13 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 static void dump_header(struct oom_control *oc, struct task_struct *p,
                        struct mem_cgroup *memcg)
 {
-       task_lock(current);
        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
                "oom_score_adj=%hd\n",
                current->comm, oc->gfp_mask, oc->order,
                current->signal->oom_score_adj);
-       cpuset_print_task_mems_allowed(current);
-       task_unlock(current);
+       cpuset_print_current_mems_allowed();
        dump_stack();
        if (memcg)
                mem_cgroup_print_oom_info(memcg, p);
@@ -476,6 +483,24 @@ void oom_killer_enable(void)
        oom_killer_disabled = false;
 }
 
+/*
+ * task->mm can be NULL if the task is the exited group leader.  So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+       struct task_struct *t;
+
+       for_each_thread(p, t) {
+               struct mm_struct *t_mm = READ_ONCE(t->mm);
+               if (t_mm)
+                       return t_mm == mm;
+       }
+       return false;
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /*
  * Must be called while holding a reference to p, which will be released upon
@@ -509,10 +534,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
        if (__ratelimit(&oom_rs))
                dump_header(oc, p, memcg);
 
-       task_lock(p);
        pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
                message, task_pid_nr(p), p->comm, points);
-       task_unlock(p);
 
        /*
         * If any of p's children has a different mm and is eligible for kill,
@@ -525,7 +548,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                list_for_each_entry(child, &t->children, sibling) {
                        unsigned int child_points;
 
-                       if (child->mm == p->mm)
+                       if (process_shares_mm(child, p->mm))
                                continue;
                        /*
                         * oom_badness() returns 0 if the thread is unkillable
@@ -552,8 +575,15 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                victim = p;
        }
 
-       /* mm cannot safely be dereferenced after task_unlock(victim) */
+       /* Get a reference to safely compare mm after task_unlock(victim) */
        mm = victim->mm;
+       atomic_inc(&mm->mm_count);
+       /*
+        * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
+        * the OOM victim from depleting the memory reserves from the user
+        * space under its control.
+        */
+       do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
        mark_oom_victim(victim);
        pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
                task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
@@ -571,21 +601,21 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
         * pending fatal signal.
         */
        rcu_read_lock();
-       for_each_process(p)
-               if (p->mm == mm && !same_thread_group(p, victim) &&
-                   !(p->flags & PF_KTHREAD)) {
-                       if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-                               continue;
+       for_each_process(p) {
+               if (!process_shares_mm(p, mm))
+                       continue;
+               if (same_thread_group(p, victim))
+                       continue;
+               if (unlikely(p->flags & PF_KTHREAD))
+                       continue;
+               if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+                       continue;
 
-                       task_lock(p);   /* Protect ->comm from prctl() */
-                       pr_err("Kill process %d (%s) sharing same memory\n",
-                               task_pid_nr(p), p->comm);
-                       task_unlock(p);
-                       do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
-               }
+               do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+       }
        rcu_read_unlock();
 
-       do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+       mmdrop(mm);
        put_task_struct(victim);
 }
 #undef K
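
The rework above also changes how the victim's mm stays valid for the later comparisons: instead of holding task_lock(), the code pins the mm_struct with an mm_count reference so the pointer can still be compared against other tasks after the victim has been unlocked and may have exited. The pattern in isolation (a sketch using only the helpers visible in this hunk):

	struct mm_struct *mm = victim->mm;

	atomic_inc(&mm->mm_count);	/* pin the mm_struct itself, not the address space */
	/* ... send SIGKILL, mark the OOM victim, walk other processes comparing 'mm' ... */
	mmdrop(mm);			/* drop the pin once the comparisons are done */
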
@@ -608,7 +638,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
                        return;
        }
        /* Do not panic for oom kills triggered by sysrq */
-       if (oc->order == -1)
+       if (is_sysrq_oom(oc))
                return;
        dump_header(oc, NULL, memcg);
        panic("Out of memory: %s panic_on_oom is enabled\n",
@@ -688,7 +718,7 @@ bool out_of_memory(struct oom_control *oc)
 
        p = select_bad_process(oc, &points, totalpages);
        /* Found nothing?!?! Either we hang forever, or we panic. */
-       if (!p && oc->order != -1) {
+       if (!p && !is_sysrq_oom(oc)) {
                dump_header(oc, NULL, NULL);
                panic("Out of memory and no killable processes...\n");
        }
index 805bbad2e24e1a84b383ebc90fc825527238957a..d0499fff8c7fb1ee2f33a34bd9e424420939a654 100644 (file)
@@ -169,19 +169,19 @@ void pm_restrict_gfp_mask(void)
        WARN_ON(!mutex_is_locked(&pm_mutex));
        WARN_ON(saved_gfp_mask);
        saved_gfp_mask = gfp_allowed_mask;
-       gfp_allowed_mask &= ~GFP_IOFS;
+       gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
 }
 
 bool pm_suspended_storage(void)
 {
-       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+       if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                return false;
        return true;
 }
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-int pageblock_order __read_mostly;
+unsigned int pageblock_order __read_mostly;
 #endif
 
 static void __free_pages_ok(struct page *page, unsigned int order);
@@ -229,6 +229,17 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
+compound_page_dtor * const compound_page_dtors[] = {
+       NULL,
+       free_compound_page,
+#ifdef CONFIG_HUGETLB_PAGE
+       free_huge_page,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       free_transhuge_page,
+#endif
+};
+
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
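
With destructors gathered into compound_page_dtors[], a compound page records a small index (COMPOUND_PAGE_DTOR, and so on) instead of a raw function pointer. A rough sketch of how the index is stored in the first tail page and resolved later (field names and layout are assumptions based on this series, not verbatim kernel code):

	static inline void set_compound_page_dtor(struct page *page,
						  enum compound_dtor_id compound_dtor)
	{
		page[1].compound_dtor = compound_dtor;	/* index into compound_page_dtors[] */
	}

	static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
	{
		return compound_page_dtors[page[1].compound_dtor];
	}
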
 
@@ -436,39 +447,38 @@ out:
 /*
  * Higher-order pages are called "compound pages".  They are structured thusly:
  *
- * The first PAGE_SIZE page is called the "head page".
+ * The first PAGE_SIZE page is called the "head page" and has PG_head set.
  *
- * The remaining PAGE_SIZE pages are called "tail pages".
+ * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
+ * in bit 0 of page->compound_head; the remaining bits point to the head page.
  *
- * All pages have PG_compound set.  All tail pages have their ->first_page
- * pointing at the head page.
+ * The first tail page's ->compound_dtor holds the offset into the array of
+ * compound page destructors (see compound_page_dtors).
  *
- * The first tail page's ->lru.next holds the address of the compound page's
- * put_page() function.  Its ->lru.prev holds the order of allocation.
+ * The first tail page's ->compound_order holds the order of allocation.
  * This usage means that zero-order pages may not be compound.
  */
 
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
 {
        __free_pages_ok(page, compound_order(page));
 }
 
-void prep_compound_page(struct page *page, unsigned long order)
+void prep_compound_page(struct page *page, unsigned int order)
 {
        int i;
        int nr_pages = 1 << order;
 
-       set_compound_page_dtor(page, free_compound_page);
+       set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
        set_compound_order(page, order);
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
                set_page_count(p, 0);
-               p->first_page = page;
-               /* Make sure p->first_page is always valid for PageTail() */
-               smp_wmb();
-               __SetPageTail(p);
+               p->mapping = TAIL_MAPPING;
+               set_compound_head(p, page);
        }
+       atomic_set(compound_mapcount_ptr(page), -1);
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
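
The rewritten comment above describes tail pages through the low bit of page->compound_head. The encode/decode side is expected to look roughly like this (a sketch of the intended encoding, not necessarily the exact helpers):

	/* Bit 0 tags a tail page; the remaining bits point at the head page. */
	static inline void set_compound_head(struct page *page, struct page *head)
	{
		WRITE_ONCE(page->compound_head, (unsigned long)head + 1);
	}

	static inline int PageTail(struct page *page)
	{
		return READ_ONCE(page->compound_head) & 1;
	}

	static inline struct page *compound_head(struct page *page)
	{
		unsigned long head = READ_ONCE(page->compound_head);

		if (unlikely(head & 1))
			return (struct page *)(head - 1);
		return page;
	}
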
@@ -656,7 +666,7 @@ static inline void __free_one_page(struct page *page,
        unsigned long combined_idx;
        unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;
-       int max_order = MAX_ORDER;
+       unsigned int max_order = MAX_ORDER;
 
        VM_BUG_ON(!zone_is_initialized(zone));
        VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -669,7 +679,7 @@ static inline void __free_one_page(struct page *page,
                 * pageblock. Without this, pageblock isolation
                 * could cause incorrect freepage accounting.
                 */
-               max_order = min(MAX_ORDER, pageblock_order + 1);
+               max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
        } else {
                __mod_zone_freepage_state(zone, 1 << order, migratetype);
        }
@@ -733,7 +743,7 @@ static inline int free_pages_check(struct page *page)
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
 
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@ -817,7 +827,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        if (unlikely(has_isolate_pageblock(zone)))
                                mt = get_pageblock_migratetype(page);
 
-                       /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
                        __free_one_page(page, page_to_pfn(page), zone, 0, mt);
                        trace_mm_page_pcpu_drain(page, 0, mt);
                } while (--to_free && --batch_free && !list_empty(list));
@@ -846,17 +855,52 @@ static void free_one_page(struct zone *zone,
 
 static int free_tail_pages_check(struct page *head_page, struct page *page)
 {
-       if (!IS_ENABLED(CONFIG_DEBUG_VM))
-               return 0;
+       int ret = 1;
+
+       /*
+        * We rely on page->lru.next never having bit 0 set, unless the page
+        * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+        */
+       BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+       if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+               ret = 0;
+               goto out;
+       }
+       switch (page - head_page) {
+       case 1:
+               /* the first tail page: ->mapping is compound_mapcount() */
+               if (unlikely(compound_mapcount(page))) {
+                       bad_page(page, "nonzero compound_mapcount", 0);
+                       goto out;
+               }
+               break;
+       case 2:
+               /*
+                * the second tail page: ->mapping is
+                * page_deferred_list().next -- ignore value.
+                */
+               break;
+       default:
+               if (page->mapping != TAIL_MAPPING) {
+                       bad_page(page, "corrupted mapping in tail page", 0);
+                       goto out;
+               }
+               break;
+       }
        if (unlikely(!PageTail(page))) {
                bad_page(page, "PageTail not set", 0);
-               return 1;
+               goto out;
        }
-       if (unlikely(page->first_page != head_page)) {
-               bad_page(page, "first_page not consistent", 0);
-               return 1;
+       if (unlikely(compound_head(page) != head_page)) {
+               bad_page(page, "compound_head not consistent", 0);
+               goto out;
        }
-       return 0;
+       ret = 0;
+out:
+       page->mapping = NULL;
+       clear_compound_head(page);
+       return ret;
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -923,6 +967,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
                        struct page *page = pfn_to_page(start_pfn);
 
                        init_reserved_page(start_pfn);
+
+                       /* Avoid false-positive PageTail() */
+                       INIT_LIST_HEAD(&page->lru);
+
                        SetPageReserved(page);
                }
        }
@@ -1314,7 +1362,7 @@ static inline int check_new_page(struct page *page)
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
 
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@ -1417,15 +1465,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  * the free lists for the desirable migrate type are depleted
  */
 static int fallbacks[MIGRATE_TYPES][4] = {
-       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
-       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
-       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
+       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
+       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
+       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
 #ifdef CONFIG_CMA
-       [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
+       [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
 #endif
-       [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
 #ifdef CONFIG_MEMORY_ISOLATION
-       [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
+       [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
 #endif
 };
 
@@ -1450,7 +1497,7 @@ int move_freepages(struct zone *zone,
                          int migratetype)
 {
        struct page *page;
-       unsigned long order;
+       unsigned int order;
        int pages_moved = 0;
 
 #ifndef CONFIG_HOLES_IN_ZONE
@@ -1563,7 +1610,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
                                                          int start_type)
 {
-       int current_order = page_order(page);
+       unsigned int current_order = page_order(page);
        int pages;
 
        /* Take ownership for orders >= pageblock_order */
@@ -1598,7 +1645,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
        *can_steal = false;
        for (i = 0;; i++) {
                fallback_mt = fallbacks[migratetype][i];
-               if (fallback_mt == MIGRATE_RESERVE)
+               if (fallback_mt == MIGRATE_TYPES)
                        break;
 
                if (list_empty(&area->free_list[fallback_mt]))
@@ -1617,6 +1664,101 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
        return -1;
 }
 
+/*
+ * Reserve a pageblock for exclusive use of high-order atomic allocations if
+ * there are no empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
+                               unsigned int alloc_order)
+{
+       int mt;
+       unsigned long max_managed, flags;
+
+       /*
+        * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+        * Check is race-prone but harmless.
+        */
+       max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+       if (zone->nr_reserved_highatomic >= max_managed)
+               return;
+
+       spin_lock_irqsave(&zone->lock, flags);
+
+       /* Recheck the nr_reserved_highatomic limit under the lock */
+       if (zone->nr_reserved_highatomic >= max_managed)
+               goto out_unlock;
+
+       /* Yoink! */
+       mt = get_pageblock_migratetype(page);
+       if (mt != MIGRATE_HIGHATOMIC &&
+                       !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+               zone->nr_reserved_highatomic += pageblock_nr_pages;
+               set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+               move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+       }
+
+out_unlock:
+       spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ */
+static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+{
+       struct zonelist *zonelist = ac->zonelist;
+       unsigned long flags;
+       struct zoneref *z;
+       struct zone *zone;
+       struct page *page;
+       int order;
+
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+                                                               ac->nodemask) {
+               /* Preserve at least one pageblock */
+               if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+                       continue;
+
+               spin_lock_irqsave(&zone->lock, flags);
+               for (order = 0; order < MAX_ORDER; order++) {
+                       struct free_area *area = &(zone->free_area[order]);
+
+                       if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+                               continue;
+
+                       page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
+                                               struct page, lru);
+
+                       /*
+                        * It should never happen but changes to locking could
+                        * inadvertently allow a per-cpu drain to add pages
+                        * to MIGRATE_HIGHATOMIC while unreserving, so be safe
+                        * and watch for underflows.
+                        */
+                       zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
+                               zone->nr_reserved_highatomic);
+
+                       /*
+                        * Convert to ac->migratetype and avoid the normal
+                        * pageblock stealing heuristics. Minimally, the caller
+                        * is doing the work and needs the pages. More
+                        * importantly, if the block was always converted to
+                        * MIGRATE_UNMOVABLE or another type then the number
+                        * of pageblocks that cannot be completely freed
+                        * may increase.
+                        */
+                       set_pageblock_migratetype(page, ac->migratetype);
+                       move_freepages_block(zone, page, ac->migratetype);
+                       spin_unlock_irqrestore(&zone->lock, flags);
+                       return;
+               }
+               spin_unlock_irqrestore(&zone->lock, flags);
+       }
+}
+
 /* Remove an element from the buddy allocator from the fallback list */
 static inline struct page *
 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
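
To make the "1 pageblock or roughly 1% of a zone" cap concrete, a worked example with assumed numbers:

	/*
	 * Assumed: x86-64, 4KB pages, 2MB pageblocks (512 pages), and a zone
	 * with zone->managed_pages = 1048576 (about 4GB).
	 *
	 *   max_managed = 1048576 / 100 + 512 = 10485 + 512 = 10997 pages
	 *
	 * So at most ~43MB of this zone can be flagged MIGRATE_HIGHATOMIC, and
	 * the reservation grows one pageblock (512 pages here) at a time.
	 */
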
@@ -1672,29 +1814,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
  * Call me with the zone->lock already held.
  */
 static struct page *__rmqueue(struct zone *zone, unsigned int order,
-                                               int migratetype)
+                               int migratetype, gfp_t gfp_flags)
 {
        struct page *page;
 
-retry_reserve:
        page = __rmqueue_smallest(zone, order, migratetype);
-
-       if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
+       if (unlikely(!page)) {
                if (migratetype == MIGRATE_MOVABLE)
                        page = __rmqueue_cma_fallback(zone, order);
 
                if (!page)
                        page = __rmqueue_fallback(zone, order, migratetype);
-
-               /*
-                * Use MIGRATE_RESERVE rather than fail an allocation. goto
-                * is used because __rmqueue_smallest is an inline function
-                * and we want just one call site
-                */
-               if (!page) {
-                       migratetype = MIGRATE_RESERVE;
-                       goto retry_reserve;
-               }
        }
 
        trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -1714,7 +1844,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
-               struct page *page = __rmqueue(zone, order, migratetype);
+               struct page *page = __rmqueue(zone, order, migratetype, 0);
                if (unlikely(page == NULL))
                        break;
 
@@ -2086,7 +2216,7 @@ int split_free_page(struct page *page)
 static inline
 struct page *buffered_rmqueue(struct zone *preferred_zone,
                        struct zone *zone, unsigned int order,
-                       gfp_t gfp_flags, int migratetype)
+                       gfp_t gfp_flags, int alloc_flags, int migratetype)
 {
        unsigned long flags;
        struct page *page;
@@ -2129,7 +2259,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                        WARN_ON_ONCE(order > 1);
                }
                spin_lock_irqsave(&zone->lock, flags);
-               page = __rmqueue(zone, order, migratetype);
+
+               page = NULL;
+               if (alloc_flags & ALLOC_HARDER) {
+                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                       if (page)
+                               trace_mm_page_alloc_zone_locked(page, order, migratetype);
+               }
+               if (!page)
+                       page = __rmqueue(zone, order, migratetype, gfp_flags);
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
@@ -2160,11 +2298,11 @@ static struct {
        struct fault_attr attr;
 
        bool ignore_gfp_highmem;
-       bool ignore_gfp_wait;
+       bool ignore_gfp_reclaim;
        u32 min_order;
 } fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
-       .ignore_gfp_wait = true,
+       .ignore_gfp_reclaim = true,
        .ignore_gfp_highmem = true,
        .min_order = 1,
 };
@@ -2183,7 +2321,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
                return false;
        if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
                return false;
-       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+       if (fail_page_alloc.ignore_gfp_reclaim &&
+                       (gfp_mask & __GFP_DIRECT_RECLAIM))
                return false;
 
        return should_fail(&fail_page_alloc.attr, 1 << order);
@@ -2202,7 +2341,7 @@ static int __init fail_page_alloc_debugfs(void)
                return PTR_ERR(dir);
 
        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &fail_page_alloc.ignore_gfp_wait))
+                               &fail_page_alloc.ignore_gfp_reclaim))
                goto fail;
        if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
                                &fail_page_alloc.ignore_gfp_highmem))
@@ -2232,42 +2371,77 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return true if free pages are above 'mark'. This takes into account the order
- * of the allocation.
+ * Return true if free base pages are above 'mark'. For high-order checks it
+ * will return true if the order-0 watermark is reached and there is at least
+ * one free page of a suitable size. Checking now avoids taking the zone lock
+ * to check in the allocation paths if no pages are free.
  */
 static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                        unsigned long mark, int classzone_idx, int alloc_flags,
                        long free_pages)
 {
-       /* free_pages may go negative - that's OK */
        long min = mark;
        int o;
-       long free_cma = 0;
+       const int alloc_harder = (alloc_flags & ALLOC_HARDER);
 
+       /* free_pages may go negative - that's OK */
        free_pages -= (1 << order) - 1;
+
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
-       if (alloc_flags & ALLOC_HARDER)
+
+       /*
+        * If the caller does not have rights to ALLOC_HARDER then subtract
+        * the high-atomic reserves. This will over-estimate the size of the
+        * atomic reserve but it avoids a search.
+        */
+       if (likely(!alloc_harder))
+               free_pages -= z->nr_reserved_highatomic;
+       else
                min -= min / 4;
+
 #ifdef CONFIG_CMA
        /* If allocation can't use CMA areas don't use free CMA pages */
        if (!(alloc_flags & ALLOC_CMA))
-               free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+               free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
 
-       if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+       /*
+        * Check watermarks for an order-0 allocation request. If these
+        * are not met, then a high-order request also cannot go ahead
+        * even if a suitable page happened to be free.
+        */
+       if (free_pages <= min + z->lowmem_reserve[classzone_idx])
                return false;
-       for (o = 0; o < order; o++) {
-               /* At the next order, this order's pages become unavailable */
-               free_pages -= z->free_area[o].nr_free << o;
 
-               /* Require fewer higher order pages to be free */
-               min >>= 1;
+       /* If this is an order-0 request then the watermark is fine */
+       if (!order)
+               return true;
+
+       /* For a high-order request, check at least one suitable page is free */
+       for (o = order; o < MAX_ORDER; o++) {
+               struct free_area *area = &z->free_area[o];
+               int mt;
 
-               if (free_pages <= min)
-                       return false;
+               if (!area->nr_free)
+                       continue;
+
+               if (alloc_harder)
+                       return true;
+
+               for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+                       if (!list_empty(&area->free_list[mt]))
+                               return true;
+               }
+
+#ifdef CONFIG_CMA
+               if ((alloc_flags & ALLOC_CMA) &&
+                   !list_empty(&area->free_list[MIGRATE_CMA])) {
+                       return true;
+               }
+#endif
        }
-       return true;
+       return false;
 }
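
A concrete pass through the new check, with assumed numbers:

	/*
	 * Assumed: order = 3, a GFP_ATOMIC caller (so ALLOC_HIGH and
	 * ALLOC_HARDER are both set), mark = 1024, lowmem_reserve = 0.
	 *
	 *   min        = 1024 - 1024/2 = 512	(ALLOC_HIGH)
	 *   min        = 512  - 512/4  = 384	(ALLOC_HARDER)
	 *   free_pages = free_pages - ((1 << 3) - 1)
	 *
	 * The order-0 gate requires free_pages > 384. Only if that holds does
	 * the high-order part run, and for an ALLOC_HARDER caller it returns
	 * true as soon as any free_area[o] with o >= 3 is non-empty; other
	 * callers must find a page in one of the MIGRATE_PCPTYPES lists (or
	 * MIGRATE_CMA when ALLOC_CMA is allowed).
	 */
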
 
 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
@@ -2278,134 +2452,18 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 }
 
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-                       unsigned long mark, int classzone_idx, int alloc_flags)
+                       unsigned long mark, int classzone_idx)
 {
        long free_pages = zone_page_state(z, NR_FREE_PAGES);
 
        if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
                free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
-       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+       return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
                                                                free_pages);
 }
 
 #ifdef CONFIG_NUMA
-/*
- * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
- * skip over zones that are not allowed by the cpuset, or that have
- * been recently (in last second) found to be nearly full.  See further
- * comments in mmzone.h.  Reduces cache footprint of zonelist scans
- * that have to skip over a lot of full or unallowed zones.
- *
- * If the zonelist cache is present in the passed zonelist, then
- * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_MEMORY].)
- *
- * If the zonelist cache is not available for this zonelist, does
- * nothing and returns NULL.
- *
- * If the fullzones BITMAP in the zonelist cache is stale (more than
- * a second since last zap'd) then we zap it out (clear its bits.)
- *
- * We hold off even calling zlc_setup, until after we've checked the
- * first zone in the zonelist, on the theory that most allocations will
- * be satisfied from that first zone, so best to examine that zone as
- * quickly as we can.
- */
-static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
-{
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       nodemask_t *allowednodes;       /* zonelist_cache approximation */
-
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return NULL;
-
-       if (time_after(jiffies, zlc->last_full_zap + HZ)) {
-               bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-               zlc->last_full_zap = jiffies;
-       }
-
-       allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
-                                       &cpuset_current_mems_allowed :
-                                       &node_states[N_MEMORY];
-       return allowednodes;
-}
-
-/*
- * Given 'z' scanning a zonelist, run a couple of quick checks to see
- * if it is worth looking at further for free memory:
- *  1) Check that the zone isn't thought to be full (doesn't have its
- *     bit set in the zonelist_cache fullzones BITMAP).
- *  2) Check that the zones node (obtained from the zonelist_cache
- *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
- * Return true (non-zero) if zone is worth looking at further, or
- * else return false (zero) if it is not.
- *
- * This check -ignores- the distinction between various watermarks,
- * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
- * found to be full for any variation of these watermarks, it will
- * be considered full for up to one second by all requests, unless
- * we are so low on memory on all allowed nodes that we are forced
- * into the second scan of the zonelist.
- *
- * In the second scan we ignore this zonelist cache and exactly
- * apply the watermarks to all zones, even it is slower to do so.
- * We are low on memory in the second scan, and should leave no stone
- * unturned looking for a free page.
- */
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
-                                               nodemask_t *allowednodes)
-{
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       int i;                          /* index of *z in zonelist zones */
-       int n;                          /* node that zone *z is on */
-
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return 1;
-
-       i = z - zonelist->_zonerefs;
-       n = zlc->z_to_n[i];
-
-       /* This zone is worth trying if it is allowed but not full */
-       return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
-}
-
-/*
- * Given 'z' scanning a zonelist, set the corresponding bit in
- * zlc->fullzones, so that subsequent attempts to allocate a page
- * from that zone don't waste time re-examining it.
- */
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
-{
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       int i;                          /* index of *z in zonelist zones */
-
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return;
-
-       i = z - zonelist->_zonerefs;
-
-       set_bit(i, zlc->fullzones);
-}
-
-/*
- * clear all zones full, called after direct reclaim makes progress so that
- * a zone that was recently full is not skipped over for up to a second
- */
-static void zlc_clear_zones_full(struct zonelist *zonelist)
-{
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return;
-
-       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-}
-
 static bool zone_local(struct zone *local_zone, struct zone *zone)
 {
        return local_zone->node == zone->node;
@@ -2416,28 +2474,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
                                RECLAIM_DISTANCE;
 }
-
 #else  /* CONFIG_NUMA */
-
-static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
-{
-       return NULL;
-}
-
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
-                               nodemask_t *allowednodes)
-{
-       return 1;
-}
-
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
-{
-}
-
-static void zlc_clear_zones_full(struct zonelist *zonelist)
-{
-}
-
 static bool zone_local(struct zone *local_zone, struct zone *zone)
 {
        return true;
@@ -2447,7 +2484,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
        return true;
 }
-
 #endif /* CONFIG_NUMA */
 
 static void reset_alloc_batches(struct zone *preferred_zone)
@@ -2474,11 +2510,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
        struct zoneref *z;
        struct page *page = NULL;
        struct zone *zone;
-       nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
-       int zlc_active = 0;             /* set if using zonelist_cache */
-       int did_zlc_setup = 0;          /* just call zlc_setup() one time */
-       bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
-                               (gfp_mask & __GFP_WRITE);
        int nr_fair_skipped = 0;
        bool zonelist_rescan;
 
@@ -2493,9 +2524,6 @@ zonelist_scan:
                                                                ac->nodemask) {
                unsigned long mark;
 
-               if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
-                       !zlc_zone_worth_trying(zonelist, z, allowednodes))
-                               continue;
                if (cpusets_enabled() &&
                        (alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed(zone, gfp_mask))
@@ -2533,14 +2561,14 @@ zonelist_scan:
                 *
                 * XXX: For now, allow allocations to potentially
                 * exceed the per-zone dirty limit in the slowpath
-                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * (spread_dirty_pages unset) before going into reclaim,
                 * which is important when on a NUMA setup the allowed
                 * zones are together not big enough to reach the
                 * global limit.  The proper fix for these situations
                 * will require awareness of zones in the
                 * dirty-throttling and the flusher threads.
                 */
-               if (consider_zone_dirty && !zone_dirty_ok(zone))
+               if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
                        continue;
 
                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
@@ -2553,28 +2581,8 @@ zonelist_scan:
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
 
-                       if (IS_ENABLED(CONFIG_NUMA) &&
-                                       !did_zlc_setup && nr_online_nodes > 1) {
-                               /*
-                                * we do zlc_setup if there are multiple nodes
-                                * and before considering the first zone allowed
-                                * by the cpuset.
-                                */
-                               allowednodes = zlc_setup(zonelist, alloc_flags);
-                               zlc_active = 1;
-                               did_zlc_setup = 1;
-                       }
-
                        if (zone_reclaim_mode == 0 ||
                            !zone_allows_reclaim(ac->preferred_zone, zone))
-                               goto this_zone_full;
-
-                       /*
-                        * As we may have just activated ZLC, check if the first
-                        * eligible zone has failed zone_reclaim recently.
-                        */
-                       if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
-                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
                                continue;
 
                        ret = zone_reclaim(zone, gfp_mask, order);
@@ -2591,34 +2599,26 @@ zonelist_scan:
                                                ac->classzone_idx, alloc_flags))
                                        goto try_this_zone;
 
-                               /*
-                                * Failed to reclaim enough to meet watermark.
-                                * Only mark the zone full if checking the min
-                                * watermark or if we failed to reclaim just
-                                * 1<<order pages or else the page allocator
-                                * fastpath will prematurely mark zones full
-                                * when the watermark is between the low and
-                                * min watermarks.
-                                */
-                               if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
-                                   ret == ZONE_RECLAIM_SOME)
-                                       goto this_zone_full;
-
                                continue;
                        }
                }
 
 try_this_zone:
                page = buffered_rmqueue(ac->preferred_zone, zone, order,
-                                               gfp_mask, ac->migratetype);
+                               gfp_mask, alloc_flags, ac->migratetype);
                if (page) {
                        if (prep_new_page(page, order, gfp_mask, alloc_flags))
                                goto try_this_zone;
+
+                       /*
+                        * If this is a high-order atomic allocation then check
+                        * if the pageblock should be reserved for the future
+                        */
+                       if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+                               reserve_highatomic_pageblock(page, zone, order);
+
                        return page;
                }
-this_zone_full:
-               if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
-                       zlc_mark_zone_full(zonelist, z);
        }
 
        /*
@@ -2639,12 +2639,6 @@ this_zone_full:
                        zonelist_rescan = true;
        }
 
-       if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
-               /* Disable zlc cache for second zonelist scan */
-               zlc_active = 0;
-               zonelist_rescan = true;
-       }
-
        if (zonelist_rescan)
                goto zonelist_scan;
 
@@ -2669,7 +2663,7 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
                DEFAULT_RATELIMIT_INTERVAL,
                DEFAULT_RATELIMIT_BURST);
 
-void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
 {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
 
@@ -2686,7 +2680,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                if (test_thread_flag(TIF_MEMDIE) ||
                    (current->flags & (PF_MEMALLOC | PF_EXITING)))
                        filter &= ~SHOW_MEM_FILTER_NODES;
-       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+       if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                filter &= ~SHOW_MEM_FILTER_NODES;
 
        if (fmt) {
@@ -2703,7 +2697,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                va_end(args);
        }
 
-       pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+       pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
                current->comm, order, gfp_mask);
 
        dump_stack();
@@ -2889,19 +2883,17 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!(*did_some_progress)))
                return NULL;
 
-       /* After successful reclaim, reconsider all zones for allocation */
-       if (IS_ENABLED(CONFIG_NUMA))
-               zlc_clear_zones_full(ac->zonelist);
-
 retry:
        page = get_page_from_freelist(gfp_mask, order,
                                        alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
 
        /*
         * If an allocation failed after direct reclaim, it could be because
-        * pages are pinned on the per-cpu lists. Drain them and try again
+        * pages are pinned on the per-cpu lists or in high alloc reserves.
+        * Shrink them and try again
         */
        if (!page && !drained) {
+               unreserve_highatomic_pageblock(ac);
                drain_all_pages(NULL);
                drained = true;
                goto retry;
@@ -2946,7 +2938,6 @@ static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
 
        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
        BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2955,11 +2946,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         * The caller may dip into page reserves a bit more if the caller
         * cannot run direct reclaim, or if the caller has realtime scheduling
         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
         */
        alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
-       if (atomic) {
+       if (gfp_mask & __GFP_ATOMIC) {
                /*
                 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
                 * if it can't schedule.
@@ -2996,11 +2987,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
        return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
 }
 
+static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+{
+       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                                struct alloc_context *ac)
 {
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
        struct page *page = NULL;
        int alloc_flags;
        unsigned long pages_reclaimed = 0;
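
The is_thp_gfp_mask() test added above leans on a common mask idiom: (x & (A | B)) == A holds exactly when every bit of A is set in x and none of the bits that are only in B are. A stand-alone sketch of the same predicate with made-up flag values (not the kernel's real GFP bits):

	#include <stdbool.h>
	#include <stdio.h>

	#define FAKE_GFP_TRANSHUGE	0x30u	/* two required bits */
	#define FAKE_GFP_KSWAPD		0x04u	/* must be absent    */

	static bool fake_is_thp_gfp_mask(unsigned int gfp_mask)
	{
		return (gfp_mask & (FAKE_GFP_TRANSHUGE | FAKE_GFP_KSWAPD))
			== FAKE_GFP_TRANSHUGE;
	}

	int main(void)
	{
		printf("%d\n", fake_is_thp_gfp_mask(0x30));	/* 1: all THP bits, no kswapd bit */
		printf("%d\n", fake_is_thp_gfp_mask(0x34));	/* 0: kswapd-reclaim bit also set */
		printf("%d\n", fake_is_thp_gfp_mask(0x20));	/* 0: one THP bit missing         */
		return 0;
	}
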
@@ -3020,16 +3016,24 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                return NULL;
        }
 
+       /*
+        * We also sanity check to catch abuse of atomic reserves being used by
+        * callers that are not in atomic context.
+        */
+       if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+                               (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+               gfp_mask &= ~__GFP_ATOMIC;
+
        /*
         * If this allocation cannot block and it is for a specific node, then
         * fail early.  There's no need to wakeup kswapd or retry for a
         * speculative node-specific allocation.
         */
-       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
+       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
                goto nopage;
 
 retry:
-       if (!(gfp_mask & __GFP_NO_KSWAPD))
+       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                wake_all_kswapds(order, ac);
 
        /*
@@ -3072,8 +3076,8 @@ retry:
                }
        }
 
-       /* Atomic allocations - we can't balance anything */
-       if (!wait) {
+       /* Caller is not willing to reclaim, we can't balance anything */
+       if (!can_direct_reclaim) {
                /*
                 * All existing users of the deprecated __GFP_NOFAIL are
                 * blockable, so warn of any new users that actually allow this
@@ -3103,7 +3107,7 @@ retry:
                goto got_pg;
 
        /* Checks for THP-specific high-order allocations */
-       if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+       if (is_thp_gfp_mask(gfp_mask)) {
                /*
                 * If compaction is deferred for high-order allocations, it is
                 * because sync compaction recently failed. If this is the case
@@ -3138,8 +3142,7 @@ retry:
         * fault, so use asynchronous memory compaction for THP unless it is
         * khugepaged trying to collapse.
         */
-       if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
-                                               (current->flags & PF_KTHREAD))
+       if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
                migration_mode = MIGRATE_SYNC_LIGHT;
 
        /* Try direct reclaim and then allocating */
@@ -3210,7 +3213,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
        lockdep_trace_alloc(gfp_mask);
 
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;
@@ -3231,6 +3234,10 @@ retry_cpuset:
 
        /* We set it here, as __alloc_pages_slowpath might have changed it */
        ac.zonelist = zonelist;
+
+       /* Dirty zone balancing only done in the fast path */
+       ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+
        /* The preferred zone is used for statistics later */
        preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                                ac.nodemask ? : &cpuset_current_mems_allowed,
@@ -3249,6 +3256,7 @@ retry_cpuset:
                 * complete.
                 */
                alloc_mask = memalloc_noio_flags(gfp_mask);
+               ac.spread_dirty_pages = false;
 
                page = __alloc_pages_slowpath(alloc_mask, order, &ac);
        }
@@ -3428,24 +3436,24 @@ EXPORT_SYMBOL(__free_page_frag);
 struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
 {
        struct page *page;
-       struct mem_cgroup *memcg = NULL;
 
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
        page = alloc_pages(gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
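+       /* Charge the page to kmem memcg after allocation; back out on failure */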
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
        return page;
 }
 
 struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 {
        struct page *page;
-       struct mem_cgroup *memcg = NULL;
 
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
        page = alloc_pages_node(nid, gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
        return page;
 }
 
@@ -3455,7 +3463,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
  */
 void __free_kmem_pages(struct page *page, unsigned int order)
 {
-       memcg_kmem_uncharge_pages(page, order);
+       memcg_kmem_uncharge(page, order);
        __free_pages(page, order);
 }
 
@@ -3467,7 +3475,8 @@ void free_kmem_pages(unsigned long addr, unsigned int order)
        }
 }
 
-static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+static void *make_alloc_exact(unsigned long addr, unsigned int order,
+               size_t size)
 {
        if (addr) {
                unsigned long alloc_end = addr + (PAGE_SIZE << order);
@@ -3517,7 +3526,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
  */
 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 {
-       unsigned order = get_order(size);
+       unsigned int order = get_order(size);
        struct page *p = alloc_pages_node(nid, gfp_mask, order);
        if (!p)
                return NULL;
@@ -3666,7 +3675,6 @@ static void show_migration_types(unsigned char type)
                [MIGRATE_UNMOVABLE]     = 'U',
                [MIGRATE_RECLAIMABLE]   = 'E',
                [MIGRATE_MOVABLE]       = 'M',
-               [MIGRATE_RESERVE]       = 'R',
 #ifdef CONFIG_CMA
                [MIGRATE_CMA]           = 'C',
 #endif
@@ -3819,7 +3827,8 @@ void show_free_areas(unsigned int filter)
        }
 
        for_each_populated_zone(zone) {
-               unsigned long nr[MAX_ORDER], flags, order, total = 0;
+               unsigned int order;
+               unsigned long nr[MAX_ORDER], flags, total = 0;
                unsigned char types[MAX_ORDER];
 
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4168,7 +4177,7 @@ static void build_zonelists(pg_data_t *pgdat)
        nodemask_t used_mask;
        int local_node, prev_node;
        struct zonelist *zonelist;
-       int order = current_zonelist_order;
+       unsigned int order = current_zonelist_order;
 
        /* initialize zonelists */
        for (i = 0; i < MAX_ZONELISTS; i++) {
@@ -4212,20 +4221,6 @@ static void build_zonelists(pg_data_t *pgdat)
        build_thisnode_zonelists(pgdat);
 }
 
-/* Construct the zonelist performance cache - see further mmzone.h */
-static void build_zonelist_cache(pg_data_t *pgdat)
-{
-       struct zonelist *zonelist;
-       struct zonelist_cache *zlc;
-       struct zoneref *z;
-
-       zonelist = &pgdat->node_zonelists[0];
-       zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
-       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-       for (z = zonelist->_zonerefs; z->zone; z++)
-               zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
-}
-
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 /*
  * Return node id of node used for "local" allocations.
@@ -4286,12 +4281,6 @@ static void build_zonelists(pg_data_t *pgdat)
        zonelist->_zonerefs[j].zone_idx = 0;
 }
 
-/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void build_zonelist_cache(pg_data_t *pgdat)
-{
-       pgdat->node_zonelists[0].zlcache_ptr = NULL;
-}
-
 #endif /* CONFIG_NUMA */
 
 /*
@@ -4332,14 +4321,12 @@ static int __build_all_zonelists(void *data)
 
        if (self && !node_online(self->node_id)) {
                build_zonelists(self);
-               build_zonelist_cache(self);
        }
 
        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
 
                build_zonelists(pgdat);
-               build_zonelist_cache(pgdat);
        }
 
        /*
@@ -4362,13 +4349,13 @@ static int __build_all_zonelists(void *data)
                /*
                 * We now know the "local memory node" for each node--
                 * i.e., the node of the first zone in the generic zonelist.
-                * Set up numa_mem percpu variable for on-line cpus.  During
-                * boot, only the boot cpu should be on-line;  we'll init the
-                * secondary cpus' numa_mem as they come on-line.  During
-                * node/memory hotplug, we'll fixup all on-line cpus.
+                * Set up the numa_mem percpu variable for all possible cpus
+                * if the associated node has been onlined.
                 */
-               if (cpu_online(cpu))
+               if (node_online(cpu_to_node(cpu)))
                        set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+               else
+                       set_cpu_numa_mem(cpu, NUMA_NO_NODE);
 #endif
        }
 
@@ -4498,120 +4485,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
        return ffz(~size);
 }
 
-/*
- * Check if a pageblock contains reserved pages
- */
-static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
-{
-       unsigned long pfn;
-
-       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
-                       return 1;
-       }
-       return 0;
-}
-
-/*
- * Mark a number of pageblocks as MIGRATE_RESERVE. The number
- * of blocks reserved is based on min_wmark_pages(zone). The memory within
- * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
- * higher will lead to a bigger reserve which will get freed as contiguous
- * blocks as reclaim kicks in
- */
-static void setup_zone_migrate_reserve(struct zone *zone)
-{
-       unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
-       struct page *page;
-       unsigned long block_migratetype;
-       int reserve;
-       int old_reserve;
-
-       /*
-        * Get the start pfn, end pfn and the number of blocks to reserve
-        * We have to be careful to be aligned to pageblock_nr_pages to
-        * make sure that we always check pfn_valid for the first page in
-        * the block.
-        */
-       start_pfn = zone->zone_start_pfn;
-       end_pfn = zone_end_pfn(zone);
-       start_pfn = roundup(start_pfn, pageblock_nr_pages);
-       reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
-                                                       pageblock_order;
-
-       /*
-        * Reserve blocks are generally in place to help high-order atomic
-        * allocations that are short-lived. A min_free_kbytes value that
-        * would result in more than 2 reserve blocks for atomic allocations
-        * is assumed to be in place to help anti-fragmentation for the
-        * future allocation of hugepages at runtime.
-        */
-       reserve = min(2, reserve);
-       old_reserve = zone->nr_migrate_reserve_block;
-
-       /* When memory hot-add, we almost always need to do nothing */
-       if (reserve == old_reserve)
-               return;
-       zone->nr_migrate_reserve_block = reserve;
-
-       for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
-               if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
-                       return;
-
-               if (!pfn_valid(pfn))
-                       continue;
-               page = pfn_to_page(pfn);
-
-               /* Watch out for overlapping nodes */
-               if (page_to_nid(page) != zone_to_nid(zone))
-                       continue;
-
-               block_migratetype = get_pageblock_migratetype(page);
-
-               /* Only test what is necessary when the reserves are not met */
-               if (reserve > 0) {
-                       /*
-                        * Blocks with reserved pages will never free, skip
-                        * them.
-                        */
-                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-                       if (pageblock_is_reserved(pfn, block_end_pfn))
-                               continue;
-
-                       /* If this block is reserved, account for it */
-                       if (block_migratetype == MIGRATE_RESERVE) {
-                               reserve--;
-                               continue;
-                       }
-
-                       /* Suitable for reserving if this block is movable */
-                       if (block_migratetype == MIGRATE_MOVABLE) {
-                               set_pageblock_migratetype(page,
-                                                       MIGRATE_RESERVE);
-                               move_freepages_block(zone, page,
-                                                       MIGRATE_RESERVE);
-                               reserve--;
-                               continue;
-                       }
-               } else if (!old_reserve) {
-                       /*
-                        * At boot time we don't need to scan the whole zone
-                        * for turning off MIGRATE_RESERVE.
-                        */
-                       break;
-               }
-
-               /*
-                * If the reserve is met and this is a previous reserved block,
-                * take it back
-                */
-               if (block_migratetype == MIGRATE_RESERVE) {
-                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-                       move_freepages_block(zone, page, MIGRATE_MOVABLE);
-               }
-       }
-}
-
 /*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
@@ -4651,9 +4524,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                 * movable at startup. This will force kernel allocations
                 * to reserve their blocks rather than leaking throughout
                 * the address space during boot when many long-lived
-                * kernel allocations are made. Later some blocks near
-                * the start are marked MIGRATE_RESERVE by
-                * setup_zone_migrate_reserve()
+                * kernel allocations are made.
                 *
                 * bitmap is created for zone's valid pfn range. but memmap
                 * can be created for invalid pages (for alignment)
@@ -4900,8 +4771,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
 
 int __meminit init_currently_empty_zone(struct zone *zone,
                                        unsigned long zone_start_pfn,
-                                       unsigned long size,
-                                       enum memmap_context context)
+                                       unsigned long size)
 {
        struct pglist_data *pgdat = zone->zone_pgdat;
        int ret;
@@ -5413,8 +5283,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 
                set_pageblock_order();
                setup_usemap(pgdat, zone, zone_start_pfn, size);
-               ret = init_currently_empty_zone(zone, zone_start_pfn,
-                                               size, MEMMAP_EARLY);
+               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
                BUG_ON(ret);
                memmap_init(size, nid, j, zone_start_pfn);
                zone_start_pfn += size;
@@ -5423,6 +5292,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 
 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
 {
+       unsigned long __maybe_unused offset = 0;
+
        /* Skip empty nodes */
        if (!pgdat->node_spanned_pages)
                return;
@@ -5439,6 +5310,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                 * for the buddy allocator to function correctly.
                 */
                start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
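+               /* offset of node_start_pfn within the MAX_ORDER aligned start */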
+               offset = pgdat->node_start_pfn - start;
                end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
@@ -5446,7 +5318,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                if (!map)
                        map = memblock_virt_alloc_node_nopanic(size,
                                                               pgdat->node_id);
-               pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
+               pgdat->node_mem_map = map + offset;
        }
 #ifndef CONFIG_NEED_MULTIPLE_NODES
        /*
@@ -5454,9 +5326,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
         */
        if (pgdat == NODE_DATA(0)) {
                mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
-                       mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+                       mem_map -= offset;
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP || CONFIG_FLATMEM */
        }
 #endif
@@ -5668,13 +5540,17 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                 */
                required_movablecore =
                        roundup(required_movablecore, MAX_ORDER_NR_PAGES);
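+               /* Cap movablecore at totalpages so corepages cannot underflow */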
+               required_movablecore = min(totalpages, required_movablecore);
                corepages = totalpages - required_movablecore;
 
                required_kernelcore = max(required_kernelcore, corepages);
        }
 
-       /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-       if (!required_kernelcore)
+       /*
+        * If kernelcore was not specified or kernelcore size is larger
+        * than totalpages, there is no ZONE_MOVABLE.
+        */
+       if (!required_kernelcore || required_kernelcore >= totalpages)
                goto out;
 
        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
@@ -6209,7 +6085,6 @@ static void __setup_per_zone_wmarks(void)
                        high_wmark_pages(zone) - low_wmark_pages(zone) -
                        atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 
-               setup_zone_migrate_reserve(zone);
                spin_unlock_irqrestore(&zone->lock, flags);
        }
 
@@ -6831,7 +6706,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
                       unsigned migratetype)
 {
        unsigned long outer_start, outer_end;
-       int ret = 0, order;
+       unsigned int order;
+       int ret = 0;
 
        struct compact_control cc = {
                .nr_migratepages = 0,
index d5dd79041484588cdfb409e7610d9ea23ee3c370..1c245d9027e38f259028651f82ee47051aeffe73 100644 (file)
@@ -61,8 +61,7 @@ static int page_idle_clear_pte_refs_one(struct page *page,
        bool referenced = false;
 
        if (unlikely(PageTransHuge(page))) {
-               pmd = page_check_address_pmd(page, mm, addr,
-                                            PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+               pmd = page_check_address_pmd(page, mm, addr, &ptl);
                if (pmd) {
                        referenced = pmdp_clear_young_notify(vma, addr, pmd);
                        spin_unlock(ptl);
index 29f2f8b853ae51be4f9e35fbc1495ad69297ff82..207244489a681d10f16c318c0a6ff0423b5fe30b 100644 (file)
@@ -58,7 +58,7 @@ again:
                if (!walk->pte_entry)
                        continue;
 
-               split_huge_page_pmd_mm(walk->mm, addr, pmd);
+               split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
index a63b4d82a14157136f2f50e2a3c264a6484ef993..8a943b97a053a6aef9c939b6b06854fedea01b7b 100644 (file)
@@ -1554,12 +1554,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
        PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
 #ifdef CONFIG_SMP
        PCPU_SETUP_BUG_ON(!ai->static_size);
-       PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
+       PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
 #endif
        PCPU_SETUP_BUG_ON(!base_addr);
-       PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
+       PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
-       PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+       PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
        PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
        PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
@@ -1806,7 +1806,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 
        alloc_size = roundup(min_unit_size, atom_size);
        upa = alloc_size / min_unit_size;
-       while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+       while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                upa--;
        max_upa = upa;
 
@@ -1838,7 +1838,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;
 
-               if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+               if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                        continue;
 
                for (group = 0; group < nr_groups; group++) {
index 7d3db0247983b22b121290c2203ba2c2fb544ec0..69261d4c774dd3d9894447bdbf6342aac746ad34 100644 (file)
@@ -139,18 +139,6 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 }
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
-{
-       pmd_t pmd = pmd_mksplitting(*pmdp);
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-       /* tlb flush only to serialize against gup-fast */
-       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-}
-#endif
-
 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
index 24682f6f4cfd1d84d7245faea0da78e7fd17e716..ba22d7fe0afbae6e8e1568a9564b6dd6399ca964 100644 (file)
@@ -90,7 +90,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
                page = list_to_page(pages);
                list_del(&page->lru);
                if (add_to_page_cache_lru(page, mapping, page->index,
-                               GFP_KERNEL & mapping_gfp_mask(mapping))) {
+                               mapping_gfp_constraint(mapping, GFP_KERNEL))) {
                        read_cache_pages_invalidate_page(mapping, page);
                        continue;
                }
@@ -128,7 +128,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
                struct page *page = list_to_page(pages);
                list_del(&page->lru);
                if (!add_to_page_cache_lru(page, mapping, page->index,
-                               GFP_KERNEL & mapping_gfp_mask(mapping))) {
+                               mapping_gfp_constraint(mapping, GFP_KERNEL))) {
                        mapping->a_ops->readpage(filp, page);
                }
                page_cache_release(page);
@@ -213,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
        if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
                return -EINVAL;
 
-       nr_to_read = max_sane_readahead(nr_to_read);
+       nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
        while (nr_to_read) {
                int err;
 
@@ -232,16 +232,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
        return 0;
 }
 
-#define MAX_READAHEAD   ((512*4096)/PAGE_CACHE_SIZE)
-/*
- * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
- * sensible upper limit.
- */
-unsigned long max_sane_readahead(unsigned long nr)
-{
-       return min(nr, MAX_READAHEAD);
-}
-
 /*
  * Set the initial window size, round to next power of 2 and square
  * for small size, x 4 for medium, and x 2 for large
@@ -380,7 +370,7 @@ ondemand_readahead(struct address_space *mapping,
                   bool hit_readahead_marker, pgoff_t offset,
                   unsigned long req_size)
 {
-       unsigned long max = max_sane_readahead(ra->ra_pages);
+       unsigned long max = ra->ra_pages;
        pgoff_t prev_offset;
 
        /*
index f5b5c1f3dcd755ae313bba1404f2c9b079d5c18f..288622f5f34d75258ae9134a1be9cd14e0e3adf8 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -565,27 +565,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
        anon_vma_unlock_read(anon_vma);
 }
 
-/*
- * At what user virtual address is page expected in @vma?
- */
-static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
-       pgoff_t pgoff = page_to_pgoff(page);
-       return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
-       unsigned long address = __vma_address(page, vma);
-
-       /* page should be within @vma mapping range */
-       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-
-       return address;
-}
-
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 static void percpu_flush_tlb_batch_pages(void *data)
 {
@@ -841,8 +820,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                 * rmap might return false positives; we must filter
                 * these out using page_check_address_pmd().
                 */
-               pmd = page_check_address_pmd(page, mm, address,
-                                            PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+               pmd = page_check_address_pmd(page, mm, address, &ptl);
                if (!pmd)
                        return SWAP_AGAIN;
 
@@ -852,9 +830,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        return SWAP_FAIL; /* To break the loop */
                }
 
-               /* go ahead even if the pmd is pmd_trans_splitting() */
                if (pmdp_clear_flush_young_notify(vma, address, pmd))
                        referenced++;
+
                spin_unlock(ptl);
        } else {
                pte_t *pte;
@@ -884,6 +862,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        if (likely(!(vma->vm_flags & VM_SEQ_READ)))
                                referenced++;
                }
+
                pte_unmap_unlock(pte, ptl);
        }
 
@@ -943,6 +922,7 @@ int page_referenced(struct page *page,
        };
 
        *vm_flags = 0;
+
        if (!page_mapped(page))
                return 0;
 
@@ -1122,7 +1102,7 @@ static void __page_check_anon_rmap(struct page *page,
         * over the call to page_add_new_anon_rmap.
         */
        BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
-       BUG_ON(page->index != linear_page_index(vma, address));
+       BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
 #endif
 }
 
@@ -1131,6 +1111,7 @@ static void __page_check_anon_rmap(struct page *page,
  * @page:      the page to add the mapping to
  * @vma:       the vm area in which the mapping is added
  * @address:   the user virtual address mapped
+ * @compound:  charge the page as compound or small page
  *
  * The caller needs to hold the pte lock, and the page must be locked in
  * the anon_vma case: to serialize mapping,index checking after setting,
@@ -1138,9 +1119,9 @@ static void __page_check_anon_rmap(struct page *page,
  * (but PageKsm is never downgraded to PageAnon).
  */
 void page_add_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address)
+       struct vm_area_struct *vma, unsigned long address, bool compound)
 {
-       do_page_add_anon_rmap(page, vma, address, 0);
+       do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
 }
 
 /*
@@ -1149,29 +1130,54 @@ void page_add_anon_rmap(struct page *page,
  * Everybody else should continue to use page_add_anon_rmap above.
  */
 void do_page_add_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address, int exclusive)
+       struct vm_area_struct *vma, unsigned long address, int flags)
 {
-       int first = atomic_inc_and_test(&page->_mapcount);
+       bool compound = flags & RMAP_COMPOUND;
+       bool first;
+
+       if (PageTransCompound(page)) {
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
+               if (compound) {
+                       atomic_t *mapcount;
+
+                       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+                       mapcount = compound_mapcount_ptr(page);
+                       first = atomic_inc_and_test(mapcount);
+               } else {
+                       /* Anon THP always mapped first with PMD */
+                       first = 0;
+                       VM_BUG_ON_PAGE(!page_mapcount(page), page);
+                       atomic_inc(&page->_mapcount);
+               }
+       } else {
+               VM_BUG_ON_PAGE(compound, page);
+               first = atomic_inc_and_test(&page->_mapcount);
+       }
+
        if (first) {
+               int nr = compound ? hpage_nr_pages(page) : 1;
                /*
                 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
                 * these counters are not modified in interrupt context, and
                 * pte lock(a spinlock) is held, which implies preemption
                 * disabled.
                 */
-               if (PageTransHuge(page))
+               if (compound) {
+                       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                        __inc_zone_page_state(page,
                                              NR_ANON_TRANSPARENT_HUGEPAGES);
-               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
-                               hpage_nr_pages(page));
+               }
+               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
        }
        if (unlikely(PageKsm(page)))
                return;
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
+
        /* address might be in next vma when migration races vma_adjust */
        if (first)
-               __page_set_anon_rmap(page, vma, address, exclusive);
+               __page_set_anon_rmap(page, vma, address,
+                               flags & RMAP_EXCLUSIVE);
        else
                __page_check_anon_rmap(page, vma, address);
 }
@@ -1181,21 +1187,31 @@ void do_page_add_anon_rmap(struct page *page,
  * @page:      the page to add the mapping to
  * @vma:       the vm area in which the mapping is added
  * @address:   the user virtual address mapped
+ * @compound:  charge the page as compound or small page
  *
  * Same as page_add_anon_rmap but must only be called on *new* pages.
  * This means the inc-and-test can be bypassed.
  * Page does not have to be locked.
  */
 void page_add_new_anon_rmap(struct page *page,
-       struct vm_area_struct *vma, unsigned long address)
+       struct vm_area_struct *vma, unsigned long address, bool compound)
 {
+       int nr = compound ? hpage_nr_pages(page) : 1;
+
        VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
        SetPageSwapBacked(page);
-       atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
-       if (PageTransHuge(page))
+       if (compound) {
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+               /* increment count (starts at -1) */
+               atomic_set(compound_mapcount_ptr(page), 0);
                __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-       __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
-                       hpage_nr_pages(page));
+       } else {
+               /* Anon THP always mapped first with PMD */
+               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+               /* increment count (starts at -1) */
+               atomic_set(&page->_mapcount, 0);
+       }
+       __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
        __page_set_anon_rmap(page, vma, address, 1);
 }
 
@@ -1223,12 +1239,15 @@ static void page_remove_file_rmap(struct page *page)
 
        memcg = mem_cgroup_begin_page_stat(page);
 
-       /* page still mapped by someone else? */
-       if (!atomic_add_negative(-1, &page->_mapcount))
+       /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+       if (unlikely(PageHuge(page))) {
+               /* hugetlb pages are always mapped with pmds */
+               atomic_dec(compound_mapcount_ptr(page));
                goto out;
+       }
 
-       /* Hugepages are not counted in NR_FILE_MAPPED for now. */
-       if (unlikely(PageHuge(page)))
+       /* page still mapped by someone else? */
+       if (!atomic_add_negative(-1, &page->_mapcount))
                goto out;
 
        /*
@@ -1245,41 +1264,76 @@ out:
        mem_cgroup_end_page_stat(memcg);
 }
 
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+       int i, nr;
+
+       if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+               return;
+
+       /* Hugepages are not counted in NR_ANON_PAGES for now. */
+       if (unlikely(PageHuge(page)))
+               return;
+
+       if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+               return;
+
+       __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+       if (TestClearPageDoubleMap(page)) {
+               /*
+                * Subpages can be mapped with PTEs too. Check how many of
+                * them are still mapped.
+                */
+               for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+                       if (atomic_add_negative(-1, &page[i]._mapcount))
+                               nr++;
+               }
+       } else {
+               nr = HPAGE_PMD_NR;
+       }
+
+       if (nr) {
+               __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+               deferred_split_huge_page(page);
+       }
+}
+
 /**
  * page_remove_rmap - take down pte mapping from a page
- * @page: page to remove mapping from
+ * @page:      page to remove mapping from
+ * @compound:  uncharge the page as compound or small page
  *
  * The caller needs to hold the pte lock.
  */
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, bool compound)
 {
        if (!PageAnon(page)) {
+               VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
                page_remove_file_rmap(page);
                return;
        }
 
+       if (compound)
+               return page_remove_anon_compound_rmap(page);
+
        /* page still mapped by someone else? */
        if (!atomic_add_negative(-1, &page->_mapcount))
                return;
 
-       /* Hugepages are not counted in NR_ANON_PAGES for now. */
-       if (unlikely(PageHuge(page)))
-               return;
-
        /*
         * We use the irq-unsafe __{inc|mod}_zone_page_stat because
         * these counters are not modified in interrupt context, and
         * pte lock(a spinlock) is held, which implies preemption disabled.
         */
-       if (PageTransHuge(page))
-               __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-
-       __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
-                             -hpage_nr_pages(page));
+       __dec_zone_page_state(page, NR_ANON_PAGES);
 
        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);
 
+       if (PageTransCompound(page))
+               deferred_split_huge_page(compound_head(page));
+
        /*
         * It would be tidy to reset the PageAnon mapping here,
         * but that might overwrite a racing page_add_anon_rmap
@@ -1304,6 +1358,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        int ret = SWAP_AGAIN;
        enum ttu_flags flags = (enum ttu_flags)arg;
 
+       /* munlock has nothing to gain from examining un-locked vmas */
+       if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
+               goto out;
+
        pte = page_check_address(page, mm, address, &ptl, 0);
        if (!pte)
                goto out;
@@ -1314,9 +1372,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         * skipped over this mm) then we should reactivate it.
         */
        if (!(flags & TTU_IGNORE_MLOCK)) {
-               if (vma->vm_flags & VM_LOCKED)
-                       goto out_mlock;
-
+               if (vma->vm_flags & VM_LOCKED) {
+                       /* Holding pte lock, we do *not* need mmap_sem here */
+                       mlock_vma_page(page);
+                       ret = SWAP_MLOCK;
+                       goto out_unmap;
+               }
                if (flags & TTU_MUNLOCK)
                        goto out_unmap;
        }
@@ -1352,7 +1413,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        update_hiwater_rss(mm);
 
        if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
-               if (!PageHuge(page)) {
+               if (PageHuge(page)) {
+                       hugetlb_count_sub(1 << compound_order(page), mm);
+               } else {
                        if (PageAnon(page))
                                dec_mm_counter(mm, MM_ANONPAGES);
                        else
@@ -1370,80 +1433,66 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        dec_mm_counter(mm, MM_ANONPAGES);
                else
                        dec_mm_counter(mm, MM_FILEPAGES);
+       } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
+               swp_entry_t entry;
+               pte_t swp_pte;
+               /*
+                * Store the pfn of the page in a special migration
+                * pte. do_swap_page() will wait until the migration
+                * pte is removed and then restart fault handling.
+                */
+               entry = make_migration_entry(page, pte_write(pteval));
+               swp_pte = swp_entry_to_pte(entry);
+               if (pte_soft_dirty(pteval))
+                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+               set_pte_at(mm, address, pte, swp_pte);
        } else if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };
                pte_t swp_pte;
 
-               if (PageSwapCache(page)) {
-                       /*
-                        * Store the swap location in the pte.
-                        * See handle_pte_fault() ...
-                        */
-                       if (swap_duplicate(entry) < 0) {
-                               set_pte_at(mm, address, pte, pteval);
-                               ret = SWAP_FAIL;
-                               goto out_unmap;
-                       }
-                       if (list_empty(&mm->mmlist)) {
-                               spin_lock(&mmlist_lock);
-                               if (list_empty(&mm->mmlist))
-                                       list_add(&mm->mmlist, &init_mm.mmlist);
-                               spin_unlock(&mmlist_lock);
-                       }
+               if (!PageDirty(page) && (flags & TTU_FREE)) {
+                       /* It's a freeable page by MADV_FREE */
                        dec_mm_counter(mm, MM_ANONPAGES);
-                       inc_mm_counter(mm, MM_SWAPENTS);
-               } else if (IS_ENABLED(CONFIG_MIGRATION)) {
-                       /*
-                        * Store the pfn of the page in a special migration
-                        * pte. do_swap_page() will wait until the migration
-                        * pte is removed and then restart fault handling.
-                        */
-                       BUG_ON(!(flags & TTU_MIGRATION));
-                       entry = make_migration_entry(page, pte_write(pteval));
+                       goto discard;
                }
+
+               /*
+                * Store the swap location in the pte.
+                * See handle_pte_fault() ...
+                */
+               VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+               if (swap_duplicate(entry) < 0) {
+                       set_pte_at(mm, address, pte, pteval);
+                       ret = SWAP_FAIL;
+                       goto out_unmap;
+               }
+               if (!PageDirty(page))
+                       SetPageDirty(page);
+               if (list_empty(&mm->mmlist)) {
+                       spin_lock(&mmlist_lock);
+                       if (list_empty(&mm->mmlist))
+                               list_add(&mm->mmlist, &init_mm.mmlist);
+                       spin_unlock(&mmlist_lock);
+               }
+               dec_mm_counter(mm, MM_ANONPAGES);
+               inc_mm_counter(mm, MM_SWAPENTS);
                swp_pte = swp_entry_to_pte(entry);
                if (pte_soft_dirty(pteval))
                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
                set_pte_at(mm, address, pte, swp_pte);
-       } else if (IS_ENABLED(CONFIG_MIGRATION) &&
-                  (flags & TTU_MIGRATION)) {
-               /* Establish migration entry for a file page */
-               swp_entry_t entry;
-               entry = make_migration_entry(page, pte_write(pteval));
-               set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
        } else
                dec_mm_counter(mm, MM_FILEPAGES);
 
-       page_remove_rmap(page);
+discard:
+       page_remove_rmap(page, false);
        page_cache_release(page);
 
 out_unmap:
        pte_unmap_unlock(pte, ptl);
-       if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
+       if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK))
                mmu_notifier_invalidate_page(mm, address);
 out:
        return ret;
-
-out_mlock:
-       pte_unmap_unlock(pte, ptl);
-
-
-       /*
-        * We need mmap_sem locking, Otherwise VM_LOCKED check makes
-        * unstable result and race. Plus, We can't wait here because
-        * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
-        * if trylock failed, the page remain in evictable lru and later
-        * vmscan could retry to move the page to unevictable lru if the
-        * page is actually mlocked.
-        */
-       if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
-               if (vma->vm_flags & VM_LOCKED) {
-                       mlock_vma_page(page);
-                       ret = SWAP_MLOCK;
-               }
-               up_read(&vma->vm_mm->mmap_sem);
-       }
-       return ret;
 }
 
 bool is_vma_temporary_stack(struct vm_area_struct *vma)
@@ -1607,6 +1656,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
                struct vm_area_struct *vma = avc->vma;
                unsigned long address = vma_address(page, vma);
 
+               cond_resched();
+
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;
 
@@ -1656,6 +1707,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                unsigned long address = vma_address(page, vma);
 
+               cond_resched();
+
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;
 
@@ -1713,7 +1766,7 @@ void hugepage_add_anon_rmap(struct page *page,
        BUG_ON(!PageLocked(page));
        BUG_ON(!anon_vma);
        /* address might be in next vma when migration races vma_adjust */
-       first = atomic_inc_and_test(&page->_mapcount);
+       first = atomic_inc_and_test(compound_mapcount_ptr(page));
        if (first)
                __hugepage_set_anon_rmap(page, vma, address, 0);
 }
@@ -1722,7 +1775,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
 {
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-       atomic_set(&page->_mapcount, 0);
+       atomic_set(compound_mapcount_ptr(page), 0);
        __hugepage_set_anon_rmap(page, vma, address, 1);
 }
 #endif /* CONFIG_HUGETLB_PAGE */
index 48ce82926d931bef026baf16a971add9d00c45fd..9de8fb0c5beea4c225b13e4e8e3ba429827b45af 100644 (file)
@@ -73,6 +73,8 @@ static struct vfsmount *shm_mnt;
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
+#include "internal.h"
+
 #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
 #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
 
@@ -721,7 +723,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
         * the shmem_swaplist_mutex which might hold up shmem_writepage().
         * Charged back to the user (not to caller) when swap account is used.
         */
-       error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
+       error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
+                       false);
        if (error)
                goto out;
        /* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -744,9 +747,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
        if (error) {
                if (error != -ENOMEM)
                        error = 0;
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, false);
        } else
-               mem_cgroup_commit_charge(page, memcg, true);
+               mem_cgroup_commit_charge(page, memcg, true, false);
 out:
        unlock_page(page);
        page_cache_release(page);
@@ -996,7 +999,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
        copy_highpage(newpage, oldpage);
        flush_dcache_page(newpage);
 
-       __set_page_locked(newpage);
+       __SetPageLocked(newpage);
        SetPageUptodate(newpage);
        SetPageSwapBacked(newpage);
        set_page_private(newpage, swap_index);
@@ -1023,7 +1026,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
                 */
                oldpage = newpage;
        } else {
-               mem_cgroup_migrate(oldpage, newpage, true);
+               mem_cgroup_replace_page(oldpage, newpage);
                lru_cache_add_anon(newpage);
                *pagep = newpage;
        }
@@ -1129,7 +1132,8 @@ repeat:
                                goto failed;
                }
 
-               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+                               false);
                if (!error) {
                        error = shmem_add_to_page_cache(page, mapping, index,
                                                swp_to_radix_entry(swap));
@@ -1146,14 +1150,14 @@ repeat:
                         * "repeat": reading a hole and writing should succeed.
                         */
                        if (error) {
-                               mem_cgroup_cancel_charge(page, memcg);
+                               mem_cgroup_cancel_charge(page, memcg, false);
                                delete_from_swap_cache(page);
                        }
                }
                if (error)
                        goto failed;
 
-               mem_cgroup_commit_charge(page, memcg, true);
+               mem_cgroup_commit_charge(page, memcg, true, false);
 
                spin_lock(&info->lock);
                info->swapped--;
@@ -1188,11 +1192,12 @@ repeat:
                }
 
                __SetPageSwapBacked(page);
-               __set_page_locked(page);
+               __SetPageLocked(page);
                if (sgp == SGP_WRITE)
                        __SetPageReferenced(page);
 
-               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+               error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+                               false);
                if (error)
                        goto decused;
                error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1202,10 +1207,10 @@ repeat:
                        radix_tree_preload_end();
                }
                if (error) {
-                       mem_cgroup_cancel_charge(page, memcg);
+                       mem_cgroup_cancel_charge(page, memcg, false);
                        goto decused;
                }
-               mem_cgroup_commit_charge(page, memcg, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_anon(page);
 
                spin_lock(&info->lock);
index 4fcc5dd8d5a6c2776ac2f88d9011ac23c78fb769..2cf30e3124226ee8ab9482b0db8f07ad3b9c127c 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1030,12 +1030,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 }
 
 /*
- * Construct gfp mask to allocate from a specific node but do not invoke reclaim
- * or warn about failures.
+ * Construct gfp mask to allocate from a specific node but do not direct reclaim
+ * or warn about failures. kswapd may still wake to reclaim in the background.
  */
 static inline gfp_t gfp_exact_node(gfp_t flags)
 {
-       return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
+       return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
 }
 #endif
 
@@ -1592,16 +1592,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
                flags |= __GFP_RECLAIMABLE;
 
-       if (memcg_charge_slab(cachep, flags, cachep->gfporder))
-               return NULL;
-
        page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
        if (!page) {
-               memcg_uncharge_slab(cachep, cachep->gfporder);
                slab_out_of_memory(cachep, flags, nodeid);
                return NULL;
        }
 
+       if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) {
+               __free_pages(page, cachep->gfporder);
+               return NULL;
+       }
+
        /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
        if (page_is_pfmemalloc(page))
                pfmemalloc_active = true;
@@ -1653,8 +1654,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
 
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += nr_freed;
-       __free_pages(page, cachep->gfporder);
-       memcg_uncharge_slab(cachep, cachep->gfporder);
+       __free_kmem_pages(page, cachep->gfporder);
 }
 
 static void kmem_rcu_free(struct rcu_head *head)
@@ -1888,21 +1888,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
 
        freelist = page->freelist;
        slab_destroy_debugcheck(cachep, page);
-       if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
-               struct rcu_head *head;
-
-               /*
-                * RCU free overloads the RCU head over the LRU.
-                * slab_page has been overloeaded over the LRU,
-                * however it is not used from now on so that
-                * we can use it safely.
-                */
-               head = (void *)&page->rcu_head;
-               call_rcu(head, kmem_rcu_free);
-
-       } else {
+       if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+               call_rcu(&page->rcu_head, kmem_rcu_free);
+       else
                kmem_freepages(cachep, page);
-       }
 
        /*
         * From now on, we don't use freelist
@@ -2632,7 +2621,7 @@ static int cache_grow(struct kmem_cache *cachep,
 
        offset *= cachep->colour_off;
 
-       if (local_flags & __GFP_WAIT)
+       if (gfpflags_allow_blocking(local_flags))
                local_irq_enable();
 
        /*
@@ -2662,7 +2651,7 @@ static int cache_grow(struct kmem_cache *cachep,
 
        cache_init_objs(cachep, page);
 
-       if (local_flags & __GFP_WAIT)
+       if (gfpflags_allow_blocking(local_flags))
                local_irq_disable();
        check_irq_off();
        spin_lock(&n->list_lock);
@@ -2676,7 +2665,7 @@ static int cache_grow(struct kmem_cache *cachep,
 opps1:
        kmem_freepages(cachep, page);
 failed:
-       if (local_flags & __GFP_WAIT)
+       if (gfpflags_allow_blocking(local_flags))
                local_irq_disable();
        return 0;
 }
@@ -2868,7 +2857,7 @@ force_grow:
 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
                                                gfp_t flags)
 {
-       might_sleep_if(flags & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(flags));
 #if DEBUG
        kmem_flagcheck(cachep, flags);
 #endif
@@ -3056,11 +3045,11 @@ retry:
                 */
                struct page *page;
 
-               if (local_flags & __GFP_WAIT)
+               if (gfpflags_allow_blocking(local_flags))
                        local_irq_enable();
                kmem_flagcheck(cache, flags);
                page = kmem_getpages(cache, local_flags, numa_mem_id());
-               if (local_flags & __GFP_WAIT)
+               if (gfpflags_allow_blocking(local_flags))
                        local_irq_disable();
                if (page) {
                        /*
@@ -3241,11 +3230,15 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 #endif /* CONFIG_NUMA */
 
 static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller,
+          bool irq_off_needed)
 {
        unsigned long save_flags;
        void *objp;
 
+       /* The compiler must be able to remove the irq_off_needed branches */
+       BUILD_BUG_ON(!__builtin_constant_p(irq_off_needed));
+
        flags &= gfp_allowed_mask;
 
        lockdep_trace_alloc(flags);
@@ -3256,9 +3249,11 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
        cachep = memcg_kmem_get_cache(cachep, flags);
 
        cache_alloc_debugcheck_before(cachep, flags);
-       local_irq_save(save_flags);
+       if (irq_off_needed)
+               local_irq_save(save_flags);
        objp = __do_cache_alloc(cachep, flags);
-       local_irq_restore(save_flags);
+       if (irq_off_needed)
+               local_irq_restore(save_flags);
        objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
        kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
                                 flags);
@@ -3414,7 +3409,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
  */
 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
-       void *ret = slab_alloc(cachep, flags, _RET_IP_);
+       void *ret = slab_alloc(cachep, flags, _RET_IP_, true);
 
        trace_kmem_cache_alloc(_RET_IP_, ret,
                               cachep->object_size, cachep->size, flags);
@@ -3423,16 +3418,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
-{
-       __kmem_cache_free_bulk(s, size, p);
-}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
-
+/* Note that interrupts must be enabled when calling this function. */
 bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
-                                                               void **p)
+                          void **p)
 {
-       return __kmem_cache_alloc_bulk(s, flags, size, p);
+       size_t i;
+
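+       /* Disable IRQs once for the whole batch instead of per allocation */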
+       local_irq_disable();
+       for (i = 0; i < size; i++) {
+               void *x = p[i] = slab_alloc(s, flags, _RET_IP_, false);
+
+               if (!x) {
+                       /* Re-enable IRQs before the fallback free and bail out */
+                       local_irq_enable();
+                       __kmem_cache_free_bulk(s, i, p);
+                       return false;
+               }
+       }
+       local_irq_enable();
+       return true;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_bulk);
 
@@ -3442,7 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
 {
        void *ret;
 
-       ret = slab_alloc(cachep, flags, _RET_IP_);
+       ret = slab_alloc(cachep, flags, _RET_IP_, true);
 
        trace_kmalloc(_RET_IP_, ret,
                      size, cachep->size, flags);
@@ -3533,7 +3535,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
        cachep = kmalloc_slab(size, flags);
        if (unlikely(ZERO_OR_NULL_PTR(cachep)))
                return cachep;
-       ret = slab_alloc(cachep, flags, caller);
+       ret = slab_alloc(cachep, flags, caller, true);
 
        trace_kmalloc(caller, ret,
                      size, cachep->size, flags);
@@ -3553,32 +3555,56 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
 }
 EXPORT_SYMBOL(__kmalloc_track_caller);
 
-/**
- * kmem_cache_free - Deallocate an object
- * @cachep: The cache the allocation was from.
- * @objp: The previously allocated object.
- *
- * Free an object which was previously allocated from this
- * cache.
- */
-void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+/* Caller must have local IRQs disabled when irq_off_needed is false */
+static __always_inline void __kmem_cache_free(struct kmem_cache *cachep,
+                                             void *objp, bool irq_off_needed)
 {
        unsigned long flags;
+
+       /* The compiler must be able to remove the irq_off_needed branches */
+       BUILD_BUG_ON(!__builtin_constant_p(irq_off_needed));
+
        cachep = cache_from_obj(cachep, objp);
        if (!cachep)
                return;
 
-       local_irq_save(flags);
+       if (irq_off_needed)
+               local_irq_save(flags);
        debug_check_no_locks_freed(objp, cachep->object_size);
        if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
                debug_check_no_obj_freed(objp, cachep->object_size);
        __cache_free(cachep, objp, _RET_IP_);
-       local_irq_restore(flags);
+       if (irq_off_needed)
+               local_irq_restore(flags);
+}
 
+/**
+ * kmem_cache_free - Deallocate an object
+ * @cachep: The cache the allocation was from.
+ * @objp: The previously allocated object.
+ *
+ * Free an object which was previously allocated from this
+ * cache.
+ */
+void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+{
+       __kmem_cache_free(cachep, objp, true);
        trace_kmem_cache_free(_RET_IP_, objp);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+       size_t i;
+
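+       /* Disable IRQs once for the whole batch instead of per free */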
+       local_irq_disable();
+       for (i = 0; i < size; i++)
+               __kmem_cache_free(s, p[i], false);
+       local_irq_enable();
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
 /**
  * kfree - free previously allocated memory
  * @objp: pointer returned by kmalloc.
index a3a967d7d7c27f1fec35acdfb3e07eb6f33f6fc3..27492eb678f722400147e22da62a68de893497e8 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -181,10 +181,6 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
        list_for_each_entry(iter, &(root)->memcg_params.list, \
                            memcg_params.list)
 
-#define for_each_memcg_cache_safe(iter, tmp, root) \
-       list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
-                                memcg_params.list)
-
 static inline bool is_root_cache(struct kmem_cache *s)
 {
        return s->memcg_params.is_root_cache;
@@ -240,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
        return s->memcg_params.root_cache;
 }
 
-static __always_inline int memcg_charge_slab(struct kmem_cache *s,
-                                            gfp_t gfp, int order)
+static __always_inline int memcg_charge_slab(struct page *page,
+                                            gfp_t gfp, int order,
+                                            struct kmem_cache *s)
 {
        if (!memcg_kmem_enabled())
                return 0;
        if (is_root_cache(s))
                return 0;
-       return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
-}
-
-static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
-{
-       if (!memcg_kmem_enabled())
-               return;
-       if (is_root_cache(s))
-               return;
-       memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
+       return __memcg_kmem_charge_memcg(page, gfp, order,
+                                        s->memcg_params.memcg);
 }
 
 extern void slab_init_memcg_params(struct kmem_cache *);
@@ -265,8 +254,6 @@ extern void slab_init_memcg_params(struct kmem_cache *);
 
 #define for_each_memcg_cache(iter, root) \
        for ((void)(iter), (void)(root); 0; )
-#define for_each_memcg_cache_safe(iter, tmp, root) \
-       for ((void)(iter), (void)(tmp), (void)(root); 0; )
 
 static inline bool is_root_cache(struct kmem_cache *s)
 {
@@ -295,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
        return s;
 }
 
-static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
+                                   struct kmem_cache *s)
 {
        return 0;
 }
 
-static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
-{
-}
-
 static inline void slab_init_memcg_params(struct kmem_cache *s)
 {
 }
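
The reworked memcg_charge_slab() takes the freshly allocated page, so slab allocators now charge after the page allocation and give the page back if charging fails; the mm/slub.c hunk further down applies it roughly like this:

        page = alloc_pages(flags, order);
        if (page && memcg_charge_slab(page, flags, order, s)) {
                __free_pages(page, order);      /* charge failed, undo the allocation */
                page = NULL;
        }
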
index 5ce4faeb16fbbdfa19b16c3d551aace8c3a495c0..d88e97c10a2e31669753681efc74f6bf50fdf2ef 100644 (file)
@@ -316,10 +316,10 @@ unsigned long calculate_alignment(unsigned long flags,
        return ALIGN(align, sizeof(void *));
 }
 
-static struct kmem_cache *
-do_kmem_cache_create(const char *name, size_t object_size, size_t size,
-                    size_t align, unsigned long flags, void (*ctor)(void *),
-                    struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+static struct kmem_cache *create_cache(const char *name,
+               size_t object_size, size_t size, size_t align,
+               unsigned long flags, void (*ctor)(void *),
+               struct mem_cgroup *memcg, struct kmem_cache *root_cache)
 {
        struct kmem_cache *s;
        int err;
@@ -384,7 +384,7 @@ struct kmem_cache *
 kmem_cache_create(const char *name, size_t size, size_t align,
                  unsigned long flags, void (*ctor)(void *))
 {
-       struct kmem_cache *s;
+       struct kmem_cache *s = NULL;
        const char *cache_name;
        int err;
 
@@ -396,7 +396,6 @@ kmem_cache_create(const char *name, size_t size, size_t align,
 
        err = kmem_cache_sanity_check(name, size);
        if (err) {
-               s = NULL;       /* suppress uninit var warning */
                goto out_unlock;
        }
 
@@ -418,9 +417,9 @@ kmem_cache_create(const char *name, size_t size, size_t align,
                goto out_unlock;
        }
 
-       s = do_kmem_cache_create(cache_name, size, size,
-                                calculate_alignment(flags, align, size),
-                                flags, ctor, NULL, NULL);
+       s = create_cache(cache_name, size, size,
+                        calculate_alignment(flags, align, size),
+                        flags, ctor, NULL, NULL);
        if (IS_ERR(s)) {
                err = PTR_ERR(s);
                kfree_const(cache_name);
@@ -448,29 +447,20 @@ out_unlock:
 }
 EXPORT_SYMBOL(kmem_cache_create);
 
-static int do_kmem_cache_shutdown(struct kmem_cache *s,
+static int shutdown_cache(struct kmem_cache *s,
                struct list_head *release, bool *need_rcu_barrier)
 {
-       if (__kmem_cache_shutdown(s) != 0) {
-               printk(KERN_ERR "kmem_cache_destroy %s: "
-                      "Slab cache still has objects\n", s->name);
-               dump_stack();
+       if (__kmem_cache_shutdown(s) != 0)
                return -EBUSY;
-       }
 
        if (s->flags & SLAB_DESTROY_BY_RCU)
                *need_rcu_barrier = true;
 
-#ifdef CONFIG_MEMCG_KMEM
-       if (!is_root_cache(s))
-               list_del(&s->memcg_params.list);
-#endif
        list_move(&s->list, release);
        return 0;
 }
 
-static void do_kmem_cache_release(struct list_head *release,
-                                 bool need_rcu_barrier)
+static void release_caches(struct list_head *release, bool need_rcu_barrier)
 {
        struct kmem_cache *s, *s2;
 
@@ -536,10 +526,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
        if (!cache_name)
                goto out_unlock;
 
-       s = do_kmem_cache_create(cache_name, root_cache->object_size,
-                                root_cache->size, root_cache->align,
-                                root_cache->flags, root_cache->ctor,
-                                memcg, root_cache);
+       s = create_cache(cache_name, root_cache->object_size,
+                        root_cache->size, root_cache->align,
+                        root_cache->flags, root_cache->ctor,
+                        memcg, root_cache);
        /*
         * If we could not create a memcg cache, do not complain, because
         * that's not critical at all as we can always proceed with the root
@@ -598,6 +588,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
        put_online_cpus();
 }
 
+static int __shutdown_memcg_cache(struct kmem_cache *s,
+               struct list_head *release, bool *need_rcu_barrier)
+{
+       BUG_ON(is_root_cache(s));
+
+       if (shutdown_cache(s, release, need_rcu_barrier))
+               return -EBUSY;
+
+       list_del(&s->memcg_params.list);
+       return 0;
+}
+
 void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
 {
        LIST_HEAD(release);
@@ -615,14 +617,76 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
                 * The cgroup is about to be freed and therefore has no charges
                 * left. Hence, all its caches must be empty by now.
                 */
-               BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
+               BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier));
        }
        mutex_unlock(&slab_mutex);
 
        put_online_mems();
        put_online_cpus();
 
-       do_kmem_cache_release(&release, need_rcu_barrier);
+       release_caches(&release, need_rcu_barrier);
+}
+
+static int shutdown_memcg_caches(struct kmem_cache *s,
+               struct list_head *release, bool *need_rcu_barrier)
+{
+       struct memcg_cache_array *arr;
+       struct kmem_cache *c, *c2;
+       LIST_HEAD(busy);
+       int i;
+
+       BUG_ON(!is_root_cache(s));
+
+       /*
+        * First, shutdown active caches, i.e. caches that belong to online
+        * First, shut down active caches, i.e. caches that belong to online
+        */
+       arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+                                       lockdep_is_held(&slab_mutex));
+       for_each_memcg_cache_index(i) {
+               c = arr->entries[i];
+               if (!c)
+                       continue;
+               if (__shutdown_memcg_cache(c, release, need_rcu_barrier))
+                       /*
+                        * The cache still has objects. Move it to a temporary
+                        * list so as not to try to destroy it for a second
+                        * time while iterating over inactive caches below.
+                        */
+                       list_move(&c->memcg_params.list, &busy);
+               else
+                       /*
+                        * The cache is empty and will be destroyed soon. Clear
+                        * the pointer to it in the memcg_caches array so that
+                        * it will never be accessed even if the root cache
+                        * stays alive.
+                        */
+                       arr->entries[i] = NULL;
+       }
+
+       /*
+        * Second, shut down all caches left from memory cgroups that are now
+        * offline.
+        */
+       list_for_each_entry_safe(c, c2, &s->memcg_params.list,
+                                memcg_params.list)
+               __shutdown_memcg_cache(c, release, need_rcu_barrier);
+
+       list_splice(&busy, &s->memcg_params.list);
+
+       /*
+        * A cache being destroyed must be empty. In particular, this means
+        * that all per memcg caches attached to it must be empty too.
+        */
+       if (!list_empty(&s->memcg_params.list))
+               return -EBUSY;
+       return 0;
+}
+#else
+static inline int shutdown_memcg_caches(struct kmem_cache *s,
+               struct list_head *release, bool *need_rcu_barrier)
+{
+       return 0;
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
@@ -635,16 +699,13 @@ void slab_kmem_cache_release(struct kmem_cache *s)
 
 void kmem_cache_destroy(struct kmem_cache *s)
 {
-       struct kmem_cache *c, *c2;
        LIST_HEAD(release);
        bool need_rcu_barrier = false;
-       bool busy = false;
+       int err;
 
        if (unlikely(!s))
                return;
 
-       BUG_ON(!is_root_cache(s));
-
        get_online_cpus();
        get_online_mems();
 
@@ -654,21 +715,22 @@ void kmem_cache_destroy(struct kmem_cache *s)
        if (s->refcount)
                goto out_unlock;
 
-       for_each_memcg_cache_safe(c, c2, s) {
-               if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
-                       busy = true;
-       }
-
-       if (!busy)
-               do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
+       err = shutdown_memcg_caches(s, &release, &need_rcu_barrier);
+       if (!err)
+               err = shutdown_cache(s, &release, &need_rcu_barrier);
 
+       if (err) {
+               pr_err("kmem_cache_destroy %s: "
+                      "Slab cache still has objects\n", s->name);
+               dump_stack();
+       }
 out_unlock:
        mutex_unlock(&slab_mutex);
 
        put_online_mems();
        put_online_cpus();
 
-       do_kmem_cache_release(&release, need_rcu_barrier);
+       release_caches(&release, need_rcu_barrier);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
@@ -692,7 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
-int slab_is_available(void)
+bool slab_is_available(void)
 {
        return slab_state >= UP;
 }
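
The "cache still has objects" diagnostic now lives in kmem_cache_destroy() itself rather than in shutdown_cache(). For context, a minimal user of this API (hypothetical; "foo" and struct foo are made up) looks like:

        struct kmem_cache *foo_cache;
        struct foo *obj;

        foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
                                      SLAB_HWCACHE_ALIGN, NULL);
        obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);
        /* ... */
        kmem_cache_free(foo_cache, obj);
        kmem_cache_destroy(foo_cache);  /* pr_err()s if objects remain allocated */
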
index f614b5dc396bc17b43cebacd97383243bbb03b99..438ebf8bbab1ddaba0e25db37a8f2b1913859b96 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
  */
 static __always_inline void slab_lock(struct page *page)
 {
+       VM_BUG_ON_PAGE(PageTail(page), page);
        bit_spin_lock(PG_locked, &page->flags);
 }
 
 static __always_inline void slab_unlock(struct page *page)
 {
+       VM_BUG_ON_PAGE(PageTail(page), page);
        __bit_spin_unlock(PG_locked, &page->flags);
 }
 
@@ -459,8 +461,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
 /*
  * Debug settings:
  */
-#ifdef CONFIG_SLUB_DEBUG_ON
+#if defined(CONFIG_SLUB_DEBUG_ON)
 static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#elif defined(CONFIG_KASAN)
+static int slub_debug = SLAB_STORE_USER;
 #else
 static int slub_debug;
 #endif
@@ -1063,11 +1067,15 @@ bad:
        return 0;
 }
 
+/* Supports checking bulk free of a constructed freelist */
 static noinline struct kmem_cache_node *free_debug_processing(
-       struct kmem_cache *s, struct page *page, void *object,
+       struct kmem_cache *s, struct page *page,
+       void *head, void *tail, int bulk_cnt,
        unsigned long addr, unsigned long *flags)
 {
        struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+       void *object = head;
+       int cnt = 0;
 
        spin_lock_irqsave(&n->list_lock, *flags);
        slab_lock(page);
@@ -1075,6 +1083,9 @@ static noinline struct kmem_cache_node *free_debug_processing(
        if (!check_slab(s, page))
                goto fail;
 
+next_object:
+       cnt++;
+
        if (!check_valid_pointer(s, page, object)) {
                slab_err(s, page, "Invalid object pointer 0x%p", object);
                goto fail;
@@ -1105,8 +1116,19 @@ static noinline struct kmem_cache_node *free_debug_processing(
        if (s->flags & SLAB_STORE_USER)
                set_track(s, object, TRACK_FREE, addr);
        trace(s, page, object, 0);
+       /* Freepointer is not overwritten by init_object(); SLAB_POISON moved it */
        init_object(s, object, SLUB_RED_INACTIVE);
+
+       /* Reached end of constructed freelist yet? */
+       if (object != tail) {
+               object = get_freepointer(s, object);
+               goto next_object;
+       }
 out:
+       if (cnt != bulk_cnt)
+               slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
+                        bulk_cnt, cnt);
+
        slab_unlock(page);
        /*
         * Keep node_lock to preserve integrity
@@ -1202,7 +1224,7 @@ unsigned long kmem_cache_flags(unsigned long object_size,
 
        return flags;
 }
-#else
+#else /* !CONFIG_SLUB_DEBUG */
 static inline void setup_object_debug(struct kmem_cache *s,
                        struct page *page, void *object) {}
 
@@ -1210,7 +1232,8 @@ static inline int alloc_debug_processing(struct kmem_cache *s,
        struct page *page, void *object, unsigned long addr) { return 0; }
 
 static inline struct kmem_cache_node *free_debug_processing(
-       struct kmem_cache *s, struct page *page, void *object,
+       struct kmem_cache *s, struct page *page,
+       void *head, void *tail, int bulk_cnt,
        unsigned long addr, unsigned long *flags) { return NULL; }
 
 static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
@@ -1263,7 +1286,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
 {
        flags &= gfp_allowed_mask;
        lockdep_trace_alloc(flags);
-       might_sleep_if(flags & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(flags));
 
        if (should_failslab(s->object_size, flags, s->flags))
                return NULL;
@@ -1306,6 +1329,29 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
        kasan_slab_free(s, x);
 }
 
+static inline void slab_free_freelist_hook(struct kmem_cache *s,
+                                          void *head, void *tail)
+{
+/*
+ * The compiler cannot detect that this function can be removed if
+ * slab_free_hook() evaluates to nothing, so catch all relevant debug options here.
+ */
+#if defined(CONFIG_KMEMCHECK) ||               \
+       defined(CONFIG_LOCKDEP) ||              \
+       defined(CONFIG_DEBUG_KMEMLEAK) ||       \
+       defined(CONFIG_DEBUG_OBJECTS_FREE) ||   \
+       defined(CONFIG_KASAN)
+
+       void *object = head;
+       void *tail_obj = tail ? : head;
+
+       do {
+               slab_free_hook(s, object);
+       } while ((object != tail_obj) &&
+                (object = get_freepointer(s, object)));
+#endif
+}
+
 static void setup_object(struct kmem_cache *s, struct page *page,
                                void *object)
 {
@@ -1328,16 +1374,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
 
        flags |= __GFP_NOTRACK;
 
-       if (memcg_charge_slab(s, flags, order))
-               return NULL;
-
        if (node == NUMA_NO_NODE)
                page = alloc_pages(flags, order);
        else
                page = __alloc_pages_node(node, flags, order);
 
-       if (!page)
-               memcg_uncharge_slab(s, order);
+       if (page && memcg_charge_slab(page, flags, order, s)) {
+               __free_pages(page, order);
+               page = NULL;
+       }
 
        return page;
 }
@@ -1352,7 +1397,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 
        flags &= gfp_allowed_mask;
 
-       if (flags & __GFP_WAIT)
+       if (gfpflags_allow_blocking(flags))
                local_irq_enable();
 
        flags |= s->allocflags;
@@ -1362,8 +1407,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
         * so we fall-back to the minimum order allocation.
         */
        alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
-       if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
-               alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
+       if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
+               alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
 
        page = alloc_slab_page(s, alloc_gfp, node, oo);
        if (unlikely(!page)) {
@@ -1423,7 +1468,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
        page->frozen = 1;
 
 out:
-       if (flags & __GFP_WAIT)
+       if (gfpflags_allow_blocking(flags))
                local_irq_disable();
        if (!page)
                return NULL;
@@ -1476,8 +1521,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
        page_mapcount_reset(page);
        if (current->reclaim_state)
                current->reclaim_state->reclaimed_slab += pages;
-       __free_pages(page, order);
-       memcg_uncharge_slab(s, order);
+       __free_kmem_pages(page, order);
 }
 
 #define need_reserve_slab_rcu                                          \
@@ -1507,10 +1551,7 @@ static void free_slab(struct kmem_cache *s, struct page *page)
                        VM_BUG_ON(s->reserved != sizeof(*head));
                        head = page_address(page) + offset;
                } else {
-                       /*
-                        * RCU free overloads the RCU head over the LRU
-                        */
-                       head = (void *)&page->lru;
+                       head = &page->rcu_head;
                }
 
                call_rcu(head, rcu_free_slab);
@@ -2298,23 +2339,15 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
  * And if we were unable to get a new slab from the partial slab lists then
  * we need to allocate a new slab. This is the slowest path since it involves
  * a call to the page allocator and the setup of a new slab.
+ *
+ * Version of __slab_alloc to use when we know that interrupts are
+ * already disabled (which is the case for bulk allocation).
  */
-static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                          unsigned long addr, struct kmem_cache_cpu *c)
 {
        void *freelist;
        struct page *page;
-       unsigned long flags;
-
-       local_irq_save(flags);
-#ifdef CONFIG_PREEMPT
-       /*
-        * We may have been preempted and rescheduled on a different
-        * cpu before disabling interrupts. Need to reload cpu area
-        * pointer.
-        */
-       c = this_cpu_ptr(s->cpu_slab);
-#endif
 
        page = c->page;
        if (!page)
@@ -2372,7 +2405,6 @@ load_freelist:
        VM_BUG_ON(!c->page->frozen);
        c->freelist = get_freepointer(s, freelist);
        c->tid = next_tid(c->tid);
-       local_irq_restore(flags);
        return freelist;
 
 new_slab:
@@ -2389,7 +2421,6 @@ new_slab:
 
        if (unlikely(!freelist)) {
                slab_out_of_memory(s, gfpflags, node);
-               local_irq_restore(flags);
                return NULL;
        }
 
@@ -2405,10 +2436,34 @@ new_slab:
        deactivate_slab(s, page, get_freepointer(s, freelist));
        c->page = NULL;
        c->freelist = NULL;
-       local_irq_restore(flags);
        return freelist;
 }
 
+/*
+ * A wrapper around ___slab_alloc() that disables interrupts and compensates
+ * for a possible cpu change by refetching the per cpu area pointer.
+ */
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+                         unsigned long addr, struct kmem_cache_cpu *c)
+{
+       void *p;
+       unsigned long flags;
+
+       local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+       /*
+        * We may have been preempted and rescheduled on a different
+        * cpu before disabling interrupts. Need to reload cpu area
+        * pointer.
+        */
+       c = this_cpu_ptr(s->cpu_slab);
+#endif
+
+       p = ___slab_alloc(s, gfpflags, node, addr, c);
+       local_irq_restore(flags);
+       return p;
+}
+
 /*
  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
  * have the fastpath folded into their functions. So no function call
@@ -2572,10 +2627,11 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
  * handling required then we can return immediately.
  */
 static void __slab_free(struct kmem_cache *s, struct page *page,
-                       void *x, unsigned long addr)
+                       void *head, void *tail, int cnt,
+                       unsigned long addr)
+
 {
        void *prior;
-       void **object = (void *)x;
        int was_frozen;
        struct page new;
        unsigned long counters;
@@ -2585,7 +2641,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
        stat(s, FREE_SLOWPATH);
 
        if (kmem_cache_debug(s) &&
-               !(n = free_debug_processing(s, page, x, addr, &flags)))
+           !(n = free_debug_processing(s, page, head, tail, cnt,
+                                       addr, &flags)))
                return;
 
        do {
@@ -2595,10 +2652,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
                }
                prior = page->freelist;
                counters = page->counters;
-               set_freepointer(s, object, prior);
+               set_freepointer(s, tail, prior);
                new.counters = counters;
                was_frozen = new.frozen;
-               new.inuse--;
+               new.inuse -= cnt;
                if ((!new.inuse || !prior) && !was_frozen) {
 
                        if (kmem_cache_has_cpu_partial(s) && !prior) {
@@ -2629,7 +2686,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 
        } while (!cmpxchg_double_slab(s, page,
                prior, counters,
-               object, new.counters,
+               head, new.counters,
                "__slab_free"));
 
        if (likely(!n)) {
@@ -2694,15 +2751,20 @@ slab_empty:
  *
  * If fastpath is not possible then fall back to __slab_free where we deal
  * with all sorts of special processing.
+ *
+ * Bulk free of a freelist with several objects (all pointing to the
+ * same page) is possible by specifying a head and tail pointer, plus an
+ * object count (cnt). Bulk free is indicated by the tail pointer being set.
  */
-static __always_inline void slab_free(struct kmem_cache *s,
-                       struct page *page, void *x, unsigned long addr)
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+                                     void *head, void *tail, int cnt,
+                                     unsigned long addr)
 {
-       void **object = (void *)x;
+       void *tail_obj = tail ? : head;
        struct kmem_cache_cpu *c;
        unsigned long tid;
 
-       slab_free_hook(s, x);
+       slab_free_freelist_hook(s, head, tail);
 
 redo:
        /*
@@ -2721,19 +2783,19 @@ redo:
        barrier();
 
        if (likely(page == c->page)) {
-               set_freepointer(s, object, c->freelist);
+               set_freepointer(s, tail_obj, c->freelist);
 
                if (unlikely(!this_cpu_cmpxchg_double(
                                s->cpu_slab->freelist, s->cpu_slab->tid,
                                c->freelist, tid,
-                               object, next_tid(tid)))) {
+                               head, next_tid(tid)))) {
 
                        note_cmpxchg_failure("slab_free", s, tid);
                        goto redo;
                }
                stat(s, FREE_FASTPATH);
        } else
-               __slab_free(s, page, x, addr);
+               __slab_free(s, page, head, tail_obj, cnt, addr);
 
 }
 
@@ -2742,49 +2804,98 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
        s = cache_from_obj(s, x);
        if (!s)
                return;
-       slab_free(s, virt_to_head_page(x), x, _RET_IP_);
+       slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
        trace_kmem_cache_free(_RET_IP_, x);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
-/* Note that interrupts must be enabled when calling this function. */
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
-{
-       struct kmem_cache_cpu *c;
+struct detached_freelist {
        struct page *page;
-       int i;
+       void *tail;
+       void *freelist;
+       int cnt;
+};
 
-       local_irq_disable();
-       c = this_cpu_ptr(s->cpu_slab);
+/*
+ * This function progressively scans the array of free objects (with
+ * a limited look-ahead) and extracts objects belonging to the same
+ * page.  It builds a detached freelist directly within the given
+ * page/objects.  This can happen without any need for
+ * synchronization, because the objects are owned by the running
+ * process.  The freelist is built up as a singly linked list in the
+ * objects themselves.  The idea is that this detached freelist can
+ * then be bulk transferred to the real freelist(s) while requiring
+ * only a single synchronization primitive.  Look-ahead in the array
+ * is limited for performance reasons.
+ */
+static int build_detached_freelist(struct kmem_cache *s, size_t size,
+                                  void **p, struct detached_freelist *df)
+{
+       size_t first_skipped_index = 0;
+       int lookahead = 3;
+       void *object;
 
-       for (i = 0; i < size; i++) {
-               void *object = p[i];
+       /* Always re-init detached_freelist */
+       df->page = NULL;
 
-               BUG_ON(!object);
-               /* kmem cache debug support */
-               s = cache_from_obj(s, object);
-               if (unlikely(!s))
-                       goto exit;
-               slab_free_hook(s, object);
+       do {
+               object = p[--size];
+       } while (!object && size);
+
+       if (!object)
+               return 0;
 
-               page = virt_to_head_page(object);
+       /* Start new detached freelist */
+       set_freepointer(s, object, NULL);
+       df->page = virt_to_head_page(object);
+       df->tail = object;
+       df->freelist = object;
+       p[size] = NULL; /* mark object processed */
+       df->cnt = 1;
+
+       while (size) {
+               object = p[--size];
+               if (!object)
+                       continue; /* Skip processed objects */
+
+               /* df->page is always set at this point */
+               if (df->page == virt_to_head_page(object)) {
+                       /* Opportunistically build the freelist */
+                       set_freepointer(s, object, df->freelist);
+                       df->freelist = object;
+                       df->cnt++;
+                       p[size] = NULL; /* mark object processed */
 
-               if (c->page == page) {
-                       /* Fastpath: local CPU free */
-                       set_freepointer(s, object, c->freelist);
-                       c->freelist = object;
-               } else {
-                       c->tid = next_tid(c->tid);
-                       local_irq_enable();
-                       /* Slowpath: overhead locked cmpxchg_double_slab */
-                       __slab_free(s, page, object, _RET_IP_);
-                       local_irq_disable();
-                       c = this_cpu_ptr(s->cpu_slab);
+                       continue;
                }
+
+               /* Limit look ahead search */
+               if (!--lookahead)
+                       break;
+
+               if (!first_skipped_index)
+                       first_skipped_index = size + 1;
        }
-exit:
-       c->tid = next_tid(c->tid);
-       local_irq_enable();
+
+       return first_skipped_index;
+}
+
+
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+       if (WARN_ON(!size))
+               return;
+
+       do {
+               struct detached_freelist df;
+
+               size = build_detached_freelist(s, size, p, &df);
+               if (unlikely(!df.page))
+                       continue;
+
+               slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
+       } while (likely(size));
 }
 EXPORT_SYMBOL(kmem_cache_free_bulk);
 
@@ -2807,30 +2918,23 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                void *object = c->freelist;
 
                if (unlikely(!object)) {
-                       local_irq_enable();
                        /*
                         * Invoking slow path likely have side-effect
                         * of re-populating per CPU c->freelist
                         */
-                       p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
+                       p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
                                            _RET_IP_, c);
-                       if (unlikely(!p[i])) {
-                               __kmem_cache_free_bulk(s, i, p);
-                               return false;
-                       }
-                       local_irq_disable();
+                       if (unlikely(!p[i]))
+                               goto error;
+
                        c = this_cpu_ptr(s->cpu_slab);
                        continue; /* goto for-loop */
                }
 
                /* kmem_cache debug support */
                s = slab_pre_alloc_hook(s, flags);
-               if (unlikely(!s)) {
-                       __kmem_cache_free_bulk(s, i, p);
-                       c->tid = next_tid(c->tid);
-                       local_irq_enable();
-                       return false;
-               }
+               if (unlikely(!s))
+                       goto error;
 
                c->freelist = get_freepointer(s, object);
                p[i] = object;
@@ -2850,6 +2954,11 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
        }
 
        return true;
+
+error:
+       __kmem_cache_free_bulk(s, i, p);
+       local_irq_enable();
+       return false;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_bulk);
 
@@ -2912,20 +3021,15 @@ static inline int slab_order(int size, int min_objects,
        if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
                return get_order(size * MAX_OBJS_PER_PAGE) - 1;
 
-       for (order = max(min_order,
-                               fls(min_objects * size - 1) - PAGE_SHIFT);
+       for (order = max(min_order, get_order(min_objects * size + reserved));
                        order <= max_order; order++) {
 
                unsigned long slab_size = PAGE_SIZE << order;
 
-               if (slab_size < min_objects * size + reserved)
-                       continue;
-
                rem = (slab_size - reserved) % size;
 
                if (rem <= slab_size / fract_leftover)
                        break;
-
        }
 
        return order;
@@ -2943,7 +3047,7 @@ static inline int calculate_order(int size, int reserved)
         * works by first attempting to generate a layout with
         * the best configuration and backing off gradually.
         *
-        * First we reduce the acceptable waste in a slab. Then
+        * First we increase the acceptable waste in a slab. Then
         * we reduce the minimum objects required in a slab.
         */
        min_objects = slub_min_objects;
@@ -3519,7 +3623,7 @@ void kfree(const void *x)
                __free_kmem_pages(page, compound_order(page));
                return;
        }
-       slab_free(page->slab_cache, page, object, _RET_IP_);
+       slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
 }
 EXPORT_SYMBOL(kfree);
 
index 983f692a47fdfbb80505fa77f673b9af37d08739..674e2c93da4e958780abf8b4c6de11c8407ef25e 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -45,6 +45,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 
 /*
  * This path almost never happens for VM activity - pages are normally
@@ -89,260 +90,14 @@ static void __put_compound_page(struct page *page)
        (*dtor)(page);
 }
 
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- *    PageHeadHuge will remain true until the compound page
- *    is released and enters the buddy allocator, and it could
- *    not be split by __split_huge_page_refcount().
- *
- *    So if we see PageHeadHuge set, and we have the tail page pin,
- *    then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- *    PG_slab is cleared before the slab frees the head page, and
- *    tail pin cannot be the last reference left on the head page,
- *    because the slab code is free to reuse the compound page
- *    after a kfree/kmem_cache_free without having to check if
- *    there's any tail pin left.  In turn all tail pins must be always
- *    released while the head is still pinned by the slab code
- *    and so we know PG_slab will be still set too.
- *
- *    So if we see PageSlab set, and we have the tail page pin,
- *    then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
-       /*
-        * If @page is a THP tail, we must read the tail page
-        * flags after the head page flags. The
-        * __split_huge_page_refcount side enforces write memory barriers
-        * between clearing PageTail and before the head page
-        * can be freed and reallocated.
-        */
-       smp_rmb();
-       if (likely(PageTail(page))) {
-               /*
-                * __split_huge_page_refcount cannot race
-                * here, see the comment above this function.
-                */
-               VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-               if (put_page_testzero(page_head)) {
-                       /*
-                        * If this is the tail of a slab THP page,
-                        * the tail pin must not be the last reference
-                        * held on the page, because the PG_slab cannot
-                        * be cleared before all tail pins (which skips
-                        * the _mapcount tail refcounting) have been
-                        * released.
-                        *
-                        * If this is the tail of a hugetlbfs page,
-                        * the tail pin may be the last reference on
-                        * the page instead, because PageHeadHuge will
-                        * not go away until the compound page enters
-                        * the buddy allocator.
-                        */
-                       VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
-                       __put_compound_page(page_head);
-               }
-       } else
-               /*
-                * __split_huge_page_refcount run before us,
-                * @page was a THP tail. The split @page_head
-                * has been freed and reallocated as slab or
-                * hugetlbfs page of smaller order (only
-                * possible if reallocated as slab on x86).
-                */
-               if (put_page_testzero(page))
-                       __put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
-       if (likely(page != page_head && get_page_unless_zero(page_head))) {
-               unsigned long flags;
-
-               /*
-                * @page_head wasn't a dangling pointer but it may not
-                * be a head page anymore by the time we obtain the
-                * lock. That is ok as long as it can't be freed from
-                * under us.
-                */
-               flags = compound_lock_irqsave(page_head);
-               if (unlikely(!PageTail(page))) {
-                       /* __split_huge_page_refcount run before us */
-                       compound_unlock_irqrestore(page_head, flags);
-                       if (put_page_testzero(page_head)) {
-                               /*
-                                * The @page_head may have been freed
-                                * and reallocated as a compound page
-                                * of smaller order and then freed
-                                * again.  All we know is that it
-                                * cannot have become: a THP page, a
-                                * compound page of higher order, a
-                                * tail page.  That is because we
-                                * still hold the refcount of the
-                                * split THP tail and page_head was
-                                * the THP head before the split.
-                                */
-                               if (PageHead(page_head))
-                                       __put_compound_page(page_head);
-                               else
-                                       __put_single_page(page_head);
-                       }
-out_put_single:
-                       if (put_page_testzero(page))
-                               __put_single_page(page);
-                       return;
-               }
-               VM_BUG_ON_PAGE(page_head != page->first_page, page);
-               /*
-                * We can release the refcount taken by
-                * get_page_unless_zero() now that
-                * __split_huge_page_refcount() is blocked on the
-                * compound_lock.
-                */
-               if (put_page_testzero(page_head))
-                       VM_BUG_ON_PAGE(1, page_head);
-               /* __split_huge_page_refcount will wait now */
-               VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
-               atomic_dec(&page->_mapcount);
-               VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
-               VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
-               compound_unlock_irqrestore(page_head, flags);
-
-               if (put_page_testzero(page_head)) {
-                       if (PageHead(page_head))
-                               __put_compound_page(page_head);
-                       else
-                               __put_single_page(page_head);
-               }
-       } else {
-               /* @page_head is a dangling pointer */
-               VM_BUG_ON_PAGE(PageTail(page), page);
-               goto out_put_single;
-       }
-}
-
-static void put_compound_page(struct page *page)
-{
-       struct page *page_head;
-
-       /*
-        * We see the PageCompound set and PageTail not set, so @page maybe:
-        *  1. hugetlbfs head page, or
-        *  2. THP head page.
-        */
-       if (likely(!PageTail(page))) {
-               if (put_page_testzero(page)) {
-                       /*
-                        * By the time all refcounts have been released
-                        * split_huge_page cannot run anymore from under us.
-                        */
-                       if (PageHead(page))
-                               __put_compound_page(page);
-                       else
-                               __put_single_page(page);
-               }
-               return;
-       }
-
-       /*
-        * We see the PageCompound set and PageTail set, so @page maybe:
-        *  1. a tail hugetlbfs page, or
-        *  2. a tail THP page, or
-        *  3. a split THP page.
-        *
-        *  Case 3 is possible, as we may race with
-        *  __split_huge_page_refcount tearing down a THP page.
-        */
-       page_head = compound_head_by_tail(page);
-       if (!__compound_tail_refcounted(page_head))
-               put_unrefcounted_compound_page(page_head, page);
-       else
-               put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
 {
        if (unlikely(PageCompound(page)))
-               put_compound_page(page);
-       else if (put_page_testzero(page))
+               __put_compound_page(page);
+       else
                __put_single_page(page);
 }
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
-       /*
-        * This takes care of get_page() if run on a tail page
-        * returned by one of the get_user_pages/follow_page variants.
-        * get_user_pages/follow_page itself doesn't need the compound
-        * lock because it runs __get_page_tail_foll() under the
-        * proper PT lock that already serializes against
-        * split_huge_page().
-        */
-       unsigned long flags;
-       bool got;
-       struct page *page_head = compound_head(page);
-
-       /* Ref to put_compound_page() comment. */
-       if (!__compound_tail_refcounted(page_head)) {
-               smp_rmb();
-               if (likely(PageTail(page))) {
-                       /*
-                        * This is a hugetlbfs page or a slab
-                        * page. __split_huge_page_refcount
-                        * cannot race here.
-                        */
-                       VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-                       __get_page_tail_foll(page, true);
-                       return true;
-               } else {
-                       /*
-                        * __split_huge_page_refcount run
-                        * before us, "page" was a THP
-                        * tail. The split page_head has been
-                        * freed and reallocated as slab or
-                        * hugetlbfs page of smaller order
-                        * (only possible if reallocated as
-                        * slab on x86).
-                        */
-                       return false;
-               }
-       }
-
-       got = false;
-       if (likely(page != page_head && get_page_unless_zero(page_head))) {
-               /*
-                * page_head wasn't a dangling pointer but it
-                * may not be a head page anymore by the time
-                * we obtain the lock. That is ok as long as it
-                * can't be freed from under us.
-                */
-               flags = compound_lock_irqsave(page_head);
-               /* here __split_huge_page_refcount won't run anymore */
-               if (likely(PageTail(page))) {
-                       __get_page_tail_foll(page, false);
-                       got = true;
-               }
-               compound_unlock_irqrestore(page_head, flags);
-               if (unlikely(!got))
-                       put_page(page_head);
-       }
-       return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
 
 /**
  * put_pages_list() - release a list of pages
@@ -604,6 +359,7 @@ static void __lru_cache_activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
+       page = compound_head(page);
        if (!PageActive(page) && !PageUnevictable(page) &&
                        PageReferenced(page)) {
 
@@ -799,6 +555,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
        update_page_reclaim_stat(lruvec, file, 0);
 }
 
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+                           void *arg)
+{
+       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+               int file = page_is_file_cache(page);
+               int lru = page_lru_base_type(page);
+
+               del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+               ClearPageActive(page);
+               ClearPageReferenced(page);
+               add_page_to_lru_list(page, lruvec, lru);
+
+               __count_vm_event(PGDEACTIVATE);
+               update_page_reclaim_stat(lruvec, file, 0);
+       }
+}
+
 /*
  * Drain pages out of the cpu's pagevecs.
  * Either "cpu" is the current CPU, and preemption has already been
@@ -825,6 +599,10 @@ void lru_add_drain_cpu(int cpu)
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
+       pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+       if (pagevec_count(pvec))
+               pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
        activate_page_drain(cpu);
 }
 
@@ -854,6 +632,26 @@ void deactivate_file_page(struct page *page)
        }
 }
 
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page.  This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+               page_cache_get(page);
+               if (!pagevec_add(pvec, page))
+                       pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+               put_cpu_var(lru_deactivate_pvecs);
+       }
+}
+
 void lru_add_drain(void)
 {
        lru_add_drain_cpu(get_cpu());
@@ -883,6 +681,7 @@ void lru_add_drain_all(void)
                if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
                    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
                    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+                   pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        schedule_work_on(cpu, work);
@@ -918,15 +717,6 @@ void release_pages(struct page **pages, int nr, bool cold)
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];
 
-               if (unlikely(PageCompound(page))) {
-                       if (zone) {
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
-                               zone = NULL;
-                       }
-                       put_compound_page(page);
-                       continue;
-               }
-
                /*
                 * Make sure the IRQ-safe lock-holding time does not get
                 * excessive with a continuous string of pages from the
@@ -937,9 +727,19 @@ void release_pages(struct page **pages, int nr, bool cold)
                        zone = NULL;
                }
 
+               page = compound_head(page);
                if (!put_page_testzero(page))
                        continue;
 
+               if (PageCompound(page)) {
+                       if (zone) {
+                               spin_unlock_irqrestore(&zone->lru_lock, flags);
+                               zone = NULL;
+                       }
+                       __put_compound_page(page);
+                       continue;
+               }
+
                if (PageLRU(page)) {
                        struct zone *pagezone = page_zone(page);
 
index d504adb7fa5f08ced98eeb2a285976c0db64a9ae..676ff2991380120275ba5d81d9660592bdc15b75 100644 (file)
@@ -185,13 +185,12 @@ int add_to_swap(struct page *page, struct list_head *list)
         * deadlock in the swap out path.
         */
        /*
-        * Add it to the swap cache and mark it dirty
+        * Add it to the swap cache.
         */
        err = add_to_swap_cache(page, entry,
                        __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
 
-       if (!err) {     /* Success */
-               SetPageDirty(page);
+       if (!err) {
                return 1;
        } else {        /* -ENOMEM radix-tree allocation failure */
                /*
@@ -353,7 +352,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                }
 
                /* May fail (-ENOMEM) if radix-tree node allocation failed. */
-               __set_page_locked(new_page);
+               __SetPageLocked(new_page);
                SetPageSwapBacked(new_page);
                err = __add_to_swap_cache(new_page, entry);
                if (likely(!err)) {
@@ -367,7 +366,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                }
                radix_tree_preload_end();
                ClearPageSwapBacked(new_page);
-               __clear_page_locked(new_page);
+               __ClearPageLocked(new_page);
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
index 58877312cf6b94b74da00b8913bf8db64d56ab26..7073faecb38f9cd087a88c06b6cec276d5a34154 100644 (file)
@@ -929,6 +929,9 @@ int reuse_swap_page(struct page *page)
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        if (unlikely(PageKsm(page)))
                return 0;
+       /* The page is part of a THP and cannot be reused */
+       if (PageTransCompound(page))
+               return 0;
        count = page_mapcount(page);
        if (count <= 1 && PageSwapCache(page)) {
                count += page_swapcount(page);
@@ -1145,14 +1148,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        if (unlikely(!page))
                return -ENOMEM;
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
+       if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+                               &memcg, false)) {
                ret = -ENOMEM;
                goto out_nolock;
        }
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, false);
                ret = 0;
                goto out;
        }
@@ -1163,11 +1167,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
        if (page == swapcache) {
-               page_add_anon_rmap(page, vma, addr);
-               mem_cgroup_commit_charge(page, memcg, true);
+               page_add_anon_rmap(page, vma, addr, false);
+               mem_cgroup_commit_charge(page, memcg, true, false);
        } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, addr);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, addr, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        }
        swap_free(entry);
index 77fee9325a5727825b9adcdac156b4e22247b6cf..806b0c758c5b6113de9b921ff0cfd01fd45b6424 100644 (file)
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
        __SetPageUptodate(page);
 
        ret = -ENOMEM;
-       if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+       if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
                goto out_release;
 
        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
                goto out_release_uncharge_unlock;
 
        inc_mm_counter(dst_mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, dst_vma, dst_addr);
-       mem_cgroup_commit_charge(page, memcg, false);
+       page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+       mem_cgroup_commit_charge(page, memcg, false, false);
        lru_cache_add_active_or_unevictable(page, dst_vma);
 
        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
        return ret;
 out_release_uncharge_unlock:
        pte_unmap_unlock(dst_pte, ptl);
-       mem_cgroup_cancel_charge(page, memcg);
+       mem_cgroup_cancel_charge(page, memcg, false);
 out_release:
        page_cache_release(page);
        goto out;
index 68ff8a5361e79a30233fe9ecc4b26b09e614e1b8..55012d786a3545bc415d30b4774bf9b62dbf27ac 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
 #include <linux/string.h>
 #include <linux/compiler.h>
 #include <linux/export.h>
+#include <linux/ctype.h>
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/security.h>
@@ -99,6 +100,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
 }
 EXPORT_SYMBOL(kstrndup);
 
+/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree(), containing a copy
+ * of the passed string with leading and trailing whitespace (as defined
+ * by isspace()) removed, or %NULL if the allocation fails.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+       char *buf;
+       char *begin = skip_spaces(s);
+       size_t len = strlen(begin);
+
+       while (len && isspace(begin[len - 1]))
+               len--;
+
+       buf = kmalloc_track_caller(len + 1, gfp);
+       if (!buf)
+               return NULL;
+
+       memcpy(buf, begin, len);
+       buf[len] = '\0';
+
+       return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
 /**
  * kmemdup - duplicate region of memory
  *
@@ -309,7 +339,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 {
        if (unlikely(offset + PAGE_ALIGN(len) < offset))
                return -EINVAL;
-       if (unlikely(offset & ~PAGE_MASK))
+       if (unlikely(offset_in_page(offset)))
                return -EINVAL;
 
        return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
@@ -355,7 +385,9 @@ struct anon_vma *page_anon_vma(struct page *page)
 
 struct address_space *page_mapping(struct page *page)
 {
-       unsigned long mapping;
+       struct address_space *mapping;
+
+       page = compound_head(page);
 
        /* This happens if someone calls flush_dcache_page on slab page */
        if (unlikely(PageSlab(page)))
@@ -368,10 +400,10 @@ struct address_space *page_mapping(struct page *page)
                return swap_address_space(entry);
        }
 
-       mapping = (unsigned long)page->mapping;
-       if (mapping & PAGE_MAPPING_FLAGS)
+       mapping = page->mapping;
+       if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
                return NULL;
-       return page->mapping;
+       return mapping;
 }
 
 int overcommit_ratio_handler(struct ctl_table *table, int write,
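
An illustrative caller of the new kstrimdup() helper (not part of the patch; the input string is arbitrary):

        char *val = kstrimdup("  noatime\n", GFP_KERNEL);

        if (!val)
                return -ENOMEM;
        /* val now holds "noatime"; release it with kfree() when done */
        kfree(val);
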
index b6e3662fe339532d8b8f1578f58dd208dd23e249..fd09dc9c6812bb86d483b1638d519b565958049a 100644 (file)
@@ -52,7 +52,7 @@ void vmacache_flush_all(struct mm_struct *mm)
  * Also handle the case where a kernel thread has adopted this mm via use_mm().
  * That kernel thread's vmacache is not applicable to this mm.
  */
-static bool vmacache_valid_mm(struct mm_struct *mm)
+static inline bool vmacache_valid_mm(struct mm_struct *mm)
 {
        return current->mm == mm && !(current->flags & PF_KTHREAD);
 }
index 2faaa2976447a104ac5017ce7f4ad1e52b808b0b..842c12c0cc6e9fbf3d500601efa2ed6249fbd04c 100644 (file)
@@ -35,6 +35,8 @@
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
 
+#include "internal.h"
+
 struct vfree_deferred {
        struct llist_head list;
        struct work_struct wq;
@@ -358,7 +360,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
        struct vmap_area *first;
 
        BUG_ON(!size);
-       BUG_ON(size & ~PAGE_MASK);
+       BUG_ON(offset_in_page(size));
        BUG_ON(!is_power_of_2(align));
 
        va = kmalloc_node(sizeof(struct vmap_area),
@@ -936,7 +938,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
        void *vaddr = NULL;
        unsigned int order;
 
-       BUG_ON(size & ~PAGE_MASK);
+       BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
        if (WARN_ON(size == 0)) {
                /*
@@ -989,7 +991,7 @@ static void vb_free(const void *addr, unsigned long size)
        unsigned int order;
        struct vmap_block *vb;
 
-       BUG_ON(size & ~PAGE_MASK);
+       BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
 
        flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
@@ -1617,7 +1619,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                        goto fail;
                }
                area->pages[i] = page;
-               if (gfp_mask & __GFP_WAIT)
+               if (gfpflags_allow_blocking(gfp_mask))
                        cond_resched();
        }
 
@@ -1902,7 +1904,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
        while (count) {
                unsigned long offset, length;
 
-               offset = (unsigned long)addr & ~PAGE_MASK;
+               offset = offset_in_page(addr);
                length = PAGE_SIZE - offset;
                if (length > count)
                        length = count;
@@ -1941,7 +1943,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
        while (count) {
                unsigned long offset, length;
 
-               offset = (unsigned long)addr & ~PAGE_MASK;
+               offset = offset_in_page(addr);
                length = PAGE_SIZE - offset;
                if (length > count)
                        length = count;
@@ -2392,7 +2394,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
        bool purged = false;
 
        /* verify parameters and allocate data structures */
-       BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+       BUG_ON(offset_in_page(align) || !is_power_of_2(align));
        for (last_area = 0, area = 0; area < nr_vms; area++) {
                start = offsets[area];
                end = start + sizes[area];
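The offset_in_page() conversions in this file (and in mm/util.c above) keep the old semantics: the helper is just the page-offset masking it replaces, so the BUG_ON()s still trip on values that are not page aligned.  A simplified sketch of the equivalence, assuming the usual definition in include/linux/mm.h:

	/* byte offset of p within its page, i.e. the bits below PAGE_SHIFT */
	#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)

	/* hence BUG_ON(size & ~PAGE_MASK) == BUG_ON(offset_in_page(size)) */
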
index c5afd573d7da79afc814225043319cc66120addf..4c25e621a40b1f64b511feaac2362e945a2f5d76 100644 (file)
@@ -38,7 +38,7 @@
  * TODO: Make the window size depend on machine size, as we do for vmstat
  * thresholds. Currently we set it to 64 pages (256KB for 4KB pages).
  */
-static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 2;
 
 /*
  * These thresholds are used when we account memory pressure through
index e7057af54b6e267558a99749fac80dc77dd7855f..9b52ecf9119420bef8ce4ea2d503eb3ef4754c7f 100644 (file)
@@ -194,7 +194,7 @@ static bool sane_reclaim(struct scan_control *sc)
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
 {
-       int nr;
+       unsigned long nr;
 
        nr = zone_page_state(zone, NR_ACTIVE_FILE) +
             zone_page_state(zone, NR_INACTIVE_FILE);
@@ -796,6 +796,8 @@ static enum page_references page_check_references(struct page *page,
        int referenced_ptes, referenced_page;
        unsigned long vm_flags;
 
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+
        referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
                                          &vm_flags);
        referenced_page = TestClearPageReferenced(page);
@@ -906,6 +908,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
+               bool freeable = false;
 
                cond_resched();
 
@@ -1049,8 +1052,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto keep_locked;
                        if (!add_to_swap(page, page_list))
                                goto activate_locked;
+                       freeable = true;
                        may_enter_fs = 1;
-
                        /* Adding to swap updated mapping */
                        mapping = page_mapping(page);
                }
@@ -1060,8 +1063,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page,
-                                       ttu_flags|TTU_BATCH_FLUSH)) {
+                       switch (try_to_unmap(page, freeable ?
+                                       ttu_flags | TTU_BATCH_FLUSH | TTU_FREE :
+                                       ttu_flags | TTU_BATCH_FLUSH)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
@@ -1184,8 +1188,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * we obviously don't have to worry about waking up a process
                 * waiting on the page lock, because there are no references.
                 */
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
 free_it:
+               if (freeable && !PageDirty(page))
+                       count_vm_event(PGLAZYFREED);
+
                nr_reclaimed++;
 
                /*
@@ -1476,7 +1483,7 @@ static int too_many_isolated(struct zone *zone, int file,
         * won't get blocked by normal direct-reclaimers, forming a circular
         * deadlock.
         */
-       if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+       if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                inactive >>= 3;
 
        return isolated > inactive;
@@ -1859,17 +1866,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
 }
 
 #ifdef CONFIG_SWAP
-static int inactive_anon_is_low_global(struct zone *zone)
+static bool inactive_anon_is_low_global(struct zone *zone)
 {
        unsigned long active, inactive;
 
        active = zone_page_state(zone, NR_ACTIVE_ANON);
        inactive = zone_page_state(zone, NR_INACTIVE_ANON);
 
-       if (inactive * zone->inactive_ratio < active)
-               return 1;
-
-       return 0;
+       return inactive * zone->inactive_ratio < active;
 }
 
 /**
@@ -1879,14 +1883,14 @@ static int inactive_anon_is_low_global(struct zone *zone)
  * Returns true if the zone does not have enough inactive anon pages,
  * meaning some active anon pages need to be deactivated.
  */
-static int inactive_anon_is_low(struct lruvec *lruvec)
+static bool inactive_anon_is_low(struct lruvec *lruvec)
 {
        /*
         * If we don't have swap space, anonymous page deactivation
         * is pointless.
         */
        if (!total_swap_pages)
-               return 0;
+               return false;
 
        if (!mem_cgroup_disabled())
                return mem_cgroup_inactive_anon_is_low(lruvec);
@@ -1894,9 +1898,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec)
        return inactive_anon_is_low_global(lruvec_zone(lruvec));
 }
 #else
-static inline int inactive_anon_is_low(struct lruvec *lruvec)
+static inline bool inactive_anon_is_low(struct lruvec *lruvec)
 {
-       return 0;
+       return false;
 }
 #endif
 
@@ -1914,7 +1918,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
  * This uses a different ratio than the anonymous pages, because
  * the page cache uses a use-once replacement algorithm.
  */
-static int inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_file_is_low(struct lruvec *lruvec)
 {
        unsigned long inactive;
        unsigned long active;
@@ -1925,7 +1929,7 @@ static int inactive_file_is_low(struct lruvec *lruvec)
        return active > inactive;
 }
 
-static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
+static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
 {
        if (is_file_lru(lru))
                return inactive_file_is_low(lruvec);
@@ -2480,7 +2484,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
        balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
                        zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
        watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
 
        /*
         * If compaction is deferred, reclaim up to a point where
@@ -2963,7 +2967,7 @@ static bool zone_balanced(struct zone *zone, int order,
                          unsigned long balance_gap, int classzone_idx)
 {
        if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-                                   balance_gap, classzone_idx, 0))
+                                   balance_gap, classzone_idx))
                return false;
 
        if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
@@ -3696,10 +3700,10 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
 }
 
 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long zone_pagecache_reclaimable(struct zone *zone)
 {
-       long nr_pagecache_reclaimable;
-       long delta = 0;
+       unsigned long nr_pagecache_reclaimable;
+       unsigned long delta = 0;
 
        /*
         * If RECLAIM_UNMAP is set, then all file pages are considered
@@ -3794,7 +3798,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        /*
         * Do not scan if the allocation should not be delayed.
         */
-       if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+       if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
                return ZONE_RECLAIM_NOSCAN;
 
        /*
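The __GFP_WAIT tests replaced in this file now go through gfpflags_allow_blocking(); a rough sketch of what that helper amounts to after __GFP_WAIT was split into direct-reclaim and kswapd-reclaim bits (mirroring include/linux/gfp.h):

	static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
	{
		/* blocking is only permitted when direct reclaim is permitted */
		return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
	}
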
index fbf14485a0498bf181e81f43bc69a0522e67afd5..34f480b7b08f0a07e2995872bd0670da6d2b8676 100644 (file)
@@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
        else
                __inc_zone_state(z, NUMA_OTHER);
 }
+
+/*
+ * Determine the per node value of a stat item.
+ */
+unsigned long node_page_state(int node, enum zone_stat_item item)
+{
+       struct zone *zones = NODE_DATA(node)->node_zones;
+
+       return
+#ifdef CONFIG_ZONE_DMA
+               zone_page_state(&zones[ZONE_DMA], item) +
+#endif
+#ifdef CONFIG_ZONE_DMA32
+               zone_page_state(&zones[ZONE_DMA32], item) +
+#endif
+#ifdef CONFIG_HIGHMEM
+               zone_page_state(&zones[ZONE_HIGHMEM], item) +
+#endif
+               zone_page_state(&zones[ZONE_NORMAL], item) +
+               zone_page_state(&zones[ZONE_MOVABLE], item);
+}
+
 #endif
 
 #ifdef CONFIG_COMPACTION
@@ -759,6 +781,7 @@ const char * const vmstat_text[] = {
 
        "pgfault",
        "pgmajfault",
+       "pglazyfreed",
 
        TEXTS_FOR_ZONES("pgrefill")
        TEXTS_FOR_ZONES("pgsteal_kswapd")
@@ -820,7 +843,9 @@ const char * const vmstat_text[] = {
        "thp_fault_fallback",
        "thp_collapse_alloc",
        "thp_collapse_alloc_failed",
-       "thp_split",
+       "thp_split_page",
+       "thp_split_page_failed",
+       "thp_split_pmd",
        "thp_zero_page_alloc",
        "thp_zero_page_alloc_failed",
 #endif
@@ -901,7 +926,7 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
        "Unmovable",
        "Reclaimable",
        "Movable",
-       "Reserve",
+       "HighAtomic",
 #ifdef CONFIG_CMA
        "CMA",
 #endif
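The node_page_state() helper added above just sums one zone counter across every zone of a node; a tiny usage sketch (the wrapper below is hypothetical, assuming <linux/vmstat.h>):

	/* total file-backed pages currently accounted on node @nid */
	static unsigned long nid_file_pages(int nid)
	{
		return node_page_state(nid, NR_FILE_PAGES);
	}
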
index fa48bcdff9d5b921d6a52b6d203fa4e89d7c7cb6..d8a181fd779bf7344e390476d7d46819dc51c05a 100644 (file)
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -137,7 +137,7 @@ static const struct zbud_ops zbud_zpool_ops = {
        .evict =        zbud_zpool_evict
 };
 
-static void *zbud_zpool_create(char *name, gfp_t gfp,
+static void *zbud_zpool_create(const char *name, gfp_t gfp,
                               const struct zpool_ops *zpool_ops,
                               struct zpool *zpool)
 {
index 8f670d3e87060f6277f5651a79cfaa8d27a30713..fd3ff719c32cb9cf25dafca9d73f789d8071bad5 100644 (file)
@@ -18,8 +18,6 @@
 #include <linux/zpool.h>
 
 struct zpool {
-       char *type;
-
        struct zpool_driver *driver;
        void *pool;
        const struct zpool_ops *ops;
@@ -73,7 +71,8 @@ int zpool_unregister_driver(struct zpool_driver *driver)
 }
 EXPORT_SYMBOL(zpool_unregister_driver);
 
-static struct zpool_driver *zpool_get_driver(char *type)
+/* this assumes @type is null-terminated. */
+static struct zpool_driver *zpool_get_driver(const char *type)
 {
        struct zpool_driver *driver;
 
@@ -113,6 +112,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
  * not be loaded, and calling @zpool_create_pool() with the pool type will
  * fail.
  *
+ * The @type string must be null-terminated.
+ *
  * Returns: true if @type pool is available, false if not
  */
 bool zpool_has_pool(char *type)
@@ -145,9 +146,11 @@ EXPORT_SYMBOL(zpool_has_pool);
  *
  * Implementations must guarantee this to be thread-safe.
  *
+ * The @type and @name strings must be null-terminated.
+ *
  * Returns: New zpool on success, NULL on failure.
  */
-struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
                const struct zpool_ops *ops)
 {
        struct zpool_driver *driver;
@@ -174,7 +177,6 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
                return NULL;
        }
 
-       zpool->type = driver->type;
        zpool->driver = driver;
        zpool->pool = driver->create(name, gfp, ops, zpool);
        zpool->ops = ops;
@@ -208,7 +210,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
  */
 void zpool_destroy_pool(struct zpool *zpool)
 {
-       pr_debug("destroying pool type %s\n", zpool->type);
+       pr_debug("destroying pool type %s\n", zpool->driver->type);
 
        spin_lock(&pools_lock);
        list_del(&zpool->list);
@@ -228,9 +230,9 @@ void zpool_destroy_pool(struct zpool *zpool)
  *
  * Returns: The type of zpool.
  */
-char *zpool_get_type(struct zpool *zpool)
+const char *zpool_get_type(struct zpool *zpool)
 {
-       return zpool->type;
+       return zpool->driver->type;
 }
 
 /**
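Caller-side, the constified zpool API is used exactly as before; a minimal sketch under the documented requirement that both strings be NUL-terminated (the pool name and ops below are illustrative):

	#include <linux/gfp.h>
	#include <linux/zpool.h>

	static const struct zpool_ops example_zpool_ops = {
		.evict	= NULL,		/* no writeback in this sketch */
	};

	static struct zpool *example_zpool_init(void)
	{
		if (!zpool_has_pool("zbud"))
			return NULL;
		return zpool_create_pool("zbud", "example", GFP_KERNEL,
					 &example_zpool_ops);
	}
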
index f135b1b6fcdcab49aaf0845e078c6fc299b4b28b..9f15bdd9163c2360edf2e070023b6099b683a85c 100644 (file)
@@ -16,7 +16,7 @@
  * struct page(s) to form a zspage.
  *
  * Usage of struct page fields:
- *     page->first_page: points to the first component (0-order) page
+ *     page->private: points to the first component (0-order) page
  *     page->index (union with page->freelist): offset of the first object
  *             starting in this page. For the first page, this is
  *             always 0, so we use this field (aka freelist) to point
@@ -26,8 +26,7 @@
  *
  *     For _first_ page only:
  *
- *     page->private (union with page->first_page): refers to the
- *             component page after the first page
+ *     page->private: refers to the component page after the first page
  *             If the page is first_page for huge object, it stores handle.
  *             Look at size_class->huge.
  *     page->freelist: points to the first free object in zspage.
@@ -38,6 +37,7 @@
  *     page->lru: links together first pages of various zspages.
  *             Basically forming list of zspages in a fullness group.
  *     page->mapping: class index and fullness group of the zspage
+ *     page->inuse: the number of objects that are used in this zspage
  *
  * Usage of struct page flags:
  *     PG_private: identifies the first component page
@@ -58,7 +58,7 @@
 #include <linux/cpumask.h>
 #include <linux/cpu.h>
 #include <linux/vmalloc.h>
-#include <linux/hardirq.h>
+#include <linux/preempt.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/debugfs.h>
@@ -166,9 +166,14 @@ enum zs_stat_type {
        OBJ_USED,
        CLASS_ALMOST_FULL,
        CLASS_ALMOST_EMPTY,
-       NR_ZS_STAT_TYPE,
 };
 
+#ifdef CONFIG_ZSMALLOC_STAT
+#define NR_ZS_STAT_TYPE        (CLASS_ALMOST_EMPTY + 1)
+#else
+#define NR_ZS_STAT_TYPE        (OBJ_USED + 1)
+#endif
+
 struct zs_size_stat {
        unsigned long objs[NR_ZS_STAT_TYPE];
 };
@@ -237,7 +242,7 @@ struct link_free {
 };
 
 struct zs_pool {
-       char *name;
+       const char *name;
 
        struct size_class **size_class;
        struct kmem_cache *handle_cachep;
@@ -311,7 +316,7 @@ static void record_obj(unsigned long handle, unsigned long obj)
 
 #ifdef CONFIG_ZPOOL
 
-static void *zs_zpool_create(char *name, gfp_t gfp,
+static void *zs_zpool_create(const char *name, gfp_t gfp,
                             const struct zpool_ops *zpool_ops,
                             struct zpool *zpool)
 {
@@ -447,19 +452,23 @@ static int get_size_class_index(int size)
 static inline void zs_stat_inc(struct size_class *class,
                                enum zs_stat_type type, unsigned long cnt)
 {
-       class->stats.objs[type] += cnt;
+       if (type < NR_ZS_STAT_TYPE)
+               class->stats.objs[type] += cnt;
 }
 
 static inline void zs_stat_dec(struct size_class *class,
                                enum zs_stat_type type, unsigned long cnt)
 {
-       class->stats.objs[type] -= cnt;
+       if (type < NR_ZS_STAT_TYPE)
+               class->stats.objs[type] -= cnt;
 }
 
 static inline unsigned long zs_stat_get(struct size_class *class,
                                enum zs_stat_type type)
 {
-       return class->stats.objs[type];
+       if (type < NR_ZS_STAT_TYPE)
+               return class->stats.objs[type];
+       return 0;
 }
 
 #ifdef CONFIG_ZSMALLOC_STAT
@@ -548,7 +557,7 @@ static const struct file_operations zs_stat_size_ops = {
        .release        = single_release,
 };
 
-static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
 {
        struct dentry *entry;
 
@@ -588,7 +597,7 @@ static void __exit zs_stat_exit(void)
 {
 }
 
-static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
 {
        return 0;
 }
@@ -764,7 +773,7 @@ static struct page *get_first_page(struct page *page)
        if (is_first_page(page))
                return page;
        else
-               return page->first_page;
+               return (struct page *)page_private(page);
 }
 
 static struct page *get_next_page(struct page *page)
@@ -824,7 +833,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
 {
        if (class->huge) {
                VM_BUG_ON(!is_first_page(page));
-               return *(unsigned long *)page_private(page);
+               return page_private(page);
        } else
                return *(unsigned long *)obj;
 }
@@ -949,7 +958,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
         * Allocate individual pages and link them together as:
         * 1. first page->private = first sub-page
         * 2. all sub-pages are linked together using page->lru
-        * 3. each sub-page is linked to the first page using page->first_page
+        * 3. each sub-page is linked to the first page using page->private
         *
         * For each size class, First/Head pages are linked together using
         * page->lru. Also, we set PG_private to identify the first page
@@ -974,7 +983,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
                if (i == 1)
                        set_page_private(first_page, (unsigned long)page);
                if (i >= 1)
-                       page->first_page = first_page;
+                       set_page_private(page, (unsigned long)first_page);
                if (i >= 2)
                        list_add(&page->lru, &prev_page->lru);
                if (i == class->pages_per_zspage - 1)   /* last page */
@@ -1428,8 +1437,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
        struct page *first_page, *f_page;
        unsigned long f_objidx, f_offset;
        void *vaddr;
-       int class_idx;
-       enum fullness_group fullness;
 
        BUG_ON(!obj);
 
@@ -1437,7 +1444,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
        obj_to_location(obj, &f_page, &f_objidx);
        first_page = get_first_page(f_page);
 
-       get_zspage_mapping(first_page, &class_idx, &fullness);
        f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
 
        vaddr = kmap_atomic(f_page);
@@ -1822,9 +1828,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
        struct zs_pool *pool = container_of(shrinker, struct zs_pool,
                        shrinker);
 
-       if (!pool->shrinker_enabled)
-               return 0;
-
        for (i = zs_size_classes - 1; i >= 0; i--) {
                class = pool->size_class[i];
                if (!class)
@@ -1866,7 +1869,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
  * On success, a pointer to the newly created pool is returned,
  * otherwise NULL.
  */
-struct zs_pool *zs_create_pool(char *name, gfp_t flags)
+struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
 {
        int i;
        struct zs_pool *pool;
index 4043df7c672fb6f5b1be298b8d510fd17a3bbf42..025f8dc723dedf60da7d2d9d580ac208e75cd8f6 100644 (file)
@@ -82,33 +82,27 @@ module_param_named(enabled, zswap_enabled, bool, 0644);
 
 /* Crypto compressor to use */
 #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
-static struct kparam_string zswap_compressor_kparam = {
-       .string =       zswap_compressor,
-       .maxlen =       sizeof(zswap_compressor),
-};
+static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
 static int zswap_compressor_param_set(const char *,
                                      const struct kernel_param *);
 static struct kernel_param_ops zswap_compressor_param_ops = {
        .set =          zswap_compressor_param_set,
-       .get =          param_get_string,
+       .get =          param_get_charp,
+       .free =         param_free_charp,
 };
 module_param_cb(compressor, &zswap_compressor_param_ops,
-               &zswap_compressor_kparam, 0644);
+               &zswap_compressor, 0644);
 
 /* Compressed storage zpool to use */
 #define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
-static struct kparam_string zswap_zpool_kparam = {
-       .string =       zswap_zpool_type,
-       .maxlen =       sizeof(zswap_zpool_type),
-};
+static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
 static int zswap_zpool_param_set(const char *, const struct kernel_param *);
 static struct kernel_param_ops zswap_zpool_param_ops = {
-       .set =  zswap_zpool_param_set,
-       .get =  param_get_string,
+       .set =          zswap_zpool_param_set,
+       .get =          param_get_charp,
+       .free =         param_free_charp,
 };
-module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
+module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
 
 /* The maximum percentage of memory that the compressed pool can occupy */
 static unsigned int zswap_max_pool_percent = 20;
@@ -342,7 +336,7 @@ static void zswap_entry_put(struct zswap_tree *tree,
 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
                                pgoff_t offset)
 {
-       struct zswap_entry *entry = NULL;
+       struct zswap_entry *entry;
 
        entry = zswap_rb_search(root, offset);
        if (entry)
@@ -571,7 +565,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 {
        struct zswap_pool *pool;
-       gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+       gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
 
        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
        if (!pool) {
@@ -615,19 +609,29 @@ error:
        return NULL;
 }
 
-static struct zswap_pool *__zswap_pool_create_fallback(void)
+static __init struct zswap_pool *__zswap_pool_create_fallback(void)
 {
        if (!crypto_has_comp(zswap_compressor, 0, 0)) {
+               if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
+                       pr_err("default compressor %s not available\n",
+                              zswap_compressor);
+                       return NULL;
+               }
                pr_err("compressor %s not available, using default %s\n",
                       zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
-               strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
-                       sizeof(zswap_compressor));
+               param_free_charp(&zswap_compressor);
+               zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
        }
        if (!zpool_has_pool(zswap_zpool_type)) {
+               if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
+                       pr_err("default zpool %s not available\n",
+                              zswap_zpool_type);
+                       return NULL;
+               }
                pr_err("zpool %s not available, using default %s\n",
                       zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
-               strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
-                       sizeof(zswap_zpool_type));
+               param_free_charp(&zswap_zpool_type);
+               zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
        }
 
        return zswap_pool_create(zswap_zpool_type, zswap_compressor);
@@ -684,43 +688,39 @@ static void zswap_pool_put(struct zswap_pool *pool)
 * param callbacks
 **********************************/
 
+/* val must be a null-terminated string */
 static int __zswap_param_set(const char *val, const struct kernel_param *kp,
                             char *type, char *compressor)
 {
        struct zswap_pool *pool, *put_pool = NULL;
-       char str[kp->str->maxlen], *s;
+       char *s = strstrip((char *)val);
        int ret;
 
-       /*
-        * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
-        * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
-        * 32 (arbitrary).
-        */
-       strlcpy(str, val, kp->str->maxlen);
-       s = strim(str);
+       /* no change required */
+       if (!strcmp(s, *(char **)kp->arg))
+               return 0;
 
        /* if this is load-time (pre-init) param setting,
         * don't create a pool; that's done during init.
         */
        if (!zswap_init_started)
-               return param_set_copystring(s, kp);
-
-       /* no change required */
-       if (!strncmp(kp->str->string, s, kp->str->maxlen))
-               return 0;
+               return param_set_charp(s, kp);
 
        if (!type) {
-               type = s;
-               if (!zpool_has_pool(type)) {
-                       pr_err("zpool %s not available\n", type);
+               if (!zpool_has_pool(s)) {
+                       pr_err("zpool %s not available\n", s);
                        return -ENOENT;
                }
+               type = s;
        } else if (!compressor) {
-               compressor = s;
-               if (!crypto_has_comp(compressor, 0, 0)) {
-                       pr_err("compressor %s not available\n", compressor);
+               if (!crypto_has_comp(s, 0, 0)) {
+                       pr_err("compressor %s not available\n", s);
                        return -ENOENT;
                }
+               compressor = s;
+       } else {
+               WARN_ON(1);
+               return -EINVAL;
        }
 
        spin_lock(&zswap_pools_lock);
@@ -736,7 +736,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
        }
 
        if (pool)
-               ret = param_set_copystring(s, kp);
+               ret = param_set_charp(s, kp);
        else
                ret = -EINVAL;
 
@@ -1011,7 +1011,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
        /* store */
        len = dlen + sizeof(struct zswap_header);
        ret = zpool_malloc(entry->pool->zpool, len,
-                          __GFP_NORETRY | __GFP_NOWARN, &handle);
+                          __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
+                          &handle);
        if (ret == -ENOSPC) {
                zswap_reject_compress_poor++;
                goto put_dstmem;
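The compressor and zpool parameters above now follow the standard charp pattern (param_set_charp()/param_get_charp()/param_free_charp()) with a custom set hook for validation; a self-contained sketch of that pattern with made-up names:

	#include <linux/module.h>
	#include <linux/moduleparam.h>

	static char *example_backend = "zbud";

	static int example_backend_set(const char *val,
				       const struct kernel_param *kp)
	{
		/* validate @val here, then let the charp helper store a copy */
		return param_set_charp(val, kp);
	}

	static const struct kernel_param_ops example_backend_ops = {
		.set	= example_backend_set,
		.get	= param_get_charp,
		.free	= param_free_charp,
	};
	module_param_cb(backend, &example_backend_ops, &example_backend, 0644);
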
index fab4599ba8b261dc43977af8349a336edc4d2799..aa41e6dd642913f2181832145e22ec1dad6fa7b8 100644 (file)
@@ -414,7 +414,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
        len += NET_SKB_PAD;
 
        if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
-           (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+           (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
@@ -481,7 +481,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
        len += NET_SKB_PAD + NET_IP_ALIGN;
 
        if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
-           (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+           (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
@@ -4452,7 +4452,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                return NULL;
 
        gfp_head = gfp_mask;
-       if (gfp_head & __GFP_WAIT)
+       if (gfp_head & __GFP_DIRECT_RECLAIM)
                gfp_head |= __GFP_REPEAT;
 
        *errcode = -ENOBUFS;
@@ -4467,7 +4467,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
 
                while (order) {
                        if (npages >= 1 << order) {
-                               page = alloc_pages((gfp_mask & ~__GFP_WAIT) |
+                               page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
                                                   __GFP_COMP |
                                                   __GFP_NOWARN |
                                                   __GFP_NORETRY,
index 0ef30aa90132c7a1a04971c773d4de8ed4ac146b..a2040bb09916e8f54f7a30e1990944116ba255c8 100644 (file)
@@ -1922,8 +1922,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 
        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER) {
-               pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
-                                         __GFP_NOWARN | __GFP_NORETRY,
+               /* Avoid direct reclaim but allow kswapd to wake */
+               pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+                                         __GFP_COMP | __GFP_NOWARN |
+                                         __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
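This hunk (like the alloc_skb_with_frags() change above) masks out __GFP_DIRECT_RECLAIM for the opportunistic high-order attempt so the caller never blocks in reclaim, while any kswapd wakeup bit already present in the mask is left intact.  An illustrative sketch of the idea (the helper name is made up):

	#include <linux/gfp.h>

	static struct page *try_highorder_page(gfp_t gfp, unsigned int order)
	{
		/* speculative high-order attempt: no direct reclaim, no warning */
		struct page *page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
						__GFP_COMP | __GFP_NOWARN |
						__GFP_NORETRY, order);
		if (!page)
			page = alloc_pages(gfp, 0);	/* single-page fallback */
		return page;
	}
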
index fafe33bdb61989e680dc4b26dbe99dcc1d4064b5..59651af8cc2705b39f3ad1ea71ab0b161668af02 100644 (file)
@@ -2116,7 +2116,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid
        consume_skb(info.skb2);
 
        if (info.delivered) {
-               if (info.congested && (allocation & __GFP_WAIT))
+               if (info.congested && gfpflags_allow_blocking(allocation))
                        yield();
                return 0;
        }
index 0ea128eeeab2f835221b2068b1098a81fe1d731d..619f1d710eac0d9f9aab37d5d2340f91fa92b639 100644 (file)
@@ -71,7 +71,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
                           const struct sk_buff *skb)
 {
        struct flow_stats *stats;
-       int node = numa_node_id();
+       int node = numa_mem_id();
        int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
        stats = rcu_dereference(flow->stats[node]);
index 96744b75db9387aa2ef3b28d8ea103d81997a9ab..977fb86065b75dbef916bd0acb9b94876c0f5c04 100644 (file)
@@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
        gfp_t slab_mask = GFP_NOWAIT;
        gfp_t page_mask = GFP_NOWAIT;
 
-       if (gfp & __GFP_WAIT) {
+       if (gfp & __GFP_DIRECT_RECLAIM) {
                slab_mask = GFP_KERNEL;
                page_mask = GFP_HIGHUSER;
        }
@@ -379,7 +379,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
        struct ib_recv_wr *failed_wr;
        unsigned int posted = 0;
        int ret = 0;
-       bool can_wait = !!(gfp & __GFP_WAIT);
+       bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
        u32 pos;
 
        /* the goal here is to just make sure that someone, somewhere
index 692b3e67fb54418ffb143491b9e2f1dc82d8d503..6c71ed1caf16727a587c90ff81dcd6a7abd3d10b 100644 (file)
@@ -500,7 +500,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
                if (bundle->num_conns >= 20) {
                        _debug("too many conns");
 
-                       if (!(gfp & __GFP_WAIT)) {
+                       if (!gfpflags_allow_blocking(gfp)) {
                                _leave(" = -EAGAIN");
                                return -EAGAIN;
                        }
index b00f1f9611d64a7f46fdd37460d9c5ec9711f37f..559afd0ee7de099ba013c8921045fedb4e7e5488 100644 (file)
@@ -1590,7 +1590,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
 /* Set an association id for a given association */
 int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp)
 {
-       bool preload = !!(gfp & __GFP_WAIT);
+       bool preload = gfpflags_allow_blocking(gfp);
        int ret;
 
        /* If the id is already assigned, keep it. */
index f2a1131b2f8baf06f28e286d6b9203d4d1a873f6..2d88cbf9f884d6fbf8c74e9bb2038b9bc5a6bddf 100755 (executable)
@@ -2313,42 +2313,43 @@ sub process {
                              "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr);
                }
 
+# Check if the commit log is in a possible stack dump
+               if ($in_commit_log && !$commit_log_possible_stack_dump &&
+                   ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
+                    $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
+                                       # timestamp
+                    $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
+                                       # stack dump address
+                       $commit_log_possible_stack_dump = 1;
+               }
+
 # Check for line lengths > 75 in commit log, warn once
                if ($in_commit_log && !$commit_log_long_line &&
-                   length($line) > 75 &&
-                   !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
-                                       # file delta changes
-                     $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
-                                       # filename then :
-                     $line =~ /^\s*(?:Fixes:|Link:)/i ||
-                                       # A Fixes: or Link: line
-                     $commit_log_possible_stack_dump)) {
+                   length($line) > 75 &&
+                   !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
+                                       # file delta changes
+                     $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
+                                       # filename then :
+                     $line =~ /^\s*(?:Fixes:|Link:)/i ||
+                                       # A Fixes: or Link: line
+                     $commit_log_possible_stack_dump)) {
                        WARN("COMMIT_LOG_LONG_LINE",
                             "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr);
                        $commit_log_long_line = 1;
                }
 
-# Check if the commit log is in a possible stack dump
-               if ($in_commit_log && !$commit_log_possible_stack_dump &&
-                   ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
-                    $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
-                               # timestamp
-                    $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
-                               # stack dump address
-                       $commit_log_possible_stack_dump = 1;
-               }
-
 # Reset possible stack dump if a blank line is found
-               if ($in_commit_log && $commit_log_possible_stack_dump &&
-                   $line =~ /^\s*$/) {
-                       $commit_log_possible_stack_dump = 0;
-               }
+               if ($in_commit_log && $commit_log_possible_stack_dump &&
+                   $line =~ /^\s*$/) {
+                       $commit_log_possible_stack_dump = 0;
+               }
 
 # Check for git id commit length and improperly formed commit descriptions
-               if ($in_commit_log &&
+               if ($in_commit_log && !$commit_log_possible_stack_dump &&
                    ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
-                    ($line =~ /\b[0-9a-f]{12,40}\b/i &&
-                     $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
+                    ($line =~ /\b[0-9a-f]{12,40}\b/i &&
+                     $line !~ /[\<\[][0-9a-f]{12,40}[\>\]]/i &&
+                     $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
                        my $init_char = "c";
                        my $orig_commit = "";
                        my $short = 1;
index e24121afb2f2773eacced7410dbdd5953a49c206..6eb62936c6723c02ca0b565ebe955aab2eb74c17 100644 (file)
@@ -126,7 +126,7 @@ static void *ima_alloc_pages(loff_t max_size, size_t *allocated_size,
 {
        void *ptr;
        int order = ima_maxorder;
-       gfp_t gfp_mask = __GFP_WAIT | __GFP_NOWARN | __GFP_NORETRY;
+       gfp_t gfp_mask = __GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY;
 
        if (order)
                order = min(get_order(max_size), order);
index 2458288a8287861c87bc868296ea068016efb673..c8edff6803d1db0b9b36585746c3ecaf45b6681c 100644 (file)
@@ -6,6 +6,7 @@ TARGETS += firmware
 TARGETS += ftrace
 TARGETS += futex
 TARGETS += kcmp
+TARGETS += lib
 TARGETS += membarrier
 TARGETS += memfd
 TARGETS += memory-hotplug
diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile
new file mode 100644 (file)
index 0000000..47147b9
--- /dev/null
@@ -0,0 +1,8 @@
+# Makefile for lib/ function selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+TEST_PROGS := printf.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/lib/printf.sh b/tools/testing/selftests/lib/printf.sh
new file mode 100644 (file)
index 0000000..4fdc70f
--- /dev/null
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Runs printf infrastructure using test_printf kernel module
+
+if /sbin/modprobe -q test_printf; then
+       /sbin/modprobe -q -r test_printf
+       echo "printf: ok"
+else
+       echo "printf: [FAIL]"
+       exit 1
+fi
index 3c53cac15de141a7298d2d600517f20c6b3aac36..e4bb1de1d526d8717b447ed932e70138e9ddc1d7 100644 (file)
@@ -5,6 +5,8 @@ BINARIES = compaction_test
 BINARIES += hugepage-mmap
 BINARIES += hugepage-shm
 BINARIES += map_hugetlb
+BINARIES += mlock2-tests
+BINARIES += on-fault-limit
 BINARIES += thuge-gen
 BINARIES += transhuge-stress
 BINARIES += userfaultfd
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
new file mode 100644 (file)
index 0000000..909802e
--- /dev/null
@@ -0,0 +1,737 @@
+#include <sys/mman.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <syscall.h>
+#include <errno.h>
+#include <stdbool.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+       return syscall(__NR_mlock2, start, len, flags);
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+struct vm_boundaries {
+       unsigned long start;
+       unsigned long end;
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+       FILE *file;
+       int ret = 1;
+       char line[1024] = {0};
+       char *end_addr;
+       char *stop;
+       unsigned long start;
+       unsigned long end;
+
+       if (!area)
+               return ret;
+
+       file = fopen("/proc/self/maps", "r");
+       if (!file) {
+               perror("fopen");
+               return ret;
+       }
+
+       memset(area, 0, sizeof(struct vm_boundaries));
+
+       while(fgets(line, 1024, file)) {
+               end_addr = strchr(line, '-');
+               if (!end_addr) {
+                       printf("cannot parse /proc/self/maps\n");
+                       goto out;
+               }
+               *end_addr = '\0';
+               end_addr++;
+               stop = strchr(end_addr, ' ');
+               if (!stop) {
+                       printf("cannot parse /proc/self/maps\n");
+                       goto out;
+               }
+               *stop = '\0';
+
+               sscanf(line, "%lx", &start);
+               sscanf(end_addr, "%lx", &end);
+
+               if (start <= addr && end > addr) {
+                       area->start = start;
+                       area->end = end;
+                       ret = 0;
+                       goto out;
+               }
+       }
+out:
+       fclose(file);
+       return ret;
+}
+
+static uint64_t get_pageflags(unsigned long addr)
+{
+       FILE *file;
+       uint64_t pfn;
+       unsigned long offset;
+
+       file = fopen("/proc/self/pagemap", "r");
+       if (!file) {
+               perror("fopen pagemap");
+               _exit(1);
+       }
+
+       offset = addr / getpagesize() * sizeof(pfn);
+
+       if (fseek(file, offset, SEEK_SET)) {
+               perror("fseek pagemap");
+               _exit(1);
+       }
+
+       if (fread(&pfn, sizeof(pfn), 1, file) != 1) {
+               perror("fread pagemap");
+               _exit(1);
+       }
+
+       fclose(file);
+       return pfn;
+}
+
+static uint64_t get_kpageflags(unsigned long pfn)
+{
+       uint64_t flags;
+       FILE *file;
+
+       file = fopen("/proc/kpageflags", "r");
+       if (!file) {
+               perror("fopen kpageflags");
+               _exit(1);
+       }
+
+       if (fseek(file, pfn * sizeof(flags), SEEK_SET)) {
+               perror("fseek kpageflags");
+               _exit(1);
+       }
+
+       if (fread(&flags, sizeof(flags), 1, file) != 1) {
+               perror("fread kpageflags");
+               _exit(1);
+       }
+
+       fclose(file);
+       return flags;
+}
+
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+       FILE *file;
+       char *line = NULL;
+       size_t size = 0;
+       unsigned long start, end;
+       char perms[5];
+       unsigned long offset;
+       char dev[32];
+       unsigned long inode;
+       char path[BUFSIZ];
+
+       file = fopen("/proc/self/smaps", "r");
+       if (!file) {
+               perror("fopen smaps");
+               _exit(1);
+       }
+
+       while (getline(&line, &size, file) > 0) {
+               if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+                          &start, &end, perms, &offset, dev, &inode, path) < 6)
+                       goto next;
+
+               if (start <= addr && addr < end)
+                       goto out;
+
+next:
+               free(line);
+               line = NULL;
+               size = 0;
+       }
+
+       fclose(file);
+       file = NULL;
+
+out:
+       free(line);
+       return file;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+       char *line = NULL;
+       char *flags;
+       size_t size = 0;
+       bool ret = false;
+       FILE *smaps;
+
+       smaps = seek_to_smaps_entry(addr);
+       if (!smaps) {
+               printf("Unable to parse /proc/self/smaps\n");
+               goto out;
+       }
+
+       while (getline(&line, &size, smaps) > 0) {
+               if (!strstr(line, VMFLAGS)) {
+                       free(line);
+                       line = NULL;
+                       size = 0;
+                       continue;
+               }
+
+               flags = line + strlen(VMFLAGS);
+               ret = (strstr(flags, vmflag) != NULL);
+               goto out;
+       }
+
+out:
+       free(line);
+       fclose(smaps);
+       return ret;
+}
+
+#define SIZE "Size:"
+#define RSS  "Rss:"
+#define LOCKED "lo"
+
+static bool is_vma_lock_on_fault(unsigned long addr)
+{
+       bool ret = false;
+       bool locked;
+       FILE *smaps = NULL;
+       unsigned long vma_size, vma_rss;
+       char *line = NULL;
+       char *value;
+       size_t size = 0;
+
+       locked = is_vmflag_set(addr, LOCKED);
+       if (!locked)
+               goto out;
+
+       smaps = seek_to_smaps_entry(addr);
+       if (!smaps) {
+               printf("Unable to parse /proc/self/smaps\n");
+               goto out;
+       }
+
+       while (getline(&line, &size, smaps) > 0) {
+               if (!strstr(line, SIZE)) {
+                       free(line);
+                       line = NULL;
+                       size = 0;
+                       continue;
+               }
+
+               value = line + strlen(SIZE);
+               if (sscanf(value, "%lu kB", &vma_size) < 1) {
+                       printf("Unable to parse smaps entry for Size\n");
+                       goto out;
+               }
+               break;
+       }
+
+       while (getline(&line, &size, smaps) > 0) {
+               if (!strstr(line, RSS)) {
+                       free(line);
+                       line = NULL;
+                       size = 0;
+                       continue;
+               }
+
+               value = line + strlen(RSS);
+               if (sscanf(value, "%lu kB", &vma_rss) < 1) {
+                       printf("Unable to parse smaps entry for Rss\n");
+                       goto out;
+               }
+               break;
+       }
+
+       ret = locked && (vma_rss < vma_size);
+out:
+       free(line);
+       if (smaps)
+               fclose(smaps);
+       return ret;
+}
+
+#define PRESENT_BIT     0x8000000000000000
+#define PFN_MASK        0x007FFFFFFFFFFFFF
+#define UNEVICTABLE_BIT (1UL << 18)
+
+static int lock_check(char *map)
+{
+       unsigned long page_size = getpagesize();
+       uint64_t page1_flags, page2_flags;
+
+       page1_flags = get_pageflags((unsigned long)map);
+       page2_flags = get_pageflags((unsigned long)map + page_size);
+
+       /* Both pages should be present */
+       if (((page1_flags & PRESENT_BIT) == 0) ||
+           ((page2_flags & PRESENT_BIT) == 0)) {
+               printf("Failed to make both pages present\n");
+               return 1;
+       }
+
+       page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+       page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+       /* Both pages should be unevictable */
+       if (((page1_flags & UNEVICTABLE_BIT) == 0) ||
+           ((page2_flags & UNEVICTABLE_BIT) == 0)) {
+               printf("Failed to make both pages unevictable\n");
+               return 1;
+       }
+
+       if (!is_vmflag_set((unsigned long)map, LOCKED)) {
+               printf("VMA flag %s is missing on page 1\n", LOCKED);
+               return 1;
+       }
+
+       if (!is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+               printf("VMA flag %s is missing on page 2\n", LOCKED);
+               return 1;
+       }
+
+       return 0;
+}
+
+static int unlock_lock_check(char *map)
+{
+       unsigned long page_size = getpagesize();
+       uint64_t page1_flags, page2_flags;
+
+       page1_flags = get_pageflags((unsigned long)map);
+       page2_flags = get_pageflags((unsigned long)map + page_size);
+       page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+       page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+       if ((page1_flags & UNEVICTABLE_BIT) || (page2_flags & UNEVICTABLE_BIT)) {
+               printf("A page is still marked unevictable after unlock\n");
+               return 1;
+       }
+
+       if (is_vmflag_set((unsigned long)map, LOCKED)) {
+               printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+               return 1;
+       }
+
+       if (is_vmflag_set((unsigned long)map + page_size, LOCKED)) {
+               printf("VMA flag %s is present on page 2 after unlock\n", LOCKED);
+               return 1;
+       }
+
+       return 0;
+}
+
+static int test_mlock_lock()
+{
+       char *map;
+       int ret = 1;
+       unsigned long page_size = getpagesize();
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (map == MAP_FAILED) {
+               perror("test_mlock_locked mmap");
+               goto out;
+       }
+
+       if (mlock2_(map, 2 * page_size, 0)) {
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(0);
+               }
+               perror("mlock2(0)");
+               goto unmap;
+       }
+
+       if (lock_check(map))
+               goto unmap;
+
+       /* Now unlock and recheck attributes */
+       if (munlock(map, 2 * page_size)) {
+               perror("munlock()");
+               goto unmap;
+       }
+
+       ret = unlock_lock_check(map);
+
+unmap:
+       munmap(map, 2 * page_size);
+out:
+       return ret;
+}
+
+static int onfault_check(char *map)
+{
+       unsigned long page_size = getpagesize();
+       uint64_t page1_flags, page2_flags;
+
+       page1_flags = get_pageflags((unsigned long)map);
+       page2_flags = get_pageflags((unsigned long)map + page_size);
+
+       /* Neither page should be present */
+       if ((page1_flags & PRESENT_BIT) || (page2_flags & PRESENT_BIT)) {
+               printf("Pages were made present by MLOCK_ONFAULT\n");
+               return 1;
+       }
+
+       *map = 'a';
+       page1_flags = get_pageflags((unsigned long)map);
+       page2_flags = get_pageflags((unsigned long)map + page_size);
+
+       /* Only page 1 should be present */
+       if ((page1_flags & PRESENT_BIT) == 0) {
+               printf("Page 1 is not present after fault\n");
+               return 1;
+       } else if (page2_flags & PRESENT_BIT) {
+               printf("Page 2 was made present\n");
+               return 1;
+       }
+
+       page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+
+       /* Page 1 should be unevictable */
+       if ((page1_flags & UNEVICTABLE_BIT) == 0) {
+               printf("Failed to make faulted page unevictable\n");
+               return 1;
+       }
+
+       if (!is_vma_lock_on_fault((unsigned long)map)) {
+               printf("VMA is not marked for lock on fault\n");
+               return 1;
+       }
+
+       if (!is_vma_lock_on_fault((unsigned long)map + page_size)) {
+               printf("VMA is not marked for lock on fault\n");
+               return 1;
+       }
+
+       return 0;
+}
+
+static int unlock_onfault_check(char *map)
+{
+       unsigned long page_size = getpagesize();
+       uint64_t page1_flags;
+
+       page1_flags = get_pageflags((unsigned long)map);
+       page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+
+       if (page1_flags & UNEVICTABLE_BIT) {
+               printf("Page 1 is still marked unevictable after unlock\n");
+               return 1;
+       }
+
+       if (is_vma_lock_on_fault((unsigned long)map) ||
+           is_vma_lock_on_fault((unsigned long)map + page_size)) {
+               printf("VMA is still lock on fault after unlock\n");
+               return 1;
+       }
+
+       return 0;
+}
+
+static int test_mlock_onfault()
+{
+       char *map;
+       int ret = 1;
+       unsigned long page_size = getpagesize();
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (map == MAP_FAILED) {
+               perror("test_mlock_locked mmap");
+               goto out;
+       }
+
+       if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(0);
+               }
+               perror("mlock2(MLOCK_ONFAULT)");
+               goto unmap;
+       }
+
+       if (onfault_check(map))
+               goto unmap;
+
+       /* Now unlock and recheck attributes */
+       if (munlock(map, 2 * page_size)) {
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(0);
+               }
+               perror("munlock()");
+               goto unmap;
+       }
+
+       ret = unlock_onfault_check(map);
+unmap:
+       munmap(map, 2 * page_size);
+out:
+       return ret;
+}
+
+static int test_lock_onfault_of_present()
+{
+       char *map;
+       int ret = 1;
+       unsigned long page_size = getpagesize();
+       uint64_t page1_flags, page2_flags;
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (map == MAP_FAILED) {
+               perror("test_mlock_locked mmap");
+               goto out;
+       }
+
+       *map = 'a';
+
+       if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(0);
+               }
+               perror("mlock2(MLOCK_ONFAULT)");
+               goto unmap;
+       }
+
+       page1_flags = get_pageflags((unsigned long)map);
+       page2_flags = get_pageflags((unsigned long)map + page_size);
+       page1_flags = get_kpageflags(page1_flags & PFN_MASK);
+       page2_flags = get_kpageflags(page2_flags & PFN_MASK);
+
+       /* Page 1 should be unevictable */
+       if ((page1_flags & UNEVICTABLE_BIT) == 0) {
+               printf("Failed to make present page unevictable\n");
+               goto unmap;
+       }
+
+       if (!is_vma_lock_on_fault((unsigned long)map) ||
+           !is_vma_lock_on_fault((unsigned long)map + page_size)) {
+               printf("VMA with present pages is not marked lock on fault\n");
+               goto unmap;
+       }
+       ret = 0;
+unmap:
+       munmap(map, 2 * page_size);
+out:
+       return ret;
+}
+
+static int test_munlockall()
+{
+       char *map;
+       int ret = 1;
+       unsigned long page_size = getpagesize();
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+       if (map == MAP_FAILED) {
+               perror("test_munlockall mmap");
+               goto out;
+       }
+
+       if (mlockall(MCL_CURRENT)) {
+               perror("mlockall(MCL_CURRENT)");
+               goto out;
+       }
+
+       if (lock_check(map))
+               goto unmap;
+
+       if (munlockall()) {
+               perror("munlockall()");
+               goto unmap;
+       }
+
+       if (unlock_lock_check(map))
+               goto unmap;
+
+       munmap(map, 2 * page_size);
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+
+       if (map == MAP_FAILED) {
+               perror("test_munlockall second mmap");
+               goto out;
+       }
+
+       if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+               perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
+               goto unmap;
+       }
+
+       if (onfault_check(map))
+               goto unmap;
+
+       if (munlockall()) {
+               perror("munlockall()");
+               goto unmap;
+       }
+
+       if (unlock_onfault_check(map))
+               goto unmap;
+
+       if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+               perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+               goto out;
+       }
+
+       if (lock_check(map))
+               goto unmap;
+
+       if (munlockall()) {
+               perror("munlockall()");
+               goto unmap;
+       }
+
+       ret = unlock_lock_check(map);
+
+unmap:
+       munmap(map, 2 * page_size);
+out:
+       munlockall();
+       return ret;
+}
+
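+/*
+ * Lock three pages that share one VMA, munlock() the middle page and check
+ * that the VMA is split in three; then unlock the whole range and check
+ * that the VMAs merge back together.
+ */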
+static int test_vma_management(bool call_mlock)
+{
+       int ret = 1;
+       void *map;
+       unsigned long page_size = getpagesize();
+       struct vm_boundaries page1;
+       struct vm_boundaries page2;
+       struct vm_boundaries page3;
+
+       map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (map == MAP_FAILED) {
+               perror("mmap()");
+               return ret;
+       }
+
+       if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(0);
+               }
+               perror("mlock2(MLOCK_ONFAULT)");
+               goto out;
+       }
+
+       if (get_vm_area((unsigned long)map, &page1) ||
+           get_vm_area((unsigned long)map + page_size, &page2) ||
+           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+               printf("couldn't find mapping in /proc/self/maps\n");
+               goto out;
+       }
+
+       /*
+        * Before we unlock a portion, we need to check that all three pages
+        * are in the same VMA.  If they are not, we abort this test (note
+        * that this is not a failure).
+        */
+       if (page1.start != page2.start || page2.start != page3.start) {
+               printf("VMAs are not merged to start, aborting test\n");
+               ret = 0;
+               goto out;
+       }
+
+       if (munlock(map + page_size, page_size)) {
+               perror("munlock()");
+               goto out;
+       }
+
+       if (get_vm_area((unsigned long)map, &page1) ||
+           get_vm_area((unsigned long)map + page_size, &page2) ||
+           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+               printf("couldn't find mapping in /proc/self/maps\n");
+               goto out;
+       }
+
+       /* All three VMAs should be different */
+       if (page1.start == page2.start || page2.start == page3.start) {
+               printf("failed to split VMA for munlock\n");
+               goto out;
+       }
+
+       /* Unlock the rest (first and third pages) and recheck the VMAs */
+       if (munlock(map, page_size * 3)) {
+               perror("munlock()");
+               goto out;
+       }
+
+       if (get_vm_area((unsigned long)map, &page1) ||
+           get_vm_area((unsigned long)map + page_size, &page2) ||
+           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+               printf("couldn't find mapping in /proc/self/maps\n");
+               goto out;
+       }
+
+       /* Now all three VMAs should be the same */
+       if (page1.start != page2.start || page2.start != page3.start) {
+               printf("failed to merge VMAs after munlock\n");
+               goto out;
+       }
+
+       ret = 0;
+out:
+       munmap(map, 3 * page_size);
+       return ret;
+}
+
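+/*
+ * Run the given test with mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)
+ * in effect instead of an explicit mlock2() call (hence call_mlock=false).
+ */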
+static int test_mlockall(int (test_function)(bool call_mlock))
+{
+       int ret = 1;
+
+       if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+               perror("mlockall");
+               return ret;
+       }
+
+       ret = test_function(false);
+       munlockall();
+       return ret;
+}
+
+int main(int argc, char **argv)
+{
+       int ret = 0;
+       ret += test_mlock_lock();
+       ret += test_mlock_onfault();
+       ret += test_munlockall();
+       ret += test_lock_onfault_of_present();
+       ret += test_vma_management(true);
+       ret += test_mlockall(test_vma_management);
+       return ret;
+}
+
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
new file mode 100644 (file)
index 0000000..245accc
--- /dev/null
@@ -0,0 +1,47 @@
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
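+/* Fallback for libc headers that do not define MCL_ONFAULT yet */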
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
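+/*
+ * With MCL_ONFAULT | MCL_FUTURE in effect, a MAP_POPULATE mapping twice the
+ * size of RLIMIT_MEMLOCK must fail.  run_vmtests runs this as user "nobody"
+ * so the limit actually applies.
+ */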
+static int test_limit(void)
+{
+       int ret = 1;
+       struct rlimit lims;
+       void *map;
+
+       if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+               perror("getrlimit");
+               return ret;
+       }
+
+       if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+               perror("mlockall");
+               return ret;
+       }
+
+       map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, 0, 0);
+       if (map != MAP_FAILED)
+               printf("mmap should have failed, but didn't\n");
+       else {
+               ret = 0;
+               munmap(map, 2 * lims.rlim_max);
+       }
+
+       munlockall();
+       return ret;
+}
+
+int main(int argc, char **argv)
+{
+       int ret = 0;
+
+       ret += test_limit();
+       return ret;
+}
index 9179ce8df485d7b125b7e3619f7d82ee4df5ea64..2df21b3bb26dbc9a946b4f5ec5038aec6f562ca8 100755 (executable)
@@ -106,4 +106,26 @@ else
        echo "[PASS]"
 fi
 
+echo "--------------------"
+echo "running on-fault-limit"
+echo "--------------------"
+sudo -u nobody ./on-fault-limit
+if [ $? -ne 0 ]; then
+       echo "[FAIL]"
+       exitcode=1
+else
+       echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running mlock2-tests"
+echo "--------------------"
+./mlock2-tests
+if [ $? -ne 0 ]; then
+       echo "[FAIL]"
+       exitcode=1
+else
+       echo "[PASS]"
+fi
+
 exit $exitcode
diff --git a/tools/vm/slabinfo-gnuplot.sh b/tools/vm/slabinfo-gnuplot.sh
new file mode 100644 (file)
index 0000000..35b0398
--- /dev/null
@@ -0,0 +1,275 @@
+#!/bin/sh
+
+# Sergey Senozhatsky, 2015
+# sergey.senozhatsky.work@gmail.com
+#
+# This software is licensed under the terms of the GNU General Public
+# License version 2, as published by the Free Software Foundation, and
+# may be copied, distributed, and modified under those terms.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+
+# This program is intended to plot `slabinfo -X' stats, collected,
+# for example, using the following command:
+#   while [ 1 ]; do slabinfo -X >> stats; sleep 1; done
+#
+# Use `slabinfo-gnuplot.sh stats' to pre-process collected records
+# and generate graphs (totals, slabs sorted by size, slabs sorted
+# by loss).
+#
+# Graphs can be [individually] regenerated with different ranges and
+# sizes (-r %d,%d and -s %d,%d options).
+#
+# To visually compare N `totals' graphs, do
+# slabinfo-gnuplot.sh -t FILE1-totals FILE2-totals ... FILEN-totals
+#
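+# For example (illustrative file names), a slabs graph can be redrawn
+# at 1024x768 from samples 100..500 with:
+#   slabinfo-gnuplot.sh -l -s 1024,768 -r 100,500 stats-slabs-by-size
+#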
+
+min_slab_name_size=11
+xmin=0
+xmax=0
+width=1500
+height=700
+mode=preprocess
+
+usage()
+{
+       echo "Usage: [-s W,H] [-r MIN,MAX] [-t|-l] FILE1 [FILE2 ..]"
+       echo "FILEs must contain 'slabinfo -X' samples"
+       echo "-t                        - plot totals for FILE(s)"
+       echo "-l                        - plot slabs stats for FILE(s)"
+       echo "-s %d,%d          - set image width and height"
+       echo "-r %d,%d          - use data samples from a given range"
+}
+
+check_file_exist()
+{
+       if [ ! -f "$1" ]; then
+               echo "File '$1' does not exist"
+               exit 1
+       fi
+}
+
+do_slabs_plotting()
+{
+       local file=$1
+       local out_file
+       local range="every ::$xmin"
+       local xtic=""
+       local xtic_rotate="norotate"
+       local lines=2000000
+       local wc_lines
+
+       check_file_exist "$file"
+
+       out_file=`basename "$file"`
+       if [ $xmax -ne 0 ]; then
+               range="$range::$xmax"
+               lines=$((xmax-xmin))
+       fi
+
+       wc_lines=`cat "$file" | wc -l`
+       if [ $? -ne 0 ] || [ "$wc_lines" -eq 0 ] ; then
+               wc_lines=$lines
+       fi
+
+       if [ "$wc_lines" -lt "$lines" ]; then
+               lines=$wc_lines
+       fi
+
+       if [ $((width / lines)) -gt $min_slab_name_size ]; then
+               xtic=":xtic(1)"
+               xtic_rotate=90
+       fi
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set output '$out_file.png'
+set autoscale xy
+set xlabel 'samples'
+set ylabel 'bytes'
+set style histogram columnstacked title textcolor lt -1
+set style fill solid 0.15
+set xtics rotate $xtic_rotate
+set key left above Left title reverse
+
+plot "$file" $range u 2$xtic title 'SIZE' with boxes,\
+       '' $range u 3 title 'LOSS' with boxes
+EOF
+
+       if [ $? -eq 0 ]; then
+               echo "$out_file.png"
+       fi
+}
+
+do_totals_plotting()
+{
+       local gnuplot_cmd=""
+       local range="every ::$xmin"
+       local file=""
+
+       if [ $xmax -ne 0 ]; then
+               range="$range::$xmax"
+       fi
+
+       for i in "${t_files[@]}"; do
+               check_file_exist "$i"
+
+               file="$file"`basename "$i"`
+               gnuplot_cmd="$gnuplot_cmd '$i' $range using 1 title\
+                       '$i Memory usage' with lines,"
+               gnuplot_cmd="$gnuplot_cmd '' $range using 2 title \
+                       '$i Loss' with lines,"
+       done
+
+gnuplot -p << EOF
+#!/usr/bin/env gnuplot
+
+set terminal png enhanced size $width,$height large
+set autoscale xy
+set output '$file.png'
+set xlabel 'samples'
+set ylabel 'bytes'
+set key left above Left title reverse
+
+plot $gnuplot_cmd
+EOF
+
+       if [ $? -eq 0 ]; then
+               echo "$file.png"
+       fi
+}
+
+do_preprocess()
+{
+       local out
+       local lines
+       local in=$1
+
+       check_file_exist "$in"
+
+       # use only 'TOP' slab (biggest memory usage or loss)
+       let lines=3
+       out=`basename "$in"`"-slabs-by-loss"
+       `cat "$in" | grep -A "$lines" 'Slabs sorted by loss' |\
+               egrep -iv '\-\-|Name|Slabs'\
+               | awk '{print $1" "$4+$2*$3" "$4}' > "$out"`
+       if [ $? -eq 0 ]; then
+               do_slabs_plotting "$out"
+       fi
+
+       let lines=3
+       out=`basename "$in"`"-slabs-by-size"
+       `cat "$in" | grep -A "$lines" 'Slabs sorted by size' |\
+               egrep -iv '\-\-|Name|Slabs'\
+               | awk '{print $1" "$4" "$4-$2*$3}' > "$out"`
+       if [ $? -eq 0 ]; then
+               do_slabs_plotting "$out"
+       fi
+
+       out=`basename "$in"`"-totals"
+       `cat "$in" | grep "Memory used" |\
+               awk '{print $3" "$7}' > "$out"`
+       if [ $? -eq 0 ]; then
+               t_files[0]=$out
+               do_totals_plotting
+       fi
+}
+
+parse_opts()
+{
+       local opt
+
+       while getopts "tlr::s::h" opt; do
+               case $opt in
+                       t)
+                               mode=totals
+                               ;;
+                       l)
+                               mode=slabs
+                               ;;
+                       s)
+                               array=(${OPTARG//,/ })
+                               width=${array[0]}
+                               height=${array[1]}
+                               ;;
+                       r)
+                               array=(${OPTARG//,/ })
+                               xmin=${array[0]}
+                               xmax=${array[1]}
+                               ;;
+                       h)
+                               usage
+                               exit 0
+                               ;;
+                       \?)
+                               echo "Invalid option: -$OPTARG" >&2
+                               exit 1
+                               ;;
+                       :)
+                               echo "-$OPTARG requires an argument." >&2
+                               exit 1
+                               ;;
+               esac
+       done
+
+       return $OPTIND
+}
+
+parse_args()
+{
+       local idx=0
+       local p
+
+       for p in "$@"; do
+               case $mode in
+                       preprocess)
+                               files[$idx]=$p
+                               idx=$idx+1
+                               ;;
+                       totals)
+                               t_files[$idx]=$p
+                               idx=$idx+1
+                               ;;
+                       slabs)
+                               files[$idx]=$p
+                               idx=$idx+1
+                               ;;
+               esac
+       done
+}
+
+parse_opts "$@"
+argstart=$?
+parse_args "${@:$argstart}"
+
+if [ ${#files[@]} -eq 0 ] && [ ${#t_files[@]} -eq 0 ]; then
+       usage
+       exit 1
+fi
+
+case $mode in
+       preprocess)
+               for i in "${files[@]}"; do
+                       do_preprocess "$i"
+               done
+               ;;
+       totals)
+               do_totals_plotting
+               ;;
+       slabs)
+               for i in "${files[@]}"; do
+                       do_slabs_plotting "$i"
+               done
+               ;;
+       *)
+               echo "Unknown mode $mode" >&2
+               usage
+               exit 1
+       ;;
+esac
index 808d5a9d5dcfb062a5dc9d9fc10246cb6e7e00d2..86e698d07e20d66931f7846b8061c82018927382 100644 (file)
@@ -53,39 +53,43 @@ struct aliasinfo {
        struct slabinfo *slab;
 } aliasinfo[MAX_ALIASES];
 
-int slabs = 0;
-int actual_slabs = 0;
-int aliases = 0;
-int alias_targets = 0;
-int highest_node = 0;
+int slabs;
+int actual_slabs;
+int aliases;
+int alias_targets;
+int highest_node;
 
 char buffer[4096];
 
-int show_empty = 0;
-int show_report = 0;
-int show_alias = 0;
-int show_slab = 0;
+int show_empty;
+int show_report;
+int show_alias;
+int show_slab;
 int skip_zero = 1;
-int show_numa = 0;
-int show_track = 0;
-int show_first_alias = 0;
-int validate = 0;
-int shrink = 0;
-int show_inverted = 0;
-int show_single_ref = 0;
-int show_totals = 0;
-int sort_size = 0;
-int sort_active = 0;
-int set_debug = 0;
-int show_ops = 0;
-int show_activity = 0;
+int show_numa;
+int show_track;
+int show_first_alias;
+int validate;
+int shrink;
+int show_inverted;
+int show_single_ref;
+int show_totals;
+int sort_size;
+int sort_active;
+int set_debug;
+int show_ops;
+int show_activity;
+int output_lines = -1;
+int sort_loss;
+int extended_totals;
+int show_bytes;
 
 /* Debug options */
-int sanity = 0;
-int redzone = 0;
-int poison = 0;
-int tracking = 0;
-int tracing = 0;
+int sanity;
+int redzone;
+int poison;
+int tracking;
+int tracing;
 
 int page_size;
 
@@ -124,6 +128,10 @@ static void usage(void)
                "-v|--validate          Validate slabs\n"
                "-z|--zero              Include empty slabs\n"
                "-1|--1ref              Single reference\n"
+               "-N|--lines=K           Show the first K slabs\n"
+               "-L|--Loss              Sort by loss\n"
+               "-X|--Xtotals           Show extended summary information\n"
+               "-B|--Bytes             Show size in bytes\n"
                "\nValid debug options (FZPUT may be combined)\n"
                "a / A          Switch on all debug options (=FZUP)\n"
                "-              Switch off all debug options\n"
@@ -225,15 +233,17 @@ static int store_size(char *buffer, unsigned long value)
        char trailer = 0;
        int n;
 
-       if (value > 1000000000UL) {
-               divisor = 100000000UL;
-               trailer = 'G';
-       } else if (value > 1000000UL) {
-               divisor = 100000UL;
-               trailer = 'M';
-       } else if (value > 1000UL) {
-               divisor = 100;
-               trailer = 'K';
+       if (!show_bytes) {
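+               /* no -B/--Bytes given: scale large values down to K/M/G */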
+               if (value > 1000000000UL) {
+                       divisor = 100000000UL;
+                       trailer = 'G';
+               } else if (value > 1000000UL) {
+                       divisor = 100000UL;
+                       trailer = 'M';
+               } else if (value > 1000UL) {
+                       divisor = 100;
+                       trailer = 'K';
+               }
        }
 
        value /= divisor;
@@ -297,10 +307,12 @@ int line = 0;
 static void first_line(void)
 {
        if (show_activity)
-               printf("Name                   Objects      Alloc       Free   %%Fast Fallb O CmpX   UL\n");
+               printf("Name                   Objects      Alloc       Free"
+                       "   %%Fast Fallb O CmpX   UL\n");
        else
-               printf("Name                   Objects Objsize    Space "
-                       "Slabs/Part/Cpu  O/S O %%Fr %%Ef Flg\n");
+               printf("Name                   Objects Objsize           %s "
+                       "Slabs/Part/Cpu  O/S O %%Fr %%Ef Flg\n",
+                       sort_loss ? " Loss" : "Space");
 }
 
 /*
@@ -333,6 +345,11 @@ static unsigned long slab_activity(struct slabinfo *s)
                s->alloc_slowpath + s->free_slowpath;
 }
 
+static unsigned long slab_waste(struct slabinfo *s)
+{
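+       /* bytes occupied by the cache's slabs but not used by objects */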
+       return  slab_size(s) - s->objects * s->object_size;
+}
+
 static void slab_numa(struct slabinfo *s, int mode)
 {
        int node;
@@ -504,7 +521,7 @@ static void report(struct slabinfo *s)
        if (strcmp(s->name, "*") == 0)
                return;
 
-       printf("\nSlabcache: %-20s  Aliases: %2d Order : %2d Objects: %lu\n",
+       printf("\nSlabcache: %-15s  Aliases: %2d Order : %2d Objects: %lu\n",
                s->name, s->aliases, s->order, s->objects);
        if (s->hwcache_align)
                printf("** Hardware cacheline aligned\n");
@@ -561,7 +578,10 @@ static void slabcache(struct slabinfo *s)
        if (show_empty && s->slabs)
                return;
 
-       store_size(size_str, slab_size(s));
+       if (sort_loss == 0)
+               store_size(size_str, slab_size(s));
+       else
+               store_size(size_str, slab_waste(s));
        snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
                                                s->partial, s->cpu_slabs);
 
@@ -602,15 +622,15 @@ static void slabcache(struct slabinfo *s)
                        total_free ? (s->free_fastpath * 100 / total_free) : 0,
                        s->order_fallback, s->order, s->cmpxchg_double_fail,
                        s->cmpxchg_double_cpu_fail);
-       }
-       else
-               printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
+       } else {
+               printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n",
                        s->name, s->objects, s->object_size, size_str, dist_str,
                        s->objs_per_slab, s->order,
                        s->slabs ? (s->partial * 100) / s->slabs : 100,
                        s->slabs ? (s->objects * s->object_size * 100) /
                                (s->slabs * (page_size << s->order)) : 100,
                        flags);
+       }
 }
 
 /*
@@ -918,84 +938,88 @@ static void totals(void)
 
        printf("Slabcache Totals\n");
        printf("----------------\n");
-       printf("Slabcaches : %3d      Aliases  : %3d->%-3d Active: %3d\n",
+       printf("Slabcaches : %15d   Aliases  : %11d->%-3d  Active:    %3d\n",
                        slabs, aliases, alias_targets, used_slabs);
 
        store_size(b1, total_size);store_size(b2, total_waste);
        store_size(b3, total_waste * 100 / total_used);
-       printf("Memory used: %6s   # Loss   : %6s   MRatio:%6s%%\n", b1, b2, b3);
+       printf("Memory used: %15s   # Loss   : %15s   MRatio:%6s%%\n", b1, b2, b3);
 
        store_size(b1, total_objects);store_size(b2, total_partobj);
        store_size(b3, total_partobj * 100 / total_objects);
-       printf("# Objects  : %6s   # PartObj: %6s   ORatio:%6s%%\n", b1, b2, b3);
+       printf("# Objects  : %15s   # PartObj: %15s   ORatio:%6s%%\n", b1, b2, b3);
 
        printf("\n");
-       printf("Per Cache    Average         Min         Max       Total\n");
-       printf("---------------------------------------------------------\n");
+       printf("Per Cache         Average              "
+               "Min              Max            Total\n");
+       printf("---------------------------------------"
+               "-------------------------------------\n");
 
        store_size(b1, avg_objects);store_size(b2, min_objects);
        store_size(b3, max_objects);store_size(b4, total_objects);
-       printf("#Objects  %10s  %10s  %10s  %10s\n",
+       printf("#Objects  %15s  %15s  %15s  %15s\n",
                        b1,     b2,     b3,     b4);
 
        store_size(b1, avg_slabs);store_size(b2, min_slabs);
        store_size(b3, max_slabs);store_size(b4, total_slabs);
-       printf("#Slabs    %10s  %10s  %10s  %10s\n",
+       printf("#Slabs    %15s  %15s  %15s  %15s\n",
                        b1,     b2,     b3,     b4);
 
        store_size(b1, avg_partial);store_size(b2, min_partial);
        store_size(b3, max_partial);store_size(b4, total_partial);
-       printf("#PartSlab %10s  %10s  %10s  %10s\n",
+       printf("#PartSlab %15s  %15s  %15s  %15s\n",
                        b1,     b2,     b3,     b4);
        store_size(b1, avg_ppart);store_size(b2, min_ppart);
        store_size(b3, max_ppart);
        store_size(b4, total_partial * 100  / total_slabs);
-       printf("%%PartSlab%10s%% %10s%% %10s%% %10s%%\n",
+       printf("%%PartSlab%15s%% %15s%% %15s%% %15s%%\n",
                        b1,     b2,     b3,     b4);
 
        store_size(b1, avg_partobj);store_size(b2, min_partobj);
        store_size(b3, max_partobj);
        store_size(b4, total_partobj);
-       printf("PartObjs  %10s  %10s  %10s  %10s\n",
+       printf("PartObjs  %15s  %15s  %15s  %15s\n",
                        b1,     b2,     b3,     b4);
 
        store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
        store_size(b3, max_ppartobj);
        store_size(b4, total_partobj * 100 / total_objects);
-       printf("%% PartObj%10s%% %10s%% %10s%% %10s%%\n",
+       printf("%% PartObj%15s%% %15s%% %15s%% %15s%%\n",
                        b1,     b2,     b3,     b4);
 
        store_size(b1, avg_size);store_size(b2, min_size);
        store_size(b3, max_size);store_size(b4, total_size);
-       printf("Memory    %10s  %10s  %10s  %10s\n",
+       printf("Memory    %15s  %15s  %15s  %15s\n",
                        b1,     b2,     b3,     b4);
 
        store_size(b1, avg_used);store_size(b2, min_used);
        store_size(b3, max_used);store_size(b4, total_used);
-       printf("Used      %10s  %10s  %10s  %10s\n",
+       printf("Used      %15s  %15s  %15s  %15s\n",
                        b1,     b2,     b3,     b4);
 
        store_size(b1, avg_waste);store_size(b2, min_waste);
        store_size(b3, max_waste);store_size(b4, total_waste);
-       printf("Loss      %10s  %10s  %10s  %10s\n",
+       printf("Loss      %15s  %15s  %15s  %15s\n",
                        b1,     b2,     b3,     b4);
 
        printf("\n");
-       printf("Per Object   Average         Min         Max\n");
-       printf("---------------------------------------------\n");
+       printf("Per Object        Average              "
+               "Min              Max\n");
+       printf("---------------------------------------"
+               "--------------------\n");
 
        store_size(b1, avg_memobj);store_size(b2, min_memobj);
        store_size(b3, max_memobj);
-       printf("Memory    %10s  %10s  %10s\n",
+       printf("Memory    %15s  %15s  %15s\n",
                        b1,     b2,     b3);
        store_size(b1, avg_objsize);store_size(b2, min_objsize);
        store_size(b3, max_objsize);
-       printf("User      %10s  %10s  %10s\n",
+       printf("User      %15s  %15s  %15s\n",
                        b1,     b2,     b3);
 
        store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
        store_size(b3, max_objwaste);
-       printf("Loss      %10s  %10s  %10s\n",
+       printf("Loss      %15s  %15s  %15s\n",
                        b1,     b2,     b3);
 }
 
@@ -1011,6 +1035,8 @@ static void sort_slabs(void)
                                result = slab_size(s1) < slab_size(s2);
                        else if (sort_active)
                                result = slab_activity(s1) < slab_activity(s2);
+                       else if (sort_loss)
+                               result = slab_waste(s1) < slab_waste(s2);
                        else
                                result = strcasecmp(s1->name, s2->name);
 
@@ -1095,7 +1121,7 @@ static void alias(void)
                        active = a->slab->name;
                }
                else
-                       printf("%-20s -> %s\n", a->name, a->slab->name);
+                       printf("%-15s -> %s\n", a->name, a->slab->name);
        }
        if (active)
                printf("\n");
@@ -1241,12 +1267,16 @@ static void read_slab_dir(void)
 static void output_slabs(void)
 {
        struct slabinfo *slab;
+       int lines = output_lines;
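+       /* -N/--lines caps how many slabcaches are printed; -1 means no limit */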
 
-       for (slab = slabinfo; slab < slabinfo + slabs; slab++) {
+       for (slab = slabinfo; (slab < slabinfo + slabs) &&
+                       lines != 0; slab++) {
 
                if (slab->alias)
                        continue;
 
+               if (lines != -1)
+                       lines--;
 
                if (show_numa)
                        slab_numa(slab, 0);
@@ -1267,24 +1297,54 @@ static void output_slabs(void)
        }
 }
 
+static void xtotals(void)
+{
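+       /* -X/--Xtotals: after the totals, list top slabs by size and by loss */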
+       totals();
+
+       link_slabs();
+       rename_slabs();
+
+       printf("\nSlabs sorted by size\n");
+       printf("--------------------\n");
+       sort_loss = 0;
+       sort_size = 1;
+       sort_slabs();
+       output_slabs();
+
+       printf("\nSlabs sorted by loss\n");
+       printf("--------------------\n");
+       line = 0;
+       sort_loss = 1;
+       sort_size = 0;
+       sort_slabs();
+       output_slabs();
+       printf("\n");
+}
+
 struct option opts[] = {
-       { "aliases", 0, NULL, 'a' },
-       { "activity", 0, NULL, 'A' },
-       { "debug", 2, NULL, 'd' },
-       { "display-activity", 0, NULL, 'D' },
-       { "empty", 0, NULL, 'e' },
-       { "first-alias", 0, NULL, 'f' },
-       { "help", 0, NULL, 'h' },
-       { "inverted", 0, NULL, 'i'},
-       { "numa", 0, NULL, 'n' },
-       { "ops", 0, NULL, 'o' },
-       { "report", 0, NULL, 'r' },
-       { "shrink", 0, NULL, 's' },
-       { "slabs", 0, NULL, 'l' },
-       { "track", 0, NULL, 't'},
-       { "validate", 0, NULL, 'v' },
-       { "zero", 0, NULL, 'z' },
-       { "1ref", 0, NULL, '1'},
+       { "aliases", no_argument, NULL, 'a' },
+       { "activity", no_argument, NULL, 'A' },
+       { "debug", optional_argument, NULL, 'd' },
+       { "display-activity", no_argument, NULL, 'D' },
+       { "empty", no_argument, NULL, 'e' },
+       { "first-alias", no_argument, NULL, 'f' },
+       { "help", no_argument, NULL, 'h' },
+       { "inverted", no_argument, NULL, 'i'},
+       { "slabs", no_argument, NULL, 'l' },
+       { "numa", no_argument, NULL, 'n' },
+       { "ops", no_argument, NULL, 'o' },
+       { "shrink", no_argument, NULL, 's' },
+       { "report", no_argument, NULL, 'r' },
+       { "Size", no_argument, NULL, 'S'},
+       { "tracking", no_argument, NULL, 't'},
+       { "Totals", no_argument, NULL, 'T'},
+       { "validate", no_argument, NULL, 'v' },
+       { "zero", no_argument, NULL, 'z' },
+       { "1ref", no_argument, NULL, '1'},
+       { "lines", required_argument, NULL, 'N'},
+       { "Loss", no_argument, NULL, 'L'},
+       { "Xtotals", no_argument, NULL, 'X'},
+       { "Bytes", no_argument, NULL, 'B'},
        { NULL, 0, NULL, 0 }
 };
 
@@ -1296,7 +1356,7 @@ int main(int argc, char *argv[])
 
        page_size = getpagesize();
 
-       while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
+       while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXB",
                                                opts, NULL)) != -1)
                switch (c) {
                case '1':
@@ -1358,7 +1418,25 @@ int main(int argc, char *argv[])
                case 'S':
                        sort_size = 1;
                        break;
-
+               case 'N':
+                       if (optarg) {
+                               output_lines = atoi(optarg);
+                               if (output_lines < 1)
+                                       output_lines = 1;
+                       }
+                       break;
+               case 'L':
+                       sort_loss = 1;
+                       break;
+               case 'X':
+                       if (output_lines == -1)
+                               output_lines = 1;
+                       extended_totals = 1;
+                       show_bytes = 1;
+                       break;
+               case 'B':
+                       show_bytes = 1;
+                       break;
                default:
                        fatal("%s: Invalid option '%c'\n", argv[0], optopt);
 
@@ -1378,12 +1456,13 @@ int main(int argc, char *argv[])
                fatal("%s: Invalid pattern '%s' code %d\n",
                        argv[0], pattern_source, err);
        read_slab_dir();
-       if (show_alias)
+       if (show_alias) {
                alias();
-       else
-       if (show_totals)
+       } else if (extended_totals) {
+               xtotals();
+       } else if (show_totals) {
                totals();
-       else {
+       } else {
                link_slabs();
                rename_slabs();
                sort_slabs();