git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'akpm-current/current'
author    Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 2 Nov 2015 03:45:18 +0000 (14:45 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 2 Nov 2015 03:45:26 +0000 (14:45 +1100)
99 files changed:
Documentation/filesystems/proc.txt
Documentation/kernel-parameters.txt
MAINTAINERS
arch/arc/mm/cache.c
arch/arm/mm/dma-mapping.c
arch/arm/xen/mm.c
arch/arm64/include/asm/pgtable.h
arch/arm64/mm/dma-mapping.c
arch/mips/mm/tlbex.c
arch/powerpc/include/asm/pgtable-ppc64.h
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/numa.c
arch/powerpc/sysdev/fsl_pci.c
arch/s390/include/asm/pgtable.h
arch/x86/Kconfig
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_types.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/smpboot.c
arch/x86/mm/gup.c
block/blk-core.c
block/blk-mq-tag.c
block/blk-mq.c
block/genhd.c
drivers/block/drbd/drbd_bitmap.c
drivers/block/nbd.c
drivers/block/pktcdvd.c
drivers/gpu/drm/drm_gem.c
drivers/gpu/drm/i915/i915_gem.c
drivers/infiniband/core/sa_query.c
drivers/iommu/amd_iommu.c
drivers/iommu/intel-iommu.c
drivers/md/dm-crypt.c
drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c
drivers/media/pci/solo6x10/solo6x10-v4l2.c
drivers/media/pci/tw68/tw68-video.c
drivers/misc/vmw_balloon.c
drivers/mtd/mtdcore.c
drivers/nvme/host/pci.c
drivers/staging/android/ion/ion_system_heap.c
drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
drivers/staging/rdma/hfi1/init.c
drivers/staging/rdma/ipath/ipath_file_ops.c
drivers/usb/gadget/function/f_mass_storage.c
drivers/usb/host/u132-hcd.c
fs/9p/vfs_file.c
fs/cifs/file.c
fs/coredump.c
fs/direct-io.c
fs/ext4/inode.c
fs/ext4/readpage.c
fs/ext4/super.c
fs/fs-writeback.c
fs/jffs2/wbuf.c
fs/mpage.c
fs/namei.c
fs/nfs/file.c
fs/ocfs2/cluster/heartbeat.c
fs/proc/array.c
fs/proc/task_mmu.c
fs/xfs/xfs_qm.c
include/asm-generic/pgtable.h
include/drm/drmP.h
include/linux/compiler-gcc.h
include/linux/compiler.h
include/linux/fs.h
include/linux/hugetlb_cgroup.h
include/linux/memcontrol.h
include/linux/sched.h
include/linux/skbuff.h
include/net/sock.h
kernel/audit.c
kernel/cgroup.c
kernel/cpuset.c
kernel/fork.c
kernel/futex.c
kernel/kexec_core.c
kernel/params.c
kernel/sysctl.c
lib/Kconfig.debug
lib/Makefile
lib/dma-debug.c
lib/kobject.c
mm/backing-dev.c
mm/failslab.c
mm/huge_memory.c
mm/memcontrol.c
mm/memory_hotplug.c
mm/page_alloc.c
mm/pgtable-generic.c
mm/vmscan.c
net/core/sock.c
net/netlink/af_netlink.c
net/openvswitch/flow.c
net/rds/ib_recv.c
net/rxrpc/ar-connection.c
tools/testing/selftests/Makefile

diff --combined Documentation/filesystems/proc.txt
index 3a9d65c912e780977c12102d7719a0374241b962,d6f259eaa5efb384cb503d312808ec06660efba5..55ffd0820feba09105555137efa4eb54abc39be1
@@@ -140,8 -140,7 +140,8 @@@ Table 1-1: Process specific entries in 
   stat         Process status
   statm                Process memory status information
   status               Process status in human readable form
 - wchan                If CONFIG_KALLSYMS is set, a pre-decoded wchan
 + wchan                Present with CONFIG_KALLSYMS=y: it shows the kernel function
 +              symbol the task is blocked in - or "0" if not blocked.
   pagemap      Page table
   stack                Report full stack trace, enable via CONFIG_STACKTRACE
  smaps               an extension based on maps, showing the memory consumption of
@@@ -175,6 -174,7 +175,7 @@@ read the file /proc/PID/status
    VmLib:      1412 kB
    VmPTE:        20 kb
    VmSwap:        0 kB
+   HugetlbPages:          0 kB
    Threads:        1
    SigQ:   0/28578
    SigPnd: 0000000000000000
@@@ -238,6 -238,7 +239,7 @@@ Table 1-2: Contents of the status file
   VmPTE                       size of page table entries
   VmPMD                       size of second level page tables
   VmSwap                      size of swap usage (the number of referred swapents)
+  HugetlbPages                size of hugetlb memory portions
   Threads                     number of threads
   SigQ                        number of signals queued/max. number for queue
   SigPnd                      bitmap of pending signals for the thread
@@@ -311,7 -312,7 +313,7 @@@ Table 1-4: Contents of the stat files (
    blocked       bitmap of blocked signals
    sigign        bitmap of ignored signals
    sigcatch      bitmap of caught signals
 -  wchan         address where process went to sleep
 +  0           (place holder, used to be the wchan address, use /proc/PID/wchan instead)
    0             (place holder)
    0             (place holder)
    exit_signal   signal to send to parent thread on exit
@@@ -424,6 -425,9 +426,9 @@@ Private_Clean:         0 k
  Private_Dirty:         0 kB
  Referenced:          892 kB
  Anonymous:             0 kB
+ AnonHugePages:         0 kB
+ Shared_Hugetlb:        0 kB
+ Private_Hugetlb:        0 kB
  Swap:                  0 kB
  SwapPss:               0 kB
  KernelPageSize:        4 kB
@@@ -452,6 -456,11 +457,11 @@@ and a page is modified, the file page i
  "Swap" shows how much would-be-anonymous memory is also used, but out on
  swap.
  "SwapPss" shows proportional swap share of this mapping.
+ "AnonHugePages" shows the ammount of memory backed by transparent hugepage.
+ "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by
+ hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
+ reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
  "VmFlags" field deserves a separate description. This member represents the kernel
  flags associated with the particular virtual memory area in two letter encoded
  manner. The codes are the following:
      ac  - area is accountable
      nr  - swap space is not reserved for the area
      ht  - area uses huge tlb pages
-     nl  - non-linear mapping
      ar  - architecture specific flag
      dd  - do not include area into core dump
      sd  - soft-dirty flag
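
[Editor's note, not part of the patch] The new HugetlbPages field in /proc/PID/status (and the Shared_Hugetlb/Private_Hugetlb fields in smaps) follow the same "Name: value kB" layout as the existing entries, so they can be read with plain text parsing. A minimal user-space sketch that looks the field up for the current process; the field is simply absent on kernels without this series:

#include <stdio.h>

/*
 * Minimal sketch: scan /proc/self/status for the HugetlbPages line
 * documented above. If the field is missing (kernel without this
 * series), say so and exit with a distinct status.
 */
int main(void)
{
	char line[256];
	unsigned long kb;
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "HugetlbPages: %lu kB", &kb) == 1) {
			printf("HugetlbPages: %lu kB\n", kb);
			fclose(f);
			return 0;
		}
	}

	fclose(f);
	fprintf(stderr, "HugetlbPages: field not present on this kernel\n");
	return 2;
}
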
diff --combined Documentation/kernel-parameters.txt
index 101573c07788945b4423e3d29dbbed9275ab5cda,b4af96e01b06fc24d9636b278ff23b111f9173da..f8aae632f02f678000f9de7a8d507a5ea0bb9404
@@@ -167,8 -167,7 +167,8 @@@ bytes respectively. Such letter suffixe
  
        acpi=           [HW,ACPI,X86,ARM64]
                        Advanced Configuration and Power Interface
 -                      Format: { force | off | strict | noirq | rsdt }
 +                      Format: { force | off | strict | noirq | rsdt |
 +                                copy_dsdt }
                        force -- enable ACPI if default was off
                        off -- disable ACPI if default was on
                        noirq -- do not use ACPI for IRQ routing
                        is passed, kernel could allocate physical memory region
                        above 4G, that cause second kernel crash on system
                        that require some amount of low memory, e.g. swiotlb
 -                      requires at least 64M+32K low memory.  Kernel would
 -                      try to allocate 72M below 4G automatically.
 +                      requires at least 64M+32K low memory, also enough extra
 +                      low memory is needed to make sure DMA buffers for 32-bit
 +                      devices won't run out. Kernel would try to allocate
 +                      at least 256M below 4G automatically.
                        This one let user to specify own low range under 4G
                        for second kernel instead.
                        0: to disable low allocation.
                        The filter can be disabled or changed to another
                        driver later using sysfs.
  
 -      drm_kms_helper.edid_firmware=[<connector>:]<file>
 -                      Broken monitors, graphic adapters and KVMs may
 -                      send no or incorrect EDID data sets. This parameter
 -                      allows to specify an EDID data set in the
 -                      /lib/firmware directory that is used instead.
 +      drm_kms_helper.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
 +                      Broken monitors, graphic adapters, KVMs and EDIDless
 +                      panels may send no or incorrect EDID data sets.
 +                      This parameter allows specifying EDID data sets
 +                      in the /lib/firmware directory that are used instead.
                        Generic built-in EDID data sets are used, if one of
                        edid/1024x768.bin, edid/1280x1024.bin,
                        edid/1680x1050.bin, or edid/1920x1080.bin is given
                        available in Documentation/EDID/HOWTO.txt. An EDID
                        data set will only be used for a particular connector,
                        if its name and a colon are prepended to the EDID
 -                      name.
 +                      name. Each connector may use a unique EDID data
 +                      set by separating the files with a comma.  An EDID
 +                      data set with no connector name will be used for
 +                      any connectors not explicitly specified.
  
        dscc4.setup=    [NET]
  
  
        earlycon=       [KNL] Output early console device and options.
  
 +                      When used with no options, the early console is
 +                      determined by the stdout-path property in device
 +                      tree's chosen node.
 +
                cdns,<addr>
                        Start an early, polled-mode console on a cadence serial
                        port at the specified address. The cadence serial port
                        serial port must already be setup and configured.
                        Options are not yet supported.
  
 +              lpuart,<addr>
 +              lpuart32,<addr>
 +                      Use early console provided by Freescale LP UART driver
 +                      found on Freescale Vybrid and QorIQ LS1021A processors.
 +                      A valid base address must be provided, and the serial
 +                      port must already be setup and configured.
 +
        earlyprintk=    [X86,SH,BLACKFIN,ARM,M68k]
                        earlyprintk=vga
                        earlyprintk=efi
                        you are really sure that your UEFI does sane gc and
                        fulfills the spec otherwise your board may brick.
  
 +      efi_fake_mem=   nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86]
 +                      Add an arbitrary attribute to a specific memory range by
 +                      updating the original EFI memory map.
 +                      The attribute aa is added to the region of memory from
 +                      ss to ss+nn.
 +                      If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000
 +                      is specified, EFI_MEMORY_MORE_RELIABLE(0x10000)
 +                      attribute is added to range 0x100000000-0x180000000 and
 +                      0x10a0000000-0x1120000000.
 +
 +                      Using this parameter you can debug EFI memmap related
 +                      features. For example, you can debug the Address Range
 +                      Mirroring feature even if your box doesn't support it.
 +
        eisa_irq_edge=  [PARISC,HW]
                        See header of drivers/parisc/eisa.c.
  
                        Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
                        Default: 1024
  
+       hardlockup_all_cpu_backtrace=
+                       [KNL] Should the hard-lockup detector generate
+                       backtraces on all cpus.
+                       Format: <integer>
        hashdist=       [KNL,NUMA] Large hashes allocated during boot
                        are distributed across NUMA nodes.  Defaults on
                        for 64-bit NUMA, off otherwise.
                hwp_only
                        Only load intel_pstate on systems which support
                        hardware P state control (HWP) if available.
 +              no_acpi
 +                      Don't use the limits specified by the ACPI processor
 +                      performance control objects _PSS and _PPC.
  
        intremap=       [X86-64, Intel-IOMMU]
                        on      enable Interrupt Remapping (default)
                        nosid   disable Source ID checking
                        no_x2apic_optout
                                BIOS x2APIC opt-out request will be ignored
 +                      nopost  disable Interrupt Posting
  
        iomem=          Disable strict checking of access to MMIO memory
                strict  regions from userspace.
        nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
                        Format: [panic,][nopanic,][num]
                        Valid num: 0 or 1
 -                      0 - turn nmi_watchdog off
 -                      1 - turn nmi_watchdog on
 +                      0 - turn hardlockup detector in nmi_watchdog off
 +                      1 - turn hardlockup detector in nmi_watchdog on
                        When panic is specified, panic when an NMI watchdog
                        timeout occurs (or 'nopanic' to override the opposite
 -                      default).
 +                      default). To disable both hard and soft lockup detectors,
 +                      please see 'nowatchdog'.
                        This is useful when you use a panic=... timeout and
                        need the box quickly up again.
  
                        cache-to-cache transfer latencies.
  
        rcutree.rcu_fanout_leaf= [KNL]
 -                      Increase the number of CPUs assigned to each
 -                      leaf rcu_node structure.  Useful for very large
 -                      systems.
 +                      Change the number of CPUs assigned to each
 +                      leaf rcu_node structure.  Useful for very
 +                      large systems, which will choose the value 64,
 +                      and for NUMA systems with large remote-access
 +                      latencies, which will choose a value aligned
 +                      with the appropriate hardware boundaries.
  
        rcutree.jiffies_till_sched_qs= [KNL]
                        Set required age in jiffies for a
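
[Editor's note, not part of the patch] For the efi_fake_mem= format documented above, the attribute aa applies to the physical range [ss, ss+nn). A small user-space sketch that expands the first entry of the documented example; parse_size() is a hypothetical stand-in for the kernel's memparse()-style suffix handling, reimplemented here only for illustration:

#include <stdio.h>
#include <stdlib.h>

/* Expand one nn[KMG]@ss[KMG]:aa entry the way the documentation above
 * describes it: attribute aa covers the physical range [ss, ss+nn). */
static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': v <<= 30; (*end)++; break;
	case 'M': case 'm': v <<= 20; (*end)++; break;
	case 'K': case 'k': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	const char *arg = "2G@4G:0x10000";	/* first entry of the example above */
	char *p;
	unsigned long long nn, ss, aa;

	nn = parse_size(arg, &p);		/* size */
	ss = parse_size(p + 1, &p);		/* start, after '@' */
	aa = strtoull(p + 1, NULL, 0);		/* attribute, after ':' */

	printf("attr 0x%llx applied to 0x%llx-0x%llx\n", aa, ss, ss + nn);
	return 0;
}

The printed range, 0x100000000-0x180000000 with attribute 0x10000, matches the figures quoted in the parameter description above.
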
diff --combined MAINTAINERS
index 7016beec7f93a157694bd97981722472f6bf815a,5f36f5a294439f4fd8ce948deab9d14745e10b29..c16f3f95db26998179fcc942bd2051290fc69200
@@@ -240,12 -240,6 +240,12 @@@ L:       lm-sensors@lm-sensors.or
  S:    Maintained
  F:    drivers/hwmon/abituguru3.c
  
 +ACCES 104-IDIO-16 GPIO DRIVER
 +M:    "William Breathitt Gray" <vilhelm.gray@gmail.com>
 +L:    linux-gpio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/gpio/gpio-104-idio-16.c
 +
  ACENIC DRIVER
  M:    Jes Sorensen <jes@trained-monkey.org>
  L:    linux-acenic@sunsite.dk
@@@ -660,6 -654,11 +660,6 @@@ F:        drivers/gpu/drm/radeon/radeon_kfd.
  F:    drivers/gpu/drm/radeon/radeon_kfd.h
  F:    include/uapi/linux/kfd_ioctl.h
  
 -AMD MICROCODE UPDATE SUPPORT
 -M:    Borislav Petkov <bp@alien8.de>
 -S:    Maintained
 -F:    arch/x86/kernel/cpu/microcode/amd*
 -
  AMD XGBE DRIVER
  M:    Tom Lendacky <thomas.lendacky@amd.com>
  L:    netdev@vger.kernel.org
@@@ -789,11 -788,6 +789,11 @@@ S:       Maintaine
  F:    drivers/net/appletalk/
  F:    net/appletalk/
  
 +APPLIED MICRO (APM) X-GENE DEVICE TREE SUPPORT
 +M:    Duc Dang <dhdang@apm.com>
 +S:    Supported
 +F:    arch/arm64/boot/dts/apm/
 +
  APPLIED MICRO (APM) X-GENE SOC ETHERNET DRIVER
  M:    Iyappan Subramanian <isubramanian@apm.com>
  M:    Keyur Chudgar <kchudgar@apm.com>
@@@ -828,13 -822,12 +828,13 @@@ F:      arch/arm/include/asm/floppy.
  
  ARM PMU PROFILING AND DEBUGGING
  M:    Will Deacon <will.deacon@arm.com>
 +R:    Mark Rutland <mark.rutland@arm.com>
  S:    Maintained
 -F:    arch/arm/kernel/perf_*
 +F:    arch/arm*/kernel/perf_*
  F:    arch/arm/oprofile/common.c
 -F:    arch/arm/kernel/hw_breakpoint.c
 -F:    arch/arm/include/asm/hw_breakpoint.h
 -F:    arch/arm/include/asm/perf_event.h
 +F:    arch/arm*/kernel/hw_breakpoint.c
 +F:    arch/arm*/include/asm/hw_breakpoint.h
 +F:    arch/arm*/include/asm/perf_event.h
  F:    drivers/perf/arm_pmu.c
  F:    include/linux/perf/arm_pmu.h
  
@@@ -901,12 -894,11 +901,12 @@@ M:      Lennert Buytenhek <kernel@wantstofly
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  
 -ARM/Allwinner A1X SoC support
 +ARM/Allwinner sunXi SoC support
  M:    Maxime Ripard <maxime.ripard@free-electrons.com>
 +M:    Chen-Yu Tsai <wens@csie.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
 -N:    sun[x4567]i
 +N:    sun[x456789]i
  
  ARM/Allwinner SoC Clock Support
  M:    Emilio López <emilio@elopez.com.ar>
@@@ -925,7 -917,7 +925,7 @@@ M: Tsahee Zidenberg <tsahee@annapurnala
  S:    Maintained
  F:    arch/arm/mach-alpine/
  
 -ARM/ATMEL AT91RM9200 AND AT91SAM ARM ARCHITECTURES
 +ARM/ATMEL AT91RM9200, AT91SAM9 AND SAMA5 SOC SUPPORT
  M:    Nicolas Ferre <nicolas.ferre@atmel.com>
  M:    Alexandre Belloni <alexandre.belloni@free-electrons.com>
  M:    Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
@@@ -1238,13 -1230,6 +1238,13 @@@ ARM/LPC18XX ARCHITECTUR
  M:    Joachim Eastwood <manabian@gmail.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
 +F:    arch/arm/boot/dts/lpc43*
 +F:    drivers/clk/nxp/clk-lpc18xx*
 +F:    drivers/clocksource/time-lpc32xx.c
 +F:    drivers/i2c/busses/i2c-lpc2k.c
 +F:    drivers/memory/pl172.c
 +F:    drivers/mtd/spi-nor/nxp-spifi.c
 +F:    drivers/rtc/rtc-lpc24xx.c
  N:    lpc18xx
  
  ARM/MAGICIAN MACHINE SUPPORT
@@@ -1312,13 -1297,6 +1312,13 @@@ F:    arch/arm/mach-mediatek
  N:    mtk
  K:    mediatek
  
 +ARM/Mediatek USB3 PHY DRIVER
 +M:    Chunfeng Yun <chunfeng.yun@mediatek.com>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +L:    linux-mediatek@lists.infradead.org (moderated for non-subscribers)
 +S:    Maintained
 +F:    drivers/phy/phy-mt65xx-usb3.c
 +
  ARM/MICREL KS8695 ARCHITECTURE
  M:    Greg Ungerer <gerg@uclinux.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1466,12 -1444,7 +1466,12 @@@ F:    arch/arm/mach-exynos*
  F:    drivers/*/*s3c2410*
  F:    drivers/*/*/*s3c2410*
  F:    drivers/spi/spi-s3c*
 +F:    drivers/soc/samsung/*
  F:    sound/soc/samsung/*
 +F:    Documentation/arm/Samsung/
 +F:    Documentation/devicetree/bindings/arm/samsung/
 +F:    Documentation/devicetree/bindings/sram/samsung-sram.txt
 +F:    Documentation/devicetree/bindings/power/pd-samsung.txt
  N:    exynos
  
  ARM/SAMSUNG MOBILE MACHINE SUPPORT
@@@ -1506,14 -1479,6 +1506,14 @@@ L:    linux-media@vger.kernel.or
  S:    Maintained
  F:    drivers/media/platform/s5p-tv/
  
 +ARM/SAMSUNG S5P SERIES JPEG CODEC SUPPORT
 +M:    Andrzej Pietrasiewicz <andrzej.p@samsung.com>
 +M:    Jacek Anaszewski <j.anaszewski@samsung.com>
 +L:    linux-arm-kernel@lists.infradead.org
 +L:    linux-media@vger.kernel.org
 +S:    Maintained
 +F:    drivers/media/platform/s5p-jpeg/
 +
  ARM/SHMOBILE ARM ARCHITECTURE
  M:    Simon Horman <horms@verge.net.au>
  M:    Magnus Damm <magnus.damm@gmail.com>
@@@ -1526,6 -1491,8 +1526,6 @@@ F:      arch/arm/boot/dts/emev2
  F:    arch/arm/boot/dts/r7s*
  F:    arch/arm/boot/dts/r8a*
  F:    arch/arm/boot/dts/sh*
 -F:    arch/arm/configs/bockw_defconfig
 -F:    arch/arm/configs/marzen_defconfig
  F:    arch/arm/configs/shmobile_defconfig
  F:    arch/arm/include/debug/renesas-scif.S
  F:    arch/arm/mach-shmobile/
@@@ -1560,7 -1527,6 +1560,7 @@@ W:      http://www.stlinux.co
  S:    Maintained
  F:    arch/arm/mach-sti/
  F:    arch/arm/boot/dts/sti*
 +F:    drivers/char/hw_random/st-rng.c
  F:    drivers/clocksource/arm_global_timer.c
  F:    drivers/clocksource/clksrc_st_lpc.c
  F:    drivers/i2c/busses/i2c-st.c
@@@ -1640,10 -1606,7 +1640,10 @@@ M:    Masahiro Yamada <yamada.masahiro@soc
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    arch/arm/boot/dts/uniphier*
 +F:    arch/arm/include/asm/hardware/cache-uniphier.h
  F:    arch/arm/mach-uniphier/
 +F:    arch/arm/mm/cache-uniphier.c
 +F:    drivers/i2c/busses/i2c-uniphier*
  F:    drivers/pinctrl/uniphier/
  F:    drivers/tty/serial/8250/8250_uniphier.c
  N:    uniphier
@@@ -1816,14 -1779,6 +1816,14 @@@ S:    Supporte
  F:    Documentation/aoe/
  F:    drivers/block/aoe/
  
 +ATHEROS 71XX/9XXX GPIO DRIVER
 +M:    Alban Bedel <albeu@free.fr>
 +W:    https://github.com/AlbanBedel/linux
 +T:    git git://github.com/AlbanBedel/linux
 +S:    Maintained
 +F:    drivers/gpio/gpio-ath79.c
 +F:    Documentation/devicetree/bindings/gpio/gpio-ath79.txt
 +
  ATHEROS ATH GENERIC UTILITIES
  M:    "Luis R. Rodriguez" <mcgrof@do-not-panic.com>
  L:    linux-wireless@vger.kernel.org
@@@ -2405,27 -2360,19 +2405,27 @@@ L:   linux-scsi@vger.kernel.or
  S:    Supported
  F:    drivers/scsi/bnx2i/
  
 -BROADCOM CYGNUS/IPROC ARM ARCHITECTURE
 +BROADCOM IPROC ARM ARCHITECTURE
  M:    Ray Jui <rjui@broadcom.com>
  M:    Scott Branden <sbranden@broadcom.com>
 +M:    Jon Mason <jonmason@broadcom.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  L:    bcm-kernel-feedback-list@broadcom.com
  T:    git git://github.com/broadcom/cygnus-linux.git
  S:    Maintained
  N:    iproc
  N:    cygnus
 +N:    nsp
  N:    bcm9113*
  N:    bcm9583*
 -N:    bcm583*
 +N:    bcm9585*
 +N:    bcm9586*
 +N:    bcm988312
  N:    bcm113*
 +N:    bcm583*
 +N:    bcm585*
 +N:    bcm586*
 +N:    bcm88312
  
  BROADCOM BRCMSTB GPIO DRIVER
  M:    Gregory Fong <gregory.0xf0@gmail.com>
@@@ -2783,10 -2730,9 +2783,10 @@@ S:    Supporte
  F:    drivers/net/ethernet/cisco/enic/
  
  CISCO VIC LOW LATENCY NIC DRIVER
 -M:    Upinder Malhi <umalhi@cisco.com>
 +M:    Christian Benvenuti <benve@cisco.com>
 +M:    Dave Goodell <dgoodell@cisco.com>
  S:    Supported
 -F:    drivers/infiniband/hw/usnic
 +F:    drivers/infiniband/hw/usnic/
  
  CIRRUS LOGIC EP93XX ETHERNET DRIVER
  M:    Hartley Sweeten <hsweeten@visionengravers.com>
@@@ -3421,7 -3367,6 +3421,7 @@@ M:      Support Opensource <support.opensour
  W:    http://www.dialog-semiconductor.com/products
  S:    Supported
  F:    Documentation/hwmon/da90??
 +F:    Documentation/devicetree/bindings/sound/da[79]*.txt
  F:    drivers/gpio/gpio-da90??.c
  F:    drivers/hwmon/da90??-hwmon.c
  F:    drivers/iio/adc/da91??-*.c
@@@ -3556,15 -3501,13 +3556,15 @@@ M:   Jonathan Corbet <corbet@lwn.net
  L:    linux-doc@vger.kernel.org
  S:    Maintained
  F:    Documentation/
 +F:    scripts/docproc.c
 +F:    scripts/kernel-doc*
  X:    Documentation/ABI/
  X:    Documentation/devicetree/
  X:    Documentation/acpi
  X:    Documentation/power
  X:    Documentation/spi
  X:    Documentation/DocBook/media
 -T:    git git://git.lwn.net/linux-2.6.git docs-next
 +T:    git git://git.lwn.net/linux.git docs-next
  
  DOUBLETALK DRIVER
  M:    "James R. Van Zandt" <jrv@vanzandt.mv.com>
@@@ -3641,7 -3584,6 +3641,7 @@@ M:      Daniel Vetter <daniel.vetter@intel.c
  M:    Jani Nikula <jani.nikula@linux.intel.com>
  L:    intel-gfx@lists.freedesktop.org
  L:    dri-devel@lists.freedesktop.org
 +W:    https://01.org/linuxgraphics/
  Q:    http://patchwork.freedesktop.org/project/intel-gfx/
  T:    git git://anongit.freedesktop.org/drm-intel
  S:    Supported
@@@ -4394,13 -4336,6 +4394,13 @@@ F:    include/linux/fmc*.
  F:    include/linux/ipmi-fru.h
  K:    fmc_d.*register
  
 +FPGA MANAGER FRAMEWORK
 +M:    Alan Tull <atull@opensource.altera.com>
 +S:    Maintained
 +F:    drivers/fpga/
 +F:    include/linux/fpga/fpga-mgr.h
 +W:    http://www.rocketboards.org
 +
  FPU EMULATOR
  M:    Bill Metzenthen <billm@melbpc.org.au>
  W:    http://floatingpoint.sourceforge.net/emulator/index.html
@@@ -4492,14 -4427,6 +4492,14 @@@ L:    linuxppc-dev@lists.ozlabs.or
  S:    Maintained
  F:    drivers/net/ethernet/freescale/ucc_geth*
  
 +FREESCALE eTSEC ETHERNET DRIVER (GIANFAR)
 +M:    Claudiu Manoil <claudiu.manoil@freescale.com>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    drivers/net/ethernet/freescale/gianfar*
 +X:    drivers/net/ethernet/freescale/gianfar_ptp.c
 +F:    Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
 +
  FREESCALE QUICC ENGINE UCC UART DRIVER
  M:    Timur Tabi <timur@tabi.org>
  L:    linuxppc-dev@lists.ozlabs.org
@@@ -5177,7 -5104,6 +5177,7 @@@ S:      Maintaine
  F:    Documentation/devicetree/bindings/i2c/
  F:    Documentation/i2c/
  F:    drivers/i2c/
 +F:    drivers/i2c/*/
  F:    include/linux/i2c.h
  F:    include/linux/i2c-*.h
  F:    include/uapi/linux/i2c.h
@@@ -5519,6 -5445,12 +5519,6 @@@ W:     https://01.org/linux-acp
  S:    Supported
  F:    drivers/platform/x86/intel_menlow.c
  
 -INTEL IA32 MICROCODE UPDATE SUPPORT
 -M:    Borislav Petkov <bp@alien8.de>
 -S:    Maintained
 -F:    arch/x86/kernel/cpu/microcode/core*
 -F:    arch/x86/kernel/cpu/microcode/intel*
 -
  INTEL I/OAT DMA DRIVER
  M:    Dave Jiang <dave.jiang@intel.com>
  R:    Dan Williams <dan.j.williams@intel.com>
@@@ -5598,12 -5530,6 +5598,12 @@@ F:    Documentation/networking/README.ipw2
  F:    Documentation/networking/README.ipw2200
  F:    drivers/net/wireless/ipw2x00/
  
 +INTEL(R) TRACE HUB
 +M:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
 +S:    Supported
 +F:    Documentation/trace/intel_th.txt
 +F:    drivers/hwtracing/intel_th/
 +
  INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT)
  M:    Richard L Maliszewski <richard.l.maliszewski@intel.com>
  M:    Gang Wei <gang.wei@intel.com>
@@@ -5635,7 -5561,7 +5635,7 @@@ F:      drivers/net/wireless/iwlegacy
  INTEL WIRELESS WIFI LINK (iwlwifi)
  M:    Johannes Berg <johannes.berg@intel.com>
  M:    Emmanuel Grumbach <emmanuel.grumbach@intel.com>
 -M:    Intel Linux Wireless <ilw@linux.intel.com>
 +M:    Intel Linux Wireless <linuxwifi@intel.com>
  L:    linux-wireless@vger.kernel.org
  W:    http://intellinuxwireless.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi.git
@@@ -5651,22 -5577,6 +5651,22 @@@ F:    include/linux/mei_cl_bus.
  F:    drivers/misc/mei/*
  F:    Documentation/misc-devices/mei/*
  
 +INTEL MIC DRIVERS (mic)
 +M:    Sudeep Dutt <sudeep.dutt@intel.com>
 +M:    Ashutosh Dixit <ashutosh.dixit@intel.com>
 +S:    Supported
 +W:    https://github.com/sudeepdutt/mic
 +W:    http://software.intel.com/en-us/mic-developer
 +F:    include/linux/mic_bus.h
 +F:    include/linux/scif.h
 +F:    include/uapi/linux/mic_common.h
 +F:    include/uapi/linux/mic_ioctl.h
 +F:   include/uapi/linux/scif_ioctl.h
 +F:    drivers/misc/mic/
 +F:    drivers/dma/mic_x100_dma.c
 +F:    drivers/dma/mic_x100_dma.h
 +F:   Documentation/mic/
 +
  INTEL PMC IPC DRIVER
  M:    Zha Qipeng<qipeng.zha@intel.com>
  L:    platform-driver-x86@vger.kernel.org
@@@ -6198,13 -6108,6 +6198,13 @@@ F:    Documentation/auxdisplay/ks010
  F:    drivers/auxdisplay/ks0108.c
  F:    include/linux/ks0108.h
  
 +L3MDEV
 +M:    David Ahern <dsa@cumulusnetworks.com>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    net/l3mdev
 +F:    include/net/l3mdev.h
 +
  LAPB module
  L:    linux-x25@vger.kernel.org
  S:    Orphan
@@@ -6355,14 -6258,6 +6355,14 @@@ F:    drivers/nvdimm/pmem.
  F:    include/linux/pmem.h
  F:    arch/*/include/asm/pmem.h
  
 +LIGHTNVM PLATFORM SUPPORT
 +M:    Matias Bjorling <mb@lightnvm.io>
 +W:    http://github/OpenChannelSSD
 +S:    Maintained
 +F:    drivers/lightnvm/
 +F:    include/linux/lightnvm.h
 +F:    include/uapi/linux/lightnvm.h
 +
  LINUX FOR IBM pSERIES (RS/6000)
  M:    Paul Mackerras <paulus@au.ibm.com>
  W:    http://www.ibm.com/linux/ltc/projects/ppc
@@@ -6680,13 -6575,6 +6680,13 @@@ M:    Guenter Roeck <linux@roeck-us.net
  S:    Maintained
  F:    drivers/net/dsa/mv88e6352.c
  
 +MARVELL CRYPTO DRIVER
 +M:    Boris Brezillon <boris.brezillon@free-electrons.com>
 +M:    Arnaud Ebalard <arno@natisbad.org>
 +F:    drivers/crypto/marvell/
 +S:    Maintained
 +L:    linux-crypto@vger.kernel.org
 +
  MARVELL GIGABIT ETHERNET DRIVERS (skge/sky2)
  M:    Mirko Lindner <mlindner@marvell.com>
  M:    Stephen Hemminger <stephen@networkplumber.org>
@@@ -6809,12 -6697,6 +6809,12 @@@ W:    http://linuxtv.or
  S:    Maintained
  F:    drivers/media/radio/radio-maxiradio*
  
 +MCP4531 MICROCHIP DIGITAL POTENTIOMETER DRIVER
 +M:    Peter Rosin <peda@axentia.se>
 +L:    linux-iio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/iio/potentiometer/mcp4531.c
 +
  MEDIA DRIVERS FOR RENESAS - VSP1
  M:    Laurent Pinchart <laurent.pinchart@ideasonboard.com>
  L:    linux-media@vger.kernel.org
@@@ -7050,13 -6932,6 +7050,13 @@@ S:    Supporte
  F:    include/linux/mlx5/
  F:    drivers/infiniband/hw/mlx5/
  
 +MELEXIS MLX90614 DRIVER
 +M:    Crt Mori <cmo@melexis.com>
 +L:    linux-iio@vger.kernel.org
 +W:    http://www.melexis.com
 +S:    Supported
 +F:    drivers/iio/temperature/mlx90614.c
 +
  MN88472 MEDIA DRIVER
  M:    Antti Palosaari <crope@iki.fi>
  L:    linux-media@vger.kernel.org
@@@ -7110,7 -6985,6 +7110,7 @@@ M:      Alan Ott <alan@signal11.us
  L:    linux-wpan@vger.kernel.org
  S:    Maintained
  F:    drivers/net/ieee802154/mrf24j40.c
 +F:    Documentation/devicetree/bindings/net/ieee802154/mrf24j40.txt
  
  MSI LAPTOP SUPPORT
  M:    "Lee, Chun-Yi" <jlee@suse.com>
@@@ -7183,6 -7057,7 +7183,6 @@@ F:      drivers/media/i2c/mt9v032.
  F:    include/media/mt9v032.h
  
  MULTIFUNCTION DEVICES (MFD)
 -M:    Samuel Ortiz <sameo@linux.intel.com>
  M:    Lee Jones <lee.jones@linaro.org>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git
  S:    Supported
@@@ -7444,6 -7319,7 +7444,6 @@@ S:      Odd Fixe
  F:    drivers/net/
  F:    include/linux/if_*
  F:    include/linux/netdevice.h
 -F:    include/linux/arcdevice.h
  F:    include/linux/etherdevice.h
  F:    include/linux/fcdevice.h
  F:    include/linux/fddidevice.h
@@@ -7509,6 -7385,7 +7509,7 @@@ S:      Supporte
  F:    Documentation/filesystems/nilfs2.txt
  F:    fs/nilfs2/
  F:    include/linux/nilfs2_fs.h
+ F:    include/trace/events/nilfs2.h
  
  NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER
  M:    YOKOTA Hiroshi <yokota@netlab.is.tsukuba.ac.jp>
@@@ -7536,10 -7413,10 +7537,10 @@@ NOKIA N900 POWER SUPPLY DRIVER
  M:    Pali Rohár <pali.rohar@gmail.com>
  S:    Maintained
  F:    include/linux/power/bq2415x_charger.h
 -F:    include/linux/power/bq27x00_battery.h
 +F:    include/linux/power/bq27xxx_battery.h
  F:    include/linux/power/isp1704_charger.h
  F:    drivers/power/bq2415x_charger.c
 -F:    drivers/power/bq27x00_battery.c
 +F:    drivers/power/bq27xxx_battery.c
  F:    drivers/power/isp1704_charger.c
  F:    drivers/power/rx51_battery.c
  
@@@ -7582,13 -7459,11 +7583,13 @@@ F:   drivers/video/fbdev/riva
  F:    drivers/video/fbdev/nvidia/
  
  NVM EXPRESS DRIVER
 -M:    Matthew Wilcox <willy@linux.intel.com>
 +M:    Keith Busch <keith.busch@intel.com>
 +M:    Jens Axboe <axboe@fb.com>
  L:    linux-nvme@lists.infradead.org
 -T:    git git://git.infradead.org/users/willy/linux-nvme.git
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 +W:    https://kernel.googlesource.com/pub/scm/linux/kernel/git/axboe/linux-block/
  S:    Supported
 -F:    drivers/block/nvme*
 +F:    drivers/nvme/host/
  F:    include/linux/nvme.h
  
  NVMEM FRAMEWORK
@@@ -8083,14 -7958,6 +8084,14 @@@ F:    include/linux/pci
  F:    arch/x86/pci/
  F:    arch/x86/kernel/quirks.c
  
 +PCI DRIVER FOR ALTERA PCIE IP
 +M:    Ley Foon Tan <lftan@altera.com>
 +L:    rfi@lists.rocketboards.org (moderated for non-subscribers)
 +L:    linux-pci@vger.kernel.org
 +S:    Supported
 +F:    Documentation/devicetree/bindings/pci/altera-pcie.txt
 +F:    drivers/pci/host/pcie-altera.c
 +
  PCI DRIVER FOR ARM VERSATILE PLATFORM
  M:    Rob Herring <robh@kernel.org>
  L:    linux-pci@vger.kernel.org
@@@ -8192,14 -8059,6 +8193,14 @@@ L:    linux-pci@vger.kernel.or
  S:    Maintained
  F:    drivers/pci/host/*spear*
  
 +PCI MSI DRIVER FOR ALTERA MSI IP
 +M:    Ley Foon Tan <lftan@altera.com>
 +L:    rfi@lists.rocketboards.org (moderated for non-subscribers)
 +L:    linux-pci@vger.kernel.org
 +S:    Supported
 +F:    Documentation/devicetree/bindings/pci/altera-pcie-msi.txt
 +F:    drivers/pci/host/pcie-altera-msi.c
 +
  PCI MSI DRIVER FOR APPLIEDMICRO XGENE
  M:    Duc Dang <dhdang@apm.com>
  L:    linux-pci@vger.kernel.org
@@@ -8208,13 -8067,6 +8209,13 @@@ S:    Maintaine
  F:    Documentation/devicetree/bindings/pci/xgene-pci-msi.txt
  F:    drivers/pci/host/pci-xgene-msi.c
  
 +PCIE DRIVER FOR HISILICON
 +M:    Zhou Wang <wangzhou1@hisilicon.com>
 +L:    linux-pci@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
 +F:    drivers/pci/host/pcie-hisi.c
 +
  PCMCIA SUBSYSTEM
  P:    Linux PCMCIA Team
  L:    linux-pcmcia@lists.infradead.org
@@@ -8321,13 -8173,6 +8322,13 @@@ L:    linux-arm-kernel@lists.infradead.or
  S:    Maintained
  F:    drivers/pinctrl/pinctrl-at91.*
  
 +PIN CONTROLLER - ATMEL AT91 PIO4
 +M:    Ludovic Desroches <ludovic.desroches@atmel.com>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +L:    linux-gpio@vger.kernel.org
 +S:    Supported
 +F:    drivers/pinctrl/pinctrl-at91-pio4.*
 +
  PIN CONTROLLER - INTEL
  M:    Mika Westerberg <mika.westerberg@linux.intel.com>
  M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
@@@ -8431,6 -8276,12 +8432,6 @@@ M:     "Rafael J. Wysocki" <rafael.j.wysock
  S:    Maintained
  F:    drivers/pnp/
  
 -PNXxxxx I2C DRIVER
 -M:    Vitaly Wool <vitalywool@gmail.com>
 -L:    linux-i2c@vger.kernel.org
 -S:    Maintained
 -F:    drivers/i2c/busses/i2c-pnx.c
 -
  PPP PROTOCOL DRIVERS AND COMPRESSORS
  M:    Paul Mackerras <paulus@samba.org>
  L:    linux-ppp@vger.kernel.org
@@@ -8683,16 -8534,6 +8684,16 @@@ L:    netdev@vger.kernel.or
  S:    Supported
  F:    drivers/net/ethernet/qlogic/qlge/
  
 +QLOGIC QL4xxx ETHERNET DRIVER
 +M:    Yuval Mintz <Yuval.Mintz@qlogic.com>
 +M:    Ariel Elior <Ariel.Elior@qlogic.com>
 +M:    everest-linux-l2@qlogic.com
 +L:    netdev@vger.kernel.org
 +S:    Supported
 +F:    drivers/net/ethernet/qlogic/qed/
 +F:    include/linux/qed/
 +F:    drivers/net/ethernet/qlogic/qede/
 +
  QNX4 FILESYSTEM
  M:    Anders Larsen <al@alarsen.net>
  W:    http://www.alarsen.net/linux/qnx4fs/
@@@ -9044,13 -8885,6 +9045,13 @@@ S:    Maintaine
  F:    drivers/net/wireless/rtlwifi/
  F:    drivers/net/wireless/rtlwifi/rtl8192ce/
  
 +RTL8XXXU WIRELESS DRIVER (rtl8xxxu)
 +M:    Jes Sorensen <Jes.Sorensen@redhat.com>
 +L:    linux-wireless@vger.kernel.org
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jes/linux.git rtl8723au-mac80211
 +S:    Maintained
 +F:    drivers/net/wireless/realtek/rtl8xxxu/
 +
  S3 SAVAGE FRAMEBUFFER DRIVER
  M:    Antonino Daplas <adaplas@gmail.com>
  L:    linux-fbdev@vger.kernel.org
@@@ -9124,13 -8958,6 +9125,13 @@@ F:    drivers/s390/net/*iucv
  F:    include/net/iucv/
  F:    net/iucv/
  
 +S390 IOMMU (PCI)
 +M:    Gerald Schaefer <gerald.schaefer@de.ibm.com>
 +L:    linux-s390@vger.kernel.org
 +W:    http://www.ibm.com/developerworks/linux/linux390/
 +S:    Supported
 +F:    drivers/iommu/s390-iommu.c
 +
  S3C24XX SD/MMC Driver
  M:    Ben Dooks <ben-linux@fluff.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -9306,14 -9133,6 +9307,14 @@@ S:    Maintaine
  F:    include/linux/mmc/dw_mmc.h
  F:    drivers/mmc/host/dw_mmc*
  
 +SYSTEM TRACE MODULE CLASS
 +M:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
 +S:    Maintained
 +F:    Documentation/trace/stm.txt
 +F:    drivers/hwtracing/stm/
 +F:    include/linux/stm.h
 +F:    include/uapi/linux/stm.h
 +
  THUNDERBOLT DRIVER
  M:    Andreas Noever <andreas.noever@gmail.com>
  S:    Maintained
@@@ -9358,16 -9177,6 +9359,16 @@@ W:    http://www.sunplus.co
  S:    Supported
  F:    arch/score/
  
 +SYSTEM CONTROL & POWER INTERFACE (SCPI) Message Protocol drivers
 +M:    Sudeep Holla <sudeep.holla@arm.com>
 +L:    linux-arm-kernel@lists.infradead.org
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/arm/arm,scpi.txt
 +F:    drivers/clk/clk-scpi.c
 +F:    drivers/cpufreq/scpi-cpufreq.c
 +F:    drivers/firmware/arm_scpi.c
 +F:    include/linux/scpi_protocol.h
 +
  SCSI CDROM DRIVER
  M:    Jens Axboe <axboe@kernel.dk>
  L:    linux-scsi@vger.kernel.org
@@@ -9529,8 -9338,8 +9530,8 @@@ F:      include/uapi/linux/phantom.
  
  SERVER ENGINES 10Gbps iSCSI - BladeEngine 2 DRIVER
  M:    Jayamohan Kallickal <jayamohan.kallickal@avagotech.com>
 -M:    Minh Tran <minh.tran@avagotech.com>
 -M:    John Soni Jose <sony.john-n@avagotech.com>
 +M:    Ketan Mukadam <ketan.mukadam@avagotech.com>
 +M:    John Soni Jose <sony.john@avagotech.com>
  L:    linux-scsi@vger.kernel.org
  W:    http://www.avagotech.com
  S:    Supported
@@@ -10192,11 -10001,9 +10193,11 @@@ F: drivers/staging/vt665?
  
  STAGING - WILC1000 WIFI DRIVER
  M:    Johnny Kim <johnny.kim@atmel.com>
 -M:    Rachel Kim <rachel.kim@atmel.com>
 -M:    Dean Lee <dean.lee@atmel.com>
 +M:    Austin Shin <austin.shin@atmel.com>
  M:    Chris Park <chris.park@atmel.com>
 +M:    Tony Cho <tony.cho@atmel.com>
 +M:    Glen Lee <glen.lee@atmel.com>
 +M:    Leo Kim <leo.kim@atmel.com>
  L:    linux-wireless@vger.kernel.org
  S:    Supported
  F:    drivers/staging/wilc1000/
@@@ -10285,7 -10092,6 +10286,7 @@@ F:   include/net/switchdev.
  
  SYNOPSYS ARC ARCHITECTURE
  M:    Vineet Gupta <vgupta@synopsys.com>
 +L:    linux-snps-arc@lists.infraded.org
  S:    Supported
  F:    arch/arc/
  F:    Documentation/devicetree/bindings/arc/*
@@@ -10746,12 -10552,6 +10747,12 @@@ L: platform-driver-x86@vger.kernel.or
  S:    Maintained
  F:    drivers/platform/x86/toshiba_haps.c
  
 +TOSHIBA WMI HOTKEYS DRIVER
 +M:    Azael Avalos <coproscefalo@gmail.com>
 +L:    platform-driver-x86@vger.kernel.org
 +S:    Maintained
 +F:    drivers/platform/x86/toshiba-wmi.c
 +
  TOSHIBA SMM DRIVER
  M:    Jonathan Buzzard <jonathan@buzzard.org.uk>
  W:    http://www.buzzard.org.uk/toshiba/
@@@ -10809,7 -10609,6 +10810,7 @@@ F:   drivers/media/pci/tw68
  TPM DEVICE DRIVER
  M:    Peter Huewe <peterhuewe@gmx.de>
  M:    Marcel Selhorst <tpmdd@selhorst.net>
 +M:    Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
  R:    Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
  W:    http://tpmdd.sourceforge.net
  L:    tpmdd-devel@lists.sourceforge.net (moderated for non-subscribers)
@@@ -11294,12 -11093,6 +11295,12 @@@ S: Maintaine
  F:    Documentation/fb/uvesafb.txt
  F:    drivers/video/fbdev/uvesafb.*
  
 +VF610 NAND DRIVER
 +M:    Stefan Agner <stefan@agner.ch>
 +L:    linux-mtd@lists.infradead.org
 +S:    Supported
 +F:    drivers/mtd/nand/vf610_nfc.c
 +
  VFAT/FAT/MSDOS FILESYSTEM
  M:    OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
  S:    Maintained
@@@ -11330,12 -11123,6 +11331,12 @@@ S: Maintaine
  F:    drivers/media/v4l2-core/videobuf2-*
  F:    include/media/videobuf2-*
  
 +VIRTUAL SERIO DEVICE DRIVER
 +M:    Stephen Chandler Paul <thatslyude@gmail.com>
 +S:    Maintained
 +F:    drivers/input/serio/userio.c
 +F:    include/uapi/linux/userio.h
 +
  VIRTIO CONSOLE DRIVER
  M:    Amit Shah <amit.shah@redhat.com>
  L:    virtualization@lists.linux-foundation.org
@@@ -11413,13 -11200,6 +11414,13 @@@ L: netdev@vger.kernel.or
  S:    Maintained
  F:    drivers/net/ethernet/via/via-velocity.*
  
 +VIRT LIB
 +M:    Alex Williamson <alex.williamson@redhat.com>
 +M:    Paolo Bonzini <pbonzini@redhat.com>
 +L:    kvm@vger.kernel.org
 +S:    Supported
 +F:    virt/lib/
 +
  VIVID VIRTUAL VIDEO DRIVER
  M:    Hans Verkuil <hverkuil@xs4all.nl>
  L:    linux-media@vger.kernel.org
@@@ -11508,6 -11288,7 +11509,6 @@@ M:   Shrijeet Mukherjee <shm@cumulusnetwo
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/vrf.c
 -F:    include/net/vrf.h
  F:    Documentation/networking/vrf.txt
  
  VT1211 HARDWARE MONITOR DRIVER
@@@ -11626,9 -11407,6 +11627,9 @@@ T:   git https://github.com/CirrusLogic/l
  W:    https://github.com/CirrusLogic/linux-drivers/wiki
  S:    Supported
  F:    Documentation/hwmon/wm83??
 +F:    Documentation/devicetree/bindings/extcon/extcon-arizona.txt
 +F:    Documentation/devicetree/bindings/regulator/arizona-regulator.txt
 +F:    Documentation/devicetree/bindings/mfd/arizona.txt
  F:    arch/arm/mach-s3c64xx/mach-crag6410*
  F:    drivers/clk/clk-wm83*.c
  F:    drivers/extcon/extcon-arizona.c
@@@ -11689,7 -11467,6 +11690,7 @@@ L:   platform-driver-x86@vger.kernel.or
  T:    git git://git.infradead.org/users/dvhart/linux-platform-drivers-x86.git
  S:    Maintained
  F:    drivers/platform/x86/
 +F:    drivers/platform/olpc/
  
  X86 MCE INFRASTRUCTURE
  M:    Tony Luck <tony.luck@intel.com>
@@@ -11698,11 -11475,6 +11699,11 @@@ L: linux-edac@vger.kernel.or
  S:    Maintained
  F:    arch/x86/kernel/cpu/mcheck/*
  
 +X86 MICROCODE UPDATE SUPPORT
 +M:    Borislav Petkov <bp@alien8.de>
 +S:    Maintained
 +F:    arch/x86/kernel/cpu/microcode/*
 +
  X86 VDSO
  M:    Andy Lutomirski <luto@amacapital.net>
  L:    linux-kernel@vger.kernel.org
diff --combined arch/arc/mm/cache.c
index ff7ff6cbb8112408c05a38a2f8e001265d5d3726,875ac2e918c55d7fcc17b738c8dcfdeafacb8d79..b65f797e9ad6723abd7c38bba09e382df52450b4
@@@ -25,7 -25,7 +25,7 @@@ static int l2_line_sz
  int ioc_exists;
  volatile int slc_enable = 1, ioc_enable = 1;
  
 -void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr,
 +void (*_cache_line_loop_ic_fn)(phys_addr_t paddr, unsigned long vaddr,
                               unsigned long sz, const int cacheop);
  
  void (*__dma_cache_wback_inv)(unsigned long start, unsigned long sz);
@@@ -37,6 -37,7 +37,6 @@@ char *arc_cache_mumbojumbo(int c, char 
        int n = 0;
        struct cpuinfo_arc_cache *p;
  
 -#define IS_USED_RUN(v)                ((v) ? "" : "(disabled) ")
  #define PR_CACHE(p, cfg, str)                                         \
        if (!(p)->ver)                                                  \
                n += scnprintf(buf + n, len - n, str"\t\t: N/A\n");     \
@@@ -46,7 -47,7 +46,7 @@@
                        (p)->sz_k, (p)->assoc, (p)->line_len,           \
                        (p)->vipt ? "VIPT" : "PIPT",                    \
                        (p)->alias ? " aliasing" : "",                  \
 -                      IS_ENABLED(cfg) ? "" : " (not used)");
 +                      IS_USED_CFG(cfg));
  
        PR_CACHE(&cpuinfo_arc700[c].icache, CONFIG_ARC_HAS_ICACHE, "I-Cache");
        PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache");
@@@ -62,7 -63,7 +62,7 @@@
  
        if (ioc_exists)
                n += scnprintf(buf + n, len - n, "IOC\t\t:%s\n",
 -                              IS_USED_RUN(ioc_enable));
 +                              IS_DISABLED_RUN(ioc_enable));
  
        return buf;
  }
@@@ -216,7 -217,7 +216,7 @@@ slc_chk
   */
  
  static inline
 -void __cache_line_loop_v2(unsigned long paddr, unsigned long vaddr,
 +void __cache_line_loop_v2(phys_addr_t paddr, unsigned long vaddr,
                          unsigned long sz, const int op)
  {
        unsigned int aux_cmd;
        }
  }
  
 +/*
 + * For ARC700 MMUv3 I-cache and D-cache flushes
 + * Also reused for HS38 aliasing I-cache configuration
 + */
  static inline
 -void __cache_line_loop_v3(unsigned long paddr, unsigned long vaddr,
 +void __cache_line_loop_v3(phys_addr_t paddr, unsigned long vaddr,
                          unsigned long sz, const int op)
  {
        unsigned int aux_cmd, aux_tag;
        if (full_page)
                write_aux_reg(aux_tag, paddr);
  
 +      /*
 +       * This is technically for MMU v4, using the MMU v3 programming model
 +       * Special work for HS38 aliasing I-cache configuration with PAE40
 +       *   - upper 8 bits of paddr need to be written into PTAG_HI
 +       *   - (and needs to be written before the lower 32 bits)
 +       * Note that PTAG_HI is hoisted outside the line loop
 +       */
 +      if (is_pae40_enabled() && op == OP_INV_IC)
 +              write_aux_reg(ARC_REG_IC_PTAG_HI, (u64)paddr >> 32);
 +
        while (num_lines-- > 0) {
                if (!full_page) {
                        write_aux_reg(aux_tag, paddr);
  }
  
  /*
 - * In HS38x (MMU v4), although icache is VIPT, only paddr is needed for cache
 - * maintenance ops (in IVIL reg), as long as icache doesn't alias.
 + * In HS38x (MMU v4), I-cache is VIPT (can alias), D-cache is PIPT
 + * Here's how cache ops are implemented
 + *
 + *  - D-cache: only paddr needed (in DC_IVDL/DC_FLDL)
 + *  - I-cache Non Aliasing: Despite VIPT, only paddr needed (in IC_IVIL)
 + *  - I-cache Aliasing: Both vaddr and paddr needed (in IC_IVIL, IC_PTAG
 + *    respectively, similar to MMU v3 programming model, hence
 + *    __cache_line_loop_v3() is used)
   *
 - * For Aliasing icache, vaddr is also needed (in IVIL), while paddr is
 - * specified in PTAG (similar to MMU v3)
 + * If PAE40 is enabled, independent of aliasing considerations, the higher bits
 + * needs to be written into PTAG_HI
   */
  static inline
 -void __cache_line_loop_v4(unsigned long paddr, unsigned long vaddr,
 +void __cache_line_loop_v4(phys_addr_t paddr, unsigned long vaddr,
                          unsigned long sz, const int cacheop)
  {
        unsigned int aux_cmd;
  
        num_lines = DIV_ROUND_UP(sz, L1_CACHE_BYTES);
  
 +      /*
 +       * For HS38 PAE40 configuration
 +       *   - upper 8 bits of paddr need to be written into PTAG_HI
 +       *   - (and needs to be written before the lower 32 bits)
 +       */
 +      if (is_pae40_enabled()) {
 +              if (cacheop == OP_INV_IC)
 +                      /*
 +                       * Non aliasing I-cache in HS38,
 +                       * aliasing I-cache handled in __cache_line_loop_v3()
 +                       */
 +                      write_aux_reg(ARC_REG_IC_PTAG_HI, (u64)paddr >> 32);
 +              else
 +                      write_aux_reg(ARC_REG_DC_PTAG_HI, (u64)paddr >> 32);
 +      }
 +
        while (num_lines-- > 0) {
                write_aux_reg(aux_cmd, paddr);
                paddr += L1_CACHE_BYTES;
@@@ -448,7 -413,7 +448,7 @@@ static inline void __dc_entire_op(cons
  /*
   * D-Cache Line ops: Per Line INV (discard or wback+discard) or FLUSH (wback)
   */
 -static inline void __dc_line_op(unsigned long paddr, unsigned long vaddr,
 +static inline void __dc_line_op(phys_addr_t paddr, unsigned long vaddr,
                                unsigned long sz, const int op)
  {
        unsigned long flags;
@@@ -481,7 -446,7 +481,7 @@@ static inline void __ic_entire_inv(void
  }
  
  static inline void
 -__ic_line_inv_vaddr_local(unsigned long paddr, unsigned long vaddr,
 +__ic_line_inv_vaddr_local(phys_addr_t paddr, unsigned long vaddr,
                          unsigned long sz)
  {
        unsigned long flags;
  #else
  
  struct ic_inv_args {
 -      unsigned long paddr, vaddr;
 +      phys_addr_t paddr, vaddr;
        int sz;
  };
  
@@@ -509,7 -474,7 +509,7 @@@ static void __ic_line_inv_vaddr_helper(
          __ic_line_inv_vaddr_local(ic_inv->paddr, ic_inv->vaddr, ic_inv->sz);
  }
  
 -static void __ic_line_inv_vaddr(unsigned long paddr, unsigned long vaddr,
 +static void __ic_line_inv_vaddr(phys_addr_t paddr, unsigned long vaddr,
                                unsigned long sz)
  {
        struct ic_inv_args ic_inv = {
  
  #endif /* CONFIG_ARC_HAS_ICACHE */
  
 -noinline void slc_op(unsigned long paddr, unsigned long sz, const int op)
 +noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
  {
  #ifdef CONFIG_ISA_ARCV2
        /*
@@@ -617,10 -582,10 +617,10 @@@ void flush_dcache_page(struct page *pag
         */
        if (!mapping_mapped(mapping)) {
                clear_bit(PG_dc_clean, &page->flags);
-       } else if (page_mapped(page)) {
+       } else if (page_mapcount(page)) {
  
                /* kernel reading from page with U-mapping */
 -              unsigned long paddr = (unsigned long)page_address(page);
 +              phys_addr_t paddr = (unsigned long)page_address(page);
                unsigned long vaddr = page->index << PAGE_CACHE_SHIFT;
  
                if (addr_not_cache_congruent(paddr, vaddr))
@@@ -768,14 -733,14 +768,14 @@@ EXPORT_SYMBOL(flush_icache_range)
   *    builtin kernel page will not have any virtual mappings.
   *    kprobe on loadable module will be kernel vaddr.
   */
 -void __sync_icache_dcache(unsigned long paddr, unsigned long vaddr, int len)
 +void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len)
  {
        __dc_line_op(paddr, vaddr, len, OP_FLUSH_N_INV);
        __ic_line_inv_vaddr(paddr, vaddr, len);
  }
  
  /* wrapper to compile time eliminate alignment checks in flush loop */
 -void __inv_icache_page(unsigned long paddr, unsigned long vaddr)
 +void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr)
  {
        __ic_line_inv_vaddr(paddr, vaddr, PAGE_SIZE);
  }
   * wrapper to clearout kernel or userspace mappings of a page
   * For kernel mappings @vaddr == @paddr
   */
 -void __flush_dcache_page(unsigned long paddr, unsigned long vaddr)
 +void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr)
  {
        __dc_line_op(paddr, vaddr & PAGE_MASK, PAGE_SIZE, OP_FLUSH_N_INV);
  }
@@@ -842,8 -807,8 +842,8 @@@ void flush_anon_page(struct vm_area_str
  void copy_user_highpage(struct page *to, struct page *from,
        unsigned long u_vaddr, struct vm_area_struct *vma)
  {
 -      unsigned long kfrom = (unsigned long)page_address(from);
 -      unsigned long kto = (unsigned long)page_address(to);
 +      void *kfrom = kmap_atomic(from);
 +      void *kto = kmap_atomic(to);
        int clean_src_k_mappings = 0;
  
        /*
         *
         * Note that while @u_vaddr refers to DST page's userspace vaddr, it is
         * equally valid for SRC page as well
 +       *
 +       * For !VIPT cache, all of this gets compiled out as
 +       * addr_not_cache_congruent() is 0
         */
-       if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+       if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
 -              __flush_dcache_page(kfrom, u_vaddr);
 +              __flush_dcache_page((unsigned long)kfrom, u_vaddr);
                clean_src_k_mappings = 1;
        }
  
 -      copy_page((void *)kto, (void *)kfrom);
 +      copy_page(kto, kfrom);
  
        /*
         * Mark DST page K-mapping as dirty for a later finalization by
         * sync the kernel mapping back to physical page
         */
        if (clean_src_k_mappings) {
 -              __flush_dcache_page(kfrom, kfrom);
 +              __flush_dcache_page((unsigned long)kfrom, (unsigned long)kfrom);
                set_bit(PG_dc_clean, &from->flags);
        } else {
                clear_bit(PG_dc_clean, &from->flags);
        }
 +
 +      kunmap_atomic(kto);
 +      kunmap_atomic(kfrom);
  }
  
  void clear_user_page(void *to, unsigned long u_vaddr, struct page *page)
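
[Editor's note, not part of the patch] The new comments in __cache_line_loop_v3()/__cache_line_loop_v4() above describe the PAE40 case: the upper 8 bits of a 40-bit physical address go to the *_PTAG_HI aux register before the per-line op consumes the lower 32 bits. A tiny user-space illustration of that split; the example address is made up, and the actual ARC_REG_*_PTAG_HI writes are of course kernel-only:

#include <stdio.h>
#include <stdint.h>

/* Arithmetic illustration only: split a 40-bit PAE40 physical address
 * into the value written to PTAG_HI and the low 32 bits used by the
 * per-line cache operation. */
int main(void)
{
	uint64_t paddr = 0x12345678ULL | (0xabULL << 32);	/* example 40-bit paddr */
	uint32_t ptag_hi = (uint32_t)(paddr >> 32);		/* upper 8 bits -> PTAG_HI */
	uint32_t lo      = (uint32_t)paddr;			/* lower 32 bits -> line op */

	printf("paddr=0x%010llx PTAG_HI=0x%02x low=0x%08x\n",
	       (unsigned long long)paddr, (unsigned)ptag_hi, (unsigned)lo);
	return 0;
}
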
diff --combined arch/arm/mm/dma-mapping.c
index ad4eb2d26e1697fc6a16f47a8805e532e198a693,38307d8312ac6b7a51bb155980f52bc7fca8ea7e..e62400e5fb99fdbf864af966e718a98decf85e29
@@@ -651,12 -651,12 +651,12 @@@ static void *__dma_alloc(struct device 
  
        if (nommu())
                addr = __alloc_simple_buffer(dev, size, gfp, &page);
-       else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT))
+       else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM))
                addr = __alloc_from_contiguous(dev, size, prot, &page,
                                               caller, want_vaddr);
        else if (is_coherent)
                addr = __alloc_simple_buffer(dev, size, gfp, &page);
-       else if (!(gfp & __GFP_WAIT))
+       else if (!gfpflags_allow_blocking(gfp))
                addr = __alloc_from_pool(size, &page);
        else
                addr = __alloc_remap_buffer(dev, size, gfp, prot, &page,
@@@ -1363,7 -1363,7 +1363,7 @@@ static void *arm_iommu_alloc_attrs(stru
        *handle = DMA_ERROR_CODE;
        size = PAGE_ALIGN(size);
  
-       if (!(gfp & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(gfp))
                return __iommu_alloc_atomic(dev, size, handle);
  
        /*
@@@ -1407,19 -1407,12 +1407,19 @@@ static int arm_iommu_mmap_attrs(struct 
        unsigned long uaddr = vma->vm_start;
        unsigned long usize = vma->vm_end - vma->vm_start;
        struct page **pages = __iommu_get_pages(cpu_addr, attrs);
 +      unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
 +      unsigned long off = vma->vm_pgoff;
  
        vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot);
  
        if (!pages)
                return -ENXIO;
  
 +      if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off)
 +              return -ENXIO;
 +
 +      pages += off;
 +
        do {
                int ret = vm_insert_page(vma, uaddr, *pages++);
                if (ret) {
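
[Editor's note, not part of the patch] The ARM DMA hunks above replace open-coded `gfp & __GFP_WAIT` tests with gfpflags_allow_blocking(); judging from the parallel conversion to `gfp & __GFP_DIRECT_RECLAIM` in __dma_alloc(), the helper boils down to asking whether direct reclaim (and hence sleeping) is allowed. A user-space sketch of that semantic, using stand-in flag bits that are not the real gfp.h layout:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in flag values for illustration only, not the kernel's real
 * gfp.h bit layout. The conversion above asks "may this allocation
 * block?" via gfpflags_allow_blocking(), which is true exactly when
 * direct reclaim is permitted. */
typedef unsigned int gfp_t;

#define __GFP_DIRECT_RECLAIM	0x01u	/* stand-in bit */
#define __GFP_KSWAPD_RECLAIM	0x02u	/* stand-in bit */
#define GFP_KERNEL		(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
#define GFP_ATOMIC		(__GFP_KSWAPD_RECLAIM)

static bool gfpflags_allow_blocking(gfp_t gfp)
{
	return !!(gfp & __GFP_DIRECT_RECLAIM);
}

int main(void)
{
	printf("GFP_KERNEL may block: %d\n", gfpflags_allow_blocking(GFP_KERNEL));	/* 1 */
	printf("GFP_ATOMIC may block: %d\n", gfpflags_allow_blocking(GFP_ATOMIC));	/* 0 */
	return 0;
}
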
diff --combined arch/arm/xen/mm.c
index 7c34f7126b046abe9d61637a1716a2ae5139bfe5,99eec9063f68afd075dfe87e5a2f0d0ef30502cf..c5f9a9e3d1f393daa161ce8349c820462870a942
@@@ -25,7 -25,7 +25,7 @@@
  unsigned long xen_get_swiotlb_free_pages(unsigned int order)
  {
        struct memblock_region *reg;
-       gfp_t flags = __GFP_NOWARN;
+       gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
  
        for_each_memblock(memory, reg) {
                if (reg->base < (phys_addr_t)0xffffffff) {
@@@ -48,22 -48,22 +48,22 @@@ static void dma_cache_maint(dma_addr_t 
        size_t size, enum dma_data_direction dir, enum dma_cache_op op)
  {
        struct gnttab_cache_flush cflush;
 -      unsigned long pfn;
 +      unsigned long xen_pfn;
        size_t left = size;
  
 -      pfn = (handle >> PAGE_SHIFT) + offset / PAGE_SIZE;
 -      offset %= PAGE_SIZE;
 +      xen_pfn = (handle >> XEN_PAGE_SHIFT) + offset / XEN_PAGE_SIZE;
 +      offset %= XEN_PAGE_SIZE;
  
        do {
                size_t len = left;
        
                /* buffers in highmem or foreign pages cannot cross page
                 * boundaries */
 -              if (len + offset > PAGE_SIZE)
 -                      len = PAGE_SIZE - offset;
 +              if (len + offset > XEN_PAGE_SIZE)
 +                      len = XEN_PAGE_SIZE - offset;
  
                cflush.op = 0;
 -              cflush.a.dev_bus_addr = pfn << PAGE_SHIFT;
 +              cflush.a.dev_bus_addr = xen_pfn << XEN_PAGE_SHIFT;
                cflush.offset = offset;
                cflush.length = len;
  
@@@ -79,7 -79,7 +79,7 @@@
                        HYPERVISOR_grant_table_op(GNTTABOP_cache_flush, &cflush, 1);
  
                offset = 0;
 -              pfn++;
 +              xen_pfn++;
                left -= len;
        } while (left);
  }
@@@ -138,29 -138,10 +138,29 @@@ void __xen_dma_sync_single_for_device(s
  }
  
  bool xen_arch_need_swiotlb(struct device *dev,
 -                         unsigned long pfn,
 -                         unsigned long bfn)
 +                         phys_addr_t phys,
 +                         dma_addr_t dev_addr)
  {
 -      return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev));
 +      unsigned int xen_pfn = XEN_PFN_DOWN(phys);
 +      unsigned int bfn = XEN_PFN_DOWN(dev_addr);
 +
 +      /*
 +       * The swiotlb buffer should be used if
 +       *      - Xen doesn't have the cache flush hypercall
 +       *      - The Linux page refers to foreign memory
 +       *      - The device doesn't support coherent DMA request
 +       *
 +       * The Linux page may span multiple Xen pages, although
 +       * it's not possible to have a mix of local and foreign Xen page.
 +       * Furthermore, range_straddles_page_boundary is already checking
 +       * if buffer is physically contiguous in the host RAM.
 +       *
 +       * Therefore we only need to check the first Xen page to know if we
 +       * require a bounce buffer because the device doesn't support coherent
 +       * memory and we are not able to flush the cache.
 +       */
 +      return (!hypercall_cflush && (xen_pfn != bfn) &&
 +              !is_device_dma_coherent(dev));
  }
  
  int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
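
[Editor's note, not part of the patch] The xen/mm.c hunks above convert dma_cache_maint() and xen_arch_need_swiotlb() from Linux-page to Xen-page units, since a Linux page (64K on some arm64 configurations) can span several 4K Xen pages. A user-space sketch of the arithmetic, assuming those two granule sizes; the handle and offset values are made up:

#include <stdio.h>
#include <stdint.h>

/* Arithmetic illustration only (not the Xen headers). Assumed granules:
 * a 4K Xen page and a 64K Linux page, as on an arm64 64K-pages kernel. */
#define XEN_PAGE_SHIFT	12
#define XEN_PAGE_SIZE	(1UL << XEN_PAGE_SHIFT)
#define PAGE_SHIFT	16
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t handle = 0x80010000;	/* example bus address */
	unsigned long offset = 0x1800;	/* example offset into the buffer */

	/* Same computation as the new dma_cache_maint() code above. */
	unsigned long xen_pfn = (unsigned long)(handle >> XEN_PAGE_SHIFT)
				+ offset / XEN_PAGE_SIZE;

	printf("one %luK Linux page spans %lu Xen pages\n",
	       PAGE_SIZE >> 10, PAGE_SIZE / XEN_PAGE_SIZE);
	printf("first Xen pfn touched: 0x%lx\n", xen_pfn);
	return 0;
}
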
diff --combined arch/arm64/include/asm/pgtable.h
index f3acf421ded4f55616abd7b68a7dcf83081e6f38,0e98f9bc0674401a6e64934348a66254fece2031..5687caf59dd565183213bc920b390bb0232158fa
   *    fixed mappings and modules
   */
  #define VMEMMAP_SIZE          ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
 -#define VMALLOC_START         (UL(0xffffffffffffffff) << VA_BITS)
 +
 +#ifndef CONFIG_KASAN
 +#define VMALLOC_START         (VA_START)
 +#else
 +#include <asm/kasan.h>
 +#define VMALLOC_START         (KASAN_SHADOW_END + SZ_64K)
 +#endif
 +
  #define VMALLOC_END           (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
  
  #define vmemmap                       ((struct page *)(VMALLOC_END + SZ_64K))
@@@ -67,10 -60,8 +67,10 @@@ extern void __pgd_error(const char *fil
  #define PROT_DEFAULT          (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
  #define PROT_SECT_DEFAULT     (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
  
 +#define PROT_DEVICE_nGnRnE    (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
  #define PROT_DEVICE_nGnRE     (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE))
  #define PROT_NORMAL_NC                (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC))
 +#define PROT_NORMAL_WT                (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_WT))
  #define PROT_NORMAL           (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL))
  
  #define PROT_SECT_DEVICE_nGnRE        (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE))
@@@ -81,7 -72,6 +81,7 @@@
  
  #define PAGE_KERNEL           __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
  #define PAGE_KERNEL_EXEC      __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 +#define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
  
  #define PAGE_HYP              __pgprot(_PAGE_DEFAULT | PTE_HYP)
  #define PAGE_HYP_DEVICE               __pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
@@@ -150,7 -140,6 +150,7 @@@ extern struct page *empty_zero_page
  #define pte_special(pte)      (!!(pte_val(pte) & PTE_SPECIAL))
  #define pte_write(pte)                (!!(pte_val(pte) & PTE_WRITE))
  #define pte_exec(pte)         (!(pte_val(pte) & PTE_UXN))
 +#define pte_cont(pte)         (!!(pte_val(pte) & PTE_CONT))
  
  #ifdef CONFIG_ARM64_HW_AFDBM
  #define pte_hw_dirty(pte)     (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
@@@ -213,16 -202,6 +213,16 @@@ static inline pte_t pte_mkspecial(pte_
        return set_pte_bit(pte, __pgprot(PTE_SPECIAL));
  }
  
 +static inline pte_t pte_mkcont(pte_t pte)
 +{
 +      return set_pte_bit(pte, __pgprot(PTE_CONT));
 +}
 +
 +static inline pte_t pte_mknoncont(pte_t pte)
 +{
 +      return clear_pte_bit(pte, __pgprot(PTE_CONT));
 +}
 +
  static inline void set_pte(pte_t *ptep, pte_t pte)
  {
        *ptep = pte;
@@@ -331,21 -310,15 +331,15 @@@ static inline pgprot_t mk_sect_prot(pgp
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  #define pmd_trans_huge(pmd)   (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
- #define pmd_trans_splitting(pmd)      pte_special(pmd_pte(pmd))
- #ifdef CONFIG_HAVE_RCU_TABLE_FREE
- #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- struct vm_area_struct;
- void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp);
- #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
  #define pmd_dirty(pmd)                pte_dirty(pmd_pte(pmd))
  #define pmd_young(pmd)                pte_young(pmd_pte(pmd))
  #define pmd_wrprotect(pmd)    pte_pmd(pte_wrprotect(pmd_pte(pmd)))
- #define pmd_mksplitting(pmd)  pte_pmd(pte_mkspecial(pmd_pte(pmd)))
  #define pmd_mkold(pmd)                pte_pmd(pte_mkold(pmd_pte(pmd)))
  #define pmd_mkwrite(pmd)      pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+ #define pmd_mkclean(pmd)      pte_pmd(pte_mkclean(pmd_pte(pmd)))
  #define pmd_mkdirty(pmd)      pte_pmd(pte_mkdirty(pmd_pte(pmd)))
  #define pmd_mkyoung(pmd)      pte_pmd(pte_mkyoung(pmd_pte(pmd)))
  #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
@@@ -667,17 -640,14 +661,17 @@@ static inline void update_mmu_cache(str
                                    unsigned long addr, pte_t *ptep)
  {
        /*
 -       * set_pte() does not have a DSB for user mappings, so make sure that
 -       * the page table write is visible.
 +       * We don't do anything here, so there's a very small chance of
 +       * us retaking a user fault which we just fixed up. The alternative
 +       * is doing a dsb(ishst), but that penalises the fastpath.
         */
 -      dsb(ishst);
  }
  
  #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
  
 +#define kc_vaddr_to_offset(v) ((v) & ~VA_START)
 +#define kc_offset_to_vaddr(o) ((o) | VA_START)
 +
  #endif /* !__ASSEMBLY__ */
  
  #endif /* __ASM_PGTABLE_H */
index 6320361d8d4c703cf4072ba2e47fdf7884c7f779,478234383c2cd8c90087eca0031b425a3d01b606..131a199114b405e8403f05137e560a2b317f4941
@@@ -100,7 -100,7 +100,7 @@@ static void *__dma_alloc_coherent(struc
        if (IS_ENABLED(CONFIG_ZONE_DMA) &&
            dev->coherent_dma_mask <= DMA_BIT_MASK(32))
                flags |= GFP_DMA;
-       if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) {
+       if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) {
                struct page *page;
                void *addr;
  
@@@ -148,7 -148,7 +148,7 @@@ static void *__dma_alloc(struct device 
  
        size = PAGE_ALIGN(size);
  
-       if (!coherent && !(flags & __GFP_WAIT)) {
+       if (!coherent && !gfpflags_allow_blocking(flags)) {
                struct page *page = NULL;
                void *addr = __alloc_from_pool(size, &page, flags);
  
@@@ -533,460 -533,3 +533,460 @@@ static int __init dma_debug_do_init(voi
        return 0;
  }
  fs_initcall(dma_debug_do_init);
-       if (gfp & __GFP_WAIT) {
 +
 +
 +#ifdef CONFIG_IOMMU_DMA
 +#include <linux/dma-iommu.h>
 +#include <linux/platform_device.h>
 +#include <linux/amba/bus.h>
 +
 +/* Thankfully, all cache ops are by VA so we can ignore phys here */
 +static void flush_page(struct device *dev, const void *virt, phys_addr_t phys)
 +{
 +      __dma_flush_range(virt, virt + PAGE_SIZE);
 +}
 +
 +static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 +                               dma_addr_t *handle, gfp_t gfp,
 +                               struct dma_attrs *attrs)
 +{
 +      bool coherent = is_device_dma_coherent(dev);
 +      int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
 +      void *addr;
 +
 +      if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
 +              return NULL;
 +      /*
 +       * Some drivers rely on this, and we probably don't want the
 +       * possibility of stale kernel data being read by devices anyway.
 +       */
 +      gfp |= __GFP_ZERO;
 +
++      if (gfpflags_allow_blocking(gfp)) {
 +              struct page **pages;
 +              pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
 +
 +              pages = iommu_dma_alloc(dev, size, gfp, ioprot, handle,
 +                                      flush_page);
 +              if (!pages)
 +                      return NULL;
 +
 +              addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
 +                                            __builtin_return_address(0));
 +              if (!addr)
 +                      iommu_dma_free(dev, pages, size, handle);
 +      } else {
 +              struct page *page;
 +              /*
 +               * In atomic context we can't remap anything, so we'll only
 +               * get the virtually contiguous buffer we need by way of a
 +               * physically contiguous allocation.
 +               */
 +              if (coherent) {
 +                      page = alloc_pages(gfp, get_order(size));
 +                      addr = page ? page_address(page) : NULL;
 +              } else {
 +                      addr = __alloc_from_pool(size, &page, gfp);
 +              }
 +              if (!addr)
 +                      return NULL;
 +
 +              *handle = iommu_dma_map_page(dev, page, 0, size, ioprot);
 +              if (iommu_dma_mapping_error(dev, *handle)) {
 +                      if (coherent)
 +                              __free_pages(page, get_order(size));
 +                      else
 +                              __free_from_pool(addr, size);
 +                      addr = NULL;
 +              }
 +      }
 +      return addr;
 +}
 +
 +static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 +                             dma_addr_t handle, struct dma_attrs *attrs)
 +{
 +      /*
 +       * @cpu_addr will be one of 3 things depending on how it was allocated:
 +       * - A remapped array of pages from iommu_dma_alloc(), for all
 +       *   non-atomic allocations.
 +       * - A non-cacheable alias from the atomic pool, for atomic
 +       *   allocations by non-coherent devices.
 +       * - A normal lowmem address, for atomic allocations by
 +       *   coherent devices.
 +       * Hence how dodgy the below logic looks...
 +       */
 +      if (__in_atomic_pool(cpu_addr, size)) {
 +              iommu_dma_unmap_page(dev, handle, size, 0, NULL);
 +              __free_from_pool(cpu_addr, size);
 +      } else if (is_vmalloc_addr(cpu_addr)) {
 +              struct vm_struct *area = find_vm_area(cpu_addr);
 +
 +              if (WARN_ON(!area || !area->pages))
 +                      return;
 +              iommu_dma_free(dev, area->pages, size, &handle);
 +              dma_common_free_remap(cpu_addr, size, VM_USERMAP);
 +      } else {
 +              iommu_dma_unmap_page(dev, handle, size, 0, NULL);
 +              __free_pages(virt_to_page(cpu_addr), get_order(size));
 +      }
 +}
 +
 +static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 +                            void *cpu_addr, dma_addr_t dma_addr, size_t size,
 +                            struct dma_attrs *attrs)
 +{
 +      struct vm_struct *area;
 +      int ret;
 +
 +      vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot,
 +                                           is_device_dma_coherent(dev));
 +
 +      if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
 +              return ret;
 +
 +      area = find_vm_area(cpu_addr);
 +      if (WARN_ON(!area || !area->pages))
 +              return -ENXIO;
 +
 +      return iommu_dma_mmap(area->pages, size, vma);
 +}
 +
 +static int __iommu_get_sgtable(struct device *dev, struct sg_table *sgt,
 +                             void *cpu_addr, dma_addr_t dma_addr,
 +                             size_t size, struct dma_attrs *attrs)
 +{
 +      unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 +      struct vm_struct *area = find_vm_area(cpu_addr);
 +
 +      if (WARN_ON(!area || !area->pages))
 +              return -ENXIO;
 +
 +      return sg_alloc_table_from_pages(sgt, area->pages, count, 0, size,
 +                                       GFP_KERNEL);
 +}
 +
 +static void __iommu_sync_single_for_cpu(struct device *dev,
 +                                      dma_addr_t dev_addr, size_t size,
 +                                      enum dma_data_direction dir)
 +{
 +      phys_addr_t phys;
 +
 +      if (is_device_dma_coherent(dev))
 +              return;
 +
 +      phys = iommu_iova_to_phys(iommu_get_domain_for_dev(dev), dev_addr);
 +      __dma_unmap_area(phys_to_virt(phys), size, dir);
 +}
 +
 +static void __iommu_sync_single_for_device(struct device *dev,
 +                                         dma_addr_t dev_addr, size_t size,
 +                                         enum dma_data_direction dir)
 +{
 +      phys_addr_t phys;
 +
 +      if (is_device_dma_coherent(dev))
 +              return;
 +
 +      phys = iommu_iova_to_phys(iommu_get_domain_for_dev(dev), dev_addr);
 +      __dma_map_area(phys_to_virt(phys), size, dir);
 +}
 +
 +static dma_addr_t __iommu_map_page(struct device *dev, struct page *page,
 +                                 unsigned long offset, size_t size,
 +                                 enum dma_data_direction dir,
 +                                 struct dma_attrs *attrs)
 +{
 +      bool coherent = is_device_dma_coherent(dev);
 +      int prot = dma_direction_to_prot(dir, coherent);
 +      dma_addr_t dev_addr = iommu_dma_map_page(dev, page, offset, size, prot);
 +
 +      if (!iommu_dma_mapping_error(dev, dev_addr) &&
 +          !dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 +              __iommu_sync_single_for_device(dev, dev_addr, size, dir);
 +
 +      return dev_addr;
 +}
 +
 +static void __iommu_unmap_page(struct device *dev, dma_addr_t dev_addr,
 +                             size_t size, enum dma_data_direction dir,
 +                             struct dma_attrs *attrs)
 +{
 +      if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 +              __iommu_sync_single_for_cpu(dev, dev_addr, size, dir);
 +
 +      iommu_dma_unmap_page(dev, dev_addr, size, dir, attrs);
 +}
 +
 +static void __iommu_sync_sg_for_cpu(struct device *dev,
 +                                  struct scatterlist *sgl, int nelems,
 +                                  enum dma_data_direction dir)
 +{
 +      struct scatterlist *sg;
 +      int i;
 +
 +      if (is_device_dma_coherent(dev))
 +              return;
 +
 +      for_each_sg(sgl, sg, nelems, i)
 +              __dma_unmap_area(sg_virt(sg), sg->length, dir);
 +}
 +
 +static void __iommu_sync_sg_for_device(struct device *dev,
 +                                     struct scatterlist *sgl, int nelems,
 +                                     enum dma_data_direction dir)
 +{
 +      struct scatterlist *sg;
 +      int i;
 +
 +      if (is_device_dma_coherent(dev))
 +              return;
 +
 +      for_each_sg(sgl, sg, nelems, i)
 +              __dma_map_area(sg_virt(sg), sg->length, dir);
 +}
 +
 +static int __iommu_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
 +                              int nelems, enum dma_data_direction dir,
 +                              struct dma_attrs *attrs)
 +{
 +      bool coherent = is_device_dma_coherent(dev);
 +
 +      if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 +              __iommu_sync_sg_for_device(dev, sgl, nelems, dir);
 +
 +      return iommu_dma_map_sg(dev, sgl, nelems,
 +                      dma_direction_to_prot(dir, coherent));
 +}
 +
 +static void __iommu_unmap_sg_attrs(struct device *dev,
 +                                 struct scatterlist *sgl, int nelems,
 +                                 enum dma_data_direction dir,
 +                                 struct dma_attrs *attrs)
 +{
 +      if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 +              __iommu_sync_sg_for_cpu(dev, sgl, nelems, dir);
 +
 +      iommu_dma_unmap_sg(dev, sgl, nelems, dir, attrs);
 +}
 +
 +static struct dma_map_ops iommu_dma_ops = {
 +      .alloc = __iommu_alloc_attrs,
 +      .free = __iommu_free_attrs,
 +      .mmap = __iommu_mmap_attrs,
 +      .get_sgtable = __iommu_get_sgtable,
 +      .map_page = __iommu_map_page,
 +      .unmap_page = __iommu_unmap_page,
 +      .map_sg = __iommu_map_sg_attrs,
 +      .unmap_sg = __iommu_unmap_sg_attrs,
 +      .sync_single_for_cpu = __iommu_sync_single_for_cpu,
 +      .sync_single_for_device = __iommu_sync_single_for_device,
 +      .sync_sg_for_cpu = __iommu_sync_sg_for_cpu,
 +      .sync_sg_for_device = __iommu_sync_sg_for_device,
 +      .dma_supported = iommu_dma_supported,
 +      .mapping_error = iommu_dma_mapping_error,
 +};
 +
 +/*
 + * TODO: Right now __iommu_setup_dma_ops() gets called too early to do
 + * everything it needs to - the device is only partially created and the
 + * IOMMU driver hasn't seen it yet, so it can't have a group. Thus we
 + * need this delayed attachment dance. Once IOMMU probe ordering is sorted
 + * to move the arch_setup_dma_ops() call later, all the notifier bits below
 + * become unnecessary, and will go away.
 + */
 +struct iommu_dma_notifier_data {
 +      struct list_head list;
 +      struct device *dev;
 +      const struct iommu_ops *ops;
 +      u64 dma_base;
 +      u64 size;
 +};
 +static LIST_HEAD(iommu_dma_masters);
 +static DEFINE_MUTEX(iommu_dma_notifier_lock);
 +
 +/*
 + * Temporarily "borrow" a domain feature flag to to tell if we had to resort
 + * to creating our own domain here, in case we need to clean it up again.
 + */
 +#define __IOMMU_DOMAIN_FAKE_DEFAULT           (1U << 31)
 +
 +static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops,
 +                         u64 dma_base, u64 size)
 +{
 +      struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 +
 +      /*
 +       * Best case: The device is either part of a group which was
 +       * already attached to a domain in a previous call, or it's
 +       * been put in a default DMA domain by the IOMMU core.
 +       */
 +      if (!domain) {
 +              /*
 +               * Urgh. The IOMMU core isn't going to do default domains
 +               * for non-PCI devices anyway, until it has some means of
 +               * abstracting the entirely implementation-specific
 +               * sideband data/SoC topology/unicorn dust that may or
 +               * may not differentiate upstream masters.
 +               * So until then, HORRIBLE HACKS!
 +               */
 +              domain = ops->domain_alloc(IOMMU_DOMAIN_DMA);
 +              if (!domain)
 +                      goto out_no_domain;
 +
 +              domain->ops = ops;
 +              domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT;
 +
 +              if (iommu_attach_device(domain, dev))
 +                      goto out_put_domain;
 +      }
 +
 +      if (iommu_dma_init_domain(domain, dma_base, size))
 +              goto out_detach;
 +
 +      dev->archdata.dma_ops = &iommu_dma_ops;
 +      return true;
 +
 +out_detach:
 +      iommu_detach_device(domain, dev);
 +out_put_domain:
 +      if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT)
 +              iommu_domain_free(domain);
 +out_no_domain:
 +      pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
 +              dev_name(dev));
 +      return false;
 +}
 +
 +static void queue_iommu_attach(struct device *dev, const struct iommu_ops *ops,
 +                            u64 dma_base, u64 size)
 +{
 +      struct iommu_dma_notifier_data *iommudata;
 +
 +      iommudata = kzalloc(sizeof(*iommudata), GFP_KERNEL);
 +      if (!iommudata)
 +              return;
 +
 +      iommudata->dev = dev;
 +      iommudata->ops = ops;
 +      iommudata->dma_base = dma_base;
 +      iommudata->size = size;
 +
 +      mutex_lock(&iommu_dma_notifier_lock);
 +      list_add(&iommudata->list, &iommu_dma_masters);
 +      mutex_unlock(&iommu_dma_notifier_lock);
 +}
 +
 +static int __iommu_attach_notifier(struct notifier_block *nb,
 +                                 unsigned long action, void *data)
 +{
 +      struct iommu_dma_notifier_data *master, *tmp;
 +
 +      if (action != BUS_NOTIFY_ADD_DEVICE)
 +              return 0;
 +
 +      mutex_lock(&iommu_dma_notifier_lock);
 +      list_for_each_entry_safe(master, tmp, &iommu_dma_masters, list) {
 +              if (do_iommu_attach(master->dev, master->ops,
 +                              master->dma_base, master->size)) {
 +                      list_del(&master->list);
 +                      kfree(master);
 +              }
 +      }
 +      mutex_unlock(&iommu_dma_notifier_lock);
 +      return 0;
 +}
 +
 +static int register_iommu_dma_ops_notifier(struct bus_type *bus)
 +{
 +      struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL);
 +      int ret;
 +
 +      if (!nb)
 +              return -ENOMEM;
 +      /*
 +       * The device must be attached to a domain before the driver probe
 +       * routine gets a chance to start allocating DMA buffers. However,
 +       * the IOMMU driver also needs a chance to configure the iommu_group
 +       * via its add_device callback first, so we need to make the attach
 +       * happen between those two points. Since the IOMMU core uses a bus
 +       * notifier with default priority for add_device, do the same but
 +       * with a lower priority to ensure the appropriate ordering.
 +       */
 +      nb->notifier_call = __iommu_attach_notifier;
 +      nb->priority = -100;
 +
 +      ret = bus_register_notifier(bus, nb);
 +      if (ret) {
 +              pr_warn("Failed to register DMA domain notifier; IOMMU DMA ops unavailable on bus '%s'\n",
 +                      bus->name);
 +              kfree(nb);
 +      }
 +      return ret;
 +}
 +
 +static int __init __iommu_dma_init(void)
 +{
 +      int ret;
 +
 +      ret = iommu_dma_init();
 +      if (!ret)
 +              ret = register_iommu_dma_ops_notifier(&platform_bus_type);
 +      if (!ret)
 +              ret = register_iommu_dma_ops_notifier(&amba_bustype);
 +      return ret;
 +}
 +arch_initcall(__iommu_dma_init);
 +
 +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 +                                const struct iommu_ops *ops)
 +{
 +      struct iommu_group *group;
 +
 +      if (!ops)
 +              return;
 +      /*
 +       * TODO: As a concession to the future, we're ready to handle being
 +       * called both early and late (i.e. after bus_add_device). Once all
 +       * the platform bus code is reworked to call us late and the notifier
 +       * junk above goes away, move the body of do_iommu_attach here.
 +       */
 +      group = iommu_group_get(dev);
 +      if (group) {
 +              do_iommu_attach(dev, ops, dma_base, size);
 +              iommu_group_put(group);
 +      } else {
 +              queue_iommu_attach(dev, ops, dma_base, size);
 +      }
 +}
 +
 +void arch_teardown_dma_ops(struct device *dev)
 +{
 +      struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 +
 +      if (domain) {
 +              iommu_detach_device(domain, dev);
 +              if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT)
 +                      iommu_domain_free(domain);
 +      }
 +
 +      dev->archdata.dma_ops = NULL;
 +}
 +
 +#else
 +
 +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 +                                struct iommu_ops *iommu)
 +{ }
 +
 +#endif  /* CONFIG_IOMMU_DMA */
 +
 +void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 +                      struct iommu_ops *iommu, bool coherent)
 +{
 +      if (!acpi_disabled && !dev->archdata.dma_ops)
 +              dev->archdata.dma_ops = dma_ops;
 +
 +      dev->archdata.dma_coherent = coherent;
 +      __iommu_setup_dma_ops(dev, dma_base, size, iommu);
 +}
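
The __GFP_WAIT tests replaced in this file now go through gfpflags_allow_blocking(), which keys off __GFP_DIRECT_RECLAIM in the reworked gfp-flags scheme. The following user-space sketch shows only that decision between the blocking and atomic allocation paths; the flag values are simplified stand-ins, not the real <linux/gfp.h> definitions.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in gfp bits; the real values live in <linux/gfp.h>. */
typedef unsigned int gfp_t;
#define ___GFP_KSWAPD_RECLAIM   0x01u
#define ___GFP_DIRECT_RECLAIM   0x02u
#define GFP_ATOMIC              (___GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL              (___GFP_KSWAPD_RECLAIM | ___GFP_DIRECT_RECLAIM)

/* Mirrors the kernel helper: blocking is allowed iff direct reclaim is. */
static bool gfpflags_allow_blocking(gfp_t gfp)
{
        return gfp & ___GFP_DIRECT_RECLAIM;
}

static void alloc_buffer(size_t size, gfp_t gfp)
{
        if (gfpflags_allow_blocking(gfp))
                printf("%zu bytes: blocking path (remap/CMA/IOMMU allocation)\n", size);
        else
                printf("%zu bytes: atomic path (atomic-pool fallback)\n", size);
}

int main(void)
{
        alloc_buffer(4096, GFP_KERNEL);   /* may block: takes the full path   */
        alloc_buffer(4096, GFP_ATOMIC);   /* must not block: atomic pool path */
        return 0;
}

The predicate is the whole point: GFP_KERNEL-style masks permit direct reclaim and may sleep, so the remapping and iommu_dma_alloc() paths are allowed, while GFP_ATOMIC-style masks must not sleep and fall back to the atomic pool, matching the hunks above.
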
diff --combined arch/mips/mm/tlbex.c
index 32e0be27673fefbeca6839929e61a581c8980902,b190ae9fe909fc4fa8663e250ac14ad13d8b2288..482192cc8f2b88ae89f4cf1495c0dd6055a5bda6
@@@ -240,7 -240,6 +240,6 @@@ static void output_pgtable_bits_defines
        pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
  #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
        pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
-       pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
  #endif
  #ifdef CONFIG_CPU_MIPSR2
        if (cpu_has_rixi) {
@@@ -311,7 -310,6 +310,7 @@@ static struct uasm_label labels[128]
  static struct uasm_reloc relocs[128];
  
  static int check_for_high_segbits;
 +static bool fill_includes_sw_bits;
  
  static unsigned int kscratch_used_mask;
  
@@@ -631,14 -629,8 +630,14 @@@ static void build_tlb_write_entry(u32 *
  static __maybe_unused void build_convert_pte_to_entrylo(u32 **p,
                                                        unsigned int reg)
  {
 -      if (cpu_has_rixi) {
 -              UASM_i_ROTR(p, reg, reg, ilog2(_PAGE_GLOBAL));
 +      if (cpu_has_rixi && _PAGE_NO_EXEC) {
 +              if (fill_includes_sw_bits) {
 +                      UASM_i_ROTR(p, reg, reg, ilog2(_PAGE_GLOBAL));
 +              } else {
 +                      UASM_i_SRL(p, reg, reg, ilog2(_PAGE_NO_EXEC));
 +                      UASM_i_ROTR(p, reg, reg,
 +                                  ilog2(_PAGE_GLOBAL) - ilog2(_PAGE_NO_EXEC));
 +              }
        } else {
  #ifdef CONFIG_PHYS_ADDR_T_64BIT
                uasm_i_dsrl_safe(p, reg, reg, ilog2(_PAGE_GLOBAL));
@@@ -1012,7 -1004,21 +1011,7 @@@ static void build_update_entries(u32 **
         * 64bit address support (36bit on a 32bit CPU) in a 32bit
         * Kernel is a special case. Only a few CPUs use it.
         */
 -#ifdef CONFIG_PHYS_ADDR_T_64BIT
 -      if (cpu_has_64bits) {
 -              uasm_i_ld(p, tmp, 0, ptep); /* get even pte */
 -              uasm_i_ld(p, ptep, sizeof(pte_t), ptep); /* get odd pte */
 -              if (cpu_has_rixi) {
 -                      UASM_i_ROTR(p, tmp, tmp, ilog2(_PAGE_GLOBAL));
 -                      UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 -                      UASM_i_ROTR(p, ptep, ptep, ilog2(_PAGE_GLOBAL));
 -              } else {
 -                      uasm_i_dsrl_safe(p, tmp, tmp, ilog2(_PAGE_GLOBAL)); /* convert to entrylo0 */
 -                      UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 -                      uasm_i_dsrl_safe(p, ptep, ptep, ilog2(_PAGE_GLOBAL)); /* convert to entrylo1 */
 -              }
 -              UASM_i_MTC0(p, ptep, C0_ENTRYLO1); /* load it */
 -      } else {
 +      if (config_enabled(CONFIG_PHYS_ADDR_T_64BIT) && !cpu_has_64bits) {
                int pte_off_even = sizeof(pte_t) / 2;
                int pte_off_odd = pte_off_even + sizeof(pte_t);
  #ifdef CONFIG_XPA
                uasm_i_mthc0(p, tmp, C0_ENTRYLO0);
                uasm_i_mthc0(p, ptep, C0_ENTRYLO1);
  #endif
 +              return;
        }
 -#else
 +
        UASM_i_LW(p, tmp, 0, ptep); /* get even pte */
        UASM_i_LW(p, ptep, sizeof(pte_t), ptep); /* get odd pte */
        if (r45k_bvahwbug())
                build_tlb_probe_entry(p);
 -      if (cpu_has_rixi) {
 -              UASM_i_ROTR(p, tmp, tmp, ilog2(_PAGE_GLOBAL));
 -              if (r4k_250MHZhwbug())
 -                      UASM_i_MTC0(p, 0, C0_ENTRYLO0);
 -              UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 -              UASM_i_ROTR(p, ptep, ptep, ilog2(_PAGE_GLOBAL));
 -      } else {
 -              UASM_i_SRL(p, tmp, tmp, ilog2(_PAGE_GLOBAL)); /* convert to entrylo0 */
 -              if (r4k_250MHZhwbug())
 -                      UASM_i_MTC0(p, 0, C0_ENTRYLO0);
 -              UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 -              UASM_i_SRL(p, ptep, ptep, ilog2(_PAGE_GLOBAL)); /* convert to entrylo1 */
 -              if (r45k_bvahwbug())
 -                      uasm_i_mfc0(p, tmp, C0_INDEX);
 -      }
 +      build_convert_pte_to_entrylo(p, tmp);
 +      if (r4k_250MHZhwbug())
 +              UASM_i_MTC0(p, 0, C0_ENTRYLO0);
 +      UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 +      build_convert_pte_to_entrylo(p, ptep);
 +      if (r45k_bvahwbug())
 +              uasm_i_mfc0(p, tmp, C0_INDEX);
        if (r4k_250MHZhwbug())
                UASM_i_MTC0(p, 0, C0_ENTRYLO1);
        UASM_i_MTC0(p, ptep, C0_ENTRYLO1); /* load it */
 -#endif
  }
  
  struct mips_huge_tlb_info {
@@@ -2284,10 -2298,6 +2283,10 @@@ static void config_htw_params(void
        /* re-initialize the PTI field including the even/odd bit */
        pwfield &= ~MIPS_PWFIELD_PTI_MASK;
        pwfield |= PAGE_SHIFT << MIPS_PWFIELD_PTI_SHIFT;
 +      if (CONFIG_PGTABLE_LEVELS >= 3) {
 +              pwfield &= ~MIPS_PWFIELD_MDI_MASK;
 +              pwfield |= PMD_SHIFT << MIPS_PWFIELD_MDI_SHIFT;
 +      }
        /* Set the PTEI right shift */
        ptei = _PAGE_GLOBAL_SHIFT << MIPS_PWFIELD_PTEI_SHIFT;
        pwfield |= ptei;
  
        pwsize = ilog2(PTRS_PER_PGD) << MIPS_PWSIZE_GDW_SHIFT;
        pwsize |= ilog2(PTRS_PER_PTE) << MIPS_PWSIZE_PTW_SHIFT;
 +      if (CONFIG_PGTABLE_LEVELS >= 3)
 +              pwsize |= ilog2(PTRS_PER_PMD) << MIPS_PWSIZE_MDW_SHIFT;
  
        /* If XPA has been enabled, PTEs are 64-bit in size. */
 -      if (read_c0_pagegrain() & PG_ELPA)
 +      if (config_enabled(CONFIG_64BIT) || (read_c0_pagegrain() & PG_ELPA))
                pwsize |= 1;
  
        write_c0_pwsize(pwsize);
@@@ -2351,41 -2359,6 +2350,41 @@@ static void config_xpa_params(void
  #endif
  }
  
 +static void check_pabits(void)
 +{
 +      unsigned long entry;
 +      unsigned pabits, fillbits;
 +
 +      if (!cpu_has_rixi || !_PAGE_NO_EXEC) {
 +              /*
 +               * We'll only be making use of the fact that we can rotate bits
 +               * into the fill if the CPU supports RIXI, so don't bother
 +               * probing this for CPUs which don't.
 +               */
 +              return;
 +      }
 +
 +      write_c0_entrylo0(~0ul);
 +      back_to_back_c0_hazard();
 +      entry = read_c0_entrylo0();
 +
 +      /* clear all non-PFN bits */
 +      entry &= ~((1 << MIPS_ENTRYLO_PFN_SHIFT) - 1);
 +      entry &= ~(MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI);
 +
 +      /* find a lower bound on PABITS, and upper bound on fill bits */
 +      pabits = fls_long(entry) + 6;
 +      fillbits = max_t(int, (int)BITS_PER_LONG - pabits, 0);
 +
 +      /* minus the RI & XI bits */
 +      fillbits -= min_t(unsigned, fillbits, 2);
 +
 +      if (fillbits >= ilog2(_PAGE_NO_EXEC))
 +              fill_includes_sw_bits = true;
 +
 +      pr_debug("Entry* registers contain %u fill bits\n", fillbits);
 +}
 +
  void build_tlb_refill_handler(void)
  {
        /*
        static int run_once = 0;
  
        output_pgtable_bits_defines();
 +      check_pabits();
  
  #ifdef CONFIG_64BIT
        check_for_high_segbits = current_cpu_data.vmbits > (PGDIR_SHIFT + PGD_ORDER + PAGE_SHIFT - 3);
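
check_pabits() in the hunk above probes the width of the EntryLo fill field by writing all-ones to EntryLo0 and reading it back, then decides whether software-only PTE bits can simply be rotated into the fill. The arithmetic is easy to misread, so here is a user-space sketch of just that computation; the read-back value and the MIPS_ENTRYLO_*/_PAGE_NO_EXEC constants are hypothetical stand-ins for the kernel definitions.

#include <stdio.h>

#define BITS_PER_LONG           64
#define MIPS_ENTRYLO_PFN_SHIFT  6                       /* stand-in constant */
#define MIPS_ENTRYLO_RI         (1ul << (BITS_PER_LONG - 1))
#define MIPS_ENTRYLO_XI         (1ul << (BITS_PER_LONG - 2))
#define _PAGE_NO_EXEC_ILOG2     27                      /* hypothetical ilog2(_PAGE_NO_EXEC) */

static unsigned int fls_long(unsigned long x)
{
        return x ? BITS_PER_LONG - __builtin_clzl(x) : 0;
}

int main(void)
{
        /* Pretend this is what EntryLo0 reads back after writing ~0ul. */
        unsigned long entry = 0x3fffffc0ul;
        unsigned int pabits, fillbits;

        /* clear all non-PFN bits */
        entry &= ~((1ul << MIPS_ENTRYLO_PFN_SHIFT) - 1);
        entry &= ~(MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI);

        /* lower bound on PABITS, upper bound on fill bits */
        pabits = fls_long(entry) + 6;
        fillbits = pabits < BITS_PER_LONG ? BITS_PER_LONG - pabits : 0;

        /* the RI and XI positions are not usable as fill */
        fillbits -= fillbits < 2 ? fillbits : 2;

        printf("pabits >= %u, usable fill bits = %u\n", pabits, fillbits);
        printf("fill includes software bits: %s\n",
               fillbits >= _PAGE_NO_EXEC_ILOG2 ? "yes" : "no");
        return 0;
}

With the example read-back value the CPU exposes at least 36 physical-address bits, leaving 26 usable fill bits once the RI and XI positions are excluded.
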
index 3245f2d96d4f59e5140348b8c4dddbe836c5dda6,3c3a45632a953d30b21aa9c703819605b4b5143f..21d961bbac0e1e284b6f9cdb76cb472c8be48227
@@@ -373,11 -373,6 +373,6 @@@ void pgtable_cache_add(unsigned shift, 
  void pgtable_cache_init(void);
  #endif /* __ASSEMBLY__ */
  
- /*
-  * THP pages can't be special. So use the _PAGE_SPECIAL
-  */
- #define _PAGE_SPLITTING _PAGE_SPECIAL
  /*
   * We need to differentiate between explicit huge page and THP huge
   * page, since THP huge page also need to track real subpage details
  /*
   * set of bits not changed in pmd_modify.
   */
- #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |             \
-                        _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-                        _PAGE_THP_HUGE)
+ #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+                        _PAGE_ACCESSED | _PAGE_THP_HUGE)
  
  #ifndef __ASSEMBLY__
  /*
@@@ -437,9 -431,9 +431,9 @@@ static inline char *get_hpte_slot_array
  
  }
  
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                                   pmd_t *pmdp, unsigned long old_pmd);
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
  extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
  extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
@@@ -471,22 -465,7 +465,15 @@@ static inline int pmd_trans_huge(pmd_t 
        return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
  }
  
- static inline int pmd_trans_splitting(pmd_t pmd)
- {
-       if (pmd_trans_huge(pmd))
-               return pmd_val(pmd) & _PAGE_SPLITTING;
-       return 0;
- }
  extern int has_transparent_hugepage(void);
 +#else
 +static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
 +                                        unsigned long addr, pmd_t *pmdp,
 +                                        unsigned long old_pmd)
 +{
 +      WARN(1, "%s called with THP disabled\n", __func__);
 +}
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
  static inline int pmd_large(pmd_t pmd)
@@@ -515,9 -494,11 +502,11 @@@ static inline pte_t *pmdp_ptep(pmd_t *p
  #define pmd_pfn(pmd)          pte_pfn(pmd_pte(pmd))
  #define pmd_dirty(pmd)                pte_dirty(pmd_pte(pmd))
  #define pmd_young(pmd)                pte_young(pmd_pte(pmd))
  #define pmd_mkold(pmd)                pte_pmd(pte_mkold(pmd_pte(pmd)))
  #define pmd_wrprotect(pmd)    pte_pmd(pte_wrprotect(pmd_pte(pmd)))
  #define pmd_mkdirty(pmd)      pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+ #define pmd_mkclean(pmd)      pte_pmd(pte_mkclean(pmd_pte(pmd)))
  #define pmd_mkyoung(pmd)      pte_pmd(pte_mkyoung(pmd_pte(pmd)))
  #define pmd_mkwrite(pmd)      pte_pmd(pte_mkwrite(pmd_pte(pmd)))
  
@@@ -536,12 -517,6 +525,6 @@@ static inline pmd_t pmd_mknotpresent(pm
        return pmd;
  }
  
- static inline pmd_t pmd_mksplitting(pmd_t pmd)
- {
-       pmd_val(pmd) |= _PAGE_SPLITTING;
-       return pmd;
- }
  #define __HAVE_ARCH_PMD_SAME
  static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
  {
@@@ -592,10 -567,6 +575,6 @@@ static inline void pmdp_set_wrprotect(s
        pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
  }
  
- #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
  extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
  #define pmdp_collapse_flush pmdp_collapse_flush
index 9833fee493ec414be50c241153889d7ac4259402,8e01e4121fac870d2a3ee03acfa4b8dcfebe647f..cd2d82efe1cd15b1fe003b2eceee8bee33e0a072
@@@ -89,25 -89,6 +89,25 @@@ int pgd_huge(pgd_t pgd
         */
        return ((pgd_val(pgd) & 0x3) != 0x0);
  }
 +
 +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM)
 +/*
 + * This enables us to catch the wrong page directory format
 + * Moved here so that we can use WARN() in the call.
 + */
 +int hugepd_ok(hugepd_t hpd)
 +{
 +      bool is_hugepd;
 +
 +      /*
 +       * We should not find this format in page directory, warn otherwise.
 +       */
 +      is_hugepd = (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
 +      WARN(is_hugepd, "Found wrong page directory format\n");
 +      return 0;
 +}
 +#endif
 +
  #else
  int pmd_huge(pmd_t pmd)
  {
@@@ -128,7 -109,7 +128,7 @@@ int pgd_huge(pgd_t pgd
  pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
  {
        /* Only called for hugetlbfs pages, hence can ignore THP */
 -      return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
 +      return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
  }
  
  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@@ -703,14 -684,13 +703,14 @@@ void hugetlb_free_pgd_range(struct mmu_
  struct page *
  follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
  {
 +      bool is_thp;
        pte_t *ptep, pte;
        unsigned shift;
        unsigned long mask, flags;
        struct page *page = ERR_PTR(-EINVAL);
  
        local_irq_save(flags);
 -      ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 +      ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
        if (!ptep)
                goto no_page;
        pte = READ_ONCE(*ptep);
         * Transparent hugepages are handled by generic code. We can skip them
         * here.
         */
 -      if (!shift || pmd_trans_huge(__pmd(pte_val(pte))))
 +      if (!shift || is_thp)
                goto no_page;
  
        if (!pte_present(pte)) {
@@@ -976,7 -956,7 +976,7 @@@ void flush_dcache_icache_hugepage(struc
   */
  
  pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 -                                 unsigned *shift)
 +                                 bool *is_thp, unsigned *shift)
  {
        pgd_t pgd, *pgdp;
        pud_t pud, *pudp;
        if (shift)
                *shift = 0;
  
 +      if (is_thp)
 +              *is_thp = false;
 +
        pgdp = pgdir + pgd_index(ea);
        pgd  = READ_ONCE(*pgdp);
        /*
                        /*
                         * A hugepage collapse is captured by pmd_none, because
                         * it mark the pmd none and do a hpte invalidate.
-                        *
-                        * We don't worry about pmd_trans_splitting here, The
-                        * caller if it needs to handle the splitting case
-                        * should check for that.
                         */
                        if (pmd_none(pmd))
                                return NULL;
  
 -                      if (pmd_huge(pmd) || pmd_large(pmd)) {
 +                      if (pmd_trans_huge(pmd)) {
 +                              if (is_thp)
 +                                      *is_thp = true;
 +                              ret_pte = (pte_t *) pmdp;
 +                              goto out;
 +                      }
 +
 +                      if (pmd_huge(pmd)) {
                                ret_pte = (pte_t *) pmdp;
                                goto out;
                        } else if (is_hugepd(__hugepd(pmd_val(pmd))))
@@@ -1071,7 -1037,7 +1067,7 @@@ int gup_hugepte(pte_t *ptep, unsigned l
  {
        unsigned long mask;
        unsigned long pte_end;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        pte_t pte;
        int refs;
  
        head = pte_page(pte);
  
        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                return 0;
        }
  
-       /*
-        * Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
        return 1;
  }
diff --combined arch/powerpc/mm/numa.c
index b85d44271c3b9a1591e1fc5c3cd75db9501a5133,8d8a541211d0dc2164d9f48437a4e21fd3fe13e7..669a15e7fa76a07ad57c3d2b82712766520a049a
@@@ -80,7 -80,7 +80,7 @@@ static void __init setup_node_to_cpumas
                setup_nr_node_ids();
  
        /* allocate the map */
-       for (node = 0; node < nr_node_ids; node++)
+       for_each_node(node)
                alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
  
        /* cpumask_of_node() will now work */
@@@ -276,6 -276,7 +276,6 @@@ static int of_node_to_nid_single(struc
  /* Walk the device tree upwards, looking for an associativity id */
  int of_node_to_nid(struct device_node *device)
  {
 -      struct device_node *tmp;
        int nid = -1;
  
        of_node_get(device);
                if (nid != -1)
                        break;
  
 -              tmp = device;
 -              device = of_get_parent(tmp);
 -              of_node_put(tmp);
 +              device = of_get_next_parent(device);
        }
        of_node_put(device);
  
index 1c65ef92768dbb553563506373094338ad92e7f9,13b9bcf5485e5f70f340f98c72a51f6e2af4b62b..610f472f91d14c25cef49a118ecc7f1d9eaf73b1
@@@ -179,19 -179,6 +179,19 @@@ static int setup_one_atmu(struct ccsr_p
        return i;
  }
  
 +static bool is_kdump(void)
 +{
 +      struct device_node *node;
 +
 +      node = of_find_node_by_type(NULL, "memory");
 +      if (!node) {
 +              WARN_ON_ONCE(1);
 +              return false;
 +      }
 +
 +      return of_property_read_bool(node, "linux,usable-memory");
 +}
 +
  /* atmu setup for fsl pci/pcie controller */
  static void setup_pci_atmu(struct pci_controller *hose)
  {
        const char *name = hose->dn->full_name;
        const u64 *reg;
        int len;
 +      bool setup_inbound;
 +
 +      /*
 +       * If this is kdump, we don't want to trigger a bunch of PCI
 +       * errors by closing the window on in-flight DMA.
 +       *
 +       * We still run most of the function's logic so that things like
 +       * hose->dma_window_size still get set.
 +       */
 +      setup_inbound = !is_kdump();
  
        if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
                if (in_be32(&pci->block_rev1) >= PCIE_IP_REV_2_2) {
        /* Disable all windows (except powar0 since it's ignored) */
        for(i = 1; i < 5; i++)
                out_be32(&pci->pow[i].powar, 0);
 -      for (i = start_idx; i < end_idx; i++)
 -              out_be32(&pci->piw[i].piwar, 0);
 +
 +      if (setup_inbound) {
 +              for (i = start_idx; i < end_idx; i++)
 +                      out_be32(&pci->piw[i].piwar, 0);
 +      }
  
        /* Setup outbound MEM window */
        for(i = 0, j = 1; i < 3; i++) {
  
        /* Setup inbound mem window */
        mem = memblock_end_of_DRAM();
 +      pr_info("%s: end of DRAM %llx\n", __func__, mem);
  
        /*
         * The msi-address-64 property, if it exists, indicates the physical
  
                piwar |= ((mem_log - 1) & PIWAR_SZ_MASK);
  
 -              /* Setup inbound memory window */
 -              out_be32(&pci->piw[win_idx].pitar,  0x00000000);
 -              out_be32(&pci->piw[win_idx].piwbar, 0x00000000);
 -              out_be32(&pci->piw[win_idx].piwar,  piwar);
 -              win_idx--;
 +              if (setup_inbound) {
 +                      /* Setup inbound memory window */
 +                      out_be32(&pci->piw[win_idx].pitar,  0x00000000);
 +                      out_be32(&pci->piw[win_idx].piwbar, 0x00000000);
 +                      out_be32(&pci->piw[win_idx].piwar,  piwar);
 +              }
  
 +              win_idx--;
                hose->dma_window_base_cur = 0x00000000;
                hose->dma_window_size = (resource_size_t)sz;
  
  
                        piwar = (piwar & ~PIWAR_SZ_MASK) | (mem_log - 1);
  
 -                      /* Setup inbound memory window */
 -                      out_be32(&pci->piw[win_idx].pitar,  0x00000000);
 -                      out_be32(&pci->piw[win_idx].piwbear,
 -                                      pci64_dma_offset >> 44);
 -                      out_be32(&pci->piw[win_idx].piwbar,
 -                                      pci64_dma_offset >> 12);
 -                      out_be32(&pci->piw[win_idx].piwar,  piwar);
 +                      if (setup_inbound) {
 +                              /* Setup inbound memory window */
 +                              out_be32(&pci->piw[win_idx].pitar,  0x00000000);
 +                              out_be32(&pci->piw[win_idx].piwbear,
 +                                              pci64_dma_offset >> 44);
 +                              out_be32(&pci->piw[win_idx].piwbar,
 +                                              pci64_dma_offset >> 12);
 +                              out_be32(&pci->piw[win_idx].piwar,  piwar);
 +                      }
  
                        /*
                         * install our own dma_set_mask handler to fixup dma_ops
        } else {
                u64 paddr = 0;
  
 -              /* Setup inbound memory window */
 -              out_be32(&pci->piw[win_idx].pitar,  paddr >> 12);
 -              out_be32(&pci->piw[win_idx].piwbar, paddr >> 12);
 -              out_be32(&pci->piw[win_idx].piwar,  (piwar | (mem_log - 1)));
 -              win_idx--;
 +              if (setup_inbound) {
 +                      /* Setup inbound memory window */
 +                      out_be32(&pci->piw[win_idx].pitar,  paddr >> 12);
 +                      out_be32(&pci->piw[win_idx].piwbar, paddr >> 12);
 +                      out_be32(&pci->piw[win_idx].piwar,
 +                               (piwar | (mem_log - 1)));
 +              }
  
 +              win_idx--;
                paddr += 1ull << mem_log;
                sz -= 1ull << mem_log;
  
                        mem_log = ilog2(sz);
                        piwar |= (mem_log - 1);
  
 -                      out_be32(&pci->piw[win_idx].pitar,  paddr >> 12);
 -                      out_be32(&pci->piw[win_idx].piwbar, paddr >> 12);
 -                      out_be32(&pci->piw[win_idx].piwar,  piwar);
 -                      win_idx--;
 +                      if (setup_inbound) {
 +                              out_be32(&pci->piw[win_idx].pitar,
 +                                       paddr >> 12);
 +                              out_be32(&pci->piw[win_idx].piwbar,
 +                                       paddr >> 12);
 +                              out_be32(&pci->piw[win_idx].piwar, piwar);
 +                      }
  
 +                      win_idx--;
                        paddr += 1ull << mem_log;
                }
  
@@@ -1037,10 -999,10 +1037,10 @@@ int fsl_pci_mcheck_exception(struct pt_
                        ret = get_user(regs->nip, &inst);
                        pagefault_enable();
                } else {
-                       ret = probe_kernel_address(regs->nip, inst);
+                       ret = probe_kernel_address((void *)regs->nip, inst);
                }
  
 -              if (mcheck_handle_load(regs, inst)) {
 +              if (!ret && mcheck_handle_load(regs, inst)) {
                        regs->nip += 4;
                        return 1;
                }
index 024f85f947aec50ea93c881e56a73ba3a5591d3c,5690abafe13ea240109441cd83e309b828a48796..64ead80912488b476e19a004eaf01924dbdc6b4c
@@@ -193,15 -193,9 +193,15 @@@ static inline int is_module_addr(void *
  #define _PAGE_UNUSED  0x080           /* SW bit for pgste usage state */
  #define __HAVE_ARCH_PTE_SPECIAL
  
 +#ifdef CONFIG_MEM_SOFT_DIRTY
 +#define _PAGE_SOFT_DIRTY 0x002                /* SW pte soft dirty bit */
 +#else
 +#define _PAGE_SOFT_DIRTY 0x000
 +#endif
 +
  /* Set of bits not changed in pte_modify */
  #define _PAGE_CHG_MASK                (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \
 -                               _PAGE_YOUNG)
 +                               _PAGE_YOUNG | _PAGE_SOFT_DIRTY)
  
  /*
   * handle_pte_fault uses pte_present and pte_none to find out the pte type
  
  #define _SEGMENT_ENTRY_DIRTY  0x2000  /* SW segment dirty bit */
  #define _SEGMENT_ENTRY_YOUNG  0x1000  /* SW segment young bit */
- #define _SEGMENT_ENTRY_SPLIT  0x0800  /* THP splitting bit */
  #define _SEGMENT_ENTRY_LARGE  0x0400  /* STE-format control, large page */
  #define _SEGMENT_ENTRY_READ   0x0002  /* SW segment read bit */
  #define _SEGMENT_ENTRY_WRITE  0x0001  /* SW segment write bit */
  
 +#ifdef CONFIG_MEM_SOFT_DIRTY
 +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */
 +#else
 +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */
 +#endif
 +
  /*
   * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
   *                            dy..R...I...wr
   * SW-bits: y young, d dirty, r read, w write
   */
  
- #define _SEGMENT_ENTRY_SPLIT_BIT 11   /* THP splitting bit number */
  /* Page status table bits for virtualization */
  #define PGSTE_ACC_BITS        0xf000000000000000UL
  #define PGSTE_FP_BIT  0x0800000000000000UL
@@@ -523,10 -508,6 +520,6 @@@ static inline int pmd_bad(pmd_t pmd
        return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
  }
  
- #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long addr, pmd_t *pmdp);
  #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
  extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
@@@ -601,43 -582,6 +594,43 @@@ static inline int pmd_protnone(pmd_t pm
  }
  #endif
  
 +static inline int pte_soft_dirty(pte_t pte)
 +{
 +      return pte_val(pte) & _PAGE_SOFT_DIRTY;
 +}
 +#define pte_swp_soft_dirty pte_soft_dirty
 +
 +static inline pte_t pte_mksoft_dirty(pte_t pte)
 +{
 +      pte_val(pte) |= _PAGE_SOFT_DIRTY;
 +      return pte;
 +}
 +#define pte_swp_mksoft_dirty pte_mksoft_dirty
 +
 +static inline pte_t pte_clear_soft_dirty(pte_t pte)
 +{
 +      pte_val(pte) &= ~_PAGE_SOFT_DIRTY;
 +      return pte;
 +}
 +#define pte_swp_clear_soft_dirty pte_clear_soft_dirty
 +
 +static inline int pmd_soft_dirty(pmd_t pmd)
 +{
 +      return pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY;
 +}
 +
 +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
 +{
 +      pmd_val(pmd) |= _SEGMENT_ENTRY_SOFT_DIRTY;
 +      return pmd;
 +}
 +
 +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 +{
 +      pmd_val(pmd) &= ~_SEGMENT_ENTRY_SOFT_DIRTY;
 +      return pmd;
 +}
 +
  static inline pgste_t pgste_get_lock(pte_t *ptep)
  {
        unsigned long new = 0;
@@@ -938,7 -882,7 +931,7 @@@ static inline pte_t pte_mkclean(pte_t p
  
  static inline pte_t pte_mkdirty(pte_t pte)
  {
 -      pte_val(pte) |= _PAGE_DIRTY;
 +      pte_val(pte) |= _PAGE_DIRTY | _PAGE_SOFT_DIRTY;
        if (pte_val(pte) & _PAGE_WRITE)
                pte_val(pte) &= ~_PAGE_PROTECT;
        return pte;
@@@ -1267,10 -1211,8 +1260,10 @@@ static inline int ptep_set_access_flags
                                        pte_t entry, int dirty)
  {
        pgste_t pgste;
 +      pte_t oldpte;
  
 -      if (pte_same(*ptep, entry))
 +      oldpte = *ptep;
 +      if (pte_same(oldpte, entry))
                return 0;
        if (mm_has_pgste(vma->vm_mm)) {
                pgste = pgste_get_lock(ptep);
        ptep_flush_direct(vma->vm_mm, address, ptep);
  
        if (mm_has_pgste(vma->vm_mm)) {
 -              pgste_set_key(ptep, pgste, entry, vma->vm_mm);
 +              if (pte_val(oldpte) & _PAGE_INVALID)
 +                      pgste_set_key(ptep, pgste, entry, vma->vm_mm);
                pgste = pgste_set_pte(ptep, pgste, entry);
                pgste_set_unlock(ptep, pgste);
        } else
@@@ -1392,8 -1333,7 +1385,8 @@@ static inline pmd_t pmd_mkclean(pmd_t p
  static inline pmd_t pmd_mkdirty(pmd_t pmd)
  {
        if (pmd_large(pmd)) {
 -              pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY;
 +              pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY |
 +                              _SEGMENT_ENTRY_SOFT_DIRTY;
                if (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE)
                        pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
        }
@@@ -1424,8 -1364,7 +1417,7 @@@ static inline pmd_t pmd_modify(pmd_t pm
        if (pmd_large(pmd)) {
                pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
                        _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
-                       _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT |
-                       _SEGMENT_ENTRY_SOFT_DIRTY;
 -                      _SEGMENT_ENTRY_LARGE;
++                      _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
                pmd_val(pmd) |= massage_pgprot_pmd(newprot);
                if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
                        pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
@@@ -1533,12 -1472,6 +1525,6 @@@ extern void pgtable_trans_huge_deposit(
  #define __HAVE_ARCH_PGTABLE_WITHDRAW
  extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
  
- static inline int pmd_trans_splitting(pmd_t pmd)
- {
-       return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
-               (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
- }
  static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t entry)
  {
diff --combined arch/x86/Kconfig
index c22df590e7e7463c71ac80a3f7e795656f1d33c2,9e079f013c07e08d495d3ffff09cf68575b3b263..0f1ccc3b3d2b06729230b9ce4c44252bc283980b
@@@ -636,7 -636,7 +636,7 @@@ config X86_32_IRI
  
  config SCHED_OMIT_FRAME_POINTER
        def_bool y
 -      prompt "Single-depth WCHAN output"
 +      prompt "Single-depth WCHAN output" if !LTO && !FRAME_POINTER
        depends on X86
        ---help---
          Calculate simpler /proc/<PID>/wchan values. If this option
@@@ -1123,10 -1123,8 +1123,10 @@@ config X86_REBOOTFIXUP
          Say N otherwise.
  
  config MICROCODE
 -      tristate "CPU microcode loading support"
 +      bool "CPU microcode loading support"
 +      default y
        depends on CPU_SUP_AMD || CPU_SUP_INTEL
 +      depends on BLK_DEV_INITRD
        select FW_LOADER
        ---help---
  
@@@ -1168,6 -1166,24 +1168,6 @@@ config MICROCODE_OLD_INTERFAC
        def_bool y
        depends on MICROCODE
  
 -config MICROCODE_INTEL_EARLY
 -      bool
 -
 -config MICROCODE_AMD_EARLY
 -      bool
 -
 -config MICROCODE_EARLY
 -      bool "Early load microcode"
 -      depends on MICROCODE=y && BLK_DEV_INITRD
 -      select MICROCODE_INTEL_EARLY if MICROCODE_INTEL
 -      select MICROCODE_AMD_EARLY if MICROCODE_AMD
 -      default y
 -      help
 -        This option provides functionality to read additional microcode data
 -        at the beginning of initrd image. The data tells kernel to load
 -        microcode to CPU's as early as possible. No functional change if no
 -        microcode data is glued to the initrd, therefore it's safe to say Y.
 -
  config X86_MSR
        tristate "/dev/cpu/*/msr - Model-specific register support"
        ---help---
@@@ -2027,55 -2043,6 +2027,55 @@@ config COMPAT_VDS
          If unsure, say N: if you are compiling your own kernel, you
          are unlikely to be using a buggy version of glibc.
  
 +choice
 +      prompt "vsyscall table for legacy applications"
 +      depends on X86_64
 +      default LEGACY_VSYSCALL_EMULATE
 +      help
 +        Legacy user code that does not know how to find the vDSO expects
 +        to be able to issue three syscalls by calling fixed addresses in
 +        kernel space. Since this location is not randomized with ASLR,
 +        it can be used to assist security vulnerability exploitation.
 +
 +        This setting can be changed at boot time via the kernel command
 +        line parameter vsyscall=[native|emulate|none].
 +
 +        On a system with recent enough glibc (2.14 or newer) and no
 +        static binaries, you can say None without a performance penalty
 +        to improve security.
 +
 +        If unsure, select "Emulate".
 +
 +      config LEGACY_VSYSCALL_NATIVE
 +              bool "Native"
 +              help
 +                Actual executable code is located in the fixed vsyscall
 +                address mapping, implementing time() efficiently. Since
 +                this makes the mapping executable, it can be used during
 +                security vulnerability exploitation (traditionally as
 +                ROP gadgets). This configuration is not recommended.
 +
 +      config LEGACY_VSYSCALL_EMULATE
 +              bool "Emulate"
 +              help
 +                The kernel traps and emulates calls into the fixed
 +                vsyscall address mapping. This makes the mapping
 +                non-executable, but it still contains known contents,
 +                which could be used in certain rare security vulnerability
 +                exploits. This configuration is recommended when userspace
 +                still uses the vsyscall area.
 +
 +      config LEGACY_VSYSCALL_NONE
 +              bool "None"
 +              help
 +                There will be no vsyscall mapping at all. This will
 +                eliminate any risk of ASLR bypass due to the vsyscall
 +                fixed address mapping. Attempts to use the vsyscalls
 +                will be reported to dmesg, so that either old or
 +                malicious userspace programs can be identified.
 +
 +endchoice
 +
  config CMDLINE_BOOL
        bool "Built-in kernel command line"
        ---help---
@@@ -2151,6 -2118,9 +2151,9 @@@ config USE_PERCPU_NUMA_NODE_I
        def_bool y
        depends on NUMA
  
+ config HAVE_MEMORYLESS_NODES
+       def_bool NUMA
+
  config ARCH_ENABLE_SPLIT_PMD_PTLOCK
        def_bool y
        depends on X86_64 || X86_PAE
index caa2c712d1e70c5895d92cf856b91a02d9123083,143ef9f37932691c7af983924a7213e9a7824301..f17705e1332cc3b81dc9a3a7551ece5d1848d5db
@@@ -8,7 -8,7 +8,7 @@@
  #
  0     i386    restart_syscall         sys_restart_syscall
  1     i386    exit                    sys_exit
 -2     i386    fork                    sys_fork                        stub32_fork
 +2     i386    fork                    sys_fork                        sys_fork
  3     i386    read                    sys_read
  4     i386    write                   sys_write
  5     i386    open                    sys_open                        compat_sys_open
@@@ -17,7 -17,7 +17,7 @@@
  8     i386    creat                   sys_creat
  9     i386    link                    sys_link
  10    i386    unlink                  sys_unlink
 -11    i386    execve                  sys_execve                      stub32_execve
 +11    i386    execve                  sys_execve                      compat_sys_execve
  12    i386    chdir                   sys_chdir
  13    i386    time                    sys_time                        compat_sys_time
  14    i386    mknod                   sys_mknod
  116   i386    sysinfo                 sys_sysinfo                     compat_sys_sysinfo
  117   i386    ipc                     sys_ipc                         compat_sys_ipc
  118   i386    fsync                   sys_fsync
 -119   i386    sigreturn               sys_sigreturn                   stub32_sigreturn
 +119   i386    sigreturn               sys_sigreturn                   sys32_sigreturn
  120   i386    clone                   sys_clone                       stub32_clone
  121   i386    setdomainname           sys_setdomainname
  122   i386    uname                   sys_newuname
  170   i386    setresgid               sys_setresgid16
  171   i386    getresgid               sys_getresgid16
  172   i386    prctl                   sys_prctl
 -173   i386    rt_sigreturn            sys_rt_sigreturn                stub32_rt_sigreturn
 +173   i386    rt_sigreturn            sys_rt_sigreturn                sys32_rt_sigreturn
  174   i386    rt_sigaction            sys_rt_sigaction                compat_sys_rt_sigaction
  175   i386    rt_sigprocmask          sys_rt_sigprocmask
  176   i386    rt_sigpending           sys_rt_sigpending               compat_sys_rt_sigpending
  187   i386    sendfile                sys_sendfile                    compat_sys_sendfile
  188   i386    getpmsg
  189   i386    putpmsg
 -190   i386    vfork                   sys_vfork                       stub32_vfork
 +190   i386    vfork                   sys_vfork                       sys_vfork
  191   i386    ugetrlimit              sys_getrlimit                   compat_sys_getrlimit
  192   i386    mmap2                   sys_mmap_pgoff
  193   i386    truncate64              sys_truncate64                  sys32_truncate64
  355   i386    getrandom               sys_getrandom
  356   i386    memfd_create            sys_memfd_create
  357   i386    bpf                     sys_bpf
 -358   i386    execveat                sys_execveat                    stub32_execveat
 +358   i386    execveat                sys_execveat                    compat_sys_execveat
  359   i386    socket                  sys_socket
  360   i386    socketpair              sys_socketpair
  361   i386    bind                    sys_bind
  373   i386    shutdown                sys_shutdown
  374   i386    userfaultfd             sys_userfaultfd
  375   i386    membarrier              sys_membarrier
+ 376   i386    mlock2                  sys_mlock2
index 6ec0c8b2e9df5b1d4c7702fd7f1d96c2c24db5d4,a8fdfe0d72775fa14d59189b5b3b48308e93ad9a..9ff592003afda8b9d1d2bf1d3353ae8f04625d60
  #include <asm/x86_init.h>
  
  void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 +void ptdump_walk_pgd_level_checkwx(void);
 +
 +#ifdef CONFIG_DEBUG_WX
 +#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
 +#else
 +#define debug_checkwx() do { } while (0)
 +#endif
  
  /*
   * ZERO_PAGE is a global shared page that is always zero: used
@@@ -149,12 -142,12 +149,12 @@@ static inline unsigned long pte_pfn(pte
  
  static inline unsigned long pmd_pfn(pmd_t pmd)
  {
 -      return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
 +      return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
  }
  
  static inline unsigned long pud_pfn(pud_t pud)
  {
 -      return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT;
 +      return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
  }
  
  #define pte_page(pte) pfn_to_page(pte_pfn(pte))
@@@ -165,11 -158,6 +165,6 @@@ static inline int pmd_large(pmd_t pte
  }
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
- static inline int pmd_trans_splitting(pmd_t pmd)
- {
-       return pmd_val(pmd) & _PAGE_SPLITTING;
- }
  static inline int pmd_trans_huge(pmd_t pmd)
  {
        return pmd_val(pmd) & _PAGE_PSE;
@@@ -274,6 -262,11 +269,11 @@@ static inline pmd_t pmd_mkold(pmd_t pmd
        return pmd_clear_flags(pmd, _PAGE_ACCESSED);
  }
  
+ static inline pmd_t pmd_mkclean(pmd_t pmd)
+ {
+       return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ }
  static inline pmd_t pmd_wrprotect(pmd_t pmd)
  {
        return pmd_clear_flags(pmd, _PAGE_RW);
@@@ -325,16 -318,6 +325,16 @@@ static inline pmd_t pmd_mksoft_dirty(pm
        return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
  }
  
 +static inline pte_t pte_clear_soft_dirty(pte_t pte)
 +{
 +      return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
 +}
 +
 +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 +{
 +      return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
 +}
 +
  #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
  
  /*
@@@ -396,9 -379,7 +396,9 @@@ static inline pgprot_t pgprot_modify(pg
        return __pgprot(preservebits | addbits);
  }
  
 -#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
 +#define pte_pgprot(x) __pgprot(pte_flags(x))
 +#define pmd_pgprot(x) __pgprot(pmd_flags(x))
 +#define pud_pgprot(x) __pgprot(pud_flags(x))
  
  #define canon_pgprot(p) __pgprot(massage_pgprot(p))
  
@@@ -521,15 -502,14 +521,15 @@@ static inline int pmd_none(pmd_t pmd
  
  static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  {
 -      return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
 +      return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd));
  }
  
  /*
   * Currently stuck as a macro due to indirect forward reference to
   * linux/mmzone.h's __section_mem_map_addr() definition:
   */
 -#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
 +#define pmd_page(pmd)         \
 +      pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
  
  /*
   * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
@@@ -590,15 -570,14 +590,15 @@@ static inline int pud_present(pud_t pud
  
  static inline unsigned long pud_page_vaddr(pud_t pud)
  {
 -      return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
 +      return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud));
  }
  
  /*
   * Currently stuck as a macro due to indirect forward reference to
   * linux/mmzone.h's __section_mem_map_addr() definition:
   */
 -#define pud_page(pud)         pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
 +#define pud_page(pud)         \
 +      pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
  
  /* Find an entry in the second-level page table.. */
  static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@@ -816,10 -795,6 +816,6 @@@ extern int pmdp_clear_flush_young(struc
                                  unsigned long address, pmd_t *pmdp);
  
  
- #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long addr, pmd_t *pmdp);
  #define __HAVE_ARCH_PMD_WRITE
  static inline int pmd_write(pmd_t pmd)
  {
index dd5b0aa9dd2f93a01b554029ebc243aa56ae91d3,d173197cfd9e0cfb4051a566ac96aaa663aacd9d..116fc4ee586f3750da9bb2d3d9d7867e71fbea09
@@@ -22,7 -22,6 +22,6 @@@
  #define _PAGE_BIT_PAT_LARGE   12      /* On 2MB or 1GB pages */
  #define _PAGE_BIT_SPECIAL     _PAGE_BIT_SOFTW1
  #define _PAGE_BIT_CPA_TEST    _PAGE_BIT_SOFTW1
- #define _PAGE_BIT_SPLITTING   _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
  #define _PAGE_BIT_HIDDEN      _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
  #define _PAGE_BIT_SOFT_DIRTY  _PAGE_BIT_SOFTW3 /* software dirty tracking */
  #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
@@@ -46,7 -45,6 +45,6 @@@
  #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
  #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
  #define _PAGE_CPA_TEST        (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
- #define _PAGE_SPLITTING       (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
  #define __HAVE_ARCH_PTE_SPECIAL
  
  #ifdef CONFIG_KMEMCHECK
@@@ -209,10 -207,10 +207,10 @@@ enum page_cache_mode 
  
  #include <linux/types.h>
  
 -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
 +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
  #define PTE_PFN_MASK          ((pteval_t)PHYSICAL_PAGE_MASK)
  
 -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
 +/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */
  #define PTE_FLAGS_MASK                (~PTE_PFN_MASK)
  
  typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
@@@ -276,46 -274,14 +274,46 @@@ static inline pmdval_t native_pmd_val(p
  }
  #endif
  
 +static inline pudval_t pud_pfn_mask(pud_t pud)
 +{
 +      if (native_pud_val(pud) & _PAGE_PSE)
 +              return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK;
 +      else
 +              return PTE_PFN_MASK;
 +}
 +
 +static inline pudval_t pud_flags_mask(pud_t pud)
 +{
 +      if (native_pud_val(pud) & _PAGE_PSE)
 +              return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK);
 +      else
 +              return ~PTE_PFN_MASK;
 +}
 +
  static inline pudval_t pud_flags(pud_t pud)
  {
 -      return native_pud_val(pud) & PTE_FLAGS_MASK;
 +      return native_pud_val(pud) & pud_flags_mask(pud);
 +}
 +
 +static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
 +{
 +      if (native_pmd_val(pmd) & _PAGE_PSE)
 +              return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK;
 +      else
 +              return PTE_PFN_MASK;
 +}
 +
 +static inline pmdval_t pmd_flags_mask(pmd_t pmd)
 +{
 +      if (native_pmd_val(pmd) & _PAGE_PSE)
 +              return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK);
 +      else
 +              return ~PTE_PFN_MASK;
  }
  
  static inline pmdval_t pmd_flags(pmd_t pmd)
  {
 -      return native_pmd_val(pmd) & PTE_FLAGS_MASK;
 +      return native_pmd_val(pmd) & pmd_flags_mask(pmd);
  }
  
  static inline pte_t native_make_pte(pteval_t val)
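A self-contained sketch of why the new pmd_pfn_mask()/pud_pfn_mask() helpers above must look at _PAGE_PSE (simplified constants, 46 physical address bits assumed; illustrative userspace code, not the kernel's definitions): a 2MB leaf entry keeps flag bits in the low 21 bits, so its PFN mask has to be 2MB-aligned, while a 4KB-style entry keeps using the ordinary PTE_PFN_MASK.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PMD_SHIFT       21
#define PHYS_MASK       ((1ULL << 46) - 1)              /* assumed physical bits */
#define PTE_PFN_MASK    (PHYS_MASK & ~((1ULL << PAGE_SHIFT) - 1))
#define PMD_PAGE_MASK   (~((1ULL << PMD_SHIFT) - 1))
#define _PAGE_PSE       (1ULL << 7)

/* Mirrors the shape of the kernel helper: the mask depends on the PSE bit. */
static uint64_t pmd_pfn_mask(uint64_t pmd)
{
        return (pmd & _PAGE_PSE) ? (PMD_PAGE_MASK & PHYS_MASK) : PTE_PFN_MASK;
}

int main(void)
{
        uint64_t pmd_4k = 0x12345000ULL | 0x63;               /* points to a page table */
        uint64_t pmd_2m = 0x12200000ULL | 0x63 | _PAGE_PSE;   /* 2MB leaf mapping */

        printf("4K-style pfn: %#llx\n",
               (unsigned long long)((pmd_4k & pmd_pfn_mask(pmd_4k)) >> PAGE_SHIFT));
        printf("2MB-style pfn: %#llx\n",
               (unsigned long long)((pmd_2m & pmd_pfn_mask(pmd_2m)) >> PAGE_SHIFT));
        return 0;
}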
index e75907601a41c349e05c8dfe63047dc460ccdf30,b98f1f915357de77f21d22ed74f9836fa4df1e06..3625ac798821366bbdd2680f9db172a081550fea
@@@ -705,8 -705,14 +705,14 @@@ static void acpi_map_cpu2node(acpi_hand
  
        nid = acpi_get_node(handle);
        if (nid != -1) {
+               if (try_online_node(nid)) {
+                       pr_warn("failed to online node%d for CPU%d, use node%d instead.\n",
+                               nid, cpu, first_node(node_online_map));
+                       nid = first_node(node_online_map);
+               }
                set_apicid_to_node(physid, nid);
                numa_set_node(cpu, nid);
+               set_cpu_numa_mem(cpu, local_memory_node(nid));
        }
  #endif
  }
@@@ -733,9 -739,10 +739,10 @@@ int acpi_unmap_cpu(int cpu
  {
  #ifdef CONFIG_ACPI_NUMA
        set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+       set_cpu_numa_mem(cpu, NUMA_NO_NODE);
  #endif
  
-       per_cpu(x86_cpu_to_apicid, cpu) = -1;
+       per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
        set_cpu_present(cpu, false);
        num_processors--;
  
@@@ -976,8 -983,6 +983,8 @@@ static int __init acpi_parse_madt_lapic
  {
        int count;
        int x2count = 0;
 +      int ret;
 +      struct acpi_subtable_proc madt_proc[2];
  
        if (!cpu_has_apic)
                return -ENODEV;
                                      acpi_parse_sapic, MAX_LOCAL_APIC);
  
        if (!count) {
 -              x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
 -                                      acpi_parse_x2apic, MAX_LOCAL_APIC);
 -              count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
 -                                      acpi_parse_lapic, MAX_LOCAL_APIC);
 +              memset(madt_proc, 0, sizeof(madt_proc));
 +              madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
 +              madt_proc[0].handler = acpi_parse_lapic;
 +              madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
 +              madt_proc[1].handler = acpi_parse_x2apic;
 +              ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
 +                              sizeof(struct acpi_table_madt),
 +                              madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
 +              if (ret < 0) {
 +                      printk(KERN_ERR PREFIX
 +                                      "Error parsing LAPIC/X2APIC entries\n");
 +                      return ret;
 +              }
 +
 +              x2count = madt_proc[0].count;
 +              count = madt_proc[1].count;
        }
        if (!count && !x2count) {
                printk(KERN_ERR PREFIX "No LAPIC entries present\n");
index cd99433b8ba17597cbc9e91aba9c40eee7e05e4b,a8e618b16a66a25f07260dac1379998a79b8ac9a..6ba014c61d62d20a078dd260103f23465a47a8cd
@@@ -90,7 -90,7 +90,7 @@@ void *dma_generic_alloc_coherent(struc
  again:
        page = NULL;
        /* CMA can be used only in the context which permits sleeping */
-       if (flag & __GFP_WAIT) {
+       if (gfpflags_allow_blocking(flag)) {
                page = dma_alloc_from_contiguous(dev, count, get_order(size));
                if (page && page_to_phys(page) + size > dma_mask) {
                        dma_release_from_contiguous(dev, page, count);
@@@ -131,12 -131,11 +131,12 @@@ void dma_generic_free_coherent(struct d
  
  bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
  {
 +      if (!*dev)
 +              *dev = &x86_dma_fallback_dev;
 +
        *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
        *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
  
 -      if (!*dev)
 -              *dev = &x86_dma_fallback_dev;
        if (!is_device_dma_capable(*dev))
                return false;
        return true;
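Many hunks in this merge replace open-coded __GFP_WAIT tests with gfpflags_allow_blocking(). A compilable sketch of that test (the flag value below is illustrative; the kernel defines the real bits in include/linux/gfp.h): blocking is allowed exactly when __GFP_DIRECT_RECLAIM is set in the mask.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int gfp_t;

#define __GFP_DIRECT_RECLAIM    ((gfp_t)0x400000u)      /* illustrative bit value */
#define GFP_KERNEL              (__GFP_DIRECT_RECLAIM | 0x100u)
#define GFP_ATOMIC              ((gfp_t)0x20u)

static bool gfpflags_allow_blocking(gfp_t gfp_flags)
{
        return gfp_flags & __GFP_DIRECT_RECLAIM;
}

int main(void)
{
        printf("GFP_KERNEL may block: %d\n", gfpflags_allow_blocking(GFP_KERNEL)); /* 1 */
        printf("GFP_ATOMIC may block: %d\n", gfpflags_allow_blocking(GFP_ATOMIC)); /* 0 */
        return 0;
}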
index 892ee2e5ecbce417df506715f7b28d28c403ef91,a2a58e5337fbfbee38900fb0463c69f5f7449f1f..5ed24ea0e9455558931a414a38045cac44edb2ae
@@@ -155,6 -155,8 +155,8 @@@ static void smp_callin(void
         */
        phys_id = read_apic_id();
  
+       set_numa_mem(local_memory_node(cpu_to_node(cpuid)));
        /*
         * the boot CPU has finished the init stage and is spinning
         * on callin_map until we finish. We are free to set up this
@@@ -509,7 -511,7 +511,7 @@@ void __inquire_remote_apic(int apicid
   */
  #define UDELAY_10MS_DEFAULT 10000
  
 -static unsigned int init_udelay = UDELAY_10MS_DEFAULT;
 +static unsigned int init_udelay = INT_MAX;
  
  static int __init cpu_init_udelay(char *str)
  {
@@@ -522,16 -524,13 +524,16 @@@ early_param("cpu_init_udelay", cpu_init
  static void __init smp_quirk_init_udelay(void)
  {
        /* if cmdline changed it from default, leave it alone */
 -      if (init_udelay != UDELAY_10MS_DEFAULT)
 +      if (init_udelay != INT_MAX)
                return;
  
        /* if modern processor, use no delay */
        if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
            ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF)))
                init_udelay = 0;
 +
 +      /* else, use legacy delay */
 +      init_udelay = UDELAY_10MS_DEFAULT;
  }
  
  /*
@@@ -660,9 -659,7 +662,9 @@@ wakeup_secondary_cpu_via_init(int phys_
                /*
                 * Give the other CPU some time to accept the IPI.
                 */
 -              if (init_udelay)
 +              if (init_udelay == 0)
 +                      udelay(10);
 +              else
                        udelay(300);
  
                pr_debug("Startup point 1\n");
                /*
                 * Give the other CPU some time to accept the IPI.
                 */
 -              if (init_udelay)
 +              if (init_udelay == 0)
 +                      udelay(10);
 +              else
                        udelay(200);
  
                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
diff --combined arch/x86/mm/gup.c
index ae9a37bf13711460892584e67d02880291168f86,49bbbc57603b5e31326ff404cbbffe4d55f39a2b..f8cb3e8ac250ecc8ae288ec0135f5689ca7cc0b9
@@@ -118,26 -118,25 +118,24 @@@ static noinline int gup_huge_pmd(pmd_t 
                unsigned long end, int write, struct page **pages, int *nr)
  {
        unsigned long mask;
 -      pte_t pte = *(pte_t *)&pmd;
        struct page *head, *page;
        int refs;
  
        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
 -      if ((pte_flags(pte) & mask) != mask)
 +      if ((pmd_flags(pmd) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
 -      VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
 -      VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 +      VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
 +      VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
  
        refs = 0;
 -      head = pte_page(pte);
 +      head = pmd_page(pmd);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
@@@ -158,18 -157,7 +156,7 @@@ static int gup_pmd_range(pud_t pud, uns
                pmd_t pmd = *pmdp;
  
                next = pmd_addr_end(addr, end);
-               /*
-                * The pmd_trans_splitting() check below explains why
-                * pmdp_splitting_flush has to flush the tlb, to stop
-                * this gup-fast code from running while we set the
-                * splitting bit in the pmd. Returning zero will take
-                * the slow path that will call wait_split_huge_page()
-                * if the pmd is still in splitting state. gup-fast
-                * can't because it has irq disabled and
-                * wait_split_huge_page() would never return as the
-                * tlb flush IPI wouldn't run.
-                */
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
                        /*
@@@ -194,26 -182,25 +181,24 @@@ static noinline int gup_huge_pud(pud_t 
                unsigned long end, int write, struct page **pages, int *nr)
  {
        unsigned long mask;
 -      pte_t pte = *(pte_t *)&pud;
        struct page *head, *page;
        int refs;
  
        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
 -      if ((pte_flags(pte) & mask) != mask)
 +      if ((pud_flags(pud) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
 -      VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
 -      VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 +      VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
 +      VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
  
        refs = 0;
 -      head = pte_page(pte);
 +      head = pud_page(pud);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
diff --combined block/blk-core.c
index 89eec79658702a7e53712bc52178dae25bddcc22,0391206868e9a81dca0add4a72fc065c3b66d5c0..5dd1f54d793549e50180b0e4840f8667536351ad
@@@ -554,30 -554,29 +554,30 @@@ void blk_cleanup_queue(struct request_q
         * Drain all requests queued before DYING marking. Set DEAD flag to
         * prevent that q->request_fn() gets invoked after draining finished.
         */
 -      if (q->mq_ops) {
 -              blk_mq_freeze_queue(q);
 -              spin_lock_irq(lock);
 -      } else {
 -              spin_lock_irq(lock);
 +      blk_freeze_queue(q);
 +      spin_lock_irq(lock);
 +      if (!q->mq_ops)
                __blk_drain_queue(q, true);
 -      }
        queue_flag_set(QUEUE_FLAG_DEAD, q);
        spin_unlock_irq(lock);
  
 +      /* for synchronous bio-based driver finish in-flight integrity i/o */
 +      blk_flush_integrity();
 +
        /* @q won't process any more request, flush async actions */
        del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
        blk_sync_queue(q);
  
        if (q->mq_ops)
                blk_mq_free_queue(q);
 +      percpu_ref_exit(&q->q_usage_counter);
  
        spin_lock_irq(lock);
        if (q->queue_lock != &q->__queue_lock)
                q->queue_lock = &q->__queue_lock;
        spin_unlock_irq(lock);
  
 -      bdi_destroy(&q->backing_dev_info);
 +      bdi_unregister(&q->backing_dev_info);
  
        /* @q is and will stay empty, shutdown and put */
        blk_put_queue(q);
@@@ -630,40 -629,6 +630,40 @@@ struct request_queue *blk_alloc_queue(g
  }
  EXPORT_SYMBOL(blk_alloc_queue);
  
-               if (!(gfp & __GFP_WAIT))
 +int blk_queue_enter(struct request_queue *q, gfp_t gfp)
 +{
 +      while (true) {
 +              int ret;
 +
 +              if (percpu_ref_tryget_live(&q->q_usage_counter))
 +                      return 0;
 +
++              if (!gfpflags_allow_blocking(gfp))
 +                      return -EBUSY;
 +
 +              ret = wait_event_interruptible(q->mq_freeze_wq,
 +                              !atomic_read(&q->mq_freeze_depth) ||
 +                              blk_queue_dying(q));
 +              if (blk_queue_dying(q))
 +                      return -ENODEV;
 +              if (ret)
 +                      return ret;
 +      }
 +}
 +
 +void blk_queue_exit(struct request_queue *q)
 +{
 +      percpu_ref_put(&q->q_usage_counter);
 +}
 +
 +static void blk_queue_usage_counter_release(struct percpu_ref *ref)
 +{
 +      struct request_queue *q =
 +              container_of(ref, struct request_queue, q_usage_counter);
 +
 +      wake_up_all(&q->mq_freeze_wq);
 +}
 +
  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
  {
        struct request_queue *q;
  
        init_waitqueue_head(&q->mq_freeze_wq);
  
 -      if (blkcg_init_queue(q))
 +      /*
 +       * Init percpu_ref in atomic mode so that it's faster to shutdown.
 +       * See blk_register_queue() for details.
 +       */
 +      if (percpu_ref_init(&q->q_usage_counter,
 +                              blk_queue_usage_counter_release,
 +                              PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
                goto fail_bdi;
  
 +      if (blkcg_init_queue(q))
 +              goto fail_ref;
 +
        return q;
  
 +fail_ref:
 +      percpu_ref_exit(&q->q_usage_counter);
  fail_bdi:
        bdi_destroy(&q->backing_dev_info);
  fail_split:
@@@ -1206,8 -1160,8 +1206,8 @@@ rq_starved
   * @bio: bio to allocate request for (can be %NULL)
   * @gfp_mask: allocation mask
   *
-  * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
-  * function keeps retrying under memory pressure and fails iff @q is dead.
+  * Get a free request from @q.  If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+  * this function keeps retrying under memory pressure and fails iff @q is dead.
   *
   * Must be called with @q->queue_lock held and,
   * Returns ERR_PTR on failure, with @q->queue_lock held.
@@@ -1227,7 -1181,7 +1227,7 @@@ retry
        if (!IS_ERR(rq))
                return rq;
  
-       if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
+       if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
                blk_put_rl(rl);
                return rq;
        }
@@@ -1305,11 -1259,11 +1305,11 @@@ EXPORT_SYMBOL(blk_get_request)
   * BUG.
   *
   * WARNING: When allocating/cloning a bio-chain, careful consideration should be
-  * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
-  * anything but the first bio in the chain. Otherwise you risk waiting for IO
-  * completion of a bio that hasn't been submitted yet, thus resulting in a
-  * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
-  * of bio_alloc(), as that avoids the mempool deadlock.
+  * given to how you allocate bios. In particular, you cannot use
+  * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
+  * you risk waiting for IO completion of a bio that hasn't been submitted yet,
+  * thus resulting in a deadlock. Alternatively bios should be allocated using
+  * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
   * If possible a big IO should be split into smaller parts when allocation
   * fails. Partial allocation should not be an error, or you risk a live-lock.
   */
        return ret;
  }
  
 +unsigned int blk_plug_queued_count(struct request_queue *q)
 +{
 +      struct blk_plug *plug;
 +      struct request *rq;
 +      struct list_head *plug_list;
 +      unsigned int ret = 0;
 +
 +      plug = current->plug;
 +      if (!plug)
 +              goto out;
 +
 +      if (q->mq_ops)
 +              plug_list = &plug->mq_list;
 +      else
 +              plug_list = &plug->list;
 +
 +      list_for_each_entry(rq, plug_list, queuelist) {
 +              if (rq->q == q)
 +                      ret++;
 +      }
 +out:
 +      return ret;
 +}
 +
  void init_request_from_bio(struct request *req, struct bio *bio)
  {
        req->cmd_type = REQ_TYPE_FS;
@@@ -1711,11 -1641,9 +1711,11 @@@ static void blk_queue_bio(struct reques
         * Check if we can merge with the plugged list before grabbing
         * any locks.
         */
 -      if (!blk_queue_nomerges(q) &&
 -          blk_attempt_plug_merge(q, bio, &request_count, NULL))
 -              return;
 +      if (!blk_queue_nomerges(q)) {
 +              if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
 +                      return;
 +      } else
 +              request_count = blk_plug_queued_count(q);
  
        spin_lock_irq(q->queue_lock);
  
@@@ -2038,19 -1966,9 +2038,19 @@@ void generic_make_request(struct bio *b
        do {
                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
  
-               if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) {
 -              q->make_request_fn(q, bio);
++              if (likely(blk_queue_enter(q, ___GFP_DIRECT_RECLAIM) == 0)) {
 +
 +                      q->make_request_fn(q, bio);
 +
 +                      blk_queue_exit(q);
  
 -              bio = bio_list_pop(current->bio_list);
 +                      bio = bio_list_pop(current->bio_list);
 +              } else {
 +                      struct bio *bio_next = bio_list_pop(current->bio_list);
 +
 +                      bio_io_error(bio);
 +                      bio = bio_next;
 +              }
        } while (bio);
        current->bio_list = NULL; /* deactivate */
  }
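A hedged userspace analog (pthreads; names only loosely follow the patch) of the blk_queue_enter()/blk_queue_exit() gate added above: submitters take a reference unless the queue is frozen or dying, non-blocking submitters get -EBUSY instead of sleeping, and unfreezing wakes the waiters, mirroring wake_up_all(&q->mq_freeze_wq).

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_queue {
        pthread_mutex_t lock;
        pthread_cond_t  unfrozen;
        int             users;
        bool            frozen;
        bool            dying;
};

static int queue_enter(struct fake_queue *q, bool may_block)
{
        pthread_mutex_lock(&q->lock);
        while (q->frozen && !q->dying) {
                if (!may_block) {               /* mirrors !gfpflags_allow_blocking(gfp) */
                        pthread_mutex_unlock(&q->lock);
                        return -EBUSY;
                }
                pthread_cond_wait(&q->unfrozen, &q->lock);
        }
        if (q->dying) {
                pthread_mutex_unlock(&q->lock);
                return -ENODEV;
        }
        q->users++;                             /* stands in for percpu_ref_tryget_live() */
        pthread_mutex_unlock(&q->lock);
        return 0;
}

static void queue_exit(struct fake_queue *q)
{
        pthread_mutex_lock(&q->lock);
        q->users--;                             /* stands in for percpu_ref_put() */
        pthread_mutex_unlock(&q->lock);
}

static void queue_unfreeze(struct fake_queue *q)
{
        pthread_mutex_lock(&q->lock);
        q->frozen = false;
        pthread_cond_broadcast(&q->unfrozen);   /* mirrors wake_up_all(&q->mq_freeze_wq) */
        pthread_mutex_unlock(&q->lock);
}

int main(void)
{
        struct fake_queue q = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .unfrozen = PTHREAD_COND_INITIALIZER,
        };

        if (!queue_enter(&q, false)) {
                printf("entered, users=%d\n", q.users);
                queue_exit(&q);
        }

        q.frozen = true;
        printf("frozen, non-blocking enter -> %d\n", queue_enter(&q, false));

        queue_unfreeze(&q);
        if (!queue_enter(&q, true)) {
                printf("unfrozen, blocking enter succeeded\n");
                queue_exit(&q);
        }
        return 0;
}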
diff --combined block/blk-mq-tag.c
index 60ac684c8b8c52f26fe8a83d290fbd8f75f76581,adbc577d83c481d4763848e9ed4b415afd4193c1..a07ca3488d96fb7a96159fbe48c0c9e5e95391ec
@@@ -75,10 -75,6 +75,10 @@@ void blk_mq_tag_wakeup_all(struct blk_m
        struct blk_mq_bitmap_tags *bt;
        int i, wake_index;
  
 +      /*
 +       * Make sure all changes prior to this are visible from other CPUs.
 +       */
 +      smp_mb();
        bt = &tags->bitmap_tags;
        wake_index = atomic_read(&bt->wake_index);
        for (i = 0; i < BT_WAIT_QUEUES; i++) {
@@@ -268,7 -264,7 +268,7 @@@ static int bt_get(struct blk_mq_alloc_d
        if (tag != -1)
                return tag;
  
-       if (!(data->gfp & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(data->gfp))
                return -1;
  
        bs = bt_wait_ptr(bt, hctx);
@@@ -645,7 -641,6 +645,7 @@@ void blk_mq_free_tags(struct blk_mq_tag
  {
        bt_free(&tags->bitmap_tags);
        bt_free(&tags->breserved_tags);
 +      free_cpumask_var(tags->cpumask);
        kfree(tags);
  }
  
diff --combined block/blk-mq.c
index 27bf3097532d02b0d43228d26bc696362b86cb28,3f3544edb941f169cae54042720c118c40a8563b..34e26163b73a434abf27bae5af8e6b21a6596c20
@@@ -9,7 -9,6 +9,7 @@@
  #include <linux/backing-dev.h>
  #include <linux/bio.h>
  #include <linux/blkdev.h>
 +#include <linux/kmemleak.h>
  #include <linux/mm.h>
  #include <linux/init.h>
  #include <linux/slab.h>
@@@ -78,13 -77,47 +78,13 @@@ static void blk_mq_hctx_clear_pending(s
        clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
  }
  
 -static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 -{
 -      while (true) {
 -              int ret;
 -
 -              if (percpu_ref_tryget_live(&q->mq_usage_counter))
 -                      return 0;
 -
 -              if (!gfpflags_allow_blocking(gfp))
 -                      return -EBUSY;
 -
 -              ret = wait_event_interruptible(q->mq_freeze_wq,
 -                              !atomic_read(&q->mq_freeze_depth) ||
 -                              blk_queue_dying(q));
 -              if (blk_queue_dying(q))
 -                      return -ENODEV;
 -              if (ret)
 -                      return ret;
 -      }
 -}
 -
 -static void blk_mq_queue_exit(struct request_queue *q)
 -{
 -      percpu_ref_put(&q->mq_usage_counter);
 -}
 -
 -static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 -{
 -      struct request_queue *q =
 -              container_of(ref, struct request_queue, mq_usage_counter);
 -
 -      wake_up_all(&q->mq_freeze_wq);
 -}
 -
  void blk_mq_freeze_queue_start(struct request_queue *q)
  {
        int freeze_depth;
  
        freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
        if (freeze_depth == 1) {
 -              percpu_ref_kill(&q->mq_usage_counter);
 +              percpu_ref_kill(&q->q_usage_counter);
                blk_mq_run_hw_queues(q, false);
        }
  }
@@@ -92,34 -125,18 +92,34 @@@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_s
  
  static void blk_mq_freeze_queue_wait(struct request_queue *q)
  {
 -      wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 +      wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
  }
  
  /*
   * Guarantee no request is in use, so we can change any data structure of
   * the queue afterward.
   */
 -void blk_mq_freeze_queue(struct request_queue *q)
 +void blk_freeze_queue(struct request_queue *q)
  {
 +      /*
 +       * In the !blk_mq case we are only calling this to kill the
 +       * q_usage_counter, otherwise this increases the freeze depth
 +       * and waits for it to return to zero.  For this reason there is
 +       * no blk_unfreeze_queue(), and blk_freeze_queue() is not
 +       * exported to drivers as the only user for unfreeze is blk_mq.
 +       */
        blk_mq_freeze_queue_start(q);
        blk_mq_freeze_queue_wait(q);
  }
 +
 +void blk_mq_freeze_queue(struct request_queue *q)
 +{
 +      /*
 +       * ...just an alias to keep freeze and unfreeze actions balanced
 +       * in the blk_mq_* namespace
 +       */
 +      blk_freeze_queue(q);
 +}
  EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
  
  void blk_mq_unfreeze_queue(struct request_queue *q)
        freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
        WARN_ON_ONCE(freeze_depth < 0);
        if (!freeze_depth) {
 -              percpu_ref_reinit(&q->mq_usage_counter);
 +              percpu_ref_reinit(&q->q_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
        }
  }
@@@ -238,17 -255,17 +238,17 @@@ struct request *blk_mq_alloc_request(st
        struct blk_mq_alloc_data alloc_data;
        int ret;
  
 -      ret = blk_mq_queue_enter(q, gfp);
 +      ret = blk_queue_enter(q, gfp);
        if (ret)
                return ERR_PTR(ret);
  
        ctx = blk_mq_get_ctx(q);
        hctx = q->mq_ops->map_queue(q, ctx->cpu);
-       blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+       blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
                        reserved, ctx, hctx);
  
        rq = __blk_mq_alloc_request(&alloc_data, rw);
-       if (!rq && (gfp & __GFP_WAIT)) {
+       if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
                __blk_mq_run_hw_queue(hctx);
                blk_mq_put_ctx(ctx);
  
        }
        blk_mq_put_ctx(ctx);
        if (!rq) {
 -              blk_mq_queue_exit(q);
 +              blk_queue_exit(q);
                return ERR_PTR(-EWOULDBLOCK);
        }
        return rq;
@@@ -280,7 -297,7 +280,7 @@@ static void __blk_mq_free_request(struc
  
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
        blk_mq_put_tag(hctx, tag, &ctx->last_tag);
 -      blk_mq_queue_exit(q);
 +      blk_queue_exit(q);
  }
  
  void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
@@@ -972,25 -989,18 +972,25 @@@ void blk_mq_delay_queue(struct blk_mq_h
  }
  EXPORT_SYMBOL(blk_mq_delay_queue);
  
 -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 -                                  struct request *rq, bool at_head)
 +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 +                                          struct blk_mq_ctx *ctx,
 +                                          struct request *rq,
 +                                          bool at_head)
  {
 -      struct blk_mq_ctx *ctx = rq->mq_ctx;
 -
        trace_block_rq_insert(hctx->queue, rq);
  
        if (at_head)
                list_add(&rq->queuelist, &ctx->rq_list);
        else
                list_add_tail(&rq->queuelist, &ctx->rq_list);
 +}
 +
 +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 +                                  struct request *rq, bool at_head)
 +{
 +      struct blk_mq_ctx *ctx = rq->mq_ctx;
  
 +      __blk_mq_insert_req_list(hctx, ctx, rq, at_head);
        blk_mq_hctx_mark_pending(hctx, ctx);
  }
  
@@@ -1046,9 -1056,8 +1046,9 @@@ static void blk_mq_insert_requests(stru
                rq = list_first_entry(list, struct request, queuelist);
                list_del_init(&rq->queuelist);
                rq->mq_ctx = ctx;
 -              __blk_mq_insert_request(hctx, rq, false);
 +              __blk_mq_insert_req_list(hctx, ctx, rq, false);
        }
 +      blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
  
        blk_mq_run_hw_queue(hctx, from_schedule);
@@@ -1130,7 -1139,7 +1130,7 @@@ static inline bool blk_mq_merge_queue_i
                                         struct blk_mq_ctx *ctx,
                                         struct request *rq, struct bio *bio)
  {
 -      if (!hctx_allow_merges(hctx)) {
 +      if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
                blk_mq_bio_to_request(rq, bio);
                spin_lock(&ctx->lock);
  insert_rq:
@@@ -1167,7 -1176,11 +1167,7 @@@ static struct request *blk_mq_map_reque
        int rw = bio_data_dir(bio);
        struct blk_mq_alloc_data alloc_data;
  
 -      if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
 -              bio_io_error(bio);
 -              return NULL;
 -      }
 -
 +      blk_queue_enter_live(q);
        ctx = blk_mq_get_ctx(q);
        hctx = q->mq_ops->map_queue(q, ctx->cpu);
  
                ctx = blk_mq_get_ctx(q);
                hctx = q->mq_ops->map_queue(q, ctx->cpu);
                blk_mq_set_alloc_data(&alloc_data, q,
-                               __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+                               __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
                rq = __blk_mq_alloc_request(&alloc_data, rw);
                ctx = alloc_data.ctx;
                hctx = alloc_data.hctx;
@@@ -1254,12 -1267,9 +1254,12 @@@ static void blk_mq_make_request(struct 
  
        blk_queue_split(q, &bio, q->bio_split);
  
 -      if (!is_flush_fua && !blk_queue_nomerges(q) &&
 -          blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 -              return;
 +      if (!is_flush_fua && !blk_queue_nomerges(q)) {
 +              if (blk_attempt_plug_merge(q, bio, &request_count,
 +                                         &same_queue_rq))
 +                      return;
 +      } else
 +              request_count = blk_plug_queued_count(q);
  
        rq = blk_mq_map_request(q, bio, &data);
        if (unlikely(!rq))
@@@ -1366,7 -1376,7 +1366,7 @@@ static void blk_sq_make_request(struct 
        plug = current->plug;
        if (plug) {
                blk_mq_bio_to_request(rq, bio);
 -              if (list_empty(&plug->mq_list))
 +              if (!request_count)
                        trace_block_plug(q);
                else if (request_count >= BLK_MAX_REQUEST_COUNT) {
                        blk_flush_plug_list(plug, false);
@@@ -1420,11 -1430,6 +1420,11 @@@ static void blk_mq_free_rq_map(struct b
        while (!list_empty(&tags->page_list)) {
                page = list_first_entry(&tags->page_list, struct page, lru);
                list_del_init(&page->lru);
 +              /*
 +               * Remove kmemleak object previously allocated in
 +               * blk_mq_init_rq_map().
 +               */
 +              kmemleak_free(page_address(page));
                __free_pages(page, page->private);
        }
  
@@@ -1497,11 -1502,6 +1497,11 @@@ static struct blk_mq_tags *blk_mq_init_
                list_add_tail(&page->lru, &tags->page_list);
  
                p = page_address(page);
 +              /*
 +               * Allow kmemleak to scan these pages as they contain pointers
 +               * to additional allocations like via ops->init_request().
 +               */
 +              kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
                entries_per_page = order_to_size(this_order) / rq_size;
                to_do = min(entries_per_page, set->queue_depth - i);
                left -= to_do * rq_size;
@@@ -1989,6 -1989,14 +1989,6 @@@ struct request_queue *blk_mq_init_alloc
                hctxs[i]->queue_num = i;
        }
  
 -      /*
 -       * Init percpu_ref in atomic mode so that it's faster to shutdown.
 -       * See blk_register_queue() for details.
 -       */
 -      if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
 -                          PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
 -              goto err_hctxs;
 -
        setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
  
@@@ -2069,6 -2077,8 +2069,6 @@@ void blk_mq_free_queue(struct request_q
  
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
        blk_mq_free_hw_queues(q, set);
 -
 -      percpu_ref_exit(&q->mq_usage_counter);
  }
  
  /* Basically redo blk_mq_init_queue with queue frozen */
@@@ -2286,8 -2296,10 +2286,8 @@@ void blk_mq_free_tag_set(struct blk_mq_
        int i;
  
        for (i = 0; i < set->nr_hw_queues; i++) {
 -              if (set->tags[i]) {
 +              if (set->tags[i])
                        blk_mq_free_rq_map(set, set->tags[i], i);
 -                      free_cpumask_var(set->tags[i]->cpumask);
 -              }
        }
  
        kfree(set->tags);
diff --combined block/genhd.c
index e5cafa51567c9d589147523c8ab7b43504f9d725,3213b66515f00bcb446a9c9874d24536a2625bfe..ebb41feea35754525761edf815b7e3a994ab0a06
@@@ -630,7 -630,6 +630,7 @@@ void add_disk(struct gendisk *disk
        WARN_ON(retval);
  
        disk_add_events(disk);
 +      blk_integrity_add(disk);
  }
  EXPORT_SYMBOL(add_disk);
  
@@@ -639,7 -638,6 +639,7 @@@ void del_gendisk(struct gendisk *disk
        struct disk_part_iter piter;
        struct hd_struct *part;
  
 +      blk_integrity_del(disk);
        disk_del_events(disk);
  
        /* invalidate stuff */
@@@ -852,7 -850,7 +852,7 @@@ static int show_partition(struct seq_fi
        char buf[BDEVNAME_SIZE];
  
        /* Don't show non-partitionable removable devices or empty devices */
-       if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+       if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
                                   (sgp->flags & GENHD_FL_REMOVABLE)))
                return 0;
        if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
index d3d73d114a4615e124e89bd6d4196ba5be35f415,3dc53a16ed3aaf14dfd30311c16e0dc14f6623a0..9462d27528507d693d8e4efe0e6464597ab1768b
@@@ -29,7 -29,7 +29,7 @@@
  #include <linux/string.h>
  #include <linux/drbd.h>
  #include <linux/slab.h>
 -#include <asm/kmap_types.h>
 +#include <linux/highmem.h>
  
  #include "drbd_int.h"
  
@@@ -1007,7 -1007,7 +1007,7 @@@ static void bm_page_io_async(struct drb
        bm_set_page_unchanged(b->bm_pages[page_nr]);
  
        if (ctx->flags & BM_AIO_COPY_PAGES) {
-               page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+               page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM);
                copy_highpage(page, b->bm_pages[page_nr]);
                bm_store_page_idx(page, page_nr);
        } else
diff --combined drivers/block/nbd.c
index 1b87623381e2b1183b5c9d57c870b7c10924f65e,214de17d0659f54f21e2b3034a46d690f821a063..93b3f99b6865fe721f7124412553cadf3c328e7a
@@@ -60,7 -60,6 +60,7 @@@ struct nbd_device 
        bool disconnect; /* a disconnect has been requested by user */
  
        struct timer_list timeout_timer;
 +      spinlock_t tasks_lock;
        struct task_struct *task_recv;
        struct task_struct *task_send;
  
@@@ -141,23 -140,21 +141,23 @@@ static void sock_shutdown(struct nbd_de
  static void nbd_xmit_timeout(unsigned long arg)
  {
        struct nbd_device *nbd = (struct nbd_device *)arg;
 -      struct task_struct *task;
 +      unsigned long flags;
  
        if (list_empty(&nbd->queue_head))
                return;
  
        nbd->disconnect = true;
  
 -      task = READ_ONCE(nbd->task_recv);
 -      if (task)
 -              force_sig(SIGKILL, task);
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
 +
 +      if (nbd->task_recv)
 +              force_sig(SIGKILL, nbd->task_recv);
  
 -      task = READ_ONCE(nbd->task_send);
 -      if (task)
 +      if (nbd->task_send)
                force_sig(SIGKILL, nbd->task_send);
  
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 +
        dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
  }
  
@@@ -406,24 -403,17 +406,24 @@@ static int nbd_thread_recv(struct nbd_d
  {
        struct request *req;
        int ret;
 +      unsigned long flags;
  
        BUG_ON(nbd->magic != NBD_MAGIC);
  
        sk_set_memalloc(nbd->sock->sk);
  
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_recv = current;
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
  
        ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
        if (ret) {
                dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
 +
 +              spin_lock_irqsave(&nbd->tasks_lock, flags);
                nbd->task_recv = NULL;
 +              spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 +
                return ret;
        }
  
  
        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
  
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_recv = NULL;
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
  
        if (signal_pending(current)) {
-               siginfo_t info;
-               ret = dequeue_signal_lock(current, &current->blocked, &info);
+               ret = kernel_dequeue_signal(NULL);
                dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
                         task_pid_nr(current), current->comm, ret);
                mutex_lock(&nbd->tx_lock);
@@@ -546,11 -532,8 +544,11 @@@ static int nbd_thread_send(void *data
  {
        struct nbd_device *nbd = data;
        struct request *req;
 +      unsigned long flags;
  
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_send = current;
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
  
        set_user_nice(current, MIN_NICE);
        while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
                                         !list_empty(&nbd->waiting_queue));
  
                if (signal_pending(current)) {
-                       siginfo_t info;
-                       int ret;
+                       int ret = kernel_dequeue_signal(NULL);
  
-                       ret = dequeue_signal_lock(current, &current->blocked,
-                                                 &info);
                        dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
                                 task_pid_nr(current), current->comm, ret);
                        mutex_lock(&nbd->tx_lock);
                nbd_handle_req(nbd, req);
        }
  
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_send = NULL;
-       if (signal_pending(current)) {
-               siginfo_t info;
-               dequeue_signal_lock(current, &current->blocked, &info);
-       }
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 +
 +      /* Clear any pending signals */
++      if (signal_pending(current))
++              kernel_dequeue_signal(NULL);
  
        return 0;
  }
@@@ -1075,7 -1047,6 +1068,7 @@@ static int __init nbd_init(void
                nbd_dev[i].magic = NBD_MAGIC;
                INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
                spin_lock_init(&nbd_dev[i].queue_lock);
 +              spin_lock_init(&nbd_dev[i].tasks_lock);
                INIT_LIST_HEAD(&nbd_dev[i].queue_head);
                mutex_init(&nbd_dev[i].tx_lock);
                init_timer(&nbd_dev[i].timeout_timer);
diff --combined drivers/block/pktcdvd.c
index cd813f9110bfc99314604fcf0ae93e58242ac413,5959c2981cc7d18efbbec56615db1d29b2698064..2f477d45d6cfa42d586080db8c293d41406055ae
@@@ -704,14 -704,14 +704,14 @@@ static int pkt_generic_packet(struct pk
        int ret = 0;
  
        rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
-                            WRITE : READ, __GFP_WAIT);
+                            WRITE : READ, __GFP_RECLAIM);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        blk_rq_set_block_pc(rq);
  
        if (cgc->buflen) {
                ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
-                                     __GFP_WAIT);
+                                     __GFP_RECLAIM);
                if (ret)
                        goto out;
        }
@@@ -2803,7 -2803,8 +2803,7 @@@ out_new_dev
  out_mem2:
        put_disk(disk);
  out_mem:
 -      if (pd->rb_pool)
 -              mempool_destroy(pd->rb_pool);
 +      mempool_destroy(pd->rb_pool);
        kfree(pd);
  out_mutex:
        mutex_unlock(&ctl_mutex);
index abeb9af31f9c5e7c02b3f8d64ddf34f01621cfd0,1d47d2e9487c49de9bf58db9d0902a94afbd9a7a..2e10bba4468b0c6b65aee0d07fa279295c52d2b2
@@@ -244,9 -244,8 +244,9 @@@ drm_gem_object_handle_unreference_unloc
   * @filp: drm file-private structure to use for the handle look up
   * @handle: userspace handle to delete
   *
 - * Removes the GEM handle from the @filp lookup table and if this is the last
 - * handle also cleans up linked resources like GEM names.
 + * Removes the GEM handle from the @filp lookup table which has been added with
 + * drm_gem_handle_create(). If this is the last handle also cleans up linked
 + * resources like GEM names.
   */
  int
  drm_gem_handle_delete(struct drm_file *filp, u32 handle)
@@@ -315,10 -314,6 +315,10 @@@ EXPORT_SYMBOL(drm_gem_dumb_destroy)
   * This expects the dev->object_name_lock to be held already and will drop it
   * before returning. Used to avoid races in establishing new handles when
   * importing an object from either an flink name or a dma-buf.
 + *
 + * Handles must be released again through drm_gem_handle_delete(). This is done
 + * when userspace closes @file_priv for all attached handles, or through the
 + * GEM_CLOSE ioctl for individual handles.
   */
  int
  drm_gem_handle_create_tail(struct drm_file *file_priv,
@@@ -496,7 -491,7 +496,7 @@@ struct page **drm_gem_get_pages(struct 
                 * __GFP_DMA32 to be set in mapping_gfp_mask(inode->i_mapping)
                 * so shmem can relocate pages during swapin if required.
                 */
-               BUG_ON((mapping_gfp_mask(mapping) & __GFP_DMA32) &&
+               BUG_ON(mapping_gfp_constraint(mapping, __GFP_DMA32) &&
                                (page_to_pfn(p) >= 0x00100000UL));
        }
  
@@@ -546,17 -541,7 +546,17 @@@ void drm_gem_put_pages(struct drm_gem_o
  }
  EXPORT_SYMBOL(drm_gem_put_pages);
  
 -/** Returns a reference to the object named by the handle. */
 +/**
 + * drm_gem_object_lookup - look up a GEM object from its handle
 + * @dev: DRM device
 + * @filp: DRM file private data
 + * @handle: userspace handle
 + *
 + * Returns:
 + *
 + * A reference to the object named by the handle if such exists on @filp, NULL
 + * otherwise.
 + */
  struct drm_gem_object *
  drm_gem_object_lookup(struct drm_device *dev, struct drm_file *filp,
                      u32 handle)
@@@ -778,8 -763,7 +778,8 @@@ EXPORT_SYMBOL(drm_gem_object_release)
  void
  drm_gem_object_free(struct kref *kref)
  {
 -      struct drm_gem_object *obj = (struct drm_gem_object *) kref;
 +      struct drm_gem_object *obj =
 +              container_of(kref, struct drm_gem_object, refcount);
        struct drm_device *dev = obj->dev;
  
        WARN_ON(!mutex_is_locked(&dev->struct_mutex));
  }
  EXPORT_SYMBOL(drm_gem_object_free);
  
 +/**
 + * drm_gem_vm_open - vma->ops->open implementation for GEM
 + * @vma: VM area structure
 + *
 + * This function implements the #vm_operations_struct open() callback for GEM
 + * drivers. This must be used together with drm_gem_vm_close().
 + */
  void drm_gem_vm_open(struct vm_area_struct *vma)
  {
        struct drm_gem_object *obj = vma->vm_private_data;
  }
  EXPORT_SYMBOL(drm_gem_vm_open);
  
 +/**
 + * drm_gem_vm_close - vma->ops->close implementation for GEM
 + * @vma: VM area structure
 + *
 + * This function implements the #vm_operations_struct close() callback for GEM
 + * drivers. This must be used together with drm_gem_vm_open().
 + */
  void drm_gem_vm_close(struct vm_area_struct *vma)
  {
        struct drm_gem_object *obj = vma->vm_private_data;
@@@ -840,6 -810,8 +840,6 @@@ EXPORT_SYMBOL(drm_gem_vm_close)
   * drm_gem_mmap() prevents unprivileged users from mapping random objects. So
   * callers must verify access restrictions before calling this helper.
   *
 - * NOTE: This function has to be protected with dev->struct_mutex
 - *
   * Return 0 on success or -EINVAL if the object size is smaller than the VMA
   * size, or if no gem_vm_ops are provided.
   */
@@@ -848,6 -820,8 +848,6 @@@ int drm_gem_mmap_obj(struct drm_gem_obj
  {
        struct drm_device *dev = obj->dev;
  
 -      lockdep_assert_held(&dev->struct_mutex);
 -
        /* Check for valid size. */
        if (obj_size < vma->vm_end - vma->vm_start)
                return -EINVAL;
@@@ -891,46 -865,30 +891,46 @@@ int drm_gem_mmap(struct file *filp, str
  {
        struct drm_file *priv = filp->private_data;
        struct drm_device *dev = priv->minor->dev;
 -      struct drm_gem_object *obj;
 +      struct drm_gem_object *obj = NULL;
        struct drm_vma_offset_node *node;
        int ret;
  
        if (drm_device_is_unplugged(dev))
                return -ENODEV;
  
 -      mutex_lock(&dev->struct_mutex);
 +      drm_vma_offset_lock_lookup(dev->vma_offset_manager);
 +      node = drm_vma_offset_exact_lookup_locked(dev->vma_offset_manager,
 +                                                vma->vm_pgoff,
 +                                                vma_pages(vma));
 +      if (likely(node)) {
 +              obj = container_of(node, struct drm_gem_object, vma_node);
 +              /*
 +               * When the object is being freed, after it hits 0-refcnt it
 +               * proceeds to tear down the object. In the process it will
 +               * attempt to remove the VMA offset and so acquire this
 +               * mgr->vm_lock.  Therefore if we find an object with a 0-refcnt
 +               * that matches our range, we know it is in the process of being
 +               * destroyed and will be freed as soon as we release the lock -
 +               * so we have to check for the 0-refcnted object and treat it as
 +               * invalid.
 +               */
 +              if (!kref_get_unless_zero(&obj->refcount))
 +                      obj = NULL;
 +      }
 +      drm_vma_offset_unlock_lookup(dev->vma_offset_manager);
  
 -      node = drm_vma_offset_exact_lookup(dev->vma_offset_manager,
 -                                         vma->vm_pgoff,
 -                                         vma_pages(vma));
 -      if (!node) {
 -              mutex_unlock(&dev->struct_mutex);
 +      if (!obj)
                return -EINVAL;
 -      } else if (!drm_vma_node_is_allowed(node, filp)) {
 -              mutex_unlock(&dev->struct_mutex);
 +
 +      if (!drm_vma_node_is_allowed(node, filp)) {
 +              drm_gem_object_unreference_unlocked(obj);
                return -EACCES;
        }
  
 -      obj = container_of(node, struct drm_gem_object, vma_node);
 -      ret = drm_gem_mmap_obj(obj, drm_vma_node_size(node) << PAGE_SHIFT, vma);
 +      ret = drm_gem_mmap_obj(obj, drm_vma_node_size(node) << PAGE_SHIFT,
 +                             vma);
  
 -      mutex_unlock(&dev->struct_mutex);
 +      drm_gem_object_unreference_unlocked(obj);
  
        return ret;
  }
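A hedged userspace analog of the lookup pattern drm_gem_mmap() switches to above: an object found under the lookup lock is only usable if its refcount can be raised from a non-zero value; a zero refcount means it is mid-teardown and must be treated as absent. The compare-and-swap helper below stands in for kref_get_unless_zero().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        atomic_int refcount;
};

/* Take a reference only if the count is still non-zero. */
static bool get_unless_zero(struct obj *o)
{
        int v = atomic_load(&o->refcount);

        while (v != 0) {
                if (atomic_compare_exchange_weak(&o->refcount, &v, v + 1))
                        return true;            /* reference taken */
        }
        return false;                           /* object is being destroyed */
}

int main(void)
{
        struct obj live  = { .refcount = 1 };
        struct obj dying = { .refcount = 0 };

        printf("live:  %s\n", get_unless_zero(&live)  ? "got ref" : "treat as absent");
        printf("dying: %s\n", get_unless_zero(&dying) ? "got ref" : "treat as absent");
        return 0;
}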
index e57061ac02191dd352d71f72ed0599f58c80b45b,399aab265db3cdd669bced4452d237ad78076b98..5cf4a1998273c3cfcc494c83210c0bc572f35c2e
                if (!needs_clflush_after &&
                    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
                        if (i915_gem_clflush_object(obj, obj->pin_display))
 -                              i915_gem_chipset_flush(dev);
 +                              needs_clflush_after = true;
                }
        }
  
        if (needs_clflush_after)
                i915_gem_chipset_flush(dev);
 +      else
 +              obj->cache_dirty = true;
  
        intel_fb_obj_flush(obj, false, ORIGIN_CPU);
        return ret;
@@@ -1713,8 -1711,8 +1713,8 @@@ i915_gem_mmap_ioctl(struct drm_device *
  
  /**
   * i915_gem_fault - fault a page into the GTT
 - * vma: VMA in question
 - * vmf: fault info
 + * @vma: VMA in question
 + * @vmf: fault info
   *
   * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
   * from userspace.  The fault handler takes care of binding the object to
@@@ -2216,9 -2214,8 +2216,8 @@@ i915_gem_object_get_pages_gtt(struct dr
         * Fail silently without starting the shrinker
         */
        mapping = file_inode(obj->base.filp)->i_mapping;
-       gfp = mapping_gfp_mask(mapping);
-       gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
-       gfp &= ~(__GFP_IO | __GFP_WAIT);
+       gfp = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM));
+       gfp |= __GFP_NORETRY | __GFP_NOWARN;
        sg = st->sgl;
        st->nents = 0;
        for (i = 0; i < page_count; i++) {
@@@ -3208,7 -3205,7 +3207,7 @@@ static void i915_gem_object_finish_gtt(
                                            old_write_domain);
  }
  
 -int i915_vma_unbind(struct i915_vma *vma)
 +static int __i915_vma_unbind(struct i915_vma *vma, bool wait)
  {
        struct drm_i915_gem_object *obj = vma->obj;
        struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
  
        BUG_ON(obj->pages == NULL);
  
 -      ret = i915_gem_object_wait_rendering(obj, false);
 -      if (ret)
 -              return ret;
 -      /* Continue on if we fail due to EIO, the GPU is hung so we
 -       * should be safe and we need to cleanup or else we might
 -       * cause memory corruption through use-after-free.
 -       */
 +      if (wait) {
 +              ret = i915_gem_object_wait_rendering(obj, false);
 +              if (ret)
 +                      return ret;
 +      }
  
        if (i915_is_ggtt(vma->vm) &&
            vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) {
        return 0;
  }
  
 +int i915_vma_unbind(struct i915_vma *vma)
 +{
 +      return __i915_vma_unbind(vma, true);
 +}
 +
 +int __i915_vma_unbind_no_wait(struct i915_vma *vma)
 +{
 +      return __i915_vma_unbind(vma, false);
 +}
 +
  int i915_gpu_idle(struct drm_device *dev)
  {
        struct drm_i915_private *dev_priv = dev->dev_private;
@@@ -3365,10 -3354,11 +3364,10 @@@ i915_gem_object_bind_to_vm(struct drm_i
  {
        struct drm_device *dev = obj->base.dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
 -      u32 size, fence_size, fence_alignment, unfenced_alignment;
 -      u64 start =
 -              flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
 -      u64 end =
 -              flags & PIN_MAPPABLE ? dev_priv->gtt.mappable_end : vm->total;
 +      u32 fence_alignment, unfenced_alignment;
 +      u32 search_flag, alloc_flag;
 +      u64 start, end;
 +      u64 size, fence_size;
        struct i915_vma *vma;
        int ret;
  
                size = flags & PIN_MAPPABLE ? fence_size : obj->base.size;
        }
  
 +      start = flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
 +      end = vm->total;
 +      if (flags & PIN_MAPPABLE)
 +              end = min_t(u64, end, dev_priv->gtt.mappable_end);
 +      if (flags & PIN_ZONE_4G)
 +              end = min_t(u64, end, (1ULL << 32));
 +
        if (alignment == 0)
                alignment = flags & PIN_MAPPABLE ? fence_alignment :
                                                unfenced_alignment;
         * attempt to find space.
         */
        if (size > end) {
 -              DRM_DEBUG("Attempting to bind an object (view type=%u) larger than the aperture: size=%u > %s aperture=%llu\n",
 +              DRM_DEBUG("Attempting to bind an object (view type=%u) larger than the aperture: size=%llu > %s aperture=%llu\n",
                          ggtt_view ? ggtt_view->type : 0,
                          size,
                          flags & PIN_MAPPABLE ? "mappable" : "total",
        if (IS_ERR(vma))
                goto err_unpin;
  
 +      if (flags & PIN_HIGH) {
 +              search_flag = DRM_MM_SEARCH_BELOW;
 +              alloc_flag = DRM_MM_CREATE_TOP;
 +      } else {
 +              search_flag = DRM_MM_SEARCH_DEFAULT;
 +              alloc_flag = DRM_MM_CREATE_DEFAULT;
 +      }
 +
  search_free:
        ret = drm_mm_insert_node_in_range_generic(&vm->mm, &vma->node,
                                                  size, alignment,
                                                  obj->cache_level,
                                                  start, end,
 -                                                DRM_MM_SEARCH_DEFAULT,
 -                                                DRM_MM_CREATE_DEFAULT);
 +                                                search_flag,
 +                                                alloc_flag);
        if (ret) {
                ret = i915_gem_evict_something(dev, vm, size, alignment,
                                               obj->cache_level,
@@@ -3657,117 -3632,59 +3656,117 @@@ i915_gem_object_set_to_gtt_domain(struc
        return 0;
  }
  
 +/**
 + * Changes the cache-level of an object across all VMA.
 + *
 + * After this function returns, the object will be in the new cache-level
 + * across all GTT and the contents of the backing storage will be coherent,
 + * with respect to the new cache-level. In order to keep the backing storage
 + * coherent for all users, we only allow a single cache level to be set
 + * globally on the object and prevent it from being changed whilst the
 + * hardware is reading from the object. That is, if the object is currently
 + * on the scanout it will be set to uncached (or equivalent display
 + * cache coherency) and all non-MOCS GPU access will also be uncached so
 + * that all direct access to the scanout remains coherent.
 + */
  int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
                                    enum i915_cache_level cache_level)
  {
        struct drm_device *dev = obj->base.dev;
        struct i915_vma *vma, *next;
 -      int ret;
 +      bool bound = false;
 +      int ret = 0;
  
        if (obj->cache_level == cache_level)
 -              return 0;
 -
 -      if (i915_gem_obj_is_pinned(obj)) {
 -              DRM_DEBUG("can not change the cache level of pinned objects\n");
 -              return -EBUSY;
 -      }
 +              goto out;
  
 +      /* Inspect the list of currently bound VMA and unbind any that would
 +       * be invalid given the new cache-level. This is principally to
 +       * catch the issue of the CS prefetch crossing page boundaries and
 +       * reading an invalid PTE on older architectures.
 +       */
        list_for_each_entry_safe(vma, next, &obj->vma_list, vma_link) {
 +              if (!drm_mm_node_allocated(&vma->node))
 +                      continue;
 +
 +              if (vma->pin_count) {
 +                      DRM_DEBUG("can not change the cache level of pinned objects\n");
 +                      return -EBUSY;
 +              }
 +
                if (!i915_gem_valid_gtt_space(vma, cache_level)) {
                        ret = i915_vma_unbind(vma);
                        if (ret)
                                return ret;
 -              }
 +              } else
 +                      bound = true;
        }
  
 -      if (i915_gem_obj_bound_any(obj)) {
 +      /* We can reuse the existing drm_mm nodes but need to change the
 +       * cache-level on the PTE. We could simply unbind them all and
 +       * rebind with the correct cache-level on next use. However since
 +       * we already have a valid slot, dma mapping, pages etc, we may as
 +       * we already have a valid slot, dma mapping, pages etc, we may as well
 +       * state and so involves less work.
 +       */
 +      if (bound) {
 +              /* Before we change the PTE, the GPU must not be accessing it.
 +               * If we wait upon the object, we know that all the bound
 +               * VMA are no longer active.
 +               */
                ret = i915_gem_object_wait_rendering(obj, false);
                if (ret)
                        return ret;
  
 -              i915_gem_object_finish_gtt(obj);
 -
 -              /* Before SandyBridge, you could not use tiling or fence
 -               * registers with snooped memory, so relinquish any fences
 -               * currently pointing to our region in the aperture.
 -               */
 -              if (INTEL_INFO(dev)->gen < 6) {
 +              if (!HAS_LLC(dev) && cache_level != I915_CACHE_NONE) {
 +                      /* Access to snoopable pages through the GTT is
 +                       * incoherent and on some machines causes a hard
 +                       * lockup. Relinquish the CPU mmapping to force
 +                       * userspace to refault in the pages and we can
 +                       * then double check if the GTT mapping is still
 +                       * valid for that pointer access.
 +                       */
 +                      i915_gem_release_mmap(obj);
 +
 +                      /* As we no longer need a fence for GTT access,
 +                       * we can relinquish it now (and so prevent having
 +                       * to steal a fence from someone else on the next
 +                       * fence request). Note GPU activity would have
 +                       * dropped the fence as all snoopable access is
 +                       * supposed to be linear.
 +                       */
                        ret = i915_gem_object_put_fence(obj);
                        if (ret)
                                return ret;
 +              } else {
 +                      /* We either have incoherent backing store and
 +                       * so no GTT access or the architecture is fully
 +                       * coherent. In such cases, existing GTT mmaps
 +                       * ignore the cache bit in the PTE and we can
 +                       * rewrite it without confusing the GPU or having
 +                       * to force userspace to fault back in its mmaps.
 +                       */
                }
  
 -              list_for_each_entry(vma, &obj->vma_list, vma_link)
 -                      if (drm_mm_node_allocated(&vma->node)) {
 -                              ret = i915_vma_bind(vma, cache_level,
 -                                                  PIN_UPDATE);
 -                              if (ret)
 -                                      return ret;
 -                      }
 +              list_for_each_entry(vma, &obj->vma_list, vma_link) {
 +                      if (!drm_mm_node_allocated(&vma->node))
 +                              continue;
 +
 +                      ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
 +                      if (ret)
 +                              return ret;
 +              }
        }
  
        list_for_each_entry(vma, &obj->vma_list, vma_link)
                vma->node.color = cache_level;
        obj->cache_level = cache_level;
  
 +out:
 +      /* Flush the dirty CPU caches to the backing storage so that the
 +       * object is now coherent at its new cache level (with respect
 +       * to the access domain).
 +       */
        if (obj->cache_dirty &&
            obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
            cpu_write_needs_clflush(obj)) {
@@@ -3820,15 -3737,6 +3819,15 @@@ int i915_gem_set_caching_ioctl(struct d
                level = I915_CACHE_NONE;
                break;
        case I915_CACHING_CACHED:
 +              /*
 +               * Due to a HW issue on BXT A stepping, GPU stores via a
 +               * snooped mapping may leave stale data in a corresponding CPU
 +               * cacheline, whereas normally such cachelines would get
 +               * invalidated.
 +               */
 +              if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0)
 +                      return -ENODEV;
 +
                level = I915_CACHE_LLC;
                break;
        case I915_CACHING_DISPLAY:
@@@ -4102,13 -4010,15 +4101,13 @@@ i915_gem_object_do_pin(struct drm_i915_
                        return -EBUSY;
  
                if (i915_vma_misplaced(vma, alignment, flags)) {
 -                      unsigned long offset;
 -                      offset = ggtt_view ? i915_gem_obj_ggtt_offset_view(obj, ggtt_view) :
 -                                           i915_gem_obj_offset(obj, vm);
                        WARN(vma->pin_count,
                             "bo is already pinned in %s with incorrect alignment:"
 -                           " offset=%lx, req.alignment=%x, req.map_and_fenceable=%d,"
 +                           " offset=%08x %08x, req.alignment=%x, req.map_and_fenceable=%d,"
                             " obj->map_and_fenceable=%d\n",
                             ggtt_view ? "ggtt" : "ppgtt",
 -                           offset,
 +                           upper_32_bits(vma->node.start),
 +                           lower_32_bits(vma->node.start),
                             alignment,
                             !!(flags & PIN_MAPPABLE),
                             obj->map_and_fenceable);
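For reference, the new "%08x %08x" pair is just the 64-bit node offset split with the linux/kernel.h helpers; a small illustration with a hypothetical offset:

        u64 offset = 0x0000000123456000ULL;     /* hypothetical GTT offset */
        u32 hi = upper_32_bits(offset);         /* 0x00000001 */
        u32 lo = lower_32_bits(offset);         /* 0x23456000 */
        /* the WARN above then prints "offset=00000001 23456000" */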
@@@ -4615,6 -4525,22 +4614,6 @@@ void i915_gem_init_swizzling(struct drm
                BUG();
  }
  
 -static bool
 -intel_enable_blt(struct drm_device *dev)
 -{
 -      if (!HAS_BLT(dev))
 -              return false;
 -
 -      /* The blitter was dysfunctional on early prototypes */
 -      if (IS_GEN6(dev) && dev->pdev->revision < 8) {
 -              DRM_INFO("BLT not supported on this pre-production hardware;"
 -                       " graphics performance will be degraded.\n");
 -              return false;
 -      }
 -
 -      return true;
 -}
 -
  static void init_unused_ring(struct drm_device *dev, u32 base)
  {
        struct drm_i915_private *dev_priv = dev->dev_private;
@@@ -4657,7 -4583,7 +4656,7 @@@ int i915_gem_init_rings(struct drm_devi
                        goto cleanup_render_ring;
        }
  
 -      if (intel_enable_blt(dev)) {
 +      if (HAS_BLT(dev)) {
                ret = intel_init_blt_ring_buffer(dev);
                if (ret)
                        goto cleanup_bsd_ring;
                        goto cleanup_vebox_ring;
        }
  
 -      ret = i915_gem_set_seqno(dev, ((u32)~0 - 0x1000));
 -      if (ret)
 -              goto cleanup_bsd2_ring;
 -
        return 0;
  
 -cleanup_bsd2_ring:
 -      intel_cleanup_ring_buffer(&dev_priv->ring[VCS2]);
  cleanup_vebox_ring:
        intel_cleanup_ring_buffer(&dev_priv->ring[VECS]);
  cleanup_blt_ring:
@@@ -4746,33 -4678,6 +4745,33 @@@ i915_gem_init_hw(struct drm_device *dev
                        goto out;
        }
  
 +      /* We can't enable contexts until all firmware is loaded */
 +      if (HAS_GUC_UCODE(dev)) {
 +              ret = intel_guc_ucode_load(dev);
 +              if (ret) {
 +                      /*
 +                       * If we got an error and GuC submission is enabled, map
 +                       * the error to -EIO so the GPU will be declared wedged.
 +                       * OTOH, if we didn't intend to use the GuC anyway, just
 +                       * discard the error and carry on.
 +                       */
 +                      DRM_ERROR("Failed to initialize GuC, error %d%s\n", ret,
 +                                i915.enable_guc_submission ? "" :
 +                                " (ignored)");
 +                      ret = i915.enable_guc_submission ? -EIO : 0;
 +                      if (ret)
 +                              goto out;
 +              }
 +      }
 +
 +      /*
 +       * Increment the next seqno by 0x100 so we have a visible break
 +       * on re-initialisation
 +       */
 +      ret = i915_gem_set_seqno(dev, dev_priv->next_seqno+0x100);
 +      if (ret)
 +              goto out;
 +
        /* Now it is safe to go back round and do everything else: */
        for_each_ring(ring, dev_priv, i) {
                struct drm_i915_gem_request *req;
@@@ -4910,6 -4815,18 +4909,6 @@@ init_ring_lists(struct intel_engine_cs 
        INIT_LIST_HEAD(&ring->request_list);
  }
  
 -void i915_init_vm(struct drm_i915_private *dev_priv,
 -                struct i915_address_space *vm)
 -{
 -      if (!i915_is_ggtt(vm))
 -              drm_mm_init(&vm->mm, vm->start, vm->total);
 -      vm->dev = dev_priv->dev;
 -      INIT_LIST_HEAD(&vm->active_list);
 -      INIT_LIST_HEAD(&vm->inactive_list);
 -      INIT_LIST_HEAD(&vm->global_link);
 -      list_add_tail(&vm->global_link, &dev_priv->vm_list);
 -}
 -
  void
  i915_gem_load(struct drm_device *dev)
  {
                                  NULL);
  
        INIT_LIST_HEAD(&dev_priv->vm_list);
 -      i915_init_vm(dev_priv, &dev_priv->gtt.base);
 -
        INIT_LIST_HEAD(&dev_priv->context_list);
        INIT_LIST_HEAD(&dev_priv->mm.unbound_list);
        INIT_LIST_HEAD(&dev_priv->mm.bound_list);
                dev_priv->num_fence_regs =
                                I915_READ(vgtif_reg(avail_rs.fence_num));
  
 +      /*
 +       * Set initial sequence number for requests.
 +       * Using this number allows the wraparound to happen early,
 +       * catching any obvious problems.
 +       */
 +      dev_priv->next_seqno = ((u32)~0 - 0x1100);
 +      dev_priv->last_seqno = ((u32)~0 - 0x1101);
 +
        /* Initialize fence registers to zero */
        INIT_LIST_HEAD(&dev_priv->mm.fence_list);
        i915_gem_restore_fences(dev);
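These two constants start the request counter just below the top of the u32 range, so the sequence number wraps almost immediately in normal use; a rough arithmetic sketch (values taken from the hunks above):

        u32 seqno = (u32)~0 - 0x1100;   /* 0xffffeeff, initial next_seqno */
        seqno += 0x100;                 /* the visible break added in init_hw */
        /* after roughly 0x1000 further requests the counter crosses
         * 0xffffffff and wraps, exercising the wraparound paths early */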
@@@ -5037,9 -4948,9 +5036,9 @@@ int i915_gem_open(struct drm_device *de
  
  /**
   * i915_gem_track_fb - update frontbuffer tracking
 - * old: current GEM buffer for the frontbuffer slots
 - * new: new GEM buffer for the frontbuffer slots
 - * frontbuffer_bits: bitmask of frontbuffer slots
 + * @old: current GEM buffer for the frontbuffer slots
 + * @new: new GEM buffer for the frontbuffer slots
 + * @frontbuffer_bits: bitmask of frontbuffer slots
   *
   * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
   * from @old and setting them in @new. Both @old and @new can be NULL.
@@@ -5062,8 -4973,9 +5061,8 @@@ void i915_gem_track_fb(struct drm_i915_
  }
  
  /* All the new VM stuff */
 -unsigned long
 -i915_gem_obj_offset(struct drm_i915_gem_object *o,
 -                  struct i915_address_space *vm)
 +u64 i915_gem_obj_offset(struct drm_i915_gem_object *o,
 +                      struct i915_address_space *vm)
  {
        struct drm_i915_private *dev_priv = o->base.dev->dev_private;
        struct i915_vma *vma;
        return -1;
  }
  
 -unsigned long
 -i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
 -                            const struct i915_ggtt_view *view)
 +u64 i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
 +                                const struct i915_ggtt_view *view)
  {
        struct i915_address_space *ggtt = i915_obj_to_ggtt(o);
        struct i915_vma *vma;
index dcdaa79e3f0faa0dcd9288a0f14cf9e4e996210c,59ab264c99c4f4a87c92cf654f14cc0806539cd6..2aba774f835b9caca8e9e1645d1efd6cf6f08bf9
@@@ -1007,29 -1007,26 +1007,29 @@@ int ib_init_ah_from_path(struct ib_devi
        force_grh = rdma_cap_eth_ah(device, port_num);
  
        if (rec->hop_limit > 1 || force_grh) {
 +              struct net_device *ndev = ib_get_ndev_from_path(rec);
 +
                ah_attr->ah_flags = IB_AH_GRH;
                ah_attr->grh.dgid = rec->dgid;
  
 -              ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
 +              ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num,
                                         &gid_index);
 -              if (ret)
 +              if (ret) {
 +                      if (ndev)
 +                              dev_put(ndev);
                        return ret;
 +              }
  
                ah_attr->grh.sgid_index    = gid_index;
                ah_attr->grh.flow_label    = be32_to_cpu(rec->flow_label);
                ah_attr->grh.hop_limit     = rec->hop_limit;
                ah_attr->grh.traffic_class = rec->traffic_class;
 +              if (ndev)
 +                      dev_put(ndev);
        }
        if (force_grh) {
                memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
 -              ah_attr->vlan_id = rec->vlan_id;
 -      } else {
 -              ah_attr->vlan_id = 0xffff;
        }
 -
        return 0;
  }
  EXPORT_SYMBOL(ib_init_ah_from_path);
@@@ -1086,7 -1083,7 +1086,7 @@@ static void init_mad(struct ib_sa_mad *
  
  static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
  {
-       bool preload = !!(gfp_mask & __GFP_WAIT);
+       bool preload = gfpflags_allow_blocking(gfp_mask);
        unsigned long flags;
        int ret, id;
  
@@@ -1153,9 -1150,9 +1153,9 @@@ static void ib_sa_path_rec_callback(str
  
                ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
                          mad->data, &rec);
 -              rec.vlan_id = 0xffff;
 +              rec.net = NULL;
 +              rec.ifindex = 0;
                memset(rec.dmac, 0, ETH_ALEN);
 -              memset(rec.smac, 0, ETH_ALEN);
                query->callback(status, &rec, query->context);
        } else
                query->callback(status, NULL, query->context);
index 0d533bba4ad18097e447be3dca3fa41e5d9abeab,3aa0116c843c050490b675498ac09e825ba78807..8b2be1e7714f8bb7aa1d62193d3b3320fab64414
@@@ -89,6 -89,8 +89,6 @@@ static struct dma_map_ops amd_iommu_dma
  struct iommu_dev_data {
        struct list_head list;            /* For domain->dev_list */
        struct list_head dev_data_list;   /* For global dev_data_list */
 -      struct list_head alias_list;      /* Link alias-groups together */
 -      struct iommu_dev_data *alias_data;/* The alias dev_data */
        struct protection_domain *domain; /* Domain the device is bound to */
        u16 devid;                        /* PCI Device ID */
        bool iommu_v2;                    /* Device can make use of IOMMUv2 */
@@@ -134,6 -136,8 +134,6 @@@ static struct iommu_dev_data *alloc_dev
        if (!dev_data)
                return NULL;
  
 -      INIT_LIST_HEAD(&dev_data->alias_list);
 -
        dev_data->devid = devid;
  
        spin_lock_irqsave(&dev_data_list_lock, flags);
        return dev_data;
  }
  
 -static void free_dev_data(struct iommu_dev_data *dev_data)
 -{
 -      unsigned long flags;
 -
 -      spin_lock_irqsave(&dev_data_list_lock, flags);
 -      list_del(&dev_data->dev_data_list);
 -      spin_unlock_irqrestore(&dev_data_list_lock, flags);
 -
 -      kfree(dev_data);
 -}
 -
  static struct iommu_dev_data *search_dev_data(u16 devid)
  {
        struct iommu_dev_data *dev_data;
@@@ -296,10 -311,73 +296,10 @@@ out
        iommu_group_put(group);
  }
  
 -static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
 -{
 -      *(u16 *)data = alias;
 -      return 0;
 -}
 -
 -static u16 get_alias(struct device *dev)
 -{
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      u16 devid, ivrs_alias, pci_alias;
 -
 -      devid = get_device_id(dev);
 -      ivrs_alias = amd_iommu_alias_table[devid];
 -      pci_for_each_dma_alias(pdev, __last_alias, &pci_alias);
 -
 -      if (ivrs_alias == pci_alias)
 -              return ivrs_alias;
 -
 -      /*
 -       * DMA alias showdown
 -       *
 -       * The IVRS is fairly reliable in telling us about aliases, but it
 -       * can't know about every screwy device.  If we don't have an IVRS
 -       * reported alias, use the PCI reported alias.  In that case we may
 -       * still need to initialize the rlookup and dev_table entries if the
 -       * alias is to a non-existent device.
 -       */
 -      if (ivrs_alias == devid) {
 -              if (!amd_iommu_rlookup_table[pci_alias]) {
 -                      amd_iommu_rlookup_table[pci_alias] =
 -                              amd_iommu_rlookup_table[devid];
 -                      memcpy(amd_iommu_dev_table[pci_alias].data,
 -                             amd_iommu_dev_table[devid].data,
 -                             sizeof(amd_iommu_dev_table[pci_alias].data));
 -              }
 -
 -              return pci_alias;
 -      }
 -
 -      pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d "
 -              "for device %s[%04x:%04x], kernel reported alias "
 -              "%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias),
 -              PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device,
 -              PCI_BUS_NUM(pci_alias), PCI_SLOT(pci_alias),
 -              PCI_FUNC(pci_alias));
 -
 -      /*
 -       * If we don't have a PCI DMA alias and the IVRS alias is on the same
 -       * bus, then the IVRS table may know about a quirk that we don't.
 -       */
 -      if (pci_alias == devid &&
 -          PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) {
 -              pdev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
 -              pdev->dma_alias_devfn = ivrs_alias & 0xff;
 -              pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n",
 -                      PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias),
 -                      dev_name(dev));
 -      }
 -
 -      return ivrs_alias;
 -}
 -
  static int iommu_init_device(struct device *dev)
  {
        struct pci_dev *pdev = to_pci_dev(dev);
        struct iommu_dev_data *dev_data;
 -      u16 alias;
  
        if (dev->archdata.iommu)
                return 0;
        if (!dev_data)
                return -ENOMEM;
  
 -      alias = get_alias(dev);
 -
 -      if (alias != dev_data->devid) {
 -              struct iommu_dev_data *alias_data;
 -
 -              alias_data = find_dev_data(alias);
 -              if (alias_data == NULL) {
 -                      pr_err("AMD-Vi: Warning: Unhandled device %s\n",
 -                                      dev_name(dev));
 -                      free_dev_data(dev_data);
 -                      return -ENOTSUPP;
 -              }
 -              dev_data->alias_data = alias_data;
 -
 -              /* Add device to the alias_list */
 -              list_add(&dev_data->alias_list, &alias_data->alias_list);
 -      }
 -
        if (pci_iommuv2_capable(pdev)) {
                struct amd_iommu *iommu;
  
@@@ -349,6 -445,9 +349,6 @@@ static void iommu_uninit_device(struct 
  
        iommu_group_remove_device(dev);
  
 -      /* Unlink from alias, it may change if another device is re-plugged */
 -      dev_data->alias_data = NULL;
 -
        /* Remove dma-ops */
        dev->archdata.dma_ops = NULL;
  
@@@ -534,7 -633,7 +534,7 @@@ static void iommu_poll_events(struct am
  
        while (head != tail) {
                iommu_print_event(iommu, iommu->evt_buf + head);
 -              head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
 +              head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
        }
  
        writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
@@@ -684,7 -783,7 +684,7 @@@ static void copy_cmd_to_buffer(struct a
        u8 *target;
  
        target = iommu->cmd_buf + tail;
 -      tail   = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
 +      tail   = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
  
        /* Copy command to buffer */
        memcpy(target, cmd, sizeof(*cmd));
@@@ -851,13 -950,15 +851,13 @@@ static int iommu_queue_command_sync(str
        u32 left, tail, head, next_tail;
        unsigned long flags;
  
 -      WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
 -
  again:
        spin_lock_irqsave(&iommu->lock, flags);
  
        head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
        tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 -      next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
 -      left      = (head - next_tail) % iommu->cmd_buf_size;
 +      next_tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
 +      left      = (head - next_tail) % CMD_BUFFER_SIZE;
  
        if (left <= 2) {
                struct iommu_cmd sync_cmd;
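With the command queue now a fixed compile-time size, the free-space computation no longer needs the per-iommu cmd_buf_size. A standalone sketch of the same ring arithmetic (names are illustrative; it assumes the buffer size is a power of two, as CMD_BUFFER_SIZE is):

#define RING_SIZE       8192    /* stand-in for CMD_BUFFER_SIZE */

static unsigned int ring_space_left(unsigned int head, unsigned int tail,
                                    unsigned int entry_size)
{
        unsigned int next_tail = (tail + entry_size) % RING_SIZE;

        /* unsigned wrap-around plus the modulo yields the bytes left
         * between the new tail and the hardware's read pointer */
        return (head - next_tail) % RING_SIZE;
}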
@@@ -1013,15 -1114,11 +1013,15 @@@ static int device_flush_iotlb(struct io
  static int device_flush_dte(struct iommu_dev_data *dev_data)
  {
        struct amd_iommu *iommu;
 +      u16 alias;
        int ret;
  
        iommu = amd_iommu_rlookup_table[dev_data->devid];
 +      alias = amd_iommu_alias_table[dev_data->devid];
  
        ret = iommu_flush_dte(iommu, dev_data->devid);
 +      if (!ret && alias != dev_data->devid)
 +              ret = iommu_flush_dte(iommu, alias);
        if (ret)
                return ret;
  
@@@ -1877,8 -1974,8 +1877,8 @@@ static void set_dte_entry(u16 devid, st
  static void clear_dte_entry(u16 devid)
  {
        /* remove entry from the device table seen by the hardware */
 -      amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
 -      amd_iommu_dev_table[devid].data[1] = 0;
 +      amd_iommu_dev_table[devid].data[0]  = IOMMU_PTE_P | IOMMU_PTE_TV;
 +      amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;
  
        amd_iommu_apply_erratum_63(devid);
  }
@@@ -1887,33 -1984,27 +1887,33 @@@ static void do_attach(struct iommu_dev_
                      struct protection_domain *domain)
  {
        struct amd_iommu *iommu;
 +      u16 alias;
        bool ats;
  
        iommu = amd_iommu_rlookup_table[dev_data->devid];
 +      alias = amd_iommu_alias_table[dev_data->devid];
        ats   = dev_data->ats.enabled;
  
        /* Update data structures */
        dev_data->domain = domain;
        list_add(&dev_data->list, &domain->dev_list);
 -      set_dte_entry(dev_data->devid, domain, ats);
  
        /* Do reference counting */
        domain->dev_iommu[iommu->index] += 1;
        domain->dev_cnt                 += 1;
  
 -      /* Flush the DTE entry */
 +      /* Update device table */
 +      set_dte_entry(dev_data->devid, domain, ats);
 +      if (alias != dev_data->devid)
 +              set_dte_entry(alias, domain, ats);
 +
        device_flush_dte(dev_data);
  }
  
  static void do_detach(struct iommu_dev_data *dev_data)
  {
        struct amd_iommu *iommu;
 +      u16 alias;
  
        /*
         * First check if the device is still attached. It might already
                return;
  
        iommu = amd_iommu_rlookup_table[dev_data->devid];
 +      alias = amd_iommu_alias_table[dev_data->devid];
  
        /* decrease reference counters */
        dev_data->domain->dev_iommu[iommu->index] -= 1;
        dev_data->domain = NULL;
        list_del(&dev_data->list);
        clear_dte_entry(dev_data->devid);
 +      if (alias != dev_data->devid)
 +              clear_dte_entry(alias);
  
        /* Flush the DTE entry */
        device_flush_dte(dev_data);
  static int __attach_device(struct iommu_dev_data *dev_data,
                           struct protection_domain *domain)
  {
 -      struct iommu_dev_data *head, *entry;
        int ret;
  
 +      /*
 +       * Must be called with IRQs disabled. Warn here to detect early
 +       * when it's not.
 +       */
 +      WARN_ON(!irqs_disabled());
 +
        /* lock domain */
        spin_lock(&domain->lock);
  
 -      head = dev_data;
 -
 -      if (head->alias_data != NULL)
 -              head = head->alias_data;
 -
 -      /* Now we have the root of the alias group, if any */
 -
        ret = -EBUSY;
 -      if (head->domain != NULL)
 +      if (dev_data->domain != NULL)
                goto out_unlock;
  
        /* Attach alias group root */
 -      do_attach(head, domain);
 -
 -      /* Attach other devices in the alias group */
 -      list_for_each_entry(entry, &head->alias_list, alias_list)
 -              do_attach(entry, domain);
 +      do_attach(dev_data, domain);
  
        ret = 0;
  
@@@ -2115,24 -2209,26 +2115,24 @@@ static int attach_device(struct device 
   */
  static void __detach_device(struct iommu_dev_data *dev_data)
  {
 -      struct iommu_dev_data *head, *entry;
        struct protection_domain *domain;
 -      unsigned long flags;
  
 -      BUG_ON(!dev_data->domain);
 -
 -      domain = dev_data->domain;
 +      /*
 +       * Must be called with IRQs disabled. Warn here to detect early
 +       * when its not.
 +       * when it's not.
 +      WARN_ON(!irqs_disabled());
  
 -      spin_lock_irqsave(&domain->lock, flags);
 +      if (WARN_ON(!dev_data->domain))
 +              return;
  
 -      head = dev_data;
 -      if (head->alias_data != NULL)
 -              head = head->alias_data;
 +      domain = dev_data->domain;
  
 -      list_for_each_entry(entry, &head->alias_list, alias_list)
 -              do_detach(entry);
 +      spin_lock(&domain->lock);
  
 -      do_detach(head);
 +      do_detach(dev_data);
  
 -      spin_unlock_irqrestore(&domain->lock, flags);
 +      spin_unlock(&domain->lock);
  }
  
  /*
@@@ -2668,7 -2764,7 +2668,7 @@@ static void *alloc_coherent(struct devi
  
        page = alloc_pages(flag | __GFP_NOWARN,  get_order(size));
        if (!page) {
-               if (!(flag & __GFP_WAIT))
+               if (!gfpflags_allow_blocking(flag))
                        return NULL;
  
                page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
@@@ -3102,7 -3198,6 +3102,7 @@@ static const struct iommu_ops amd_iommu
        .iova_to_phys = amd_iommu_iova_to_phys,
        .add_device = amd_iommu_add_device,
        .remove_device = amd_iommu_remove_device,
 +      .device_group = pci_device_group,
        .get_dm_regions = amd_iommu_get_dm_regions,
        .put_dm_regions = amd_iommu_put_dm_regions,
        .pgsize_bitmap  = AMD_IOMMU_PGSIZES,
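Taken together, the amd_iommu changes above drop the linked alias-group bookkeeping: the alias is looked up from amd_iommu_alias_table[] wherever it is needed, and each device-table update is applied to both IDs. A condensed sketch of the pattern (mirroring do_attach/do_detach, not a drop-in function):

        u16 devid = dev_data->devid;
        u16 alias = amd_iommu_alias_table[devid];

        set_dte_entry(devid, domain, ats);
        if (alias != devid)
                set_dte_entry(alias, domain, ats);      /* keep the alias DTE in sync */

        device_flush_dte(dev_data);     /* flushes devid and, when different, the alias */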
index 7cf80c1a8a1607f2d2ea675b270600bed70c6ad7,0c139f63b8bedd5201370289ebba66ad742ed87c..f1042daef9ada83e931ae450623ce491ebd55959
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/mempool.h>
  #include <linux/memory.h>
  #include <linux/timer.h>
 +#include <linux/io.h>
  #include <linux/iova.h>
  #include <linux/iommu.h>
  #include <linux/intel-iommu.h>
@@@ -419,13 -418,10 +419,13 @@@ struct device_domain_info 
        struct list_head global; /* link to global list */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
 -      struct {
 -              u8 enabled:1;
 -              u8 qdep;
 -      } ats;                  /* ATS state */
 +      u8 pasid_supported:3;
 +      u8 pasid_enabled:1;
 +      u8 pri_supported:1;
 +      u8 pri_enabled:1;
 +      u8 ats_supported:1;
 +      u8 ats_enabled:1;
 +      u8 ats_qdep;
        struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
@@@ -501,37 -497,13 +501,37 @@@ static int dmar_forcedac
  static int intel_iommu_strict;
  static int intel_iommu_superpage = 1;
  static int intel_iommu_ecs = 1;
 +static int intel_iommu_pasid28;
 +static int iommu_identity_mapping;
 +
 +#define IDENTMAP_ALL          1
 +#define IDENTMAP_GFX          2
 +#define IDENTMAP_AZALIA               4
  
 -/* We only actually use ECS when PASID support (on the new bit 40)
 - * is also advertised. Some early implementations — the ones with
 - * PASID support on bit 28 — have issues even when we *only* use
 - * extended root/context tables. */
 +/* Broadwell and Skylake have broken ECS support — normal so-called "second
 + * level" translation of DMA requests-without-PASID doesn't actually happen
 + * unless you also set the NESTE bit in an extended context-entry. Which of
 + * course means that SVM doesn't work because it's trying to do nested
 + * translation of the physical addresses it finds in the process page tables,
 + * through the IOVA->phys mapping found in the "second level" page tables.
 + *
 + * The VT-d specification was retroactively changed to change the definition
 + * of the capability bits and pretend that Broadwell/Skylake never happened...
 + * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 + * for some reason it was the PASID capability bit which was redefined (from
 + * bit 28 on BDW/SKL to bit 40 in future).
 + *
 + * So our test for ECS needs to eschew those implementations which set the old
 + * PASID capability bit 28, since those are the ones on which ECS is broken.
 + * Unless we are working around the 'pasid28' limitations, that is, by putting
 + * the device into passthrough mode for normal DMA and thus masking the bug.
 + */
  #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 -                          ecap_pasid(iommu->ecap))
 +                          (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 +/* PASID support is thus enabled if ECS is enabled and *either* of the old
 + * or new capability bits are set. */
 +#define pasid_enabled(iommu) (ecs_enabled(iommu) &&                   \
 +                            (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
  
  int intel_iommu_gfx_mapped;
  EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
@@@ -594,11 -566,6 +594,11 @@@ static int __init intel_iommu_setup(cha
                        printk(KERN_INFO
                                "Intel-IOMMU: disable extended context table support\n");
                        intel_iommu_ecs = 0;
 +              } else if (!strncmp(str, "pasid28", 7)) {
 +                      printk(KERN_INFO
 +                              "Intel-IOMMU: enable pre-production PASID support\n");
 +                      intel_iommu_pasid28 = 1;
 +                      iommu_identity_mapping |= IDENTMAP_GFX;
                }
  
                str += strcspn(str, ",");
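The workaround is opt-in from the kernel command line; with this parser, something along these lines would enable it (illustrative, and per the code above it also forces identity mapping for the graphics device):

        intel_iommu=on,pasid28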
@@@ -1440,22 -1407,37 +1440,22 @@@ static struct device_domain_info 
  iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
                         u8 bus, u8 devfn)
  {
 -      bool found = false;
        struct device_domain_info *info;
 -      struct pci_dev *pdev;
  
        assert_spin_locked(&device_domain_lock);
  
 -      if (!ecap_dev_iotlb_support(iommu->ecap))
 -              return NULL;
 -
        if (!iommu->qi)
                return NULL;
  
        list_for_each_entry(info, &domain->devices, link)
                if (info->iommu == iommu && info->bus == bus &&
                    info->devfn == devfn) {
 -                      found = true;
 +                      if (info->ats_supported && info->dev)
 +                              return info;
                        break;
                }
  
 -      if (!found || !info->dev || !dev_is_pci(info->dev))
 -              return NULL;
 -
 -      pdev = to_pci_dev(info->dev);
 -
 -      if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
 -              return NULL;
 -
 -      if (!dmar_find_matched_atsr_unit(pdev))
 -              return NULL;
 -
 -      return info;
 +      return NULL;
  }
  
  static void iommu_enable_dev_iotlb(struct device_domain_info *info)
                return;
  
        pdev = to_pci_dev(info->dev);
 -      if (pci_enable_ats(pdev, VTD_PAGE_SHIFT))
 -              return;
  
 -      info->ats.enabled = 1;
 -      info->ats.qdep = pci_ats_queue_depth(pdev);
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      /* The PCIe spec, in its wisdom, declares that the behaviour of
 +         the device if you enable PASID support after ATS support is
 +         undefined. So always enable PASID support on devices which
 +         have it, even if we can't yet know if we're ever going to
 +         use it. */
 +      if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
 +              info->pasid_enabled = 1;
 +
 +      if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
 +              info->pri_enabled = 1;
 +#endif
 +      if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
 +              info->ats_enabled = 1;
 +              info->ats_qdep = pci_ats_queue_depth(pdev);
 +      }
  }
  
  static void iommu_disable_dev_iotlb(struct device_domain_info *info)
  {
 -      if (!info->ats.enabled)
 +      struct pci_dev *pdev;
 +
 +      if (!dev_is_pci(info->dev))
                return;
  
 -      pci_disable_ats(to_pci_dev(info->dev));
 -      info->ats.enabled = 0;
 +      pdev = to_pci_dev(info->dev);
 +
 +      if (info->ats_enabled) {
 +              pci_disable_ats(pdev);
 +              info->ats_enabled = 0;
 +      }
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      if (info->pri_enabled) {
 +              pci_disable_pri(pdev);
 +              info->pri_enabled = 0;
 +      }
 +      if (info->pasid_enabled) {
 +              pci_disable_pasid(pdev);
 +              info->pasid_enabled = 0;
 +      }
 +#endif
  }
  
  static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
  
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
 -              if (!info->ats.enabled)
 +              if (!info->ats_enabled)
                        continue;
  
                sid = info->bus << 8 | info->devfn;
 -              qdep = info->ats.qdep;
 +              qdep = info->ats_qdep;
                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);
@@@ -1713,14 -1667,6 +1713,14 @@@ static void free_dmar_iommu(struct inte
  
        /* free context mapping */
        free_context_table(iommu);
 +
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      if (pasid_enabled(iommu)) {
 +              if (ecap_prs(iommu->ecap))
 +                      intel_svm_finish_prq(iommu);
 +              intel_svm_free_pasid_tables(iommu);
 +      }
 +#endif
  }
  
  static struct dmar_domain *alloc_domain(int flags)
@@@ -1988,10 -1934,8 +1988,10 @@@ static int domain_context_mapping_one(s
                }
  
                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
 -              translation = info ? CONTEXT_TT_DEV_IOTLB :
 -                                   CONTEXT_TT_MULTI_LEVEL;
 +              if (info && info->ats_supported)
 +                      translation = CONTEXT_TT_DEV_IOTLB;
 +              else
 +                      translation = CONTEXT_TT_MULTI_LEVEL;
  
                context_set_address_root(context, virt_to_phys(pgd));
                context_set_address_width(context, iommu->agaw);
@@@ -2171,19 -2115,15 +2171,19 @@@ static int __domain_mapping(struct dmar
                                return -ENOMEM;
                        /* It is large page*/
                        if (largepage_lvl > 1) {
 +                              unsigned long nr_superpages, end_pfn;
 +
                                pteval |= DMA_PTE_LARGE_PAGE;
                                lvl_pages = lvl_to_nr_pages(largepage_lvl);
 +
 +                              nr_superpages = sg_res / lvl_pages;
 +                              end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
 +
                                /*
                                 * Ensure that old small page tables are
 -                               * removed to make room for superpage,
 -                               * if they exist.
 +                               * removed to make room for superpage(s).
                                 */
 -                              dma_pte_free_pagetable(domain, iov_pfn,
 -                                                     iov_pfn + lvl_pages - 1);
 +                              dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }
@@@ -2329,34 -2269,12 +2329,34 @@@ static struct dmar_domain *dmar_insert_
  
        info->bus = bus;
        info->devfn = devfn;
 -      info->ats.enabled = 0;
 -      info->ats.qdep = 0;
 +      info->ats_supported = info->pasid_supported = info->pri_supported = 0;
 +      info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
 +      info->ats_qdep = 0;
        info->dev = dev;
        info->domain = domain;
        info->iommu = iommu;
  
 +      if (dev && dev_is_pci(dev)) {
 +              struct pci_dev *pdev = to_pci_dev(info->dev);
 +
 +              if (ecap_dev_iotlb_support(iommu->ecap) &&
 +                  pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
 +                  dmar_find_matched_atsr_unit(pdev))
 +                      info->ats_supported = 1;
 +
 +              if (ecs_enabled(iommu)) {
 +                      if (pasid_enabled(iommu)) {
 +                              int features = pci_pasid_features(pdev);
 +                              if (features >= 0)
 +                                      info->pasid_supported = features | 1;
 +                      }
 +
 +                      if (info->ats_supported && ecap_prs(iommu->ecap) &&
 +                          pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
 +                              info->pri_supported = 1;
 +              }
 +      }
 +
        spin_lock_irqsave(&device_domain_lock, flags);
        if (dev)
                found = find_domain(dev);
@@@ -2482,6 -2400,11 +2482,6 @@@ found_domain
        return domain;
  }
  
 -static int iommu_identity_mapping;
 -#define IDENTMAP_ALL          1
 -#define IDENTMAP_GFX          2
 -#define IDENTMAP_AZALIA               4
 -
  static int iommu_domain_identity_map(struct dmar_domain *domain,
                                     unsigned long long start,
                                     unsigned long long end)
                                  DMA_PTE_READ|DMA_PTE_WRITE);
  }
  
 -static int iommu_prepare_identity_map(struct device *dev,
 -                                    unsigned long long start,
 -                                    unsigned long long end)
 +static int domain_prepare_identity_map(struct device *dev,
 +                                     struct dmar_domain *domain,
 +                                     unsigned long long start,
 +                                     unsigned long long end)
  {
 -      struct dmar_domain *domain;
 -      int ret;
 -
 -      domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 -      if (!domain)
 -              return -ENOMEM;
 -
        /* For _hardware_ passthrough, don't bother. But for software
           passthrough, we do it anyway -- it may indicate a memory
           range which is reserved in E820, so which didn't get set
                        dmi_get_system_info(DMI_BIOS_VENDOR),
                        dmi_get_system_info(DMI_BIOS_VERSION),
                     dmi_get_system_info(DMI_PRODUCT_VERSION));
 -              ret = -EIO;
 -              goto error;
 +              return -EIO;
        }
  
        if (end >> agaw_to_width(domain->agaw)) {
                     dmi_get_system_info(DMI_BIOS_VENDOR),
                     dmi_get_system_info(DMI_BIOS_VERSION),
                     dmi_get_system_info(DMI_PRODUCT_VERSION));
 -              ret = -EIO;
 -              goto error;
 +              return -EIO;
        }
  
 -      ret = iommu_domain_identity_map(domain, start, end);
 -      if (ret)
 -              goto error;
 +      return iommu_domain_identity_map(domain, start, end);
 +}
  
 -      return 0;
 +static int iommu_prepare_identity_map(struct device *dev,
 +                                    unsigned long long start,
 +                                    unsigned long long end)
 +{
 +      struct dmar_domain *domain;
 +      int ret;
 +
 +      domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 +      if (!domain)
 +              return -ENOMEM;
 +
 +      ret = domain_prepare_identity_map(dev, domain, start, end);
 +      if (ret)
 +              domain_exit(domain);
  
 - error:
 -      domain_exit(domain);
        return ret;
  }
  
@@@ -2887,18 -2808,18 +2887,18 @@@ static void intel_iommu_init_qi(struct 
  }
  
  static int copy_context_table(struct intel_iommu *iommu,
 -                            struct root_entry __iomem *old_re,
 +                            struct root_entry *old_re,
                              struct context_entry **tbl,
                              int bus, bool ext)
  {
        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
 -      struct context_entry __iomem *old_ce = NULL;
        struct context_entry *new_ce = NULL, ce;
 +      struct context_entry *old_ce = NULL;
        struct root_entry re;
        phys_addr_t old_ce_phys;
  
        tbl_idx = ext ? bus * 2 : bus;
 -      memcpy_fromio(&re, old_re, sizeof(re));
 +      memcpy(&re, old_re, sizeof(re));
  
        for (devfn = 0; devfn < 256; devfn++) {
                /* First calculate the correct index */
                        }
  
                        ret = -ENOMEM;
 -                      old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
 +                      old_ce = memremap(old_ce_phys, PAGE_SIZE,
 +                                      MEMREMAP_WB);
                        if (!old_ce)
                                goto out;
  
                }
  
                /* Now copy the context entry */
 -              memcpy_fromio(&ce, old_ce + idx, sizeof(ce));
 +              memcpy(&ce, old_ce + idx, sizeof(ce));
  
                if (!__context_present(&ce))
                        continue;
        __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
  
  out_unmap:
 -      iounmap(old_ce);
 +      memunmap(old_ce);
  
  out:
        return ret;
  
  static int copy_translation_tables(struct intel_iommu *iommu)
  {
 -      struct root_entry __iomem *old_rt;
        struct context_entry **ctxt_tbls;
 +      struct root_entry *old_rt;
        phys_addr_t old_rt_phys;
        int ctxt_table_entries;
        unsigned long flags;
        if (!old_rt_phys)
                return -EINVAL;
  
 -      old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
 +      old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
        if (!old_rt)
                return -ENOMEM;
  
        ret = 0;
  
  out_unmap:
 -      iounmap(old_rt);
 +      memunmap(old_rt);
  
        return ret;
  }
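The ioremap_cache()/memcpy_fromio() pairs become plain memremap()/memcpy() because the old kernel's root and context tables live in ordinary RAM rather than MMIO. A minimal usage sketch under that assumption (the helper name is hypothetical):

static int copy_old_root(phys_addr_t old_rt_phys, struct root_entry *re)
{
        void *old_tbl = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);

        if (!old_tbl)
                return -ENOMEM;

        memcpy(re, old_tbl, sizeof(*re));       /* plain loads, no __iomem accessors */
        memunmap(old_tbl);
        return 0;
}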
@@@ -3176,10 -3096,6 +3176,10 @@@ static int __init init_dmars(void
  
                if (!ecap_pass_through(iommu->ecap))
                        hw_pass_through = 0;
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +              if (pasid_enabled(iommu))
 +                      intel_svm_alloc_pasid_tables(iommu);
 +#endif
        }
  
        if (iommu_pass_through)
@@@ -3267,13 -3183,6 +3267,13 @@@ domains_done
  
                iommu_flush_write_buffer(iommu);
  
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +              if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
 +                      ret = intel_svm_enable_prq(iommu);
 +                      if (ret)
 +                              goto free_iommu;
 +              }
 +#endif
                ret = dmar_set_interrupt(iommu);
                if (ret)
                        goto free_iommu;
@@@ -3333,10 -3242,7 +3333,10 @@@ static struct iova *intel_alloc_iova(st
  
  static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
  {
 +      struct dmar_rmrr_unit *rmrr;
        struct dmar_domain *domain;
 +      struct device *i_dev;
 +      int i, ret;
  
        domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
        if (!domain) {
                return NULL;
        }
  
 +      /* We have a new domain - setup possible RMRRs for the device */
 +      rcu_read_lock();
 +      for_each_rmrr_units(rmrr) {
 +              for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
 +                                        i, i_dev) {
 +                      if (i_dev != dev)
 +                              continue;
 +
 +                      ret = domain_prepare_identity_map(dev, domain,
 +                                                        rmrr->base_address,
 +                                                        rmrr->end_address);
 +                      if (ret)
 +                              dev_err(dev, "Mapping reserved region failed\n");
 +              }
 +      }
 +      rcu_read_unlock();
 +
        return domain;
  }
  
@@@ -3647,7 -3536,7 +3647,7 @@@ static void *intel_alloc_coherent(struc
                        flags |= GFP_DMA32;
        }
  
-       if (flags & __GFP_WAIT) {
+       if (gfpflags_allow_blocking(flags)) {
                unsigned int count = size >> PAGE_SHIFT;
  
                page = dma_alloc_from_contiguous(dev, count, order);
@@@ -4222,11 -4111,6 +4222,11 @@@ static int intel_iommu_add(struct dmar_
        if (ret)
                goto out;
  
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      if (pasid_enabled(iommu))
 +              intel_svm_alloc_pasid_tables(iommu);
 +#endif
 +
        if (dmaru->ignored) {
                /*
                 * we always have to disable PMRs or DMA may fail on this device
  
        intel_iommu_init_qi(iommu);
        iommu_flush_write_buffer(iommu);
 +
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
 +              ret = intel_svm_enable_prq(iommu);
 +              if (ret)
 +                      goto disable_iommu;
 +      }
 +#endif
        ret = dmar_set_interrupt(iommu);
        if (ret)
                goto disable_iommu;
@@@ -4314,17 -4190,14 +4314,17 @@@ int dmar_find_matched_atsr_unit(struct 
        dev = pci_physfn(dev);
        for (bus = dev->bus; bus; bus = bus->parent) {
                bridge = bus->self;
 -              if (!bridge || !pci_is_pcie(bridge) ||
 +              /* If it's an integrated device, allow ATS */
 +              if (!bridge)
 +                      return 1;
 +              /* Connected via non-PCIe: no ATS */
 +              if (!pci_is_pcie(bridge) ||
                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
                        return 0;
 +              /* If we found the root port, look it up in the ATSR */
                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
                        break;
        }
 -      if (!bridge)
 -              return 0;
  
        rcu_read_lock();
        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
@@@ -4988,114 -4861,6 +4988,114 @@@ static void intel_iommu_remove_device(s
        iommu_device_unlink(iommu->iommu_dev, dev);
  }
  
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
 +{
 +      struct device_domain_info *info;
 +      struct context_entry *context;
 +      struct dmar_domain *domain;
 +      unsigned long flags;
 +      u64 ctx_lo;
 +      int ret;
 +
 +      domain = get_valid_domain_for_dev(sdev->dev);
 +      if (!domain)
 +              return -EINVAL;
 +
 +      spin_lock_irqsave(&device_domain_lock, flags);
 +      spin_lock(&iommu->lock);
 +
 +      ret = -EINVAL;
 +      info = sdev->dev->archdata.iommu;
 +      if (!info || !info->pasid_supported)
 +              goto out;
 +
 +      context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
 +      if (WARN_ON(!context))
 +              goto out;
 +
 +      ctx_lo = context[0].lo;
 +
 +      sdev->did = domain->iommu_did[iommu->seq_id];
 +      sdev->sid = PCI_DEVID(info->bus, info->devfn);
 +
 +      if (!(ctx_lo & CONTEXT_PASIDE)) {
 +              context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
 +              context[1].lo = (u64)virt_to_phys(iommu->pasid_table) | ecap_pss(iommu->ecap);
 +              wmb();
 +              /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
 +               * extended to permit requests-with-PASID if the PASIDE bit
 +               * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
 +               * however, the PASIDE bit is ignored and requests-with-PASID
 +               * are unconditionally blocked. Which makes less sense.
 +               * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
 +               * "guest mode" translation types depending on whether ATS
 +               * is available or not. Annoyingly, we can't use the new
 +               * modes *unless* PASIDE is set. */
 +              if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
 +                      ctx_lo &= ~CONTEXT_TT_MASK;
 +                      if (info->ats_supported)
 +                              ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
 +                      else
 +                              ctx_lo |= CONTEXT_TT_PT_PASID << 2;
 +              }
 +              ctx_lo |= CONTEXT_PASIDE;
 +              if (iommu->pasid_state_table)
 +                      ctx_lo |= CONTEXT_DINVE;
 +              if (info->pri_supported)
 +                      ctx_lo |= CONTEXT_PRS;
 +              context[0].lo = ctx_lo;
 +              wmb();
 +              iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
 +                                         DMA_CCMD_MASK_NOBIT,
 +                                         DMA_CCMD_DEVICE_INVL);
 +      }
 +
 +      /* Enable PASID support in the device, if it wasn't already */
 +      if (!info->pasid_enabled)
 +              iommu_enable_dev_iotlb(info);
 +
 +      if (info->ats_enabled) {
 +              sdev->dev_iotlb = 1;
 +              sdev->qdep = info->ats_qdep;
 +              if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
 +                      sdev->qdep = 0;
 +      }
 +      ret = 0;
 +
 + out:
 +      spin_unlock(&iommu->lock);
 +      spin_unlock_irqrestore(&device_domain_lock, flags);
 +
 +      return ret;
 +}
 +
 +struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
 +{
 +      struct intel_iommu *iommu;
 +      u8 bus, devfn;
 +
 +      if (iommu_dummy(dev)) {
 +              dev_warn(dev,
 +                       "No IOMMU translation for device; cannot enable SVM\n");
 +              return NULL;
 +      }
 +
 +      iommu = device_to_iommu(dev, &bus, &devfn);
 +      if (!iommu) {
 +              dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
 +              return NULL;
 +      }
 +
 +      if (!iommu->pasid_table) {
 +              dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
 +              return NULL;
 +      }
 +
 +      return iommu;
 +}
 +#endif /* CONFIG_INTEL_IOMMU_SVM */
 +
  static const struct iommu_ops intel_iommu_ops = {
        .capable        = intel_iommu_capable,
        .domain_alloc   = intel_iommu_domain_alloc,
        .iova_to_phys   = intel_iommu_iova_to_phys,
        .add_device     = intel_iommu_add_device,
        .remove_device  = intel_iommu_remove_device,
 +      .device_group   = pci_device_group,
        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
  };
  
diff --combined drivers/md/dm-crypt.c
index 3729b394432c9d66c7c219a8a52ddbf9be987869,ca718c98bb9d666a7dd8e5306b049d7fe2fcd3f1..917d47e290ae08be08f4c964a3326f1f67acd077
@@@ -994,7 -994,7 +994,7 @@@ static struct bio *crypt_alloc_buffer(s
        struct bio_vec *bvec;
  
  retry:
-       if (unlikely(gfp_mask & __GFP_WAIT))
+       if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
                mutex_lock(&cc->bio_alloc_lock);
  
        clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
                if (!page) {
                        crypt_free_buffer_pages(cc, clone);
                        bio_put(clone);
-                       gfp_mask |= __GFP_WAIT;
+                       gfp_mask |= __GFP_DIRECT_RECLAIM;
                        goto retry;
                }
  
        }
  
  return_clone:
-       if (unlikely(gfp_mask & __GFP_WAIT))
+       if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
                mutex_unlock(&cc->bio_alloc_lock);
  
        return clone;
@@@ -1544,8 -1544,10 +1544,8 @@@ static void crypt_dtr(struct dm_target 
        if (cc->bs)
                bioset_free(cc->bs);
  
 -      if (cc->page_pool)
 -              mempool_destroy(cc->page_pool);
 -      if (cc->req_pool)
 -              mempool_destroy(cc->req_pool);
 +      mempool_destroy(cc->page_pool);
 +      mempool_destroy(cc->req_pool);
  
        if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
                cc->iv_gen_ops->dtr(cc);
index 1bd2fd47421fc43e02b7ad39c28546e55b3ea69e,fb2cb4bdc0c181258f9a44d7fc95f1f8a9fad263..4432fd69b7cbf86db946fb9aa6bfa753cb21434c
@@@ -458,12 -458,11 +458,12 @@@ static inline u32 vop_usec(const vop_he
  static int solo_fill_jpeg(struct solo_enc_dev *solo_enc,
                          struct vb2_buffer *vb, const vop_header *vh)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct solo_dev *solo_dev = solo_enc->solo_dev;
 -      struct sg_table *vbuf = vb2_dma_sg_plane_desc(vb, 0);
 +      struct sg_table *sgt = vb2_dma_sg_plane_desc(vb, 0);
        int frame_size;
  
 -      vb->v4l2_buf.flags |= V4L2_BUF_FLAG_KEYFRAME;
 +      vbuf->flags |= V4L2_BUF_FLAG_KEYFRAME;
  
        if (vb2_plane_size(vb, 0) < vop_jpeg_size(vh) + solo_enc->jpeg_len)
                return -EIO;
        frame_size = ALIGN(vop_jpeg_size(vh) + solo_enc->jpeg_len, DMA_ALIGN);
        vb2_set_plane_payload(vb, 0, vop_jpeg_size(vh) + solo_enc->jpeg_len);
  
 -      return solo_send_desc(solo_enc, solo_enc->jpeg_len, vbuf,
 +      return solo_send_desc(solo_enc, solo_enc->jpeg_len, sgt,
                             vop_jpeg_offset(vh) - SOLO_JPEG_EXT_ADDR(solo_dev),
                             frame_size, SOLO_JPEG_EXT_ADDR(solo_dev),
                             SOLO_JPEG_EXT_SIZE(solo_dev));
  static int solo_fill_mpeg(struct solo_enc_dev *solo_enc,
                struct vb2_buffer *vb, const vop_header *vh)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct solo_dev *solo_dev = solo_enc->solo_dev;
 -      struct sg_table *vbuf = vb2_dma_sg_plane_desc(vb, 0);
 +      struct sg_table *sgt = vb2_dma_sg_plane_desc(vb, 0);
        int frame_off, frame_size;
        int skip = 0;
  
                return -EIO;
  
        /* If this is a key frame, add extra header */
 -      vb->v4l2_buf.flags &= ~(V4L2_BUF_FLAG_KEYFRAME | V4L2_BUF_FLAG_PFRAME |
 +      vbuf->flags &= ~(V4L2_BUF_FLAG_KEYFRAME | V4L2_BUF_FLAG_PFRAME |
                V4L2_BUF_FLAG_BFRAME);
        if (!vop_type(vh)) {
                skip = solo_enc->vop_len;
 -              vb->v4l2_buf.flags |= V4L2_BUF_FLAG_KEYFRAME;
 +              vbuf->flags |= V4L2_BUF_FLAG_KEYFRAME;
                vb2_set_plane_payload(vb, 0, vop_mpeg_size(vh) +
                        solo_enc->vop_len);
        } else {
 -              vb->v4l2_buf.flags |= V4L2_BUF_FLAG_PFRAME;
 +              vbuf->flags |= V4L2_BUF_FLAG_PFRAME;
                vb2_set_plane_payload(vb, 0, vop_mpeg_size(vh));
        }
  
                sizeof(*vh)) % SOLO_MP4E_EXT_SIZE(solo_dev);
        frame_size = ALIGN(vop_mpeg_size(vh) + skip, DMA_ALIGN);
  
 -      return solo_send_desc(solo_enc, skip, vbuf, frame_off, frame_size,
 +      return solo_send_desc(solo_enc, skip, sgt, frame_off, frame_size,
                        SOLO_MP4E_EXT_ADDR(solo_dev),
                        SOLO_MP4E_EXT_SIZE(solo_dev));
  }
  static int solo_enc_fillbuf(struct solo_enc_dev *solo_enc,
                            struct vb2_buffer *vb, struct solo_enc_buf *enc_buf)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        const vop_header *vh = enc_buf->vh;
        int ret;
  
        }
  
        if (!ret) {
 -              vb->v4l2_buf.sequence = solo_enc->sequence++;
 -              vb->v4l2_buf.timestamp.tv_sec = vop_sec(vh);
 -              vb->v4l2_buf.timestamp.tv_usec = vop_usec(vh);
 +              vbuf->sequence = solo_enc->sequence++;
 +              vbuf->timestamp.tv_sec = vop_sec(vh);
 +              vbuf->timestamp.tv_usec = vop_usec(vh);
  
                /* Check for motion flags */
                if (solo_is_motion_on(solo_enc) && enc_buf->motion) {
                        struct v4l2_event ev = {
                                .type = V4L2_EVENT_MOTION_DET,
                                .u.motion_det = {
 -                                      .flags = V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ,
 -                                      .frame_sequence = vb->v4l2_buf.sequence,
 +                                      .flags
 +                                      = V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ,
 +                                      .frame_sequence = vbuf->sequence,
                                        .region_mask = enc_buf->motion ? 1 : 0,
                                },
                        };
@@@ -575,7 -571,7 +575,7 @@@ static void solo_enc_handle_one(struct 
        list_del(&vb->list);
        spin_unlock_irqrestore(&solo_enc->av_lock, flags);
  
 -      solo_enc_fillbuf(solo_enc, &vb->vb, enc_buf);
 +      solo_enc_fillbuf(solo_enc, &vb->vb.vb2_buf, enc_buf);
  unlock:
        mutex_unlock(&solo_enc->lock);
  }
@@@ -663,7 -659,7 +663,7 @@@ static int solo_ring_thread(void *data
  }
  
  static int solo_enc_queue_setup(struct vb2_queue *q,
 -                              const struct v4l2_format *fmt,
 +                              const void *parg,
                                unsigned int *num_buffers,
                                unsigned int *num_planes, unsigned int sizes[],
                                void *alloc_ctxs[])
  
  static void solo_enc_buf_queue(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct solo_enc_dev *solo_enc = vb2_get_drv_priv(vq);
        struct solo_vb2_buf *solo_vb =
 -              container_of(vb, struct solo_vb2_buf, vb);
 +              container_of(vbuf, struct solo_vb2_buf, vb);
  
        spin_lock(&solo_enc->av_lock);
        list_add_tail(&solo_vb->list, &solo_enc->vidq_active);
@@@ -739,26 -734,25 +739,26 @@@ static void solo_enc_stop_streaming(str
                                struct solo_vb2_buf, list);
  
                list_del(&buf->list);
 -              vb2_buffer_done(&buf->vb, VB2_BUF_STATE_ERROR);
 +              vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR);
        }
        spin_unlock_irqrestore(&solo_enc->av_lock, flags);
  }
  
  static void solo_enc_buf_finish(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct solo_enc_dev *solo_enc = vb2_get_drv_priv(vb->vb2_queue);
 -      struct sg_table *vbuf = vb2_dma_sg_plane_desc(vb, 0);
 +      struct sg_table *sgt = vb2_dma_sg_plane_desc(vb, 0);
  
        switch (solo_enc->fmt) {
        case V4L2_PIX_FMT_MPEG4:
        case V4L2_PIX_FMT_H264:
 -              if (vb->v4l2_buf.flags & V4L2_BUF_FLAG_KEYFRAME)
 -                      sg_copy_from_buffer(vbuf->sgl, vbuf->nents,
 +              if (vbuf->flags & V4L2_BUF_FLAG_KEYFRAME)
 +                      sg_copy_from_buffer(sgt->sgl, sgt->nents,
                                        solo_enc->vop, solo_enc->vop_len);
                break;
        default: /* V4L2_PIX_FMT_MJPEG */
 -              sg_copy_from_buffer(vbuf->sgl, vbuf->nents,
 +              sg_copy_from_buffer(sgt->sgl, sgt->nents,
                                solo_enc->jpeg_header, solo_enc->jpeg_len);
                break;
        }
@@@ -1297,7 -1291,7 +1297,7 @@@ static struct solo_enc_dev *solo_enc_al
        solo_enc->vidq.ops = &solo_enc_video_qops;
        solo_enc->vidq.mem_ops = &vb2_dma_sg_memops;
        solo_enc->vidq.drv_priv = solo_enc;
-       solo_enc->vidq.gfp_flags = __GFP_DMA32;
+       solo_enc->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        solo_enc->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
        solo_enc->vidq.buf_struct_size = sizeof(struct solo_vb2_buf);
        solo_enc->vidq.lock = &solo_enc->lock;
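
All of the solo6x10-v4l2-enc.c hunks above are one mechanical conversion: the v4l2-specific buffer state (flags, sequence, timestamp) moved from struct vb2_buffer into struct vb2_v4l2_buffer, so the driver buffer embeds the latter and callbacks recover it in two steps. A generic sketch with illustrative names (locking omitted):

#include <linux/list.h>
#include <media/videobuf2-v4l2.h>

struct demo_vb2_buf {
	struct vb2_v4l2_buffer vb;	/* must stay the first member */
	struct list_head list;
};

static LIST_HEAD(demo_active);		/* stands in for the driver's pending queue */

static void demo_buf_queue(struct vb2_buffer *vb)
{
	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
	struct demo_vb2_buf *buf = container_of(vbuf, struct demo_vb2_buf, vb);

	/* v4l2 metadata now lives on vbuf ... */
	vbuf->flags |= V4L2_BUF_FLAG_KEYFRAME;
	/* ... while plane helpers and vb2_buffer_done() still take &vbuf->vb2_buf */
	list_add_tail(&buf->list, &demo_active);
}
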
index 26df903585d7dd02a54077bc24bbba8acda87b40,bde77b22340c4ee93591960f905af8b0d76ecce5..f7ce493b1feed72c27dd4f9d58e1ebe15bec5708
@@@ -26,7 -26,6 +26,7 @@@
  #include <media/v4l2-ioctl.h>
  #include <media/v4l2-common.h>
  #include <media/v4l2-event.h>
 +#include <media/videobuf2-v4l2.h>
  #include <media/videobuf2-dma-contig.h>
  
  #include "solo6x10.h"
@@@ -192,14 -191,13 +192,14 @@@ static int solo_v4l2_set_ch(struct solo
  static void solo_fillbuf(struct solo_dev *solo_dev,
                         struct vb2_buffer *vb)
  {
 -      dma_addr_t vbuf;
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
 +      dma_addr_t addr;
        unsigned int fdma_addr;
        int error = -1;
        int i;
  
 -      vbuf = vb2_dma_contig_plane_dma_addr(vb, 0);
 -      if (!vbuf)
 +      addr = vb2_dma_contig_plane_dma_addr(vb, 0);
 +      if (!addr)
                goto finish_buf;
  
        if (erase_off(solo_dev)) {
                fdma_addr = SOLO_DISP_EXT_ADDR + (solo_dev->old_write *
                                (SOLO_HW_BPL * solo_vlines(solo_dev)));
  
 -              error = solo_p2m_dma_t(solo_dev, 0, vbuf, fdma_addr,
 +              error = solo_p2m_dma_t(solo_dev, 0, addr, fdma_addr,
                                       solo_bytesperline(solo_dev),
                                       solo_vlines(solo_dev), SOLO_HW_BPL);
        }
@@@ -224,8 -222,8 +224,8 @@@ finish_buf
        if (!error) {
                vb2_set_plane_payload(vb, 0,
                        solo_vlines(solo_dev) * solo_bytesperline(solo_dev));
 -              vb->v4l2_buf.sequence = solo_dev->sequence++;
 -              v4l2_get_timestamp(&vb->v4l2_buf.timestamp);
 +              vbuf->sequence = solo_dev->sequence++;
 +              v4l2_get_timestamp(&vbuf->timestamp);
        }
  
        vb2_buffer_done(vb, error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE);
@@@ -258,7 -256,7 +258,7 @@@ static void solo_thread_try(struct solo
  
                spin_unlock(&solo_dev->slock);
  
 -              solo_fillbuf(solo_dev, &vb->vb);
 +              solo_fillbuf(solo_dev, &vb->vb.vb2_buf);
        }
  
        assert_spin_locked(&solo_dev->slock);
@@@ -313,7 -311,7 +313,7 @@@ static void solo_stop_thread(struct sol
        solo_dev->kthread = NULL;
  }
  
 -static int solo_queue_setup(struct vb2_queue *q, const struct v4l2_format *fmt,
 +static int solo_queue_setup(struct vb2_queue *q, const void *parg,
                           unsigned int *num_buffers, unsigned int *num_planes,
                           unsigned int sizes[], void *alloc_ctxs[])
  {
@@@ -347,11 -345,10 +347,11 @@@ static void solo_stop_streaming(struct 
  
  static void solo_buf_queue(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct solo_dev *solo_dev = vb2_get_drv_priv(vq);
        struct solo_vb2_buf *solo_vb =
 -              container_of(vb, struct solo_vb2_buf, vb);
 +              container_of(vbuf, struct solo_vb2_buf, vb);
  
        spin_lock(&solo_dev->slock);
        list_add_tail(&solo_vb->list, &solo_dev->vidq_active);
@@@ -678,7 -675,7 +678,7 @@@ int solo_v4l2_init(struct solo_dev *sol
        solo_dev->vidq.mem_ops = &vb2_dma_contig_memops;
        solo_dev->vidq.drv_priv = solo_dev;
        solo_dev->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
-       solo_dev->vidq.gfp_flags = __GFP_DMA32;
+       solo_dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        solo_dev->vidq.buf_struct_size = sizeof(struct solo_vb2_buf);
        solo_dev->vidq.lock = &solo_dev->lock;
        ret = vb2_queue_init(&solo_dev->vidq);
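
The gfp_flags updates in these drivers track the kernel-wide split of __GFP_WAIT into __GFP_DIRECT_RECLAIM and __GFP_KSWAPD_RECLAIM: waking kswapd is no longer implicit, so a caller that used a bare __GFP_DMA32 must now ask for it to keep the old behaviour. Roughly (values illustrative):

#include <linux/gfp.h>

/* may wake kswapd, never sleeps - what the vidq allocations above want */
static const gfp_t vidq_gfp       = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
/* __GFP_RECLAIM adds direct reclaim on top, so the allocation may sleep */
static const gfp_t vidq_gfp_sleep = __GFP_DMA32 | __GFP_RECLAIM;
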
index 4c3293dcddbcde3cd6b37584c6665d870fef1bc8,e556f989aaabd26bf6d2cebaa0b67fe97ac9aa84..46642ef9151b644413c3de4e33ee5e9279d2107d
@@@ -376,11 -376,10 +376,11 @@@ static int tw68_buffer_count(unsigned i
  /* ------------------------------------------------------------- */
  /* vb2 queue operations                                          */
  
 -static int tw68_queue_setup(struct vb2_queue *q, const struct v4l2_format *fmt,
 +static int tw68_queue_setup(struct vb2_queue *q, const void *parg,
                           unsigned int *num_buffers, unsigned int *num_planes,
                           unsigned int sizes[], void *alloc_ctxs[])
  {
 +      const struct v4l2_format *fmt = parg;
        struct tw68_dev *dev = vb2_get_drv_priv(q);
        unsigned tot_bufs = q->num_buffers + *num_buffers;
  
   */
  static void tw68_buf_queue(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct tw68_dev *dev = vb2_get_drv_priv(vq);
 -      struct tw68_buf *buf = container_of(vb, struct tw68_buf, vb);
 +      struct tw68_buf *buf = container_of(vbuf, struct tw68_buf, vb);
        struct tw68_buf *prev;
        unsigned long flags;
  
   */
  static int tw68_buf_prepare(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct tw68_dev *dev = vb2_get_drv_priv(vq);
 -      struct tw68_buf *buf = container_of(vb, struct tw68_buf, vb);
 +      struct tw68_buf *buf = container_of(vbuf, struct tw68_buf, vb);
        struct sg_table *dma = vb2_dma_sg_plane_desc(vb, 0);
        unsigned size, bpl;
  
  
  static void tw68_buf_finish(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct tw68_dev *dev = vb2_get_drv_priv(vq);
 -      struct tw68_buf *buf = container_of(vb, struct tw68_buf, vb);
 +      struct tw68_buf *buf = container_of(vbuf, struct tw68_buf, vb);
  
        pci_free_consistent(dev->pci, buf->size, buf->cpu, buf->dma);
  }
@@@ -532,7 -528,7 +532,7 @@@ static void tw68_stop_streaming(struct 
                        container_of(dev->active.next, struct tw68_buf, list);
  
                list_del(&buf->list);
 -              vb2_buffer_done(&buf->vb, VB2_BUF_STATE_ERROR);
 +              vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR);
        }
  }
  
@@@ -979,7 -975,7 +979,7 @@@ int tw68_video_init2(struct tw68_dev *d
        dev->vidq.ops = &tw68_video_qops;
        dev->vidq.mem_ops = &vb2_dma_sg_memops;
        dev->vidq.drv_priv = dev;
-       dev->vidq.gfp_flags = __GFP_DMA32;
+       dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        dev->vidq.buf_struct_size = sizeof(struct tw68_buf);
        dev->vidq.lock = &dev->lock;
        dev->vidq.min_buffers_needed = 2;
@@@ -1016,10 -1012,10 +1016,10 @@@ void tw68_irq_video_done(struct tw68_de
                buf = list_entry(dev->active.next, struct tw68_buf, list);
                list_del(&buf->list);
                spin_unlock(&dev->slock);
 -              v4l2_get_timestamp(&buf->vb.v4l2_buf.timestamp);
 -              buf->vb.v4l2_buf.field = dev->field;
 -              buf->vb.v4l2_buf.sequence = dev->seqnr++;
 -              vb2_buffer_done(&buf->vb, VB2_BUF_STATE_DONE);
 +              v4l2_get_timestamp(&buf->vb.timestamp);
 +              buf->vb.field = dev->field;
 +              buf->vb.sequence = dev->seqnr++;
 +              vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_DONE);
                status &= ~(TW68_DMAPI);
                if (0 == status)
                        return;
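
Both solo6x10 and tw68 also pick up the same vb2 interface change to queue_setup(): it now receives an opaque const void *parg instead of a typed v4l2_format pointer, and drivers that still need the format cast it back, as tw68 does above (it can be NULL when no format accompanies the request). A minimal sketch with illustrative names and sizes:

#include <linux/videodev2.h>
#include <media/videobuf2-v4l2.h>

#define DEMO_DEFAULT_SIZEIMAGE	(720 * 576 * 2)	/* assumed fallback size */

static int demo_queue_setup(struct vb2_queue *q, const void *parg,
			    unsigned int *num_buffers, unsigned int *num_planes,
			    unsigned int sizes[], void *alloc_ctxs[])
{
	const struct v4l2_format *fmt = parg;	/* NULL unless a format was passed in */

	*num_planes = 1;
	sizes[0] = fmt ? fmt->fmt.pix.sizeimage : DEMO_DEFAULT_SIZEIMAGE;
	return 0;
}
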
index 89300870fefb97a66291b96d52fca3aeb3714259,1b49e53463a268a35a5e75fe2bbc81c5c8c76119..1e688bfec56728c3d00ebc353031c26fde29f187
@@@ -1,7 -1,7 +1,7 @@@
  /*
   * VMware Balloon driver.
   *
 - * Copyright (C) 2000-2010, VMware, Inc. All Rights Reserved.
 + * Copyright (C) 2000-2014, VMware, Inc. All Rights Reserved.
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License as published by the
  #include <linux/types.h>
  #include <linux/kernel.h>
  #include <linux/mm.h>
 +#include <linux/vmalloc.h>
  #include <linux/sched.h>
  #include <linux/module.h>
  #include <linux/workqueue.h>
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
 +#include <linux/vmw_vmci_defs.h>
 +#include <linux/vmw_vmci_api.h>
  #include <asm/hypervisor.h>
  
  MODULE_AUTHOR("VMware, Inc.");
  MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
 -MODULE_VERSION("1.3.0.0-k");
 +MODULE_VERSION("1.5.0.0-k");
  MODULE_ALIAS("dmi:*:svnVMware*:*");
  MODULE_ALIAS("vmware_vmmemctl");
  MODULE_LICENSE("GPL");
   * measured in pages.
   */
  
 -/*
 - * Rate of allocating memory when there is no memory pressure
 - * (driver performs non-sleeping allocations).
 - */
 -#define VMW_BALLOON_NOSLEEP_ALLOC_MAX 16384U
 -
  /*
   * Rates of memory allocation when guest experiences memory pressure
   * (driver performs sleeping allocations).
  #define VMW_BALLOON_RATE_ALLOC_MAX    2048U
  #define VMW_BALLOON_RATE_ALLOC_INC    16U
  
 -/*
 - * Rates for releasing pages while deflating balloon.
 - */
 -#define VMW_BALLOON_RATE_FREE_MIN     512U
 -#define VMW_BALLOON_RATE_FREE_MAX     16384U
 -#define VMW_BALLOON_RATE_FREE_INC     16U
 -
  /*
   * When guest is under memory pressure, use a reduced page allocation
   * rate for next several cycles.
@@@ -75,7 -85,7 +75,7 @@@
  
  /*
   * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
-  * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use
+  * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
   * __GFP_NOWARN, to suppress page allocation failure warnings.
   */
  #define VMW_PAGE_ALLOC_NOSLEEP                (__GFP_HIGHMEM|__GFP_NOWARN)
@@@ -89,6 -99,9 +89,6 @@@
   */
  #define VMW_PAGE_ALLOC_CANSLEEP               (GFP_HIGHUSER)
  
 -/* Maximum number of page allocations without yielding processor */
 -#define VMW_BALLOON_YIELD_THRESHOLD   1024
 -
  /* Maximum number of refused pages we accumulate during inflation cycle */
  #define VMW_BALLOON_MAX_REFUSED               16
  
@@@ -103,45 -116,17 +103,45 @@@ enum vmwballoon_capabilities 
        /*
         * Bit 0 is reserved and not associated to any capability.
         */
 -      VMW_BALLOON_BASIC_CMDS          = (1 << 1),
 -      VMW_BALLOON_BATCHED_CMDS        = (1 << 2)
 +      VMW_BALLOON_BASIC_CMDS                  = (1 << 1),
 +      VMW_BALLOON_BATCHED_CMDS                = (1 << 2),
 +      VMW_BALLOON_BATCHED_2M_CMDS             = (1 << 3),
 +      VMW_BALLOON_SIGNALLED_WAKEUP_CMD        = (1 << 4),
  };
  
 -#define VMW_BALLOON_CAPABILITIES      (VMW_BALLOON_BASIC_CMDS)
 +#define VMW_BALLOON_CAPABILITIES      (VMW_BALLOON_BASIC_CMDS \
 +                                      | VMW_BALLOON_BATCHED_CMDS \
 +                                      | VMW_BALLOON_BATCHED_2M_CMDS \
 +                                      | VMW_BALLOON_SIGNALLED_WAKEUP_CMD)
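
For reference, with bit 0 reserved the combined capability mask above works out to:

/* (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4) == 0x1e */
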
 +
 +#define VMW_BALLOON_2M_SHIFT          (9)
 +#define VMW_BALLOON_NUM_PAGE_SIZES    (2)
 +
 +/*
 + * Backdoor commands availability:
 + *
 + * START, GET_TARGET and GUEST_ID are always available,
 + *
 + * VMW_BALLOON_BASIC_CMDS:
 + *    LOCK and UNLOCK commands,
 + * VMW_BALLOON_BATCHED_CMDS:
 + *    BATCHED_LOCK and BATCHED_UNLOCK commands.
 + * VMW_BALLOON_BATCHED_2M_CMDS:
 + *    BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
 + * VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
 + *    VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
 + */
 +#define VMW_BALLOON_CMD_START                 0
 +#define VMW_BALLOON_CMD_GET_TARGET            1
 +#define VMW_BALLOON_CMD_LOCK                  2
 +#define VMW_BALLOON_CMD_UNLOCK                        3
 +#define VMW_BALLOON_CMD_GUEST_ID              4
 +#define VMW_BALLOON_CMD_BATCHED_LOCK          6
 +#define VMW_BALLOON_CMD_BATCHED_UNLOCK                7
 +#define VMW_BALLOON_CMD_BATCHED_2M_LOCK               8
 +#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK     9
 +#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET     10
  
 -#define VMW_BALLOON_CMD_START         0
 -#define VMW_BALLOON_CMD_GET_TARGET    1
 -#define VMW_BALLOON_CMD_LOCK          2
 -#define VMW_BALLOON_CMD_UNLOCK                3
 -#define VMW_BALLOON_CMD_GUEST_ID      4
  
  /* error codes */
  #define VMW_BALLOON_SUCCESS                   0
  
  #define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES (0x03000000)
  
 -#define VMWARE_BALLOON_CMD(cmd, data, result)                 \
 +/* Batch page description */
 +
 +/*
 + * Layout of a page in the batch page:
 + *
 + * +-------------+----------+--------+
 + * |             |          |        |
 + * | Page number | Reserved | Status |
 + * |             |          |        |
 + * +-------------+----------+--------+
 + * 64  PAGE_SHIFT          6         0
 + *
 + * The reserved field should be set to 0.
 + */
 +#define VMW_BALLOON_BATCH_MAX_PAGES   (PAGE_SIZE / sizeof(u64))
 +#define VMW_BALLOON_BATCH_STATUS_MASK ((1UL << 5) - 1)
 +#define VMW_BALLOON_BATCH_PAGE_MASK   (~((1UL << PAGE_SHIFT) - 1))
 +
 +struct vmballoon_batch_page {
 +      u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
 +};
 +
 +static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
 +{
 +      return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
 +}
 +
 +static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
 +                              int idx)
 +{
 +      return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
 +}
 +
 +static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
 +                              u64 pa)
 +{
 +      batch->pages[idx] = pa;
 +}
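
A worked example of the batch-entry encoding these helpers implement (PAGE_SHIFT assumed to be 12; the concrete address is illustrative):

/*
 * pa = 0x123456000 (page aligned), so the driver stores it verbatim:
 *   entry = 0x0000000123456000
 * after the hypercall the host writes a status code into the low bits:
 *   entry & VMW_BALLOON_BATCH_PAGE_MASK   -> 0x0000000123456000 (the PA)
 *   entry & VMW_BALLOON_BATCH_STATUS_MASK -> e.g. VMW_BALLOON_SUCCESS (0)
 */
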
 +
 +
 +#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)           \
  ({                                                            \
 -      unsigned long __status, __dummy1, __dummy2;             \
 +      unsigned long __status, __dummy1, __dummy2, __dummy3;   \
        __asm__ __volatile__ ("inl %%dx" :                      \
                "=a"(__status),                                 \
                "=c"(__dummy1),                                 \
                "=d"(__dummy2),                                 \
 -              "=b"(result) :                                  \
 +              "=b"(result),                                   \
 +              "=S" (__dummy3) :                               \
                "0"(VMW_BALLOON_HV_MAGIC),                      \
                "1"(VMW_BALLOON_CMD_##cmd),                     \
                "2"(VMW_BALLOON_HV_PORT),                       \
 -              "3"(data) :                                     \
 +              "3"(arg1),                                      \
 +              "4" (arg2) :                                    \
                "memory");                                      \
        if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)     \
                result = __dummy1;                              \
  #ifdef CONFIG_DEBUG_FS
  struct vmballoon_stats {
        unsigned int timer;
 +      unsigned int doorbell;
  
        /* allocation statistics */
 -      unsigned int alloc;
 -      unsigned int alloc_fail;
 +      unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int sleep_alloc;
        unsigned int sleep_alloc_fail;
 -      unsigned int refused_alloc;
 -      unsigned int refused_free;
 -      unsigned int free;
 +      unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];
  
        /* monitor operations */
 -      unsigned int lock;
 -      unsigned int lock_fail;
 -      unsigned int unlock;
 -      unsigned int unlock_fail;
 +      unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int target;
        unsigned int target_fail;
        unsigned int start;
        unsigned int start_fail;
        unsigned int guest_type;
        unsigned int guest_type_fail;
 +      unsigned int doorbell_set;
 +      unsigned int doorbell_unset;
  };
  
  #define STATS_INC(stat) (stat)++
  #define STATS_INC(stat)
  #endif
  
 -struct vmballoon {
 +struct vmballoon;
  
 +struct vmballoon_ops {
 +      void (*add_page)(struct vmballoon *b, int idx, struct page *p);
 +      int (*lock)(struct vmballoon *b, unsigned int num_pages,
 +                      bool is_2m_pages, unsigned int *target);
 +      int (*unlock)(struct vmballoon *b, unsigned int num_pages,
 +                      bool is_2m_pages, unsigned int *target);
 +};
 +
 +struct vmballoon_page_size {
        /* list of reserved physical pages */
        struct list_head pages;
  
        /* transient list of non-balloonable pages */
        struct list_head refused_pages;
        unsigned int n_refused_pages;
 +};
 +
 +struct vmballoon {
 +      struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];
 +
 +      /* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
 +      unsigned supported_page_sizes;
  
        /* balloon size in pages */
        unsigned int size;
  
        /* adjustment rates (pages per second) */
        unsigned int rate_alloc;
 -      unsigned int rate_free;
  
        /* slowdown page allocations for next few cycles */
        unsigned int slow_allocation_cycles;
  
 +      unsigned long capabilities;
 +
 +      struct vmballoon_batch_page *batch_page;
 +      unsigned int batch_max_pages;
 +      struct page *page;
 +
 +      const struct vmballoon_ops *ops;
 +
  #ifdef CONFIG_DEBUG_FS
        /* statistics */
        struct vmballoon_stats stats;
        struct sysinfo sysinfo;
  
        struct delayed_work dwork;
 +
 +      struct vmci_handle vmci_doorbell;
  };
  
  static struct vmballoon balloon;
   * Send "start" command to the host, communicating supported version
   * of the protocol.
   */
 -static bool vmballoon_send_start(struct vmballoon *b)
 +static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
  {
 -      unsigned long status, capabilities;
 +      unsigned long status, capabilities, dummy = 0;
 +      bool success;
  
        STATS_INC(b->stats.start);
  
 -      status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_CAPABILITIES,
 -                              capabilities);
 -      if (status == VMW_BALLOON_SUCCESS)
 -              return true;
 +      status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
  
 -      pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 -      STATS_INC(b->stats.start_fail);
 -      return false;
 +      switch (status) {
 +      case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
 +              b->capabilities = capabilities;
 +              success = true;
 +              break;
 +      case VMW_BALLOON_SUCCESS:
 +              b->capabilities = VMW_BALLOON_BASIC_CMDS;
 +              success = true;
 +              break;
 +      default:
 +              success = false;
 +      }
 +
 +      if (b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS)
 +              b->supported_page_sizes = 2;
 +      else
 +              b->supported_page_sizes = 1;
 +
 +      if (!success) {
 +              pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 +              STATS_INC(b->stats.start_fail);
 +      }
 +      return success;
  }
  
  static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
   */
  static bool vmballoon_send_guest_id(struct vmballoon *b)
  {
 -      unsigned long status, dummy;
 +      unsigned long status, dummy = 0;
  
 -      status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy);
 +      status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
 +                              dummy);
  
        STATS_INC(b->stats.guest_type);
  
        return false;
  }
  
 +static u16 vmballoon_page_size(bool is_2m_page)
 +{
 +      if (is_2m_page)
 +              return 1 << VMW_BALLOON_2M_SHIFT;
 +
 +      return 1;
 +}
 +
  /*
   * Retrieve desired balloon size from the host.
   */
@@@ -407,7 -295,6 +407,7 @@@ static bool vmballoon_send_get_target(s
        unsigned long status;
        unsigned long target;
        unsigned long limit;
 +      unsigned long dummy = 0;
        u32 limit32;
  
        /*
        /* update stats */
        STATS_INC(b->stats.target);
  
 -      status = VMWARE_BALLOON_CMD(GET_TARGET, limit, target);
 +      status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
        if (vmballoon_check_status(b, status)) {
                *new_target = target;
                return true;
   * check the return value and maybe submit a different page.
   */
  static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
 -                                   unsigned int *hv_status)
 +                              unsigned int *hv_status, unsigned int *target)
  {
 -      unsigned long status, dummy;
 +      unsigned long status, dummy = 0;
        u32 pfn32;
  
        pfn32 = (u32)pfn;
        if (pfn32 != pfn)
                return -1;
  
 -      STATS_INC(b->stats.lock);
 +      STATS_INC(b->stats.lock[false]);
  
 -      *hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy);
 +      *hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
        if (vmballoon_check_status(b, status))
                return 0;
  
        pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
 -      STATS_INC(b->stats.lock_fail);
 +      STATS_INC(b->stats.lock_fail[false]);
 +      return 1;
 +}
 +
 +static int vmballoon_send_batched_lock(struct vmballoon *b,
 +              unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 +{
 +      unsigned long status;
 +      unsigned long pfn = page_to_pfn(b->page);
 +
 +      STATS_INC(b->stats.lock[is_2m_pages]);
 +
 +      if (is_2m_pages)
 +              status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
 +                              *target);
 +      else
 +              status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
 +                              *target);
 +
 +      if (vmballoon_check_status(b, status))
 +              return 0;
 +
 +      pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
 +      STATS_INC(b->stats.lock_fail[is_2m_pages]);
        return 1;
  }
  
   * Notify the host that guest intends to release given page back into
   * the pool of available (to the guest) pages.
   */
 -static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn)
 +static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
 +                                                      unsigned int *target)
  {
 -      unsigned long status, dummy;
 +      unsigned long status, dummy = 0;
        u32 pfn32;
  
        pfn32 = (u32)pfn;
        if (pfn32 != pfn)
                return false;
  
 -      STATS_INC(b->stats.unlock);
 +      STATS_INC(b->stats.unlock[false]);
  
 -      status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy);
 +      status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
        if (vmballoon_check_status(b, status))
                return true;
  
        pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
 -      STATS_INC(b->stats.unlock_fail);
 +      STATS_INC(b->stats.unlock_fail[false]);
 +      return false;
 +}
 +
 +static bool vmballoon_send_batched_unlock(struct vmballoon *b,
 +              unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 +{
 +      unsigned long status;
 +      unsigned long pfn = page_to_pfn(b->page);
 +
 +      STATS_INC(b->stats.unlock[is_2m_pages]);
 +
 +      if (is_2m_pages)
 +              status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
 +                              *target);
 +      else
 +              status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
 +                              *target);
 +
 +      if (vmballoon_check_status(b, status))
 +              return true;
 +
 +      pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
 +      STATS_INC(b->stats.unlock_fail[is_2m_pages]);
        return false;
  }
  
 +static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
 +{
 +      if (is_2m_page)
 +              return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
 +
 +      return alloc_page(flags);
 +}
 +
 +static void vmballoon_free_page(struct page *page, bool is_2m_page)
 +{
 +      if (is_2m_page)
 +              __free_pages(page, VMW_BALLOON_2M_SHIFT);
 +      else
 +              __free_page(page);
 +}
 +
  /*
   * Quickly release all pages allocated for the balloon. This function is
   * called when host decides to "reset" balloon for one reason or another.
  static void vmballoon_pop(struct vmballoon *b)
  {
        struct page *page, *next;
 -      unsigned int count = 0;
 -
 -      list_for_each_entry_safe(page, next, &b->pages, lru) {
 -              list_del(&page->lru);
 -              __free_page(page);
 -              STATS_INC(b->stats.free);
 -              b->size--;
 -
 -              if (++count >= b->rate_free) {
 -                      count = 0;
 +      unsigned is_2m_pages;
 +
 +      for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
 +                      is_2m_pages++) {
 +              struct vmballoon_page_size *page_size =
 +                              &b->page_sizes[is_2m_pages];
 +              u16 size_per_page = vmballoon_page_size(is_2m_pages);
 +
 +              list_for_each_entry_safe(page, next, &page_size->pages, lru) {
 +                      list_del(&page->lru);
 +                      vmballoon_free_page(page, is_2m_pages);
 +                      STATS_INC(b->stats.free[is_2m_pages]);
 +                      b->size -= size_per_page;
                        cond_resched();
                }
        }
 -}
  
 -/*
 - * Perform standard reset sequence by popping the balloon (in case it
 - * is not  empty) and then restarting protocol. This operation normally
 - * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
 - */
 -static void vmballoon_reset(struct vmballoon *b)
 -{
 -      /* free all pages, skipping monitor unlock */
 -      vmballoon_pop(b);
 +      if (b->batch_page) {
 +              vunmap(b->batch_page);
 +              b->batch_page = NULL;
 +      }
  
 -      if (vmballoon_send_start(b)) {
 -              b->reset_required = false;
 -              if (!vmballoon_send_guest_id(b))
 -                      pr_err("failed to send guest ID to the host\n");
 +      if (b->page) {
 +              __free_page(b->page);
 +              b->page = NULL;
        }
  }
  
   * refuse list, those refused page are then released at the end of the
   * inflation cycle.
   */
 -static int vmballoon_lock_page(struct vmballoon *b, struct page *page)
 +static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
 +                              bool is_2m_pages, unsigned int *target)
  {
        int locked, hv_status;
 +      struct page *page = b->page;
 +      struct vmballoon_page_size *page_size = &b->page_sizes[false];
 +
 +      /* is_2m_pages can never happen as 2m pages support implies batching */
  
 -      locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status);
 +      locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
 +                                                              target);
        if (locked > 0) {
 -              STATS_INC(b->stats.refused_alloc);
 +              STATS_INC(b->stats.refused_alloc[false]);
  
                if (hv_status == VMW_BALLOON_ERROR_RESET ||
                                hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED) {
 -                      __free_page(page);
 +                      vmballoon_free_page(page, false);
                        return -EIO;
                }
  
                 * and retry allocation, unless we already accumulated
                 * too many of them, in which case take a breather.
                 */
 -              if (b->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
 -                      b->n_refused_pages++;
 -                      list_add(&page->lru, &b->refused_pages);
 +              if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
 +                      page_size->n_refused_pages++;
 +                      list_add(&page->lru, &page_size->refused_pages);
                } else {
 -                      __free_page(page);
 +                      vmballoon_free_page(page, false);
                }
                return -EIO;
        }
  
        /* track allocated page */
 -      list_add(&page->lru, &b->pages);
 +      list_add(&page->lru, &page_size->pages);
  
        /* update balloon size */
        b->size++;
        return 0;
  }
  
 +static int vmballoon_lock_batched_page(struct vmballoon *b,
 +              unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 +{
 +      int locked, i;
 +      u16 size_per_page = vmballoon_page_size(is_2m_pages);
 +
 +      locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
 +                      target);
 +      if (locked > 0) {
 +              for (i = 0; i < num_pages; i++) {
 +                      u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 +                      struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 +
 +                      vmballoon_free_page(p, is_2m_pages);
 +              }
 +
 +              return -EIO;
 +      }
 +
 +      for (i = 0; i < num_pages; i++) {
 +              u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 +              struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 +              struct vmballoon_page_size *page_size =
 +                              &b->page_sizes[is_2m_pages];
 +
 +              locked = vmballoon_batch_get_status(b->batch_page, i);
 +
 +              switch (locked) {
 +              case VMW_BALLOON_SUCCESS:
 +                      list_add(&p->lru, &page_size->pages);
 +                      b->size += size_per_page;
 +                      break;
 +              case VMW_BALLOON_ERROR_PPN_PINNED:
 +              case VMW_BALLOON_ERROR_PPN_INVALID:
 +                      if (page_size->n_refused_pages
 +                                      < VMW_BALLOON_MAX_REFUSED) {
 +                              list_add(&p->lru, &page_size->refused_pages);
 +                              page_size->n_refused_pages++;
 +                              break;
 +                      }
 +                      /* Fallthrough */
 +              case VMW_BALLOON_ERROR_RESET:
 +              case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
 +                      vmballoon_free_page(p, is_2m_pages);
 +                      break;
 +              default:
 +                      /* This should never happen */
 +                      WARN_ON_ONCE(true);
 +              }
 +      }
 +
 +      return 0;
 +}
 +
  /*
   * Release the page allocated for the balloon. Note that we first notify
   * the host so it can make sure the page will be available for the guest
   * to use, if needed.
   */
 -static int vmballoon_release_page(struct vmballoon *b, struct page *page)
 +static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
 +              bool is_2m_pages, unsigned int *target)
  {
 -      if (!vmballoon_send_unlock_page(b, page_to_pfn(page)))
 -              return -EIO;
 +      struct page *page = b->page;
 +      struct vmballoon_page_size *page_size = &b->page_sizes[false];
 +
 +      /* is_2m_pages can never happen as 2m pages support implies batching */
  
 -      list_del(&page->lru);
 +      if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
 +              list_add(&page->lru, &page_size->pages);
 +              return -EIO;
 +      }
  
        /* deallocate page */
 -      __free_page(page);
 -      STATS_INC(b->stats.free);
 +      vmballoon_free_page(page, false);
 +      STATS_INC(b->stats.free[false]);
  
        /* update balloon size */
        b->size--;
        return 0;
  }
  
 +static int vmballoon_unlock_batched_page(struct vmballoon *b,
 +                              unsigned int num_pages, bool is_2m_pages,
 +                              unsigned int *target)
 +{
 +      int locked, i, ret = 0;
 +      bool hv_success;
 +      u16 size_per_page = vmballoon_page_size(is_2m_pages);
 +
 +      hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
 +                      target);
 +      if (!hv_success)
 +              ret = -EIO;
 +
 +      for (i = 0; i < num_pages; i++) {
 +              u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 +              struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 +              struct vmballoon_page_size *page_size =
 +                              &b->page_sizes[is_2m_pages];
 +
 +              locked = vmballoon_batch_get_status(b->batch_page, i);
 +              if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
 +                      /*
 +                       * That page wasn't successfully unlocked by the
 +                       * hypervisor, re-add it to the list of pages owned by
 +                       * the balloon driver.
 +                       */
 +                      list_add(&p->lru, &page_size->pages);
 +              } else {
 +                      /* deallocate page */
 +                      vmballoon_free_page(p, is_2m_pages);
 +                      STATS_INC(b->stats.free[is_2m_pages]);
 +
 +                      /* update balloon size */
 +                      b->size -= size_per_page;
 +              }
 +      }
 +
 +      return ret;
 +}
 +
  /*
   * Release pages that were allocated while attempting to inflate the
   * balloon but were refused by the host for one reason or another.
   */
 -static void vmballoon_release_refused_pages(struct vmballoon *b)
 +static void vmballoon_release_refused_pages(struct vmballoon *b,
 +              bool is_2m_pages)
  {
        struct page *page, *next;
 +      struct vmballoon_page_size *page_size =
 +                      &b->page_sizes[is_2m_pages];
  
 -      list_for_each_entry_safe(page, next, &b->refused_pages, lru) {
 +      list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
                list_del(&page->lru);
 -              __free_page(page);
 -              STATS_INC(b->stats.refused_free);
 +              vmballoon_free_page(page, is_2m_pages);
 +              STATS_INC(b->stats.refused_free[is_2m_pages]);
        }
  
 -      b->n_refused_pages = 0;
 +      page_size->n_refused_pages = 0;
 +}
 +
 +static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
 +{
 +      b->page = p;
 +}
 +
 +static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
 +                              struct page *p)
 +{
 +      vmballoon_batch_set_pa(b->batch_page, idx,
 +                      (u64)page_to_pfn(p) << PAGE_SHIFT);
  }
  
  /*
   */
  static void vmballoon_inflate(struct vmballoon *b)
  {
 -      unsigned int goal;
 -      unsigned int rate;
 -      unsigned int i;
 +      unsigned rate;
        unsigned int allocations = 0;
 +      unsigned int num_pages = 0;
        int error = 0;
        gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
 +      bool is_2m_pages;
  
        pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
  
         * slowdown page allocations considerably.
         */
  
 -      goal = b->target - b->size;
        /*
         * Start with no sleep allocation rate which may be higher
         * than sleeping allocation rate.
         */
 -      rate = b->slow_allocation_cycles ?
 -                      b->rate_alloc : VMW_BALLOON_NOSLEEP_ALLOC_MAX;
 +      if (b->slow_allocation_cycles) {
 +              rate = b->rate_alloc;
 +              is_2m_pages = false;
 +      } else {
 +              rate = UINT_MAX;
 +              is_2m_pages =
 +                      b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;
 +      }
  
 -      pr_debug("%s - goal: %d, no-sleep rate: %d, sleep rate: %d\n",
 -               __func__, goal, rate, b->rate_alloc);
 +      pr_debug("%s - goal: %d, no-sleep rate: %u, sleep rate: %d\n",
 +               __func__, b->target - b->size, rate, b->rate_alloc);
  
 -      for (i = 0; i < goal; i++) {
 +      while (!b->reset_required &&
 +              b->size + num_pages * vmballoon_page_size(is_2m_pages)
 +              < b->target) {
                struct page *page;
  
                if (flags == VMW_PAGE_ALLOC_NOSLEEP)
 -                      STATS_INC(b->stats.alloc);
 +                      STATS_INC(b->stats.alloc[is_2m_pages]);
                else
                        STATS_INC(b->stats.sleep_alloc);
  
 -              page = alloc_page(flags);
 +              page = vmballoon_alloc_page(flags, is_2m_pages);
                if (!page) {
 +                      STATS_INC(b->stats.alloc_fail[is_2m_pages]);
 +
 +                      if (is_2m_pages) {
 +                              b->ops->lock(b, num_pages, true, &b->target);
 +
 +                              /*
 +                               * ignore errors from locking as we now switch
 +                               * to 4k pages and we might get different
 +                               * errors.
 +                               */
 +
 +                              num_pages = 0;
 +                              is_2m_pages = false;
 +                              continue;
 +                      }
 +
                        if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
                                /*
                                 * CANSLEEP page allocation failed, so guest
                                STATS_INC(b->stats.sleep_alloc_fail);
                                break;
                        }
 -                      STATS_INC(b->stats.alloc_fail);
  
                        /*
                         * NOSLEEP page allocation failed, so the guest is
                         */
                        b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;
  
 -                      if (i >= b->rate_alloc)
 +                      if (allocations >= b->rate_alloc)
                                break;
  
                        flags = VMW_PAGE_ALLOC_CANSLEEP;
                        continue;
                }
  
 -              error = vmballoon_lock_page(b, page);
 -              if (error)
 -                      break;
 -
 -              if (++allocations > VMW_BALLOON_YIELD_THRESHOLD) {
 -                      cond_resched();
 -                      allocations = 0;
 +              b->ops->add_page(b, num_pages++, page);
 +              if (num_pages == b->batch_max_pages) {
 +                      error = b->ops->lock(b, num_pages, is_2m_pages,
 +                                      &b->target);
 +                      num_pages = 0;
 +                      if (error)
 +                              break;
                }
  
 -              if (i >= rate) {
 +              cond_resched();
 +
 +              if (allocations >= rate) {
                        /* We allocated enough pages, let's take a break. */
                        break;
                }
        }
  
 +      if (num_pages > 0)
 +              b->ops->lock(b, num_pages, is_2m_pages, &b->target);
 +
        /*
         * We reached our goal without failures so try increasing
         * allocation rate.
         */
 -      if (error == 0 && i >= b->rate_alloc) {
 -              unsigned int mult = i / b->rate_alloc;
 +      if (error == 0 && allocations >= b->rate_alloc) {
 +              unsigned int mult = allocations / b->rate_alloc;
  
                b->rate_alloc =
                        min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
                            VMW_BALLOON_RATE_ALLOC_MAX);
        }
  
 -      vmballoon_release_refused_pages(b);
 +      vmballoon_release_refused_pages(b, true);
 +      vmballoon_release_refused_pages(b, false);
  }
  
  /*
   */
  static void vmballoon_deflate(struct vmballoon *b)
  {
 -      struct page *page, *next;
 -      unsigned int i = 0;
 -      unsigned int goal;
 -      int error;
 +      unsigned is_2m_pages;
  
        pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
  
 -      /* limit deallocation rate */
 -      goal = min(b->size - b->target, b->rate_free);
 +      /* free pages to reach target */
 +      for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
 +                      is_2m_pages++) {
 +              struct page *page, *next;
 +              unsigned int num_pages = 0;
 +              struct vmballoon_page_size *page_size =
 +                              &b->page_sizes[is_2m_pages];
 +
 +              list_for_each_entry_safe(page, next, &page_size->pages, lru) {
 +                      if (b->reset_required ||
 +                              (b->target > 0 &&
 +                                      b->size - num_pages
 +                                      * vmballoon_page_size(is_2m_pages)
 +                              < b->target + vmballoon_page_size(true)))
 +                              break;
 +
 +                      list_del(&page->lru);
 +                      b->ops->add_page(b, num_pages++, page);
  
 -      pr_debug("%s - goal: %d, rate: %d\n", __func__, goal, b->rate_free);
 +                      if (num_pages == b->batch_max_pages) {
 +                              int error;
  
 -      /* free pages to reach target */
 -      list_for_each_entry_safe(page, next, &b->pages, lru) {
 -              error = vmballoon_release_page(b, page);
 -              if (error) {
 -                      /* quickly decrease rate in case of error */
 -                      b->rate_free = max(b->rate_free / 2,
 -                                         VMW_BALLOON_RATE_FREE_MIN);
 -                      return;
 +                              error = b->ops->unlock(b, num_pages,
 +                                              is_2m_pages, &b->target);
 +                              num_pages = 0;
 +                              if (error)
 +                                      return;
 +                      }
 +
 +                      cond_resched();
                }
  
 -              if (++i >= goal)
 -                      break;
 +              if (num_pages > 0)
 +                      b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
 +      }
 +}
 +
 +static const struct vmballoon_ops vmballoon_basic_ops = {
 +      .add_page = vmballoon_add_page,
 +      .lock = vmballoon_lock_page,
 +      .unlock = vmballoon_unlock_page
 +};
 +
 +static const struct vmballoon_ops vmballoon_batched_ops = {
 +      .add_page = vmballoon_add_batched_page,
 +      .lock = vmballoon_lock_batched_page,
 +      .unlock = vmballoon_unlock_batched_page
 +};
 +
 +static bool vmballoon_init_batching(struct vmballoon *b)
 +{
 +      b->page = alloc_page(VMW_PAGE_ALLOC_NOSLEEP);
 +      if (!b->page)
 +              return false;
 +
 +      b->batch_page = vmap(&b->page, 1, VM_MAP, PAGE_KERNEL);
 +      if (!b->batch_page) {
 +              __free_page(b->page);
 +              return false;
 +      }
 +
 +      return true;
 +}
 +
 +/*
 + * Receive notification and resize balloon
 + */
 +static void vmballoon_doorbell(void *client_data)
 +{
 +      struct vmballoon *b = client_data;
 +
 +      STATS_INC(b->stats.doorbell);
 +
 +      mod_delayed_work(system_freezable_wq, &b->dwork, 0);
 +}
 +
 +/*
 + * Clean up vmci doorbell
 + */
 +static void vmballoon_vmci_cleanup(struct vmballoon *b)
 +{
 +      int error;
 +
 +      VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
 +                      VMCI_INVALID_ID, error);
 +      STATS_INC(b->stats.doorbell_unset);
 +
 +      if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
 +              vmci_doorbell_destroy(b->vmci_doorbell);
 +              b->vmci_doorbell = VMCI_INVALID_HANDLE;
 +      }
 +}
 +
 +/*
 + * Initialize vmci doorbell, to get notified as soon as balloon changes
 + */
 +static int vmballoon_vmci_init(struct vmballoon *b)
 +{
 +      int error = 0;
 +
 +      if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) != 0) {
 +              error = vmci_doorbell_create(&b->vmci_doorbell,
 +                              VMCI_FLAG_DELAYED_CB,
 +                              VMCI_PRIVILEGE_FLAG_RESTRICTED,
 +                              vmballoon_doorbell, b);
 +
 +              if (error == VMCI_SUCCESS) {
 +                      VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET,
 +                                      b->vmci_doorbell.context,
 +                                      b->vmci_doorbell.resource, error);
 +                      STATS_INC(b->stats.doorbell_set);
 +              }
 +      }
 +
 +      if (error != 0) {
 +              vmballoon_vmci_cleanup(b);
 +
 +              return -EIO;
        }
  
 -      /* slowly increase rate if there were no errors */
 -      b->rate_free = min(b->rate_free + VMW_BALLOON_RATE_FREE_INC,
 -                         VMW_BALLOON_RATE_FREE_MAX);
 +      return 0;
 +}
 +
 +/*
 + * Perform standard reset sequence by popping the balloon (in case it
 + * is not empty) and then restarting protocol. This operation normally
 + * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
 + */
 +static void vmballoon_reset(struct vmballoon *b)
 +{
 +      int error;
 +
 +      vmballoon_vmci_cleanup(b);
 +
 +      /* free all pages, skipping monitor unlock */
 +      vmballoon_pop(b);
 +
 +      if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
 +              return;
 +
 +      if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
 +              b->ops = &vmballoon_batched_ops;
 +              b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
 +              if (!vmballoon_init_batching(b)) {
 +                      /*
 +                       * We failed to initialize batching, inform the monitor
 +                       * about it by sending a null capability.
 +                       *
 +                       * The guest will retry in one second.
 +                       */
 +                      vmballoon_send_start(b, 0);
 +                      return;
 +              }
 +      } else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
 +              b->ops = &vmballoon_basic_ops;
 +              b->batch_max_pages = 1;
 +      }
 +
 +      b->reset_required = false;
 +
 +      error = vmballoon_vmci_init(b);
 +      if (error)
 +              pr_err("failed to initialize vmci doorbell\n");
 +
 +      if (!vmballoon_send_guest_id(b))
 +              pr_err("failed to send guest ID to the host\n");
  }
  
  /*
@@@ -1126,14 -664,13 +1126,14 @@@ static void vmballoon_work(struct work_
        if (b->slow_allocation_cycles > 0)
                b->slow_allocation_cycles--;
  
 -      if (vmballoon_send_get_target(b, &target)) {
 +      if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
                /* update target, adjust size */
                b->target = target;
  
                if (b->size < target)
                        vmballoon_inflate(b);
 -              else if (b->size > target)
 +              else if (target == 0 ||
 +                              b->size > target + vmballoon_page_size(true))
                        vmballoon_deflate(b);
        }
  
@@@ -1155,14 -692,6 +1155,14 @@@ static int vmballoon_debug_show(struct 
        struct vmballoon *b = f->private;
        struct vmballoon_stats *stats = &b->stats;
  
 +      /* format capabilities info */
 +      seq_printf(f,
 +                 "balloon capabilities:   %#4x\n"
 +                 "used capabilities:      %#4lx\n"
 +                 "is resetting:           %c\n",
 +                 VMW_BALLOON_CAPABILITIES, b->capabilities,
 +                 b->reset_required ? 'y' : 'n');
 +
        /* format size info */
        seq_printf(f,
                   "target:             %8d pages\n"
  
        /* format rate info */
        seq_printf(f,
 -                 "rateNoSleepAlloc:   %8d pages/sec\n"
 -                 "rateSleepAlloc:     %8d pages/sec\n"
 -                 "rateFree:           %8d pages/sec\n",
 -                 VMW_BALLOON_NOSLEEP_ALLOC_MAX,
 -                 b->rate_alloc, b->rate_free);
 +                 "rateSleepAlloc:     %8d pages/sec\n",
 +                 b->rate_alloc);
  
        seq_printf(f,
                   "\n"
                   "timer:              %8u\n"
 +                 "doorbell:           %8u\n"
                   "start:              %8u (%4u failed)\n"
                   "guestType:          %8u (%4u failed)\n"
 +                 "2m-lock:            %8u (%4u failed)\n"
                   "lock:               %8u (%4u failed)\n"
 +                 "2m-unlock:          %8u (%4u failed)\n"
                   "unlock:             %8u (%4u failed)\n"
                   "target:             %8u (%4u failed)\n"
 +                 "prim2mAlloc:        %8u (%4u failed)\n"
                   "primNoSleepAlloc:   %8u (%4u failed)\n"
                   "primCanSleepAlloc:  %8u (%4u failed)\n"
 +                 "prim2mFree:         %8u\n"
                   "primFree:           %8u\n"
 +                 "err2mAlloc:         %8u\n"
                   "errAlloc:           %8u\n"
 -                 "errFree:            %8u\n",
 +                 "err2mFree:          %8u\n"
 +                 "errFree:            %8u\n"
 +                 "doorbellSet:        %8u\n"
 +                 "doorbellUnset:      %8u\n",
                   stats->timer,
 +                 stats->doorbell,
                   stats->start, stats->start_fail,
                   stats->guest_type, stats->guest_type_fail,
 -                 stats->lock,  stats->lock_fail,
 -                 stats->unlock, stats->unlock_fail,
 +                 stats->lock[true],  stats->lock_fail[true],
 +                 stats->lock[false],  stats->lock_fail[false],
 +                 stats->unlock[true], stats->unlock_fail[true],
 +                 stats->unlock[false], stats->unlock_fail[false],
                   stats->target, stats->target_fail,
 -                 stats->alloc, stats->alloc_fail,
 +                 stats->alloc[true], stats->alloc_fail[true],
 +                 stats->alloc[false], stats->alloc_fail[false],
                   stats->sleep_alloc, stats->sleep_alloc_fail,
 -                 stats->free,
 -                 stats->refused_alloc, stats->refused_free);
 +                 stats->free[true],
 +                 stats->free[false],
 +                 stats->refused_alloc[true], stats->refused_alloc[false],
 +                 stats->refused_free[true], stats->refused_free[false],
 +                 stats->doorbell_set, stats->doorbell_unset);
  
        return 0;
  }
@@@ -1266,7 -782,7 +1266,7 @@@ static inline void vmballoon_debugfs_ex
  static int __init vmballoon_init(void)
  {
        int error;
 -
 +      unsigned is_2m_pages;
        /*
         * Check if we are running on VMware's hypervisor and bail out
         * if we are not.
        if (x86_hyper != &x86_hyper_vmware)
                return -ENODEV;
  
 -      INIT_LIST_HEAD(&balloon.pages);
 -      INIT_LIST_HEAD(&balloon.refused_pages);
 +      for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
 +                      is_2m_pages++) {
 +              INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
 +              INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
 +      }
  
        /* initialize rates */
        balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
 -      balloon.rate_free = VMW_BALLOON_RATE_FREE_MAX;
  
        INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
  
 -      /*
 -       * Start balloon.
 -       */
 -      if (!vmballoon_send_start(&balloon)) {
 -              pr_err("failed to send start command to the host\n");
 -              return -EIO;
 -      }
 -
 -      if (!vmballoon_send_guest_id(&balloon)) {
 -              pr_err("failed to send guest ID to the host\n");
 -              return -EIO;
 -      }
 -
        error = vmballoon_debugfs_init(&balloon);
        if (error)
                return error;
  
 +      balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
 +      balloon.batch_page = NULL;
 +      balloon.page = NULL;
 +      balloon.reset_required = true;
 +
        queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);
  
        return 0;
@@@ -1302,7 -824,6 +1302,7 @@@ module_init(vmballoon_init)
  
  static void __exit vmballoon_exit(void)
  {
 +      vmballoon_vmci_cleanup(&balloon);
        cancel_delayed_work_sync(&balloon.dwork);
  
        vmballoon_debugfs_exit(&balloon);
         * Reset connection before deallocating memory to avoid potential for
         * additional spurious resets from guest touching deallocated pages.
         */
 -      vmballoon_send_start(&balloon);
 +      vmballoon_send_start(&balloon, 0);
        vmballoon_pop(&balloon);
  }
  module_exit(vmballoon_exit);
diff --combined drivers/mtd/mtdcore.c
index b1eea48c501d11cdb58e862ac73a0b29eb015aba,2dfb291a47c6da6e64caa1788f1329305658b9c5..a9a15c22ef24649ff4ef82385a1128dfef8d7818
@@@ -387,14 -387,6 +387,14 @@@ int add_mtd_device(struct mtd_info *mtd
        struct mtd_notifier *not;
        int i, error;
  
 +      /*
 +       * May occur, for instance, on buggy drivers which call
 +       * mtd_device_parse_register() multiple times on the same master MTD,
 +       * especially with CONFIG_MTD_PARTITIONED_MASTER=y.
 +       */
 +      if (WARN_ONCE(mtd->backing_dev_info, "MTD already registered\n"))
 +              return -EEXIST;
 +
        mtd->backing_dev_info = &mtd_bdi;
  
        BUG_ON(mtd->writesize == 0);
        mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1;
        mtd->writesize_mask = (1 << mtd->writesize_shift) - 1;
  
 +      if (mtd->dev.parent) {
 +              if (!mtd->owner && mtd->dev.parent->driver)
 +                      mtd->owner = mtd->dev.parent->driver->owner;
 +              if (!mtd->name)
 +                      mtd->name = dev_name(mtd->dev.parent);
 +      } else {
 +              pr_debug("mtd device won't show a device symlink in sysfs\n");
 +      }
 +
        /* Some chips always power up locked. Unlock them now */
        if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK)) {
                error = mtd_unlock(mtd, 0, mtd->size);
        }
  
        /* Caller should have set dev.parent to match the
 -       * physical device.
 +       * physical device, if appropriate.
         */
        mtd->dev.type = &mtd_devtype;
        mtd->dev.class = &mtd_class;
@@@ -596,17 -579,9 +596,17 @@@ int mtd_device_parse_register(struct mt
                else
                        ret = nr_parts;
        }
 +      /* Didn't come up with either parsed OR fallback partitions */
 +      if (ret < 0) {
 +              pr_info("mtd: failed to find partitions; one or more parsers reports errors (%d)\n",
 +                      ret);
 +              /* Don't abort on errors; we can still use unpartitioned MTD */
 +              ret = 0;
 +      }
  
 -      if (ret >= 0)
 -              ret = mtd_add_device_partitions(mtd, real_parts, ret);
 +      ret = mtd_add_device_partitions(mtd, real_parts, ret);
 +      if (ret)
 +              goto out;
  
        /*
         * FIXME: some drivers unfortunately call this function more than once.
         * does cause problems with parse_mtd_partitions() above (e.g.,
         * cmdlineparts will register partitions more than once).
         */
 +      WARN_ONCE(mtd->reboot_notifier.notifier_call, "MTD already registered\n");
        if (mtd->_reboot && !mtd->reboot_notifier.notifier_call) {
                mtd->reboot_notifier.notifier_call = mtd_reboot_notifier;
                register_reboot_notifier(&mtd->reboot_notifier);
        }
  
 +out:
        kfree(real_parts);
        return ret;
  }
@@@ -1215,8 -1188,7 +1215,7 @@@ EXPORT_SYMBOL_GPL(mtd_writev)
   */
  void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
  {
-       gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
-                      __GFP_NORETRY | __GFP_NO_KSWAPD;
+       gfp_t flags = __GFP_NOWARN | __GFP_DIRECT_RECLAIM | __GFP_NORETRY;
        size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
        void *kbuf;
  
@@@ -1328,7 -1300,6 +1327,7 @@@ static void __exit cleanup_mtd(void
                remove_proc_entry("mtd", NULL);
        class_unregister(&mtd_class);
        bdi_destroy(&mtd_bdi);
 +      idr_destroy(&mtd_idr);
  }
  
  module_init(init_mtd);
diff --combined drivers/nvme/host/pci.c
index c4bb85934aa2692cd6f46a12666ee2b6eff1c2e6,e917cf304ad0c53b470399a80fb95cca8b1ed2ed..34fae2804e0b4e75a8d6159677a9506aefca343c
@@@ -12,6 -12,7 +12,6 @@@
   * more details.
   */
  
 -#include <linux/nvme.h>
  #include <linux/bitops.h>
  #include <linux/blkdev.h>
  #include <linux/blk-mq.h>
  #include <linux/slab.h>
  #include <linux/t10-pi.h>
  #include <linux/types.h>
 +#include <linux/pr.h>
  #include <scsi/sg.h>
 -#include <asm-generic/io-64-nonatomic-lo-hi.h>
 +#include <linux/io-64-nonatomic-lo-hi.h>
 +#include <asm/unaligned.h>
 +
 +#include <uapi/linux/nvme_ioctl.h>
 +#include "nvme.h"
  
  #define NVME_MINORS           (1U << MINORBITS)
  #define NVME_Q_DEPTH          1024
@@@ -88,10 -84,9 +88,10 @@@ static wait_queue_head_t nvme_kthread_w
  
  static struct class *nvme_class;
  
 -static void nvme_reset_failed_dev(struct work_struct *ws);
 +static int __nvme_reset(struct nvme_dev *dev);
  static int nvme_reset(struct nvme_dev *dev);
  static int nvme_process_cq(struct nvme_queue *nvmeq);
 +static void nvme_dead_ctrl(struct nvme_dev *dev);
  
  struct async_cmd_info {
        struct kthread_work work;
@@@ -540,7 -535,7 +540,7 @@@ static void nvme_dif_remap(struct reque
        virt = bip_get_seed(bip);
        phys = nvme_block_nr(ns, blk_rq_pos(req));
        nlb = (blk_rq_bytes(req) >> ns->lba_shift);
 -      ts = ns->disk->integrity->tuple_size;
 +      ts = ns->disk->queue->integrity.tuple_size;
  
        for (i = 0; i < nlb; i++, virt++, phys++) {
                pi = (struct t10_pi_tuple *)p;
        kunmap_atomic(pmap);
  }
  
 -static int nvme_noop_verify(struct blk_integrity_iter *iter)
 -{
 -      return 0;
 -}
 -
 -static int nvme_noop_generate(struct blk_integrity_iter *iter)
 -{
 -      return 0;
 -}
 -
 -struct blk_integrity nvme_meta_noop = {
 -      .name                   = "NVME_META_NOOP",
 -      .generate_fn            = nvme_noop_generate,
 -      .verify_fn              = nvme_noop_verify,
 -};
 -
  static void nvme_init_integrity(struct nvme_ns *ns)
  {
        struct blk_integrity integrity;
  
        switch (ns->pi_type) {
        case NVME_NS_DPS_PI_TYPE3:
 -              integrity = t10_pi_type3_crc;
 +              integrity.profile = &t10_pi_type3_crc;
                break;
        case NVME_NS_DPS_PI_TYPE1:
        case NVME_NS_DPS_PI_TYPE2:
 -              integrity = t10_pi_type1_crc;
 +              integrity.profile = &t10_pi_type1_crc;
                break;
        default:
 -              integrity = nvme_meta_noop;
 +              integrity.profile = NULL;
                break;
        }
        integrity.tuple_size = ns->ms;
@@@ -592,31 -603,27 +592,31 @@@ static void req_completion(struct nvme_
        struct nvme_iod *iod = ctx;
        struct request *req = iod_get_private(iod);
        struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 -
        u16 status = le16_to_cpup(&cqe->status) >> 1;
 +      bool requeue = false;
 +      int error = 0;
  
        if (unlikely(status)) {
                if (!(status & NVME_SC_DNR || blk_noretry_request(req))
                    && (jiffies - req->start_time) < req->timeout) {
                        unsigned long flags;
  
 +                      requeue = true;
                        blk_mq_requeue_request(req);
                        spin_lock_irqsave(req->q->queue_lock, flags);
                        if (!blk_queue_stopped(req->q))
                                blk_mq_kick_requeue_list(req->q);
                        spin_unlock_irqrestore(req->q->queue_lock, flags);
 -                      return;
 +                      goto release_iod;
                }
  
                if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
                        if (cmd_rq->ctx == CMD_CTX_CANCELLED)
 -                              status = -EINTR;
 +                              error = -EINTR;
 +                      else
 +                              error = status;
                } else {
 -                      status = nvme_error_status(status);
 +                      error = nvme_error_status(status);
                }
        }
  
        if (cmd_rq->aborted)
                dev_warn(nvmeq->dev->dev,
                        "completing aborted command with status:%04x\n",
 -                      status);
 +                      error);
  
 +release_iod:
        if (iod->nents) {
                dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
                        rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
        }
        nvme_free_iod(nvmeq->dev, iod);
  
 -      blk_mq_complete_request(req, status);
 +      if (likely(!requeue))
 +              blk_mq_complete_request(req, error);
  }
  
  /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@@ -1025,11 -1030,11 +1025,11 @@@ int __nvme_submit_sync_cmd(struct reque
        req->special = (void *)0;
  
        if (buffer && bufflen) {
-               ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+               ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_RECLAIM);
                if (ret)
                        goto out;
        } else if (ubuffer && bufflen) {
-               ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+               ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_RECLAIM);
                if (ret)
                        goto out;
                bio = req->bio;
@@@ -1272,13 -1277,18 +1272,13 @@@ static void nvme_abort_req(struct reque
        struct nvme_command cmd;
  
        if (!nvmeq->qid || cmd_rq->aborted) {
 -              unsigned long flags;
 -
 -              spin_lock_irqsave(&dev_list_lock, flags);
 -              if (work_busy(&dev->reset_work))
 -                      goto out;
 -              list_del_init(&dev->node);
 -              dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
 -                                                      req->tag, nvmeq->qid);
 -              dev->reset_workfn = nvme_reset_failed_dev;
 -              queue_work(nvme_workq, &dev->reset_work);
 - out:
 -              spin_unlock_irqrestore(&dev_list_lock, flags);
 +              spin_lock(&dev_list_lock);
 +              if (!__nvme_reset(dev)) {
 +                      dev_warn(dev->dev,
 +                               "I/O %d QID %d timeout, reset controller\n",
 +                               req->tag, nvmeq->qid);
 +              }
 +              spin_unlock(&dev_list_lock);
                return;
        }
  
@@@ -1794,7 -1804,7 +1794,7 @@@ static int nvme_submit_io(struct nvme_n
  
        length = (io.nblocks + 1) << ns->lba_shift;
        meta_len = (io.nblocks + 1) * ns->ms;
 -      metadata = (void __user *)(unsigned long)io.metadata;
 +      metadata = (void __user *)(uintptr_t)io.metadata;
        write = io.opcode & 1;
  
        if (ns->ext) {
        c.rw.metadata = cpu_to_le64(meta_dma);
  
        status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
 -                      (void __user *)io.addr, length, NULL, 0);
 +                      (void __user *)(uintptr_t)io.addr, length, NULL, 0);
   unmap:
        if (meta) {
                if (status == NVME_SC_SUCCESS && !write) {
@@@ -1876,7 -1886,7 +1876,7 @@@ static int nvme_user_cmd(struct nvme_de
                timeout = msecs_to_jiffies(cmd.timeout_ms);
  
        status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
 -                      NULL, (void __user *)cmd.addr, cmd.data_len,
 +                      NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
                        &cmd.result, timeout);
        if (status >= 0) {
                if (put_user(cmd.result, &ucmd->result))
@@@ -1933,23 -1943,6 +1933,23 @@@ static int nvme_compat_ioctl(struct blo
  #define nvme_compat_ioctl     NULL
  #endif
  
 +static void nvme_free_dev(struct kref *kref);
 +static void nvme_free_ns(struct kref *kref)
 +{
 +      struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 +
 +      if (ns->type == NVME_NS_LIGHTNVM)
 +              nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
 +
 +      spin_lock(&dev_list_lock);
 +      ns->disk->private_data = NULL;
 +      spin_unlock(&dev_list_lock);
 +
 +      kref_put(&ns->dev->kref, nvme_free_dev);
 +      put_disk(ns->disk);
 +      kfree(ns);
 +}
 +
  static int nvme_open(struct block_device *bdev, fmode_t mode)
  {
        int ret = 0;
        ns = bdev->bd_disk->private_data;
        if (!ns)
                ret = -ENXIO;
 -      else if (!kref_get_unless_zero(&ns->dev->kref))
 +      else if (!kref_get_unless_zero(&ns->kref))
                ret = -ENXIO;
        spin_unlock(&dev_list_lock);
  
        return ret;
  }
  
 -static void nvme_free_dev(struct kref *kref);
 -
  static void nvme_release(struct gendisk *disk, fmode_t mode)
  {
        struct nvme_ns *ns = disk->private_data;
 -      struct nvme_dev *dev = ns->dev;
 -
 -      kref_put(&dev->kref, nvme_free_dev);
 +      kref_put(&ns->kref, nvme_free_ns);
  }
  
  static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
@@@ -2010,16 -2007,6 +2010,16 @@@ static int nvme_revalidate_disk(struct 
                return -ENODEV;
        }
  
 +      if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
 +              if (nvme_nvm_register(ns->queue, disk->disk_name)) {
 +                      dev_warn(dev->dev,
 +                              "%s: LightNVM init failure\n", __func__);
 +                      kfree(id);
 +                      return -ENODEV;
 +              }
 +              ns->type = NVME_NS_LIGHTNVM;
 +      }
 +
        old_ms = ns->ms;
        lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
        ns->lba_shift = id->lbaf[lbaf].ds;
        pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
                                        id->dps & NVME_NS_DPS_PI_MASK : 0;
  
 +      blk_mq_freeze_queue(disk->queue);
        if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
                                ns->ms != old_ms ||
                                bs != queue_logical_block_size(disk->queue) ||
        ns->pi_type = pi_type;
        blk_queue_logical_block_size(ns->queue, bs);
  
 -      if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) &&
 -                                                              !ns->ext)
 +      if (ns->ms && !ns->ext)
                nvme_init_integrity(ns);
  
 -      if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
 +      if ((ns->ms && !(ns->ms == 8 && ns->pi_type) &&
 +                                              !blk_get_integrity(disk)) ||
 +                                              ns->type == NVME_NS_LIGHTNVM)
                set_capacity(disk, 0);
        else
                set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
  
        if (dev->oncs & NVME_CTRL_ONCS_DSM)
                nvme_config_discard(ns);
 +      blk_mq_unfreeze_queue(disk->queue);
  
        kfree(id);
        return 0;
  }
  
 +static char nvme_pr_type(enum pr_type type)
 +{
 +      switch (type) {
 +      case PR_WRITE_EXCLUSIVE:
 +              return 1;
 +      case PR_EXCLUSIVE_ACCESS:
 +              return 2;
 +      case PR_WRITE_EXCLUSIVE_REG_ONLY:
 +              return 3;
 +      case PR_EXCLUSIVE_ACCESS_REG_ONLY:
 +              return 4;
 +      case PR_WRITE_EXCLUSIVE_ALL_REGS:
 +              return 5;
 +      case PR_EXCLUSIVE_ACCESS_ALL_REGS:
 +              return 6;
 +      default:
 +              return 0;
 +      }
 +};
 +
 +static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
 +                              u64 key, u64 sa_key, u8 op)
 +{
 +      struct nvme_ns *ns = bdev->bd_disk->private_data;
 +      struct nvme_command c;
 +      u8 data[16] = { 0, };
 +
 +      put_unaligned_le64(key, &data[0]);
 +      put_unaligned_le64(sa_key, &data[8]);
 +
 +      memset(&c, 0, sizeof(c));
 +      c.common.opcode = op;
 +      c.common.nsid = cpu_to_le32(ns->ns_id);
 +      c.common.cdw10[0] = cpu_to_le32(cdw10);
 +
 +      return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
 +}
 +
 +static int nvme_pr_register(struct block_device *bdev, u64 old,
 +              u64 new, unsigned flags)
 +{
 +      u32 cdw10;
 +
 +      if (flags & ~PR_FL_IGNORE_KEY)
 +              return -EOPNOTSUPP;
 +
 +      cdw10 = old ? 2 : 0;
 +      cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
 +      cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
 +      return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
 +}
 +
 +static int nvme_pr_reserve(struct block_device *bdev, u64 key,
 +              enum pr_type type, unsigned flags)
 +{
 +      u32 cdw10;
 +
 +      if (flags & ~PR_FL_IGNORE_KEY)
 +              return -EOPNOTSUPP;
 +
 +      cdw10 = nvme_pr_type(type) << 8;
 +      cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
 +      return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
 +}
 +
 +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
 +              enum pr_type type, bool abort)
 +{
 +      u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
 +      return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
 +}
 +
 +static int nvme_pr_clear(struct block_device *bdev, u64 key)
 +{
 +      u32 cdw10 = 1 | (key ? 1 << 3 : 0);
 +      return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
 +}
 +
 +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
 +{
 +      u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
 +      return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
 +}
 +
 +static const struct pr_ops nvme_pr_ops = {
 +      .pr_register    = nvme_pr_register,
 +      .pr_reserve     = nvme_pr_reserve,
 +      .pr_release     = nvme_pr_release,
 +      .pr_preempt     = nvme_pr_preempt,
 +      .pr_clear       = nvme_pr_clear,
 +};
 +
  static const struct block_device_operations nvme_fops = {
        .owner          = THIS_MODULE,
        .ioctl          = nvme_ioctl,
        .release        = nvme_release,
        .getgeo         = nvme_getgeo,
        .revalidate_disk= nvme_revalidate_disk,
 +      .pr_ops         = &nvme_pr_ops,
  };
  
  static int nvme_kthread(void *data)
  
                        if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
                                                        csts & NVME_CSTS_CFS) {
 -                              if (work_busy(&dev->reset_work))
 -                                      continue;
 -                              list_del_init(&dev->node);
 -                              dev_warn(dev->dev,
 -                                      "Failed status: %x, reset controller\n",
 -                                      readl(&dev->bar->csts));
 -                              dev->reset_workfn = nvme_reset_failed_dev;
 -                              queue_work(nvme_workq, &dev->reset_work);
 +                              if (!__nvme_reset(dev)) {
 +                                      dev_warn(dev->dev,
 +                                              "Failed status: %x, reset controller\n",
 +                                              readl(&dev->bar->csts));
 +                              }
                                continue;
                        }
                        for (i = 0; i < dev->queue_count; i++) {
@@@ -2232,7 -2126,6 +2232,7 @@@ static void nvme_alloc_ns(struct nvme_d
        if (!disk)
                goto out_free_queue;
  
 +      kref_init(&ns->kref);
        ns->ns_id = nsid;
        ns->disk = disk;
        ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
        if (nvme_revalidate_disk(ns->disk))
                goto out_free_disk;
  
 -      add_disk(ns->disk);
 -      if (ns->ms) {
 -              struct block_device *bd = bdget_disk(ns->disk, 0);
 -              if (!bd)
 -                      return;
 -              if (blkdev_get(bd, FMODE_READ, NULL)) {
 -                      bdput(bd);
 -                      return;
 +      kref_get(&dev->kref);
 +      if (ns->type != NVME_NS_LIGHTNVM) {
 +              add_disk(ns->disk);
 +              if (ns->ms) {
 +                      struct block_device *bd = bdget_disk(ns->disk, 0);
 +                      if (!bd)
 +                              return;
 +                      if (blkdev_get(bd, FMODE_READ, NULL)) {
 +                              bdput(bd);
 +                              return;
 +                      }
 +                      blkdev_reread_part(bd);
 +                      blkdev_put(bd, FMODE_READ);
                }
 -              blkdev_reread_part(bd);
 -              blkdev_put(bd, FMODE_READ);
        }
        return;
   out_free_disk:
        kfree(ns);
  }
  
 +/*
 + * Create I/O queues.  Failing to create an I/O queue is not an issue,
 + * we can continue with fewer than the desired number of queues, and
 + * even a controller without I/O queues can still be used to issue
 + * admin commands.  This might be useful to upgrade a buggy firmware
 + * for example.
 + */
  static void nvme_create_io_queues(struct nvme_dev *dev)
  {
        unsigned i;
                        break;
  
        for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
 -              if (nvme_create_queue(dev->queues[i], i))
 +              if (nvme_create_queue(dev->queues[i], i)) {
 +                      nvme_free_queues(dev, i);
                        break;
 +              }
  }
  
  static int set_queue_count(struct nvme_dev *dev, int count)
@@@ -2476,6 -2357,18 +2476,6 @@@ static int nvme_setup_io_queues(struct 
        return result;
  }
  
 -static void nvme_free_namespace(struct nvme_ns *ns)
 -{
 -      list_del(&ns->list);
 -
 -      spin_lock(&dev_list_lock);
 -      ns->disk->private_data = NULL;
 -      spin_unlock(&dev_list_lock);
 -
 -      put_disk(ns->disk);
 -      kfree(ns);
 -}
 -
  static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
  {
        struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
@@@ -2509,14 -2402,15 +2509,14 @@@ static void nvme_ns_remove(struct nvme_
  
        if (kill)
                blk_set_queue_dying(ns->queue);
 -      if (ns->disk->flags & GENHD_FL_UP) {
 -              if (blk_get_integrity(ns->disk))
 -                      blk_integrity_unregister(ns->disk);
 +      if (ns->disk->flags & GENHD_FL_UP)
                del_gendisk(ns->disk);
 -      }
        if (kill || !blk_queue_dying(ns->queue)) {
                blk_mq_abort_requeue_list(ns->queue);
                blk_cleanup_queue(ns->queue);
 -        }
 +      }
 +      list_del_init(&ns->list);
 +      kref_put(&ns->kref, nvme_free_ns);
  }
  
  static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
        for (i = 1; i <= nn; i++) {
                ns = nvme_find_ns(dev, i);
                if (ns) {
 -                      if (revalidate_disk(ns->disk)) {
 +                      if (revalidate_disk(ns->disk))
                                nvme_ns_remove(ns);
 -                              nvme_free_namespace(ns);
 -                      }
                } else
                        nvme_alloc_ns(dev, i);
        }
        list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
 -              if (ns->ns_id > nn) {
 +              if (ns->ns_id > nn)
                        nvme_ns_remove(ns);
 -                      nvme_free_namespace(ns);
 -              }
        }
        list_sort(NULL, &dev->namespaces, ns_cmp);
  }
@@@ -2924,9 -2822,9 +2924,9 @@@ static void nvme_dev_shutdown(struct nv
  
  static void nvme_dev_remove(struct nvme_dev *dev)
  {
 -      struct nvme_ns *ns;
 +      struct nvme_ns *ns, *next;
  
 -      list_for_each_entry(ns, &dev->namespaces, list)
 +      list_for_each_entry_safe(ns, next, &dev->namespaces, list)
                nvme_ns_remove(ns);
  }
  
@@@ -2982,12 -2880,21 +2982,12 @@@ static void nvme_release_instance(struc
        spin_unlock(&dev_list_lock);
  }
  
 -static void nvme_free_namespaces(struct nvme_dev *dev)
 -{
 -      struct nvme_ns *ns, *next;
 -
 -      list_for_each_entry_safe(ns, next, &dev->namespaces, list)
 -              nvme_free_namespace(ns);
 -}
 -
  static void nvme_free_dev(struct kref *kref)
  {
        struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
  
        put_device(dev->dev);
        put_device(dev->device);
 -      nvme_free_namespaces(dev);
        nvme_release_instance(dev);
        if (dev->tagset.tags)
                blk_mq_free_tag_set(&dev->tagset);
@@@ -3061,15 -2968,14 +3061,15 @@@ static const struct file_operations nvm
        .compat_ioctl   = nvme_dev_ioctl,
  };
  
 -static int nvme_dev_start(struct nvme_dev *dev)
 +static void nvme_probe_work(struct work_struct *work)
  {
 -      int result;
 +      struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
        bool start_thread = false;
 +      int result;
  
        result = nvme_dev_map(dev);
        if (result)
 -              return result;
 +              goto out;
  
        result = nvme_configure_admin_queue(dev);
        if (result)
                goto free_tags;
  
        dev->event_limit = 1;
 -      return result;
 +
 +      /*
 +       * Keep the controller around but remove all namespaces if we don't have
 +       * any working I/O queue.
 +       */
 +      if (dev->online_queues < 2) {
 +              dev_warn(dev->dev, "IO queues not created\n");
 +              nvme_dev_remove(dev);
 +      } else {
 +              nvme_unfreeze_queues(dev);
 +              nvme_dev_add(dev);
 +      }
 +
 +      return;
  
   free_tags:
        nvme_dev_remove_admin(dev);
        nvme_dev_list_remove(dev);
   unmap:
        nvme_dev_unmap(dev);
 -      return result;
 + out:
 +      if (!work_busy(&dev->reset_work))
 +              nvme_dead_ctrl(dev);
  }
  
  static int nvme_remove_dead_ctrl(void *arg)
        return 0;
  }
  
 -static void nvme_remove_disks(struct work_struct *ws)
 -{
 -      struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
 -
 -      nvme_free_queues(dev, 1);
 -      nvme_dev_remove(dev);
 -}
 -
 -static int nvme_dev_resume(struct nvme_dev *dev)
 -{
 -      int ret;
 -
 -      ret = nvme_dev_start(dev);
 -      if (ret)
 -              return ret;
 -      if (dev->online_queues < 2) {
 -              spin_lock(&dev_list_lock);
 -              dev->reset_workfn = nvme_remove_disks;
 -              queue_work(nvme_workq, &dev->reset_work);
 -              spin_unlock(&dev_list_lock);
 -      } else {
 -              nvme_unfreeze_queues(dev);
 -              nvme_dev_add(dev);
 -      }
 -      return 0;
 -}
 -
  static void nvme_dead_ctrl(struct nvme_dev *dev)
  {
        dev_warn(dev->dev, "Device failed to resume\n");
        }
  }
  
 -static void nvme_dev_reset(struct nvme_dev *dev)
 +static void nvme_reset_work(struct work_struct *ws)
  {
 +      struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
        bool in_probe = work_busy(&dev->probe_work);
  
        nvme_dev_shutdown(dev);
        schedule_work(&dev->probe_work);
  }
  
 -static void nvme_reset_failed_dev(struct work_struct *ws)
 +static int __nvme_reset(struct nvme_dev *dev)
  {
 -      struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
 -      nvme_dev_reset(dev);
 -}
 -
 -static void nvme_reset_workfn(struct work_struct *work)
 -{
 -      struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
 -      dev->reset_workfn(work);
 +      if (work_pending(&dev->reset_work))
 +              return -EBUSY;
 +      list_del_init(&dev->node);
 +      queue_work(nvme_workq, &dev->reset_work);
 +      return 0;
  }
  
  static int nvme_reset(struct nvme_dev *dev)
  {
 -      int ret = -EBUSY;
 +      int ret;
  
        if (!dev->admin_q || blk_queue_dying(dev->admin_q))
                return -ENODEV;
  
        spin_lock(&dev_list_lock);
 -      if (!work_pending(&dev->reset_work)) {
 -              dev->reset_workfn = nvme_reset_failed_dev;
 -              queue_work(nvme_workq, &dev->reset_work);
 -              ret = 0;
 -      }
 +      ret = __nvme_reset(dev);
        spin_unlock(&dev_list_lock);
  
        if (!ret) {
@@@ -3223,6 -3147,7 +3223,6 @@@ static ssize_t nvme_sysfs_reset(struct 
  }
  static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
  
 -static void nvme_async_probe(struct work_struct *work);
  static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  {
        int node, result = -ENOMEM;
                goto free;
  
        INIT_LIST_HEAD(&dev->namespaces);
 -      dev->reset_workfn = nvme_reset_failed_dev;
 -      INIT_WORK(&dev->reset_work, nvme_reset_workfn);
 +      INIT_WORK(&dev->reset_work, nvme_reset_work);
        dev->dev = get_device(&pdev->dev);
        pci_set_drvdata(pdev, dev);
        result = nvme_set_instance(dev);
  
        INIT_LIST_HEAD(&dev->node);
        INIT_WORK(&dev->scan_work, nvme_dev_scan);
 -      INIT_WORK(&dev->probe_work, nvme_async_probe);
 +      INIT_WORK(&dev->probe_work, nvme_probe_work);
        schedule_work(&dev->probe_work);
        return 0;
  
        return result;
  }
  
 -static void nvme_async_probe(struct work_struct *work)
 -{
 -      struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
 -
 -      if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work))
 -              nvme_dead_ctrl(dev);
 -}
 -
  static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
  {
        struct nvme_dev *dev = pci_get_drvdata(pdev);
        if (prepare)
                nvme_dev_shutdown(dev);
        else
 -              nvme_dev_resume(dev);
 +              schedule_work(&dev->probe_work);
  }
  
  static void nvme_shutdown(struct pci_dev *pdev)
@@@ -3354,7 -3288,10 +3354,7 @@@ static int nvme_resume(struct device *d
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);
  
 -      if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
 -              ndev->reset_workfn = nvme_reset_failed_dev;
 -              queue_work(nvme_workq, &ndev->reset_work);
 -      }
 +      schedule_work(&ndev->probe_work);
        return 0;
  }
  #endif
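
The nvme_pr_ops wired up above are not called directly; they are reached from user space through the block layer's persistent-reservation ioctls. Below is a minimal sketch of such a caller. It assumes the <linux/pr.h> uapi header from the same series, a hypothetical /dev/nvme0n1 namespace and an arbitrary example key, so treat it as an illustration rather than part of this merge.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/pr.h>		/* IOC_PR_*, struct pr_registration/pr_reservation */

int main(void)
{
	/* hypothetical namespace block device */
	int fd = open("/dev/nvme0n1", O_RDWR);
	struct pr_registration reg = { .old_key = 0, .new_key = 0x123abc };
	struct pr_reservation rsv = { .key = 0x123abc,
				      .type = PR_WRITE_EXCLUSIVE };

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* routed to ->pr_register(), i.e. an NVMe Reservation Register command */
	if (ioctl(fd, IOC_PR_REGISTER, &reg))
		perror("IOC_PR_REGISTER");
	/* routed to ->pr_reserve(), i.e. an NVMe Reservation Acquire command */
	if (ioctl(fd, IOC_PR_RESERVE, &rsv))
		perror("IOC_PR_RESERVE");
	return 0;
}
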
index ada724aab3d586ebb81193909a2518a1e68872fb,d4cdbf28dbb6701fae141493dbad5516eb5d4d8e..d4c3e5512dd54dbcf0b3f6d9a927a1e8bbb3fc5a
@@@ -27,7 -27,7 +27,7 @@@
  #include "ion_priv.h"
  
  static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN |
-                                    __GFP_NORETRY) & ~__GFP_WAIT;
+                                    __GFP_NORETRY) & ~__GFP_DIRECT_RECLAIM;
  static gfp_t low_order_gfp_flags  = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN);
  static const unsigned int orders[] = {8, 4, 0};
  static const int num_orders = ARRAY_SIZE(orders);
@@@ -185,11 -185,8 +185,11 @@@ static void ion_system_heap_free(struc
        struct scatterlist *sg;
        int i;
  
 -      /* uncached pages come from the page pools, zero them before returning
 -         for security purposes (other allocations are zerod at alloc time */
 +      /*
 +       * Uncached pages come from the page pools; zero them before
 +       * returning for security purposes (other allocations are
 +       * zeroed at alloc time).
 +       */
        if (!cached && !(buffer->private_flags & ION_PRIV_FLAG_SHRINKER_FREE))
                ion_heap_buffer_zero(buffer);
  
index 6af733de69caddbf7672b80e75e1827304e7b237,78bde2c11b50ba44928d0a27a2d50d2a2d9e6d05..f0b0423a716bd09252b74f5f9d22506fddc7c4dd
@@@ -79,7 -79,7 +79,7 @@@ do {                                                                  
  
  #define KLASSERT(e) LASSERT(e)
  
 -void lbug_with_loc(struct libcfs_debug_msg_data *)__attribute__((noreturn));
 +void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *);
  
  #define LBUG()                                                          \
  do {                                                              \
@@@ -95,7 -95,7 +95,7 @@@
  do {                                                                      \
        LASSERT(!in_interrupt() ||                                          \
                ((size) <= LIBCFS_VMALLOC_SIZE &&                           \
-                ((mask) & __GFP_WAIT) == 0));                              \
+                !gfpflags_allow_blocking(mask)));                          \
  } while (0)
  
  #define LIBCFS_ALLOC_POST(ptr, size)                                      \
index 47a1202fcbdf5117e66a8c78da95c80bbffdf459,29fff7f2a45a1f59334c3783d0f0c7ecc54001ec..8666f3ad24e9960bfb85b6814de6c277ae37b522
@@@ -134,8 -134,11 +134,8 @@@ int hfi1_create_ctxts(struct hfi1_devda
        dd->assigned_node_id = local_node_id;
  
        dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL);
 -      if (!dd->rcd) {
 -              dd_dev_err(dd,
 -                      "Unable to allocate receive context array, failing\n");
 +      if (!dd->rcd)
                goto nomem;
 -      }
  
        /* create one or more kernel contexts */
        for (i = 0; i < dd->first_user_ctxt; ++i) {
@@@ -290,14 -293,12 +290,14 @@@ struct hfi1_ctxtdata *hfi1_create_ctxtd
                 * The resulting value will be rounded down to the closest
                 * multiple of dd->rcv_entries.group_size.
                 */
 -              rcd->egrbufs.buffers = kzalloc(sizeof(*rcd->egrbufs.buffers) *
 -                                             rcd->egrbufs.count, GFP_KERNEL);
 +              rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
 +                                             sizeof(*rcd->egrbufs.buffers),
 +                                             GFP_KERNEL);
                if (!rcd->egrbufs.buffers)
                        goto bail;
 -              rcd->egrbufs.rcvtids = kzalloc(sizeof(*rcd->egrbufs.rcvtids) *
 -                                             rcd->egrbufs.count, GFP_KERNEL);
 +              rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
 +                                             sizeof(*rcd->egrbufs.rcvtids),
 +                                             GFP_KERNEL);
                if (!rcd->egrbufs.rcvtids)
                        goto bail;
                rcd->egrbufs.size = eager_buffer_size;
                if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
                        rcd->opstats = kzalloc(sizeof(*rcd->opstats),
                                GFP_KERNEL);
 -                      if (!rcd->opstats) {
 -                              dd_dev_err(dd,
 -                                         "ctxt%u: Unable to allocate per ctxt stats buffer\n",
 -                                         rcd->ctxt);
 +                      if (!rcd->opstats)
                                goto bail;
 -                      }
                }
        }
        return rcd;
@@@ -413,7 -418,6 +413,7 @@@ static enum hrtimer_restart cca_timer_f
        int sl;
        u16 ccti, ccti_timer, ccti_min;
        struct cc_state *cc_state;
 +      unsigned long flags;
  
        cca_timer = container_of(t, struct cca_timer, hrtimer);
        ppd = cca_timer->ppd;
        ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
        ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
  
 -      spin_lock(&ppd->cca_timer_lock);
 +      spin_lock_irqsave(&ppd->cca_timer_lock, flags);
  
        ccti = cca_timer->ccti;
  
                set_link_ipg(ppd);
        }
  
 -      spin_unlock(&ppd->cca_timer_lock);
 +      spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
  
        rcu_read_unlock();
  
@@@ -1046,8 -1050,8 +1046,8 @@@ struct hfi1_devdata *hfi1_alloc_devdata
        if (!hfi1_cpulist_count) {
                u32 count = num_online_cpus();
  
 -              hfi1_cpulist = kzalloc(BITS_TO_LONGS(count) *
 -                                    sizeof(long), GFP_KERNEL);
 +              hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
 +                                     GFP_KERNEL);
                if (hfi1_cpulist)
                        hfi1_cpulist_count = count;
                else
@@@ -1560,7 -1564,7 +1560,7 @@@ int hfi1_setup_eagerbufs(struct hfi1_ct
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
-       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
  
        /*
         * The minimum size of the eager buffers is a groups of MTU-sized
index 5d9b9dbd8fc44804fbf7a4256782ecc7d4f7630f,c11f6c58ce534df07c91f01c57a2ddbb8f1ba1f2..13c3cd11ab92a5c610ef7a5e32795a0b0a0931c9
@@@ -825,13 -825,13 +825,13 @@@ static void ipath_clean_part_key(struc
                                ipath_stats.sps_pkeys[j] =
                                        dd->ipath_pkeys[j] = 0;
                                pchanged++;
 +                      } else {
 +                              ipath_cdbg(VERBOSE, "p%u key %x matches #%d, "
 +                                         "but ref still %d\n", pd->port_port,
 +                                         pd->port_pkeys[i], j,
 +                                         atomic_read(&dd->ipath_pkeyrefs[j]));
 +                              break;
                        }
 -                      else ipath_cdbg(
 -                              VERBOSE, "p%u key %x matches #%d, "
 -                              "but ref still %d\n", pd->port_port,
 -                              pd->port_pkeys[i], j,
 -                              atomic_read(&dd->ipath_pkeyrefs[j]));
 -                      break;
                }
                pd->port_pkeys[i] = 0;
        }
@@@ -905,7 -905,7 +905,7 @@@ static int ipath_create_user_egr(struc
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
-       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
  
        egrcnt = dd->ipath_rcvegrcnt;
        /* TID number offset for this port */
@@@ -2046,6 -2046,7 +2046,6 @@@ static void unlock_expected_tids(struc
  
  static int ipath_close(struct inode *in, struct file *fp)
  {
 -      int ret = 0;
        struct ipath_filedata *fd;
        struct ipath_portdata *pd;
        struct ipath_devdata *dd;
  
  bail:
        kfree(fd);
 -      return ret;
 +      return 0;
  }
  
  static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
index c69b650b7bba67ef5f1a885027b17f36f44dd3d7,9d1e3b3d39ca977381cdddd0c2502a08d44ba1d5..223ccf89d2263fb6483a1a22116a3743c9d7ed8e
@@@ -2258,10 -2258,12 +2258,10 @@@ reset
                /* Disable the endpoints */
                if (fsg->bulk_in_enabled) {
                        usb_ep_disable(fsg->bulk_in);
 -                      fsg->bulk_in->driver_data = NULL;
                        fsg->bulk_in_enabled = 0;
                }
                if (fsg->bulk_out_enabled) {
                        usb_ep_disable(fsg->bulk_out);
 -                      fsg->bulk_out->driver_data = NULL;
                        fsg->bulk_out_enabled = 0;
                }
  
@@@ -2345,7 -2347,6 +2345,6 @@@ static void fsg_disable(struct usb_func
  
  static void handle_exception(struct fsg_common *common)
  {
-       siginfo_t               info;
        int                     i;
        struct fsg_buffhd       *bh;
        enum fsg_state          old_state;
         * into a high-priority EXIT exception.
         */
        for (;;) {
-               int sig =
-                       dequeue_signal_lock(current, &current->blocked, &info);
+               int sig = kernel_dequeue_signal(NULL);
                if (!sig)
                        break;
                if (sig != SIGUSR1) {
@@@ -2660,12 -2660,10 +2658,12 @@@ EXPORT_SYMBOL_GPL(fsg_common_put)
  /* check if fsg_num_buffers is within a valid range */
  static inline int fsg_num_buffers_validate(unsigned int fsg_num_buffers)
  {
 -      if (fsg_num_buffers >= 2 && fsg_num_buffers <= 4)
 +#define FSG_MAX_NUM_BUFFERS   32
 +
 +      if (fsg_num_buffers >= 2 && fsg_num_buffers <= FSG_MAX_NUM_BUFFERS)
                return 0;
        pr_err("fsg_num_buffers %u is out of range (%d to %d)\n",
 -             fsg_num_buffers, 2, 4);
 +             fsg_num_buffers, 2, FSG_MAX_NUM_BUFFERS);
        return -EINVAL;
  }
  
@@@ -3070,11 -3068,13 +3068,11 @@@ static int fsg_bind(struct usb_configur
        ep = usb_ep_autoconfig(gadget, &fsg_fs_bulk_in_desc);
        if (!ep)
                goto autoconf_fail;
 -      ep->driver_data = fsg->common;  /* claim the endpoint */
        fsg->bulk_in = ep;
  
        ep = usb_ep_autoconfig(gadget, &fsg_fs_bulk_out_desc);
        if (!ep)
                goto autoconf_fail;
 -      ep->driver_data = fsg->common;  /* claim the endpoint */
        fsg->bulk_out = ep;
  
        /* Assume endpoint addresses are the same for both speeds */
@@@ -3142,6 -3142,9 +3140,6 @@@ static inline struct fsg_opts *to_fsg_o
                            func_inst.group);
  }
  
 -CONFIGFS_ATTR_STRUCT(fsg_lun_opts);
 -CONFIGFS_ATTR_OPS(fsg_lun_opts);
 -
  static void fsg_lun_attr_release(struct config_item *item)
  {
        struct fsg_lun_opts *lun_opts;
  
  static struct configfs_item_operations fsg_lun_item_ops = {
        .release                = fsg_lun_attr_release,
 -      .show_attribute         = fsg_lun_opts_attr_show,
 -      .store_attribute        = fsg_lun_opts_attr_store,
  };
  
 -static ssize_t fsg_lun_opts_file_show(struct fsg_lun_opts *opts, char *page)
 +static ssize_t fsg_lun_opts_file_show(struct config_item *item, char *page)
  {
 -      struct fsg_opts *fsg_opts;
 -
 -      fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
 +      struct fsg_lun_opts *opts = to_fsg_lun_opts(item);
 +      struct fsg_opts *fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
  
        return fsg_show_file(opts->lun, &fsg_opts->common->filesem, page);
  }
  
 -static ssize_t fsg_lun_opts_file_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_file_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      struct fsg_opts *fsg_opts;
 -
 -      fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
 +      struct fsg_lun_opts *opts = to_fsg_lun_opts(item);
 +      struct fsg_opts *fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
  
        return fsg_store_file(opts->lun, &fsg_opts->common->filesem, page, len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_file =
 -      __CONFIGFS_ATTR(file, S_IRUGO | S_IWUSR, fsg_lun_opts_file_show,
 -                      fsg_lun_opts_file_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, file);
  
 -static ssize_t fsg_lun_opts_ro_show(struct fsg_lun_opts *opts, char *page)
 +static ssize_t fsg_lun_opts_ro_show(struct config_item *item, char *page)
  {
 -      return fsg_show_ro(opts->lun, page);
 +      return fsg_show_ro(to_fsg_lun_opts(item)->lun, page);
  }
  
 -static ssize_t fsg_lun_opts_ro_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_ro_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      struct fsg_opts *fsg_opts;
 -
 -      fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
 +      struct fsg_lun_opts *opts = to_fsg_lun_opts(item);
 +      struct fsg_opts *fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
  
        return fsg_store_ro(opts->lun, &fsg_opts->common->filesem, page, len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_ro =
 -      __CONFIGFS_ATTR(ro, S_IRUGO | S_IWUSR, fsg_lun_opts_ro_show,
 -                      fsg_lun_opts_ro_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, ro);
  
 -static ssize_t fsg_lun_opts_removable_show(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_removable_show(struct config_item *item,
                                           char *page)
  {
 -      return fsg_show_removable(opts->lun, page);
 +      return fsg_show_removable(to_fsg_lun_opts(item)->lun, page);
  }
  
 -static ssize_t fsg_lun_opts_removable_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_removable_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      return fsg_store_removable(opts->lun, page, len);
 +      return fsg_store_removable(to_fsg_lun_opts(item)->lun, page, len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_removable =
 -      __CONFIGFS_ATTR(removable, S_IRUGO | S_IWUSR,
 -                      fsg_lun_opts_removable_show,
 -                      fsg_lun_opts_removable_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, removable);
  
 -static ssize_t fsg_lun_opts_cdrom_show(struct fsg_lun_opts *opts, char *page)
 +static ssize_t fsg_lun_opts_cdrom_show(struct config_item *item, char *page)
  {
 -      return fsg_show_cdrom(opts->lun, page);
 +      return fsg_show_cdrom(to_fsg_lun_opts(item)->lun, page);
  }
  
 -static ssize_t fsg_lun_opts_cdrom_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_cdrom_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      struct fsg_opts *fsg_opts;
 -
 -      fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
 +      struct fsg_lun_opts *opts = to_fsg_lun_opts(item);
 +      struct fsg_opts *fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
  
        return fsg_store_cdrom(opts->lun, &fsg_opts->common->filesem, page,
                               len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_cdrom =
 -      __CONFIGFS_ATTR(cdrom, S_IRUGO | S_IWUSR, fsg_lun_opts_cdrom_show,
 -                      fsg_lun_opts_cdrom_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, cdrom);
  
 -static ssize_t fsg_lun_opts_nofua_show(struct fsg_lun_opts *opts, char *page)
 +static ssize_t fsg_lun_opts_nofua_show(struct config_item *item, char *page)
  {
 -      return fsg_show_nofua(opts->lun, page);
 +      return fsg_show_nofua(to_fsg_lun_opts(item)->lun, page);
  }
  
 -static ssize_t fsg_lun_opts_nofua_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_nofua_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      return fsg_store_nofua(opts->lun, page, len);
 +      return fsg_store_nofua(to_fsg_lun_opts(item)->lun, page, len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_nofua =
 -      __CONFIGFS_ATTR(nofua, S_IRUGO | S_IWUSR, fsg_lun_opts_nofua_show,
 -                      fsg_lun_opts_nofua_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, nofua);
  
  static struct configfs_attribute *fsg_lun_attrs[] = {
 -      &fsg_lun_opts_file.attr,
 -      &fsg_lun_opts_ro.attr,
 -      &fsg_lun_opts_removable.attr,
 -      &fsg_lun_opts_cdrom.attr,
 -      &fsg_lun_opts_nofua.attr,
 +      &fsg_lun_opts_attr_file,
 +      &fsg_lun_opts_attr_ro,
 +      &fsg_lun_opts_attr_removable,
 +      &fsg_lun_opts_attr_cdrom,
 +      &fsg_lun_opts_attr_nofua,
        NULL,
  };
  
@@@ -3330,6 -3350,9 +3328,6 @@@ static void fsg_lun_drop(struct config_
        config_item_put(item);
  }
  
 -CONFIGFS_ATTR_STRUCT(fsg_opts);
 -CONFIGFS_ATTR_OPS(fsg_opts);
 -
  static void fsg_attr_release(struct config_item *item)
  {
        struct fsg_opts *opts = to_fsg_opts(item);
  
  static struct configfs_item_operations fsg_item_ops = {
        .release                = fsg_attr_release,
 -      .show_attribute         = fsg_opts_attr_show,
 -      .store_attribute        = fsg_opts_attr_store,
  };
  
 -static ssize_t fsg_opts_stall_show(struct fsg_opts *opts, char *page)
 +static ssize_t fsg_opts_stall_show(struct config_item *item, char *page)
  {
 +      struct fsg_opts *opts = to_fsg_opts(item);
        int result;
  
        mutex_lock(&opts->lock);
        return result;
  }
  
 -static ssize_t fsg_opts_stall_store(struct fsg_opts *opts, const char *page,
 +static ssize_t fsg_opts_stall_store(struct config_item *item, const char *page,
                                    size_t len)
  {
 +      struct fsg_opts *opts = to_fsg_opts(item);
        int ret;
        bool stall;
  
        return ret;
  }
  
 -static struct fsg_opts_attribute fsg_opts_stall =
 -      __CONFIGFS_ATTR(stall, S_IRUGO | S_IWUSR, fsg_opts_stall_show,
 -                      fsg_opts_stall_store);
 +CONFIGFS_ATTR(fsg_opts_, stall);
  
  #ifdef CONFIG_USB_GADGET_DEBUG_FILES
 -static ssize_t fsg_opts_num_buffers_show(struct fsg_opts *opts, char *page)
 +static ssize_t fsg_opts_num_buffers_show(struct config_item *item, char *page)
  {
 +      struct fsg_opts *opts = to_fsg_opts(item);
        int result;
  
        mutex_lock(&opts->lock);
        return result;
  }
  
 -static ssize_t fsg_opts_num_buffers_store(struct fsg_opts *opts,
 +static ssize_t fsg_opts_num_buffers_store(struct config_item *item,
                                          const char *page, size_t len)
  {
 +      struct fsg_opts *opts = to_fsg_opts(item);
        int ret;
        u8 num;
  
        return ret;
  }
  
 -static struct fsg_opts_attribute fsg_opts_num_buffers =
 -      __CONFIGFS_ATTR(num_buffers, S_IRUGO | S_IWUSR,
 -                      fsg_opts_num_buffers_show,
 -                      fsg_opts_num_buffers_store);
 -
 +CONFIGFS_ATTR(fsg_opts_, num_buffers);
  #endif
  
  static struct configfs_attribute *fsg_attrs[] = {
 -      &fsg_opts_stall.attr,
 +      &fsg_opts_attr_stall,
  #ifdef CONFIG_USB_GADGET_DEBUG_FILES
 -      &fsg_opts_num_buffers.attr,
 +      &fsg_opts_attr_num_buffers,
  #endif
        NULL,
  };
index 0a94895a358d47e8e51cf32722afbdce0d6104d9,67b3b9d9dfd13c471ac31908737a0d30e9284a78..692ccc69345e4a9998246a53b6af817a56a5d435
@@@ -2244,8 -2244,9 +2244,8 @@@ static int u132_urb_enqueue(struct usb_
  {
        struct u132 *u132 = hcd_to_u132(hcd);
        if (irqs_disabled()) {
-               if (__GFP_WAIT & mem_flags) {
+               if (gfpflags_allow_blocking(mem_flags)) {
 -                      printk(KERN_ERR "invalid context for function that migh"
 -                              "t sleep\n");
 +                      printk(KERN_ERR "invalid context for function that might sleep\n");
                        return -EINVAL;
                }
        }
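
Several hunks above (u132-hcd here, the lustre LIBCFS check, and the GFP masks in ion, mtd, nvme, hfi1 and ipath) replace open-coded tests of __GFP_WAIT with gfpflags_allow_blocking() or with the new __GFP_DIRECT_RECLAIM/__GFP_RECLAIM names. For reference, the helper they switch to boils down to a single flag test; the sketch below mirrors the include/linux/gfp.h definition from the gfp rework this merge sits on top of, and is included only as an illustration, not as part of this merge.

/*
 * Sketch, assuming the __GFP_WAIT -> __GFP_RECLAIM rework: an allocation
 * may block iff direct reclaim is permitted.  __GFP_RECLAIM itself is the
 * union of direct and kswapd reclaim.
 */
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
	return (bool)(gfp_flags & __GFP_DIRECT_RECLAIM);
}
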
diff --combined fs/9p/vfs_file.c
index f23fd86697ea5ed4234ff96a8dd236b8884e6026,6b747394f6f566dfbd7e2608d1690f88f81e4206..7bf835f85bc822ef1119b639be82619af066d326
@@@ -161,7 -161,7 +161,7 @@@ static int v9fs_file_do_lock(struct fil
        if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
                BUG();
  
 -      res = posix_lock_file_wait(filp, fl);
 +      res = locks_lock_file_wait(filp, fl);
        if (res < 0)
                goto out;
  
@@@ -231,7 -231,8 +231,8 @@@ out_unlock
        if (res < 0 && fl->fl_type != F_UNLCK) {
                fl_type = fl->fl_type;
                fl->fl_type = F_UNLCK;
-               res = locks_lock_file_wait(filp, fl);
+               /* Even if this fails we want to return the remote error */
 -              posix_lock_file_wait(filp, fl);
++              locks_lock_file_wait(filp, fl);
                fl->fl_type = fl_type;
        }
  out:
diff --combined fs/cifs/file.c
index 47c5c97e2dd31c20663f1fa0584da2c3f87722bd,2d319e66b8f84ea0e78c37e5ffdf5782bdd3aa4e..0a2752b79e72cc2b7a083894843a8b3ae1dea23d
@@@ -1553,7 -1553,7 +1553,7 @@@ cifs_setlk(struct file *file, struct fi
  
  out:
        if (flock->fl_flags & FL_POSIX && !rc)
 -              rc = posix_lock_file_wait(file, flock);
 +              rc = locks_lock_file_wait(file, flock);
        return rc;
  }
  
@@@ -3380,7 -3380,7 +3380,7 @@@ readpages_get_pages(struct address_spac
        struct page *page, *tpage;
        unsigned int expected_index;
        int rc;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+       gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
  
        INIT_LIST_HEAD(tmplist);
  
         * should have access to this page, we're safe to simply set
         * PG_locked without checking it first.
         */
-       __set_page_locked(page);
+       __SetPageLocked(page);
        rc = add_to_page_cache_locked(page, mapping,
                                      page->index, gfp);
  
        /* give up if we can't stick it in the cache */
        if (rc) {
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
                return rc;
        }
  
                if (*bytes + PAGE_CACHE_SIZE > rsize)
                        break;
  
-               __set_page_locked(page);
+               __SetPageLocked(page);
                if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
-                       __clear_page_locked(page);
+                       __ClearPageLocked(page);
                        break;
                }
                list_move_tail(&page->lru, tmplist);
diff --combined fs/coredump.c
index 53d7d46c55c82c58c321895225faaad8e9a2e6e5,1777331eee767fa323cb864fb95131983eaad588..b3c153ca435d24fdbdfcb909228b6b4787bb63f2
@@@ -32,7 -32,6 +32,7 @@@
  #include <linux/pipe_fs_i.h>
  #include <linux/oom.h>
  #include <linux/compat.h>
 +#include <linux/timekeeping.h>
  
  #include <asm/uaccess.h>
  #include <asm/mmu_context.h>
@@@ -233,10 -232,9 +233,10 @@@ static int format_corename(struct core_
                                break;
                        /* UNIX time of coredump */
                        case 't': {
 -                              struct timeval tv;
 -                              do_gettimeofday(&tv);
 -                              err = cn_printf(cn, "%lu", tv.tv_sec);
 +                              time64_t time;
 +
 +                              time = ktime_get_real_seconds();
 +                              err = cn_printf(cn, "%lld", time);
                                break;
                        }
                        /* hostname */
@@@ -282,23 -280,24 +282,24 @@@ out
        return ispipe;
  }
  
- static int zap_process(struct task_struct *start, int exit_code)
+ static int zap_process(struct task_struct *start, int exit_code, int flags)
  {
        struct task_struct *t;
        int nr = 0;
  
+       /* ignore all signals except SIGKILL, see prepare_signal() */
+       start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
        start->signal->group_exit_code = exit_code;
        start->signal->group_stop_count = 0;
  
-       t = start;
-       do {
+       for_each_thread(start, t) {
                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                if (t != current && t->mm) {
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
                        nr++;
                }
-       } while_each_thread(start, t);
+       }
  
        return nr;
  }
@@@ -313,10 -312,8 +314,8 @@@ static int zap_threads(struct task_stru
        spin_lock_irq(&tsk->sighand->siglock);
        if (!signal_group_exit(tsk->signal)) {
                mm->core_state = core_state;
-               nr = zap_process(tsk, exit_code);
                tsk->signal->group_exit_task = tsk;
-               /* ignore all signals except SIGKILL, see prepare_signal() */
-               tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+               nr = zap_process(tsk, exit_code, 0);
                clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
        }
        spin_unlock_irq(&tsk->sighand->siglock);
                        continue;
                if (g->flags & PF_KTHREAD)
                        continue;
-               p = g;
-               do {
-                       if (p->mm) {
-                               if (unlikely(p->mm == mm)) {
-                                       lock_task_sighand(p, &flags);
-                                       nr += zap_process(p, exit_code);
-                                       p->signal->flags = SIGNAL_GROUP_EXIT;
-                                       unlock_task_sighand(p, &flags);
-                               }
-                               break;
+               for_each_thread(g, p) {
+                       if (unlikely(!p->mm))
+                               continue;
+                       if (unlikely(p->mm == mm)) {
+                               lock_task_sighand(p, &flags);
+                               nr += zap_process(p, exit_code,
+                                                       SIGNAL_GROUP_EXIT);
+                               unlock_task_sighand(p, &flags);
                        }
-               } while_each_thread(g, p);
+                       break;
+               }
        }
        rcu_read_unlock();
  done:
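
The coredump hunks above fold the SIGNAL_GROUP_COREDUMP/SIGNAL_GROUP_EXIT flag assignment into zap_process() and replace the do/while_each_thread() loops with for_each_thread(); the %t corename specifier also moves from do_gettimeofday() to the y2038-safe ktime_get_real_seconds(). The thread-walk idiom, reduced to a minimal sketch (hypothetical helper, not the real zap_process()):

    /*
     * Count the other threads in start's group that still own an mm,
     * using the plain for_each_thread() iterator the hunks switch to.
     */
    static int count_other_threads_with_mm(struct task_struct *start)
    {
            struct task_struct *t;
            int nr = 0;

            for_each_thread(start, t) {
                    if (t != current && t->mm)
                            nr++;
            }
            return nr;
    }
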
diff --combined fs/direct-io.c
index 3ae0e0427191c7849fc70301e58792515f5f181c,dbb94a2d6c504a08d0349d891ccbc2d8acc2f6a1..18e7554cf94cac57d1eb3dc17ba4001e75cef5f6
@@@ -120,7 -120,6 +120,7 @@@ struct dio 
        int page_errors;                /* errno from get_user_pages() */
        int is_async;                   /* is IO async ? */
        bool defer_completion;          /* defer AIO completion to workqueue? */
 +      bool should_dirty;              /* if pages should be dirtied */
        int io_error;                   /* IO error in completion path */
        unsigned long refcount;         /* direct_io_worker() and bios */
        struct bio *bio_list;           /* singly linked via bi_private */
@@@ -361,7 -360,7 +361,7 @@@ dio_bio_alloc(struct dio *dio, struct d
  
        /*
         * bio_alloc() is guaranteed to return a bio when called with
-        * __GFP_WAIT and we request a valid number of vectors.
+        * __GFP_RECLAIM and we request a valid number of vectors.
         */
        bio = bio_alloc(GFP_KERNEL, nr_vecs);
  
@@@ -394,7 -393,7 +394,7 @@@ static inline void dio_bio_submit(struc
        dio->refcount++;
        spin_unlock_irqrestore(&dio->bio_lock, flags);
  
 -      if (dio->is_async && dio->rw == READ)
 +      if (dio->is_async && dio->rw == READ && dio->should_dirty)
                bio_set_pages_dirty(bio);
  
        if (sdio->submit_io)
@@@ -465,15 -464,14 +465,15 @@@ static int dio_bio_complete(struct dio 
        if (bio->bi_error)
                dio->io_error = -EIO;
  
 -      if (dio->is_async && dio->rw == READ) {
 +      if (dio->is_async && dio->rw == READ && dio->should_dirty) {
                bio_check_pages_dirty(bio);     /* transfers ownership */
                err = bio->bi_error;
        } else {
                bio_for_each_segment_all(bvec, bio, i) {
                        struct page *page = bvec->bv_page;
  
 -                      if (dio->rw == READ && !PageCompound(page))
 +                      if (dio->rw == READ && !PageCompound(page) &&
 +                                      dio->should_dirty)
                                set_page_dirty_lock(page);
                        page_cache_release(page);
                }
@@@ -1221,7 -1219,6 +1221,7 @@@ do_blockdev_direct_IO(struct kiocb *ioc
        spin_lock_init(&dio->bio_lock);
        dio->refcount = 1;
  
 +      dio->should_dirty = (iter->type == ITER_IOVEC);
        sdio.iter = iter;
        sdio.final_block_in_request =
                (offset + iov_iter_count(iter)) >> blkbits;
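
The new dio->should_dirty flag above restricts page dirtying to user-backed reads: only ITER_IOVEC iterators get the bio_set_pages_dirty()/set_page_dirty_lock() treatment, while kernel-backed ITER_BVEC/ITER_KVEC callers (the loop driver's direct-I/O path, for example) keep control of their own pages. The gate, condensed into a hypothetical helper for illustration:

    /*
     * Sketch only: a read needs its pages dirtied on completion only when
     * the data landed in user-supplied iovec pages.
     */
    static bool dio_should_dirty_pages(const struct iov_iter *iter, int rw)
    {
            return rw == READ && iter->type == ITER_IOVEC;
    }
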
diff --combined fs/ext4/inode.c
index e8d620a484f6a86bb684ce432888c329a429e948,60aaecd5598b32798cb114cbd473d5818af7c692..7d1aad1d9313155f3780cde3923710fe7f60ea1c
@@@ -378,7 -378,7 +378,7 @@@ static int __check_block_validity(struc
                                 "lblock %lu mapped to illegal pblock "
                                 "(length %d)", (unsigned long) map->m_lblk,
                                 map->m_len);
 -              return -EIO;
 +              return -EFSCORRUPTED;
        }
        return 0;
  }
@@@ -480,7 -480,7 +480,7 @@@ int ext4_map_blocks(handle_t *handle, s
  
        /* We can handle the block number less than EXT_MAX_BLOCKS */
        if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
 -              return -EIO;
 +              return -EFSCORRUPTED;
  
        /* Lookup extent status tree firstly */
        if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
@@@ -965,7 -965,7 +965,7 @@@ static int ext4_block_write_begin(struc
        if (unlikely(err))
                page_zero_new_buffers(page, from, to);
        else if (decrypt)
 -              err = ext4_decrypt_one(inode, page);
 +              err = ext4_decrypt(page);
        return err;
  }
  #endif
@@@ -1181,38 -1181,6 +1181,38 @@@ errout
        return ret ? ret : copied;
  }
  
 +/*
 + * This is a private version of page_zero_new_buffers() which doesn't
 + * set the buffer to be dirty, since in data=journalled mode we need
 + * to call ext4_handle_dirty_metadata() instead.
 + */
 +static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
 +{
 +      unsigned int block_start = 0, block_end;
 +      struct buffer_head *head, *bh;
 +
 +      bh = head = page_buffers(page);
 +      do {
 +              block_end = block_start + bh->b_size;
 +              if (buffer_new(bh)) {
 +                      if (block_end > from && block_start < to) {
 +                              if (!PageUptodate(page)) {
 +                                      unsigned start, size;
 +
 +                                      start = max(from, block_start);
 +                                      size = min(to, block_end) - start;
 +
 +                                      zero_user(page, start, size);
 +                                      set_buffer_uptodate(bh);
 +                              }
 +                              clear_buffer_new(bh);
 +                      }
 +              }
 +              block_start = block_end;
 +              bh = bh->b_this_page;
 +      } while (bh != head);
 +}
 +
  static int ext4_journalled_write_end(struct file *file,
                                     struct address_space *mapping,
                                     loff_t pos, unsigned len, unsigned copied,
                if (copied < len) {
                        if (!PageUptodate(page))
                                copied = 0;
 -                      page_zero_new_buffers(page, from+copied, to);
 +                      zero_new_buffers(page, from+copied, to);
                }
  
                ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
@@@ -1847,22 -1815,11 +1847,22 @@@ static int ext4_writepage(struct page *
         * the page. But we may reach here when we do a journal commit via
         * journal_submit_inode_data_buffers() and in that case we must write
         * allocated buffers to achieve data=ordered mode guarantees.
 +       *
 +       * Also, if there is only one buffer per page (the fs block
 +       * size == the page size), if one buffer needs block
 +       * allocation or needs to modify the extent tree to clear the
 +       * unwritten flag, we know that the page can't be written at
 +       * all, so we might as well refuse the write immediately.
 +       * Unfortunately if the block size != page size, we can't as
 +       * easily detect this case using ext4_walk_page_buffers(), but
 +       * for the extremely common case, this is an optimization that
 +       * skips a useless round trip through ext4_bio_write_page().
         */
        if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
                                   ext4_bh_delay_or_unwritten)) {
                redirty_page_for_writepage(wbc, page);
 -              if (current->flags & PF_MEMALLOC) {
 +              if ((current->flags & PF_MEMALLOC) ||
 +                  (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
                        /*
                         * For memory cleaning there's no point in writing only
                         * some buffers. So just bail out. Warn if we came here
@@@ -2642,7 -2599,8 +2642,7 @@@ static int ext4_nonda_switch(struct sup
  /* We always reserve for an inode update; the superblock could be there too */
  static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
  {
 -      if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
 -                              EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
 +      if (likely(ext4_has_feature_large_file(inode->i_sb)))
                return 1;
  
        if (pos + len <= 0x7fffffffULL)
@@@ -3386,7 -3344,7 +3386,7 @@@ static int __ext4_block_zero_page_range
        int err = 0;
  
        page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
-                                  mapping_gfp_mask(mapping) & ~__GFP_FS);
+                                  mapping_gfp_constraint(mapping, ~__GFP_FS));
        if (!page)
                return -ENOMEM;
  
                        /* We expect the key to be set. */
                        BUG_ON(!ext4_has_encryption_key(inode));
                        BUG_ON(blocksize != PAGE_CACHE_SIZE);
 -                      WARN_ON_ONCE(ext4_decrypt_one(inode, page));
 +                      WARN_ON_ONCE(ext4_decrypt(page));
                }
        }
        if (ext4_should_journal_data(inode)) {
@@@ -3862,7 -3820,7 +3862,7 @@@ static int __ext4_get_inode_loc(struct 
  
        iloc->bh = NULL;
        if (!ext4_valid_inum(sb, inode->i_ino))
 -              return -EIO;
 +              return -EFSCORRUPTED;
  
        iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
        gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
@@@ -4048,7 -4006,8 +4048,7 @@@ static blkcnt_t ext4_inode_blocks(struc
        struct inode *inode = &(ei->vfs_inode);
        struct super_block *sb = inode->i_sb;
  
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                              EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
 +      if (ext4_has_feature_huge_file(sb)) {
                /* we are using combined 48 bit field */
                i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
                                        le32_to_cpu(raw_inode->i_blocks_lo);
@@@ -4109,7 -4068,7 +4109,7 @@@ struct inode *ext4_iget(struct super_bl
                        EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
                                EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
                                EXT4_INODE_SIZE(inode->i_sb));
 -                      ret = -EIO;
 +                      ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
        } else
  
        if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
                EXT4_ERROR_INODE(inode, "checksum invalid");
 -              ret = -EIO;
 +              ret = -EFSBADCRC;
                goto bad_inode;
        }
  
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
 +      if (ext4_has_feature_64bit(sb))
                ei->i_file_acl |=
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
        inode->i_size = ext4_isize(raw_inode);
            !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
                EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
                                 ei->i_file_acl);
 -              ret = -EIO;
 +              ret = -EFSCORRUPTED;
                goto bad_inode;
        } else if (!ext4_has_inline_data(inode)) {
                if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
        } else if (ino == EXT4_BOOT_LOADER_INO) {
                make_bad_inode(inode);
        } else {
 -              ret = -EIO;
 +              ret = -EFSCORRUPTED;
                EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
                goto bad_inode;
        }
@@@ -4313,7 -4272,7 +4313,7 @@@ bad_inode
  struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
  {
        if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
 -              return ERR_PTR(-EIO);
 +              return ERR_PTR(-EFSCORRUPTED);
        return ext4_iget(sb, ino);
  }
  
@@@ -4335,7 -4294,7 +4335,7 @@@ static int ext4_inode_blocks_set(handle
                ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                return 0;
        }
 -      if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
 +      if (!ext4_has_feature_huge_file(sb))
                return -EFBIG;
  
        if (i_blocks <= 0xffffffffffffULL) {
@@@ -4496,7 -4455,8 +4496,7 @@@ static int ext4_do_update_inode(handle_
                need_datasync = 1;
        }
        if (ei->i_disksize > 0x7fffffffULL) {
 -              if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                              EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
 +              if (!ext4_has_feature_large_file(sb) ||
                                EXT4_SB(sb)->s_es->s_rev_level ==
                    cpu_to_le32(EXT4_GOOD_OLD_REV))
                        set_large_file = 1;
                if (err)
                        goto out_brelse;
                ext4_update_dynamic_rev(sb);
 -              EXT4_SET_RO_COMPAT_FEATURE(sb,
 -                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 +              ext4_set_feature_large_file(sb);
                ext4_handle_sync(handle);
                err = ext4_handle_dirty_super(handle, sb);
        }
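
Two themes run through the fs/ext4/inode.c hunks above: feature tests move from the EXT4_HAS_*_FEATURE() macros to per-feature ext4_has_feature_*() helpers, and corruption-type failures return -EFSCORRUPTED or -EFSBADCRC (aliases of EUCLEAN and EBADMSG) instead of a blanket -EIO, so the cause is distinguishable without changing what userspace sees. The helpers are presumably thin wrappers over the same superblock flag tests, roughly:

    /* Assumed shape of one generated helper, mirroring the macro it replaces. */
    static inline bool ext4_has_feature_huge_file(struct super_block *sb)
    {
            return (EXT4_SB(sb)->s_es->s_feature_ro_compat &
                    cpu_to_le32(EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) != 0;
    }
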
diff --combined fs/ext4/readpage.c
index d94af71a4e7fcabd1783f28cccb86363473fe3ae,1061611ae14dd6c66878584820528d757cfea399..5dc5e95063de2a7e42749a94464f00f7c50be4b8
@@@ -62,7 -62,7 +62,7 @@@ static void completion_pages(struct wor
        bio_for_each_segment_all(bv, bio, i) {
                struct page *page = bv->bv_page;
  
 -              int ret = ext4_decrypt(ctx, page);
 +              int ret = ext4_decrypt(page);
                if (ret) {
                        WARN_ON_ONCE(1);
                        SetPageError(page);
@@@ -166,7 -166,7 +166,7 @@@ int ext4_mpage_readpages(struct address
                        page = list_entry(pages->prev, struct page, lru);
                        list_del(&page->lru);
                        if (add_to_page_cache_lru(page, mapping, page->index,
-                                       GFP_KERNEL & mapping_gfp_mask(mapping)))
+                                 mapping_gfp_constraint(mapping, GFP_KERNEL)))
                                goto next_page;
                }
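
The readpages hunk above applies the same gfp conversion as in cifs. The helper is small enough to restate; its assumed definition simply intersects the caller's mask with what the mapping allows:

    static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                                               gfp_t gfp_mask)
    {
            return mapping_gfp_mask(mapping) & gfp_mask;
    }
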
  
diff --combined fs/ext4/super.c
index 04d0f1b334096525030674c6d818e124a1bd81f3,49f6c78ee3afe26aa45941d443c5ea242ef95676..753f4e68b820da0dd78fc7a7e3a66e529846ea0b
@@@ -34,6 -34,7 +34,6 @@@
  #include <linux/namei.h>
  #include <linux/quotaops.h>
  #include <linux/seq_file.h>
 -#include <linux/proc_fs.h>
  #include <linux/ctype.h>
  #include <linux/log2.h>
  #include <linux/crc16.h>
  #define CREATE_TRACE_POINTS
  #include <trace/events/ext4.h>
  
 -static struct proc_dir_entry *ext4_proc_root;
 -static struct kset *ext4_kset;
  static struct ext4_lazy_init *ext4_li_info;
  static struct mutex ext4_li_mtx;
 -static struct ext4_features *ext4_feat;
  static int ext4_mballoc_ready;
  static struct ratelimit_state ext4_mount_msg_ratelimit;
  
@@@ -79,6 -83,7 +79,6 @@@ static int ext4_feature_set_ok(struct s
  static void ext4_destroy_lazyinit_thread(void);
  static void ext4_unregister_li_request(struct super_block *sb);
  static void ext4_clear_request_list(void);
 -static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
  
  #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
  static struct file_system_type ext2_fs_type = {
@@@ -110,7 -115,8 +110,7 @@@ MODULE_ALIAS("ext3")
  static int ext4_verify_csum_type(struct super_block *sb,
                                 struct ext4_super_block *es)
  {
 -      if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
 +      if (!ext4_has_feature_metadata_csum(sb))
                return 1;
  
        return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
@@@ -388,13 -394,9 +388,13 @@@ static void ext4_handle_error(struct su
                smp_wmb();
                sb->s_flags |= MS_RDONLY;
        }
 -      if (test_opt(sb, ERRORS_PANIC))
 +      if (test_opt(sb, ERRORS_PANIC)) {
 +              if (EXT4_SB(sb)->s_journal &&
 +                !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 +                      return;
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
 +      }
  }
  
  #define ext4_error_ratelimit(sb)                                      \
@@@ -493,12 -495,6 +493,12 @@@ const char *ext4_decode_error(struct su
        char *errstr = NULL;
  
        switch (errno) {
 +      case -EFSCORRUPTED:
 +              errstr = "Corrupt filesystem";
 +              break;
 +      case -EFSBADCRC:
 +              errstr = "Filesystem failed CRC";
 +              break;
        case -EIO:
                errstr = "IO failure";
                break;
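
The errors=panic changes above and in __ext4_abort() below defer the panic until jbd2 has had a chance to record the error in the journal superblock, signalled by JBD2_REC_ERR in j_flags, so the failure remains visible after the reboot. The assumed jbd2-side counterpart, sketched:

    /*
     * Sketch (assumed, not shown in this diff): record the errno in the
     * journal superblock, then mark the journal so the panic paths above
     * are allowed to proceed.
     */
    static void journal_record_err_sketch(journal_t *journal)
    {
            jbd2_journal_update_sb_errno(journal);
            write_lock(&journal->j_state_lock);
            journal->j_flags |= JBD2_REC_ERR;
            write_unlock(&journal->j_state_lock);
    }
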
@@@ -589,12 -585,8 +589,12 @@@ void __ext4_abort(struct super_block *s
                        jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
                save_error_info(sb, function, line);
        }
 -      if (test_opt(sb, ERRORS_PANIC))
 +      if (test_opt(sb, ERRORS_PANIC)) {
 +              if (EXT4_SB(sb)->s_journal &&
 +                !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 +                      return;
                panic("EXT4-fs panic from previous error\n");
 +      }
  }
  
  void __ext4_msg(struct super_block *sb,
@@@ -808,7 -800,6 +808,7 @@@ static void ext4_put_super(struct super
                        ext4_abort(sb, "Couldn't clean up the journal");
        }
  
 +      ext4_unregister_sysfs(sb);
        ext4_es_unregister_shrinker(sbi);
        del_timer_sync(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_xattr_put_super(sb);
  
        if (!(sb->s_flags & MS_RDONLY)) {
 -              EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_clear_feature_journal_needs_recovery(sb);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
        }
        if (!(sb->s_flags & MS_RDONLY))
                ext4_commit_super(sb, 1);
  
 -      if (sbi->s_proc) {
 -              remove_proc_entry("options", sbi->s_proc);
 -              remove_proc_entry(sb->s_id, ext4_proc_root);
 -      }
 -      kobject_del(&sbi->s_kobj);
 -
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(sbi->s_group_desc[i]);
        kvfree(sbi->s_group_desc);
@@@ -1061,7 -1058,7 +1061,7 @@@ static int bdev_try_to_free_page(struc
                return 0;
        if (journal)
                return jbd2_journal_try_to_free_buffers(journal, page,
-                                                       wait & ~__GFP_WAIT);
+                                               wait & ~__GFP_DIRECT_RECLAIM);
        return try_to_free_buffers(page);
  }
  
@@@ -1291,7 -1288,7 +1291,7 @@@ static int set_qf_name(struct super_blo
                        "quota options when quota turned on");
                return -1;
        }
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
 +      if (ext4_has_feature_quota(sb)) {
                ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
                         "when QUOTA feature is enabled");
                return -1;
@@@ -1384,10 -1381,10 +1384,10 @@@ static const struct mount_opts 
        {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
 -       MOPT_EXT4_ONLY | MOPT_SET},
 +       MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
                                    EXT4_MOUNT_JOURNAL_CHECKSUM),
 -       MOPT_EXT4_ONLY | MOPT_SET},
 +       MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
        {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
        {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
@@@ -1516,14 -1513,8 +1516,14 @@@ static int handle_mount_opt(struct supe
                return -1;
        if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
                return -1;
 -      if (m->flags & MOPT_EXPLICIT)
 -              set_opt2(sb, EXPLICIT_DELALLOC);
 +      if (m->flags & MOPT_EXPLICIT) {
 +              if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
 +                      set_opt2(sb, EXPLICIT_DELALLOC);
 +              } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
 +                      set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
 +              } else
 +                      return -1;
 +      }
        if (m->flags & MOPT_CLEAR_ERR)
                clear_opt(sb, ERRORS_MASK);
        if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
                                 "quota options when quota turned on");
                        return -1;
                }
 -              if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                             EXT4_FEATURE_RO_COMPAT_QUOTA)) {
 +              if (ext4_has_feature_quota(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "Cannot set journaled quota options "
                                 "when QUOTA feature is enabled");
@@@ -1715,7 -1707,7 +1715,7 @@@ static int parse_options(char *options
                        return 0;
        }
  #ifdef CONFIG_QUOTA
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
 +      if (ext4_has_feature_quota(sb) &&
            (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
                ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
                         "feature is enabled");
@@@ -1888,7 -1880,7 +1888,7 @@@ static int ext4_show_options(struct seq
        return _ext4_show_options(seq, root->d_sb, 0);
  }
  
 -static int options_seq_show(struct seq_file *seq, void *offset)
 +int ext4_seq_options_show(struct seq_file *seq, void *offset)
  {
        struct super_block *sb = seq->private;
        int rc;
        return rc;
  }
  
 -static int options_open_fs(struct inode *inode, struct file *file)
 -{
 -      return single_open(file, options_seq_show, PDE_DATA(inode));
 -}
 -
 -static const struct file_operations ext4_seq_options_fops = {
 -      .owner = THIS_MODULE,
 -      .open = options_open_fs,
 -      .read = seq_read,
 -      .llseek = seq_lseek,
 -      .release = single_release,
 -};
 -
  static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                            int read_only)
  {
        es->s_mtime = cpu_to_le32(get_seconds());
        ext4_update_dynamic_rev(sb);
        if (sbi->s_journal)
 -              EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_set_feature_journal_needs_recovery(sb);
  
        ext4_commit_super(sb, 1);
  done:
@@@ -2022,13 -2027,12 +2022,13 @@@ failed
        return 0;
  }
  
 -static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
 +static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
                                   struct ext4_group_desc *gdp)
  {
        int offset;
        __u16 crc = 0;
        __le32 le_group = cpu_to_le32(block_group);
 +      struct ext4_sb_info *sbi = EXT4_SB(sb);
  
        if (ext4_has_metadata_csum(sbi->s_sb)) {
                /* Use new metadata_csum algorithm */
        }
  
        /* old crc16 code */
 -      if (!(sbi->s_es->s_feature_ro_compat &
 -            cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
 +      if (!ext4_has_feature_gdt_csum(sb))
                return 0;
  
        offset = offsetof(struct ext4_group_desc, bg_checksum);
        crc = crc16(crc, (__u8 *)gdp, offset);
        offset += sizeof(gdp->bg_checksum); /* skip checksum */
        /* for checksum of struct ext4_group_desc do the rest...*/
 -      if ((sbi->s_es->s_feature_incompat &
 -           cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
 +      if (ext4_has_feature_64bit(sb) &&
            offset < le16_to_cpu(sbi->s_es->s_desc_size))
                crc = crc16(crc, (__u8 *)gdp + offset,
                            le16_to_cpu(sbi->s_es->s_desc_size) -
@@@ -2072,7 -2078,8 +2072,7 @@@ int ext4_group_desc_csum_verify(struct 
                                struct ext4_group_desc *gdp)
  {
        if (ext4_has_group_desc_csum(sb) &&
 -          (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
 -                                                    block_group, gdp)))
 +          (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
                return 0;
  
        return 1;
@@@ -2083,7 -2090,7 +2083,7 @@@ void ext4_group_desc_csum_set(struct su
  {
        if (!ext4_has_group_desc_csum(sb))
                return;
 -      gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
 +      gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
  }
  
  /* Called at mount-time, super-block is locked */
@@@ -2099,7 -2106,7 +2099,7 @@@ static int ext4_check_descriptors(struc
        int flexbg_flag = 0;
        ext4_group_t i, grp = sbi->s_groups_count;
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 +      if (ext4_has_feature_flex_bg(sb))
                flexbg_flag = 1;
  
        ext4_debug("Checking group descriptors");
                if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Checksum for group %u failed (%u!=%u)",
 -                               i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 +                               i, le16_to_cpu(ext4_group_desc_csum(sb, i,
                                     gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!(sb->s_flags & MS_RDONLY)) {
                                ext4_unlock_group(sb, i);
@@@ -2406,7 -2413,8 +2406,7 @@@ static ext4_fsblk_t descriptor_loc(stru
  
        first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
  
 -      if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
 -          nr < first_meta_bg)
 +      if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
                return logical_sb_block + nr + 1;
        bg = sbi->s_desc_per_block * nr;
        if (ext4_bg_has_super(sb, bg))
@@@ -2462,6 -2470,335 +2462,6 @@@ static unsigned long ext4_get_stripe_si
        return ret;
  }
  
 -/* sysfs supprt */
 -
 -struct ext4_attr {
 -      struct attribute attr;
 -      ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
 -      ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 -                       const char *, size_t);
 -      union {
 -              int offset;
 -              int deprecated_val;
 -      } u;
 -};
 -
 -static int parse_strtoull(const char *buf,
 -              unsigned long long max, unsigned long long *value)
 -{
 -      int ret;
 -
 -      ret = kstrtoull(skip_spaces(buf), 0, value);
 -      if (!ret && *value > max)
 -              ret = -EINVAL;
 -      return ret;
 -}
 -
 -static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
 -                                            struct ext4_sb_info *sbi,
 -                                            char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "%llu\n",
 -              (s64) EXT4_C2B(sbi,
 -                      percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
 -}
 -
 -static ssize_t session_write_kbytes_show(struct ext4_attr *a,
 -                                       struct ext4_sb_info *sbi, char *buf)
 -{
 -      struct super_block *sb = sbi->s_buddy_cache->i_sb;
 -
 -      if (!sb->s_bdev->bd_part)
 -              return snprintf(buf, PAGE_SIZE, "0\n");
 -      return snprintf(buf, PAGE_SIZE, "%lu\n",
 -                      (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 -                       sbi->s_sectors_written_start) >> 1);
 -}
 -
 -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 -                                        struct ext4_sb_info *sbi, char *buf)
 -{
 -      struct super_block *sb = sbi->s_buddy_cache->i_sb;
 -
 -      if (!sb->s_bdev->bd_part)
 -              return snprintf(buf, PAGE_SIZE, "0\n");
 -      return snprintf(buf, PAGE_SIZE, "%llu\n",
 -                      (unsigned long long)(sbi->s_kbytes_written +
 -                      ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 -                        EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 -}
 -
 -static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 -                                        struct ext4_sb_info *sbi,
 -                                        const char *buf, size_t count)
 -{
 -      unsigned long t;
 -      int ret;
 -
 -      ret = kstrtoul(skip_spaces(buf), 0, &t);
 -      if (ret)
 -              return ret;
 -
 -      if (t && (!is_power_of_2(t) || t > 0x40000000))
 -              return -EINVAL;
 -
 -      sbi->s_inode_readahead_blks = t;
 -      return count;
 -}
 -
 -static ssize_t sbi_ui_show(struct ext4_attr *a,
 -                         struct ext4_sb_info *sbi, char *buf)
 -{
 -      unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 -
 -      return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 -}
 -
 -static ssize_t sbi_ui_store(struct ext4_attr *a,
 -                          struct ext4_sb_info *sbi,
 -                          const char *buf, size_t count)
 -{
 -      unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 -      unsigned long t;
 -      int ret;
 -
 -      ret = kstrtoul(skip_spaces(buf), 0, &t);
 -      if (ret)
 -              return ret;
 -      *ui = t;
 -      return count;
 -}
 -
 -static ssize_t es_ui_show(struct ext4_attr *a,
 -                         struct ext4_sb_info *sbi, char *buf)
 -{
 -
 -      unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
 -                         a->u.offset);
 -
 -      return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 -}
 -
 -static ssize_t reserved_clusters_show(struct ext4_attr *a,
 -                                struct ext4_sb_info *sbi, char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "%llu\n",
 -              (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
 -}
 -
 -static ssize_t reserved_clusters_store(struct ext4_attr *a,
 -                                 struct ext4_sb_info *sbi,
 -                                 const char *buf, size_t count)
 -{
 -      unsigned long long val;
 -      int ret;
 -
 -      if (parse_strtoull(buf, -1ULL, &val))
 -              return -EINVAL;
 -      ret = ext4_reserve_clusters(sbi, val);
 -
 -      return ret ? ret : count;
 -}
 -
 -static ssize_t trigger_test_error(struct ext4_attr *a,
 -                                struct ext4_sb_info *sbi,
 -                                const char *buf, size_t count)
 -{
 -      int len = count;
 -
 -      if (!capable(CAP_SYS_ADMIN))
 -              return -EPERM;
 -
 -      if (len && buf[len-1] == '\n')
 -              len--;
 -
 -      if (len)
 -              ext4_error(sbi->s_sb, "%.*s", len, buf);
 -      return count;
 -}
 -
 -static ssize_t sbi_deprecated_show(struct ext4_attr *a,
 -                                 struct ext4_sb_info *sbi, char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
 -}
 -
 -#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 -static struct ext4_attr ext4_attr_##_name = {                 \
 -      .attr = {.name = __stringify(_name), .mode = _mode },   \
 -      .show   = _show,                                        \
 -      .store  = _store,                                       \
 -      .u = {                                                  \
 -              .offset = offsetof(struct ext4_sb_info, _elname),\
 -      },                                                      \
 -}
 -
 -#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)         \
 -static struct ext4_attr ext4_attr_##_name = {                         \
 -      .attr = {.name = __stringify(_name), .mode = _mode },           \
 -      .show   = _show,                                                \
 -      .store  = _store,                                               \
 -      .u = {                                                          \
 -              .offset = offsetof(struct ext4_super_block, _elname),   \
 -      },                                                              \
 -}
 -
 -#define EXT4_ATTR(name, mode, show, store) \
 -static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 -
 -#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 -#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 -#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 -
 -#define EXT4_RO_ATTR_ES_UI(name, elname)      \
 -      EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
 -#define EXT4_RW_ATTR_SBI_UI(name, elname)     \
 -      EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
 -
 -#define ATTR_LIST(name) &ext4_attr_##name.attr
 -#define EXT4_DEPRECATED_ATTR(_name, _val)     \
 -static struct ext4_attr ext4_attr_##_name = {                 \
 -      .attr = {.name = __stringify(_name), .mode = 0444 },    \
 -      .show   = sbi_deprecated_show,                          \
 -      .u = {                                                  \
 -              .deprecated_val = _val,                         \
 -      },                                                      \
 -}
 -
 -EXT4_RO_ATTR(delayed_allocation_blocks);
 -EXT4_RO_ATTR(session_write_kbytes);
 -EXT4_RO_ATTR(lifetime_write_kbytes);
 -EXT4_RW_ATTR(reserved_clusters);
 -EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 -               inode_readahead_blks_store, s_inode_readahead_blks);
 -EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 -EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 -EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 -EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 -EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 -EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
 -EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 -EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
 -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
 -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
 -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
 -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
 -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
 -EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 -EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
 -EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
 -
 -static struct attribute *ext4_attrs[] = {
 -      ATTR_LIST(delayed_allocation_blocks),
 -      ATTR_LIST(session_write_kbytes),
 -      ATTR_LIST(lifetime_write_kbytes),
 -      ATTR_LIST(reserved_clusters),
 -      ATTR_LIST(inode_readahead_blks),
 -      ATTR_LIST(inode_goal),
 -      ATTR_LIST(mb_stats),
 -      ATTR_LIST(mb_max_to_scan),
 -      ATTR_LIST(mb_min_to_scan),
 -      ATTR_LIST(mb_order2_req),
 -      ATTR_LIST(mb_stream_req),
 -      ATTR_LIST(mb_group_prealloc),
 -      ATTR_LIST(max_writeback_mb_bump),
 -      ATTR_LIST(extent_max_zeroout_kb),
 -      ATTR_LIST(trigger_fs_error),
 -      ATTR_LIST(err_ratelimit_interval_ms),
 -      ATTR_LIST(err_ratelimit_burst),
 -      ATTR_LIST(warning_ratelimit_interval_ms),
 -      ATTR_LIST(warning_ratelimit_burst),
 -      ATTR_LIST(msg_ratelimit_interval_ms),
 -      ATTR_LIST(msg_ratelimit_burst),
 -      ATTR_LIST(errors_count),
 -      ATTR_LIST(first_error_time),
 -      ATTR_LIST(last_error_time),
 -      NULL,
 -};
 -
 -/* Features this copy of ext4 supports */
 -EXT4_INFO_ATTR(lazy_itable_init);
 -EXT4_INFO_ATTR(batched_discard);
 -EXT4_INFO_ATTR(meta_bg_resize);
 -EXT4_INFO_ATTR(encryption);
 -
 -static struct attribute *ext4_feat_attrs[] = {
 -      ATTR_LIST(lazy_itable_init),
 -      ATTR_LIST(batched_discard),
 -      ATTR_LIST(meta_bg_resize),
 -      ATTR_LIST(encryption),
 -      NULL,
 -};
 -
 -static ssize_t ext4_attr_show(struct kobject *kobj,
 -                            struct attribute *attr, char *buf)
 -{
 -      struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 -                                              s_kobj);
 -      struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
 -
 -      return a->show ? a->show(a, sbi, buf) : 0;
 -}
 -
 -static ssize_t ext4_attr_store(struct kobject *kobj,
 -                             struct attribute *attr,
 -                             const char *buf, size_t len)
 -{
 -      struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 -                                              s_kobj);
 -      struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
 -
 -      return a->store ? a->store(a, sbi, buf, len) : 0;
 -}
 -
 -static void ext4_sb_release(struct kobject *kobj)
 -{
 -      struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 -                                              s_kobj);
 -      complete(&sbi->s_kobj_unregister);
 -}
 -
 -static const struct sysfs_ops ext4_attr_ops = {
 -      .show   = ext4_attr_show,
 -      .store  = ext4_attr_store,
 -};
 -
 -static struct kobj_type ext4_ktype = {
 -      .default_attrs  = ext4_attrs,
 -      .sysfs_ops      = &ext4_attr_ops,
 -      .release        = ext4_sb_release,
 -};
 -
 -static void ext4_feat_release(struct kobject *kobj)
 -{
 -      complete(&ext4_feat->f_kobj_unregister);
 -}
 -
 -static ssize_t ext4_feat_show(struct kobject *kobj,
 -                            struct attribute *attr, char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "supported\n");
 -}
 -
 -/*
 - * We can not use ext4_attr_show/store because it relies on the kobject
 - * being embedded in the ext4_sb_info structure which is definitely not
 - * true in this case.
 - */
 -static const struct sysfs_ops ext4_feat_ops = {
 -      .show   = ext4_feat_show,
 -      .store  = NULL,
 -};
 -
 -static struct kobj_type ext4_feat_ktype = {
 -      .default_attrs  = ext4_feat_attrs,
 -      .sysfs_ops      = &ext4_feat_ops,
 -      .release        = ext4_feat_release,
 -};
 -
  /*
   * Check whether this filesystem can be mounted based on
   * the features present and the RDONLY/RDWR mount requested.
   */
  static int ext4_feature_set_ok(struct super_block *sb, int readonly)
  {
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
 +      if (ext4_has_unknown_ext4_incompat_features(sb)) {
                ext4_msg(sb, KERN_ERR,
                        "Couldn't mount because of "
                        "unsupported optional features (%x)",
        if (readonly)
                return 1;
  
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
 +      if (ext4_has_feature_readonly(sb)) {
                ext4_msg(sb, KERN_INFO, "filesystem is read-only");
                sb->s_flags |= MS_RDONLY;
                return 1;
        }
  
        /* Check that feature set is OK for a read-write mount */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
 +      if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
                ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
                         "unsupported optional features (%x)",
                         (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
         * Large file size enabled file system can only be mounted
         * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
         */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
 +      if (ext4_has_feature_huge_file(sb)) {
                if (sizeof(blkcnt_t) < sizeof(u64)) {
                        ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
                                 "cannot be mounted RDWR without "
                        return 0;
                }
        }
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
 -          !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 +      if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "Can't support bigalloc feature without "
                         "extents feature\n");
        }
  
  #ifndef CONFIG_QUOTA
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
 -          !readonly) {
 +      if (ext4_has_feature_quota(sb) && !readonly) {
                ext4_msg(sb, KERN_ERR,
                         "Filesystem with quota feature cannot be mounted RDWR "
                         "without CONFIG_QUOTA");
@@@ -2973,7 -3312,7 +2973,7 @@@ static int count_overhead(struct super_
        ext4_group_t            i, ngroups = ext4_get_groups_count(sb);
        int                     s, j, count = 0;
  
 -      if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
 +      if (!ext4_has_feature_bigalloc(sb))
                return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
                        sbi->s_itb_per_group + 2);
  
@@@ -3064,10 -3403,10 +3064,10 @@@ int ext4_calculate_overhead(struct supe
        return 0;
  }
  
 -
 -static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
 +static void ext4_set_resv_clusters(struct super_block *sb)
  {
        ext4_fsblk_t resv_clusters;
 +      struct ext4_sb_info *sbi = EXT4_SB(sb);
  
        /*
         * There's no need to reserve anything when we aren't using extents.
         * hole punching doesn't need new metadata... This is needed especially
         * to keep ext2/3 backward compatibility.
         */
 -      if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
 -              return 0;
 +      if (!ext4_has_feature_extents(sb))
 +              return;
        /*
         * By default we reserve 2% or 4096 clusters, whichever is smaller.
         * This should cover the situations where we can not afford to run
         * allocation would require 1, or 2 blocks, higher numbers are
         * very rare.
         */
 -      resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
 -                      EXT4_SB(sb)->s_cluster_bits;
 +      resv_clusters = (ext4_blocks_count(sbi->s_es) >>
 +                       sbi->s_cluster_bits);
  
        do_div(resv_clusters, 50);
        resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
  
 -      return resv_clusters;
 -}
 -
 -
 -static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
 -{
 -      ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
 -                              sbi->s_cluster_bits;
 -
 -      if (count >= clusters)
 -              return -EINVAL;
 -
 -      atomic64_set(&sbi->s_resv_clusters, count);
 -      return 0;
 +      atomic64_set(&sbi->s_resv_clusters, resv_clusters);
  }
  
  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
  
        /* Warn if metadata_csum and gdt_csum are both set. */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
 -          EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
 +      if (ext4_has_feature_metadata_csum(sb) &&
 +          ext4_has_feature_gdt_csum(sb))
                ext4_warning(sb, "metadata_csum and uninit_bg are "
                             "redundant flags; please run fsck.");
  
        }
  
        /* Load the checksum driver */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
 +      if (ext4_has_feature_metadata_csum(sb)) {
                sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
                if (IS_ERR(sbi->s_chksum_driver)) {
                        ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
                ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
                         "invalid superblock checksum.  Run e2fsck?");
                silent = 1;
 +              ret = -EFSBADCRC;
                goto cantfind_ext4;
        }
  
        /* Precompute checksum seed for all metadata */
 -      if (ext4_has_metadata_csum(sb))
 +      if (ext4_has_feature_csum_seed(sb))
 +              sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
 +      else if (ext4_has_metadata_csum(sb))
                sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
                                               sizeof(es->s_uuid));
  
                (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
  
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
 -          (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
 -           EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 -           EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
 +          (ext4_has_compat_features(sb) ||
 +           ext4_has_ro_compat_features(sb) ||
 +           ext4_has_incompat_features(sb)))
                ext4_msg(sb, KERN_WARNING,
                       "feature flags set on rev 0 fs, "
                       "running e2fsck is recommended");
  
        if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
                set_opt2(sb, HURD_COMPAT);
 -              if (EXT4_HAS_INCOMPAT_FEATURE(sb,
 -                                            EXT4_FEATURE_INCOMPAT_64BIT)) {
 +              if (ext4_has_feature_64bit(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "The Hurd can't support 64-bit file systems");
                        goto failed_mount;
                }
        }
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
 -          es->s_encryption_level) {
 +      if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
                ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
                         es->s_encryption_level);
                goto failed_mount;
                }
        }
  
 -      has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                              EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
 +      has_huge_files = ext4_has_feature_huge_file(sb);
        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
                                                      has_huge_files);
        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
        }
  
        sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
 +      if (ext4_has_feature_64bit(sb)) {
                if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
                    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
                    !is_power_of_2(sbi->s_desc_size)) {
        for (i = 0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
 -      if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
 +      if (ext4_has_feature_dir_index(sb)) {
                i = le32_to_cpu(es->s_flags);
                if (i & EXT2_FLAGS_UNSIGNED_HASH)
                        sbi->s_hash_unsigned = 3;
  
        /* Handle clustersize */
        clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
 -      has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                              EXT4_FEATURE_RO_COMPAT_BIGALLOC);
 +      has_bigalloc = ext4_has_feature_bigalloc(sb);
        if (has_bigalloc) {
                if (clustersize < blocksize) {
                        ext4_msg(sb, KERN_ERR,
                goto failed_mount;
        }
  
 -      if (ext4_proc_root)
 -              sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
 -
 -      if (sbi->s_proc)
 -              proc_create_data("options", S_IRUGO, sbi->s_proc,
 -                               &ext4_seq_options_fops, sb);
 -
        bgl_lock_init(sbi->s_blockgroup_lock);
  
        for (i = 0; i < db_count; i++) {
        }
        if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
 +              ret = -EFSCORRUPTED;
                goto failed_mount2;
        }
  
        sb->s_xattr = ext4_xattr_handlers;
  #ifdef CONFIG_QUOTA
        sb->dq_op = &ext4_quota_operations;
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
 +      if (ext4_has_feature_quota(sb))
                sb->s_qcop = &dquot_quotactl_sysfile_ops;
        else
                sb->s_qcop = &ext4_qctl_operations;
        sb->s_root = NULL;
  
        needs_recovery = (es->s_last_orphan != 0 ||
 -                        EXT4_HAS_INCOMPAT_FEATURE(sb,
 -                                  EXT4_FEATURE_INCOMPAT_RECOVER));
 +                        ext4_has_feature_journal_needs_recovery(sb));
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
 -          !(sb->s_flags & MS_RDONLY))
 +      if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY))
                if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
                        goto failed_mount3a;
  
         * The first inode we look at is the journal inode.  Don't try
         * root first: it may be modified in the journal!
         */
 -      if (!test_opt(sb, NOLOAD) &&
 -          EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
 +      if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
                if (ext4_load_journal(sb, es, journal_devnum))
                        goto failed_mount3a;
        } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 -            EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 +                 ext4_has_feature_journal_needs_recovery(sb)) {
                ext4_msg(sb, KERN_ERR, "required journal recovery "
                       "suppressed and not mounted read-only");
                goto failed_mount_wq;
        } else {
 +              /* Nojournal mode, all journal mount options are illegal */
 +              if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                               "journal_checksum, fs mounted w/o journal");
 +                      goto failed_mount_wq;
 +              }
 +              if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                               "journal_async_commit, fs mounted w/o journal");
 +                      goto failed_mount_wq;
 +              }
 +              if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                               "commit=%lu, fs mounted w/o journal",
 +                               sbi->s_commit_interval / HZ);
 +                      goto failed_mount_wq;
 +              }
 +              if (EXT4_MOUNT_DATA_FLAGS &
 +                  (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                               "data=, fs mounted w/o journal");
 +                      goto failed_mount_wq;
 +              }
 +              sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
 +              clear_opt(sb, JOURNAL_CHECKSUM);
                clear_opt(sb, DATA_FLAGS);
                sbi->s_journal = NULL;
                needs_recovery = 0;
                goto no_journal;
        }
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
 +      if (ext4_has_feature_64bit(sb) &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
@@@ -3762,16 -4101,18 +3762,16 @@@ no_journal
                }
        }
  
 -      if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
 -           EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
 +      if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
            (blocksize != PAGE_CACHE_SIZE)) {
                ext4_msg(sb, KERN_ERR,
                         "Unsupported blocksize for fs encryption");
                goto failed_mount_wq;
        }
  
 -      if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
 -          !(sb->s_flags & MS_RDONLY) &&
 -          !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
 -              EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
 +      if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) &&
 +          !ext4_has_feature_encrypt(sb)) {
 +              ext4_set_feature_encrypt(sb);
                ext4_commit_super(sb, 1);
        }
  
        if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
                                                     EXT4_GOOD_OLD_INODE_SIZE;
 -              if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                     EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
 +              if (ext4_has_feature_extra_isize(sb)) {
                        if (sbi->s_want_extra_isize <
                            le16_to_cpu(es->s_want_extra_isize))
                                sbi->s_want_extra_isize =
                         "available");
        }
  
 -      err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
 -      if (err) {
 -              ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
 -                       "reserved pool", ext4_calculate_resv_clusters(sb));
 -              goto failed_mount4a;
 -      }
 +      ext4_set_resv_clusters(sb);
  
        err = ext4_setup_system_zone(sb);
        if (err) {
                goto failed_mount6;
        }
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 +      if (ext4_has_feature_flex_bg(sb))
                if (!ext4_fill_flex_info(sb)) {
                        ext4_msg(sb, KERN_ERR,
                               "unable to initialize "
        if (err)
                goto failed_mount6;
  
 -      sbi->s_kobj.kset = ext4_kset;
 -      init_completion(&sbi->s_kobj_unregister);
 -      err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
 -                                 "%s", sb->s_id);
 +      err = ext4_register_sysfs(sb);
        if (err)
                goto failed_mount7;
  
  #ifdef CONFIG_QUOTA
        /* Enable quota usage during mount. */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
 -          !(sb->s_flags & MS_RDONLY)) {
 +      if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) {
                err = ext4_enable_quotas(sb);
                if (err)
                        goto failed_mount8;
@@@ -3962,7 -4313,7 +3962,7 @@@ cantfind_ext4
  
  #ifdef CONFIG_QUOTA
  failed_mount8:
 -      kobject_del(&sbi->s_kobj);
 +      ext4_unregister_sysfs(sb);
  #endif
  failed_mount7:
        ext4_unregister_li_request(sb);
@@@ -4002,6 -4353,10 +4002,6 @@@ failed_mount2
  failed_mount:
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);
 -      if (sbi->s_proc) {
 -              remove_proc_entry("options", sbi->s_proc);
 -              remove_proc_entry(sb->s_id, ext4_proc_root);
 -      }
  #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(sbi->s_qf_names[i]);
@@@ -4048,7 -4403,7 +4048,7 @@@ static journal_t *ext4_get_journal(stru
        struct inode *journal_inode;
        journal_t *journal;
  
 -      BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 +      BUG_ON(!ext4_has_feature_journal(sb));
  
        /* First, test for the existence of a valid inode on disk.  Bad
         * things happen if we iget() an unused inode, as the subsequent
@@@ -4098,7 -4453,7 +4098,7 @@@ static journal_t *ext4_get_dev_journal(
        struct ext4_super_block *es;
        struct block_device *bdev;
  
 -      BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 +      BUG_ON(!ext4_has_feature_journal(sb));
  
        bdev = ext4_blkdev_get(j_dev, sb);
        if (bdev == NULL)
@@@ -4190,7 -4545,7 +4190,7 @@@ static int ext4_load_journal(struct sup
        int err = 0;
        int really_read_only;
  
 -      BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 +      BUG_ON(!ext4_has_feature_journal(sb));
  
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
         * crash?  For recovery, we need to check in advance whether we
         * can get read-write access to the device.
         */
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 +      if (ext4_has_feature_journal_needs_recovery(sb)) {
                if (sb->s_flags & MS_RDONLY) {
                        ext4_msg(sb, KERN_INFO, "INFO: recovery "
                                        "required on readonly filesystem");
        if (!(journal->j_flags & JBD2_BARRIER))
                ext4_msg(sb, KERN_INFO, "barriers disabled");
  
 -      if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
 +      if (!ext4_has_feature_journal_needs_recovery(sb))
                err = jbd2_journal_wipe(journal, !really_read_only);
        if (!err) {
                char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
@@@ -4352,7 -4707,7 +4352,7 @@@ static void ext4_mark_recovery_complete
  {
        journal_t *journal = EXT4_SB(sb)->s_journal;
  
 -      if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
 +      if (!ext4_has_feature_journal(sb)) {
                BUG_ON(journal != NULL);
                return;
        }
        if (jbd2_journal_flush(journal) < 0)
                goto out;
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
 +      if (ext4_has_feature_journal_needs_recovery(sb) &&
            sb->s_flags & MS_RDONLY) {
 -              EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_clear_feature_journal_needs_recovery(sb);
                ext4_commit_super(sb, 1);
        }
  
@@@ -4382,7 -4737,7 +4382,7 @@@ static void ext4_clear_journal_err(stru
        int j_errno;
        const char *errstr;
  
 -      BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 +      BUG_ON(!ext4_has_feature_journal(sb));
  
        journal = EXT4_SB(sb)->s_journal;
  
@@@ -4497,7 -4852,7 +4497,7 @@@ static int ext4_freeze(struct super_blo
                        goto out;
  
                /* Journal blocked and flushed, clear needs_recovery flag. */
 -              EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_clear_feature_journal_needs_recovery(sb);
        }
  
        error = ext4_commit_super(sb, 1);
@@@ -4519,7 -4874,7 +4519,7 @@@ static int ext4_unfreeze(struct super_b
  
        if (EXT4_SB(sb)->s_journal) {
                /* Reset the needs_recovery flag before the fs is unlocked. */
 -              EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_set_feature_journal_needs_recovery(sb);
        }
  
        ext4_commit_super(sb, 1);
@@@ -4672,7 -5027,8 +4672,7 @@@ static int ext4_remount(struct super_bl
                                ext4_mark_recovery_complete(sb, es);
                } else {
                        /* Make sure we can mount this feature set readwrite */
 -                      if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                      EXT4_FEATURE_RO_COMPAT_READONLY) ||
 +                      if (ext4_has_feature_readonly(sb) ||
                            !ext4_feature_set_ok(sb, 0)) {
                                err = -EROFS;
                                goto restore_opts;
                                if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
                                        ext4_msg(sb, KERN_ERR,
               "ext4_remount: Checksum for group %u failed (%u!=%u)",
 -              g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
 +              g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
 -                                      err = -EINVAL;
 +                                      err = -EFSBADCRC;
                                        goto restore_opts;
                                }
                        }
                        sbi->s_mount_state = le16_to_cpu(es->s_state);
                        if (!ext4_setup_super(sb, es, 0))
                                sb->s_flags &= ~MS_RDONLY;
 -                      if (EXT4_HAS_INCOMPAT_FEATURE(sb,
 -                                                   EXT4_FEATURE_INCOMPAT_MMP))
 +                      if (ext4_has_feature_mmp(sb))
                                if (ext4_multi_mount_protect(sb,
                                                le64_to_cpu(es->s_mmp_block))) {
                                        err = -EROFS;
        if (enable_quota) {
                if (sb_any_quota_suspended(sb))
                        dquot_resume(sb, -1);
 -              else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                      EXT4_FEATURE_RO_COMPAT_QUOTA)) {
 +              else if (ext4_has_feature_quota(sb)) {
                        err = ext4_enable_quotas(sb);
                        if (err)
                                goto restore_opts;
@@@ -4897,7 -5255,7 +4897,7 @@@ static int ext4_mark_dquot_dirty(struc
        struct ext4_sb_info *sbi = EXT4_SB(sb);
  
        /* Are we journaling quotas? */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
 +      if (ext4_has_feature_quota(sb) ||
            sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
                dquot_mark_dquot_dirty(dquot);
                return ext4_write_dquot(dquot);
@@@ -4985,7 -5343,7 +4985,7 @@@ static int ext4_quota_enable(struct sup
                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
        };
  
 -      BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
 +      BUG_ON(!ext4_has_feature_quota(sb));
  
        if (!qf_inums[type])
                return -EPERM;
@@@ -5179,11 -5537,11 +5179,11 @@@ static inline void unregister_as_ext2(v
  
  static inline int ext2_feature_set_ok(struct super_block *sb)
  {
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
 +      if (ext4_has_unknown_ext2_incompat_features(sb))
                return 0;
        if (sb->s_flags & MS_RDONLY)
                return 1;
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
 +      if (ext4_has_unknown_ext2_ro_compat_features(sb))
                return 0;
        return 1;
  }
@@@ -5208,13 -5566,13 +5208,13 @@@ static inline void unregister_as_ext3(v
  
  static inline int ext3_feature_set_ok(struct super_block *sb)
  {
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
 +      if (ext4_has_unknown_ext3_incompat_features(sb))
                return 0;
 -      if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
 +      if (!ext4_has_feature_journal(sb))
                return 0;
        if (sb->s_flags & MS_RDONLY)
                return 1;
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
 +      if (ext4_has_unknown_ext3_ro_compat_features(sb))
                return 0;
        return 1;
  }
@@@ -5228,6 -5586,37 +5228,6 @@@ static struct file_system_type ext4_fs_
  };
  MODULE_ALIAS_FS("ext4");
  
 -static int __init ext4_init_feat_adverts(void)
 -{
 -      struct ext4_features *ef;
 -      int ret = -ENOMEM;
 -
 -      ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
 -      if (!ef)
 -              goto out;
 -
 -      ef->f_kobj.kset = ext4_kset;
 -      init_completion(&ef->f_kobj_unregister);
 -      ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
 -                                 "features");
 -      if (ret) {
 -              kfree(ef);
 -              goto out;
 -      }
 -
 -      ext4_feat = ef;
 -      ret = 0;
 -out:
 -      return ret;
 -}
 -
 -static void ext4_exit_feat_adverts(void)
 -{
 -      kobject_put(&ext4_feat->f_kobj);
 -      wait_for_completion(&ext4_feat->f_kobj_unregister);
 -      kfree(ext4_feat);
 -}
 -
  /* Shared across all ext4 file systems */
  wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
  struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
@@@ -5254,15 -5643,21 +5254,15 @@@ static int __init ext4_init_fs(void
  
        err = ext4_init_pageio();
        if (err)
 -              goto out7;
 +              goto out5;
  
        err = ext4_init_system_zone();
        if (err)
 -              goto out6;
 -      ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
 -      if (!ext4_kset) {
 -              err = -ENOMEM;
 -              goto out5;
 -      }
 -      ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 +              goto out4;
  
 -      err = ext4_init_feat_adverts();
 +      err = ext4_init_sysfs();
        if (err)
 -              goto out4;
 +              goto out3;
  
        err = ext4_init_mballoc();
        if (err)
@@@ -5287,12 -5682,16 +5287,12 @@@ out1
        ext4_mballoc_ready = 0;
        ext4_exit_mballoc();
  out2:
 -      ext4_exit_feat_adverts();
 -out4:
 -      if (ext4_proc_root)
 -              remove_proc_entry("fs/ext4", NULL);
 -      kset_unregister(ext4_kset);
 -out5:
 +      ext4_exit_sysfs();
 +out3:
        ext4_exit_system_zone();
 -out6:
 +out4:
        ext4_exit_pageio();
 -out7:
 +out5:
        ext4_exit_es();
  
        return err;
@@@ -5307,7 -5706,9 +5307,7 @@@ static void __exit ext4_exit_fs(void
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
        ext4_exit_mballoc();
 -      ext4_exit_feat_adverts();
 -      remove_proc_entry("fs/ext4", NULL);
 -      kset_unregister(ext4_kset);
 +      ext4_exit_sysfs();
        ext4_exit_system_zone();
        ext4_exit_pageio();
        ext4_exit_es();
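
The ext4/super.c hunks above are almost entirely a mechanical conversion: open-coded EXT4_HAS_{COMPAT,RO_COMPAT,INCOMPAT}_FEATURE() / EXT4_SET_* / EXT4_CLEAR_* invocations become per-feature helpers (ext4_has_feature_64bit(), ext4_set_feature_encrypt(), ext4_clear_feature_journal_needs_recovery(), ...), the old kset/feat-adverts/proc plumbing is folded into ext4_register_sysfs()/ext4_init_sysfs(), and the no-journal path now clears JOURNAL_CHECKSUM explicitly. The helpers are generated from the feature flag names; a sketch of what such a generator looks like for the incompat group follows (the exact macro in fs/ext4/ext4.h may differ in detail):

	/* Sketch of a per-feature helper generator; see fs/ext4/ext4.h for
	 * the authoritative version.  One invocation per feature flag
	 * produces the has/set/clear trio used throughout super.c above. */
	#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname)			\
	static inline bool ext4_has_feature_##name(struct super_block *sb)	\
	{									\
		return ((EXT4_SB(sb)->s_es->s_feature_incompat &		\
			 cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0);	\
	}									\
	static inline void ext4_set_feature_##name(struct super_block *sb)	\
	{									\
		EXT4_SB(sb)->s_es->s_feature_incompat |=			\
			cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname);		\
	}									\
	static inline void ext4_clear_feature_##name(struct super_block *sb)	\
	{									\
		EXT4_SB(sb)->s_es->s_feature_incompat &=			\
			~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname);		\
	}

	EXT4_FEATURE_INCOMPAT_FUNCS(64bit,		64BIT)
	EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
	EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER)
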
diff --combined fs/fs-writeback.c
index 7378169e90be6ed485ac48d0cf633c8e37c4c3d2,e82e1194b1d84a889ec3743333f3c526f1291d54..206a68b1db1ab1b1f5a6ed6a611723f957e1e58b
@@@ -778,24 -778,19 +778,24 @@@ static void bdi_split_work_to_wbs(struc
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
  {
 -      int next_memcg_id = 0;
 -      struct bdi_writeback *wb;
 -      struct wb_iter iter;
 +      struct bdi_writeback *last_wb = NULL;
 +      struct bdi_writeback *wb = list_entry(&bdi->wb_list,
 +                                            struct bdi_writeback, bdi_node);
  
        might_sleep();
  restart:
        rcu_read_lock();
 -      bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
 +      list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;
  
 +              if (last_wb) {
 +                      wb_put(last_wb);
 +                      last_wb = NULL;
 +              }
 +
                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
  
                wb_queue_work(wb, work);
  
 -              next_memcg_id = wb->memcg_css->id + 1;
 +              /*
 +               * Pin @wb so that it stays on @bdi->wb_list.  This allows
 +               * continuing iteration from @wb after dropping and
 +               * regrabbing rcu read lock.
 +               */
 +              wb_get(wb);
 +              last_wb = wb;
 +
                rcu_read_unlock();
                wb_wait_for_completion(bdi, &fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();
 +
 +      if (last_wb)
 +              wb_put(last_wb);
  }
  
  #else /* CONFIG_CGROUP_WRITEBACK */
@@@ -1872,11 -1857,12 +1872,11 @@@ void wakeup_flusher_threads(long nr_pag
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
 -              struct wb_iter iter;
  
                if (!bdi_has_dirty_io(bdi))
                        continue;
  
 -              bdi_for_each_wb(wb, bdi, &iter, 0)
 +              list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                        wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
                                           false, reason);
        }
@@@ -1908,10 -1894,11 +1908,10 @@@ static void wakeup_dirtytime_writeback(
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
 -              struct wb_iter iter;
  
 -              bdi_for_each_wb(wb, bdi, &iter, 0)
 -                      if (!list_empty(&bdi->wb.b_dirty_time))
 -                              wb_wakeup(&bdi->wb);
 +              list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
 +                      if (!list_empty(&wb->b_dirty_time))
 +                              wb_wakeup(wb);
        }
        rcu_read_unlock();
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@@ -2149,7 -2136,12 +2149,12 @@@ static void wait_sb_inodes(struct super
                iput(old_inode);
                old_inode = inode;
  
-               filemap_fdatawait(mapping);
+               /*
+                * We keep the error status of individual mapping so that
+                * applications can catch the writeback error using fsync(2).
+                * See filemap_fdatawait_keep_errors() for details.
+                */
+               filemap_fdatawait_keep_errors(mapping);
  
                cond_resched();
  
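
In fs/fs-writeback.c, bdi_split_work_to_wbs() no longer walks the per-bdi writeback structures with the memcg-id based wb_iter cursor; it walks bdi->wb_list with list_for_each_entry_continue_rcu() and, before sleeping, pins the current entry with wb_get() so the walk can resume from the same node once the RCU read lock is re-acquired. The wait_sb_inodes() hunk is independent: it switches to filemap_fdatawait_keep_errors() so per-mapping error bits survive for a later fsync(2). Condensed to its essentials, the resumable-walk pattern looks like this (the writeback-specific work setup is elided):

	struct bdi_writeback *last_wb = NULL;
	/* Point at the list head cast as an entry so that the
	 * _continue_ iterator starts with the first real node. */
	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
					      struct bdi_writeback, bdi_node);

	restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
		if (last_wb) {			/* drop the previous pin */
			wb_put(last_wb);
			last_wb = NULL;
		}

		/* ... issue work against @wb ... */

		wb_get(wb);			/* keep @wb on the list */
		last_wb = wb;
		rcu_read_unlock();
		/* ... sleep (wait for completion), then resume ... */
		goto restart;
	}
	rcu_read_unlock();

	if (last_wb)
		wb_put(last_wb);
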
diff --combined fs/jffs2/wbuf.c
index 63f31c0733c51e5e1e8cc0b51425dc07bb59cc3e,955da626ba6b7f74d02a7d7dd5badb5e2a031430..f3a4857ff0718794b967836e796b8f9c9e345ae7
@@@ -1264,7 -1264,7 +1264,7 @@@ int jffs2_dataflash_setup(struct jffs2_
        if ((c->flash_size % c->sector_size) != 0) {
                c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
                pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
-       };
+       }
  
        c->wbuf_ofs = 0xFFFFFFFF;
        c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
  #ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
        c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
        if (!c->wbuf_verify) {
 -              kfree(c->oobbuf);
                kfree(c->wbuf);
                return -ENOMEM;
        }
diff --combined fs/mpage.c
index 09abba7653aa8db8189d05d7c2094b77ef1998a9,7d29c863c05214509afc5986a02820b719ab7e6e..1480d3a180370fe3922a7724e613d09b896f9d00
@@@ -361,7 -361,7 +361,7 @@@ mpage_readpages(struct address_space *m
        sector_t last_block_in_bio = 0;
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+       gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
  
        map_bh.b_state = 0;
        map_bh.b_size = 0;
@@@ -397,7 -397,7 +397,7 @@@ int mpage_readpage(struct page *page, g
        sector_t last_block_in_bio = 0;
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping);
+       gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
  
        map_bh.b_state = 0;
        map_bh.b_size = 0;
@@@ -485,7 -485,6 +485,7 @@@ static int __mpage_writepage(struct pag
        struct buffer_head map_bh;
        loff_t i_size = i_size_read(inode);
        int ret = 0;
 +      int wr = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
  
        if (page_has_buffers(page)) {
                struct buffer_head *head = page_buffers(page);
@@@ -594,7 -593,7 +594,7 @@@ page_is_mapped
         * This page will go to BIO.  Do we need to send this BIO off first?
         */
        if (bio && mpd->last_block_in_bio != blocks[0] - 1)
 -              bio = mpage_bio_submit(WRITE, bio);
 +              bio = mpage_bio_submit(wr, bio);
  
  alloc_new:
        if (bio == NULL) {
        wbc_account_io(wbc, page, PAGE_SIZE);
        length = first_unmapped << blkbits;
        if (bio_add_page(bio, page, length, 0) < length) {
 -              bio = mpage_bio_submit(WRITE, bio);
 +              bio = mpage_bio_submit(wr, bio);
                goto alloc_new;
        }
  
        set_page_writeback(page);
        unlock_page(page);
        if (boundary || (first_unmapped != blocks_per_page)) {
 -              bio = mpage_bio_submit(WRITE, bio);
 +              bio = mpage_bio_submit(wr, bio);
                if (boundary_block) {
                        write_boundary_block(boundary_bdev,
                                        boundary_block, 1 << blkbits);
  
  confused:
        if (bio)
 -              bio = mpage_bio_submit(WRITE, bio);
 +              bio = mpage_bio_submit(wr, bio);
  
        if (mpd->use_writepage) {
                ret = mapping->a_ops->writepage(page, wbc);
@@@ -699,11 -698,8 +699,11 @@@ mpage_writepages(struct address_space *
                };
  
                ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
 -              if (mpd.bio)
 -                      mpage_bio_submit(WRITE, mpd.bio);
 +              if (mpd.bio) {
 +                      int wr = (wbc->sync_mode == WB_SYNC_ALL ?
 +                                WRITE_SYNC : WRITE);
 +                      mpage_bio_submit(wr, mpd.bio);
 +              }
        }
        blk_finish_plug(&plug);
        return ret;
@@@ -720,11 -716,8 +720,11 @@@ int mpage_writepage(struct page *page, 
                .use_writepage = 0,
        };
        int ret = __mpage_writepage(page, wbc, &mpd);
 -      if (mpd.bio)
 -              mpage_bio_submit(WRITE, mpd.bio);
 +      if (mpd.bio) {
 +              int wr = (wbc->sync_mode == WB_SYNC_ALL ?
 +                        WRITE_SYNC : WRITE);
 +              mpage_bio_submit(wr, mpd.bio);
 +      }
        return ret;
  }
  EXPORT_SYMBOL(mpage_writepage);
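
Two independent cleanups run through the mpage hunks. Read paths derive their allocation mask with mapping_gfp_constraint() instead of open-coding the AND against mapping_gfp_mask(), and the write path submits bios with WRITE_SYNC when writeback runs in WB_SYNC_ALL mode. The helper is essentially a named intersection; roughly (the authoritative definition lives in include/linux/pagemap.h):

	/* Restrict a caller-supplied gfp mask to what the address_space
	 * permits; roughly what include/linux/pagemap.h provides. */
	static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
						   gfp_t gfp_mask)
	{
		return mapping_gfp_mask(mapping) & gfp_mask;
	}

Call sites then read, for example, gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); which is equivalent to the old expression but keeps the intent ("GFP_KERNEL, minus whatever this mapping forbids") in one place.
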
diff --combined fs/namei.c
index 2b729d253715ba183e912fae98cfec1e68665e24,c86ea9e89f7d573c2d735e72915ad0eb47fbe9f3..174ef4f106cd2ac9696db8cf153f40dd7ef0ecaa
@@@ -955,23 -955,26 +955,23 @@@ static bool safe_hardlink_source(struc
   *  - sysctl_protected_hardlinks enabled
   *  - fsuid does not match inode
   *  - hardlink source is unsafe (see safe_hardlink_source() above)
 - *  - not CAP_FOWNER
 + *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
   *
   * Returns 0 if successful, -ve on error.
   */
  static int may_linkat(struct path *link)
  {
 -      const struct cred *cred;
        struct inode *inode;
  
        if (!sysctl_protected_hardlinks)
                return 0;
  
 -      cred = current_cred();
        inode = link->dentry->d_inode;
  
        /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
         * otherwise, it must be a safe source.
         */
 -      if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
 -          capable(CAP_FOWNER))
 +      if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
                return 0;
  
        audit_log_link_denied("linkat", link);
@@@ -1966,7 -1969,7 +1966,7 @@@ OK
                if (err) {
                        const char *s = get_link(nd);
  
 -                      if (unlikely(IS_ERR(s)))
 +                      if (IS_ERR(s))
                                return PTR_ERR(s);
                        err = 0;
                        if (unlikely(!s)) {
@@@ -2279,8 -2282,6 +2279,8 @@@ EXPORT_SYMBOL(vfs_path_lookup)
   *
   * Note that this routine is purely a helper for filesystem usage and should
   * not be called by generic code.
 + *
 + * The caller must hold base->i_mutex.
   */
  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
  {
  }
  EXPORT_SYMBOL(lookup_one_len);
  
 +/**
 + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 + * @name:     pathname component to lookup
 + * @base:     base directory to lookup from
 + * @len:      maximum length @len should be interpreted to
 + *
 + * Note that this routine is purely a helper for filesystem usage and should
 + * not be called by generic code.
 + *
 + * Unlike lookup_one_len, it should be called without the parent
 + * i_mutex held, and will take the i_mutex itself if necessary.
 + */
 +struct dentry *lookup_one_len_unlocked(const char *name,
 +                                     struct dentry *base, int len)
 +{
 +      struct qstr this;
 +      unsigned int c;
 +      int err;
 +      struct dentry *ret;
 +
 +      this.name = name;
 +      this.len = len;
 +      this.hash = full_name_hash(name, len);
 +      if (!len)
 +              return ERR_PTR(-EACCES);
 +
 +      if (unlikely(name[0] == '.')) {
 +              if (len < 2 || (len == 2 && name[1] == '.'))
 +                      return ERR_PTR(-EACCES);
 +      }
 +
 +      while (len--) {
 +              c = *(const unsigned char *)name++;
 +              if (c == '/' || c == '\0')
 +                      return ERR_PTR(-EACCES);
 +      }
 +      /*
 +       * See if the low-level filesystem might want
 +       * to use its own hash..
 +       */
 +      if (base->d_flags & DCACHE_OP_HASH) {
 +              int err = base->d_op->d_hash(base, &this);
 +              if (err < 0)
 +                      return ERR_PTR(err);
 +      }
 +
 +      err = inode_permission(base->d_inode, MAY_EXEC);
 +      if (err)
 +              return ERR_PTR(err);
 +
 +      ret = __d_lookup(base, &this);
 +      if (ret)
 +              return ret;
 +      /*
 +       * __d_lookup() is used to try to get a quick answer and avoid the
 +       * mutex.  A false-negative does no harm.
 +       */
 +      ret = __d_lookup(base, &this);
 +      if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
 +              dput(ret);
 +              ret = NULL;
 +      }
 +      if (ret)
 +              return ret;
 +
 +      mutex_lock(&base->d_inode->i_mutex);
 +      ret =  __lookup_hash(&this, base, 0);
 +      mutex_unlock(&base->d_inode->i_mutex);
 +      return ret;
 +}
 +EXPORT_SYMBOL(lookup_one_len_unlocked);
 +
  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
  {
@@@ -3454,7 -3383,7 +3454,7 @@@ struct file *do_file_open_root(struct d
                return ERR_PTR(-ELOOP);
  
        filename = getname_kernel(name);
 -      if (unlikely(IS_ERR(filename)))
 +      if (IS_ERR(filename))
                return ERR_CAST(filename);
  
        set_nameidata(&nd, -1, filename);
@@@ -4678,7 -4607,7 +4678,7 @@@ EXPORT_SYMBOL(__page_symlink)
  int page_symlink(struct inode *inode, const char *symname, int len)
  {
        return __page_symlink(inode, symname, len,
-                       !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+                       !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
  }
  EXPORT_SYMBOL(page_symlink);
  
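
The namei.c hunks do three things: may_linkat() now relies on inode_owner_or_capable(), so CAP_FOWNER only counts in a user namespace where the inode owner is mapped; two IS_ERR() checks drop a redundant unlikely(); and a new exported helper, lookup_one_len_unlocked(), performs a single-component lookup without requiring the caller to hold the parent's i_mutex, trying a lockless __d_lookup() first and only taking i_mutex around __lookup_hash() as a fallback. A hypothetical caller might look like this (dir and the name are placeholders, not taken from the patch):

	/* Hypothetical use: resolve "child" under @dir without holding
	 * dir->d_inode->i_mutex; the helper locks internally if needed. */
	struct dentry *child;

	child = lookup_one_len_unlocked("child", dir, strlen("child"));
	if (IS_ERR(child))
		return PTR_ERR(child);
	if (d_really_is_negative(child)) {
		dput(child);		/* no such entry */
		return -ENOENT;
	}
	/* ... use d_inode(child) ... */
	dput(child);
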
diff --combined fs/nfs/file.c
index 37f639d50af580396bf016a2fb40c2b427c7b1dd,17d3417c8a74375a2974af6e1c4f13219b188476..93e236429c5d785a1711d643d0b4676dfe4396cf
@@@ -473,8 -473,8 +473,8 @@@ static int nfs_release_page(struct pag
        dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
  
        /* Always try to initiate a 'commit' if relevant, but only
-        * wait for it if __GFP_WAIT is set.  Even then, only wait 1
-        * second and only if the 'bdi' is not congested.
+        * wait for it if the caller allows blocking.  Even then,
+        * only wait 1 second and only if the 'bdi' is not congested.
         * Waiting indefinitely can cause deadlocks when the NFS
         * server is on this machine, when a new TCP connection is
         * needed and in other rare cases.  There is no particular
        if (mapping) {
                struct nfs_server *nfss = NFS_SERVER(mapping->host);
                nfs_commit_inode(mapping->host, 0);
-               if ((gfp & __GFP_WAIT) &&
+               if (gfpflags_allow_blocking(gfp) &&
                    !bdi_write_congested(&nfss->backing_dev_info)) {
                        wait_on_page_bit_killable_timeout(page, PG_private,
                                                          HZ);
@@@ -738,7 -738,18 +738,7 @@@ out_noconflict
  
  static int do_vfs_lock(struct file *file, struct file_lock *fl)
  {
 -      int res = 0;
 -      switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
 -              case FL_POSIX:
 -                      res = posix_lock_file_wait(file, fl);
 -                      break;
 -              case FL_FLOCK:
 -                      res = flock_lock_file_wait(file, fl);
 -                      break;
 -              default:
 -                      BUG();
 -      }
 -      return res;
 +      return locks_lock_file_wait(file, fl);
  }
  
  static int
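
fs/nfs/file.c picks up two of this cycle's API consolidations: do_vfs_lock() collapses into locks_lock_file_wait(), which does the FL_POSIX/FL_FLOCK dispatch itself, and nfs_release_page() asks gfpflags_allow_blocking() instead of testing the now-defunct __GFP_WAIT bit (the xfs_qm shrinker further down makes the matching __GFP_DIRECT_RECLAIM substitution by hand). The predicate is a one-liner; roughly (see include/linux/gfp.h):

	/* A caller may block iff the allocation context allows direct
	 * reclaim; roughly the helper added in include/linux/gfp.h. */
	static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
	{
		return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
	}
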
diff --combined fs/ocfs2/cluster/heartbeat.c
index e404386bd93e8535a32152fcc5936ede9ac5fa41,ddddef0021a0f8cd94f456969099bc660ec1fdb5..709fbbd44c65366ce1e31aebce0904b5966d53a6
@@@ -219,7 -219,8 +219,8 @@@ struct o2hb_region 
        unsigned                hr_unclean_stop:1,
                                hr_aborted_start:1,
                                hr_item_pinned:1,
-                               hr_item_dropped:1;
+                               hr_item_dropped:1,
+                               hr_node_deleted:1;
  
        /* protected by the hr_callback_sem */
        struct task_struct      *hr_task;
@@@ -1078,7 -1079,13 +1079,13 @@@ static int o2hb_thread(void *data
        set_user_nice(current, MIN_NICE);
  
        /* Pin node */
-       o2nm_depend_this_node();
+       ret = o2nm_depend_this_node();
+       if (ret) {
+               mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+               reg->hr_node_deleted = 1;
+               wake_up(&o2hb_steady_queue);
+               return 0;
+       }
  
        while (!kthread_should_stop() &&
               !reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@@ -1473,17 -1480,16 +1480,17 @@@ static int o2hb_read_block_input(struc
        return 0;
  }
  
 -static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
 +static ssize_t o2hb_region_block_bytes_show(struct config_item *item,
                                            char *page)
  {
 -      return sprintf(page, "%u\n", reg->hr_block_bytes);
 +      return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes);
  }
  
 -static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
 +static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
                                             const char *page,
                                             size_t count)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        int status;
        unsigned long block_bytes;
        unsigned int block_bits;
        return count;
  }
  
 -static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
 +static ssize_t o2hb_region_start_block_show(struct config_item *item,
                                            char *page)
  {
 -      return sprintf(page, "%llu\n", reg->hr_start_block);
 +      return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block);
  }
  
 -static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
 +static ssize_t o2hb_region_start_block_store(struct config_item *item,
                                             const char *page,
                                             size_t count)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        unsigned long long tmp;
        char *p = (char *)page;
  
        return count;
  }
  
 -static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
 -                                     char *page)
 +static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page)
  {
 -      return sprintf(page, "%d\n", reg->hr_blocks);
 +      return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks);
  }
  
 -static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
 +static ssize_t o2hb_region_blocks_store(struct config_item *item,
                                        const char *page,
                                        size_t count)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        unsigned long tmp;
        char *p = (char *)page;
  
        return count;
  }
  
 -static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
 -                                  char *page)
 +static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
  {
        unsigned int ret = 0;
  
 -      if (reg->hr_bdev)
 -              ret = sprintf(page, "%s\n", reg->hr_dev_name);
 +      if (to_o2hb_region(item)->hr_bdev)
 +              ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name);
  
        return ret;
  }
  }
  
  /* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
 -static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 +static ssize_t o2hb_region_dev_store(struct config_item *item,
                                     const char *page,
                                     size_t count)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        struct task_struct *hb_task;
        long fd;
        int sectsize;
        spin_unlock(&o2hb_live_lock);
  
        ret = wait_event_interruptible(o2hb_steady_queue,
-                               atomic_read(&reg->hr_steady_iterations) == 0);
+                               atomic_read(&reg->hr_steady_iterations) == 0 ||
+                               reg->hr_node_deleted);
        if (ret) {
                atomic_set(&reg->hr_steady_iterations, 0);
                reg->hr_aborted_start = 1;
                goto out3;
        }
  
+       if (reg->hr_node_deleted) {
+               ret = -EINVAL;
+               goto out3;
+       }
        /* Ok, we were woken.  Make sure it wasn't by drop_item() */
        spin_lock(&o2hb_live_lock);
        hb_task = reg->hr_task;
@@@ -1830,9 -1841,9 +1843,9 @@@ out
        return ret;
  }
  
 -static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
 -                                      char *page)
 +static ssize_t o2hb_region_pid_show(struct config_item *item, char *page)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        pid_t pid = 0;
  
        spin_lock(&o2hb_live_lock);
        return sprintf(page, "%u\n", pid);
  }
  
 -struct o2hb_region_attribute {
 -      struct configfs_attribute attr;
 -      ssize_t (*show)(struct o2hb_region *, char *);
 -      ssize_t (*store)(struct o2hb_region *, const char *, size_t);
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "block_bytes",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_region_block_bytes_read,
 -      .store  = o2hb_region_block_bytes_write,
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_start_block = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "start_block",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_region_start_block_read,
 -      .store  = o2hb_region_start_block_write,
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_blocks = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "blocks",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_region_blocks_read,
 -      .store  = o2hb_region_blocks_write,
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_dev = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "dev",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_region_dev_read,
 -      .store  = o2hb_region_dev_write,
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_pid = {
 -       .attr   = { .ca_owner = THIS_MODULE,
 -                   .ca_name = "pid",
 -                   .ca_mode = S_IRUGO | S_IRUSR },
 -       .show   = o2hb_region_pid_read,
 -};
 +CONFIGFS_ATTR(o2hb_region_, block_bytes);
 +CONFIGFS_ATTR(o2hb_region_, start_block);
 +CONFIGFS_ATTR(o2hb_region_, blocks);
 +CONFIGFS_ATTR(o2hb_region_, dev);
 +CONFIGFS_ATTR_RO(o2hb_region_, pid);
  
  static struct configfs_attribute *o2hb_region_attrs[] = {
 -      &o2hb_region_attr_block_bytes.attr,
 -      &o2hb_region_attr_start_block.attr,
 -      &o2hb_region_attr_blocks.attr,
 -      &o2hb_region_attr_dev.attr,
 -      &o2hb_region_attr_pid.attr,
 +      &o2hb_region_attr_block_bytes,
 +      &o2hb_region_attr_start_block,
 +      &o2hb_region_attr_blocks,
 +      &o2hb_region_attr_dev,
 +      &o2hb_region_attr_pid,
        NULL,
  };
  
 -static ssize_t o2hb_region_show(struct config_item *item,
 -                              struct configfs_attribute *attr,
 -                              char *page)
 -{
 -      struct o2hb_region *reg = to_o2hb_region(item);
 -      struct o2hb_region_attribute *o2hb_region_attr =
 -              container_of(attr, struct o2hb_region_attribute, attr);
 -      ssize_t ret = 0;
 -
 -      if (o2hb_region_attr->show)
 -              ret = o2hb_region_attr->show(reg, page);
 -      return ret;
 -}
 -
 -static ssize_t o2hb_region_store(struct config_item *item,
 -                               struct configfs_attribute *attr,
 -                               const char *page, size_t count)
 -{
 -      struct o2hb_region *reg = to_o2hb_region(item);
 -      struct o2hb_region_attribute *o2hb_region_attr =
 -              container_of(attr, struct o2hb_region_attribute, attr);
 -      ssize_t ret = -EINVAL;
 -
 -      if (o2hb_region_attr->store)
 -              ret = o2hb_region_attr->store(reg, page, count);
 -      return ret;
 -}
 -
  static struct configfs_item_operations o2hb_region_item_ops = {
        .release                = o2hb_region_release,
 -      .show_attribute         = o2hb_region_show,
 -      .store_attribute        = o2hb_region_store,
  };
  
  static struct config_item_type o2hb_region_type = {
@@@ -2057,14 -2137,49 +2070,14 @@@ unlock
        spin_unlock(&o2hb_live_lock);
  }
  
 -struct o2hb_heartbeat_group_attribute {
 -      struct configfs_attribute attr;
 -      ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
 -      ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
 -};
 -
 -static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
 -                                       struct configfs_attribute *attr,
 -                                       char *page)
 -{
 -      struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
 -      struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
 -              container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
 -      ssize_t ret = 0;
 -
 -      if (o2hb_heartbeat_group_attr->show)
 -              ret = o2hb_heartbeat_group_attr->show(reg, page);
 -      return ret;
 -}
 -
 -static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
 -                                        struct configfs_attribute *attr,
 -                                        const char *page, size_t count)
 -{
 -      struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
 -      struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
 -              container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
 -      ssize_t ret = -EINVAL;
 -
 -      if (o2hb_heartbeat_group_attr->store)
 -              ret = o2hb_heartbeat_group_attr->store(reg, page, count);
 -      return ret;
 -}
 -
 -static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
 -                                                   char *page)
 +static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
 +              char *page)
  {
        return sprintf(page, "%u\n", o2hb_dead_threshold);
  }
  
 -static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
 -                                                  const char *page,
 -                                                  size_t count)
 +static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
 +              const char *page, size_t count)
  {
        unsigned long tmp;
        char *p = (char *)page;
        return count;
  }
  
 -static
 -ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
 -                                     char *page)
 +static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item,
 +              char *page)
  {
        return sprintf(page, "%s\n",
                       o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
  }
  
 -static
 -ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
 -                                      const char *page, size_t count)
 +static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
 +              const char *page, size_t count)
  {
        unsigned int i;
        int ret;
  
  }
  
 -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "dead_threshold",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_heartbeat_group_threshold_show,
 -      .store  = o2hb_heartbeat_group_threshold_store,
 -};
 -
 -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -              .ca_name = "mode",
 -              .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_heartbeat_group_mode_show,
 -      .store  = o2hb_heartbeat_group_mode_store,
 -};
 +CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
 +CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
  
  static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
 -      &o2hb_heartbeat_group_attr_threshold.attr,
 -      &o2hb_heartbeat_group_attr_mode.attr,
 +      &o2hb_heartbeat_group_attr_threshold,
 +      &o2hb_heartbeat_group_attr_mode,
        NULL,
  };
  
 -static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
 -      .show_attribute         = o2hb_heartbeat_group_show,
 -      .store_attribute        = o2hb_heartbeat_group_store,
 -};
 -
  static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
        .make_item      = o2hb_heartbeat_group_make_item,
        .drop_item      = o2hb_heartbeat_group_drop_item,
  
  static struct config_item_type o2hb_heartbeat_group_type = {
        .ct_group_ops   = &o2hb_heartbeat_group_group_ops,
 -      .ct_item_ops    = &o2hb_heartbeat_group_item_ops,
        .ct_attrs       = o2hb_heartbeat_group_attrs,
        .ct_owner       = THIS_MODULE,
  };
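
Two separate changes land in the o2hb heartbeat code above. First, the heartbeat thread now checks the return value of o2nm_depend_this_node(); if the node has been deleted it sets hr_node_deleted, wakes o2hb_steady_queue, and the configfs store path fails region setup with -EINVAL instead of waiting forever. Second, the attribute boilerplate is converted to the generic CONFIGFS_ATTR()/CONFIGFS_ATTR_RO() helpers, which is why the o2hb-specific attribute wrapper structs and the show_attribute/store_attribute item ops disappear. The helpers expand roughly as follows (see include/linux/configfs.h for the authoritative macros), which also explains the generated o2hb_region_attr_* names and the _show/_store naming the functions were renamed to:

	/* Roughly how the configfs helpers expand; the real macros live in
	 * include/linux/configfs.h. */
	#define CONFIGFS_ATTR(_pfx, _name)				\
	static struct configfs_attribute _pfx##attr_##_name = {	\
		.ca_name	= __stringify(_name),			\
		.ca_mode	= S_IRUGO | S_IWUSR,			\
		.ca_owner	= THIS_MODULE,				\
		.show		= _pfx##_name##_show,			\
		.store		= _pfx##_name##_store,			\
	}

	#define CONFIGFS_ATTR_RO(_pfx, _name)				\
	static struct configfs_attribute _pfx##attr_##_name = {	\
		.ca_name	= __stringify(_name),			\
		.ca_mode	= S_IRUGO,				\
		.ca_owner	= THIS_MODULE,				\
		.show		= _pfx##_name##_show,			\
	}

So CONFIGFS_ATTR(o2hb_region_, dev) wires up o2hb_region_dev_show()/o2hb_region_dev_store() and defines o2hb_region_attr_dev, matching the entries in o2hb_region_attrs[].
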
diff --combined fs/proc/array.c
index eed2050db9be9c7795acd2153f976d4742e2fe82,ff2ce1ab064d7ab94fd5b15d7b02e091f58a8426..d73291f5f0fcbfb0cd2cff2bb1b628a72f754f6e
  static inline void task_name(struct seq_file *m, struct task_struct *p)
  {
        char *buf;
+       size_t size;
        char tcomm[sizeof(p->comm)];
+       int ret;
  
        get_task_comm(tcomm, p);
  
        seq_puts(m, "Name:\t");
-       buf = m->buf + m->count;
  
-       /* Ignore error for now */
-       buf += string_escape_str(tcomm, buf, m->size - m->count,
-                                ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+       size = seq_get_buf(m, &buf);
+       ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+       seq_commit(m, ret < size ? ret : -1);
  
-       m->count = buf - m->buf;
        seq_putc(m, '\n');
  }
  
@@@ -375,7 -375,7 +375,7 @@@ int proc_pid_status(struct seq_file *m
  static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task, int whole)
  {
 -      unsigned long vsize, eip, esp, wchan = ~0UL;
 +      unsigned long vsize, eip, esp, wchan = 0;
        int priority, nice;
        int tty_pgrp = -1, tty_nr = 0;
        sigset_t sigign, sigcatch;
        seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
        seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
        seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
 -      seq_put_decimal_ull(m, ' ', wchan);
 +
 +      /*
 +       * We used to output the absolute kernel address, but that's an
 +       * information leak - so instead we show a 0/1 flag here, to signal
 +       * to user-space whether there's a wchan field in /proc/PID/wchan.
 +       *
 +       * This works with older implementations of procps as well.
 +       */
 +      if (wchan)
 +              seq_puts(m, " 1");
 +      else
 +              seq_puts(m, " 0");
 +
        seq_put_decimal_ull(m, ' ', 0);
        seq_put_decimal_ull(m, ' ', 0);
        seq_put_decimal_ll(m, ' ', task->exit_signal);
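
fs/proc/array.c stops reaching into seq_file internals: task_name() now escapes the command name through the seq_get_buf()/seq_commit() pair, and do_task_stat() reports wchan as a bare 0/1 "blocked" flag rather than a raw kernel text address, closing an address leak while staying parseable by existing procps. The buffer pattern generalises to any seq_file producer; a minimal sketch (the wrapper function is hypothetical, the helpers are the ones used above):

	/* Hypothetical wrapper showing the seq_get_buf()/seq_commit()
	 * pattern: write directly into the unused tail of the seq_file
	 * buffer, then either commit the bytes written or flag an
	 * overflow with -1 so the core retries with a larger buffer. */
	static void seq_escape_comm(struct seq_file *m, const char *comm)
	{
		char *buf;
		size_t size = seq_get_buf(m, &buf);
		int ret = string_escape_str(comm, buf, size,
					    ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");

		seq_commit(m, ret < size ? ret : -1);
	}
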
diff --combined fs/proc/task_mmu.c
index b029d426c55892544afcd3bf2b8a5965f6e0e5ee,c00cb0ae24f7de274fc4bc4485156b93af395b0d..9ca699b05e78906167519fa17ccb3acdbde510ec
@@@ -70,6 -70,7 +70,7 @@@ void task_mem(struct seq_file *m, struc
                ptes >> 10,
                pmds >> 10,
                swap << (PAGE_SHIFT-10));
+       hugetlb_report_usage(m, mm);
  }
  
  unsigned long task_vsize(struct mm_struct *mm)
@@@ -446,14 -447,17 +447,17 @@@ struct mem_size_stats 
        unsigned long anonymous;
        unsigned long anonymous_thp;
        unsigned long swap;
+       unsigned long shared_hugetlb;
+       unsigned long private_hugetlb;
        u64 pss;
        u64 swap_pss;
  };
  
  static void smaps_account(struct mem_size_stats *mss, struct page *page,
-               unsigned long size, bool young, bool dirty)
+               bool compound, bool young, bool dirty)
  {
-       int mapcount;
+       int i, nr = compound ? HPAGE_PMD_NR : 1;
+       unsigned long size = nr * PAGE_SIZE;
  
        if (PageAnon(page))
                mss->anonymous += size;
        /* Accumulate the size in pages that have been accessed. */
        if (young || page_is_young(page) || PageReferenced(page))
                mss->referenced += size;
-       mapcount = page_mapcount(page);
-       if (mapcount >= 2) {
-               u64 pss_delta;
  
-               if (dirty || PageDirty(page))
-                       mss->shared_dirty += size;
-               else
-                       mss->shared_clean += size;
-               pss_delta = (u64)size << PSS_SHIFT;
-               do_div(pss_delta, mapcount);
-               mss->pss += pss_delta;
-       } else {
+       /*
+        * page_count(page) == 1 guarantees the page is mapped exactly once.
+        * If any subpage of the compound page mapped with PTE it would elevate
+        * page_count().
+        */
+       if (page_count(page) == 1) {
                if (dirty || PageDirty(page))
                        mss->private_dirty += size;
                else
                        mss->private_clean += size;
                mss->pss += (u64)size << PSS_SHIFT;
+               return;
+       }
+       for (i = 0; i < nr; i++, page++) {
+               int mapcount = page_mapcount(page);
+               if (mapcount >= 2) {
+                       if (dirty || PageDirty(page))
+                               mss->shared_dirty += PAGE_SIZE;
+                       else
+                               mss->shared_clean += PAGE_SIZE;
+                       mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+               } else {
+                       if (dirty || PageDirty(page))
+                               mss->private_dirty += PAGE_SIZE;
+                       else
+                               mss->private_clean += PAGE_SIZE;
+                       mss->pss += PAGE_SIZE << PSS_SHIFT;
+               }
        }
  }
  
@@@ -513,7 -531,8 +531,8 @@@ static void smaps_pte_entry(pte_t *pte
  
        if (!page)
                return;
-       smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+       smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
  }
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@@ -529,8 -548,7 +548,7 @@@ static void smaps_pmd_entry(pmd_t *pmd
        if (IS_ERR_OR_NULL(page))
                return;
        mss->anonymous_thp += HPAGE_PMD_SIZE;
-       smaps_account(mss, page, HPAGE_PMD_SIZE,
-                       pmd_young(*pmd), pmd_dirty(*pmd));
+       smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
  }
  #else
  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@@ -546,7 -564,7 +564,7 @@@ static int smaps_pte_range(pmd_t *pmd, 
        pte_t *pte;
        spinlock_t *ptl;
  
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                smaps_pmd_entry(pmd, addr, walk);
                spin_unlock(ptl);
                return 0;
@@@ -625,12 -643,44 +643,44 @@@ static void show_smap_vma_flags(struct 
        seq_putc(m, '\n');
  }
  
+ #ifdef CONFIG_HUGETLB_PAGE
+ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+                                unsigned long addr, unsigned long end,
+                                struct mm_walk *walk)
+ {
+       struct mem_size_stats *mss = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       struct page *page = NULL;
+       if (pte_present(*pte)) {
+               page = vm_normal_page(vma, addr, *pte);
+       } else if (is_swap_pte(*pte)) {
+               swp_entry_t swpent = pte_to_swp_entry(*pte);
+               if (is_migration_entry(swpent))
+                       page = migration_entry_to_page(swpent);
+       }
+       if (page) {
+               int mapcount = page_mapcount(page);
+               if (mapcount >= 2)
+                       mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+               else
+                       mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+       }
+       return 0;
+ }
+ #endif /* HUGETLB_PAGE */
  static int show_smap(struct seq_file *m, void *v, int is_pid)
  {
        struct vm_area_struct *vma = v;
        struct mem_size_stats mss;
        struct mm_walk smaps_walk = {
                .pmd_entry = smaps_pte_range,
+ #ifdef CONFIG_HUGETLB_PAGE
+               .hugetlb_entry = smaps_hugetlb_range,
+ #endif
                .mm = vma->vm_mm,
                .private = &mss,
        };
                   "Referenced:     %8lu kB\n"
                   "Anonymous:      %8lu kB\n"
                   "AnonHugePages:  %8lu kB\n"
+                  "Shared_Hugetlb: %8lu kB\n"
+                  "Private_Hugetlb: %8lu kB\n"
                   "Swap:           %8lu kB\n"
                   "SwapPss:        %8lu kB\n"
                   "KernelPageSize: %8lu kB\n"
                   mss.referenced >> 10,
                   mss.anonymous >> 10,
                   mss.anonymous_thp >> 10,
+                  mss.shared_hugetlb >> 10,
+                  mss.private_hugetlb >> 10,
                   mss.swap >> 10,
                   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
                   vma_kernel_pagesize(vma) >> 10,
@@@ -753,36 -807,37 +807,37 @@@ static inline void clear_soft_dirty(str
        pte_t ptent = *pte;
  
        if (pte_present(ptent)) {
+               ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
                ptent = pte_wrprotect(ptent);
 -              ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
 +              ptent = pte_clear_soft_dirty(ptent);
+               ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
        } else if (is_swap_pte(ptent)) {
                ptent = pte_swp_clear_soft_dirty(ptent);
+               set_pte_at(vma->vm_mm, addr, pte, ptent);
        }
-       set_pte_at(vma->vm_mm, addr, pte, ptent);
  }
+ #else
+ static inline void clear_soft_dirty(struct vm_area_struct *vma,
+               unsigned long addr, pte_t *pte)
+ {
+ }
+ #endif
  
+ #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
  static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmdp)
  {
-       pmd_t pmd = *pmdp;
+       pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
  
        pmd = pmd_wrprotect(pmd);
 -      pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
 +      pmd = pmd_clear_soft_dirty(pmd);
  
        if (vma->vm_flags & VM_SOFTDIRTY)
                vma->vm_flags &= ~VM_SOFTDIRTY;
  
        set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
  }
  #else
- static inline void clear_soft_dirty(struct vm_area_struct *vma,
-               unsigned long addr, pte_t *pte)
- {
- }
  static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmdp)
  {
@@@ -798,7 -853,7 +853,7 @@@ static int clear_refs_pte_range(pmd_t *
        spinlock_t *ptl;
        struct page *page;
  
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
                        clear_soft_dirty_pmd(vma, addr, pmd);
                        goto out;
@@@ -1072,7 -1127,7 +1127,7 @@@ static int pagemap_pmd_range(pmd_t *pmd
        int err = 0;
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
                u64 flags = 0, frame = 0;
                pmd_t pmd = *pmdp;
  
@@@ -1404,7 -1459,7 +1459,7 @@@ static int gather_pte_stats(pmd_t *pmd
        pte_t *orig_pte;
        pte_t *pte;
  
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                pte_t huge_pte = *(pte_t *)pmd;
                struct page *page;
  
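
The task_mmu.c changes extend smaps in two directions: smaps_account() now takes a compound flag and walks every subpage of a THP so Pss reflects per-subpage map counts (with the page_count() == 1 shortcut for pages mapped exactly once), and hugetlb mappings get their own accounting via a .hugetlb_entry walker feeding the new Shared_Hugetlb/Private_Hugetlb lines, plus hugetlb_report_usage() in task_mem(). The proportional weighting itself is unchanged; isolated, it is just this (PSS_SHIFT is the fixed-point shift defined in task_mmu.c, and the helper below is illustrative, not part of the patch):

	/* Illustrative only: per-subpage proportional-set-size contribution
	 * of a compound page, mirroring the loop in smaps_account() above. */
	static u64 pss_of_compound(struct page *head, int nr_subpages)
	{
		u64 pss = 0;
		int i;

		for (i = 0; i < nr_subpages; i++) {
			int mapcount = page_mapcount(head + i);

			/* each mapper is charged PAGE_SIZE / mapcount, kept
			 * in fixed point until the final report in kB */
			pss += (u64)(PAGE_SIZE << PSS_SHIFT) / max(mapcount, 1);
		}
		return pss;	/* reported as pss >> (10 + PSS_SHIFT) kB */
	}
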
diff --combined fs/xfs/xfs_qm.c
index 7af7648c06c63bd63ec21b150cc2713914f28c08,587174fd4f2c216637eda70b9986848a0980fd09..532ab79d38fe376c14a5463a97195b59a61d8f84
@@@ -184,7 -184,7 +184,7 @@@ xfs_qm_dqpurge
         */
        ASSERT(!list_empty(&dqp->q_lru));
        list_lru_del(&qi->qi_lru, &dqp->q_lru);
 -      XFS_STATS_DEC(xs_qm_dquot_unused);
 +      XFS_STATS_DEC(mp, xs_qm_dquot_unused);
  
        xfs_qm_dqdestroy(dqp);
        return 0;
@@@ -448,11 -448,11 +448,11 @@@ xfs_qm_dquot_isolate
         */
        if (dqp->q_nrefs) {
                xfs_dqunlock(dqp);
 -              XFS_STATS_INC(xs_qm_dqwants);
 +              XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
  
                trace_xfs_dqreclaim_want(dqp);
                list_lru_isolate(lru, &dqp->q_lru);
 -              XFS_STATS_DEC(xs_qm_dquot_unused);
 +              XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
                return LRU_REMOVED;
        }
  
  
        ASSERT(dqp->q_nrefs == 0);
        list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
 -      XFS_STATS_DEC(xs_qm_dquot_unused);
 +      XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
        trace_xfs_dqreclaim_done(dqp);
 -      XFS_STATS_INC(xs_qm_dqreclaims);
 +      XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
        return LRU_REMOVED;
  
  out_miss_busy:
        trace_xfs_dqreclaim_busy(dqp);
 -      XFS_STATS_INC(xs_qm_dqreclaim_misses);
 +      XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
        return LRU_SKIP;
  
  out_unlock_dirty:
        trace_xfs_dqreclaim_busy(dqp);
 -      XFS_STATS_INC(xs_qm_dqreclaim_misses);
 +      XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
        xfs_dqunlock(dqp);
        spin_lock(lru_lock);
        return LRU_RETRY;
@@@ -525,7 -525,7 +525,7 @@@ xfs_qm_shrink_scan
        unsigned long           freed;
        int                     error;
  
-       if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+       if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
                return 0;
  
        INIT_LIST_HEAD(&isol.buffers);
diff --combined include/asm-generic/pgtable.h
index 14b0ff32fb9f16c6ce30e0e54c3f3b4885216699,010a7e3f6ad142dee0b702ddf763f8a181a9f16a..63abda1ac06dbf74793130ca2859b6cc553e6cfc
@@@ -30,19 -30,9 +30,19 @@@ extern int ptep_set_access_flags(struc
  #endif
  
  #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
 +#else
 +static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 +                                      unsigned long address, pmd_t *pmdp,
 +                                      pmd_t entry, int dirty)
 +{
 +      BUILD_BUG();
 +      return 0;
 +}
 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@@ -74,12 -64,12 +74,12 @@@ static inline int pmdp_test_and_clear_y
                set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
        return r;
  }
 -#else /* CONFIG_TRANSPARENT_HUGEPAGE */
 +#else
  static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
  {
 -      BUG();
 +      BUILD_BUG();
        return 0;
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@@ -91,21 -81,8 +91,21 @@@ int ptep_clear_flush_young(struct vm_ar
  #endif
  
  #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
 -int pmdp_clear_flush_young(struct vm_area_struct *vma,
 -                         unsigned long address, pmd_t *pmdp);
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 +                                unsigned long address, pmd_t *pmdp);
 +#else
 +/*
 + * Despite relevant to THP only, this API is called from generic rmap code
 + * under PageTransHuge(), hence needs a dummy implementation for !THP
 + */
 +static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 +                                       unsigned long address, pmd_t *pmdp)
 +{
 +      BUILD_BUG();
 +      return 0;
 +}
 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
@@@ -198,20 -175,15 +198,15 @@@ static inline void pmdp_set_wrprotect(s
        pmd_t old_pmd = *pmdp;
        set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
  }
 -#else /* CONFIG_TRANSPARENT_HUGEPAGE */
 +#else
  static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
  {
 -      BUG();
 +      BUILD_BUG();
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
- #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
- #endif
  #ifndef pmdp_collapse_flush
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
@@@ -271,7 -243,7 +266,7 @@@ static inline int pmd_same(pmd_t pmd_a
  #else /* CONFIG_TRANSPARENT_HUGEPAGE */
  static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
  {
 -      BUG();
 +      BUILD_BUG();
        return 0;
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@@ -505,16 -477,6 +500,16 @@@ static inline pmd_t pmd_mksoft_dirty(pm
        return pmd;
  }
  
 +static inline pte_t pte_clear_soft_dirty(pte_t pte)
 +{
 +      return pte;
 +}
 +
 +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 +{
 +      return pmd;
 +}
 +
  static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
  {
        return pte;
@@@ -619,10 -581,6 +614,6 @@@ static inline int pmd_trans_huge(pmd_t 
  {
        return 0;
  }
- static inline int pmd_trans_splitting(pmd_t pmd)
- {
-       return 0;
- }
  #ifndef __HAVE_ARCH_PMD_WRITE
  static inline int pmd_write(pmd_t pmd)
  {
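
As background on the BUG() -> BUILD_BUG() conversions in this file, a sketch of the stub pattern with a hypothetical helper: when CONFIG_TRANSPARENT_HUGEPAGE is off, every call site of the inline stub should be eliminated by the compiler, so BUILD_BUG() turns any surviving reference into a build failure instead of a runtime crash.

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int example_thp_op(struct vm_area_struct *vma, pmd_t *pmdp);
#else
static inline int example_thp_op(struct vm_area_struct *vma, pmd_t *pmdp)
{
	BUILD_BUG();	/* any reachable call becomes a compile-time error */
	return 0;
}
#endif
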
diff --combined include/drm/drmP.h
index 4d3b842f4319586fef51493cfebae68397d2b0b4,f56cdcecc1c97bd4e6e7d884845ed4551a41a673..0b921ae06cd83585e1d1cf2adb6baf665f203013
@@@ -107,9 -107,6 +107,9 @@@ struct dma_buf_attachment
   * ATOMIC: used in the atomic code.
   *      This is the category used by the DRM_DEBUG_ATOMIC() macro.
   *
 + * VBL: used for verbose debug messages in the vblank code
 + *      This is the category used by the DRM_DEBUG_VBL() macro.
 + *
   * Enabling verbose debug messages is done through the drm.debug parameter,
   * each category being enabled by a bit.
   *
   * drm.debug=0x2 will enable DRIVER messages
   * drm.debug=0x3 will enable CORE and DRIVER messages
   * ...
 - * drm.debug=0xf will enable all messages
 + * drm.debug=0x3f will enable all messages
   *
   * An interesting feature is that it's possible to enable verbose logging at
   * run-time by echoing the debug value in its sysfs node:
  #define DRM_UT_KMS            0x04
  #define DRM_UT_PRIME          0x08
  #define DRM_UT_ATOMIC         0x10
 +#define DRM_UT_VBL            0x20
  
  extern __printf(2, 3)
  void drm_ut_debug_printk(const char *function_name,
@@@ -221,11 -217,6 +221,11 @@@ void drm_err(const char *format, ...)
                if (unlikely(drm_debug & DRM_UT_ATOMIC))                \
                        drm_ut_debug_printk(__func__, fmt, ##args);     \
        } while (0)
 +#define DRM_DEBUG_VBL(fmt, args...)                                   \
 +      do {                                                            \
 +              if (unlikely(drm_debug & DRM_UT_VBL))                   \
 +                      drm_ut_debug_printk(__func__, fmt, ##args);     \
 +      } while (0)
  
  /*@}*/
  
@@@ -421,7 -412,7 +421,7 @@@ struct drm_driver 
        /**
         * get_vblank_counter - get raw hardware vblank counter
         * @dev: DRM device
 -       * @crtc: counter to fetch
 +       * @pipe: counter to fetch
         *
         * Driver callback for fetching a raw hardware vblank counter for @crtc.
         * If a device doesn't have a hardware counter, the driver can simply
         * RETURNS
         * Raw vblank counter value.
         */
 -      u32 (*get_vblank_counter) (struct drm_device *dev, int crtc);
 +      u32 (*get_vblank_counter) (struct drm_device *dev, unsigned int pipe);
  
        /**
         * enable_vblank - enable vblank interrupt events
         * @dev: DRM device
 -       * @crtc: which irq to enable
 +       * @pipe: which irq to enable
         *
         * Enable vblank interrupts for @crtc.  If the device doesn't have
         * a hardware vblank counter, this routine should be a no-op, since
         * Zero on success, appropriate errno if the given @crtc's vblank
         * interrupt cannot be enabled.
         */
 -      int (*enable_vblank) (struct drm_device *dev, int crtc);
 +      int (*enable_vblank) (struct drm_device *dev, unsigned int pipe);
  
        /**
         * disable_vblank - disable vblank interrupt events
         * @dev: DRM device
 -       * @crtc: which irq to enable
 +       * @pipe: which irq to enable
         *
         * Disable vblank interrupts for @crtc.  If the device doesn't have
         * a hardware vblank counter, this routine should be a no-op, since
         * interrupts will have to stay on to keep the count accurate.
         */
 -      void (*disable_vblank) (struct drm_device *dev, int crtc);
 +      void (*disable_vblank) (struct drm_device *dev, unsigned int pipe);
  
        /**
         * Called by \c drm_device_is_agp.  Typically used to determine if a
         * optional accurate ktime_get timestamp of when position was measured.
         *
         * \param dev  DRM device.
 -       * \param crtc Id of the crtc to query.
 +       * \param pipe Id of the crtc to query.
         * \param flags Flags from the caller (DRM_CALLED_FROM_VBLIRQ or 0).
         * \param *vpos Target location for current vertical scanout position.
         * \param *hpos Target location for current horizontal scanout position.
         *               scanout position query. Can be NULL to skip timestamp.
         * \param *etime Target location for timestamp taken immediately after
         *               scanout position query. Can be NULL to skip timestamp.
 +       * \param mode Current display timings.
         *
         * Returns vpos as a positive number while in active scanout area.
         * Returns vpos as a negative number inside vblank, counting the number
         * but unknown small number of scanlines wrt. real scanout position.
         *
         */
 -      int (*get_scanout_position) (struct drm_device *dev, int crtc,
 -                                   unsigned int flags,
 -                                   int *vpos, int *hpos, ktime_t *stime,
 -                                   ktime_t *etime);
 +      int (*get_scanout_position) (struct drm_device *dev, unsigned int pipe,
 +                                   unsigned int flags, int *vpos, int *hpos,
 +                                   ktime_t *stime, ktime_t *etime,
 +                                   const struct drm_display_mode *mode);
  
        /**
         * Called by \c drm_get_last_vbltimestamp. Should return a precise
         * to the OpenML OML_sync_control extension specification.
         *
         * \param dev dev DRM device handle.
 -       * \param crtc crtc for which timestamp should be returned.
 +       * \param pipe crtc for which timestamp should be returned.
         * \param *max_error Maximum allowable timestamp error in nanoseconds.
         *                   Implementation should strive to provide timestamp
         *                   with an error of at most *max_error nanoseconds.
         * negative number on failure. A positive status code on success,
         * which describes how the vblank_time timestamp was computed.
         */
 -      int (*get_vblank_timestamp) (struct drm_device *dev, int crtc,
 +      int (*get_vblank_timestamp) (struct drm_device *dev, unsigned int pipe,
                                     int *max_error,
                                     struct timeval *vblank_time,
                                     unsigned flags);
@@@ -711,8 -701,6 +711,8 @@@ struct drm_vblank_crtc 
        u32 last_wait;                  /* Last vblank seqno waited per CRTC */
        unsigned int inmodeset;         /* Display driver is setting mode */
        unsigned int pipe;              /* crtc index */
 +      int framedur_ns;                /* frame/field duration in ns */
 +      int linedur_ns;                 /* line duration in ns */
        bool enabled;                   /* so we don't call enable more than
                                           once per disable */
  };
@@@ -834,7 -822,6 +834,6 @@@ struct drm_device 
  
        struct drm_sg_mem *sg;  /**< Scatter gather memory */
        unsigned int num_crtcs;                  /**< Number of CRTCs on this device */
-       sigset_t sigmask;
  
        struct {
                int context;
@@@ -918,8 -905,6 +917,8 @@@ extern unsigned int drm_poll(struct fil
  /* Misc. IOCTL support (drm_ioctl.c) */
  int drm_noop(struct drm_device *dev, void *data,
             struct drm_file *file_priv);
 +int drm_invalid_op(struct drm_device *dev, void *data,
 +                 struct drm_file *file_priv);
  
  /* Cache management (drm_cache.c) */
  void drm_clflush_pages(struct page *pages[], unsigned long num_pages);
@@@ -938,12 -923,10 +937,12 @@@ extern int drm_irq_uninstall(struct drm
  extern int drm_vblank_init(struct drm_device *dev, unsigned int num_crtcs);
  extern int drm_wait_vblank(struct drm_device *dev, void *data,
                           struct drm_file *filp);
 -extern u32 drm_vblank_count(struct drm_device *dev, int pipe);
 +extern u32 drm_vblank_count(struct drm_device *dev, unsigned int pipe);
  extern u32 drm_crtc_vblank_count(struct drm_crtc *crtc);
  extern u32 drm_vblank_count_and_time(struct drm_device *dev, unsigned int pipe,
                                     struct timeval *vblanktime);
 +extern u32 drm_crtc_vblank_count_and_time(struct drm_crtc *crtc,
 +                                        struct timeval *vblanktime);
  extern void drm_send_vblank_event(struct drm_device *dev, unsigned int pipe,
                                  struct drm_pending_vblank_event *e);
  extern void drm_crtc_send_vblank_event(struct drm_crtc *crtc,
@@@ -962,12 -945,12 +961,12 @@@ extern void drm_crtc_vblank_off(struct 
  extern void drm_crtc_vblank_reset(struct drm_crtc *crtc);
  extern void drm_crtc_vblank_on(struct drm_crtc *crtc);
  extern void drm_vblank_cleanup(struct drm_device *dev);
 +extern u32 drm_vblank_no_hw_counter(struct drm_device *dev, unsigned int pipe);
  
  extern int drm_calc_vbltimestamp_from_scanoutpos(struct drm_device *dev,
                                                 unsigned int pipe, int *max_error,
                                                 struct timeval *vblank_time,
                                                 unsigned flags,
 -                                               const struct drm_crtc *refcrtc,
                                                 const struct drm_display_mode *mode);
  extern void drm_calc_timestamping_constants(struct drm_crtc *crtc,
                                            const struct drm_display_mode *mode);
index 8efb40e61d6e48021d68f93635eea8d3ab3e8c0b,dc3d6b7ce1ebafe159f4b65b7d1dc732caae4af9..22ab246feed34c104038d3f94e1401ea9a587f8f
  
  #if GCC_VERSION >= 40600
  /*
-  * Tell the optimizer that something else uses this function or variable.
+  * When used with Link Time Optimization, gcc can optimize away C functions or
+  * variables which are referenced only from assembly code.  __visible tells the
+  * optimizer that something else uses this function or variable, thus preventing
+  * this.
   */
  #define __visible     __attribute__((externally_visible))
  #endif
  
+ #if GCC_VERSION >= 40900 && !defined(__CHECKER__)
+ /*
+  * __assume_aligned(n, k): Tell the optimizer that the returned
+  * pointer can be assumed to be k modulo n. The second argument is
+  * optional (default 0), so we use a variadic macro to make the
+  * shorthand.
+  *
+  * Beware: Do not apply this to functions which may return
+  * ERR_PTRs. Also, it is probably unwise to apply it to functions
+  * returning extra information in the low bits (but in that case the
+  * compiler should see some alignment anyway, when the return value is
+  * massaged by 'flags = ptr & 3; ptr &= ~3;').
+  */
+ #define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
+ #endif
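
Two hypothetical declarations showing the shorthand (neither function exists in the tree); the one- and two-argument forms map directly onto gcc's assume_aligned attribute.

/* Returned pointer is promised to be 64-byte aligned. */
void *example_alloc_cacheline(size_t size) __assume_aligned(64);

/* Returned pointer is 8 bytes past a 16-byte boundary (k = 8 modulo n = 16). */
void *example_alloc_off8(size_t size) __assume_aligned(16, 8);
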
  /*
   * GCC 'asm goto' miscompiles certain code sequences:
   *
  #define KASAN_ABI_VERSION 3
  #endif
  
 +#if GCC_VERSION >= 40902
 +/*
 + * Tell the compiler that address safety instrumentation (KASAN)
 + * should not be applied to that function.
 + * Conflicts with inlining: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
 + */
 +#define __no_sanitize_address __attribute__((no_sanitize_address))
 +#endif
 +
  #endif        /* gcc version >= 40000 specific checks */
  
  #if !defined(__noclone)
  #define __noclone     /* not needed */
  #endif
  
 +#if !defined(__no_sanitize_address)
 +#define __no_sanitize_address
 +#endif
 +
  /*
   * A trick to suppress uninitialized variable warning without generating any
   * code
diff --combined include/linux/compiler.h
index 8807e4f1b0e6b1878c845a7301f7aded28b4707b,6167ca663ad954726540431b72a6aca5a8e98de6..f108e5222dad0ea50ae685173eaceb486f6eab94
@@@ -56,7 -56,7 +56,7 @@@ extern void __chk_io_ptr(const volatil
  #include <linux/compiler-gcc.h>
  #endif
  
 -#ifdef CC_USING_HOTPATCH
 +#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__)
  #define notrace __attribute__((hotpatch(0,0)))
  #else
  #define notrace __attribute__((no_instrument_function))
@@@ -198,45 -198,19 +198,45 @@@ void ftrace_likely_update(struct ftrace
  
  #include <uapi/linux/types.h>
  
 -static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
 +#define __READ_ONCE_SIZE                                              \
 +({                                                                    \
 +      switch (size) {                                                 \
 +      case 1: *(__u8 *)res = *(volatile __u8 *)p; break;              \
 +      case 2: *(__u16 *)res = *(volatile __u16 *)p; break;            \
 +      case 4: *(__u32 *)res = *(volatile __u32 *)p; break;            \
 +      case 8: *(__u64 *)res = *(volatile __u64 *)p; break;            \
 +      default:                                                        \
 +              barrier();                                              \
 +              __builtin_memcpy((void *)res, (const void *)p, size);   \
 +              barrier();                                              \
 +      }                                                               \
 +})
 +
 +static __always_inline
 +void __read_once_size(const volatile void *p, void *res, int size)
  {
 -      switch (size) {
 -      case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
 -      case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
 -      case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
 -      case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
 -      default:
 -              barrier();
 -              __builtin_memcpy((void *)res, (const void *)p, size);
 -              barrier();
 -      }
 +      __READ_ONCE_SIZE;
 +}
 +
 +#ifdef CONFIG_KASAN
 +/*
 + * This function is not 'inline' because __no_sanitize_address conflicts
 + * with inlining. Attempting to inline it may cause a build failure.
 + *    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
 + * '__maybe_unused' allows us to avoid defined-but-not-used warnings.
 + */
 +static __no_sanitize_address __maybe_unused
 +void __read_once_size_nocheck(const volatile void *p, void *res, int size)
 +{
 +      __READ_ONCE_SIZE;
 +}
 +#else
 +static __always_inline
 +void __read_once_size_nocheck(const volatile void *p, void *res, int size)
 +{
 +      __READ_ONCE_SIZE;
  }
 +#endif
  
  static __always_inline void __write_once_size(volatile void *p, void *res, int size)
  {
   * required ordering.
   */
  
 -#define READ_ONCE(x) \
 -      ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 +#define __READ_ONCE(x, check)                                         \
 +({                                                                    \
 +      union { typeof(x) __val; char __c[1]; } __u;                    \
 +      if (check)                                                      \
 +              __read_once_size(&(x), __u.__c, sizeof(x));             \
 +      else                                                            \
 +              __read_once_size_nocheck(&(x), __u.__c, sizeof(x));     \
 +      __u.__val;                                                      \
 +})
 +#define READ_ONCE(x) __READ_ONCE(x, 1)
 +
 +/*
 + * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need
 + * to hide memory access from KASAN.
 + */
 +#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0)
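
A minimal, hypothetical example of the intended use (the helper name is made up): code that deliberately performs racy or out-of-object reads, such as a stack unwinder peeking at another task's stack, goes through READ_ONCE_NOCHECK() so KASAN does not report it.

static unsigned long example_peek_stack_word(const unsigned long *sp)
{
	/* May race with the owning task; intentionally hidden from KASAN. */
	return READ_ONCE_NOCHECK(*sp);
}
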
  
  #define WRITE_ONCE(x, val) \
  ({                                                    \
  #define __visible
  #endif
  
+ /*
+  * Assume alignment of return value.
+  */
+ #ifndef __assume_aligned
+ #define __assume_aligned(a, ...)
+ #endif
  /* Are two types/vars the same type (ignoring qualifiers)? */
  #ifndef __same_type
  # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
diff --combined include/linux/fs.h
index f2325998cd20cd551d445072bbcd0fb15c17732c,9355f377fd468914e891214874f62ca646876b80..f78dd76f682817dbdbc1d357f8ffd8126857d3c2
@@@ -1042,7 -1042,6 +1042,7 @@@ extern int fcntl_setlease(unsigned int 
  extern int fcntl_getlease(struct file *filp);
  
  /* fs/locks.c */
 +extern struct srcu_notifier_head      lease_notifier_chain;
  void locks_free_lock_context(struct file_lock_context *ctx);
  void locks_free_lock(struct file_lock *fl);
  extern void locks_init_lock(struct file_lock *);
@@@ -1054,11 -1053,12 +1054,11 @@@ extern void locks_remove_file(struct fi
  extern void locks_release_private(struct file_lock *);
  extern void posix_test_lock(struct file *, struct file_lock *);
  extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
 -extern int posix_lock_inode_wait(struct inode *, struct file_lock *);
  extern int posix_unblock_lock(struct file_lock *);
  extern int vfs_test_lock(struct file *, struct file_lock *);
  extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
  extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
 -extern int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl);
 +extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
  extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
  extern void lease_get_mtime(struct inode *, struct timespec *time);
  extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
@@@ -1144,6 -1144,12 +1144,6 @@@ static inline int posix_lock_file(struc
        return -ENOLCK;
  }
  
 -static inline int posix_lock_inode_wait(struct inode *inode,
 -                                      struct file_lock *fl)
 -{
 -      return -ENOLCK;
 -}
 -
  static inline int posix_unblock_lock(struct file_lock *waiter)
  {
        return -ENOENT;
@@@ -1165,7 -1171,8 +1165,7 @@@ static inline int vfs_cancel_lock(struc
        return 0;
  }
  
 -static inline int flock_lock_inode_wait(struct inode *inode,
 -                                      struct file_lock *request)
 +static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
  {
        return -ENOLCK;
  }
@@@ -1208,9 -1215,14 +1208,9 @@@ static inline struct inode *file_inode(
        return f->f_inode;
  }
  
 -static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 -{
 -      return posix_lock_inode_wait(file_inode(filp), fl);
 -}
 -
 -static inline int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
 +static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
  {
 -      return flock_lock_inode_wait(file_inode(filp), fl);
 +      return locks_lock_inode_wait(file_inode(filp), fl);
  }
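
A hypothetical caller, sketched to show the consolidation: where a filesystem ->flock() implementation previously picked between posix_lock_file_wait() and flock_lock_file_wait(), both lock types now go through locks_lock_file_wait(), which dispatches on fl->fl_flags.

static int example_fs_flock(struct file *filp, int cmd, struct file_lock *fl)
{
	/* Handles both FL_POSIX and FL_FLOCK requests after the rename. */
	return locks_lock_file_wait(filp, fl);
}
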
  
  struct fasync_struct {
@@@ -2410,6 -2422,7 +2410,7 @@@ extern int write_inode_now(struct inod
  extern int filemap_fdatawrite(struct address_space *);
  extern int filemap_flush(struct address_space *);
  extern int filemap_fdatawait(struct address_space *);
+ extern void filemap_fdatawait_keep_errors(struct address_space *);
  extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                   loff_t lend);
  extern int filemap_write_and_wait(struct address_space *mapping);
index 7edd305152983af1ab6aee93f470dd99289046e8,75e34b90074894847eb63aff72d33d2639cf7337..24154c26d469c60984020b5a0441fcb6dde3fcb0
@@@ -32,7 -32,7 +32,7 @@@ static inline struct hugetlb_cgroup *hu
  
        if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
                return NULL;
-       return (struct hugetlb_cgroup *)page[2].lru.next;
+       return (struct hugetlb_cgroup *)page[2].private;
  }
  
  static inline
@@@ -42,13 -42,15 +42,13 @@@ int set_hugetlb_cgroup(struct page *pag
  
        if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
                return -1;
-       page[2].lru.next = (void *)h_cg;
+       page[2].private = (unsigned long)h_cg;
        return 0;
  }
  
  static inline bool hugetlb_cgroup_disabled(void)
  {
 -      if (hugetlb_cgrp_subsys.disabled)
 -              return true;
 -      return false;
 +      return !cgroup_subsys_enabled(hugetlb_cgrp_subsys);
  }
  
  extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
index 27251ed428f7db8adaf54c58b7f9e41deda9048d,a11d9f5d559531c227aa58bd862ae35787c9eb2e..ffc5460ed9e55b54f473bbacd90b62f30f93e978
@@@ -213,9 -213,6 +213,9 @@@ struct mem_cgroup 
        /* OOM-Killer disable */
        int             oom_kill_disable;
  
 +      /* handle for "memory.events" */
 +      struct cgroup_file events_file;
 +
        /* protect arrays of thresholds */
        struct mutex thresholds_lock;
  
@@@ -288,21 -285,21 +288,22 @@@ static inline void mem_cgroup_events(st
                       unsigned int nr)
  {
        this_cpu_add(memcg->stat->events[idx], nr);
 +      cgroup_file_notify(&memcg->events_file);
  }
  
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
  
  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
-                         gfp_t gfp_mask, struct mem_cgroup **memcgp);
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound);
  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             bool lrucare);
- void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+                             bool lrucare, bool compound);
+ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+               bool compound);
  void mem_cgroup_uncharge(struct page *page);
  void mem_cgroup_uncharge_list(struct list_head *page_list);
  
- void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-                       bool lrucare);
+ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
  
  struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
  struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
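
An illustrative charge cycle with the new 'compound' argument (example_charge_new_page() and example_map_page() are hypothetical): a THP is charged as one unit, committed on success and cancelled on failure.

static int example_charge_new_page(struct page *page, struct mm_struct *mm,
				   gfp_t gfp, bool is_thp)
{
	struct mem_cgroup *memcg;
	int err;

	err = mem_cgroup_try_charge(page, mm, gfp, &memcg, is_thp);
	if (err)
		return err;

	if (example_map_page(page)) {		/* hypothetical failure path */
		mem_cgroup_cancel_charge(page, memcg, is_thp);
		return -ENOMEM;
	}

	mem_cgroup_commit_charge(page, memcg, false, is_thp);
	return 0;
}
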
@@@ -350,7 -347,9 +351,7 @@@ ino_t page_cgroup_ino(struct page *page
  
  static inline bool mem_cgroup_disabled(void)
  {
 -      if (memory_cgrp_subsys.disabled)
 -              return true;
 -      return false;
 +      return !cgroup_subsys_enabled(memory_cgrp_subsys);
  }
  
  /*
@@@ -384,7 -383,7 +385,7 @@@ unsigned long mem_cgroup_get_lru_size(s
        return mz->lru_size[lru];
  }
  
- static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+ static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
  {
        unsigned long inactive_ratio;
        unsigned long inactive;
        return inactive * inactive_ratio < active;
  }
  
+ void mem_cgroup_handle_over_high(void);
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
                                struct task_struct *p);
  
  static inline void mem_cgroup_oom_enable(void)
  {
-       WARN_ON(current->memcg_oom.may_oom);
-       current->memcg_oom.may_oom = 1;
+       WARN_ON(current->memcg_may_oom);
+       current->memcg_may_oom = 1;
  }
  
  static inline void mem_cgroup_oom_disable(void)
  {
-       WARN_ON(!current->memcg_oom.may_oom);
-       current->memcg_oom.may_oom = 0;
+       WARN_ON(!current->memcg_may_oom);
+       current->memcg_may_oom = 0;
  }
  
  static inline bool task_in_memcg_oom(struct task_struct *p)
  {
-       return p->memcg_oom.memcg;
+       return p->memcg_in_oom;
  }
  
  bool mem_cgroup_oom_synchronize(bool wait);
@@@ -512,7 -513,8 +515,8 @@@ static inline bool mem_cgroup_low(struc
  
  static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask,
-                                       struct mem_cgroup **memcgp)
+                                       struct mem_cgroup **memcgp,
+                                       bool compound)
  {
        *memcgp = NULL;
        return 0;
  
  static inline void mem_cgroup_commit_charge(struct page *page,
                                            struct mem_cgroup *memcg,
-                                           bool lrucare)
+                                           bool lrucare, bool compound)
  {
  }
  
  static inline void mem_cgroup_cancel_charge(struct page *page,
-                                           struct mem_cgroup *memcg)
+                                           struct mem_cgroup *memcg,
+                                           bool compound)
  {
  }
  
@@@ -537,9 -540,7 +542,7 @@@ static inline void mem_cgroup_uncharge_
  {
  }
  
- static inline void mem_cgroup_migrate(struct page *oldpage,
-                                     struct page *newpage,
-                                     bool lrucare)
+ static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
  {
  }
  
@@@ -585,10 -586,10 +588,10 @@@ static inline bool mem_cgroup_disabled(
        return true;
  }
  
- static inline int
+ static inline bool
  mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
  {
-       return 1;
+       return true;
  }
  
  static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
@@@ -622,6 -623,10 +625,10 @@@ static inline void mem_cgroup_end_page_
  {
  }
  
+ static inline void mem_cgroup_handle_over_high(void)
+ {
+ }
  static inline void mem_cgroup_oom_enable(void)
  {
  }
@@@ -678,9 -683,8 +685,9 @@@ enum 
  
  struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback);
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback);
  
  #else /* CONFIG_CGROUP_WRITEBACK */
  
@@@ -690,8 -694,7 +697,8 @@@ static inline struct wb_domain *mem_cgr
  }
  
  static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 -                                     unsigned long *pavail,
 +                                     unsigned long *pfilepages,
 +                                     unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
  {
@@@ -748,11 -751,10 +755,10 @@@ static inline bool memcg_kmem_is_active
   * conditions, but because they are pretty simple, they are expected to be
   * fast.
   */
- bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
-                                       int order);
- void __memcg_kmem_commit_charge(struct page *page,
-                                      struct mem_cgroup *memcg, int order);
- void __memcg_kmem_uncharge_pages(struct page *page, int order);
+ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+                             struct mem_cgroup *memcg);
+ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+ void __memcg_kmem_uncharge(struct page *page, int order);
  
  /*
   * helper for accessing a memcg's index. It will be used as an index in the
@@@ -767,77 -769,42 +773,42 @@@ static inline int memcg_cache_id(struc
  struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
  void __memcg_kmem_put_cache(struct kmem_cache *cachep);
  
- struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
- int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                     unsigned long nr_pages);
- void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
- /**
-  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
-  * @gfp: the gfp allocation flags.
-  * @memcg: a pointer to the memcg this was charged against.
-  * @order: allocation order.
-  *
-  * returns true if the memcg where the current task belongs can hold this
-  * allocation.
-  *
-  * We return true automatically if this allocation is not to be accounted to
-  * any memcg.
-  */
- static inline bool
- memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+ static inline bool __memcg_kmem_bypass(gfp_t gfp)
  {
        if (!memcg_kmem_enabled())
                return true;
        if (gfp & __GFP_NOACCOUNT)
                return true;
-       /*
-        * __GFP_NOFAIL allocations will move on even if charging is not
-        * possible. Therefore we don't even try, and have this allocation
-        * unaccounted. We could in theory charge it forcibly, but we hope
-        * those allocations are rare, and won't be worth the trouble.
-        */
-       if (gfp & __GFP_NOFAIL)
-               return true;
        if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
                return true;
-       /* If the test is dying, just let it go. */
-       if (unlikely(fatal_signal_pending(current)))
-               return true;
-       return __memcg_kmem_newpage_charge(gfp, memcg, order);
+       return false;
  }
  
  /**
-  * memcg_kmem_uncharge_pages: uncharge pages from memcg
-  * @page: pointer to struct page being freed
-  * @order: allocation order.
+  * memcg_kmem_charge: charge a kmem page
+  * @page: page to charge
+  * @gfp: reclaim mode
+  * @order: allocation order
+  *
+  * Returns 0 on success, an error code on failure.
   */
- static inline void
- memcg_kmem_uncharge_pages(struct page *page, int order)
+ static __always_inline int memcg_kmem_charge(struct page *page,
                                           gfp_t gfp, int order)
  {
-       if (memcg_kmem_enabled())
-               __memcg_kmem_uncharge_pages(page, order);
+       if (__memcg_kmem_bypass(gfp))
+               return 0;
+       return __memcg_kmem_charge(page, gfp, order);
  }
  
  /**
-  * memcg_kmem_commit_charge: embeds correct memcg in a page
-  * @page: pointer to struct page recently allocated
-  * @memcg: the memcg structure we charged against
-  * @order: allocation order.
-  *
-  * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
-  * failure of the allocation. if @page is NULL, this function will revert the
-  * charges. Otherwise, it will commit @page to @memcg.
+  * memcg_kmem_uncharge: uncharge a kmem page
+  * @page: page to uncharge
+  * @order: allocation order
   */
- static inline void
- memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+ static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
  {
-       if (memcg_kmem_enabled() && memcg)
-               __memcg_kmem_commit_charge(page, memcg, order);
+       if (memcg_kmem_enabled())
+               __memcg_kmem_uncharge(page, order);
  }
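
A sketch of the simplified kmem accounting interface (the two wrappers below are hypothetical): charge once after allocation, uncharge once before freeing, instead of the old newpage_charge/commit_charge/uncharge_pages triple.

static struct page *example_alloc_accounted(gfp_t gfp, int order)
{
	struct page *page = alloc_pages(gfp, order);

	if (page && memcg_kmem_charge(page, gfp, order)) {
		__free_pages(page, order);
		return NULL;
	}
	return page;
}

static void example_free_accounted(struct page *page, int order)
{
	memcg_kmem_uncharge(page, order);
	__free_pages(page, order);
}
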
  
  /**
  static __always_inline struct kmem_cache *
  memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
  {
-       if (!memcg_kmem_enabled())
+       if (__memcg_kmem_bypass(gfp))
                return cachep;
-       if (gfp & __GFP_NOACCOUNT)
-               return cachep;
-       if (gfp & __GFP_NOFAIL)
-               return cachep;
-       if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
-               return cachep;
-       if (unlikely(fatal_signal_pending(current)))
-               return cachep;
        return __memcg_kmem_get_cache(cachep);
  }
  
@@@ -869,13 -827,6 +831,6 @@@ static __always_inline void memcg_kmem_
        if (memcg_kmem_enabled())
                __memcg_kmem_put_cache(cachep);
  }
- static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
- {
-       if (!memcg_kmem_enabled())
-               return NULL;
-       return __mem_cgroup_from_kmem(ptr);
- }
  #else
  #define for_each_memcg_cache_index(_idx)      \
        for (; NULL; )
@@@ -890,18 -841,12 +845,12 @@@ static inline bool memcg_kmem_is_active
        return false;
  }
  
- static inline bool
- memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+ static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
  {
-       return true;
+       return 0;
  }
  
- static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
- {
- }
- static inline void
- memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+ static inline void memcg_kmem_uncharge(struct page *page, int order)
  {
  }
  
@@@ -927,11 -872,5 +876,5 @@@ memcg_kmem_get_cache(struct kmem_cache 
  static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
  {
  }
- static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
- {
-       return NULL;
- }
  #endif /* CONFIG_MEMCG_KMEM */
  #endif /* _LINUX_MEMCONTROL_H */
diff --combined include/linux/sched.h
index 4effb1025fbb1555bc9c3ce6f80d98db004271a2,02b63957a721a587d2f1db37e8628f869350d63c..4069febaa34af9e93be8bb98e807db04a67c4380
@@@ -384,6 -384,7 +384,7 @@@ extern int proc_dowatchdog_thresh(struc
                                  void __user *buffer,
                                  size_t *lenp, loff_t *ppos);
  extern unsigned int  softlockup_panic;
+ extern unsigned int  hardlockup_panic;
  void lockup_detector_init(void);
  #else
  static inline void touch_softlockup_watchdog(void)
@@@ -599,42 -600,33 +600,42 @@@ struct task_cputime_atomic 
                .sum_exec_runtime = ATOMIC64_INIT(0),           \
        }
  
 -#ifdef CONFIG_PREEMPT_COUNT
 -#define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
 -#else
 -#define PREEMPT_DISABLED      PREEMPT_ENABLED
 -#endif
 +#define PREEMPT_DISABLED      (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /*
 - * Disable preemption until the scheduler is running.
 - * Reset by start_kernel()->sched_init()->init_idle().
 + * Disable preemption until the scheduler is running -- use an unconditional
 + * value so that it also works on !PREEMPT_COUNT kernels.
   *
 - * We include PREEMPT_ACTIVE to avoid cond_resched() from working
 - * before the scheduler is active -- see should_resched().
 + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
   */
 -#define INIT_PREEMPT_COUNT    (PREEMPT_DISABLED + PREEMPT_ACTIVE)
 +#define INIT_PREEMPT_COUNT    PREEMPT_OFFSET
 +
 +/*
 + * Initial preempt_count value; reflects the preempt_count schedule invariant
 + * which states that during context switches:
 + *
 + *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 + *
 + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 + * Note: See finish_task_switch().
 + */
 +#define FORK_PREEMPT_COUNT    (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
   * @cputime_atomic:   atomic thread group interval timers.
 - * @running:          non-zero when there are timers running and
 - *                    @cputime receives updates.
 + * @running:          true when there are timers running and
 + *                    @cputime_atomic receives updates.
 + * @checking_timer:   true when a thread in the group is in the
 + *                    process of checking for thread group timers.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
 -      int running;
 +      bool running;
 +      bool checking_timer;
  };
  
  #include <linux/rwsem.h>
@@@ -771,6 -763,18 +772,6 @@@ struct signal_struct 
        unsigned audit_tty_log_passwd;
        struct tty_audit_buf *tty_audit_buf;
  #endif
 -#ifdef CONFIG_CGROUPS
 -      /*
 -       * group_rwsem prevents new tasks from entering the threadgroup and
 -       * member tasks from exiting,a more specifically, setting of
 -       * PF_EXITING.  fork and exit paths are protected with this rwsem
 -       * using threadgroup_change_begin/end().  Users which require
 -       * threadgroup to remain stable should use threadgroup_[un]lock()
 -       * which also takes care of exec path.  Currently, cgroup is the
 -       * only user.
 -       */
 -      struct rw_semaphore group_rwsem;
 -#endif
  
        oom_flags_t oom_flags;
        short oom_score_adj;            /* OOM kill score adjustment */
@@@ -837,7 -841,7 +838,7 @@@ struct user_struct 
        struct hlist_node uidhash_node;
        kuid_t uid;
  
 -#ifdef CONFIG_PERF_EVENTS
 +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
        atomic_long_t locked_vm;
  #endif
  };
@@@ -1136,6 -1140,8 +1137,6 @@@ struct sched_domain_topology_level 
  #endif
  };
  
 -extern struct sched_domain_topology_level *sched_domain_topology;
 -
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@@ -1184,10 -1190,10 +1185,10 @@@ struct load_weight 
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
 - * 1) load_avg factors the amount of time that a sched_entity is
 - * runnable on a rq into its weight. For cfs_rq, it is the aggregated
 - * such weights of all runnable and blocked sched_entities.
 - * 2) util_avg factors frequency scaling into the amount of time
 + * 1) load_avg factors frequency scaling into the amount of time that a
 + * sched_entity is runnable on a rq, scaled by its weight. For cfs_rq, it is the
 + * aggregated such weights of all runnable and blocked sched_entities.
 + * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
@@@ -1337,12 -1343,10 +1338,12 @@@ struct sched_dl_entity 
  
  union rcu_special {
        struct {
 -              bool blocked;
 -              bool need_qs;
 -      } b;
 -      short s;
 +              u8 blocked;
 +              u8 need_qs;
 +              u8 exp_need_qs;
 +              u8 pad; /* Otherwise the compiler can store garbage here. */
 +      } b; /* Bits. */
 +      u32 s; /* Set of bits. */
  };
  struct rcu_node;
  
@@@ -1460,7 -1464,9 +1461,9 @@@ struct task_struct 
        unsigned sched_reset_on_fork:1;
        unsigned sched_contributes_to_load:1;
        unsigned sched_migrated:1;
+ #ifdef CONFIG_MEMCG
+       unsigned memcg_may_oom:1;
+ #endif
  #ifdef CONFIG_MEMCG_KMEM
        unsigned memcg_kmem_skip_account:1;
  #endif
  
        unsigned long sas_ss_sp;
        size_t sas_ss_size;
-       int (*notifier)(void *priv);
-       void *notifier_data;
-       sigset_t *notifier_mask;
        struct callback_head *task_works;
  
        struct audit_context *audit_context;
        unsigned long trace_recursion;
  #endif /* CONFIG_TRACING */
  #ifdef CONFIG_MEMCG
-       struct memcg_oom_info {
-               struct mem_cgroup *memcg;
-               gfp_t gfp_mask;
-               int order;
-               unsigned int may_oom:1;
-       } memcg_oom;
+       struct mem_cgroup *memcg_in_oom;
+       gfp_t memcg_oom_gfp_mask;
+       int memcg_oom_order;
+       /* number of pages to reclaim on returning to userland */
+       unsigned int memcg_nr_pages_over_high;
  #endif
  #ifdef CONFIG_UPROBES
        struct uprobe_task *utask;
@@@ -2461,21 -2465,29 +2462,29 @@@ extern void ignore_signals(struct task_
  extern void flush_signal_handlers(struct task_struct *, int force_default);
  extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
  
- static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+ static inline int kernel_dequeue_signal(siginfo_t *info)
  {
-       unsigned long flags;
+       struct task_struct *tsk = current;
+       siginfo_t __info;
        int ret;
  
-       spin_lock_irqsave(&tsk->sighand->siglock, flags);
-       ret = dequeue_signal(tsk, mask, info);
-       spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+       spin_lock_irq(&tsk->sighand->siglock);
+       ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
+       spin_unlock_irq(&tsk->sighand->siglock);
  
        return ret;
  }
  
- extern void block_all_signals(int (*notifier)(void *priv), void *priv,
-                             sigset_t *mask);
- extern void unblock_all_signals(void);
+ static inline void kernel_signal_stop(void)
+ {
+       spin_lock_irq(&current->sighand->siglock);
+       if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+               __set_current_state(TASK_STOPPED);
+       spin_unlock_irq(&current->sighand->siglock);
+       schedule();
+ }
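
A hypothetical kernel thread using the two new helpers (the loop is illustrative only): pending signals are drained with kernel_dequeue_signal(), and a dequeued stop signal parks the thread via kernel_signal_stop() until SIGCONT.

static int example_signal_aware_kthread(void *unused)
{
	allow_signal(SIGKILL);
	allow_signal(SIGSTOP);

	while (!kthread_should_stop()) {
		if (signal_pending(current)) {
			int sig = kernel_dequeue_signal(NULL);

			if (sig == SIGSTOP)
				kernel_signal_stop();
			else if (sig == SIGKILL)
				break;
		}
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
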
  extern void release_task(struct task_struct * p);
  extern int send_sig_info(int, struct siginfo *, struct task_struct *);
  extern int force_sigsegv(int, struct task_struct *);
diff --combined include/linux/skbuff.h
index 24f4dfd94c517b3b387682509180dee161e0912d,4d82b886af839af39f92325da3952567eb4464c6..4355129fff91b6f188136af2a499d6100f2e5bfd
@@@ -463,15 -463,6 +463,15 @@@ static inline u32 skb_mstamp_us_delta(c
        return delta_us;
  }
  
 +static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
 +                                  const struct skb_mstamp *t0)
 +{
 +      s32 diff = t1->stamp_jiffies - t0->stamp_jiffies;
 +
 +      if (!diff)
 +              diff = t1->stamp_us - t0->stamp_us;
 +      return diff > 0;
 +}
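
A small, hypothetical comparison helper showing the intended use: skb_mstamp_after() orders two timestamps by jiffies first and falls back to the microsecond part when both fall in the same jiffy.

static bool example_sample_is_newer(const struct skb_mstamp *sample,
				    const struct skb_mstamp *reference)
{
	return skb_mstamp_after(sample, reference);
}
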
  
  /** 
   *    struct sk_buff - socket buffer
@@@ -1224,7 -1215,7 +1224,7 @@@ static inline int skb_cloned(const stru
  
  static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
  {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
  
        if (skb_cloned(skb))
                return pskb_expand_head(skb, 0, 0, pri);
@@@ -1308,7 -1299,7 +1308,7 @@@ static inline int skb_shared(const stru
   */
  static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
  {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, pri);
  
  static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
                                          gfp_t pri)
  {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
        if (skb_cloned(skb)) {
                struct sk_buff *nskb = skb_copy(skb, pri);
  
diff --combined include/net/sock.h
index aeed5c95f3caedcdb4c10668c67764d8557e9369,509694740bede696abd69910aee0d1e87a68caf5..59a71965b47682edadc1b37b69d112c8b347a95a
@@@ -150,10 -150,6 +150,10 @@@ typedef __u64 __bitwise __addrpair
   *    @skc_node: main hash linkage for various protocol lookup tables
   *    @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
   *    @skc_tx_queue_mapping: tx queue number for this connection
 + *    @skc_flags: place holder for sk_flags
 + *            %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 + *            %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 + *    @skc_incoming_cpu: record/match cpu processing incoming packets
   *    @skc_refcnt: reference count
   *
   *    This is the minimal network layer representation of sockets, the header
@@@ -204,16 -200,6 +204,16 @@@ struct sock_common 
  
        atomic64_t              skc_cookie;
  
 +      /* following fields are padding to force
 +       * offset(struct sock, sk_refcnt) == 128 on 64bit arches
 +       * assuming IPV6 is enabled. We use this padding differently
 +       * for different kinds of 'sockets'
 +       */
 +      union {
 +              unsigned long   skc_flags;
 +              struct sock     *skc_listener; /* request_sock */
 +              struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
 +      };
        /*
         * fields between dontcopy_begin/dontcopy_end
         * are not copied in sock_copy()
                struct hlist_nulls_node skc_nulls_node;
        };
        int                     skc_tx_queue_mapping;
 +      union {
 +              int             skc_incoming_cpu;
 +              u32             skc_rcv_wnd;
 +              u32             skc_tw_rcv_nxt; /* struct tcp_timewait_sock  */
 +      };
 +
        atomic_t                skc_refcnt;
        /* private: */
        int                     skc_dontcopy_end[0];
 +      union {
 +              u32             skc_rxhash;
 +              u32             skc_window_clamp;
 +              u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
 +      };
        /* public: */
  };
  
@@@ -268,6 -243,8 +268,6 @@@ struct cg_proto
    *   @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
    *   @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
    *   @sk_sndbuf: size of send buffer in bytes
 -  *   @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 -  *              %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
    *   @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
    *   @sk_no_check_rx: allow zero checksum in RX packets
    *   @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
    *   @sk_rcvlowat: %SO_RCVLOWAT setting
    *   @sk_rcvtimeo: %SO_RCVTIMEO setting
    *   @sk_sndtimeo: %SO_SNDTIMEO setting
 -  *   @sk_rxhash: flow hash received from netif layer
 -  *   @sk_incoming_cpu: record cpu processing incoming packets
    *   @sk_txhash: computed flow hash for use on transmit
    *   @sk_filter: socket filtering instructions
    *   @sk_timer: sock cleanup timer
@@@ -352,9 -331,6 +352,9 @@@ struct sock 
  #define sk_v6_daddr           __sk_common.skc_v6_daddr
  #define sk_v6_rcv_saddr       __sk_common.skc_v6_rcv_saddr
  #define sk_cookie             __sk_common.skc_cookie
 +#define sk_incoming_cpu               __sk_common.skc_incoming_cpu
 +#define sk_flags              __sk_common.skc_flags
 +#define sk_rxhash             __sk_common.skc_rxhash
  
        socket_lock_t           sk_lock;
        struct sk_buff_head     sk_receive_queue;
        } sk_backlog;
  #define sk_rmem_alloc sk_backlog.rmem_alloc
        int                     sk_forward_alloc;
 -#ifdef CONFIG_RPS
 -      __u32                   sk_rxhash;
 -#endif
 -      u16                     sk_incoming_cpu;
 -      /* 16bit hole
 -       * Warned : sk_incoming_cpu can be set from softirq,
 -       * Do not use this hole without fully understanding possible issues.
 -       */
  
        __u32                   sk_txhash;
  #ifdef CONFIG_NET_RX_BUSY_POLL
  #ifdef CONFIG_XFRM
        struct xfrm_policy      *sk_policy[2];
  #endif
 -      unsigned long           sk_flags;
        struct dst_entry        *sk_rx_dst;
        struct dst_entry __rcu  *sk_dst_cache;
        spinlock_t              sk_dst_lock;
@@@ -774,7 -759,7 +774,7 @@@ static inline int sk_memalloc_socks(voi
  
  #endif
  
 -static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask)
 +static inline gfp_t sk_gfp_atomic(const struct sock *sk, gfp_t gfp_mask)
  {
        return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC);
  }
@@@ -1537,13 -1522,6 +1537,13 @@@ void sock_kfree_s(struct sock *sk, voi
  void sock_kzfree_s(struct sock *sk, void *mem, int size);
  void sk_send_sigurg(struct sock *sk);
  
 +struct sockcm_cookie {
 +      u32 mark;
 +};
 +
 +int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 +                 struct sockcm_cookie *sockc);
 +
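
A hypothetical send-path snippet (the function is illustrative): the cookie starts from the socket's default mark and sock_cmsg_send() overrides it from any SO_MARK control message supplied with this call.

static int example_mark_from_cmsg(struct sock *sk, struct msghdr *msg, u32 *mark)
{
	struct sockcm_cookie sockc = { .mark = sk->sk_mark };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}

	*mark = sockc.mark;
	return 0;
}
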
  /*
   * Functions to fill in entries in struct proto_ops when a protocol
   * does not implement a particular function.
@@@ -1684,16 -1662,12 +1684,16 @@@ static inline void sock_graft(struct so
  kuid_t sock_i_uid(struct sock *sk);
  unsigned long sock_i_ino(struct sock *sk);
  
 -static inline void sk_set_txhash(struct sock *sk)
 +static inline u32 net_tx_rndhash(void)
  {
 -      sk->sk_txhash = prandom_u32();
 +      u32 v = prandom_u32();
  
 -      if (unlikely(!sk->sk_txhash))
 -              sk->sk_txhash = 1;
 +      return v ?: 1;
 +}
 +
 +static inline void sk_set_txhash(struct sock *sk)
 +{
 +      sk->sk_txhash = net_tx_rndhash();
  }
  
  static inline void sk_rethink_txhash(struct sock *sk)
@@@ -2054,7 -2028,7 +2054,7 @@@ struct sk_buff *sk_stream_alloc_skb(str
   */
  static inline struct page_frag *sk_page_frag(struct sock *sk)
  {
-       if (sk->sk_allocation & __GFP_WAIT)
+       if (gfpflags_allow_blocking(sk->sk_allocation))
                return &current->task_frag;
  
        return &sk->sk_frag;
@@@ -2231,14 -2205,6 +2231,14 @@@ static inline bool sk_fullsock(const st
        return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
  }
  
 +/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV
 + * SYNACK messages can be attached to either one (depending on SYNCOOKIE)
 + */
 +static inline bool sk_listener(const struct sock *sk)
 +{
 +      return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
 +}
 +
  void sock_enable_timestamp(struct sock *sk, int flag);
  int sock_get_timestamp(struct sock *, struct timeval __user *);
  int sock_get_timestampns(struct sock *, struct timespec __user *);
diff --combined kernel/audit.c
index 8a056a32ded7d2b4560612af7d84ac317f7a17b3,6ae6e2b62e3e5aa567526b0b24dc622e5de33e98..5ffcbd354a520b88781ed2d66c7839a7aaa7f86d
@@@ -407,33 -407,16 +407,33 @@@ static void audit_printk_skb(struct sk_
  static void kauditd_send_skb(struct sk_buff *skb)
  {
        int err;
 +      int attempts = 0;
 +#define AUDITD_RETRIES 5
 +
 +restart:
        /* take a reference in case we can't send it and we want to hold it */
        skb_get(skb);
        err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
        if (err < 0) {
 -              BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 +              pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
 +                     audit_pid, err);
                if (audit_pid) {
 -                      pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
 -                      audit_log_lost("auditd disappeared");
 -                      audit_pid = 0;
 -                      audit_sock = NULL;
 +                      if (err == -ECONNREFUSED || err == -EPERM
 +                          || ++attempts >= AUDITD_RETRIES) {
 +                              char s[32];
 +
 +                              snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
 +                              audit_log_lost(s);
 +                              audit_pid = 0;
 +                              audit_sock = NULL;
 +                      } else {
 +                              pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
 +                                      attempts, audit_pid);
 +                              set_current_state(TASK_INTERRUPTIBLE);
 +                              schedule();
 +                              __set_current_state(TASK_RUNNING);
 +                              goto restart;
 +                      }
                }
                /* we might get lucky and get this in the next auditd */
                audit_hold_skb(skb);
@@@ -701,22 -684,25 +701,22 @@@ static int audit_netlink_ok(struct sk_b
        return err;
  }
  
 -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
 +static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
  {
 -      int rc = 0;
        uid_t uid = from_kuid(&init_user_ns, current_uid());
        pid_t pid = task_tgid_nr(current);
  
        if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
                *ab = NULL;
 -              return rc;
 +              return;
        }
  
        *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
        if (unlikely(!*ab))
 -              return rc;
 +              return;
        audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
        audit_log_session_info(*ab);
        audit_log_task_context(*ab);
 -
 -      return rc;
  }
  
  int is_audit_feature_set(int i)
@@@ -1371,16 -1357,16 +1371,16 @@@ struct audit_buffer *audit_log_start(st
        if (unlikely(audit_filter_type(type)))
                return NULL;
  
-       if (gfp_mask & __GFP_WAIT) {
+       if (gfp_mask & __GFP_DIRECT_RECLAIM) {
                if (audit_pid && audit_pid == current->pid)
-                       gfp_mask &= ~__GFP_WAIT;
+                       gfp_mask &= ~__GFP_DIRECT_RECLAIM;
                else
                        reserve = 0;
        }
  
        while (audit_backlog_limit
               && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
-               if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+               if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
                        long sleep_time;
  
                        sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
@@@ -1580,14 -1566,14 +1580,14 @@@ void audit_log_n_string(struct audit_bu
   * @string: string to be checked
   * @len: max length of the string to check
   */
 -int audit_string_contains_control(const char *string, size_t len)
 +bool audit_string_contains_control(const char *string, size_t len)
  {
        const unsigned char *p;
        for (p = string; p < (const unsigned char *)string + len; p++) {
                if (*p == '"' || *p < 0x21 || *p > 0x7e)
 -                      return 1;
 +                      return true;
        }
 -      return 0;
 +      return false;
  }
  
  /**
diff --combined kernel/cgroup.c
index b9d0cce3f9ce54937fea988b531d0cc7bf52f692,311b00c30a889e02eec2fbbffd0b0565a347943e..f1603c153890d2b9dbd37a5c687fd297c6137f24
@@@ -45,7 -45,7 +45,7 @@@
  #include <linux/sched.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>
 -#include <linux/rwsem.h>
 +#include <linux/percpu-rwsem.h>
  #include <linux/string.h>
  #include <linux/sort.h>
  #include <linux/kmod.h>
@@@ -75,7 -75,7 +75,7 @@@
   * cgroup_mutex is the master lock.  Any modification to cgroup or its
   * hierarchy must be performed while holding it.
   *
 - * css_set_rwsem protects task->cgroups pointer, the list of css_set
 + * css_set_lock protects task->cgroups pointer, the list of css_set
   * objects, and the chain of tasks off each css_set.
   *
   * These locks are exported if CONFIG_PROVE_RCU so that accessors in
   */
  #ifdef CONFIG_PROVE_RCU
  DEFINE_MUTEX(cgroup_mutex);
 -DECLARE_RWSEM(css_set_rwsem);
 +DEFINE_SPINLOCK(css_set_lock);
  EXPORT_SYMBOL_GPL(cgroup_mutex);
 -EXPORT_SYMBOL_GPL(css_set_rwsem);
 +EXPORT_SYMBOL_GPL(css_set_lock);
  #else
  static DEFINE_MUTEX(cgroup_mutex);
 -static DECLARE_RWSEM(css_set_rwsem);
 +static DEFINE_SPINLOCK(css_set_lock);
  #endif
  
  /*
@@@ -103,8 -103,6 +103,8 @@@ static DEFINE_SPINLOCK(cgroup_idr_lock)
   */
  static DEFINE_SPINLOCK(release_agent_path_lock);
  
 +struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 +
  #define cgroup_assert_mutex_or_rcu_locked()                           \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
                           !lockdep_is_held(&cgroup_mutex),             \
@@@ -138,27 -136,6 +138,27 @@@ static const char *cgroup_subsys_name[
  };
  #undef SUBSYS
  
 +/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
 +#define SUBSYS(_x)                                                            \
 +      DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                 \
 +      DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                  \
 +      EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                      \
 +      EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
 +#include <linux/cgroup_subsys.h>
 +#undef SUBSYS
 +
 +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
 +static struct static_key_true *cgroup_subsys_enabled_key[] = {
 +#include <linux/cgroup_subsys.h>
 +};
 +#undef SUBSYS
 +
 +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
 +static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 +#include <linux/cgroup_subsys.h>
 +};
 +#undef SUBSYS
 +
  /*
   * The default hierarchy, reserved for the subsystems that are otherwise
   * unattached - it never has more than a single cgroup, and all tasks are
@@@ -173,6 -150,12 +173,6 @@@ EXPORT_SYMBOL_GPL(cgrp_dfl_root)
   */
  static bool cgrp_dfl_root_visible;
  
 -/*
 - * Set by the boot param of the same name and makes subsystems with NULL
 - * ->dfl_files to use ->legacy_files on the default hierarchy.
 - */
 -static bool cgroup_legacy_files_on_dfl;
 -
  /* some controllers are not supported in the default hierarchy */
  static unsigned long cgrp_dfl_root_inhibit_ss_mask;
  
@@@ -200,7 -183,6 +200,7 @@@ static u64 css_serial_nr_next = 1
   */
  static unsigned long have_fork_callback __read_mostly;
  static unsigned long have_exit_callback __read_mostly;
 +static unsigned long have_free_callback __read_mostly;
  
  /* Ditto for the can_fork callback. */
  static unsigned long have_canfork_callback __read_mostly;
@@@ -210,87 -192,14 +210,87 @@@ static struct cftype cgroup_legacy_base
  
  static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned long ss_mask);
 +static void css_task_iter_advance(struct css_task_iter *it);
  static int cgroup_destroy_locked(struct cgroup *cgrp);
  static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
                      bool visible);
  static void css_release(struct percpu_ref *ref);
  static void kill_css(struct cgroup_subsys_state *css);
 -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 +static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 +                            struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);
  
 +/**
 + * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 + * @ssid: subsys ID of interest
 + *
 + * cgroup_subsys_enabled() can only be used with literal subsys names, which
 + * is fine for individual subsystems but unsuitable for cgroup core.  This
 + * is a slower, static_key_enabled()-based test indexed by @ssid.
 + */
 +static bool cgroup_ssid_enabled(int ssid)
 +{
 +      return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 +}
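The SUBSYS() blocks above expand <linux/cgroup_subsys.h> several times to stamp out one static key per controller plus ID-indexed lookup arrays, and cgroup_ssid_enabled() is the index-based test built on top. A user-space sketch of the same X-macro pattern, with a made-up three-entry subsystem list and plain bools standing in for static keys:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for <linux/cgroup_subsys.h>: the same list expanded repeatedly. */
#define SUBSYS_LIST(x) x(cpu) x(memory) x(io)

/* Expansion 1: enum of subsystem IDs. */
#define SUBSYS(_x) _x##_cgrp_id,
enum cgroup_subsys_id { SUBSYS_LIST(SUBSYS) CGROUP_SUBSYS_COUNT };
#undef SUBSYS

/* Expansion 2: one "enabled" flag per subsystem (static keys in the kernel). */
#define SUBSYS(_x) static bool _x##_cgrp_subsys_enabled_key = true;
SUBSYS_LIST(SUBSYS)
#undef SUBSYS

/* Expansion 3: ID-indexed array pointing at those flags. */
#define SUBSYS(_x) [_x##_cgrp_id] = &_x##_cgrp_subsys_enabled_key,
static bool *cgroup_subsys_enabled_key[] = { SUBSYS_LIST(SUBSYS) };
#undef SUBSYS

/* The slower, index-based test used by cgroup core. */
static bool cgroup_ssid_enabled(int ssid)
{
        return *cgroup_subsys_enabled_key[ssid];
}

int main(void)
{
        memory_cgrp_subsys_enabled_key = false; /* e.g. disabled on the command line */
        for (int ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++)
                printf("ssid %d enabled: %d\n", ssid, cgroup_ssid_enabled(ssid));
        return 0;
}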
 +
 +/**
 + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 + * @cgrp: the cgroup of interest
 + *
 + * The default hierarchy is the v2 interface of cgroup and this function
 + * can be used to test whether a cgroup is on the default hierarchy for
 + * cases where a subsystem should behave differently depending on the
 + * interface version.
 + *
 + * The set of behaviors which change on the default hierarchy is still
 + * being determined and the mount option is prefixed with __DEVEL__.
 + *
 + * List of changed behaviors:
 + *
 + * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 + *   and "name" are disallowed.
 + *
 + * - When mounting an existing superblock, mount options should match.
 + *
 + * - Remount is disallowed.
 + *
 + * - rename(2) is disallowed.
 + *
 + * - "tasks" is removed.  Everything should be at process granularity.  Use
 + *   "cgroup.procs" instead.
 + *
 + * - "cgroup.procs" is not sorted.  PIDs will be unique unless they get
 + *   recycled in between reads.
 + *
 + * - "release_agent" and "notify_on_release" are removed.  Replacement
 + *   notification mechanism will be implemented.
 + *
 + * - "cgroup.clone_children" is removed.
 + *
 + * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 + *   and its descendants contain no task; otherwise, 1.  The file also
 + *   generates kernfs notification which can be monitored through poll and
 + *   [di]notify when the value of the file changes.
 + *
 + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 + *   take masks of ancestors with non-empty cpus/mems, instead of being
 + *   moved to an ancestor.
 + *
 + * - cpuset: a task can be moved into an empty cpuset, and again it takes
 + *   masks of ancestors.
 + *
 + * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 + *   is not created.
 + *
 + * - blkcg: blk-throttle becomes properly hierarchical.
 + *
 + * - debug: disallowed on the default hierarchy.
 + */
 +static bool cgroup_on_dfl(const struct cgroup *cgrp)
 +{
 +      return cgrp->root == &cgrp_dfl_root;
 +}
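The test itself reduces to a root-pointer comparison. A toy model, with stand-in struct layouts rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct cgroup_root { const char *name; };
struct cgroup { struct cgroup_root *root; };

static struct cgroup_root cgrp_dfl_root = { "default (v2)" };
static struct cgroup_root legacy_root   = { "legacy (v1)" };

static bool cgroup_on_dfl(const struct cgroup *cgrp)
{
        return cgrp->root == &cgrp_dfl_root; /* identity of the root decides */
}

int main(void)
{
        struct cgroup a = { &cgrp_dfl_root };
        struct cgroup b = { &legacy_root };

        printf("a on default hierarchy: %d\n", cgroup_on_dfl(&a)); /* 1 */
        printf("b on default hierarchy: %d\n", cgroup_on_dfl(&b)); /* 0 */
        return 0;
}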
 +
  /* IDR wrappers which synchronize using cgroup_idr_lock */
  static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
                            gfp_t gfp_mask)
  
        idr_preload(gfp_mask);
        spin_lock_bh(&cgroup_idr_lock);
-       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
+       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
        spin_unlock_bh(&cgroup_idr_lock);
        idr_preload_end();
        return ret;
@@@ -423,22 -332,6 +423,22 @@@ static inline bool cgroup_is_dead(cons
        return !(cgrp->self.flags & CSS_ONLINE);
  }
  
 +static void cgroup_get(struct cgroup *cgrp)
 +{
 +      WARN_ON_ONCE(cgroup_is_dead(cgrp));
 +      css_get(&cgrp->self);
 +}
 +
 +static bool cgroup_tryget(struct cgroup *cgrp)
 +{
 +      return css_tryget(&cgrp->self);
 +}
 +
 +static void cgroup_put(struct cgroup *cgrp)
 +{
 +      css_put(&cgrp->self);
 +}
 +
  struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
  {
        struct cgroup *cgrp = of->kn->parent->priv;
@@@ -588,31 -481,19 +588,31 @@@ struct css_set init_css_set = 
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
 +      .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
  };
  
  static int css_set_count      = 1;    /* 1 for init_css_set */
  
 +/**
 + * css_set_populated - does a css_set contain any tasks?
 + * @cset: target css_set
 + */
 +static bool css_set_populated(struct css_set *cset)
 +{
 +      lockdep_assert_held(&css_set_lock);
 +
 +      return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
 +}
 +
  /**
   * cgroup_update_populated - updated populated count of a cgroup
   * @cgrp: the target cgroup
   * @populated: inc or dec populated count
   *
 - * @cgrp is either getting the first task (css_set) or losing the last.
 - * Update @cgrp->populated_cnt accordingly.  The count is propagated
 - * towards root so that a given cgroup's populated_cnt is zero iff the
 - * cgroup and all its descendants are empty.
 + * One of the css_sets associated with @cgrp is either getting its first
 + * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 + * count is propagated towards root so that a given cgroup's populated_cnt
 + * is zero iff the cgroup and all its descendants don't contain any tasks.
   *
   * @cgrp's interface file "cgroup.populated" is zero if
   * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
   */
  static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
  {
 -      lockdep_assert_held(&css_set_rwsem);
 +      lockdep_assert_held(&css_set_lock);
  
        do {
                bool trigger;
                if (!trigger)
                        break;
  
 -              if (cgrp->populated_kn)
 -                      kernfs_notify(cgrp->populated_kn);
 +              check_for_release(cgrp);
 +              cgroup_file_notify(&cgrp->events_file);
 +
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
  }
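As the comment describes, the walk adjusts populated_cnt on each ancestor and stops at the first one whose count does not cross zero, so release checks and notifications fire only on 0 <-> non-zero transitions. A standalone toy model of that propagation (field and helper names are invented; check_for_release() and the kernfs notification are reduced to a printf):

#include <stdbool.h>
#include <stdio.h>

struct cg {
        const char *name;
        struct cg *parent;
        int populated_cnt;
};

static void update_populated(struct cg *cgrp, bool populated)
{
        do {
                bool trigger;

                if (populated)
                        trigger = !cgrp->populated_cnt++; /* 0 -> 1 transition? */
                else
                        trigger = !--cgrp->populated_cnt; /* 1 -> 0 transition? */

                if (!trigger)
                        break;

                printf("%s: populated flipped to %d\n", cgrp->name, populated);
                cgrp = cgrp->parent;
        } while (cgrp);
}

int main(void)
{
        struct cg root = { "root", NULL, 0 };
        struct cg a = { "a", &root, 0 };
        struct cg b = { "b", &a, 0 };

        update_populated(&b, true);  /* flips b, a and root */
        update_populated(&a, true);  /* only bumps a's count; root unchanged */
        update_populated(&b, false); /* flips b back; a stays populated */
        return 0;
}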
  
 +/**
 + * css_set_update_populated - update populated state of a css_set
 + * @cset: target css_set
 + * @populated: whether @cset is populated or depopulated
 + *
 + * @cset is either getting the first task or losing the last.  Update the
 + * ->populated_cnt of all associated cgroups accordingly.
 + */
 +static void css_set_update_populated(struct css_set *cset, bool populated)
 +{
 +      struct cgrp_cset_link *link;
 +
 +      lockdep_assert_held(&css_set_lock);
 +
 +      list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 +              cgroup_update_populated(link->cgrp, populated);
 +}
 +
 +/**
 + * css_set_move_task - move a task from one css_set to another
 + * @task: task being moved
 + * @from_cset: css_set @task currently belongs to (may be NULL)
 + * @to_cset: new css_set @task is being moved to (may be NULL)
 + * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 + *
 + * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 + * css_set, @from_cset can be NULL.  If @task is being disassociated
 + * instead of moved, @to_cset can be NULL.
 + *
 + * This function automatically handles populated_cnt updates and
 + * css_task_iter adjustments but the caller is responsible for managing
 + * @from_cset and @to_cset's reference counts.
 + */
 +static void css_set_move_task(struct task_struct *task,
 +                            struct css_set *from_cset, struct css_set *to_cset,
 +                            bool use_mg_tasks)
 +{
 +      lockdep_assert_held(&css_set_lock);
 +
 +      if (from_cset) {
 +              struct css_task_iter *it, *pos;
 +
 +              WARN_ON_ONCE(list_empty(&task->cg_list));
 +
 +              /*
 +               * @task is leaving, advance task iterators which are
 +               * pointing to it so that they can resume at the next
 +               * position.  Advancing an iterator might remove it from
 +               * the list, use safe walk.  See css_task_iter_advance*()
 +               * the list, so use a safe walk.  See css_task_iter_advance*()
 +               */
 +              list_for_each_entry_safe(it, pos, &from_cset->task_iters,
 +                                       iters_node)
 +                      if (it->task_pos == &task->cg_list)
 +                              css_task_iter_advance(it);
 +
 +              list_del_init(&task->cg_list);
 +              if (!css_set_populated(from_cset))
 +                      css_set_update_populated(from_cset, false);
 +      } else {
 +              WARN_ON_ONCE(!list_empty(&task->cg_list));
 +      }
 +
 +      if (to_cset) {
 +              /*
 +               * We are synchronized through cgroup_threadgroup_rwsem
 +               * against PF_EXITING setting such that we can't race
 +               * against cgroup_exit() changing the css_set to
 +               * init_css_set and dropping the old one.
 +               */
 +              WARN_ON_ONCE(task->flags & PF_EXITING);
 +
 +              if (!css_set_populated(to_cset))
 +                      css_set_update_populated(to_cset, true);
 +              rcu_assign_pointer(task->cgroups, to_cset);
 +              list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 +                                                           &to_cset->tasks);
 +      }
 +}
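One subtlety above: any css_task_iter currently parked on the departing task is advanced before the task is unlinked, so in-flight iterations resume at the next entry instead of dangling. A self-contained sketch of that idea on an ordinary singly linked list (no relation to the kernel's list_head API beyond the shape):

#include <stdio.h>

struct node { int id; struct node *next; };
struct iter { struct node *pos; };

/* Before unlinking @victim, nudge every iterator sitting on it forward,
 * the same way css_set_move_task() advances css_task_iters before
 * list_del_init(&task->cg_list). */
static void remove_node(struct node **head, struct node *victim,
                        struct iter *iters, int nr_iters)
{
        struct node **pp;

        for (int i = 0; i < nr_iters; i++)
                if (iters[i].pos == victim)
                        iters[i].pos = victim->next;

        for (pp = head; *pp; pp = &(*pp)->next) {
                if (*pp == victim) {
                        *pp = victim->next;
                        break;
                }
        }
}

int main(void)
{
        struct node c = { 3, NULL };
        struct node b = { 2, &c };
        struct node a = { 1, &b };
        struct node *head = &a;
        struct iter it = { &b };        /* a walk currently sitting on node 2 */

        remove_node(&head, &b, &it, 1); /* node 2 leaves mid-walk */

        for (struct node *n = it.pos; n; n = n->next)
                printf("iterator resumes at %d\n", n->id); /* prints 3 */
        return 0;
}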
 +
  /*
   * hash table for cgroup groups. This improves the performance to find
   * an existing css_set. This hash doesn't (currently) take into
@@@ -749,7 -549,7 +749,7 @@@ static void put_css_set_locked(struct c
        struct cgroup_subsys *ss;
        int ssid;
  
 -      lockdep_assert_held(&css_set_rwsem);
 +      lockdep_assert_held(&css_set_lock);
  
        if (!atomic_dec_and_test(&cset->refcount))
                return;
        css_set_count--;
  
        list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 -              struct cgroup *cgrp = link->cgrp;
 -
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
 -
 -              /* @cgrp can't go away while we're holding css_set_rwsem */
 -              if (list_empty(&cgrp->cset_links)) {
 -                      cgroup_update_populated(cgrp, false);
 -                      check_for_release(cgrp);
 -              }
 -
 +              if (cgroup_parent(link->cgrp))
 +                      cgroup_put(link->cgrp);
                kfree(link);
        }
  
@@@ -781,9 -588,9 +781,9 @@@ static void put_css_set(struct css_set 
        if (atomic_add_unless(&cset->refcount, -1, 1))
                return;
  
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        put_css_set_locked(cset);
 -      up_write(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  }
  
  /*
@@@ -972,15 -779,15 +972,15 @@@ static void link_css_set(struct list_he
        link->cset = cset;
        link->cgrp = cgrp;
  
 -      if (list_empty(&cgrp->cset_links))
 -              cgroup_update_populated(cgrp, true);
 -      list_move(&link->cset_link, &cgrp->cset_links);
 -
        /*
 -       * Always add links to the tail of the list so that the list
 -       * is sorted by order of hierarchy creation
 +       * Always add links to the tail of the lists so that the lists are
 +       * in chronological order.
         */
 +      list_move_tail(&link->cset_link, &cgrp->cset_links);
        list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 +
 +      if (cgroup_parent(cgrp))
 +              cgroup_get(cgrp);
  }
  
  /**
@@@ -1006,11 -813,11 +1006,11 @@@ static struct css_set *find_css_set(str
  
        /* First see if we already have a cgroup group that matches
         * the desired set */
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        cset = find_existing_css_set(old_cset, cgrp, template);
        if (cset)
                get_css_set(cset);
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        if (cset)
                return cset;
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->mg_preload_node);
        INIT_LIST_HEAD(&cset->mg_node);
 +      INIT_LIST_HEAD(&cset->task_iters);
        INIT_HLIST_NODE(&cset->hlist);
  
        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
        memcpy(cset->subsys, template, sizeof(cset->subsys));
  
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        /* Add reference counts and links from the new css_set. */
        list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;
                list_add_tail(&cset->e_cset_node[ssid],
                              &cset->subsys[ssid]->cgroup->e_csets[ssid]);
  
 -      up_write(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        return cset;
  }
  
 -void cgroup_threadgroup_change_begin(struct task_struct *tsk)
 -{
 -      down_read(&tsk->signal->group_rwsem);
 -}
 -
 -void cgroup_threadgroup_change_end(struct task_struct *tsk)
 -{
 -      up_read(&tsk->signal->group_rwsem);
 -}
 -
 -/**
 - * threadgroup_lock - lock threadgroup
 - * @tsk: member task of the threadgroup to lock
 - *
 - * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
 - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
 - * change ->group_leader/pid.  This is useful for cases where the threadgroup
 - * needs to stay stable across blockable operations.
 - *
 - * fork and exit explicitly call threadgroup_change_{begin|end}() for
 - * synchronization.  While held, no new task will be added to threadgroup
 - * and no existing live task will have its PF_EXITING set.
 - *
 - * de_thread() does threadgroup_change_{begin|end}() when a non-leader
 - * sub-thread becomes a new leader.
 - */
 -static void threadgroup_lock(struct task_struct *tsk)
 -{
 -      down_write(&tsk->signal->group_rwsem);
 -}
 -
 -/**
 - * threadgroup_unlock - unlock threadgroup
 - * @tsk: member task of the threadgroup to unlock
 - *
 - * Reverse threadgroup_lock().
 - */
 -static inline void threadgroup_unlock(struct task_struct *tsk)
 -{
 -      up_write(&tsk->signal->group_rwsem);
 -}
 -
  static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
  {
        struct cgroup *root_cgrp = kf_root->kn->priv;
@@@ -1124,15 -972,14 +1124,15 @@@ static void cgroup_destroy_root(struct 
         * Release all the links from cset_links to this hierarchy's
         * root cgroup
         */
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
  
        list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                kfree(link);
        }
 -      up_write(&css_set_rwsem);
 +
 +      spin_unlock_bh(&css_set_lock);
  
        if (!list_empty(&root->root_list)) {
                list_del(&root->root_list);
@@@ -1154,7 -1001,7 +1154,7 @@@ static struct cgroup *cset_cgroup_from_
        struct cgroup *res = NULL;
  
        lockdep_assert_held(&cgroup_mutex);
 -      lockdep_assert_held(&css_set_rwsem);
 +      lockdep_assert_held(&css_set_lock);
  
        if (cset == &init_css_set) {
                res = &root->cgrp;
  
  /*
   * Return the cgroup for "task" from the given hierarchy. Must be
 - * called with cgroup_mutex and css_set_rwsem held.
 + * called with cgroup_mutex and css_set_lock held.
   */
  static struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                            struct cgroup_root *root)
   * update of a tasks cgroup pointer by cgroup_attach_task()
   */
  
 -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
  static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
  static const struct file_operations proc_cgroupstats_operations;
  
@@@ -1238,25 -1086,43 +1238,25 @@@ static char *cgroup_file_name(struct cg
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
   *
 - * returns cft->mode if ->mode is not 0
 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
 - * returns S_IRUGO if it has only a read handler
 - * returns S_IWUSR if it has only a write hander
 + * S_IRUGO for read, S_IWUSR for write.
   */
  static umode_t cgroup_file_mode(const struct cftype *cft)
  {
        umode_t mode = 0;
  
 -      if (cft->mode)
 -              return cft->mode;
 -
        if (cft->read_u64 || cft->read_s64 || cft->seq_show)
                mode |= S_IRUGO;
  
 -      if (cft->write_u64 || cft->write_s64 || cft->write)
 -              mode |= S_IWUSR;
 +      if (cft->write_u64 || cft->write_s64 || cft->write) {
 +              if (cft->flags & CFTYPE_WORLD_WRITABLE)
 +                      mode |= S_IWUGO;
 +              else
 +                      mode |= S_IWUSR;
 +      }
  
        return mode;
  }
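With cft->mode gone, the mode is derived solely from which handlers a cftype provides plus the new CFTYPE_WORLD_WRITABLE flag. A standalone sketch of that derivation; the pared-down cftype, the flag value and which files carry the flag are all illustrative here:

#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

/* Kernel-style aggregate mode bits (userspace <sys/stat.h> lacks them). */
#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)
#define S_IWUGO (S_IWUSR | S_IWGRP | S_IWOTH)

#define CFTYPE_WORLD_WRITABLE (1U << 0)         /* illustrative flag value */

struct cftype {
        const char *name;
        unsigned int flags;
        bool has_read;          /* stands in for read_u64/read_s64/seq_show */
        bool has_write;         /* stands in for write_u64/write_s64/write */
};

static unsigned int cgroup_file_mode(const struct cftype *cft)
{
        unsigned int mode = 0;

        if (cft->has_read)
                mode |= S_IRUGO;

        if (cft->has_write) {
                if (cft->flags & CFTYPE_WORLD_WRITABLE)
                        mode |= S_IWUGO;        /* delegatable: world-writable */
                else
                        mode |= S_IWUSR;
        }
        return mode;
}

int main(void)
{
        struct cftype procs  = { "cgroup.procs",  CFTYPE_WORLD_WRITABLE, true, true };
        struct cftype events = { "cgroup.events", 0, true, false };

        printf("%s -> %04o\n", procs.name,  cgroup_file_mode(&procs));  /* 0666 */
        printf("%s -> %04o\n", events.name, cgroup_file_mode(&events)); /* 0444 */
        return 0;
}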
  
 -static void cgroup_get(struct cgroup *cgrp)
 -{
 -      WARN_ON_ONCE(cgroup_is_dead(cgrp));
 -      css_get(&cgrp->self);
 -}
 -
 -static bool cgroup_tryget(struct cgroup *cgrp)
 -{
 -      return css_tryget(&cgrp->self);
 -}
 -
 -static void cgroup_put(struct cgroup *cgrp)
 -{
 -      css_put(&cgrp->self);
 -}
 -
  /**
   * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
   * @cgrp: the target cgroup
@@@ -1397,64 -1263,28 +1397,64 @@@ static void cgroup_rm_file(struct cgrou
  }
  
  /**
 - * cgroup_clear_dir - remove subsys files in a cgroup directory
 - * @cgrp: target cgroup
 - * @subsys_mask: mask of the subsystem ids whose files should be removed
 + * css_clear_dir - remove subsys files in a cgroup directory
 + * @css: target css
 + * @cgrp_override: specify if target cgroup is different from css->cgroup
   */
 -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 +static void css_clear_dir(struct cgroup_subsys_state *css,
 +                        struct cgroup *cgrp_override)
  {
 -      struct cgroup_subsys *ss;
 -      int i;
 +      struct cgroup *cgrp = cgrp_override ?: css->cgroup;
 +      struct cftype *cfts;
  
 -      for_each_subsys(ss, i) {
 -              struct cftype *cfts;
 +      list_for_each_entry(cfts, &css->ss->cfts, node)
 +              cgroup_addrm_files(css, cgrp, cfts, false);
 +}
  
 -              if (!(subsys_mask & (1 << i)))
 -                      continue;
 -              list_for_each_entry(cfts, &ss->cfts, node)
 -                      cgroup_addrm_files(cgrp, cfts, false);
 +/**
 + * css_populate_dir - create subsys files in a cgroup directory
 + * @css: target css
 + * @cgrp_override: specify if target cgroup is different from css->cgroup
 + *
 + * On failure, no file is added.
 + */
 +static int css_populate_dir(struct cgroup_subsys_state *css,
 +                          struct cgroup *cgrp_override)
 +{
 +      struct cgroup *cgrp = cgrp_override ?: css->cgroup;
 +      struct cftype *cfts, *failed_cfts;
 +      int ret;
 +
 +      if (!css->ss) {
 +              if (cgroup_on_dfl(cgrp))
 +                      cfts = cgroup_dfl_base_files;
 +              else
 +                      cfts = cgroup_legacy_base_files;
 +
 +              return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
 +      }
 +
 +      list_for_each_entry(cfts, &css->ss->cfts, node) {
 +              ret = cgroup_addrm_files(css, cgrp, cfts, true);
 +              if (ret < 0) {
 +                      failed_cfts = cfts;
 +                      goto err;
 +              }
        }
 +      return 0;
 +err:
 +      list_for_each_entry(cfts, &css->ss->cfts, node) {
 +              if (cfts == failed_cfts)
 +                      break;
 +              cgroup_addrm_files(css, cgrp, cfts, false);
 +      }
 +      return ret;
  }
  
  static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned long ss_mask)
  {
 +      struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
        unsigned long tmp_ss_mask;
        int ssid, i, ret;
        if (dst_root == &cgrp_dfl_root)
                tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
  
 -      ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
 -      if (ret) {
 -              if (dst_root != &cgrp_dfl_root)
 -                      return ret;
 +      for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
 +              struct cgroup *scgrp = &ss->root->cgrp;
 +              int tssid;
 +
 +              ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
 +              if (!ret)
 +                      continue;
  
                /*
                 * Rebinding back to the default root is not allowed to
                 * fail.  Using both default and non-default roots should
                 * be rare.  Moving subsystems back and forth even more so.
                 * Just warn about it and continue.
                 */
 -              if (cgrp_dfl_root_visible) {
 -                      pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
 -                              ret, ss_mask);
 -                      pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
 +              if (dst_root == &cgrp_dfl_root) {
 +                      if (cgrp_dfl_root_visible) {
 +                              pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
 +                                      ret, ss_mask);
 +                              pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
 +                      }
 +                      continue;
 +              }
 +
 +              for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
 +                      if (tssid == ssid)
 +                              break;
 +                      css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
                }
 +              return ret;
        }
  
        /*
         * Nothing can fail from this point on.  Remove files for the
         * removed subsystems and rebind each subsystem.
         */
 -      for_each_subsys_which(ss, ssid, &ss_mask)
 -              cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
 -
        for_each_subsys_which(ss, ssid, &ss_mask) {
 -              struct cgroup_root *src_root;
 -              struct cgroup_subsys_state *css;
 +              struct cgroup_root *src_root = ss->root;
 +              struct cgroup *scgrp = &src_root->cgrp;
 +              struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
                struct css_set *cset;
  
 -              src_root = ss->root;
 -              css = cgroup_css(&src_root->cgrp, ss);
 +              WARN_ON(!css || cgroup_css(dcgrp, ss));
  
 -              WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
 +              css_clear_dir(css, NULL);
  
 -              RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
 -              rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
 +              RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
 +              rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
 -              css->cgroup = &dst_root->cgrp;
 +              css->cgroup = dcgrp;
  
 -              down_write(&css_set_rwsem);
 +              spin_lock_bh(&css_set_lock);
                hash_for_each(css_set_table, i, cset, hlist)
                        list_move_tail(&cset->e_cset_node[ss->id],
 -                                     &dst_root->cgrp.e_csets[ss->id]);
 -              up_write(&css_set_rwsem);
 +                                     &dcgrp->e_csets[ss->id]);
 +              spin_unlock_bh(&css_set_lock);
  
                src_root->subsys_mask &= ~(1 << ssid);
 -              src_root->cgrp.subtree_control &= ~(1 << ssid);
 -              cgroup_refresh_child_subsys_mask(&src_root->cgrp);
 +              scgrp->subtree_control &= ~(1 << ssid);
 +              cgroup_refresh_child_subsys_mask(scgrp);
  
                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
 -              if (dst_root != &cgrp_dfl_root) {
 -                      dst_root->cgrp.subtree_control |= 1 << ssid;
 -                      cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
 +              if (dst_root == &cgrp_dfl_root) {
 +                      static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
 +              } else {
 +                      dcgrp->subtree_control |= 1 << ssid;
 +                      cgroup_refresh_child_subsys_mask(dcgrp);
 +                      static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }
  
                if (ss->bind)
                        ss->bind(css);
        }
  
 -      kernfs_activate(dst_root->cgrp.kn);
 +      kernfs_activate(dcgrp->kn);
        return 0;
  }
  
@@@ -1680,7 -1497,7 +1680,7 @@@ static int parse_cgroupfs_options(char 
                for_each_subsys(ss, i) {
                        if (strcmp(token, ss->legacy_name))
                                continue;
 -                      if (ss->disabled)
 +                      if (!cgroup_ssid_enabled(i))
                                continue;
  
                        /* Mutually exclusive option 'all' + subsystem name */
         */
        if (all_ss || (!one_ss && !opts->none && !opts->name))
                for_each_subsys(ss, i)
 -                      if (!ss->disabled)
 +                      if (cgroup_ssid_enabled(i))
                                opts->subsys_mask |= (1 << i);
  
        /*
@@@ -1807,7 -1624,7 +1807,7 @@@ static void cgroup_enable_task_cg_lists
  {
        struct task_struct *p, *g;
  
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
  
        if (use_task_css_set_links)
                goto out_unlock;
                if (!(p->flags & PF_EXITING)) {
                        struct css_set *cset = task_css_set(p);
  
 -                      list_add(&p->cg_list, &cset->tasks);
 +                      if (!css_set_populated(cset))
 +                              css_set_update_populated(cset, true);
 +                      list_add_tail(&p->cg_list, &cset->tasks);
                        get_css_set(cset);
                }
                spin_unlock_irq(&p->sighand->siglock);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
  out_unlock:
 -      up_write(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  }
  
  static void init_cgroup_housekeeping(struct cgroup *cgrp)
  
        INIT_LIST_HEAD(&cgrp->self.sibling);
        INIT_LIST_HEAD(&cgrp->self.children);
 +      INIT_LIST_HEAD(&cgrp->self.files);
        INIT_LIST_HEAD(&cgrp->cset_links);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
@@@ -1894,6 -1708,7 +1894,6 @@@ static int cgroup_setup_root(struct cgr
  {
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
 -      struct cftype *base_files;
        struct css_set *cset;
        int i, ret;
  
                goto out;
  
        /*
 -       * We're accessing css_set_count without locking css_set_rwsem here,
 +       * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us. The worst that can happen is that we
         * have some link structures left over
        }
        root_cgrp->kn = root->kf_root->kn;
  
 -      if (root == &cgrp_dfl_root)
 -              base_files = cgroup_dfl_base_files;
 -      else
 -              base_files = cgroup_legacy_base_files;
 -
 -      ret = cgroup_addrm_files(root_cgrp, base_files, true);
 +      ret = css_populate_dir(&root_cgrp->self, NULL);
        if (ret)
                goto destroy_root;
  
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
 -      down_write(&css_set_rwsem);
 -      hash_for_each(css_set_table, i, cset, hlist)
 +      spin_lock_bh(&css_set_lock);
 +      hash_for_each(css_set_table, i, cset, hlist) {
                link_css_set(&tmp_links, cset, root_cgrp);
 -      up_write(&css_set_rwsem);
 +              if (css_set_populated(cset))
 +                      cgroup_update_populated(root_cgrp, true);
 +      }
 +      spin_unlock_bh(&css_set_lock);
  
        BUG_ON(!list_empty(&root_cgrp->self.children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@@ -2191,7 -2008,7 +2191,7 @@@ char *task_cgroup_path(struct task_stru
        char *path = NULL;
  
        mutex_lock(&cgroup_mutex);
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
  
        root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
  
                        path = buf;
        }
  
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
        return path;
  }
@@@ -2232,49 -2049,6 +2232,49 @@@ struct cgroup_taskset 
        struct task_struct      *cur_task;
  };
  
 +#define CGROUP_TASKSET_INIT(tset)     (struct cgroup_taskset){        \
 +      .src_csets              = LIST_HEAD_INIT(tset.src_csets),       \
 +      .dst_csets              = LIST_HEAD_INIT(tset.dst_csets),       \
 +      .csets                  = &tset.src_csets,                      \
 +}
 +
 +/**
 + * cgroup_taskset_add - try to add a migration target task to a taskset
 + * @task: target task
 + * @tset: target taskset
 + *
 + * Add @task, which is a migration target, to @tset.  This function becomes
 + * a noop if @task doesn't need to be migrated.  @task's css_set should have
 + * been added as a migration source and @task->cg_list will be moved from
 + * the css_set's tasks list to its mg_tasks list.
 + */
 +static void cgroup_taskset_add(struct task_struct *task,
 +                             struct cgroup_taskset *tset)
 +{
 +      struct css_set *cset;
 +
 +      lockdep_assert_held(&css_set_lock);
 +
 +      /* @task either already exited or can't exit until the end */
 +      if (task->flags & PF_EXITING)
 +              return;
 +
 +      /* leave @task alone if post_fork() hasn't linked it yet */
 +      if (list_empty(&task->cg_list))
 +              return;
 +
 +      cset = task_css_set(task);
 +      if (!cset->mg_src_cgrp)
 +              return;
 +
 +      list_move_tail(&task->cg_list, &cset->mg_tasks);
 +      if (list_empty(&cset->mg_node))
 +              list_add_tail(&cset->mg_node, &tset->src_csets);
 +      if (list_empty(&cset->mg_dst_cset->mg_node))
 +              list_move_tail(&cset->mg_dst_cset->mg_node,
 +                             &tset->dst_csets);
 +}
 +
  /**
   * cgroup_taskset_first - reset taskset and return the first task
   * @tset: taskset of interest
@@@ -2322,86 -2096,47 +2322,86 @@@ struct task_struct *cgroup_taskset_next
  }
  
  /**
 - * cgroup_task_migrate - move a task from one cgroup to another.
 - * @old_cgrp: the cgroup @tsk is being migrated from
 - * @tsk: the task being migrated
 - * @new_cset: the new css_set @tsk is being attached to
 + * cgroup_taskset_migrate - migrate a taskset to a cgroup
 + * @tset: target taskset
 + * @dst_cgrp: destination cgroup
   *
 - * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
 + * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
 + * ->can_attach callbacks fails and guarantees that either all or none of
 + * the tasks in @tset are migrated.  @tset is consumed regardless of
 + * success.
   */
 -static void cgroup_task_migrate(struct cgroup *old_cgrp,
 -                              struct task_struct *tsk,
 -                              struct css_set *new_cset)
 +static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 +                                struct cgroup *dst_cgrp)
  {
 -      struct css_set *old_cset;
 -
 -      lockdep_assert_held(&cgroup_mutex);
 -      lockdep_assert_held(&css_set_rwsem);
 +      struct cgroup_subsys_state *css, *failed_css = NULL;
 +      struct task_struct *task, *tmp_task;
 +      struct css_set *cset, *tmp_cset;
 +      int i, ret;
  
 -      /*
 -       * We are synchronized through threadgroup_lock() against PF_EXITING
 -       * setting such that we can't race against cgroup_exit() changing the
 -       * css_set to init_css_set and dropping the old one.
 -       */
 -      WARN_ON_ONCE(tsk->flags & PF_EXITING);
 -      old_cset = task_css_set(tsk);
 +      /* methods shouldn't be called if no task is actually migrating */
 +      if (list_empty(&tset->src_csets))
 +              return 0;
  
 -      get_css_set(new_cset);
 -      rcu_assign_pointer(tsk->cgroups, new_cset);
 +      /* check that we can legitimately attach to the cgroup */
 +      for_each_e_css(css, i, dst_cgrp) {
 +              if (css->ss->can_attach) {
 +                      ret = css->ss->can_attach(css, tset);
 +                      if (ret) {
 +                              failed_css = css;
 +                              goto out_cancel_attach;
 +                      }
 +              }
 +      }
  
        /*
 -       * Use move_tail so that cgroup_taskset_first() still returns the
 -       * leader after migration.  This works because cgroup_migrate()
 -       * ensures that the dst_cset of the leader is the first on the
 -       * tset's dst_csets list.
 +       * Now that we're guaranteed success, proceed to move all tasks to
 +       * the new cgroup.  There are no failure cases after here, so this
 +       * is the commit point.
         */
 -      list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
 +      spin_lock_bh(&css_set_lock);
 +      list_for_each_entry(cset, &tset->src_csets, mg_node) {
 +              list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
 +                      struct css_set *from_cset = task_css_set(task);
 +                      struct css_set *to_cset = cset->mg_dst_cset;
 +
 +                      get_css_set(to_cset);
 +                      css_set_move_task(task, from_cset, to_cset, true);
 +                      put_css_set_locked(from_cset);
 +              }
 +      }
 +      spin_unlock_bh(&css_set_lock);
  
        /*
 -       * We just gained a reference on old_cset by taking it from the
 -       * task. As trading it for new_cset is protected by cgroup_mutex,
 -       * we're safe to drop it here; it will be freed under RCU.
 +       * Migration is committed; all target tasks are now on dst_csets.
 +       * Nothing is sensitive to fork() after this point.  Notify
 +       * controllers that migration is complete.
         */
 -      put_css_set_locked(old_cset);
 +      tset->csets = &tset->dst_csets;
 +
 +      for_each_e_css(css, i, dst_cgrp)
 +              if (css->ss->attach)
 +                      css->ss->attach(css, tset);
 +
 +      ret = 0;
 +      goto out_release_tset;
 +
 +out_cancel_attach:
 +      for_each_e_css(css, i, dst_cgrp) {
 +              if (css == failed_css)
 +                      break;
 +              if (css->ss->cancel_attach)
 +                      css->ss->cancel_attach(css, tset);
 +      }
 +out_release_tset:
 +      spin_lock_bh(&css_set_lock);
 +      list_splice_init(&tset->dst_csets, &tset->src_csets);
 +      list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
 +              list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 +              list_del_init(&cset->mg_node);
 +      }
 +      spin_unlock_bh(&css_set_lock);
 +      return ret;
  }
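cgroup_taskset_migrate() is a two-phase operation: every controller's ->can_attach() is polled first, the task moves are committed only once all of them agree, and a veto triggers ->cancel_attach() on the controllers already polled. A generic sketch of that all-or-nothing shape with a hypothetical controller table (not the kernel's css iteration):

#include <stdio.h>

struct controller {
        const char *name;
        int (*can_attach)(void);
        void (*attach)(void);
        void (*cancel_attach)(void);
};

static int migrate(struct controller *ctrls, int nr)
{
        int i, failed = -1, ret = 0;

        /* Phase 1: everyone gets a chance to veto. */
        for (i = 0; i < nr; i++) {
                if (ctrls[i].can_attach && (ret = ctrls[i].can_attach())) {
                        failed = i;
                        goto cancel;
                }
        }

        /* Phase 2: commit point, no failures allowed past here. */
        for (i = 0; i < nr; i++)
                if (ctrls[i].attach)
                        ctrls[i].attach();
        return 0;

cancel:
        /* Only controllers polled before the failing one are notified. */
        for (i = 0; i < failed; i++)
                if (ctrls[i].cancel_attach)
                        ctrls[i].cancel_attach();
        return ret;
}

static int ok(void)    { return 0; }
static int veto(void)  { return -1; }
static void did(void)  { printf("attached\n"); }
static void undo(void) { printf("cancelled\n"); }

int main(void)
{
        struct controller good[] = { { "cpu", ok, did, undo }, { "memory", ok,   did, undo } };
        struct controller bad[]  = { { "cpu", ok, did, undo }, { "memory", veto, did, undo } };

        printf("all agree:  %d\n", migrate(good, 2));   /* attaches both */
        printf("one vetoes: %d\n", migrate(bad, 2));    /* cancels "cpu" only */
        return 0;
}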
  
  /**
@@@ -2417,14 -2152,14 +2417,14 @@@ static void cgroup_migrate_finish(struc
  
        lockdep_assert_held(&cgroup_mutex);
  
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
                cset->mg_src_cgrp = NULL;
                cset->mg_dst_cset = NULL;
                list_del_init(&cset->mg_preload_node);
                put_css_set_locked(cset);
        }
 -      up_write(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  }
  
  /**
   * @src_cset and add it to @preloaded_csets, which should later be cleaned
   * up by cgroup_migrate_finish().
   *
 - * This function may be called without holding threadgroup_lock even if the
 - * target is a process.  Threads may be created and destroyed but as long
 - * as cgroup_mutex is not dropped, no new css_set can be put into play and
 - * the preloaded css_sets are guaranteed to cover all migrations.
 + * This function may be called without holding cgroup_threadgroup_rwsem
 + * even if the target is a process.  Threads may be created and destroyed
 + * but as long as cgroup_mutex is not dropped, no new css_set can be put
 + * into play and the preloaded css_sets are guaranteed to cover all
 + * migrations.
   */
  static void cgroup_migrate_add_src(struct css_set *src_cset,
                                   struct cgroup *dst_cgrp,
        struct cgroup *src_cgrp;
  
        lockdep_assert_held(&cgroup_mutex);
 -      lockdep_assert_held(&css_set_rwsem);
 +      lockdep_assert_held(&css_set_lock);
  
        src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
  
  
  /**
   * cgroup_migrate - migrate a process or task to a cgroup
 - * @cgrp: the destination cgroup
   * @leader: the leader of the process or the task to migrate
   * @threadgroup: whether @leader points to the whole process or a single task
 + * @cgrp: the destination cgroup
   *
   * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
 - * process, the caller must be holding threadgroup_lock of @leader.  The
 + * process, the caller must be holding cgroup_threadgroup_rwsem.  The
   * caller is also responsible for invoking cgroup_migrate_add_src() and
   * cgroup_migrate_prepare_dst() on the targets before invoking this
   * function and following up with cgroup_migrate_finish().
   * decided for all targets by invoking group_migrate_prepare_dst() before
   * actually starting migrating.
   */
 -static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
 -                        bool threadgroup)
 -{
 -      struct cgroup_taskset tset = {
 -              .src_csets      = LIST_HEAD_INIT(tset.src_csets),
 -              .dst_csets      = LIST_HEAD_INIT(tset.dst_csets),
 -              .csets          = &tset.src_csets,
 -      };
 -      struct cgroup_subsys_state *css, *failed_css = NULL;
 -      struct css_set *cset, *tmp_cset;
 -      struct task_struct *task, *tmp_task;
 -      int i, ret;
 +static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 +                        struct cgroup *cgrp)
 +{
 +      struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 +      struct task_struct *task;
  
        /*
         * Prevent freeing of tasks while we take a snapshot. Tasks that are
         * already PF_EXITING could be freed from underneath us unless we
         * take an rcu_read_lock.
         */
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        task = leader;
        do {
 -              /* @task either already exited or can't exit until the end */
 -              if (task->flags & PF_EXITING)
 -                      goto next;
 -
 -              /* leave @task alone if post_fork() hasn't linked it yet */
 -              if (list_empty(&task->cg_list))
 -                      goto next;
 -
 -              cset = task_css_set(task);
 -              if (!cset->mg_src_cgrp)
 -                      goto next;
 -
 -              /*
 -               * cgroup_taskset_first() must always return the leader.
 -               * Take care to avoid disturbing the ordering.
 -               */
 -              list_move_tail(&task->cg_list, &cset->mg_tasks);
 -              if (list_empty(&cset->mg_node))
 -                      list_add_tail(&cset->mg_node, &tset.src_csets);
 -              if (list_empty(&cset->mg_dst_cset->mg_node))
 -                      list_move_tail(&cset->mg_dst_cset->mg_node,
 -                                     &tset.dst_csets);
 -      next:
 +              cgroup_taskset_add(task, &tset);
                if (!threadgroup)
                        break;
        } while_each_thread(leader, task);
        rcu_read_unlock();
 -      up_write(&css_set_rwsem);
 -
 -      /* methods shouldn't be called if no task is actually migrating */
 -      if (list_empty(&tset.src_csets))
 -              return 0;
 -
 -      /* check that we can legitimately attach to the cgroup */
 -      for_each_e_css(css, i, cgrp) {
 -              if (css->ss->can_attach) {
 -                      ret = css->ss->can_attach(css, &tset);
 -                      if (ret) {
 -                              failed_css = css;
 -                              goto out_cancel_attach;
 -                      }
 -              }
 -      }
 -
 -      /*
 -       * Now that we're guaranteed success, proceed to move all tasks to
 -       * the new cgroup.  There are no failure cases after here, so this
 -       * is the commit point.
 -       */
 -      down_write(&css_set_rwsem);
 -      list_for_each_entry(cset, &tset.src_csets, mg_node) {
 -              list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
 -                      cgroup_task_migrate(cset->mg_src_cgrp, task,
 -                                          cset->mg_dst_cset);
 -      }
 -      up_write(&css_set_rwsem);
 -
 -      /*
 -       * Migration is committed, all target tasks are now on dst_csets.
 -       * Nothing is sensitive to fork() after this point.  Notify
 -       * controllers that migration is complete.
 -       */
 -      tset.csets = &tset.dst_csets;
 -
 -      for_each_e_css(css, i, cgrp)
 -              if (css->ss->attach)
 -                      css->ss->attach(css, &tset);
 -
 -      ret = 0;
 -      goto out_release_tset;
 +      spin_unlock_bh(&css_set_lock);
  
 -out_cancel_attach:
 -      for_each_e_css(css, i, cgrp) {
 -              if (css == failed_css)
 -                      break;
 -              if (css->ss->cancel_attach)
 -                      css->ss->cancel_attach(css, &tset);
 -      }
 -out_release_tset:
 -      down_write(&css_set_rwsem);
 -      list_splice_init(&tset.dst_csets, &tset.src_csets);
 -      list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
 -              list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 -              list_del_init(&cset->mg_node);
 -      }
 -      up_write(&css_set_rwsem);
 -      return ret;
 +      return cgroup_taskset_migrate(&tset, cgrp);
  }
  
  /**
   * @leader: the task or the leader of the threadgroup to be attached
   * @threadgroup: attach the whole threadgroup?
   *
 - * Call holding cgroup_mutex and threadgroup_lock of @leader.
 + * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
   */
  static int cgroup_attach_task(struct cgroup *dst_cgrp,
                              struct task_struct *leader, bool threadgroup)
        int ret;
  
        /* look up all src csets */
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        task = leader;
        do {
                        break;
        } while_each_thread(leader, task);
        rcu_read_unlock();
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        /* prepare dst csets and commit */
        ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
        if (!ret)
 -              ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
 +              ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
  
        cgroup_migrate_finish(&preloaded_csets);
        return ret;
@@@ -2639,15 -2459,15 +2639,15 @@@ static int cgroup_procs_write_permissio
                struct cgroup *cgrp;
                struct inode *inode;
  
 -              down_read(&css_set_rwsem);
 +              spin_lock_bh(&css_set_lock);
                cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
 -              up_read(&css_set_rwsem);
 +              spin_unlock_bh(&css_set_lock);
  
                while (!cgroup_is_descendant(dst_cgrp, cgrp))
                        cgrp = cgroup_parent(cgrp);
  
                ret = -ENOMEM;
 -              inode = kernfs_get_inode(sb, cgrp->procs_kn);
 +              inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
                if (inode) {
                        ret = inode_permission(inode, MAY_WRITE);
                        iput(inode);
@@@ -2678,13 -2498,14 +2678,13 @@@ static ssize_t __cgroup_procs_write(str
        if (!cgrp)
                return -ENODEV;
  
 -retry_find_task:
 +      percpu_down_write(&cgroup_threadgroup_rwsem);
        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
 -                      rcu_read_unlock();
                        ret = -ESRCH;
 -                      goto out_unlock_cgroup;
 +                      goto out_unlock_rcu;
                }
        } else {
                tsk = current;
         */
        if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
                ret = -EINVAL;
 -              rcu_read_unlock();
 -              goto out_unlock_cgroup;
 +              goto out_unlock_rcu;
        }
  
        get_task_struct(tsk);
        rcu_read_unlock();
  
 -      threadgroup_lock(tsk);
 -      if (threadgroup) {
 -              if (!thread_group_leader(tsk)) {
 -                      /*
 -                       * a race with de_thread from another thread's exec()
 -                       * may strip us of our leadership, if this happens,
 -                       * there is no choice but to throw this task away and
 -                       * try again; this is
 -                       * "double-double-toil-and-trouble-check locking".
 -                       */
 -                      threadgroup_unlock(tsk);
 -                      put_task_struct(tsk);
 -                      goto retry_find_task;
 -              }
 -      }
 -
        ret = cgroup_procs_write_permission(tsk, cgrp, of);
        if (!ret)
                ret = cgroup_attach_task(cgrp, tsk, threadgroup);
  
 -      threadgroup_unlock(tsk);
 -
        put_task_struct(tsk);
 -out_unlock_cgroup:
 +      goto out_unlock_threadgroup;
 +
 +out_unlock_rcu:
 +      rcu_read_unlock();
 +out_unlock_threadgroup:
 +      percpu_up_write(&cgroup_threadgroup_rwsem);
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
  }
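The rewritten write path takes cgroup_threadgroup_rwsem for writing up front, which makes the old "retry if de_thread() stole the leadership" dance unnecessary, and unwinds through goto labels instead of duplicating unlock calls. A user-space sketch of that shape, with a pthread rwlock standing in for the percpu rwsem and invented helpers (build with -lpthread):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t threadgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER;

static int lookup_task(int pid)
{
        return pid == 42 ? 0 : -ESRCH;  /* pretend only pid 42 exists */
}

/* Hold the writer side for the whole attach; unwind through one label. */
static int procs_write(int pid)
{
        int ret;

        pthread_rwlock_wrlock(&threadgroup_rwsem);

        ret = lookup_task(pid);
        if (ret)
                goto out_unlock;

        printf("attaching pid %d\n", pid);      /* the real work goes here */

out_unlock:
        pthread_rwlock_unlock(&threadgroup_rwsem);
        return ret;
}

int main(void)
{
        printf("pid 42: %d\n", procs_write(42));        /* 0 */
        printf("pid 7:  %d\n", procs_write(7));         /* -ESRCH */
        return 0;
}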
@@@ -2738,9 -2573,9 +2738,9 @@@ int cgroup_attach_task_all(struct task_
                if (root == &cgrp_dfl_root)
                        continue;
  
 -              down_read(&css_set_rwsem);
 +              spin_lock_bh(&css_set_lock);
                from_cgrp = task_cgroup_from_root(from, root);
 -              up_read(&css_set_rwsem);
 +              spin_unlock_bh(&css_set_lock);
  
                retval = cgroup_attach_task(from_cgrp, tsk, false);
                if (retval)
@@@ -2855,17 -2690,14 +2855,17 @@@ static int cgroup_subtree_control_show(
  static int cgroup_update_dfl_csses(struct cgroup *cgrp)
  {
        LIST_HEAD(preloaded_csets);
 +      struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
        struct cgroup_subsys_state *css;
        struct css_set *src_cset;
        int ret;
  
        lockdep_assert_held(&cgroup_mutex);
  
 +      percpu_down_write(&cgroup_threadgroup_rwsem);
 +
        /* look up all csses currently attached to @cgrp's subtree */
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
                struct cgrp_cset_link *link;
  
                        cgroup_migrate_add_src(link->cset, cgrp,
                                               &preloaded_csets);
        }
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        /* NULL dst indicates self on default hierarchy */
        ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
        if (ret)
                goto out_finish;
  
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
 -              struct task_struct *last_task = NULL, *task;
 +              struct task_struct *task, *ntask;
  
                /* src_csets precede dst_csets, break on the first dst_cset */
                if (!src_cset->mg_src_cgrp)
                        break;
  
 -              /*
 -               * All tasks in src_cset need to be migrated to the
 -               * matching dst_cset.  Empty it process by process.  We
 -               * walk tasks but migrate processes.  The leader might even
 -               * belong to a different cset but such src_cset would also
 -               * be among the target src_csets because the default
 -               * hierarchy enforces per-process membership.
 -               */
 -              while (true) {
 -                      down_read(&css_set_rwsem);
 -                      task = list_first_entry_or_null(&src_cset->tasks,
 -                                              struct task_struct, cg_list);
 -                      if (task) {
 -                              task = task->group_leader;
 -                              WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
 -                              get_task_struct(task);
 -                      }
 -                      up_read(&css_set_rwsem);
 -
 -                      if (!task)
 -                              break;
 -
 -                      /* guard against possible infinite loop */
 -                      if (WARN(last_task == task,
 -                               "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
 -                              goto out_finish;
 -                      last_task = task;
 -
 -                      threadgroup_lock(task);
 -                      /* raced against de_thread() from another thread? */
 -                      if (!thread_group_leader(task)) {
 -                              threadgroup_unlock(task);
 -                              put_task_struct(task);
 -                              continue;
 -                      }
 -
 -                      ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 -
 -                      threadgroup_unlock(task);
 -                      put_task_struct(task);
 -
 -                      if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
 -                              goto out_finish;
 -              }
 +              /* all tasks in src_csets need to be migrated */
 +              list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
 +                      cgroup_taskset_add(task, &tset);
        }
 +      spin_unlock_bh(&css_set_lock);
  
 +      ret = cgroup_taskset_migrate(&tset, cgrp);
  out_finish:
        cgroup_migrate_finish(&preloaded_csets);
 +      percpu_up_write(&cgroup_threadgroup_rwsem);
        return ret;
  }
  
@@@ -2928,8 -2797,7 +2928,8 @@@ static ssize_t cgroup_subtree_control_w
                if (tok[0] == '\0')
                        continue;
                for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
 -                      if (ss->disabled || strcmp(tok + 1, ss->name))
 +                      if (!cgroup_ssid_enabled(ssid) ||
 +                          strcmp(tok + 1, ss->name))
                                continue;
  
                        if (*tok == '+') {
                                ret = create_css(child, ss,
                                        cgrp->subtree_control & (1 << ssid));
                        else
 -                              ret = cgroup_populate_dir(child, 1 << ssid);
 +                              ret = css_populate_dir(cgroup_css(child, ss),
 +                                                     NULL);
                        if (ret)
                                goto err_undo_css;
                }
                        if (css_disable & (1 << ssid)) {
                                kill_css(css);
                        } else {
 -                              cgroup_clear_dir(child, 1 << ssid);
 +                              css_clear_dir(css, NULL);
                                if (ss->css_reset)
                                        ss->css_reset(css);
                        }
@@@ -3135,16 -3002,15 +3135,16 @@@ err_undo_css
                        if (css_enable & (1 << ssid))
                                kill_css(css);
                        else
 -                              cgroup_clear_dir(child, 1 << ssid);
 +                              css_clear_dir(css, NULL);
                }
        }
        goto out_unlock;
  }
  
 -static int cgroup_populated_show(struct seq_file *seq, void *v)
 +static int cgroup_events_show(struct seq_file *seq, void *v)
  {
 -      seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
 +      seq_printf(seq, "populated %d\n",
 +                 cgroup_is_populated(seq_css(seq)->cgroup));
        return 0;
  }
  
@@@ -3287,8 -3153,7 +3287,8 @@@ static int cgroup_kn_set_ugid(struct ke
        return kernfs_setattr(kn, &iattr);
  }
  
 -static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
 +                         struct cftype *cft)
  {
        char name[CGROUP_FILE_NAME_MAX];
        struct kernfs_node *kn;
                return ret;
        }
  
 -      if (cft->write == cgroup_procs_write)
 -              cgrp->procs_kn = kn;
 -      else if (cft->seq_show == cgroup_populated_show)
 -              cgrp->populated_kn = kn;
 +      if (cft->file_offset) {
 +              struct cgroup_file *cfile = (void *)css + cft->file_offset;
 +
 +              kernfs_get(kn);
 +              cfile->kn = kn;
 +              list_add(&cfile->node, &css->files);
 +      }
 +
        return 0;
  }
  
  /**
   * cgroup_addrm_files - add or remove files to a cgroup directory
 - * @cgrp: the target cgroup
 + * @css: the target css
 + * @cgrp: the target cgroup (usually css->cgroup)
   * @cfts: array of cftypes to be added
   * @is_add: whether to add or remove
   *
   * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 - * For removals, this function never fails.  If addition fails, this
 - * function doesn't remove files already added.  The caller is responsible
 - * for cleaning up.
 + * For removals, this function never fails.
   */
 -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 +static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 +                            struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add)
  {
 -      struct cftype *cft;
 +      struct cftype *cft, *cft_end = NULL;
        int ret;
  
        lockdep_assert_held(&cgroup_mutex);
  
 -      for (cft = cfts; cft->name[0] != '\0'; cft++) {
 +restart:
 +      for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
                if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
                        continue;
                        continue;
  
                if (is_add) {
 -                      ret = cgroup_add_file(cgrp, cft);
 +                      ret = cgroup_add_file(css, cgrp, cft);
                        if (ret) {
                                pr_warn("%s: failed to add %s, err=%d\n",
                                        __func__, cft->name, ret);
 -                              return ret;
 +                              cft_end = cft;
 +                              is_add = false;
 +                              goto restart;
                        }
                } else {
                        cgroup_rm_file(cgrp, cft);
@@@ -3385,7 -3243,7 +3385,7 @@@ static int cgroup_apply_cftypes(struct 
                if (cgroup_is_dead(cgrp))
                        continue;
  
 -              ret = cgroup_addrm_files(cgrp, cfts, is_add);
 +              ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
                if (ret)
                        break;
        }
@@@ -3497,7 -3355,7 +3497,7 @@@ static int cgroup_add_cftypes(struct cg
  {
        int ret;
  
 -      if (ss->disabled)
 +      if (!cgroup_ssid_enabled(ss->id))
                return 0;
  
        if (!cfts || cfts[0].name[0] == '\0')
@@@ -3547,8 -3405,17 +3547,8 @@@ int cgroup_add_legacy_cftypes(struct cg
  {
        struct cftype *cft;
  
 -      /*
 -       * If legacy_flies_on_dfl, we want to show the legacy files on the
 -       * dfl hierarchy but iff the target subsystem hasn't been updated
 -       * for the dfl hierarchy yet.
 -       */
 -      if (!cgroup_legacy_files_on_dfl ||
 -          ss->dfl_cftypes != ss->legacy_cftypes) {
 -              for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
 -                      cft->flags |= __CFTYPE_NOT_ON_DFL;
 -      }
 -
 +      for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
 +              cft->flags |= __CFTYPE_NOT_ON_DFL;
        return cgroup_add_cftypes(ss, cfts);
  }
  
@@@ -3563,10 -3430,10 +3563,10 @@@ static int cgroup_task_count(const stru
        int count = 0;
        struct cgrp_cset_link *link;
  
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                count += atomic_read(&link->cset->refcount);
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        return count;
  }
  
@@@ -3798,25 -3665,22 +3798,25 @@@ bool css_has_online_children(struct cgr
  }
  
  /**
 - * css_advance_task_iter - advance a task iterator to the next css_set
 + * css_task_iter_advance_css_set - advance a task iterator to the next css_set
   * @it: the iterator to advance
   *
   * Advance @it to the next css_set to walk.
   */
 -static void css_advance_task_iter(struct css_task_iter *it)
 +static void css_task_iter_advance_css_set(struct css_task_iter *it)
  {
        struct list_head *l = it->cset_pos;
        struct cgrp_cset_link *link;
        struct css_set *cset;
  
 +      lockdep_assert_held(&css_set_lock);
 +
        /* Advance to the next non-empty css_set */
        do {
                l = l->next;
                if (l == it->cset_head) {
                        it->cset_pos = NULL;
 +                      it->task_pos = NULL;
                        return;
                }
  
                        link = list_entry(l, struct cgrp_cset_link, cset_link);
                        cset = link->cset;
                }
 -      } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
 +      } while (!css_set_populated(cset));
  
        it->cset_pos = l;
  
  
        it->tasks_head = &cset->tasks;
        it->mg_tasks_head = &cset->mg_tasks;
 +
 +      /*
 +       * We don't keep css_sets locked across iteration steps and thus
 +       * need to take steps to ensure that iteration can be resumed after
 +       * the lock is re-acquired.  Iteration is performed at two levels -
 +       * css_sets and tasks in them.
 +       *
 +       * Once created, a css_set never leaves its cgroup lists, so a
 +       * pinned css_set is guaranteed to stay put and we can resume
 +       * iteration afterwards.
 +       *
 +       * Tasks may leave @cset across iteration steps.  This is resolved
 +       * by registering each iterator with the css_set currently being
 +       * walked and making css_set_move_task() advance iterators whose
 +       * next task is leaving.
 +       */
 +      if (it->cur_cset) {
 +              list_del(&it->iters_node);
 +              put_css_set_locked(it->cur_cset);
 +      }
 +      get_css_set(cset);
 +      it->cur_cset = cset;
 +      list_add(&it->iters_node, &cset->task_iters);
 +}
 +
 +static void css_task_iter_advance(struct css_task_iter *it)
 +{
 +      struct list_head *l = it->task_pos;
 +
 +      lockdep_assert_held(&css_set_lock);
 +      WARN_ON_ONCE(!l);
 +
 +      /*
 +       * Advance iterator to find next entry.  cset->tasks is consumed
 +       * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
 +       * next cset.
 +       */
 +      l = l->next;
 +
 +      if (l == it->tasks_head)
 +              l = it->mg_tasks_head->next;
 +
 +      if (l == it->mg_tasks_head)
 +              css_task_iter_advance_css_set(it);
 +      else
 +              it->task_pos = l;
  }
  
  /**
   * css_task_iter_next() to walk through the tasks until the function
   * returns NULL.  On completion of iteration, css_task_iter_end() must be
   * called.
 - *
 - * Note that this function acquires a lock which is released when the
 - * iteration finishes.  The caller can't sleep while iteration is in
 - * progress.
   */
  void css_task_iter_start(struct cgroup_subsys_state *css,
                         struct css_task_iter *it)
 -      __acquires(css_set_rwsem)
  {
        /* no one should try to iterate before mounting cgroups */
        WARN_ON_ONCE(!use_task_css_set_links);
  
 -      down_read(&css_set_rwsem);
 +      memset(it, 0, sizeof(*it));
 +
 +      spin_lock_bh(&css_set_lock);
  
        it->ss = css->ss;
  
  
        it->cset_head = it->cset_pos;
  
 -      css_advance_task_iter(it);
 +      css_task_iter_advance_css_set(it);
 +
 +      spin_unlock_bh(&css_set_lock);
  }
  
  /**
   */
  struct task_struct *css_task_iter_next(struct css_task_iter *it)
  {
 -      struct task_struct *res;
 -      struct list_head *l = it->task_pos;
 +      if (it->cur_task) {
 +              put_task_struct(it->cur_task);
 +              it->cur_task = NULL;
 +      }
  
 -      /* If the iterator cg is NULL, we have no tasks */
 -      if (!it->cset_pos)
 -              return NULL;
 -      res = list_entry(l, struct task_struct, cg_list);
 +      spin_lock_bh(&css_set_lock);
  
 -      /*
 -       * Advance iterator to find next entry.  cset->tasks is consumed
 -       * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
 -       * next cset.
 -       */
 -      l = l->next;
 +      if (it->task_pos) {
 +              it->cur_task = list_entry(it->task_pos, struct task_struct,
 +                                        cg_list);
 +              get_task_struct(it->cur_task);
 +              css_task_iter_advance(it);
 +      }
  
 -      if (l == it->tasks_head)
 -              l = it->mg_tasks_head->next;
 +      spin_unlock_bh(&css_set_lock);
  
 -      if (l == it->mg_tasks_head)
 -              css_advance_task_iter(it);
 -      else
 -              it->task_pos = l;
 -
 -      return res;
 +      return it->cur_task;
  }
  
  /**
   * Finish task iteration started by css_task_iter_start().
   */
  void css_task_iter_end(struct css_task_iter *it)
 -      __releases(css_set_rwsem)
  {
 -      up_read(&css_set_rwsem);
 +      if (it->cur_cset) {
 +              spin_lock_bh(&css_set_lock);
 +              list_del(&it->iters_node);
 +              put_css_set_locked(it->cur_cset);
 +              spin_unlock_bh(&css_set_lock);
 +      }
 +
 +      if (it->cur_task)
 +              put_task_struct(it->cur_task);
  }
  
  /**
@@@ -3990,10 -3809,10 +3990,10 @@@ int cgroup_transfer_tasks(struct cgrou
        mutex_lock(&cgroup_mutex);
  
        /* all tasks in @from are being moved, all csets are source */
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &from->cset_links, cset_link)
                cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
        if (ret)
                css_task_iter_end(&it);
  
                if (task) {
 -                      ret = cgroup_migrate(to, task, false);
 +                      ret = cgroup_migrate(task, false, to);
                        put_task_struct(task);
                }
        } while (task && !ret);
@@@ -4508,13 -4327,13 +4508,13 @@@ static int cgroup_clone_children_write(
  static struct cftype cgroup_dfl_base_files[] = {
        {
                .name = "cgroup.procs",
 +              .file_offset = offsetof(struct cgroup, procs_file),
                .seq_start = cgroup_pidlist_start,
                .seq_next = cgroup_pidlist_next,
                .seq_stop = cgroup_pidlist_stop,
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_PROCS,
                .write = cgroup_procs_write,
 -              .mode = S_IRUGO | S_IWUSR,
        },
        {
                .name = "cgroup.controllers",
                .write = cgroup_subtree_control_write,
        },
        {
 -              .name = "cgroup.populated",
 +              .name = "cgroup.events",
                .flags = CFTYPE_NOT_ON_ROOT,
 -              .seq_show = cgroup_populated_show,
 +              .file_offset = offsetof(struct cgroup, events_file),
 +              .seq_show = cgroup_events_show,
        },
        { }     /* terminate */
  };
@@@ -4550,6 -4368,7 +4550,6 @@@ static struct cftype cgroup_legacy_base
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_PROCS,
                .write = cgroup_procs_write,
 -              .mode = S_IRUGO | S_IWUSR,
        },
        {
                .name = "cgroup.clone_children",
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_TASKS,
                .write = cgroup_tasks_write,
 -              .mode = S_IRUGO | S_IWUSR,
        },
        {
                .name = "notify_on_release",
        { }     /* terminate */
  };
  
 -/**
 - * cgroup_populate_dir - create subsys files in a cgroup directory
 - * @cgrp: target cgroup
 - * @subsys_mask: mask of the subsystem ids whose files should be added
 - *
 - * On failure, no file is added.
 - */
 -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 -{
 -      struct cgroup_subsys *ss;
 -      int i, ret = 0;
 -
 -      /* process cftsets of each subsystem */
 -      for_each_subsys(ss, i) {
 -              struct cftype *cfts;
 -
 -              if (!(subsys_mask & (1 << i)))
 -                      continue;
 -
 -              list_for_each_entry(cfts, &ss->cfts, node) {
 -                      ret = cgroup_addrm_files(cgrp, cfts, true);
 -                      if (ret < 0)
 -                              goto err;
 -              }
 -      }
 -      return 0;
 -err:
 -      cgroup_clear_dir(cgrp, subsys_mask);
 -      return ret;
 -}
 -
  /*
   * css destruction is four-stage process.
   *
@@@ -4613,13 -4464,9 +4613,13 @@@ static void css_free_work_fn(struct wor
                container_of(work, struct cgroup_subsys_state, destroy_work);
        struct cgroup_subsys *ss = css->ss;
        struct cgroup *cgrp = css->cgroup;
 +      struct cgroup_file *cfile;
  
        percpu_ref_exit(&css->refcnt);
  
 +      list_for_each_entry(cfile, &css->files, node)
 +              kernfs_put(cfile->kn);
 +
        if (ss) {
                /* css free path */
                int id = css->id;
@@@ -4724,7 -4571,6 +4724,7 @@@ static void init_and_link_css(struct cg
        css->ss = ss;
        INIT_LIST_HEAD(&css->sibling);
        INIT_LIST_HEAD(&css->children);
 +      INIT_LIST_HEAD(&css->files);
        css->serial_nr = css_serial_nr_next++;
  
        if (cgroup_parent(cgrp)) {
@@@ -4807,7 -4653,7 +4807,7 @@@ static int create_css(struct cgroup *cg
        css->id = err;
  
        if (visible) {
 -              err = cgroup_populate_dir(cgrp, 1 << ss->id);
 +              err = css_populate_dir(css, NULL);
                if (err)
                        goto err_free_id;
        }
  
  err_list_del:
        list_del_rcu(&css->sibling);
 -      cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 +      css_clear_dir(css, NULL);
  err_free_id:
        cgroup_idr_remove(&ss->css_idr, css->id);
  err_free_percpu_ref:
@@@ -4850,6 -4696,7 +4850,6 @@@ static int cgroup_mkdir(struct kernfs_n
        struct cgroup_root *root;
        struct cgroup_subsys *ss;
        struct kernfs_node *kn;
 -      struct cftype *base_files;
        int ssid, ret;
  
        /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
        if (ret)
                goto out_destroy;
  
 -      if (cgroup_on_dfl(cgrp))
 -              base_files = cgroup_dfl_base_files;
 -      else
 -              base_files = cgroup_legacy_base_files;
 -
 -      ret = cgroup_addrm_files(cgrp, base_files, true);
 +      ret = css_populate_dir(&cgrp->self, NULL);
        if (ret)
                goto out_destroy;
  
@@@ -5012,7 -4864,7 +5012,7 @@@ static void kill_css(struct cgroup_subs
         * This must happen before css is disassociated with its cgroup.
         * See seq_css() for details.
         */
 -      cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 +      css_clear_dir(css, NULL);
  
        /*
         * Killing would put the base ref, but we need to keep it alive
@@@ -5061,15 -4913,19 +5061,15 @@@ static int cgroup_destroy_locked(struc
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
        struct cgroup_subsys_state *css;
 -      bool empty;
        int ssid;
  
        lockdep_assert_held(&cgroup_mutex);
  
        /*
 -       * css_set_rwsem synchronizes access to ->cset_links and prevents
 -       * @cgrp from being removed while put_css_set() is in progress.
 +       * Only migration can raise populated from zero and we're already
 +       * holding cgroup_mutex.
         */
 -      down_read(&css_set_rwsem);
 -      empty = list_empty(&cgrp->cset_links);
 -      up_read(&css_set_rwsem);
 -      if (!empty)
 +      if (cgroup_is_populated(cgrp))
                return -EBUSY;
  
        /*
@@@ -5167,7 -5023,6 +5167,7 @@@ static void __init cgroup_init_subsys(s
  
        have_fork_callback |= (bool)ss->fork << ss->id;
        have_exit_callback |= (bool)ss->exit << ss->id;
 +      have_free_callback |= (bool)ss->free << ss->id;
        have_canfork_callback |= (bool)ss->can_fork << ss->id;
  
        /* At system boot, before all subsystems have been
@@@ -5216,8 -5071,6 +5216,8 @@@ int __init cgroup_init_early(void
        return 0;
  }
  
 +static unsigned long cgroup_disable_mask __initdata;
 +
  /**
   * cgroup_init - cgroup initialization
   *
@@@ -5228,9 -5081,8 +5228,9 @@@ int __init cgroup_init(void
  {
        struct cgroup_subsys *ss;
        unsigned long key;
 -      int ssid, err;
 +      int ssid;
  
 +      BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
  
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
 -              if (ss->disabled)
 +              if (cgroup_disable_mask & (1 << ssid)) {
 +                      static_branch_disable(cgroup_subsys_enabled_key[ssid]);
 +                      printk(KERN_INFO "Disabling %s control group subsystem\n",
 +                             ss->name);
                        continue;
 +              }
  
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
  
 -              if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
 -                      ss->dfl_cftypes = ss->legacy_cftypes;
 -
                if (!ss->dfl_cftypes)
                        cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
  
                        ss->bind(init_css_set.subsys[ssid]);
        }
  
 -      err = sysfs_create_mount_point(fs_kobj, "cgroup");
 -      if (err)
 -              return err;
 +      WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
 +      WARN_ON(register_filesystem(&cgroup_fs_type));
 +      WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
  
 -      err = register_filesystem(&cgroup_fs_type);
 -      if (err < 0) {
 -              sysfs_remove_mount_point(fs_kobj, "cgroup");
 -              return err;
 -      }
 -
 -      proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
        return 0;
  }
  
@@@ -5337,7 -5195,7 +5337,7 @@@ int proc_cgroup_show(struct seq_file *m
                goto out;
  
        mutex_lock(&cgroup_mutex);
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
  
        for_each_root(root) {
                struct cgroup_subsys *ss;
                        seq_printf(m, "%sname=%s", count ? "," : "",
                                   root->name);
                seq_putc(m, ':');
 +
                cgrp = task_cgroup_from_root(tsk, root);
 -              path = cgroup_path(cgrp, buf, PATH_MAX);
 -              if (!path) {
 -                      retval = -ENAMETOOLONG;
 -                      goto out_unlock;
 +
 +              /*
 +               * On traditional hierarchies, all zombie tasks show up as
 +               * belonging to the root cgroup.  On the default hierarchy,
 +               * while a zombie doesn't show up in "cgroup.procs" and
 +               * thus can't be migrated, its /proc/PID/cgroup keeps
 +               * reporting the cgroup it belonged to before exiting.  If
 +               * the cgroup is removed before the zombie is reaped,
 +               * " (deleted)" is appended to the cgroup path.
 +               */
 +              if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
 +                      path = cgroup_path(cgrp, buf, PATH_MAX);
 +                      if (!path) {
 +                              retval = -ENAMETOOLONG;
 +                              goto out_unlock;
 +                      }
 +              } else {
 +                      path = "/";
                }
 +
                seq_puts(m, path);
 -              seq_putc(m, '\n');
 +
 +              if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
 +                      seq_puts(m, " (deleted)\n");
 +              else
 +                      seq_putc(m, '\n');
        }
  
        retval = 0;
  out_unlock:
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
        kfree(buf);
  out:
@@@ -5413,8 -5251,7 +5413,8 @@@ static int proc_cgroupstats_show(struc
        for_each_subsys(ss, i)
                seq_printf(m, "%s\t%d\t%d\t%d\n",
                           ss->legacy_name, ss->root->hierarchy_id,
 -                         atomic_read(&ss->root->nr_cgrps), !ss->disabled);
 +                         atomic_read(&ss->root->nr_cgrps),
 +                         cgroup_ssid_enabled(i));
  
        mutex_unlock(&cgroup_mutex);
        return 0;
@@@ -5535,7 -5372,7 +5535,7 @@@ void cgroup_post_fork(struct task_struc
         * @child during its iteration.
         *
         * If we won the race, @child is associated with %current's
 -       * css_set.  Grabbing css_set_rwsem guarantees both that the
 +       * css_set.  Grabbing css_set_lock guarantees both that the
         * association is stable, and, on completion of the parent's
         * migration, @child is visible in the source of migration or
         * already in the destination cgroup.  This guarantee is necessary
        if (use_task_css_set_links) {
                struct css_set *cset;
  
 -              down_write(&css_set_rwsem);
 +              spin_lock_bh(&css_set_lock);
                cset = task_css_set(current);
                if (list_empty(&child->cg_list)) {
 -                      rcu_assign_pointer(child->cgroups, cset);
 -                      list_add(&child->cg_list, &cset->tasks);
                        get_css_set(cset);
 +                      css_set_move_task(child, NULL, cset, false);
                }
 -              up_write(&css_set_rwsem);
 +              spin_unlock_bh(&css_set_lock);
        }
  
        /*
@@@ -5591,42 -5429,39 +5591,42 @@@ void cgroup_exit(struct task_struct *ts
  {
        struct cgroup_subsys *ss;
        struct css_set *cset;
 -      bool put_cset = false;
        int i;
  
        /*
         * Unlink from @tsk from its css_set.  As migration path can't race
 -       * with us, we can check cg_list without grabbing css_set_rwsem.
 +       * with us, we can check css_set and cg_list without synchronization.
         */
 +      cset = task_css_set(tsk);
 +
        if (!list_empty(&tsk->cg_list)) {
 -              down_write(&css_set_rwsem);
 -              list_del_init(&tsk->cg_list);
 -              up_write(&css_set_rwsem);
 -              put_cset = true;
 +              spin_lock_bh(&css_set_lock);
 +              css_set_move_task(tsk, cset, NULL, false);
 +              spin_unlock_bh(&css_set_lock);
 +      } else {
 +              get_css_set(cset);
        }
  
 -      /* Reassign the task to the init_css_set. */
 -      cset = task_css_set(tsk);
 -      RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 -
        /* see cgroup_post_fork() for details */
 -      for_each_subsys_which(ss, i, &have_exit_callback) {
 -              struct cgroup_subsys_state *old_css = cset->subsys[i];
 -              struct cgroup_subsys_state *css = task_css(tsk, i);
 +      for_each_subsys_which(ss, i, &have_exit_callback)
 +              ss->exit(tsk);
 +}
  
 -              ss->exit(css, old_css, tsk);
 -      }
 +void cgroup_free(struct task_struct *task)
 +{
 +      struct css_set *cset = task_css_set(task);
 +      struct cgroup_subsys *ss;
 +      int ssid;
  
 -      if (put_cset)
 -              put_css_set(cset);
 +      for_each_subsys_which(ss, ssid, &have_free_callback)
 +              ss->free(task);
 +
 +      put_css_set(cset);
  }
  
  static void check_for_release(struct cgroup *cgrp)
  {
 -      if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
 +      if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
            !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
                schedule_work(&cgrp->release_agent_work);
  }
@@@ -5705,13 -5540,25 +5705,13 @@@ static int __init cgroup_disable(char *
                        if (strcmp(token, ss->name) &&
                            strcmp(token, ss->legacy_name))
                                continue;
 -
 -                      ss->disabled = 1;
 -                      printk(KERN_INFO "Disabling %s control group subsystem\n",
 -                             ss->name);
 -                      break;
 +                      cgroup_disable_mask |= 1 << i;
                }
        }
        return 1;
  }
  __setup("cgroup_disable=", cgroup_disable);
  
 -static int __init cgroup_set_legacy_files_on_dfl(char *str)
 -{
 -      printk("cgroup: using legacy files on the default hierarchy\n");
 -      cgroup_legacy_files_on_dfl = true;
 -      return 0;
 -}
 -__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
 -
  /**
   * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
   * @dentry: directory dentry of interest
@@@ -5815,7 -5662,7 +5815,7 @@@ static int current_css_set_cg_links_rea
        if (!name_buf)
                return -ENOMEM;
  
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        cset = rcu_dereference(current->cgroups);
        list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                           c->root->hierarchy_id, name_buf);
        }
        rcu_read_unlock();
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        kfree(name_buf);
        return 0;
  }
@@@ -5837,7 -5684,7 +5837,7 @@@ static int cgroup_css_links_read(struc
        struct cgroup_subsys_state *css = seq_css(seq);
        struct cgrp_cset_link *link;
  
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
                struct css_set *cset = link->cset;
                struct task_struct *task;
        overflow:
                seq_puts(seq, "  ...\n");
        }
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        return 0;
  }
  
  static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
  {
 -      return (!cgroup_has_tasks(css->cgroup) &&
 +      return (!cgroup_is_populated(css->cgroup) &&
                !css_has_online_children(&css->cgroup->self));
  }
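
The cgroup.c hunks above replace holding css_set_rwsem across a whole task walk with taking css_set_lock only inside css_task_iter_start/next/end; css_task_iter_next() now returns the task with a reference held and the lock dropped, so callers may sleep between iterations. A minimal kernel-context sketch of a caller, assuming only the signatures visible in the hunks; walk_css_tasks() and its pr_debug() body are hypothetical, not part of the patch:

    #include <linux/cgroup.h>
    #include <linux/sched.h>

    /*
     * Sketch of a css_task_iter user under the reworked locking:
     * css_task_iter_next() returns the task with a reference held and
     * css_set_lock released, so the loop body is allowed to sleep; the
     * reference is dropped by the next call or by css_task_iter_end().
     */
    static void walk_css_tasks(struct cgroup_subsys_state *css)
    {
            struct css_task_iter it;
            struct task_struct *task;

            css_task_iter_start(css, &it);
            while ((task = css_task_iter_next(&it)))
                    pr_debug("css %d: task %d\n", css->id, task->pid);
            css_task_iter_end(&it);
    }
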
  
diff --combined kernel/cpuset.c
index c02d677c541c68067f76f0053864ed176ba39ccc,9ef59a37c1907acb633581e4c9658479439b9a3a..c9ea63ff70a7433d02a7791aed66a681c5058227
@@@ -51,7 -51,6 +51,7 @@@
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/time.h>
 +#include <linux/time64.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
  
@@@ -69,7 -68,7 +69,7 @@@ struct static_key cpusets_enabled_key _
  struct fmeter {
        int cnt;                /* unprocessed events count */
        int val;                /* most recent output value */
 -      time_t time;            /* clock (secs) when val computed */
 +      time64_t time;          /* clock (secs) when val computed */
        spinlock_t lock;        /* guards read or write of above */
  };
  
@@@ -474,8 -473,7 +474,8 @@@ static int validate_change(struct cpuse
  
        /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
 -      if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
 +      if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 +          !is_cpuset_subset(trial, par))
                goto out;
  
        /*
         * be changed to have empty cpus_allowed or mems_allowed.
         */
        ret = -ENOSPC;
 -      if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
 +      if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
                if (!cpumask_empty(cur->cpus_allowed) &&
                    cpumask_empty(trial->cpus_allowed))
                        goto out;
@@@ -881,8 -879,7 +881,8 @@@ static void update_cpumasks_hier(struc
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs.
                 */
 -              if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
 +              if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 +                  cpumask_empty(new_cpus))
                        cpumask_copy(new_cpus, parent->effective_cpus);
  
                /* Skip the whole subtree if the cpumask remains the same. */
                cpumask_copy(cp->effective_cpus, new_cpus);
                spin_unlock_irq(&callback_lock);
  
 -              WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 +              WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
  
                update_tasks_cpumask(cp);
@@@ -1138,8 -1135,7 +1138,8 @@@ static void update_nodemasks_hier(struc
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some MEMs.
                 */
 -              if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
 +              if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 +                  nodes_empty(*new_mems))
                        *new_mems = parent->effective_mems;
  
                /* Skip the whole subtree if the nodemask remains the same. */
                cp->effective_mems = *new_mems;
                spin_unlock_irq(&callback_lock);
  
 -              WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 +              WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                        !nodes_equal(cp->mems_allowed, cp->effective_mems));
  
                update_tasks_nodemask(cp);
@@@ -1375,7 -1371,7 +1375,7 @@@ out
   */
  
  #define FM_COEF 933           /* coefficient for half-life of 10 secs */
 -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
 +#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
  #define FM_MAXCNT 1000000     /* limit cnt to avoid overflow */
  #define FM_SCALE 1000         /* faux fixed point scale */
  
@@@ -1391,11 -1387,8 +1391,11 @@@ static void fmeter_init(struct fmeter *
  /* Internal meter update - process cnt events and update value */
  static void fmeter_update(struct fmeter *fmp)
  {
 -      time_t now = get_seconds();
 -      time_t ticks = now - fmp->time;
 +      time64_t now;
 +      u32 ticks;
 +
 +      now = ktime_get_seconds();
 +      ticks = now - fmp->time;
  
        if (ticks == 0)
                return;
@@@ -1447,7 -1440,7 +1447,7 @@@ static int cpuset_can_attach(struct cgr
  
        /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
 -      if (!cgroup_on_dfl(css->cgroup) &&
 +      if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;
  
@@@ -1491,8 -1484,9 +1491,8 @@@ static void cpuset_attach(struct cgroup
  {
        /* static buf protected by cpuset_mutex */
        static nodemask_t cpuset_attach_nodemask_to;
 -      struct mm_struct *mm;
        struct task_struct *task;
 -      struct task_struct *leader = cgroup_taskset_first(tset);
 +      struct task_struct *leader;
        struct cpuset *cs = css_cs(css);
        struct cpuset *oldcs = cpuset_attach_old_cs;
  
        }
  
        /*
 -       * Change mm, possibly for multiple threads in a threadgroup. This is
 -       * expensive and may sleep.
 +       * Change mm for all threadgroup leaders. This is expensive and may
 +       * sleep and should be moved outside migration path proper.
         */
        cpuset_attach_nodemask_to = cs->effective_mems;
 -      mm = get_task_mm(leader);
 -      if (mm) {
 -              mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 -
 -              /*
 -               * old_mems_allowed is the same with mems_allowed here, except
 -               * if this task is being moved automatically due to hotplug.
 -               * In that case @mems_allowed has been updated and is empty,
 -               * so @old_mems_allowed is the right nodesets that we migrate
 -               * mm from.
 -               */
 -              if (is_memory_migrate(cs)) {
 -                      cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 -                                        &cpuset_attach_nodemask_to);
 +      cgroup_taskset_for_each_leader(leader, tset) {
 +              struct mm_struct *mm = get_task_mm(leader);
 +
 +              if (mm) {
 +                      mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 +
 +                      /*
 +                       * old_mems_allowed is the same with mems_allowed
 +                       * here, except if this task is being moved
 +                       * automatically due to hotplug.  In that case
 +                       * @mems_allowed has been updated and is empty, so
 +                       * @old_mems_allowed is the right nodesets that we
 +                       * migrate mm from.
 +                       */
 +                      if (is_memory_migrate(cs)) {
 +                              cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 +                                                &cpuset_attach_nodemask_to);
 +                      }
 +                      mmput(mm);
                }
 -              mmput(mm);
        }
  
        cs->old_mems_allowed = cpuset_attach_nodemask_to;
@@@ -1604,6 -1594,9 +1604,6 @@@ static int cpuset_write_u64(struct cgro
        case FILE_MEMORY_PRESSURE_ENABLED:
                cpuset_memory_pressure_enabled = !!val;
                break;
 -      case FILE_MEMORY_PRESSURE:
 -              retval = -EACCES;
 -              break;
        case FILE_SPREAD_PAGE:
                retval = update_flag(CS_SPREAD_PAGE, cs, val);
                break;
@@@ -1870,6 -1863,9 +1870,6 @@@ static struct cftype files[] = 
        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
 -              .write_u64 = cpuset_write_u64,
 -              .private = FILE_MEMORY_PRESSURE,
 -              .mode = S_IRUGO,
        },
  
        {
@@@ -1956,7 -1952,7 +1956,7 @@@ static int cpuset_css_online(struct cgr
        cpuset_inc();
  
        spin_lock_irq(&callback_lock);
 -      if (cgroup_on_dfl(cs->css.cgroup)) {
 +      if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
        }
@@@ -2033,7 -2029,7 +2033,7 @@@ static void cpuset_bind(struct cgroup_s
        mutex_lock(&cpuset_mutex);
        spin_lock_irq(&callback_lock);
  
 -      if (cgroup_on_dfl(root_css->cgroup)) {
 +      if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                top_cpuset.mems_allowed = node_possible_map;
        } else {
@@@ -2214,7 -2210,7 +2214,7 @@@ retry
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);
  
 -      if (cgroup_on_dfl(cs->css.cgroup))
 +      if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
@@@ -2245,7 -2241,7 +2245,7 @@@ static void cpuset_hotplug_workfn(struc
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
 -      bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
 +      bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
  
        mutex_lock(&cpuset_mutex);
  
@@@ -2602,22 -2598,22 +2602,22 @@@ int cpuset_mems_allowed_intersects(cons
  }
  
  /**
-  * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
-  * @tsk: pointer to task_struct of some task.
+  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
   *
-  * Description: Prints @task's name, cpuset name, and cached copy of its
+  * Description: Prints current's name, cpuset name, and cached copy of its
   * mems_allowed to the kernel log.
   */
- void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+ void cpuset_print_current_mems_allowed(void)
  {
        struct cgroup *cgrp;
  
        rcu_read_lock();
  
-       cgrp = task_cs(tsk)->css.cgroup;
-       pr_info("%s cpuset=", tsk->comm);
+       cgrp = task_cs(current)->css.cgroup;
+       pr_info("%s cpuset=", current->comm);
        pr_cont_cgroup_name(cgrp);
-       pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
+       pr_cont(" mems_allowed=%*pbl\n",
+               nodemask_pr_args(&current->mems_allowed));
  
        rcu_read_unlock();
  }
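
The fmeter hunks above switch the stored timestamp to time64_t via ktime_get_seconds() and clamp the elapsed ticks to a u32. The constants already defined in the file pin down the decay rate: FM_COEF/FM_SCALE = 933/1000, which is approximately 2^(-1/10), so ten one-second ticks halve the value, matching the "half-life of 10 secs" comment. A small sketch of that per-tick decay using the FM_* macros defined above; fmeter_decay() is my own illustration, not a copy of fmeter_update():

    #include <linux/types.h>

    /*
     * Exponential decay implied by FM_COEF: 933/1000 ~= 2^(-1/10), so
     * applying it once per elapsed second halves val roughly every 10s.
     */
    static int fmeter_decay(int val, u32 ticks)
    {
            while (ticks-- > 0)
                    val = (FM_COEF * val) / FM_SCALE;
            return val;
    }
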
diff --combined kernel/fork.c
index 825ecc32454d23f4e60216bedfb2de31fe504699,fe7968901bea81a8f9bc40ae0a28f473c28e832b..f97f2c449f5cf556ea6c54cb4aec6e894dd8bab5
@@@ -251,7 -251,6 +251,7 @@@ void __put_task_struct(struct task_stru
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
  
 +      cgroup_free(tsk);
        task_numa_free(tsk);
        security_task_free(tsk);
        exit_creds(tsk);
@@@ -455,7 -454,8 +455,8 @@@ static int dup_mmap(struct mm_struct *m
                tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
+               tmp->vm_flags &=
+                       ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
                tmp->vm_next = tmp->vm_prev = NULL;
                tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                file = tmp->vm_file;
@@@ -1102,7 -1102,7 +1103,7 @@@ static void posix_cpu_timers_init_group
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
        if (cpu_limit != RLIM_INFINITY) {
                sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 -              sig->cputimer.running = 1;
 +              sig->cputimer.running = true;
        }
  
        /* The timer lists. */
@@@ -1150,6 -1150,10 +1151,6 @@@ static int copy_signal(unsigned long cl
        tty_audit_fork(sig);
        sched_autogroup_fork(sig);
  
 -#ifdef CONFIG_CGROUPS
 -      init_rwsem(&sig->group_rwsem);
 -#endif
 -
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
diff --combined kernel/futex.c
index 684d7549825a4300ced2002a3fbec0a5698a18d1,843b552ddd75264b02dcc87c662498482b47c0c8..470c06c3299a7feb5cc0598508715e453274c677
@@@ -255,18 -255,9 +255,18 @@@ struct futex_hash_bucket 
        struct plist_head chain;
  } ____cacheline_aligned_in_smp;
  
 -static unsigned long __read_mostly futex_hashsize;
 +/*
 + * The base of the bucket array and its size are always used together
 + * (after initialization only in hash_futex()), so ensure that they
 + * reside in the same cacheline.
 + */
 +static struct {
 +      struct futex_hash_bucket *queues;
 +      unsigned long            hashsize;
 +} __futex_data __read_mostly __aligned(2*sizeof(long));
 +#define futex_queues   (__futex_data.queues)
 +#define futex_hashsize (__futex_data.hashsize)
  
 -static struct futex_hash_bucket *futex_queues;
  
  /*
   * Fault injections for futexes.
  static struct {
        struct fault_attr attr;
  
 -      u32 ignore_private;
 +      bool ignore_private;
  } fail_futex = {
        .attr = FAULT_ATTR_INITIALIZER,
 -      .ignore_private = 0,
 +      .ignore_private = false,
  };
  
  static int __init setup_fail_futex(char *str)
@@@ -469,7 -460,7 +469,7 @@@ get_futex_key(u32 __user *uaddr, int fs
  {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
-       struct page *page, *page_head;
+       struct page *page;
        int err, ro = 0;
  
        /*
@@@ -519,46 -510,9 +519,9 @@@ again
        else
                err = 0;
  
- #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       page_head = page;
-       if (unlikely(PageTail(page))) {
-               put_page(page);
-               /* serialize against __split_huge_page_splitting() */
-               local_irq_disable();
-               if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
-                       page_head = compound_head(page);
-                       /*
-                        * page_head is valid pointer but we must pin
-                        * it before taking the PG_lock and/or
-                        * PG_compound_lock. The moment we re-enable
-                        * irqs __split_huge_page_splitting() can
-                        * return and the head page can be freed from
-                        * under us. We can't take the PG_lock and/or
-                        * PG_compound_lock on a page that could be
-                        * freed from under us.
-                        */
-                       if (page != page_head) {
-                               get_page(page_head);
-                               put_page(page);
-                       }
-                       local_irq_enable();
-               } else {
-                       local_irq_enable();
-                       goto again;
-               }
-       }
- #else
-       page_head = compound_head(page);
-       if (page != page_head) {
-               get_page(page_head);
-               put_page(page);
-       }
- #endif
-       lock_page(page_head);
+       lock_page(page);
        /*
-        * If page_head->mapping is NULL, then it cannot be a PageAnon
+        * If page->mapping is NULL, then it cannot be a PageAnon
         * page; but it might be the ZERO_PAGE or in the gate area or
         * in a special mapping (all cases which we are happy to fail);
         * or it may have been a good file page when get_user_pages_fast
         *
         * The case we do have to guard against is when memory pressure made
         * shmem_writepage move it from filecache to swapcache beneath us:
-        * an unlikely race, but we do need to retry for page_head->mapping.
+        * an unlikely race, but we do need to retry for page->mapping.
         */
-       if (!page_head->mapping) {
-               int shmem_swizzled = PageSwapCache(page_head);
-               unlock_page(page_head);
-               put_page(page_head);
+       if (!page->mapping) {
+               int shmem_swizzled = PageSwapCache(page);
+               unlock_page(page);
+               put_page(page);
                if (shmem_swizzled)
                        goto again;
                return -EFAULT;
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
         */
-       if (PageAnon(page_head)) {
+       if (PageAnon(page)) {
                /*
                 * A RO anonymous page will never change and thus doesn't make
                 * sense for futex operations.
                key->private.address = address;
        } else {
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-               key->shared.inode = page_head->mapping->host;
+               key->shared.inode = page->mapping->host;
                key->shared.pgoff = basepage_index(page);
        }
  
        get_futex_key_refs(key); /* implies MB (B) */
  
  out:
-       unlock_page(page_head);
-       put_page(page_head);
+       unlock_page(page);
+       put_page(page);
        return err;
  }
  
diff --combined kernel/kexec_core.c
index bd9f8a03cefa4ef05c08d54a357910286487afd8,dd21c783e3dde44769fbadc526ad5b73b1ea1a62..11b64a63c0f88817b80a2c35117d70bcfe446fa1
@@@ -6,7 -6,7 +6,7 @@@
   * Version 2.  See the file COPYING for more details.
   */
  
- #define pr_fmt(fmt)   "kexec: " fmt
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  
  #include <linux/capability.h>
  #include <linux/mm.h>
@@@ -1027,7 -1027,7 +1027,7 @@@ static int __init crash_notes_memory_in
  
        crash_notes = __alloc_percpu(size, align);
        if (!crash_notes) {
-               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+               pr_warn("Memory allocation for saving cpu register states failed\n");
                return -ENOMEM;
        }
        return 0;
@@@ -1149,7 -1149,7 +1149,7 @@@ static int __init parse_crashkernel_sim
        if (*cur == '@')
                *crash_base = memparse(cur+1, &cur);
        else if (*cur != ' ' && *cur != '\0') {
 -              pr_warn("crashkernel: unrecognized char\n");
 +              pr_warn("crashkernel: unrecognized char: %c\n", *cur);
                return -EINVAL;
        }
  
@@@ -1186,12 -1186,12 +1186,12 @@@ static int __init parse_crashkernel_suf
  
        /* check with suffix */
        if (strncmp(cur, suffix, strlen(suffix))) {
 -              pr_warn("crashkernel: unrecognized char\n");
 +              pr_warn("crashkernel: unrecognized char: %c\n", *cur);
                return -EINVAL;
        }
        cur += strlen(suffix);
        if (*cur != ' ' && *cur != '\0') {
 -              pr_warn("crashkernel: unrecognized char\n");
 +              pr_warn("crashkernel: unrecognized char: %c\n", *cur);
                return -EINVAL;
        }
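
kexec_core.c now derives its pr_fmt() prefix from KBUILD_MODNAME instead of the hard-coded "kexec: ", which is why the explicit "Kexec:" in the allocation-failure message is dropped. A hedged illustration of what the macro does; the exact prefix depends on the object name the build assigns, presumably "kexec_core" for this file:

    /* pr_fmt() must be defined before printk.h is pulled in. */
    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
    #include <linux/printk.h>

    static void report_alloc_failure(void)
    {
            /*
             * Built as kexec_core.o, KBUILD_MODNAME is presumably
             * "kexec_core", so this prints roughly:
             *   kexec_core: Memory allocation for saving cpu register states failed
             */
            pr_warn("Memory allocation for saving cpu register states failed\n");
    }
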
  
diff --combined kernel/params.c
index ed1e0a1cffa7c7b78d750df0b72770a7769817b7,93a380a2345d71ae8c884a56006977eec959cbf8..a6d6149c0fe60df1ca38d9a66acef281b78ee79d
@@@ -223,7 -223,7 +223,7 @@@ char *parse_args(const char *doing
                 int (*unknown)(char *param, char *val,
                                const char *doing, void *arg))
  {
 -      char *param, *val;
 +      char *param, *val, *err = NULL;
  
        /* Chew leading spaces */
        args = skip_spaces(args);
                args = next_arg(args, &param, &val);
                /* Stop at -- */
                if (!val && strcmp(param, "--") == 0)
 -                      return args;
 +                      return err ?: args;
                irq_was_disabled = irqs_disabled();
                ret = parse_one(param, val, doing, params, num,
                                min_level, max_level, arg, unknown);
                                doing, param);
  
                switch (ret) {
 +              case 0:
 +                      continue;
                case -ENOENT:
                        pr_err("%s: Unknown parameter `%s'\n", doing, param);
 -                      return ERR_PTR(ret);
 +                      break;
                case -ENOSPC:
                        pr_err("%s: `%s' too large for parameter `%s'\n",
                               doing, val ?: "", param);
 -                      return ERR_PTR(ret);
 -              case 0:
                        break;
                default:
                        pr_err("%s: `%s' invalid for parameter `%s'\n",
                               doing, val ?: "", param);
 -                      return ERR_PTR(ret);
 +                      break;
                }
 +
 +              err = ERR_PTR(ret);
        }
  
 -      /* All parsed OK. */
 -      return NULL;
 +      return err;
  }
  
  /* Lazy bastard, eh? */
@@@ -326,10 -325,11 +326,11 @@@ int param_get_charp(char *buffer, cons
  }
  EXPORT_SYMBOL(param_get_charp);
  
 - static void param_free_charp(void *arg)
+ void param_free_charp(void *arg)
  {
        maybe_kfree_parameter(*((char **)arg));
  }
+ EXPORT_SYMBOL(param_free_charp);
  
  const struct kernel_param_ops param_ops_charp = {
        .set = param_set_charp,
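
With the parse_args() rework above, a bad parameter no longer aborts parsing: each failure is logged, remembered in err, and the remaining arguments are still processed, with the saved ERR_PTR() (or the "--" remainder, or NULL) returned at the end. A sketch of what a caller sees under that behaviour; handle_unknown(), parse_example() and the command line are hypothetical stand-ins:

    #include <linux/moduleparam.h>
    #include <linux/kernel.h>
    #include <linux/err.h>

    /* Hypothetical unknown-parameter handler; signature as in parse_args(). */
    static int handle_unknown(char *param, char *val, const char *doing, void *arg)
    {
            pr_info("%s: saw %s=%s\n", doing, param, val ?: "");
            return 0;
    }

    static void parse_example(char *cmdline)
    {
            char *after = parse_args("example", cmdline, NULL, 0, 0, 0,
                                     NULL, handle_unknown);

            if (IS_ERR(after))
                    pr_warn("example: at least one parameter was invalid\n");
            else if (after)
                    pr_info("example: arguments after '--': %s\n", after);
    }
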
diff --combined kernel/sysctl.c
index 96c856b040819e30f5e9d4dad4ac396569f0eba0,a3411175f7166f7cf87a962204355f9aead56691..dc6858d6639ed022d65129bdbb869ff7bcc05789
@@@ -64,7 -64,6 +64,7 @@@
  #include <linux/binfmts.h>
  #include <linux/sched/sysctl.h>
  #include <linux/kexec.h>
 +#include <linux/bpf.h>
  
  #include <asm/uaccess.h>
  #include <asm/processor.h>
@@@ -888,6 -887,17 +888,17 @@@ static struct ctl_table kern_table[] = 
                .extra1         = &zero,
                .extra2         = &one,
        },
+ #ifdef CONFIG_HARDLOCKUP_DETECTOR
+       {
+               .procname       = "hardlockup_panic",
+               .data           = &hardlockup_panic,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+ #endif
  #ifdef CONFIG_SMP
        {
                .procname       = "softlockup_all_cpu_backtrace",
                .extra1         = &zero,
                .extra2         = &one,
        },
+       {
+               .procname       = "hardlockup_all_cpu_backtrace",
+               .data           = &sysctl_hardlockup_all_cpu_backtrace,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
  #endif /* CONFIG_SMP */
  #endif
  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
                .mode           = 0644,
                .proc_handler   = timer_migration_handler,
        },
 +#endif
 +#ifdef CONFIG_BPF_SYSCALL
 +      {
 +              .procname       = "unprivileged_bpf_disabled",
 +              .data           = &sysctl_unprivileged_bpf_disabled,
 +              .maxlen         = sizeof(sysctl_unprivileged_bpf_disabled),
 +              .mode           = 0644,
 +              /* only handle a transition from default "0" to "1" */
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = &one,
 +              .extra2         = &one,
 +      },
  #endif
        { }
  };
diff --combined lib/Kconfig.debug
index 565783733cd013166b1edc367ba3bc919215d2bd,4d1b97b03b2f8ee27a000f66c4ab8c6cb5cae94d..526105c18566c16b0bdd6f0dc0d958b262724e0d
@@@ -216,7 -216,7 +216,7 @@@ config STRIP_ASM_SYM
  
  config READABLE_ASM
          bool "Generate readable assembler code"
 -        depends on DEBUG_KERNEL
 +        depends on DEBUG_KERNEL && !LTO
          help
            Disable some compiler optimizations that tend to generate human unreadable
            assembler output. This may make the kernel slightly slower, but it helps
@@@ -312,15 -312,6 +312,15 @@@ config DEBUG_SECTION_MISMATC
          - Enable verbose reporting from modpost in order to help resolve
            the section mismatches that are reported.
  
 +config SECTION_MISMATCH_WARN_ONLY
 +      bool "Make section mismatch errors non-fatal"
 +      default y
 +      help
 +        If you say N here, the build process will fail if there are any
 +        section mismatches, instead of just throwing warnings.
 +
 +        If unsure, say Y.
 +
  #
  # Select this config option from the architecture Kconfig, if it
  # is preferred to always offer frame pointers as a config
@@@ -580,6 -571,14 +580,14 @@@ config DEBUG_VM_R
  
          If unsure, say N.
  
+ config DEBUG_VM_PGFLAGS
+       bool "Debug page-flags operations"
+       depends on DEBUG_VM
+       help
+         Enables extra validation on page flags operations.
+         If unsure, say N.
  config DEBUG_VIRTUAL
        bool "Debug VM translations"
        depends on DEBUG_KERNEL && X86
@@@ -1695,6 -1694,9 +1703,9 @@@ config TEST_STRING_HELPER
  config TEST_KSTRTOX
        tristate "Test kstrto*() family of functions at runtime"
  
+ config TEST_PRINTF
+       tristate "Test printf() family of functions at runtime"
  config TEST_RHASHTABLE
        tristate "Perform selftest on resizable hash table"
        default n
@@@ -1762,6 -1764,16 +1773,16 @@@ config DMA_API_DEBU
  
          If unsure, say N.
  
+ config DMA_API_DEBUG_POISON
+       bool "Poison coherent DMA buffers"
+       depends on DMA_API_DEBUG && EXPERT
+       help
+         Poison DMA buffers returned by dma_alloc_coherent unless __GFP_ZERO
+         is explicitly specified, to catch drivers depending on zeroed buffers
+         without passing the correct flags.
+         Only say Y if you're prepared for almost everything to break.
  config TEST_LKM
        tristate "Test module loading with 'hello world' module"
        default n
diff --combined lib/Makefile
index 8de3b012eac77ed2c14160022d0cc6b9f75773c6,dbf6f3d6eefb41ac2919f874c6eef5570e6cc88e..8498a5c9815a3c0273d99fd45c3d10bdf42c3375
@@@ -26,8 -26,7 +26,8 @@@ obj-y += bcd.o div64.o sort.o parser.o 
         bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
         gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
         bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 -       percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
 +       percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
 +       once.o
  obj-y += string_helpers.o
  obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
  obj-y += hexdump.o
@@@ -42,6 -41,7 +42,7 @@@ obj-$(CONFIG_TEST_RHASHTABLE) += test_r
  obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
  obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
  obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
+ obj-$(CONFIG_TEST_PRINTF) += test_printf.o
  
  ifeq ($(CONFIG_DEBUG_KOBJECT),y)
  CFLAGS_kobject.o += -DDEBUG
@@@ -82,6 -82,7 +83,7 @@@ obj-$(CONFIG_CRC32)   += crc32.
  obj-$(CONFIG_CRC7)    += crc7.o
  obj-$(CONFIG_LIBCRC32C)       += libcrc32c.o
  obj-$(CONFIG_CRC8)    += crc8.o
+ obj-$(CONFIG_CRC64_ECMA)      += crc64_ecma.o
  obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
  
  obj-$(CONFIG_842_COMPRESS) += 842/
diff --combined lib/dma-debug.c
index fcb65d2a0b947e85e335599229bd3a44a1c425d7,40514eddb67053562978de2c06a8f12451175a98..af6262b4e02c62c11bbf6295abc22c38ca4bc5a1
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/sched.h>
  #include <linux/ctype.h>
  #include <linux/list.h>
+ #include <linux/poison.h>
  #include <linux/slab.h>
  
  #include <asm/sections.h>
@@@ -100,7 -101,7 +101,7 @@@ static LIST_HEAD(free_entries)
  static DEFINE_SPINLOCK(free_entries_lock);
  
  /* Global disable flag - will be set in case of an error */
 -static u32 global_disable __read_mostly;
 +static bool global_disable __read_mostly;
  
  /* Early initialization disable flag, set at the end of dma_debug_init */
  static bool dma_debug_initialized __read_mostly;
@@@ -1249,6 -1250,14 +1250,14 @@@ static void check_sync(struct device *d
                                dir2name[entry->direction],
                                dir2name[ref->direction]);
  
+       if (ref->sg_call_ents && ref->type == dma_debug_sg &&
+           ref->sg_call_ents != entry->sg_call_ents) {
+               err_printk(ref->dev, entry, "DMA-API: device driver syncs "
+                          "DMA sg list with different entry count "
+                          "[map count=%d] [sync count=%d]\n",
+                          entry->sg_call_ents, ref->sg_call_ents);
+       }
  out:
        put_hash_bucket(bucket, &flags);
  }
@@@ -1439,7 -1448,7 +1448,7 @@@ void debug_dma_unmap_sg(struct device *
  EXPORT_SYMBOL(debug_dma_unmap_sg);
  
  void debug_dma_alloc_coherent(struct device *dev, size_t size,
-                             dma_addr_t dma_addr, void *virt)
+                             dma_addr_t dma_addr, void *virt, gfp_t flags)
  {
        struct dma_debug_entry *entry;
  
        if (unlikely(virt == NULL))
                return;
  
+       if (IS_ENABLED(CONFIG_DMA_API_DEBUG_POISON) && !(flags & __GFP_ZERO))
+               memset(virt, DMA_ALLOC_POISON, size);
        entry = dma_entry_alloc();
        if (!entry)
                return;
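
The dma-debug hunks thread the allocation's gfp flags into debug_dma_alloc_coherent(); with the new CONFIG_DMA_API_DEBUG_POISON the buffer is filled with DMA_ALLOC_POISON unless __GFP_ZERO was passed. A sketch of the driver-side consequence, with dev/size as placeholders and error handling omitted:

    #include <linux/dma-mapping.h>

    /*
     * With DMA_API_DEBUG_POISON=y only the second buffer is guaranteed
     * to start out zeroed; the first is poisoned, exposing drivers that
     * silently relied on zeroed coherent memory.
     */
    static void coherent_alloc_example(struct device *dev, size_t size)
    {
            dma_addr_t h1, h2;
            void *plain, *zeroed;

            plain  = dma_alloc_coherent(dev, size, &h1, GFP_KERNEL);
            zeroed = dma_alloc_coherent(dev, size, &h2, GFP_KERNEL | __GFP_ZERO);

            dma_free_coherent(dev, size, plain, h1);
            dma_free_coherent(dev, size, zeroed, h2);
    }
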
diff --combined lib/kobject.c
index 0554077462669074d4df0fd01e2d087d7f997837,fee2fd950306569573d5b2c6b164ccdb9fad3f15..7cbccd2b4c72042595484e32c2e11906948414b7
@@@ -257,18 -257,32 +257,32 @@@ static int kobject_add_internal(struct 
  int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
                                  va_list vargs)
  {
-       char *s;
+       const char *s;
  
        if (kobj->name && !fmt)
                return 0;
  
-       s = kvasprintf(GFP_KERNEL, fmt, vargs);
+       s = kvasprintf_const(GFP_KERNEL, fmt, vargs);
        if (!s)
                return -ENOMEM;
  
-       /* ewww... some of these buggers have '/' in the name ... */
-       strreplace(s, '/', '!');
-       kfree(kobj->name);
+       /*
+        * ewww... some of these buggers have '/' in the name ... If
+        * that's the case, we need to make sure we have an actual
+        * allocated copy to modify, since kvasprintf_const may have
+        * returned something from .rodata.
+        */
+       if (strchr(s, '/')) {
+               char *t;
+               t = kstrdup(s, GFP_KERNEL);
+               kfree_const(s);
+               if (!t)
+                       return -ENOMEM;
+               strreplace(t, '/', '!');
+               s = t;
+       }
+       kfree_const(kobj->name);
        kobj->name = s;
  
        return 0;
@@@ -466,7 -480,7 +480,7 @@@ int kobject_rename(struct kobject *kobj
        envp[0] = devpath_string;
        envp[1] = NULL;
  
-       name = dup_name = kstrdup(new_name, GFP_KERNEL);
+       name = dup_name = kstrdup_const(new_name, GFP_KERNEL);
        if (!name) {
                error = -ENOMEM;
                goto out;
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);
  
  out:
-       kfree(dup_name);
+       kfree_const(dup_name);
        kfree(devpath_string);
        kfree(devpath);
        kobject_put(kobj);
@@@ -568,7 -582,6 +582,7 @@@ void kobject_del(struct kobject *kobj
        kobject_put(kobj->parent);
        kobj->parent = NULL;
  }
 +EXPORT_SYMBOL(kobject_del);
  
  /**
   * kobject_get - increment refcount for object.
@@@ -585,7 -598,6 +599,7 @@@ struct kobject *kobject_get(struct kobj
        }
        return kobj;
  }
 +EXPORT_SYMBOL(kobject_get);
  
  static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
  {
@@@ -634,7 -646,7 +648,7 @@@ static void kobject_cleanup(struct kobj
        /* free name if we allocated it */
        if (name) {
                pr_debug("kobject: '%s': free name\n", name);
-               kfree(name);
+               kfree_const(name);
        }
  }
  
@@@ -677,7 -689,6 +691,7 @@@ void kobject_put(struct kobject *kobj
                kref_put(&kobj->kref, kobject_release);
        }
  }
 +EXPORT_SYMBOL(kobject_put);
  
  static void dynamic_kobj_release(struct kobject *kobj)
  {
@@@ -806,7 -817,6 +820,7 @@@ int kset_register(struct kset *k
        kobject_uevent(&k->kobj, KOBJ_ADD);
        return 0;
  }
 +EXPORT_SYMBOL(kset_register);
  
  /**
   * kset_unregister - remove a kset.
@@@ -819,7 -829,6 +833,7 @@@ void kset_unregister(struct kset *k
        kobject_del(&k->kobj);
        kobject_put(&k->kobj);
  }
 +EXPORT_SYMBOL(kset_unregister);
  
  /**
   * kset_find_obj - search for object in kset.
@@@ -1056,3 -1065,10 +1070,3 @@@ void kobj_ns_drop(enum kobj_ns_type typ
                kobj_ns_ops_tbl[type]->drop_ns(ns);
        spin_unlock(&kobj_ns_type_lock);
  }
 -
 -EXPORT_SYMBOL(kobject_get);
 -EXPORT_SYMBOL(kobject_put);
 -EXPORT_SYMBOL(kobject_del);
 -
 -EXPORT_SYMBOL(kset_register);
 -EXPORT_SYMBOL(kset_unregister);
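The kobject changes above lean on the kstrdup_const()/kvasprintf_const()/kfree_const() helpers, which may hand back a pointer into .rodata instead of a fresh allocation. A small sketch of that idiom (the save_label()/drop_label() wrappers are invented for illustration):

	#include <linux/slab.h>
	#include <linux/string.h>

	static const char *save_label(const char *src)
	{
		/* May return src itself if it lives in .rodata, so keep it const. */
		const char *label = kstrdup_const(src, GFP_KERNEL);

		return label;	/* NULL on allocation failure */
	}

	static void drop_label(const char *label)
	{
		/* Pairs with kstrdup_const(): no-op for .rodata, kfree() otherwise. */
		kfree_const(label);
	}

This is also why kobject_set_name_vargs() above has to take a real kstrdup() copy before calling strreplace() on names that contain '/'.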
diff --combined mm/backing-dev.c
index 619984fc07ec32792349c7fe8aec7c8b56e3d2a3,e7781eb35fd122507149fe8b8306948a95b32455..8ed2ffd963c53b910f91e1b60b04c56385a3129f
@@@ -480,10 -480,6 +480,10 @@@ static void cgwb_release_workfn(struct 
                                                release_work);
        struct backing_dev_info *bdi = wb->bdi;
  
 +      spin_lock_irq(&cgwb_lock);
 +      list_del_rcu(&wb->bdi_node);
 +      spin_unlock_irq(&cgwb_lock);
 +
        wb_shutdown(wb);
  
        css_put(wb->memcg_css);
@@@ -579,7 -575,6 +579,7 @@@ static int cgwb_create(struct backing_d
                ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
                if (!ret) {
                        atomic_inc(&bdi->usage_cnt);
 +                      list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
                        css_get(memcg_css);
@@@ -637,7 -632,7 +637,7 @@@ struct bdi_writeback *wb_get_create(str
  {
        struct bdi_writeback *wb;
  
-       might_sleep_if(gfp & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(gfp));
  
        if (!memcg_css->parent)
                return &bdi->wb;
@@@ -681,7 -676,7 +681,7 @@@ static int cgwb_bdi_init(struct backing
  static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
  {
        struct radix_tree_iter iter;
 -      struct bdi_writeback_congested *congested, *congested_n;
 +      struct rb_node *rbn;
        void **slot;
  
        WARN_ON(test_bit(WB_registered, &bdi->wb.state));
        radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
                cgwb_kill(*slot);
  
 -      rbtree_postorder_for_each_entry_safe(congested, congested_n,
 -                                      &bdi->cgwb_congested_tree, rb_node) {
 -              rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree);
 +      while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
 +              struct bdi_writeback_congested *congested =
 +                      rb_entry(rbn, struct bdi_writeback_congested, rb_node);
 +
 +              rb_erase(rbn, &bdi->cgwb_congested_tree);
                congested->bdi = NULL;  /* mark @congested unlinked */
        }
  
@@@ -771,22 -764,15 +771,22 @@@ static void cgwb_bdi_destroy(struct bac
  
  int bdi_init(struct backing_dev_info *bdi)
  {
 +      int ret;
 +
        bdi->dev = NULL;
  
        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
        INIT_LIST_HEAD(&bdi->bdi_list);
 +      INIT_LIST_HEAD(&bdi->wb_list);
        init_waitqueue_head(&bdi->wb_waitq);
  
 -      return cgwb_bdi_init(bdi);
 +      ret = cgwb_bdi_init(bdi);
 +
 +      list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
 +
 +      return ret;
  }
  EXPORT_SYMBOL(bdi_init);
  
@@@ -837,7 -823,7 +837,7 @@@ static void bdi_remove_from_list(struc
        synchronize_rcu_expedited();
  }
  
 -void bdi_destroy(struct backing_dev_info *bdi)
 +void bdi_unregister(struct backing_dev_info *bdi)
  {
        /* make sure nobody finds us on the bdi_list anymore */
        bdi_remove_from_list(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }
 +}
  
 +void bdi_exit(struct backing_dev_info *bdi)
 +{
 +      WARN_ON_ONCE(bdi->dev);
        wb_exit(&bdi->wb);
  }
 +
 +void bdi_destroy(struct backing_dev_info *bdi)
 +{
 +      bdi_unregister(bdi);
 +      bdi_exit(bdi);
 +}
  EXPORT_SYMBOL(bdi_destroy);
  
  /*
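One detail of the cgwb_bdi_destroy() hunk above: rbtree_postorder_for_each_entry_safe() is only safe while the tree is left untouched, so mixing it with rb_erase() (which rebalances the tree) is fragile; repeatedly erasing rb_first() is the robust way to drain an rbtree. A minimal illustration with an invented node type (demo_node and demo_drain() are not part of the patch, and the kfree() is only there to make the sketch self-contained; the kernel code above merely unlinks the congested entries):

	#include <linux/rbtree.h>
	#include <linux/slab.h>

	struct demo_node {
		struct rb_node rb;
		int key;
	};

	static void demo_drain(struct rb_root *root)
	{
		struct rb_node *rbn;

		/* Re-read the leftmost node each pass; rb_erase() rebalances the tree. */
		while ((rbn = rb_first(root))) {
			struct demo_node *n = rb_entry(rbn, struct demo_node, rb);

			rb_erase(rbn, root);
			kfree(n);
		}
	}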
diff --combined mm/failslab.c
index 98fb490311eb94386aebd2f4ceb77c729f4fa01e,35c876c82b9dc2f02cf22a5bc0af899d022bc742..79171b4a58269986491198403a322d6c2a7dc814
@@@ -3,12 -3,12 +3,12 @@@
  
  static struct {
        struct fault_attr attr;
-       bool ignore_gfp_wait;
 -      u32 ignore_gfp_reclaim;
 -      int cache_filter;
++      bool ignore_gfp_reclaim;
 +      bool cache_filter;
  } failslab = {
        .attr = FAULT_ATTR_INITIALIZER,
-       .ignore_gfp_wait = true,
 -      .ignore_gfp_reclaim = 1,
 -      .cache_filter = 0,
++      .ignore_gfp_reclaim = true,
 +      .cache_filter = false,
  };
  
  bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
@@@ -16,7 -16,7 +16,7 @@@
        if (gfpflags & __GFP_NOFAIL)
                return false;
  
-         if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+       if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
                return false;
  
        if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
@@@ -42,7 -42,7 +42,7 @@@ static int __init failslab_debugfs_init
                return PTR_ERR(dir);
  
        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &failslab.ignore_gfp_wait))
+                               &failslab.ignore_gfp_reclaim))
                goto fail;
        if (!debugfs_create_bool("cache-filter", mode, dir,
                                &failslab.cache_filter))
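The failslab change is part of the wider __GFP_WAIT removal in this merge (mm/backing-dev.c above switched to gfpflags_allow_blocking() for the same reason): callers now test the reclaim bits rather than the retired wait flag. A hedged sketch of the before/after check, assuming the __GFP_RECLAIM mask introduced by that series:

	#include <linux/gfp.h>

	/* Illustrative only; the real check is in should_failslab() above. */
	static bool demo_skip_fault_injection(gfp_t gfpflags, bool ignore_gfp_reclaim)
	{
		/* Old form: ignore_gfp_wait && (gfpflags & __GFP_WAIT) */
		return ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM);
	}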
diff --combined mm/huge_memory.c
index 440be97ad2bb0fcb0b3e8d83aaaa9ebc0696a008,cb34583d016cfcb073e97ddce31e8b3f5f1dd5dd..4b3420ade697b04659e6032ca602c7ac77149ad7
  #include <linux/hashtable.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/page_idle.h>
+ #include <linux/swapops.h>
  
  #include <asm/tlb.h>
  #include <asm/pgalloc.h>
  #include "internal.h"
  
+ enum scan_result {
+       SCAN_FAIL,
+       SCAN_SUCCEED,
+       SCAN_PMD_NULL,
+       SCAN_EXCEED_NONE_PTE,
+       SCAN_PTE_NON_PRESENT,
+       SCAN_PAGE_RO,
+       SCAN_NO_REFERENCED_PAGE,
+       SCAN_PAGE_NULL,
+       SCAN_SCAN_ABORT,
+       SCAN_PAGE_COUNT,
+       SCAN_PAGE_LRU,
+       SCAN_PAGE_LOCK,
+       SCAN_PAGE_ANON,
+       SCAN_PAGE_COMPOUND,
+       SCAN_ANY_PROCESS,
+       SCAN_VMA_NULL,
+       SCAN_VMA_CHECK,
+       SCAN_ADDRESS_RANGE,
+       SCAN_SWAP_CACHE_PAGE,
+       SCAN_DEL_PAGE_LRU,
+       SCAN_ALLOC_HUGE_PAGE_FAIL,
+       SCAN_CGROUP_CHARGE_FAIL,
+       SCAN_EXCEED_SWAP_PTE
+ };
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/huge_memory.h>
  /*
   * By default transparent hugepage support is disabled in order that avoid
   * to risk increase the memory footprint of applications without a guaranteed
@@@ -67,6 -97,7 +97,7 @@@ static DECLARE_WAIT_QUEUE_HEAD(khugepag
   * fault.
   */
  static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+ static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
  
  static int khugepaged(void *none);
  static int khugepaged_slab_init(void);
@@@ -106,6 -137,10 +137,10 @@@ static struct khugepaged_scan khugepage
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  };
  
+ static DEFINE_SPINLOCK(split_queue_lock);
+ static LIST_HEAD(split_queue);
+ static unsigned long split_queue_len;
+ static struct shrinker deferred_split_shrinker;
  
  static void set_recommended_min_free_kbytes(void)
  {
        for_each_populated_zone(zone)
                nr_zones++;
  
-       /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+       /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
        recommended_min = pageblock_nr_pages * nr_zones * 2;
  
        /*
@@@ -151,7 -186,7 +186,7 @@@ static int start_stop_khugepaged(void
                if (!khugepaged_thread)
                        khugepaged_thread = kthread_run(khugepaged, NULL,
                                                        "khugepaged");
 -              if (unlikely(IS_ERR(khugepaged_thread))) {
 +              if (IS_ERR(khugepaged_thread)) {
                        pr_err("khugepaged: kthread_run(khugepaged) failed\n");
                        err = PTR_ERR(khugepaged_thread);
                        khugepaged_thread = NULL;
@@@ -553,6 -588,33 +588,33 @@@ static struct kobj_attribute khugepaged
        __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
               khugepaged_max_ptes_none_store);
  
+ static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+                                            struct kobj_attribute *attr,
+                                            char *buf)
+ {
+       return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+ }
+ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+                                             struct kobj_attribute *attr,
+                                             const char *buf, size_t count)
+ {
+       int err;
+       unsigned long max_ptes_swap;
+       err  = kstrtoul(buf, 10, &max_ptes_swap);
+       if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+               return -EINVAL;
+       khugepaged_max_ptes_swap = max_ptes_swap;
+       return count;
+ }
+ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+       __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+              khugepaged_max_ptes_swap_store);
  static struct attribute *khugepaged_attr[] = {
        &khugepaged_defrag_attr.attr,
        &khugepaged_max_ptes_none_attr.attr,
        &full_scans_attr.attr,
        &scan_sleep_millisecs_attr.attr,
        &alloc_sleep_millisecs_attr.attr,
+       &khugepaged_max_ptes_swap_attr.attr,
        NULL,
  };
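The new max_ptes_swap attribute is registered alongside the existing khugepaged knobs, so it should appear with them in sysfs. A hypothetical userspace snippet for tuning it, assuming the usual /sys/kernel/mm/transparent_hugepage/khugepaged/ location:

	#include <stdio.h>

	int main(void)
	{
		const char *path =
			"/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* Allow at most 16 swapped-out PTEs per candidate huge page. */
		fprintf(f, "16\n");
		return fclose(f) ? 1 : 0;
	}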
  
@@@ -638,6 -701,9 +701,9 @@@ static int __init hugepage_init(void
        err = register_shrinker(&huge_zero_page_shrinker);
        if (err)
                goto err_hzp_shrinker;
+       err = register_shrinker(&deferred_split_shrinker);
+       if (err)
+               goto err_split_shrinker;
  
        /*
         * By default disable transparent hugepages on smaller systems,
  
        return 0;
  err_khugepaged:
+       unregister_shrinker(&deferred_split_shrinker);
+ err_split_shrinker:
        unregister_shrinker(&huge_zero_page_shrinker);
  err_hzp_shrinker:
        khugepaged_slab_exit();
@@@ -711,6 -779,27 +779,27 @@@ static inline pmd_t mk_huge_pmd(struct 
        return entry;
  }
  
+ static inline struct list_head *page_deferred_list(struct page *page)
+ {
+       /*
+        * ->lru in the tail pages is occupied by compound_head.
+        * Let's use ->mapping + ->index in the second tail page as list_head.
+        */
+       return (struct list_head *)&page[2].mapping;
+ }
+ void prep_transhuge_page(struct page *page)
+ {
+       /*
+        * we use page->mapping and page->index in the second tail page
+        * as list_head: assuming THP order >= 2
+        */
+       BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+       INIT_LIST_HEAD(page_deferred_list(page));
+       set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+ }
  static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmd,
  
        VM_BUG_ON_PAGE(!PageCompound(page), page);
  
-       if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+       if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
  
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable)) {
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, true);
                put_page(page);
                return VM_FAULT_OOM;
        }
        ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_none(*pmd))) {
                spin_unlock(ptl);
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, true);
                put_page(page);
                pte_free(mm, pgtable);
        } else {
                        int ret;
  
                        spin_unlock(ptl);
-                       mem_cgroup_cancel_charge(page, memcg);
+                       mem_cgroup_cancel_charge(page, memcg, true);
                        put_page(page);
                        pte_free(mm, pgtable);
                        ret = handle_userfault(vma, address, flags,
  
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               page_add_new_anon_rmap(page, vma, haddr);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, haddr, true);
+               mem_cgroup_commit_charge(page, memcg, false, true);
                lru_cache_add_active_or_unevictable(page, vma);
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                set_pmd_at(mm, haddr, pmd, entry);
  
  static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
  {
-       return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+       return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
  }
  
  /* Caller must hold page table lock. */
@@@ -865,6 -954,7 +954,7 @@@ int do_huge_pmd_anonymous_page(struct m
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
+       prep_transhuge_page(page);
        return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
                                            flags);
  }
@@@ -956,19 -1046,10 +1046,10 @@@ int copy_huge_pmd(struct mm_struct *dst
                goto out_unlock;
        }
  
-       if (unlikely(pmd_trans_splitting(pmd))) {
-               /* split huge page running from under us */
-               spin_unlock(src_ptl);
-               spin_unlock(dst_ptl);
-               pte_free(dst_mm, pgtable);
-               wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
-               goto out;
-       }
        src_page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
        get_page(src_page);
-       page_dup_rmap(src_page);
+       page_dup_rmap(src_page, true);
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@@ -1008,37 -1089,6 +1089,6 @@@ unlock
        spin_unlock(ptl);
  }
  
- /*
-  * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
-  * during copy_user_huge_page()'s copy_page_rep(): in the case when
-  * the source page gets split and a tail freed before copy completes.
-  * Called under pmd_lock of checked pmd, so safe from splitting itself.
-  */
- static void get_user_huge_page(struct page *page)
- {
-       if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
-               struct page *endpage = page + HPAGE_PMD_NR;
-               atomic_add(HPAGE_PMD_NR, &page->_count);
-               while (++page < endpage)
-                       get_huge_page_tail(page);
-       } else {
-               get_page(page);
-       }
- }
- static void put_user_huge_page(struct page *page)
- {
-       if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
-               struct page *endpage = page + HPAGE_PMD_NR;
-               while (page < endpage)
-                       put_page(page++);
-       } else {
-               put_page(page);
-       }
- }
  static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address,
                                               vma, address, page_to_nid(page));
                if (unlikely(!pages[i] ||
                             mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
-                                                  &memcg))) {
+                                                  &memcg, false))) {
                        if (pages[i])
                                put_page(pages[i]);
                        while (--i >= 0) {
                                memcg = (void *)page_private(pages[i]);
                                set_page_private(pages[i], 0);
-                               mem_cgroup_cancel_charge(pages[i], memcg);
+                               mem_cgroup_cancel_charge(pages[i], memcg,
+                                               false);
                                put_page(pages[i]);
                        }
                        kfree(pages);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
-               page_add_new_anon_rmap(pages[i], vma, haddr);
-               mem_cgroup_commit_charge(pages[i], memcg, false);
+               page_add_new_anon_rmap(pages[i], vma, haddr, false);
+               mem_cgroup_commit_charge(pages[i], memcg, false, false);
                lru_cache_add_active_or_unevictable(pages[i], vma);
                pte = pte_offset_map(&_pmd, haddr);
                VM_BUG_ON(!pte_none(*pte));
  
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
-       page_remove_rmap(page);
+       page_remove_rmap(page, true);
        spin_unlock(ptl);
  
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@@ -1141,7 -1192,7 +1192,7 @@@ out_free_pages
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
-               mem_cgroup_cancel_charge(pages[i], memcg);
+               mem_cgroup_cancel_charge(pages[i], memcg, false);
                put_page(pages[i]);
        }
        kfree(pages);
@@@ -1171,7 -1222,17 +1222,17 @@@ int do_huge_pmd_wp_page(struct mm_struc
  
        page = pmd_page(orig_pmd);
        VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-       if (page_mapcount(page) == 1) {
+       /*
+        * We can only reuse the page if nobody else maps the huge page or its
+        * part. We can do it by checking page_mapcount() on each sub-page, but
+        * it's expensive.
+        * The cheaper way is to check page_count() to be equal to 1: every
+        * mapcount takes a page reference, so this way we can guarantee that
+        * the PMD is the only mapping.
+        * This can give false negative if somebody pinned the page, but that's
+        * fine.
+        */
+       if (page_mapcount(page) == 1 && page_count(page) == 1) {
                pmd_t entry;
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                ret |= VM_FAULT_WRITE;
                goto out_unlock;
        }
-       get_user_huge_page(page);
+       get_page(page);
        spin_unlock(ptl);
  alloc:
        if (transparent_hugepage_enabled(vma) &&
        } else
                new_page = NULL;
  
-       if (unlikely(!new_page)) {
+       if (likely(new_page)) {
+               prep_transhuge_page(new_page);
+       } else {
                if (!page) {
-                       split_huge_page_pmd(vma, address, pmd);
+                       split_huge_pmd(vma, pmd, address);
                        ret |= VM_FAULT_FALLBACK;
                } else {
                        ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                        pmd, orig_pmd, page, haddr);
                        if (ret & VM_FAULT_OOM) {
-                               split_huge_page(page);
+                               split_huge_pmd(vma, pmd, address);
                                ret |= VM_FAULT_FALLBACK;
                        }
-                       put_user_huge_page(page);
+                       put_page(page);
                }
                count_vm_event(THP_FAULT_FALLBACK);
                goto out;
        }
  
-       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp,
+                                       &memcg, true))) {
                put_page(new_page);
                if (page) {
-                       split_huge_page(page);
-                       put_user_huge_page(page);
+                       split_huge_pmd(vma, pmd, address);
+                       put_page(page);
                } else
-                       split_huge_page_pmd(vma, address, pmd);
+                       split_huge_pmd(vma, pmd, address);
                ret |= VM_FAULT_FALLBACK;
                count_vm_event(THP_FAULT_FALLBACK);
                goto out;
  
        spin_lock(ptl);
        if (page)
-               put_user_huge_page(page);
+               put_page(page);
        if (unlikely(!pmd_same(*pmd, orig_pmd))) {
                spin_unlock(ptl);
-               mem_cgroup_cancel_charge(new_page, memcg);
+               mem_cgroup_cancel_charge(new_page, memcg, true);
                put_page(new_page);
                goto out_mn;
        } else {
                entry = mk_huge_pmd(new_page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-               page_add_new_anon_rmap(new_page, vma, haddr);
-               mem_cgroup_commit_charge(new_page, memcg, false);
+               page_add_new_anon_rmap(new_page, vma, haddr, true);
+               mem_cgroup_commit_charge(new_page, memcg, false, true);
                lru_cache_add_active_or_unevictable(new_page, vma);
                set_pmd_at(mm, haddr, pmd, entry);
                update_mmu_cache_pmd(vma, address, pmd);
                        put_huge_zero_page();
                } else {
                        VM_BUG_ON_PAGE(!PageHead(page), page);
-                       page_remove_rmap(page);
+                       page_remove_rmap(page, true);
                        put_page(page);
                }
                ret |= VM_FAULT_WRITE;
@@@ -1307,8 -1371,21 +1371,21 @@@ struct page *follow_trans_huge_pmd(stru
                                          pmd, _pmd,  1))
                        update_mmu_cache_pmd(vma, addr, pmd);
        }
-       if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
-               if (page->mapping && trylock_page(page)) {
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /*
+                * We don't mlock() pte-mapped THPs. This way we can avoid
+                * leaking mlocked pages into non-VM_LOCKED VMAs.
+                *
+                * In most cases the pmd is the only mapping of the page as we
+                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                * writable private mappings in populate_vma_page_range().
+                *
+                * The only scenario when we have the page shared here is if we
+                * mlocking read-only mapping shared over fork(). We skip
+                * mlocking such pages.
+                */
+               if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+                               page->mapping && trylock_page(page)) {
                        lru_add_drain();
                        if (page->mapping)
                                mlock_vma_page(page);
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        VM_BUG_ON_PAGE(!PageCompound(page), page);
        if (flags & FOLL_GET)
-               get_page_foll(page);
+               get_page(page);
  
  out:
        return page;
        return 0;
  }
  
+ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                pmd_t *pmd, unsigned long addr)
+ {
+       spinlock_t *ptl;
+       struct mm_struct *mm = tlb->mm;
+       int ret = 1;
+       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+               struct page *page;
+               pmd_t orig_pmd;
+               if (is_huge_zero_pmd(*pmd))
+                       goto out;
+               orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+               /* No hugepage in swapcache */
+               page = pmd_page(orig_pmd);
+               VM_BUG_ON_PAGE(PageSwapCache(page), page);
+               orig_pmd = pmd_mkold(orig_pmd);
+               orig_pmd = pmd_mkclean(orig_pmd);
+               set_pmd_at(mm, addr, pmd, orig_pmd);
+               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ out:
+               spin_unlock(ptl);
+               ret = 0;
+       }
+       return ret;
+ }
  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
  {
        pmd_t orig_pmd;
        spinlock_t *ptl;
  
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+       if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
                put_huge_zero_page();
        } else {
                struct page *page = pmd_page(orig_pmd);
-               page_remove_rmap(page);
+               page_remove_rmap(page, true);
                VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
                add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                VM_BUG_ON_PAGE(!PageHead(page), page);
        return 1;
  }
  
- int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                  unsigned long old_addr,
                  unsigned long new_addr, unsigned long old_end,
                  pmd_t *old_pmd, pmd_t *new_pmd)
  {
        spinlock_t *old_ptl, *new_ptl;
-       int ret = 0;
        pmd_t pmd;
  
        struct mm_struct *mm = vma->vm_mm;
            (new_addr & ~HPAGE_PMD_MASK) ||
            old_end - old_addr < HPAGE_PMD_SIZE ||
            (new_vma->vm_flags & VM_NOHUGEPAGE))
-               goto out;
+               return false;
  
        /*
         * The destination pmd shouldn't be established, free_pgtables()
         */
        if (WARN_ON(!pmd_none(*new_pmd))) {
                VM_BUG_ON(pmd_trans_huge(*new_pmd));
-               goto out;
+               return false;
        }
  
        /*
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_sem prevents deadlock.
         */
-       ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
-       if (ret == 1) {
+       if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
                if (new_ptl != old_ptl)
                        spin_unlock(new_ptl);
                spin_unlock(old_ptl);
+               return true;
        }
- out:
-       return ret;
+       return false;
  }
  
  /*
@@@ -1558,7 -1667,7 +1667,7 @@@ int change_huge_pmd(struct vm_area_stru
        spinlock_t *ptl;
        int ret = 0;
  
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
                pmd_t entry;
                bool preserve_write = prot_numa && pmd_write(*pmd);
                ret = 1;
  }
  
  /*
-  * Returns 1 if a given pmd maps a stable (not under splitting) thp.
-  * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+  * Returns true if a given pmd maps a thp, false otherwise.
   *
-  * Note that if it returns 1, this routine returns without unlocking page
-  * table locks. So callers must unlock them.
+  * Note that if it returns true, this routine returns without unlocking page
+  * table lock. So callers must unlock it.
   */
- int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
  {
        *ptl = pmd_lock(vma->vm_mm, pmd);
-       if (likely(pmd_trans_huge(*pmd))) {
-               if (unlikely(pmd_trans_splitting(*pmd))) {
-                       spin_unlock(*ptl);
-                       wait_split_huge_page(vma->anon_vma, pmd);
-                       return -1;
-               } else {
-                       /* Thp mapped by 'pmd' is stable, so we can
-                        * handle it as it is. */
-                       return 1;
-               }
-       }
+       if (likely(pmd_trans_huge(*pmd)))
+               return true;
        spin_unlock(*ptl);
-       return 0;
+       return false;
  }
  
  /*
  pmd_t *page_check_address_pmd(struct page *page,
                              struct mm_struct *mm,
                              unsigned long address,
-                             enum page_check_address_pmd_flag flag,
                              spinlock_t **ptl)
  {
        pgd_t *pgd;
                goto unlock;
        if (pmd_page(*pmd) != page)
                goto unlock;
-       /*
-        * split_vma() may create temporary aliased mappings. There is
-        * no risk as long as all huge pmd are found and have their
-        * splitting bit set before __split_huge_page_refcount
-        * runs. Finding the same huge pmd more than once during the
-        * same rmap walk is not a problem.
-        */
-       if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
-           pmd_trans_splitting(*pmd))
-               goto unlock;
-       if (pmd_trans_huge(*pmd)) {
-               VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
-                         !pmd_trans_splitting(*pmd));
+       if (pmd_trans_huge(*pmd))
                return pmd;
-       }
  unlock:
        spin_unlock(*ptl);
        return NULL;
  }
  
- static int __split_huge_page_splitting(struct page *page,
-                                      struct vm_area_struct *vma,
-                                      unsigned long address)
- {
-       struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
-       pmd_t *pmd;
-       int ret = 0;
-       /* For mmu_notifiers */
-       const unsigned long mmun_start = address;
-       const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       pmd = page_check_address_pmd(page, mm, address,
-                       PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
-       if (pmd) {
-               /*
-                * We can't temporarily set the pmd to null in order
-                * to split it, the pmd must remain marked huge at all
-                * times or the VM won't take the pmd_trans_huge paths
-                * and it won't wait on the anon_vma->root->rwsem to
-                * serialize against split_huge_page*.
-                */
-               pmdp_splitting_flush(vma, address, pmd);
-               ret = 1;
-               spin_unlock(ptl);
-       }
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-       return ret;
- }
- static void __split_huge_page_refcount(struct page *page,
-                                      struct list_head *list)
- {
-       int i;
-       struct zone *zone = page_zone(page);
-       struct lruvec *lruvec;
-       int tail_count = 0;
-       /* prevent PageLRU to go away from under us, and freeze lru stats */
-       spin_lock_irq(&zone->lru_lock);
-       lruvec = mem_cgroup_page_lruvec(page, zone);
-       compound_lock(page);
-       /* complete memcg works before add pages to LRU */
-       mem_cgroup_split_huge_fixup(page);
-       for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
-               struct page *page_tail = page + i;
-               /* tail_page->_mapcount cannot change */
-               BUG_ON(page_mapcount(page_tail) < 0);
-               tail_count += page_mapcount(page_tail);
-               /* check for overflow */
-               BUG_ON(tail_count < 0);
-               BUG_ON(atomic_read(&page_tail->_count) != 0);
-               /*
-                * tail_page->_count is zero and not changing from
-                * under us. But get_page_unless_zero() may be running
-                * from under us on the tail_page. If we used
-                * atomic_set() below instead of atomic_add(), we
-                * would then run atomic_set() concurrently with
-                * get_page_unless_zero(), and atomic_set() is
-                * implemented in C not using locked ops. spin_unlock
-                * on x86 sometime uses locked ops because of PPro
-                * errata 66, 92, so unless somebody can guarantee
-                * atomic_set() here would be safe on all archs (and
-                * not only on x86), it's safer to use atomic_add().
-                */
-               atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
-                          &page_tail->_count);
-               /* after clearing PageTail the gup refcount can be released */
-               smp_mb__after_atomic();
-               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-               page_tail->flags |= (page->flags &
-                                    ((1L << PG_referenced) |
-                                     (1L << PG_swapbacked) |
-                                     (1L << PG_mlocked) |
-                                     (1L << PG_uptodate) |
-                                     (1L << PG_active) |
-                                     (1L << PG_unevictable)));
-               page_tail->flags |= (1L << PG_dirty);
-               /* clear PageTail before overwriting first_page */
-               smp_wmb();
-               if (page_is_young(page))
-                       set_page_young(page_tail);
-               if (page_is_idle(page))
-                       set_page_idle(page_tail);
-               /*
-                * __split_huge_page_splitting() already set the
-                * splitting bit in all pmd that could map this
-                * hugepage, that will ensure no CPU can alter the
-                * mapcount on the head page. The mapcount is only
-                * accounted in the head page and it has to be
-                * transferred to all tail pages in the below code. So
-                * for this code to be safe, the split the mapcount
-                * can't change. But that doesn't mean userland can't
-                * keep changing and reading the page contents while
-                * we transfer the mapcount, so the pmd splitting
-                * status is achieved setting a reserved bit in the
-                * pmd, not by clearing the present bit.
-               */
-               page_tail->_mapcount = page->_mapcount;
-               BUG_ON(page_tail->mapping);
-               page_tail->mapping = page->mapping;
-               page_tail->index = page->index + i;
-               page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-               BUG_ON(!PageAnon(page_tail));
-               BUG_ON(!PageUptodate(page_tail));
-               BUG_ON(!PageDirty(page_tail));
-               BUG_ON(!PageSwapBacked(page_tail));
-               lru_add_page_tail(page, page_tail, lruvec, list);
-       }
-       atomic_sub(tail_count, &page->_count);
-       BUG_ON(atomic_read(&page->_count) <= 0);
-       __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-       ClearPageCompound(page);
-       compound_unlock(page);
-       spin_unlock_irq(&zone->lru_lock);
-       for (i = 1; i < HPAGE_PMD_NR; i++) {
-               struct page *page_tail = page + i;
-               BUG_ON(page_count(page_tail) <= 0);
-               /*
-                * Tail pages may be freed if there wasn't any mapping
-                * like if add_to_swap() is running on a lru page that
-                * had its mapping zapped. And freeing these pages
-                * requires taking the lru_lock so we do the put_page
-                * of the tail pages after the split is complete.
-                */
-               put_page(page_tail);
-       }
-       /*
-        * Only the head page (now become a regular page) is required
-        * to be pinned by the caller.
-        */
-       BUG_ON(page_count(page) <= 0);
- }
- static int __split_huge_page_map(struct page *page,
-                                struct vm_area_struct *vma,
-                                unsigned long address)
- {
-       struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
-       pmd_t *pmd, _pmd;
-       int ret = 0, i;
-       pgtable_t pgtable;
-       unsigned long haddr;
-       pmd = page_check_address_pmd(page, mm, address,
-                       PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
-       if (pmd) {
-               pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-               pmd_populate(mm, &_pmd, pgtable);
-               if (pmd_write(*pmd))
-                       BUG_ON(page_mapcount(page) != 1);
-               haddr = address;
-               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-                       pte_t *pte, entry;
-                       BUG_ON(PageCompound(page+i));
-                       /*
-                        * Note that NUMA hinting access restrictions are not
-                        * transferred to avoid any possibility of altering
-                        * permissions across VMAs.
-                        */
-                       entry = mk_pte(page + i, vma->vm_page_prot);
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-                       if (!pmd_write(*pmd))
-                               entry = pte_wrprotect(entry);
-                       if (!pmd_young(*pmd))
-                               entry = pte_mkold(entry);
-                       pte = pte_offset_map(&_pmd, haddr);
-                       BUG_ON(!pte_none(*pte));
-                       set_pte_at(mm, haddr, pte, entry);
-                       pte_unmap(pte);
-               }
-               smp_wmb(); /* make pte visible before pmd */
-               /*
-                * Up to this point the pmd is present and huge and
-                * userland has the whole access to the hugepage
-                * during the split (which happens in place). If we
-                * overwrite the pmd with the not-huge version
-                * pointing to the pte here (which of course we could
-                * if all CPUs were bug free), userland could trigger
-                * a small page size TLB miss on the small sized TLB
-                * while the hugepage TLB entry is still established
-                * in the huge TLB. Some CPU doesn't like that. See
-                * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
-                * Erratum 383 on page 93. Intel should be safe but is
-                * also warns that it's only safe if the permission
-                * and cache attributes of the two entries loaded in
-                * the two TLB is identical (which should be the case
-                * here). But it is generally safer to never allow
-                * small and huge TLB entries for the same virtual
-                * address to be loaded simultaneously. So instead of
-                * doing "pmd_populate(); flush_pmd_tlb_range();" we first
-                * mark the current pmd notpresent (atomically because
-                * here the pmd_trans_huge and pmd_trans_splitting
-                * must remain set at all times on the pmd until the
-                * split is complete for this pmd), then we flush the
-                * SMP TLB and finally we write the non-huge version
-                * of the pmd entry with pmd_populate.
-                */
-               pmdp_invalidate(vma, address, pmd);
-               pmd_populate(mm, pmd, pgtable);
-               ret = 1;
-               spin_unlock(ptl);
-       }
-       return ret;
- }
- /* must be called with anon_vma->root->rwsem held */
- static void __split_huge_page(struct page *page,
-                             struct anon_vma *anon_vma,
-                             struct list_head *list)
- {
-       int mapcount, mapcount2;
-       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-       struct anon_vma_chain *avc;
-       BUG_ON(!PageHead(page));
-       BUG_ON(PageTail(page));
-       mapcount = 0;
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-               struct vm_area_struct *vma = avc->vma;
-               unsigned long addr = vma_address(page, vma);
-               BUG_ON(is_vma_temporary_stack(vma));
-               mapcount += __split_huge_page_splitting(page, vma, addr);
-       }
-       /*
-        * It is critical that new vmas are added to the tail of the
-        * anon_vma list. This guarantes that if copy_huge_pmd() runs
-        * and establishes a child pmd before
-        * __split_huge_page_splitting() freezes the parent pmd (so if
-        * we fail to prevent copy_huge_pmd() from running until the
-        * whole __split_huge_page() is complete), we will still see
-        * the newly established pmd of the child later during the
-        * walk, to be able to set it as pmd_trans_splitting too.
-        */
-       if (mapcount != page_mapcount(page)) {
-               pr_err("mapcount %d page_mapcount %d\n",
-                       mapcount, page_mapcount(page));
-               BUG();
-       }
-       __split_huge_page_refcount(page, list);
-       mapcount2 = 0;
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-               struct vm_area_struct *vma = avc->vma;
-               unsigned long addr = vma_address(page, vma);
-               BUG_ON(is_vma_temporary_stack(vma));
-               mapcount2 += __split_huge_page_map(page, vma, addr);
-       }
-       if (mapcount != mapcount2) {
-               pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
-                       mapcount, mapcount2, page_mapcount(page));
-               BUG();
-       }
- }
- /*
-  * Split a hugepage into normal pages. This doesn't change the position of head
-  * page. If @list is null, tail pages will be added to LRU list, otherwise, to
-  * @list. Both head page and tail pages will inherit mapping, flags, and so on
-  * from the hugepage.
-  * Return 0 if the hugepage is split successfully otherwise return 1.
-  */
- int split_huge_page_to_list(struct page *page, struct list_head *list)
- {
-       struct anon_vma *anon_vma;
-       int ret = 1;
-       BUG_ON(is_huge_zero_page(page));
-       BUG_ON(!PageAnon(page));
-       /*
-        * The caller does not necessarily hold an mmap_sem that would prevent
-        * the anon_vma disappearing so we first we take a reference to it
-        * and then lock the anon_vma for write. This is similar to
-        * page_lock_anon_vma_read except the write lock is taken to serialise
-        * against parallel split or collapse operations.
-        */
-       anon_vma = page_get_anon_vma(page);
-       if (!anon_vma)
-               goto out;
-       anon_vma_lock_write(anon_vma);
-       ret = 0;
-       if (!PageCompound(page))
-               goto out_unlock;
-       BUG_ON(!PageSwapBacked(page));
-       __split_huge_page(page, anon_vma, list);
-       count_vm_event(THP_SPLIT);
-       BUG_ON(PageCompound(page));
- out_unlock:
-       anon_vma_unlock_write(anon_vma);
-       put_anon_vma(anon_vma);
- out:
-       return ret;
- }
  #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
  
  int hugepage_madvise(struct vm_area_struct *vma,
@@@ -2199,26 -1961,33 +1961,33 @@@ static int __collapse_huge_page_isolate
                                        unsigned long address,
                                        pte_t *pte)
  {
-       struct page *page;
+       struct page *page = NULL;
        pte_t *_pte;
-       int none_or_zero = 0;
+       int none_or_zero = 0, result = 0;
        bool referenced = false, writable = false;
        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none)
+                           ++none_or_zero <= khugepaged_max_ptes_none) {
                                continue;
-                       else
+                       } else {
+                               result = SCAN_EXCEED_NONE_PTE;
                                goto out;
+                       }
                }
-               if (!pte_present(pteval))
+               if (!pte_present(pteval)) {
+                       result = SCAN_PTE_NON_PRESENT;
                        goto out;
+               }
                page = vm_normal_page(vma, address, pteval);
-               if (unlikely(!page))
+               if (unlikely(!page)) {
+                       result = SCAN_PAGE_NULL;
                        goto out;
+               }
  
                VM_BUG_ON_PAGE(PageCompound(page), page);
                VM_BUG_ON_PAGE(!PageAnon(page), page);
                 * is needed to serialize against split_huge_page
                 * when invoked from the VM.
                 */
-               if (!trylock_page(page))
+               if (!trylock_page(page)) {
+                       result = SCAN_PAGE_LOCK;
                        goto out;
+               }
  
                /*
                 * cannot use mapcount: can't collapse if there's a gup pin.
                 */
                if (page_count(page) != 1 + !!PageSwapCache(page)) {
                        unlock_page(page);
+                       result = SCAN_PAGE_COUNT;
                        goto out;
                }
                if (pte_write(pteval)) {
                } else {
                        if (PageSwapCache(page) && !reuse_swap_page(page)) {
                                unlock_page(page);
+                               result = SCAN_SWAP_CACHE_PAGE;
                                goto out;
                        }
                        /*
                 */
                if (isolate_lru_page(page)) {
                        unlock_page(page);
+                       result = SCAN_DEL_PAGE_LRU;
                        goto out;
                }
                /* 0 stands for page_is_file_cache(page) == false */
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
-       if (likely(referenced && writable))
-               return 1;
+       if (likely(writable)) {
+               if (likely(referenced)) {
+                       result = SCAN_SUCCEED;
+                       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                                                           referenced, writable, result);
+                       return 1;
+               }
+       } else {
+               result = SCAN_PAGE_RO;
+       }
  out:
        release_pte_pages(pte, _pte);
+       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                                           referenced, writable, result);
        return 0;
  }
  
@@@ -2322,7 -2107,7 +2107,7 @@@ static void __collapse_huge_page_copy(p
                         * superfluous.
                         */
                        pte_clear(vma->vm_mm, address, _pte);
-                       page_remove_rmap(src_page);
+                       page_remove_rmap(src_page, false);
                        spin_unlock(ptl);
                        free_page_and_swap_cache(src_page);
                }
@@@ -2433,6 -2218,7 +2218,7 @@@ khugepaged_alloc_page(struct page **hpa
                return NULL;
        }
  
+       prep_transhuge_page(*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return *hpage;
  }
@@@ -2444,8 -2230,12 +2230,12 @@@ static int khugepaged_find_target_node(
  
  static inline struct page *alloc_hugepage(int defrag)
  {
-       return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-                          HPAGE_PMD_ORDER);
+       struct page *page;
+       page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+       if (page)
+               prep_transhuge_page(page);
+       return page;
  }
  
  static struct page *khugepaged_alloc_hugepage(bool *wait)
@@@ -2496,7 -2286,6 +2286,6 @@@ static bool hugepage_vma_check(struct v
        if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
            (vma->vm_flags & VM_NOHUGEPAGE))
                return false;
        if (!vma->anon_vma || vma->vm_ops)
                return false;
        if (is_vma_temporary_stack(vma))
        return true;
  }
  
+ /*
+  * Bring missing pages in from swap, to complete THP collapse.
+  * Only done if khugepaged_scan_pmd believes it is worthwhile.
+  *
+  * Called and returns without pte mapped or spinlocks held,
+  * but with mmap_sem held to protect against vma changes.
+  */
+ static void __collapse_huge_page_swapin(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       unsigned long address, pmd_t *pmd)
+ {
+       unsigned long _address;
+       pte_t *pte, pteval;
+       int swapped_in = 0, ret = 0;
+       pte = pte_offset_map(pmd, address);
+       for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+            pte++, _address += PAGE_SIZE) {
+               pteval = *pte;
+               if (!is_swap_pte(pteval))
+                       continue;
+               swapped_in++;
+               ret = do_swap_page(mm, vma, _address, pte, pmd,
+                                  FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+                                  pteval);
+               if (ret & VM_FAULT_ERROR) {
+                       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+                       return;
+               }
+               /* pte is unmapped now, we need to map it */
+               pte = pte_offset_map(pmd, _address);
+       }
+       pte--;
+       pte_unmap(pte);
+       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+ }
  static void collapse_huge_page(struct mm_struct *mm,
                                   unsigned long address,
                                   struct page **hpage,
        pgtable_t pgtable;
        struct page *new_page;
        spinlock_t *pmd_ptl, *pte_ptl;
-       int isolated;
+       int isolated = 0, result = 0;
        unsigned long hstart, hend;
        struct mem_cgroup *memcg;
        unsigned long mmun_start;       /* For mmu_notifiers */
  
        /* release the mmap_sem read lock. */
        new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
-       if (!new_page)
-               return;
+       if (!new_page) {
+               result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+               goto out_nolock;
+       }
  
-       if (unlikely(mem_cgroup_try_charge(new_page, mm,
-                                          gfp, &memcg)))
-               return;
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+               result = SCAN_CGROUP_CHARGE_FAIL;
+               goto out_nolock;
+       }
  
        /*
         * Prevent all access to pagetables with the exception of
         * handled by the anon_vma lock + PG_lock.
         */
        down_write(&mm->mmap_sem);
-       if (unlikely(khugepaged_test_exit(mm)))
+       if (unlikely(khugepaged_test_exit(mm))) {
+               result = SCAN_ANY_PROCESS;
                goto out;
+       }
  
        vma = find_vma(mm, address);
-       if (!vma)
+       if (!vma) {
+               result = SCAN_VMA_NULL;
                goto out;
+       }
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
-       if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+       if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+               result = SCAN_ADDRESS_RANGE;
                goto out;
-       if (!hugepage_vma_check(vma))
+       }
+       if (!hugepage_vma_check(vma)) {
+               result = SCAN_VMA_CHECK;
                goto out;
+       }
        pmd = mm_find_pmd(mm, address);
-       if (!pmd)
+       if (!pmd) {
+               result = SCAN_PMD_NULL;
                goto out;
+       }
+       __collapse_huge_page_swapin(mm, vma, address, pmd);
  
        anon_vma_lock_write(vma->anon_vma);
  
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
+               result = SCAN_FAIL;
                goto out;
        }
  
  
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
-       page_add_new_anon_rmap(new_page, vma, address);
-       mem_cgroup_commit_charge(new_page, memcg, false);
+       page_add_new_anon_rmap(new_page, vma, address, true);
+       mem_cgroup_commit_charge(new_page, memcg, false, true);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        *hpage = NULL;
  
        khugepaged_pages_collapsed++;
+       result = SCAN_SUCCEED;
  out_up_write:
        up_write(&mm->mmap_sem);
+ out_nolock:
+       trace_mm_collapse_huge_page(mm, isolated, result);
        return;
  out:
-       mem_cgroup_cancel_charge(new_page, memcg);
+       mem_cgroup_cancel_charge(new_page, memcg, true);
        goto out_up_write;
  }
  
@@@ -2648,39 -2493,62 +2493,62 @@@ static int khugepaged_scan_pmd(struct m
  {
        pmd_t *pmd;
        pte_t *pte, *_pte;
-       int ret = 0, none_or_zero = 0;
-       struct page *page;
+       int ret = 0, none_or_zero = 0, result = 0;
+       struct page *page = NULL;
        unsigned long _address;
        spinlock_t *ptl;
-       int node = NUMA_NO_NODE;
+       int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false, referenced = false;
  
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
  
        pmd = mm_find_pmd(mm, address);
-       if (!pmd)
+       if (!pmd) {
+               result = SCAN_PMD_NULL;
                goto out;
+       }
  
        memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
+               if (is_swap_pte(pteval)) {
+                       if (++unmapped <= khugepaged_max_ptes_swap) {
+                               continue;
+                       } else {
+                               result = SCAN_EXCEED_SWAP_PTE;
+                               goto out_unmap;
+                       }
+               }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none)
+                           ++none_or_zero <= khugepaged_max_ptes_none) {
                                continue;
-                       else
+                       } else {
+                               result = SCAN_EXCEED_NONE_PTE;
                                goto out_unmap;
+                       }
                }
-               if (!pte_present(pteval))
+               if (!pte_present(pteval)) {
+                       result = SCAN_PTE_NON_PRESENT;
                        goto out_unmap;
+               }
                if (pte_write(pteval))
                        writable = true;
  
                page = vm_normal_page(vma, _address, pteval);
-               if (unlikely(!page))
+               if (unlikely(!page)) {
+                       result = SCAN_PAGE_NULL;
+                       goto out_unmap;
+               }
+               /* TODO: teach khugepaged to collapse THP mapped with pte */
+               if (PageCompound(page)) {
+                       result = SCAN_PAGE_COMPOUND;
                        goto out_unmap;
+               }
                /*
                 * Record which node the original page is from and save this
                 * information to khugepaged_node_load[].
                 * hit record.
                 */
                node = page_to_nid(page);
-               if (khugepaged_scan_abort(node))
+               if (khugepaged_scan_abort(node)) {
+                       result = SCAN_SCAN_ABORT;
                        goto out_unmap;
+               }
                khugepaged_node_load[node]++;
-               VM_BUG_ON_PAGE(PageCompound(page), page);
-               if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+               if (!PageLRU(page)) {
+                       result = SCAN_SCAN_ABORT;
+                       goto out_unmap;
+               }
+               if (PageLocked(page)) {
+                       result = SCAN_PAGE_LOCK;
+                       goto out_unmap;
+               }
+               if (!PageAnon(page)) {
+                       result = SCAN_PAGE_ANON;
                        goto out_unmap;
+               }
                /*
                 * cannot use mapcount: can't collapse if there's a gup pin.
                 * The page must only be referenced by the scanned process
                 * and page swap cache.
                 */
-               if (page_count(page) != 1 + !!PageSwapCache(page))
+               if (page_count(page) != 1 + !!PageSwapCache(page)) {
+                       result = SCAN_PAGE_COUNT;
                        goto out_unmap;
+               }
                if (pte_young(pteval) ||
                    page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
-       if (referenced && writable)
-               ret = 1;
+       if (writable) {
+               if (referenced) {
+                       result = SCAN_SUCCEED;
+                       ret = 1;
+               } else {
+                       result = SCAN_NO_REFERENCED_PAGE;
+               }
+       } else {
+               result = SCAN_PAGE_RO;
+       }
  out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (ret) {
                collapse_huge_page(mm, address, hpage, vma, node);
        }
  out:
+       trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+                                    none_or_zero, result, unmapped);
        return ret;
  }
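
khugepaged_scan_pmd() now tolerates a bounded number of swapped-out PTEs (khugepaged_max_ptes_swap) on top of the existing none/zero-PTE budget before it refuses to collapse the range. The self-contained model below shows only that counting; the thresholds are assumptions (the real values are sysfs tunables) and the PTE classification is simplified.

/* Self-contained model of the PTE budgeting only; the thresholds below are
 * assumptions (the real ones are sysfs tunables) and the PTE classification
 * is simplified. */
#include <stdio.h>

#define HPAGE_PMD_NR 512

enum pte_kind { PTE_PRESENT, PTE_NONE, PTE_SWAP };
enum scan_result { SCAN_SUCCEED, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_NONE_PTE };

static int max_ptes_swap = 64;                  /* assumed default */
static int max_ptes_none = HPAGE_PMD_NR - 1;    /* assumed default */

static enum scan_result scan_model(const enum pte_kind pte[HPAGE_PMD_NR])
{
	int none_or_zero = 0, unmapped = 0;

	for (int i = 0; i < HPAGE_PMD_NR; i++) {
		if (pte[i] == PTE_SWAP && ++unmapped > max_ptes_swap)
			return SCAN_EXCEED_SWAP_PTE;
		if (pte[i] == PTE_NONE && ++none_or_zero > max_ptes_none)
			return SCAN_EXCEED_NONE_PTE;
	}
	return SCAN_SUCCEED;    /* few enough holes: worth collapsing */
}

int main(void)
{
	enum pte_kind pte[HPAGE_PMD_NR] = { PTE_PRESENT };

	for (int i = 0; i < 100; i++)
		pte[i] = PTE_SWAP;                  /* 100 swapped-out PTEs > 64 */
	printf("result = %d\n", scan_model(pte));   /* 1: SCAN_EXCEED_SWAP_PTE */
	return 0;
}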
  
@@@ -2941,8 -2833,8 +2833,8 @@@ static void __split_huge_zero_page_pmd(
        pmd_t _pmd;
        int i;
  
-       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
  
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
        put_huge_zero_page();
  }
  
- void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
-               pmd_t *pmd)
+ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long haddr, bool freeze)
  {
-       spinlock_t *ptl;
-       struct page *page = NULL;
        struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & HPAGE_PMD_MASK;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
+       struct page *page;
+       pgtable_t pgtable;
+       pmd_t _pmd;
+       bool young, write;
+       int i;
  
-       BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+       VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+       VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+       VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+       VM_BUG_ON(!pmd_trans_huge(*pmd));
+       count_vm_event(THP_SPLIT_PMD);
  
-       mmun_start = haddr;
-       mmun_end   = haddr + HPAGE_PMD_SIZE;
- again:
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       ptl = pmd_lock(mm, pmd);
-       if (unlikely(!pmd_trans_huge(*pmd)))
-               goto unlock;
        if (vma_is_dax(vma)) {
                pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
                if (is_huge_zero_pmd(_pmd))
                        put_huge_zero_page();
+               return;
        } else if (is_huge_zero_pmd(*pmd)) {
-               __split_huge_zero_page_pmd(vma, haddr, pmd);
-       } else {
-               page = pmd_page(*pmd);
-               VM_BUG_ON_PAGE(!page_count(page), page);
-               get_page(page);
+               return __split_huge_zero_page_pmd(vma, haddr, pmd);
        }
-  unlock:
-       spin_unlock(ptl);
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
  
-       if (!page)
-               return;
+       page = pmd_page(*pmd);
+       VM_BUG_ON_PAGE(!page_count(page), page);
+       atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+       write = pmd_write(*pmd);
+       young = pmd_young(*pmd);
  
-       split_huge_page(page);
-       put_page(page);
+       /* leave pmd empty until pte is filled */
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+       pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+       pmd_populate(mm, &_pmd, pgtable);
+       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+               pte_t entry, *pte;
+               /*
+                * Note that NUMA hinting access restrictions are not
+                * transferred to avoid any possibility of altering
+                * permissions across VMAs.
+                */
+               if (freeze) {
+                       swp_entry_t swp_entry;
+                       swp_entry = make_migration_entry(page + i, write);
+                       entry = swp_entry_to_pte(swp_entry);
+               } else {
+                       entry = mk_pte(page + i, vma->vm_page_prot);
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       if (!write)
+                               entry = pte_wrprotect(entry);
+                       if (!young)
+                               entry = pte_mkold(entry);
+               }
+               pte = pte_offset_map(&_pmd, haddr);
+               BUG_ON(!pte_none(*pte));
+               set_pte_at(mm, haddr, pte, entry);
+               atomic_inc(&page[i]._mapcount);
+               pte_unmap(pte);
+       }
  
        /*
-        * We don't always have down_write of mmap_sem here: a racing
-        * do_huge_pmd_wp_page() might have copied-on-write to another
-        * huge page before our split_huge_page() got the anon_vma lock.
+        * Set PG_double_map before dropping compound_mapcount to avoid
+        * false-negative page_mapped().
         */
-       if (unlikely(pmd_trans_huge(*pmd)))
-               goto again;
+       if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+               for (i = 0; i < HPAGE_PMD_NR; i++)
+                       atomic_inc(&page[i]._mapcount);
+       }
+       if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+               /* Last compound_mapcount is gone. */
+               __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+               if (TestClearPageDoubleMap(page)) {
+                       /* No need in mapcount reference anymore */
+                       for (i = 0; i < HPAGE_PMD_NR; i++)
+                               atomic_dec(&page[i]._mapcount);
+               }
+       }
+       smp_wmb(); /* make pte visible before pmd */
+       pmd_populate(mm, pmd, pgtable);
  }
  
- void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd)
+ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long address)
  {
-       struct vm_area_struct *vma;
+       spinlock_t *ptl;
+       struct mm_struct *mm = vma->vm_mm;
+       struct page *page = NULL;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
  
-       vma = find_vma(mm, address);
-       BUG_ON(vma == NULL);
-       split_huge_page_pmd(vma, address, pmd);
+       mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       ptl = pmd_lock(mm, pmd);
+       if (unlikely(!pmd_trans_huge(*pmd)))
+               goto out;
+       page = pmd_page(*pmd);
+       __split_huge_pmd_locked(vma, pmd, haddr, false);
+       if (PageMlocked(page))
+               get_page(page);
+       else
+               page = NULL;
+ out:
+       spin_unlock(ptl);
+       mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       if (page) {
+               lock_page(page);
+               munlock_vma_page(page);
+               unlock_page(page);
+               put_page(page);
+       }
  }
  
- static void split_huge_page_address(struct mm_struct *mm,
+ static void split_huge_pmd_address(struct vm_area_struct *vma,
                                    unsigned long address)
  {
        pgd_t *pgd;
  
        VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
  
-       pgd = pgd_offset(mm, address);
+       pgd = pgd_offset(vma->vm_mm, address);
        if (!pgd_present(*pgd))
                return;
  
                return;
  
        pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
+       if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
         * materialize from under us.
         */
-       split_huge_page_pmd_mm(mm, address, pmd);
+       split_huge_pmd(vma, pmd, address);
  }
  
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
        if (start & ~HPAGE_PMD_MASK &&
            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_page_address(vma->vm_mm, start);
+               split_huge_pmd_address(vma, start);
  
        /*
         * If the new end address isn't hpage aligned and it could
        if (end & ~HPAGE_PMD_MASK &&
            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_page_address(vma->vm_mm, end);
+               split_huge_pmd_address(vma, end);
  
        /*
         * If we're also updating the vma->vm_next->vm_start, if the new
                if (nstart & ~HPAGE_PMD_MASK &&
                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-                       split_huge_page_address(next->vm_mm, nstart);
+                       split_huge_pmd_address(next, nstart);
+       }
+ }
+ static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+ {
+       spinlock_t *ptl;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       int i;
+       pgd = pgd_offset(vma->vm_mm, address);
+       if (!pgd_present(*pgd))
+               return;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+       pmd = pmd_offset(pud, address);
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       if (!pmd_present(*pmd)) {
+               spin_unlock(ptl);
+               return;
+       }
+       if (pmd_trans_huge(*pmd)) {
+               if (page == pmd_page(*pmd))
+                       __split_huge_pmd_locked(vma, pmd, address, true);
+               spin_unlock(ptl);
+               return;
+       }
+       spin_unlock(ptl);
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+               pte_t entry, swp_pte;
+               swp_entry_t swp_entry;
+               if (!pte_present(pte[i]))
+                       continue;
+               if (page_to_pfn(page) != pte_pfn(pte[i]))
+                       continue;
+               flush_cache_page(vma, address, page_to_pfn(page));
+               entry = ptep_clear_flush(vma, address, pte + i);
+               swp_entry = make_migration_entry(page, pte_write(entry));
+               swp_pte = swp_entry_to_pte(swp_entry);
+               if (pte_soft_dirty(entry))
+                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+               set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+       }
+       pte_unmap_unlock(pte, ptl);
+ }
+ static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+ {
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+                       pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long haddr;
+               haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
+               freeze_page_vma(avc->vma, page, haddr);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
+       }
+ }
+ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+ {
+       spinlock_t *ptl;
+       pmd_t *pmd;
+       pte_t *pte, entry;
+       swp_entry_t swp_entry;
+       int i;
+       pmd = mm_find_pmd(vma->vm_mm, address);
+       if (!pmd)
+               return;
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+               if (!page_mapped(page))
+                       continue;
+               if (!is_swap_pte(pte[i]))
+                       continue;
+               swp_entry = pte_to_swp_entry(pte[i]);
+               if (!is_migration_entry(swp_entry))
+                       continue;
+               if (migration_entry_to_page(swp_entry) != page)
+                       continue;
+               entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+               entry = pte_mkdirty(entry);
+               if (is_write_migration_entry(swp_entry))
+                       entry = maybe_mkwrite(entry, vma);
+               flush_dcache_page(page);
+               set_pte_at(vma->vm_mm, address, pte + i, entry);
+               /* No need to invalidate - it was non-present before */
+               update_mmu_cache(vma, address, pte + i);
+       }
+       pte_unmap_unlock(pte, ptl);
+ }
+ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+ {
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+                       pgoff, pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long address = __vma_address(page, avc->vma);
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+               unfreeze_page_vma(avc->vma, page, address);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
        }
  }
+ static int total_mapcount(struct page *page)
+ {
+       int i, ret;
+       ret = compound_mapcount(page);
+       for (i = 0; i < HPAGE_PMD_NR; i++)
+               ret += atomic_read(&page[i]._mapcount) + 1;
+       if (PageDoubleMap(page))
+               ret -= HPAGE_PMD_NR;
+       return ret;
+ }
+ static int __split_huge_page_tail(struct page *head, int tail,
+               struct lruvec *lruvec, struct list_head *list)
+ {
+       int mapcount;
+       struct page *page_tail = head + tail;
+       mapcount = atomic_read(&page_tail->_mapcount) + 1;
+       VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+       /*
+        * tail_page->_count is zero and not changing from under us. But
+        * get_page_unless_zero() may be running from under us on the
+        * tail_page. If we used atomic_set() below instead of atomic_add(), we
+        * would then run atomic_set() concurrently with
+        * get_page_unless_zero(), and atomic_set() is implemented in C not
+        * using locked ops. spin_unlock on x86 sometimes uses locked ops
+        * because of PPro errata 66, 92, so unless somebody can guarantee
+        * atomic_set() here would be safe on all archs (and not only on x86),
+        * it's safer to use atomic_add().
+        */
+       atomic_add(mapcount + 1, &page_tail->_count);
+       /* after clearing PageTail the gup refcount can be released */
+       smp_mb__after_atomic();
+       page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+       page_tail->flags |= (head->flags &
+                       ((1L << PG_referenced) |
+                        (1L << PG_swapbacked) |
+                        (1L << PG_mlocked) |
+                        (1L << PG_uptodate) |
+                        (1L << PG_active) |
+                        (1L << PG_locked) |
+                        (1L << PG_unevictable)));
+       page_tail->flags |= (1L << PG_dirty);
+       clear_compound_head(page_tail);
+       if (page_is_young(head))
+               set_page_young(page_tail);
+       if (page_is_idle(head))
+               set_page_idle(page_tail);
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+       page_tail->index = head->index + tail;
+       page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+       lru_add_page_tail(head, page_tail, lruvec, list);
+       return mapcount;
+ }
+ static void __split_huge_page(struct page *page, struct list_head *list)
+ {
+       struct page *head = compound_head(page);
+       struct zone *zone = page_zone(head);
+       struct lruvec *lruvec;
+       int i, tail_mapcount;
+       /* prevent PageLRU to go away from under us, and freeze lru stats */
+       spin_lock_irq(&zone->lru_lock);
+       lruvec = mem_cgroup_page_lruvec(head, zone);
+       spin_lock(&split_queue_lock);
+       if (!list_empty(page_deferred_list(head))) {
+               split_queue_len--;
+               list_del(page_deferred_list(head));
+       }
+       spin_unlock(&split_queue_lock);
+       /* complete memcg works before add pages to LRU */
+       mem_cgroup_split_huge_fixup(head);
+       tail_mapcount = 0;
+       for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+               tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+       atomic_sub(tail_mapcount, &head->_count);
+       ClearPageCompound(head);
+       spin_unlock_irq(&zone->lru_lock);
+       unfreeze_page(page_anon_vma(head), head);
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               struct page *subpage = head + i;
+               if (subpage == page)
+                       continue;
+               unlock_page(subpage);
+               /*
+                * Subpages may be freed if there wasn't any mapping
+                * like if add_to_swap() is running on a lru page that
+                * had its mapping zapped. And freeing these pages
+                * requires taking the lru_lock so we do the put_page
+                * of the tail pages after the split is complete.
+                */
+               put_page(subpage);
+       }
+ }
+ /*
+  * This function splits huge page into normal pages. @page can point to any
+  * subpage of huge page to split. Split doesn't change the position of @page.
+  *
+  * The caller must hold the only pin on @page, otherwise split fails with -EBUSY.
+  * The huge page must be locked.
+  *
+  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+  *
+  * Both head page and tail pages will inherit mapping, flags, and so on from
+  * the hugepage.
+  *
+  * The GUP pin and PG_locked are transferred to @page. The remaining subpages
+  * can be freed if they are not mapped.
+  *
+  * Returns 0 if the hugepage is split successfully.
+  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+  * us.
+  */
+ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ {
+       struct page *head = compound_head(page);
+       struct anon_vma *anon_vma;
+       int count, mapcount, ret;
+       VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+       VM_BUG_ON_PAGE(!PageAnon(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+       VM_BUG_ON_PAGE(!PageCompound(page), page);
+       /*
+        * The caller does not necessarily hold an mmap_sem that would prevent
+        * the anon_vma disappearing, so we first take a reference to it
+        * and then lock the anon_vma for write. This is similar to
+        * page_lock_anon_vma_read except the write lock is taken to serialise
+        * against parallel split or collapse operations.
+        */
+       anon_vma = page_get_anon_vma(head);
+       if (!anon_vma) {
+               ret = -EBUSY;
+               goto out;
+       }
+       anon_vma_lock_write(anon_vma);
+       /*
+        * Racy check if we can split the page, before freeze_page() will
+        * split PMDs
+        */
+       if (total_mapcount(head) != page_count(head) - 1) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+       freeze_page(anon_vma, head);
+       VM_BUG_ON_PAGE(compound_mapcount(head), head);
+       count = page_count(head);
+       mapcount = total_mapcount(head);
+       if (mapcount == count - 1) {
+               __split_huge_page(page, list);
+               ret = 0;
+       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+               pr_alert("total_mapcount: %u, page_count(): %u\n",
+                               mapcount, count);
+               if (PageTail(page))
+                       dump_page(head, NULL);
+               dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+               BUG();
+       } else {
+               unfreeze_page(anon_vma, head);
+               ret = -EBUSY;
+       }
+ out_unlock:
+       anon_vma_unlock_write(anon_vma);
+       put_anon_vma(anon_vma);
+ out:
+       count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+       return ret;
+ }
+ void free_transhuge_page(struct page *page)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (!list_empty(page_deferred_list(page))) {
+               split_queue_len--;
+               list_del(page_deferred_list(page));
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       free_compound_page(page);
+ }
+ void deferred_split_huge_page(struct page *page)
+ {
+       unsigned long flags;
+       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (list_empty(page_deferred_list(page))) {
+               list_add_tail(page_deferred_list(page), &split_queue);
+               split_queue_len++;
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+ }
+ static unsigned long deferred_split_count(struct shrinker *shrink,
+               struct shrink_control *sc)
+ {
+       /*
+        * Splitting a page from split_queue frees up at least one page and
+        * at most HPAGE_PMD_NR - 1. We don't track the exact number, so
+        * let's use HPAGE_PMD_NR / 2 as a ballpark.
+        */
+       return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+ }
+ static unsigned long deferred_split_scan(struct shrinker *shrink,
+               struct shrink_control *sc)
+ {
+       unsigned long flags;
+       LIST_HEAD(list), *pos, *next;
+       struct page *page;
+       int split = 0;
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_init(&split_queue, &list);
+       /* Take pin on all head pages to avoid freeing them under us */
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               page = compound_head(page);
+               /* race with put_compound_page() */
+               if (!get_page_unless_zero(page)) {
+                       list_del_init(page_deferred_list(page));
+                       split_queue_len--;
+               }
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               lock_page(page);
+               /* split_huge_page() removes page from list on success */
+               if (!split_huge_page(page))
+                       split++;
+               unlock_page(page);
+               put_page(page);
+       }
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_tail(&list, &split_queue);
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       return split * HPAGE_PMD_NR / 2;
+ }
+ static struct shrinker deferred_split_shrinker = {
+       .count_objects = deferred_split_count,
+       .scan_objects = deferred_split_scan,
+       .seeks = DEFAULT_SEEKS,
+ };
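
As documented in the comment above split_huge_page_to_list(), the split only proceeds when the caller's pin is the sole extra reference, which is checked as total_mapcount(head) == page_count(head) - 1. The toy model below illustrates that accounting with made-up reference counts; struct thp and the helpers are not kernel types.

/* Toy model of the split-eligibility check with made-up reference counts;
 * struct thp and the helpers are not kernel types. */
#include <stdbool.h>
#include <stdio.h>

struct thp {
	int compound_mapcount;  /* PMD-level mappings of the whole huge page */
	int tail_mapcounts;     /* sum of per-4k PTE mappings */
	int extra_pins;         /* the caller's pin, gup pins, ... */
};

static int total_mapcount(const struct thp *p)
{
	return p->compound_mapcount + p->tail_mapcounts;
}

static int page_count(const struct thp *p)
{
	/* every mapping holds a reference, plus whatever pins exist */
	return total_mapcount(p) + p->extra_pins;
}

static bool can_split(const struct thp *p)
{
	/* the caller's pin must be the only unaccounted reference */
	return total_mapcount(p) == page_count(p) - 1;
}

int main(void)
{
	struct thp only_caller = { .compound_mapcount = 1, .extra_pins = 1 };
	struct thp gup_pinned  = { .compound_mapcount = 1, .extra_pins = 2 };

	printf("only the caller's pin: %s\n", can_split(&only_caller) ? "split" : "-EBUSY");
	printf("extra gup pin:         %s\n", can_split(&gup_pinned) ? "split" : "-EBUSY");
	return 0;
}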
diff --combined mm/memcontrol.c
index b732edfddb767025185f27c8879903591c2b0c82,34cd0df82a8b57ac47e2916964384af01d162385..48735e7c617b3d9454b35ac5ea6a4f9ff5ef472c
@@@ -62,6 -62,7 +62,7 @@@
  #include <linux/oom.h>
  #include <linux/lockdep.h>
  #include <linux/file.h>
+ #include <linux/tracehook.h>
  #include "internal.h"
  #include <net/sock.h>
  #include <net/ip.h>
@@@ -434,7 -435,7 +435,7 @@@ struct cgroup_subsys_state *mem_cgroup_
  
        memcg = page->mem_cgroup;
  
 -      if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
 +      if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
                memcg = root_mem_cgroup;
  
        rcu_read_unlock();
@@@ -695,7 -696,7 +696,7 @@@ static unsigned long mem_cgroup_read_ev
  
  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
                                         struct page *page,
-                                        int nr_pages)
+                                        bool compound, int nr_pages)
  {
        /*
         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
                                nr_pages);
  
-       if (PageTransHuge(page))
+       if (compound) {
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
                                nr_pages);
+       }
  
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
@@@ -1661,7 -1664,7 +1664,7 @@@ static void memcg_oom_recover(struct me
  
  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  {
-       if (!current->memcg_oom.may_oom)
+       if (!current->memcg_may_oom)
                return;
        /*
         * We are in the middle of the charge context here, so we
         * and when we know whether the fault was overall successful.
         */
        css_get(&memcg->css);
-       current->memcg_oom.memcg = memcg;
-       current->memcg_oom.gfp_mask = mask;
-       current->memcg_oom.order = order;
+       current->memcg_in_oom = memcg;
+       current->memcg_oom_gfp_mask = mask;
+       current->memcg_oom_order = order;
  }
  
  /**
   */
  bool mem_cgroup_oom_synchronize(bool handle)
  {
-       struct mem_cgroup *memcg = current->memcg_oom.memcg;
+       struct mem_cgroup *memcg = current->memcg_in_oom;
        struct oom_wait_info owait;
        bool locked;
  
        if (locked && !memcg->oom_kill_disable) {
                mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
-               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-                                        current->memcg_oom.order);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+                                        current->memcg_oom_order);
        } else {
                schedule();
                mem_cgroup_unmark_under_oom(memcg);
                memcg_oom_recover(memcg);
        }
  cleanup:
-       current->memcg_oom.memcg = NULL;
+       current->memcg_in_oom = NULL;
        css_put(&memcg->css);
        return true;
  }
@@@ -1972,6 -1975,31 +1975,31 @@@ static int memcg_cpu_hotplug_callback(s
        return NOTIFY_OK;
  }
  
+ /*
+  * Scheduled by try_charge() to be executed from the userland return path
+  * and reclaims memory over the high limit.
+  */
+ void mem_cgroup_handle_over_high(void)
+ {
+       unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       struct mem_cgroup *memcg, *pos;
+       if (likely(!nr_pages))
+               return;
+       pos = memcg = get_mem_cgroup_from_mm(current->mm);
+       do {
+               if (page_counter_read(&pos->memory) <= pos->high)
+                       continue;
+               mem_cgroup_events(pos, MEMCG_HIGH, 1);
+               try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+       } while ((pos = parent_mem_cgroup(pos)));
+       css_put(&memcg->css);
+       current->memcg_nr_pages_over_high = 0;
+ }
  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
  {
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;
-       int ret = 0;
  
        if (mem_cgroup_is_root(memcg))
-               goto done;
+               return 0;
  retry:
        if (consume_stock(memcg, nr_pages))
-               goto done;
+               return 0;
  
        if (!do_swap_account ||
            !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
        if (unlikely(test_thread_flag(TIF_MEMDIE) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
-               goto bypass;
+               goto force;
  
        if (unlikely(task_in_memcg_oom(current)))
                goto nomem;
  
-       if (!(gfp_mask & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(gfp_mask))
                goto nomem;
  
        mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
                goto retry;
  
        if (gfp_mask & __GFP_NOFAIL)
-               goto bypass;
+               goto force;
  
        if (fatal_signal_pending(current))
-               goto bypass;
+               goto force;
  
        mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
  
-       mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
+       mem_cgroup_oom(mem_over_limit, gfp_mask,
+                      get_order(nr_pages * PAGE_SIZE));
  nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
                return -ENOMEM;
- bypass:
-       return -EINTR;
+ force:
+       /*
+        * The allocation either can't fail or will lead to more memory
+        * being freed very soon.  Allow memory usage go over the limit
+        * temporarily by force charging it.
+        */
+       page_counter_charge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_charge(&memcg->memsw, nr_pages);
+       css_get_many(&memcg->css, nr_pages);
+       return 0;
  
  done_restock:
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
-       if (!(gfp_mask & __GFP_WAIT))
-               goto done;
        /*
-        * If the hierarchy is above the normal consumption range,
-        * make the charging task trim their excess contribution.
+        * If the hierarchy is above the normal consumption range, schedule
+        * reclaim on returning to userland.  We can perform reclaim here
+        * if __GFP_WAIT but let's always punt for simplicity and so that
+        * GFP_KERNEL can consistently be used during reclaim.  @memcg is
+        * not recorded as it most likely matches current's and won't
+        * change in the meantime.  As high limit is checked again before
+        * reclaim, the cost of mismatch is negligible.
         */
        do {
-               if (page_counter_read(&memcg->memory) <= memcg->high)
-                       continue;
-               mem_cgroup_events(memcg, MEMCG_HIGH, 1);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+               if (page_counter_read(&memcg->memory) > memcg->high) {
+                       current->memcg_nr_pages_over_high += nr_pages;
+                       set_notify_resume(current);
+                       break;
+               }
        } while ((memcg = parent_mem_cgroup(memcg)));
- done:
-       return ret;
+       return 0;
  }
  
  static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
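
As the hunk above explains, try_charge() no longer reclaims over the "high" limit inline: it records the overage in current->memcg_nr_pages_over_high, calls set_notify_resume(), and mem_cgroup_handle_over_high() reclaims on return to userland with GFP_KERNEL. A minimal user-space model of that deferral follows; the task struct, counters and reclaim step are mocks, not kernel state.

/* Minimal model of "punt high-limit reclaim to the userland return path".
 * The task struct, counters and reclaim step are mocks, not kernel state. */
#include <stdbool.h>
#include <stdio.h>

struct task {
	unsigned int nr_pages_over_high;
	bool notify_resume;               /* models set_notify_resume(current) */
};

static unsigned long usage = 1200, high = 1000;   /* pages, made up */

static void try_charge(struct task *t, unsigned int nr_pages)
{
	usage += nr_pages;
	if (usage > high) {               /* over the soft "high" limit */
		t->nr_pages_over_high += nr_pages;
		t->notify_resume = true;  /* reclaim later, with GFP_KERNEL */
	}
}

static void handle_over_high(struct task *t)   /* runs before returning to user */
{
	if (!t->nr_pages_over_high)
		return;
	printf("reclaiming %u pages over high\n", t->nr_pages_over_high);
	usage -= t->nr_pages_over_high;            /* pretend reclaim succeeded */
	t->nr_pages_over_high = 0;
	t->notify_resume = false;
}

int main(void)
{
	struct task t = { 0 };

	try_charge(&t, 64);               /* the charge path stays non-blocking */
	if (t.notify_resume)
		handle_over_high(&t);     /* deferred work at the syscall boundary */
	return 0;
}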
@@@ -2174,55 -2217,6 +2217,6 @@@ static void commit_charge(struct page *
  }
  
  #ifdef CONFIG_MEMCG_KMEM
- int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                     unsigned long nr_pages)
- {
-       struct page_counter *counter;
-       int ret = 0;
-       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
-       if (ret < 0)
-               return ret;
-       ret = try_charge(memcg, gfp, nr_pages);
-       if (ret == -EINTR)  {
-               /*
-                * try_charge() chose to bypass to root due to OOM kill or
-                * fatal signal.  Since our only options are to either fail
-                * the allocation or charge it to this cgroup, do it as a
-                * temporary condition. But we can't fail. From a kmem/slab
-                * perspective, the cache has already been selected, by
-                * mem_cgroup_kmem_get_cache(), so it is too late to change
-                * our minds.
-                *
-                * This condition will only trigger if the task entered
-                * memcg_charge_kmem in a sane state, but was OOM-killed
-                * during try_charge() above. Tasks that were already dying
-                * when the allocation triggers should have been already
-                * directed to the root cgroup in memcontrol.h
-                */
-               page_counter_charge(&memcg->memory, nr_pages);
-               if (do_swap_account)
-                       page_counter_charge(&memcg->memsw, nr_pages);
-               css_get_many(&memcg->css, nr_pages);
-               ret = 0;
-       } else if (ret)
-               page_counter_uncharge(&memcg->kmem, nr_pages);
-       return ret;
- }
- void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
- {
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       if (do_swap_account)
-               page_counter_uncharge(&memcg->memsw, nr_pages);
-       page_counter_uncharge(&memcg->kmem, nr_pages);
-       css_put_many(&memcg->css, nr_pages);
- }
  static int memcg_alloc_cache_id(void)
  {
        int id, size;
@@@ -2384,85 -2378,59 +2378,59 @@@ void __memcg_kmem_put_cache(struct kmem
                css_put(&cachep->memcg_params.memcg->css);
  }
  
- /*
-  * We need to verify if the allocation against current->mm->owner's memcg is
-  * possible for the given order. But the page is not allocated yet, so we'll
-  * need a further commit step to do the final arrangements.
-  *
-  * It is possible for the task to switch cgroups in this mean time, so at
-  * commit time, we can't rely on task conversion any longer.  We'll then use
-  * the handle argument to return to the caller which cgroup we should commit
-  * against. We could also return the memcg directly and avoid the pointer
-  * passing, but a boolean return value gives better semantics considering
-  * the compiled-out case as well.
-  *
-  * Returning true means the allocation is possible.
-  */
- bool
- __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+                             struct mem_cgroup *memcg)
  {
-       struct mem_cgroup *memcg;
-       int ret;
+       unsigned int nr_pages = 1 << order;
+       struct page_counter *counter;
+       int ret = 0;
  
-       *_memcg = NULL;
+       if (!memcg_kmem_is_active(memcg))
+               return 0;
  
-       memcg = get_mem_cgroup_from_mm(current->mm);
+       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
+       if (ret)
+               return ret;
  
-       if (!memcg_kmem_is_active(memcg)) {
-               css_put(&memcg->css);
-               return true;
+       ret = try_charge(memcg, gfp, nr_pages);
+       if (ret) {
+               page_counter_uncharge(&memcg->kmem, nr_pages);
+               return ret;
        }
  
-       ret = memcg_charge_kmem(memcg, gfp, 1 << order);
-       if (!ret)
-               *_memcg = memcg;
+       page->mem_cgroup = memcg;
  
-       css_put(&memcg->css);
-       return (ret == 0);
+       return 0;
  }
  
- void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             int order)
+ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
  {
-       VM_BUG_ON(mem_cgroup_is_root(memcg));
+       struct mem_cgroup *memcg;
+       int ret;
  
-       /* The page allocation failed. Revert */
-       if (!page) {
-               memcg_uncharge_kmem(memcg, 1 << order);
-               return;
-       }
-       page->mem_cgroup = memcg;
+       memcg = get_mem_cgroup_from_mm(current->mm);
+       ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+       css_put(&memcg->css);
+       return ret;
  }
  
- void __memcg_kmem_uncharge_pages(struct page *page, int order)
+ void __memcg_kmem_uncharge(struct page *page, int order)
  {
        struct mem_cgroup *memcg = page->mem_cgroup;
+       unsigned int nr_pages = 1 << order;
  
        if (!memcg)
                return;
  
        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
  
-       memcg_uncharge_kmem(memcg, 1 << order);
-       page->mem_cgroup = NULL;
- }
- struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
- {
-       struct mem_cgroup *memcg = NULL;
-       struct kmem_cache *cachep;
-       struct page *page;
-       page = virt_to_head_page(ptr);
-       if (PageSlab(page)) {
-               cachep = page->slab_cache;
-               if (!is_root_cache(cachep))
-                       memcg = cachep->memcg_params.memcg;
-       } else
-               /* page allocated by alloc_kmem_pages */
-               memcg = page->mem_cgroup;
+       page_counter_uncharge(&memcg->kmem, nr_pages);
+       page_counter_uncharge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_uncharge(&memcg->memsw, nr_pages);
  
-       return memcg;
+       page->mem_cgroup = NULL;
+       css_put_many(&memcg->css, nr_pages);
  }
  #endif /* CONFIG_MEMCG_KMEM */
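
The kmem path above now charges the kmem counter first and the main memory counter second, unwinding the kmem charge if the second step fails, instead of the old -EINTR bypass handling. A sketch of that ordering with fake counters follows; none of these helpers are the kernel API.

/* Sketch of the charge ordering only; counters, limits and helpers are fake. */
#include <errno.h>
#include <stdio.h>

static long kmem_used, mem_used;
static const long kmem_limit = 1 << 20, mem_limit = 1 << 20;   /* pages */

static int counter_try_charge(long *used, long limit, long nr_pages)
{
	if (*used + nr_pages > limit)
		return -ENOMEM;
	*used += nr_pages;
	return 0;
}

static int kmem_charge(int order)
{
	long nr_pages = 1L << order;    /* an order-N allocation is 2^N pages */
	int ret;

	ret = counter_try_charge(&kmem_used, kmem_limit, nr_pages);
	if (ret)
		return ret;             /* kmem limit hit: fail the allocation */

	ret = counter_try_charge(&mem_used, mem_limit, nr_pages);
	if (ret)
		kmem_used -= nr_pages;  /* unwind the kmem charge on failure */
	return ret;
}

int main(void)
{
	printf("order-3 charge: %d\n", kmem_charge(3));   /* 8 pages, prints 0 */
	return 0;
}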
  
  
  /*
   * Because tail pages are not marked as "used", set it. We're under
-  * zone->lru_lock, 'splitting on pmd' and compound_lock.
-  * charge/uncharge will be never happen and move_account() is done under
-  * compound_lock(), so we don't have to take care of races.
+  * zone->lru_lock and migration entries setup in all page mappings.
   */
  void mem_cgroup_split_huge_fixup(struct page *head)
  {
@@@ -2926,7 -2892,7 +2892,7 @@@ static int memcg_activate_kmem(struct m
         * of course permitted.
         */
        mutex_lock(&memcg_create_mutex);
 -      if (cgroup_has_tasks(memcg->css.cgroup) ||
 +      if (cgroup_is_populated(memcg->css.cgroup) ||
            (memcg->use_hierarchy && memcg_has_children(memcg)))
                err = -EBUSY;
        mutex_unlock(&memcg_create_mutex);
@@@ -3741,43 -3707,44 +3707,43 @@@ struct wb_domain *mem_cgroup_wb_domain(
  /**
   * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
   * @wb: bdi_writeback in question
 - * @pavail: out parameter for number of available pages
 + * @pfilepages: out parameter for number of file pages
 + * @pheadroom: out parameter for number of allocatable pages according to memcg
   * @pdirty: out parameter for number of dirty pages
   * @pwriteback: out parameter for number of pages under writeback
   *
 - * Determine the numbers of available, dirty, and writeback pages in @wb's
 - * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
 - * more involved.
 + * Determine the numbers of file, headroom, dirty, and writeback pages in
 + * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 + * is a bit more involved.
   *
 - * A memcg's headroom is "min(max, high) - used".  The available memory is
 - * calculated as the lowest headroom of itself and the ancestors plus the
 - * number of pages already being used for file pages.  Note that this
 - * doesn't consider the actual amount of available memory in the system.
 - * The caller should further cap *@pavail accordingly.
 + * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 + * headroom is calculated as the lowest headroom of itself and the
 + * ancestors.  Note that this doesn't consider the actual amount of
 + * available memory in the system.  The caller should further cap
 + * *@pheadroom accordingly.
   */
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback)
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback)
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
 -      unsigned long head_room = PAGE_COUNTER_MAX;
 -      unsigned long file_pages;
  
        *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
  
        /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
 +      *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 +                                                   (1 << LRU_ACTIVE_FILE));
 +      *pheadroom = PAGE_COUNTER_MAX;
  
 -      file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 -                                                  (1 << LRU_ACTIVE_FILE));
        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
                unsigned long used = page_counter_read(&memcg->memory);
  
 -              head_room = min(head_room, ceiling - min(ceiling, used));
 +              *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
                memcg = parent;
        }
 -
 -      *pavail = file_pages + head_room;
  }
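
Per the updated mem_cgroup_wb_stats() comment, file pages and headroom are now reported separately: each cgroup's headroom is min(limit, high) - used, and the effective value is the minimum over the ancestor chain, with the root's own ceiling never applied. The standalone sketch below walks a faked parent chain to show the arithmetic; the struct and numbers are made up.

/* Standalone sketch of the headroom walk; the memcg chain is faked and the
 * numbers are made up.  Matching the kernel loop, the root's own ceiling is
 * never applied (the walk stops once a cgroup has no parent). */
#include <stdio.h>
#include <limits.h>

struct memcg {
	unsigned long limit, high, used;   /* all in pages */
	struct memcg *parent;
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long wb_headroom(struct memcg *memcg)
{
	unsigned long headroom = ULONG_MAX;        /* PAGE_COUNTER_MAX stand-in */

	for (; memcg->parent; memcg = memcg->parent) {
		unsigned long ceiling = min_ul(memcg->limit, memcg->high);
		unsigned long used = memcg->used;

		/* clamp so an over-ceiling cgroup contributes 0, not underflow */
		headroom = min_ul(headroom, ceiling - min_ul(ceiling, used));
	}
	return headroom;
}

int main(void)
{
	struct memcg root  = { .limit = 10000, .high = 10000, .used = 2000 };
	struct memcg child = { .limit = 4000, .high = 3000, .used = 2500,
			       .parent = &root };

	/* child: min(4000, 3000) - 2500 = 500, so headroom is 500 pages */
	printf("headroom = %lu pages\n", wb_headroom(&child));
	return 0;
}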
  
  #else /* CONFIG_CGROUP_WRITEBACK */
@@@ -4066,7 -4033,8 +4032,7 @@@ static struct cftype mem_cgroup_legacy_
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
 -              .flags = CFTYPE_NO_PREFIX,
 -              .mode = S_IWUGO,
 +              .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
@@@ -4400,28 -4368,16 +4366,16 @@@ static int mem_cgroup_do_precharge(unsi
  {
        int ret;
  
-       /* Try a single bulk charge without reclaim first */
-       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+       /* Try a single bulk charge without reclaim first, kswapd may wake */
+       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
        if (!ret) {
                mc.precharge += count;
                return ret;
        }
-       if (ret == -EINTR) {
-               cancel_charge(root_mem_cgroup, count);
-               return ret;
-       }
  
        /* Try charges one by one with reclaim */
        while (count--) {
                ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
-               /*
-                * In case of failure, any residual charges against
-                * mc.to will be dropped by mem_cgroup_clear_mc()
-                * later on.  However, cancel any charges that are
-                * bypassed to root right away or they'll be lost.
-                */
-               if (ret == -EINTR)
-                       cancel_charge(root_mem_cgroup, 1);
                if (ret)
                        return ret;
                mc.precharge++;
@@@ -4547,39 -4503,30 +4501,30 @@@ static struct page *mc_handle_file_pte(
   * @from: mem_cgroup which the page is moved from.
   * @to:       mem_cgroup which the page is moved to. @from != @to.
   *
-  * The caller must confirm following.
-  * - page is not on LRU (isolate_page() is useful.)
-  * - compound_lock is held when nr_pages > 1
+  * The caller must make sure the page is not on LRU (isolate_page() is useful.)
   *
   * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
   * from old cgroup.
   */
  static int mem_cgroup_move_account(struct page *page,
-                                  unsigned int nr_pages,
+                                  bool compound,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
  {
        unsigned long flags;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
        int ret;
        bool anon;
  
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
-       /*
-        * The page is isolated from LRU. So, collapse function
-        * will not handle this page. But page splitting can happen.
-        * Do this check under compound_page_lock(). The caller should
-        * hold it.
-        */
-       ret = -EBUSY;
-       if (nr_pages > 1 && !PageTransHuge(page))
-               goto out;
+       VM_BUG_ON(compound && !PageTransHuge(page));
  
        /*
-        * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
-        * of its source page while we change it: page migration takes
-        * both pages off the LRU, but page cache replacement doesn't.
+        * Prevent mem_cgroup_replace_page() from looking at
+        * page->mem_cgroup of its source page while we change it.
         */
+       ret = -EBUSY;
        if (!trylock_page(page))
                goto out;
  
        ret = 0;
  
        local_irq_disable();
-       mem_cgroup_charge_statistics(to, page, nr_pages);
+       mem_cgroup_charge_statistics(to, page, compound, nr_pages);
        memcg_check_events(to, page);
-       mem_cgroup_charge_statistics(from, page, -nr_pages);
+       mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
        memcg_check_events(from, page);
        local_irq_enable();
  out_unlock:
@@@ -4726,7 -4673,7 +4671,7 @@@ static int mem_cgroup_count_precharge_p
        pte_t *pte;
        spinlock_t *ptl;
  
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
@@@ -4833,7 -4780,7 +4778,7 @@@ static int mem_cgroup_can_attach(struc
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *from;
 -      struct task_struct *p;
 +      struct task_struct *leader, *p;
        struct mm_struct *mm;
        unsigned long move_flags;
        int ret = 0;
        if (!move_flags)
                return 0;
  
 -      p = cgroup_taskset_first(tset);
 +      /*
 +       * Multi-process migrations only happen on the default hierarchy
 +       * where charge immigration is not used.  Perform charge
 +       * immigration if @tset contains a leader and whine if there are
 +       * multiple.
 +       */
 +      p = NULL;
 +      cgroup_taskset_for_each_leader(leader, tset) {
 +              WARN_ON_ONCE(p);
 +              p = leader;
 +      }
 +      if (!p)
 +              return 0;
 +
        from = mem_cgroup_from_task(p);
  
        VM_BUG_ON(from == memcg);
@@@ -4910,17 -4844,7 +4855,7 @@@ static int mem_cgroup_move_charge_pte_r
        union mc_target target;
        struct page *page;
  
-       /*
-        * We don't take compound_lock() here but no race with splitting thp
-        * happens because:
-        *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
-        *    under splitting, which means there's no concurrent thp split,
-        *  - if another thread runs into split_huge_page() just after we
-        *    entered this if-block, the thread must wait for page table lock
-        *    to be unlocked in __split_huge_page_splitting(), where the main
-        *    part of thp split is not executed yet.
-        */
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (mc.precharge < HPAGE_PMD_NR) {
                        spin_unlock(ptl);
                        return 0;
                if (target_type == MC_TARGET_PAGE) {
                        page = target.page;
                        if (!isolate_lru_page(page)) {
-                               if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+                               if (!mem_cgroup_move_account(page, true,
                                                             mc.from, mc.to)) {
                                        mc.precharge -= HPAGE_PMD_NR;
                                        mc.moved_charge += HPAGE_PMD_NR;
@@@ -4958,7 -4882,8 +4893,8 @@@ retry
                        page = target.page;
                        if (isolate_lru_page(page))
                                goto put;
-                       if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+                       if (!mem_cgroup_move_account(page, false,
+                                               mc.from, mc.to)) {
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
@@@ -5076,7 -5001,7 +5012,7 @@@ static void mem_cgroup_bind(struct cgro
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
 -      if (cgroup_on_dfl(root_css->cgroup))
 +      if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                root_mem_cgroup->use_hierarchy = true;
        else
                root_mem_cgroup->use_hierarchy = false;
  static u64 memory_current_read(struct cgroup_subsys_state *css,
                               struct cftype *cft)
  {
-       return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+       return page_counter_read(&mem_cgroup_from_css(css)->memory);
  }
  
  static int memory_low_show(struct seq_file *m, void *v)
@@@ -5197,6 -5122,7 +5133,7 @@@ static int memory_events_show(struct se
  static struct cftype memory_files[] = {
        {
                .name = "current",
+               .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_current_read,
        },
        {
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
 +              .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        { }     /* terminate */
@@@ -5296,10 -5221,11 +5233,11 @@@ bool mem_cgroup_low(struct mem_cgroup *
   * with mem_cgroup_cancel_charge() in case page instantiation fails.
   */
  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
-                         gfp_t gfp_mask, struct mem_cgroup **memcgp)
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound)
  {
        struct mem_cgroup *memcg = NULL;
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
        int ret = 0;
  
        if (mem_cgroup_disabled())
                }
        }
  
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
        if (!memcg)
                memcg = get_mem_cgroup_from_mm(mm);
  
        ret = try_charge(memcg, gfp_mask, nr_pages);
  
        css_put(&memcg->css);
-       if (ret == -EINTR) {
-               memcg = root_mem_cgroup;
-               ret = 0;
-       }
  out:
        *memcgp = memcg;
        return ret;
   * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
   */
  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             bool lrucare)
+                             bool lrucare, bool compound)
  {
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
  
        VM_BUG_ON_PAGE(!page->mapping, page);
        VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
  
        commit_charge(page, memcg, lrucare);
  
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
        local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, page, nr_pages);
+       mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
        memcg_check_events(memcg, page);
        local_irq_enable();
  
   *
   * Cancel a charge transaction started by mem_cgroup_try_charge().
   */
- void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+               bool compound)
  {
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
  
        if (mem_cgroup_disabled())
                return;
        if (!memcg)
                return;
  
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
        cancel_charge(memcg, nr_pages);
  }
  
@@@ -5559,7 -5466,7 +5478,7 @@@ void mem_cgroup_uncharge_list(struct li
  }
  
  /**
-  * mem_cgroup_migrate - migrate a charge to another page
+  * mem_cgroup_replace_page - migrate a charge to another page
   * @oldpage: currently charged page
   * @newpage: page to transfer the charge to
   * @lrucare: either or both pages might be on the LRU already
   *
   * Both pages must be locked, @newpage->mapping must be set up.
   */
- void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-                       bool lrucare)
+ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
  {
        struct mem_cgroup *memcg;
        int isolated;
  
        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
                       newpage);
        if (newpage->mem_cgroup)
                return;
  
-       /*
-        * Swapcache readahead pages can get migrated before being
-        * charged, and migration from compaction can happen to an
-        * uncharged page when the PFN walker finds a page that
-        * reclaim just put back on the LRU but has not released yet.
-        */
+       /* Swapcache readahead pages can get replaced before being charged */
        memcg = oldpage->mem_cgroup;
        if (!memcg)
                return;
  
-       if (lrucare)
-               lock_page_lru(oldpage, &isolated);
+       lock_page_lru(oldpage, &isolated);
        oldpage->mem_cgroup = NULL;
+       unlock_page_lru(oldpage, isolated);
  
-       if (lrucare)
-               unlock_page_lru(oldpage, isolated);
-       commit_charge(newpage, memcg, lrucare);
+       commit_charge(newpage, memcg, true);
  }
  
  /*
@@@ -5690,7 -5585,7 +5597,7 @@@ void mem_cgroup_swapout(struct page *pa
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
-       mem_cgroup_charge_statistics(memcg, page, -1);
+       mem_cgroup_charge_statistics(memcg, page, false, -1);
        memcg_check_events(memcg, page);
  }
  
diff --combined mm/memory_hotplug.c
index 0780d118d26e70d5cfc3a3e67ceca4915684a003,4b62bbac55125b07a14706797ad390b14f4a6e8a..67d488ab495e57b9018484932e135078c787903c
@@@ -339,8 -339,8 +339,8 @@@ static int __ref ensure_zone_is_initial
                        unsigned long start_pfn, unsigned long num_pages)
  {
        if (!zone_is_initialized(zone))
-               return init_currently_empty_zone(zone, start_pfn, num_pages,
-                                                MEMMAP_HOTPLUG);
+               return init_currently_empty_zone(zone, start_pfn, num_pages);
        return 0;
  }
  
@@@ -1232,21 -1232,23 +1232,21 @@@ int zone_for_memory(int nid, u64 start
  }
  
  /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
 -int __ref add_memory(int nid, u64 start, u64 size)
 +int __ref add_memory_resource(int nid, struct resource *res)
  {
 +      u64 start, size;
        pg_data_t *pgdat = NULL;
        bool new_pgdat;
        bool new_node;
 -      struct resource *res;
        int ret;
  
 +      start = res->start;
 +      size = resource_size(res);
 +
        ret = check_hotplug_memory_range(start, size);
        if (ret)
                return ret;
  
 -      res = register_memory_resource(start, size);
 -      ret = -EEXIST;
 -      if (!res)
 -              return ret;
 -
        {       /* Stupid hack to suppress address-never-null warning */
                void *p = NODE_DATA(nid);
                new_pgdat = !p;
@@@ -1298,28 -1300,13 +1298,28 @@@ error
        /* rollback pgdat allocation and others */
        if (new_pgdat)
                rollback_node_hotadd(nid, pgdat);
 -      release_memory_resource(res);
        memblock_remove(start, size);
  
  out:
        mem_hotplug_done();
        return ret;
  }
 +EXPORT_SYMBOL_GPL(add_memory_resource);
 +
 +int __ref add_memory(int nid, u64 start, u64 size)
 +{
 +      struct resource *res;
 +      int ret;
 +
 +      res = register_memory_resource(start, size);
 +      if (!res)
 +              return -EEXIST;
 +
 +      ret = add_memory_resource(nid, res);
 +      if (ret < 0)
 +              release_memory_resource(res);
 +      return ret;
 +}
  EXPORT_SYMBOL_GPL(add_memory);
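
For illustration, a minimal sketch of how a hot-add path that already owns a struct resource could call the new add_memory_resource() entry point directly; example_hotplug() is a made-up function, not part of this series.

#include <linux/ioport.h>
#include <linux/memory_hotplug.h>
#include <linux/printk.h>

/* Hypothetical driver path: the caller registered the resource itself. */
static int example_hotplug(int nid, struct resource *res)
{
        int ret;

        ret = add_memory_resource(nid, res);
        if (ret)
                pr_err("hot-add of %pR failed: %d\n", res, ret);
        return ret;
}
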
  
  #ifdef CONFIG_MEMORY_HOTREMOVE
diff --combined mm/page_alloc.c
index 805bbad2e24e1a84b383ebc90fc825527238957a,cef70104614c05ccef23f2020b4ea26d78f4c08c..d0499fff8c7fb1ee2f33a34bd9e424420939a654
@@@ -169,19 -169,19 +169,19 @@@ void pm_restrict_gfp_mask(void
        WARN_ON(!mutex_is_locked(&pm_mutex));
        WARN_ON(saved_gfp_mask);
        saved_gfp_mask = gfp_allowed_mask;
-       gfp_allowed_mask &= ~GFP_IOFS;
+       gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
  }
  
  bool pm_suspended_storage(void)
  {
-       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+       if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                return false;
        return true;
  }
  #endif /* CONFIG_PM_SLEEP */
  
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
- int pageblock_order __read_mostly;
+ unsigned int pageblock_order __read_mostly;
  #endif
  
  static void __free_pages_ok(struct page *page, unsigned int order);
@@@ -229,6 -229,17 +229,17 @@@ static char * const zone_names[MAX_NR_Z
  #endif
  };
  
+ compound_page_dtor * const compound_page_dtors[] = {
+       NULL,
+       free_compound_page,
+ #ifdef CONFIG_HUGETLB_PAGE
+       free_huge_page,
+ #endif
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       free_transhuge_page,
+ #endif
+ };
  int min_free_kbytes = 1024;
  int user_min_free_kbytes = -1;
  
@@@ -436,39 -447,38 +447,38 @@@ out
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
-  * The first PAGE_SIZE page is called the "head page".
+  * The first PAGE_SIZE page is called the "head page" and has PG_head set.
   *
-  * The remaining PAGE_SIZE pages are called "tail pages".
+  * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
+  * in bit 0 of page->compound_head. The rest of the bits point to the head page.
   *
-  * All pages have PG_compound set.  All tail pages have their ->first_page
-  * pointing at the head page.
+  * The first tail page's ->compound_dtor holds the offset into the array of compound
+  * page destructors. See compound_page_dtors.
   *
-  * The first tail page's ->lru.next holds the address of the compound page's
-  * put_page() function.  Its ->lru.prev holds the order of allocation.
+  * The first tail page's ->compound_order holds the order of allocation.
   * This usage means that zero-order pages may not be compound.
   */
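
For illustration, a simplified sketch of the bit-0 encoding described in the comment above; my_compound_head() and my_page_tail() are stand-ins written for this example, not the helpers added by the series.

#include <linux/mm_types.h>
#include <linux/compiler.h>

/* Tail pages store (head | 1) in ->compound_head; head pages store 0. */
static inline struct page *my_compound_head(struct page *page)
{
        unsigned long head = READ_ONCE(page->compound_head);

        if (head & 1)
                return (struct page *)(head - 1);       /* strip the tag bit */
        return page;                                    /* head or order-0 page */
}

static inline int my_page_tail(struct page *page)
{
        return READ_ONCE(page->compound_head) & 1;
}
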
  
- static void free_compound_page(struct page *page)
+ void free_compound_page(struct page *page)
  {
        __free_pages_ok(page, compound_order(page));
  }
  
- void prep_compound_page(struct page *page, unsigned long order)
+ void prep_compound_page(struct page *page, unsigned int order)
  {
        int i;
        int nr_pages = 1 << order;
  
-       set_compound_page_dtor(page, free_compound_page);
+       set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
        set_compound_order(page, order);
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
                set_page_count(p, 0);
-               p->first_page = page;
-               /* Make sure p->first_page is always valid for PageTail() */
-               smp_wmb();
-               __SetPageTail(p);
+               p->mapping = TAIL_MAPPING;
+               set_compound_head(p, page);
        }
+       atomic_set(compound_mapcount_ptr(page), -1);
  }
  
  #ifdef CONFIG_DEBUG_PAGEALLOC
@@@ -656,7 -666,7 +666,7 @@@ static inline void __free_one_page(stru
        unsigned long combined_idx;
        unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;
-       int max_order = MAX_ORDER;
+       unsigned int max_order = MAX_ORDER;
  
        VM_BUG_ON(!zone_is_initialized(zone));
        VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
                 * pageblock. Without this, pageblock isolation
                 * could cause incorrect freepage accounting.
                 */
-               max_order = min(MAX_ORDER, pageblock_order + 1);
+               max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
        } else {
                __mod_zone_freepage_state(zone, 1 << order, migratetype);
        }
@@@ -733,7 -743,7 +743,7 @@@ static inline int free_pages_check(stru
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
  
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@@ -817,7 -827,6 +827,6 @@@ static void free_pcppages_bulk(struct z
                        if (unlikely(has_isolate_pageblock(zone)))
                                mt = get_pageblock_migratetype(page);
  
-                       /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
                        __free_one_page(page, page_to_pfn(page), zone, 0, mt);
                        trace_mm_page_pcpu_drain(page, 0, mt);
                } while (--to_free && --batch_free && !list_empty(list));
@@@ -846,17 -855,52 +855,52 @@@ static void free_one_page(struct zone *
  
  static int free_tail_pages_check(struct page *head_page, struct page *page)
  {
-       if (!IS_ENABLED(CONFIG_DEBUG_VM))
-               return 0;
+       int ret = 1;
+       /*
+        * We rely on page->lru.next never having bit 0 set, unless the page
+        * is PageTail(). Let's make sure that's true even for a poisoned ->lru.
+        */
+       BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+       if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+               ret = 0;
+               goto out;
+       }
+       switch (page - head_page) {
+       case 1:
+               /* the first tail page: ->mapping is compound_mapcount() */
+               if (unlikely(compound_mapcount(page))) {
+                       bad_page(page, "nonzero compound_mapcount", 0);
+                       goto out;
+               }
+               break;
+       case 2:
+               /*
+                * the second tail page: ->mapping is
+                * page_deferred_list().next -- ignore value.
+                */
+               break;
+       default:
+               if (page->mapping != TAIL_MAPPING) {
+                       bad_page(page, "corrupted mapping in tail page", 0);
+                       goto out;
+               }
+               break;
+       }
        if (unlikely(!PageTail(page))) {
                bad_page(page, "PageTail not set", 0);
-               return 1;
+               goto out;
        }
-       if (unlikely(page->first_page != head_page)) {
-               bad_page(page, "first_page not consistent", 0);
-               return 1;
+       if (unlikely(compound_head(page) != head_page)) {
+               bad_page(page, "compound_head not consistent", 0);
+               goto out;
        }
-       return 0;
+       ret = 0;
+ out:
+       page->mapping = NULL;
+       clear_compound_head(page);
+       return ret;
  }
  
  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@@ -923,6 -967,10 +967,10 @@@ void __meminit reserve_bootmem_region(u
                        struct page *page = pfn_to_page(start_pfn);
  
                        init_reserved_page(start_pfn);
+                       /* Avoid false-positive PageTail() */
+                       INIT_LIST_HEAD(&page->lru);
                        SetPageReserved(page);
                }
        }
@@@ -1314,7 -1362,7 +1362,7 @@@ static inline int check_new_page(struc
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
  
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@@ -1417,15 -1465,14 +1465,14 @@@ struct page *__rmqueue_smallest(struct 
   * the free lists for the desirable migrate type are depleted
   */
  static int fallbacks[MIGRATE_TYPES][4] = {
-       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
-       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
-       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
+       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
+       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
+       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
  #ifdef CONFIG_CMA
-       [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
+       [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
  #endif
-       [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
  #ifdef CONFIG_MEMORY_ISOLATION
-       [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
+       [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
  #endif
  };
  
@@@ -1450,7 -1497,7 +1497,7 @@@ int move_freepages(struct zone *zone
                          int migratetype)
  {
        struct page *page;
-       unsigned long order;
+       unsigned int order;
        int pages_moved = 0;
  
  #ifndef CONFIG_HOLES_IN_ZONE
@@@ -1563,7 -1610,7 +1610,7 @@@ static bool can_steal_fallback(unsigne
  static void steal_suitable_fallback(struct zone *zone, struct page *page,
                                                          int start_type)
  {
-       int current_order = page_order(page);
+       unsigned int current_order = page_order(page);
        int pages;
  
        /* Take ownership for orders >= pageblock_order */
@@@ -1598,7 -1645,7 +1645,7 @@@ int find_suitable_fallback(struct free_
        *can_steal = false;
        for (i = 0;; i++) {
                fallback_mt = fallbacks[migratetype][i];
-               if (fallback_mt == MIGRATE_RESERVE)
+               if (fallback_mt == MIGRATE_TYPES)
                        break;
  
                if (list_empty(&area->free_list[fallback_mt]))
        return -1;
  }
  
+ /*
+  * Reserve a pageblock for exclusive use of high-order atomic allocations if
+  * there are no empty page blocks that contain a page with a suitable order
+  */
+ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
+                               unsigned int alloc_order)
+ {
+       int mt;
+       unsigned long max_managed, flags;
+       /*
+        * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+        * Check is race-prone but harmless.
+        */
+       max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+       if (zone->nr_reserved_highatomic >= max_managed)
+               return;
+       spin_lock_irqsave(&zone->lock, flags);
+       /* Recheck the nr_reserved_highatomic limit under the lock */
+       if (zone->nr_reserved_highatomic >= max_managed)
+               goto out_unlock;
+       /* Yoink! */
+       mt = get_pageblock_migratetype(page);
+       if (mt != MIGRATE_HIGHATOMIC &&
+                       !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+               zone->nr_reserved_highatomic += pageblock_nr_pages;
+               set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+               move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+       }
+ out_unlock:
+       spin_unlock_irqrestore(&zone->lock, flags);
+ }
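
As a rough worked example of the cap computed above (the numbers are made up for illustration): for a zone managing 4 GB of 4 KB pages with 2 MB pageblocks, at most about 21 pageblocks end up reserved.

/* Worked example of the max_managed cap above, with made-up numbers. */
static unsigned long example_highatomic_cap(void)
{
        unsigned long managed_pages = 1048576;  /* 4 GB zone of 4 KB pages */
        unsigned long pageblock_nr = 512;       /* 2 MB pageblocks */

        /* 1048576 / 100 + 512 = 10485 + 512 = 10997 pages, ~21 pageblocks */
        return managed_pages / 100 + pageblock_nr;
}
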
+ /*
+  * Used when an allocation is about to fail under memory pressure. This
+  * potentially hurts the reliability of high-order allocations when under
+  * intense memory pressure but failed atomic allocations should be easier
+  * to recover from than an OOM.
+  */
+ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+ {
+       struct zonelist *zonelist = ac->zonelist;
+       unsigned long flags;
+       struct zoneref *z;
+       struct zone *zone;
+       struct page *page;
+       int order;
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+                                                               ac->nodemask) {
+               /* Preserve at least one pageblock */
+               if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+                       continue;
+               spin_lock_irqsave(&zone->lock, flags);
+               for (order = 0; order < MAX_ORDER; order++) {
+                       struct free_area *area = &(zone->free_area[order]);
+                       if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+                               continue;
+                       page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
+                                               struct page, lru);
+                       /*
+                        * It should never happen but changes to locking could
+                        * inadvertently allow a per-cpu drain to add pages
+                        * to MIGRATE_HIGHATOMIC while unreserving so be safe
+                        * and watch for underflows.
+                        */
+                       zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
+                               zone->nr_reserved_highatomic);
+                       /*
+                        * Convert to ac->migratetype and avoid the normal
+                        * pageblock stealing heuristics. Minimally, the caller
+                        * is doing the work and needs the pages. More
+                        * importantly, if the block was always converted to
+                        * MIGRATE_UNMOVABLE or another type then the number
+                        * of pageblocks that cannot be completely freed
+                        * may increase.
+                        */
+                       set_pageblock_migratetype(page, ac->migratetype);
+                       move_freepages_block(zone, page, ac->migratetype);
+                       spin_unlock_irqrestore(&zone->lock, flags);
+                       return;
+               }
+               spin_unlock_irqrestore(&zone->lock, flags);
+       }
+ }
  /* Remove an element from the buddy allocator from the fallback list */
  static inline struct page *
  __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
   * Call me with the zone->lock already held.
   */
  static struct page *__rmqueue(struct zone *zone, unsigned int order,
-                                               int migratetype)
+                               int migratetype, gfp_t gfp_flags)
  {
        struct page *page;
  
- retry_reserve:
        page = __rmqueue_smallest(zone, order, migratetype);
-       if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
+       if (unlikely(!page)) {
                if (migratetype == MIGRATE_MOVABLE)
                        page = __rmqueue_cma_fallback(zone, order);
  
                if (!page)
                        page = __rmqueue_fallback(zone, order, migratetype);
-               /*
-                * Use MIGRATE_RESERVE rather than fail an allocation. goto
-                * is used because __rmqueue_smallest is an inline function
-                * and we want just one call site
-                */
-               if (!page) {
-                       migratetype = MIGRATE_RESERVE;
-                       goto retry_reserve;
-               }
        }
  
        trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@@ -1714,7 -1844,7 +1844,7 @@@ static int rmqueue_bulk(struct zone *zo
  
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
-               struct page *page = __rmqueue(zone, order, migratetype);
+               struct page *page = __rmqueue(zone, order, migratetype, 0);
                if (unlikely(page == NULL))
                        break;
  
@@@ -2086,7 -2216,7 +2216,7 @@@ int split_free_page(struct page *page
  static inline
  struct page *buffered_rmqueue(struct zone *preferred_zone,
                        struct zone *zone, unsigned int order,
-                       gfp_t gfp_flags, int migratetype)
+                       gfp_t gfp_flags, int alloc_flags, int migratetype)
  {
        unsigned long flags;
        struct page *page;
                        WARN_ON_ONCE(order > 1);
                }
                spin_lock_irqsave(&zone->lock, flags);
-               page = __rmqueue(zone, order, migratetype);
+               page = NULL;
+               if (alloc_flags & ALLOC_HARDER) {
+                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                       if (page)
+                               trace_mm_page_alloc_zone_locked(page, order, migratetype);
+               }
+               if (!page)
+                       page = __rmqueue(zone, order, migratetype, gfp_flags);
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
@@@ -2159,13 -2297,13 +2297,13 @@@ failed
  static struct {
        struct fault_attr attr;
  
 -      u32 ignore_gfp_highmem;
 -      u32 ignore_gfp_reclaim;
 +      bool ignore_gfp_highmem;
-       bool ignore_gfp_wait;
++      bool ignore_gfp_reclaim;
        u32 min_order;
  } fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
-       .ignore_gfp_wait = true,
 -      .ignore_gfp_reclaim = 1,
 -      .ignore_gfp_highmem = 1,
++      .ignore_gfp_reclaim = true,
 +      .ignore_gfp_highmem = true,
        .min_order = 1,
  };
  
@@@ -2183,7 -2321,8 +2321,8 @@@ static bool should_fail_alloc_page(gfp_
                return false;
        if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
                return false;
-       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+       if (fail_page_alloc.ignore_gfp_reclaim &&
+                       (gfp_mask & __GFP_DIRECT_RECLAIM))
                return false;
  
        return should_fail(&fail_page_alloc.attr, 1 << order);
@@@ -2202,7 -2341,7 +2341,7 @@@ static int __init fail_page_alloc_debug
                return PTR_ERR(dir);
  
        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &fail_page_alloc.ignore_gfp_wait))
+                               &fail_page_alloc.ignore_gfp_reclaim))
                goto fail;
        if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
                                &fail_page_alloc.ignore_gfp_highmem))
@@@ -2232,42 -2371,77 +2371,77 @@@ static inline bool should_fail_alloc_pa
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
  
  /*
-  * Return true if free pages are above 'mark'. This takes into account the order
-  * of the allocation.
+  * Return true if free base pages are above 'mark'. For high-order checks it
+  * will return true if the order-0 watermark is reached and there is at least
+  * one free page of a suitable size. Checking now avoids taking the zone lock
+  * to check in the allocation paths if no pages are free.
   */
  static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                        unsigned long mark, int classzone_idx, int alloc_flags,
                        long free_pages)
  {
-       /* free_pages may go negative - that's OK */
        long min = mark;
        int o;
-       long free_cma = 0;
+       const int alloc_harder = (alloc_flags & ALLOC_HARDER);
  
+       /* free_pages may go negative - that's OK */
        free_pages -= (1 << order) - 1;
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
-       if (alloc_flags & ALLOC_HARDER)
+       /*
+        * If the caller does not have rights to ALLOC_HARDER then subtract
+        * the high-atomic reserves. This will over-estimate the size of the
+        * atomic reserve but it avoids a search.
+        */
+       if (likely(!alloc_harder))
+               free_pages -= z->nr_reserved_highatomic;
+       else
                min -= min / 4;
  #ifdef CONFIG_CMA
        /* If allocation can't use CMA areas don't use free CMA pages */
        if (!(alloc_flags & ALLOC_CMA))
-               free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+               free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
  #endif
  
-       if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+       /*
+        * Check watermarks for an order-0 allocation request. If these
+        * are not met, then a high-order request also cannot go ahead
+        * even if a suitable page happened to be free.
+        */
+       if (free_pages <= min + z->lowmem_reserve[classzone_idx])
                return false;
-       for (o = 0; o < order; o++) {
-               /* At the next order, this order's pages become unavailable */
-               free_pages -= z->free_area[o].nr_free << o;
  
-               /* Require fewer higher order pages to be free */
-               min >>= 1;
+       /* If this is an order-0 request then the watermark is fine */
+       if (!order)
+               return true;
+       /* For a high-order request, check at least one suitable page is free */
+       for (o = order; o < MAX_ORDER; o++) {
+               struct free_area *area = &z->free_area[o];
+               int mt;
+               if (!area->nr_free)
+                       continue;
+               if (alloc_harder)
+                       return true;
  
-               if (free_pages <= min)
-                       return false;
+               for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+                       if (!list_empty(&area->free_list[mt]))
+                               return true;
+               }
+ #ifdef CONFIG_CMA
+               if ((alloc_flags & ALLOC_CMA) &&
+                   !list_empty(&area->free_list[MIGRATE_CMA])) {
+                       return true;
+               }
+ #endif
        }
-       return true;
+       return false;
  }
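
A worked example of the order-0 part of the rewritten check, using made-up numbers; this sketch only mirrors the arithmetic above and ignores CMA and the highatomic reserve.

/*
 * Made-up numbers: order-2 request, min watermark of 1000 pages, 1200 free
 * pages, no ALLOC_HIGH or ALLOC_HARDER, empty highatomic reserve, and zero
 * lowmem_reserve for the classzone.
 */
static bool example_watermark_check(void)
{
        long free_pages = 1200;
        long min = 1000;
        unsigned int order = 2;

        free_pages -= (1 << order) - 1;         /* 1200 - 3 = 1197 */

        /* order-0 gate: 1197 > 1000, so go on to scan the free areas */
        return free_pages > min;
}
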
  
  bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  }
  
  bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-                       unsigned long mark, int classzone_idx, int alloc_flags)
+                       unsigned long mark, int classzone_idx)
  {
        long free_pages = zone_page_state(z, NR_FREE_PAGES);
  
        if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
                free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
  
-       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+       return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
                                                                free_pages);
  }
  
  #ifdef CONFIG_NUMA
- /*
-  * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
-  * skip over zones that are not allowed by the cpuset, or that have
-  * been recently (in last second) found to be nearly full.  See further
-  * comments in mmzone.h.  Reduces cache footprint of zonelist scans
-  * that have to skip over a lot of full or unallowed zones.
-  *
-  * If the zonelist cache is present in the passed zonelist, then
-  * returns a pointer to the allowed node mask (either the current
-  * tasks mems_allowed, or node_states[N_MEMORY].)
-  *
-  * If the zonelist cache is not available for this zonelist, does
-  * nothing and returns NULL.
-  *
-  * If the fullzones BITMAP in the zonelist cache is stale (more than
-  * a second since last zap'd) then we zap it out (clear its bits.)
-  *
-  * We hold off even calling zlc_setup, until after we've checked the
-  * first zone in the zonelist, on the theory that most allocations will
-  * be satisfied from that first zone, so best to examine that zone as
-  * quickly as we can.
-  */
- static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
- {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       nodemask_t *allowednodes;       /* zonelist_cache approximation */
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return NULL;
-       if (time_after(jiffies, zlc->last_full_zap + HZ)) {
-               bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-               zlc->last_full_zap = jiffies;
-       }
-       allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
-                                       &cpuset_current_mems_allowed :
-                                       &node_states[N_MEMORY];
-       return allowednodes;
- }
- /*
-  * Given 'z' scanning a zonelist, run a couple of quick checks to see
-  * if it is worth looking at further for free memory:
-  *  1) Check that the zone isn't thought to be full (doesn't have its
-  *     bit set in the zonelist_cache fullzones BITMAP).
-  *  2) Check that the zones node (obtained from the zonelist_cache
-  *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
-  * Return true (non-zero) if zone is worth looking at further, or
-  * else return false (zero) if it is not.
-  *
-  * This check -ignores- the distinction between various watermarks,
-  * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
-  * found to be full for any variation of these watermarks, it will
-  * be considered full for up to one second by all requests, unless
-  * we are so low on memory on all allowed nodes that we are forced
-  * into the second scan of the zonelist.
-  *
-  * In the second scan we ignore this zonelist cache and exactly
-  * apply the watermarks to all zones, even it is slower to do so.
-  * We are low on memory in the second scan, and should leave no stone
-  * unturned looking for a free page.
-  */
- static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
-                                               nodemask_t *allowednodes)
- {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       int i;                          /* index of *z in zonelist zones */
-       int n;                          /* node that zone *z is on */
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return 1;
-       i = z - zonelist->_zonerefs;
-       n = zlc->z_to_n[i];
-       /* This zone is worth trying if it is allowed but not full */
-       return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
- }
- /*
-  * Given 'z' scanning a zonelist, set the corresponding bit in
-  * zlc->fullzones, so that subsequent attempts to allocate a page
-  * from that zone don't waste time re-examining it.
-  */
- static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
- {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       int i;                          /* index of *z in zonelist zones */
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return;
-       i = z - zonelist->_zonerefs;
-       set_bit(i, zlc->fullzones);
- }
- /*
-  * clear all zones full, called after direct reclaim makes progress so that
-  * a zone that was recently full is not skipped over for up to a second
-  */
- static void zlc_clear_zones_full(struct zonelist *zonelist)
- {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return;
-       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
- }
  static bool zone_local(struct zone *local_zone, struct zone *zone)
  {
        return local_zone->node == zone->node;
@@@ -2416,28 -2474,7 +2474,7 @@@ static bool zone_allows_reclaim(struct 
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
                                RECLAIM_DISTANCE;
  }
  #else /* CONFIG_NUMA */
- static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
- {
-       return NULL;
- }
- static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
-                               nodemask_t *allowednodes)
- {
-       return 1;
- }
- static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
- {
- }
- static void zlc_clear_zones_full(struct zonelist *zonelist)
- {
- }
  static bool zone_local(struct zone *local_zone, struct zone *zone)
  {
        return true;
@@@ -2447,7 -2484,6 +2484,6 @@@ static bool zone_allows_reclaim(struct 
  {
        return true;
  }
  #endif        /* CONFIG_NUMA */
  
  static void reset_alloc_batches(struct zone *preferred_zone)
@@@ -2474,11 -2510,6 +2510,6 @@@ get_page_from_freelist(gfp_t gfp_mask, 
        struct zoneref *z;
        struct page *page = NULL;
        struct zone *zone;
-       nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
-       int zlc_active = 0;             /* set if using zonelist_cache */
-       int did_zlc_setup = 0;          /* just call zlc_setup() one time */
-       bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
-                               (gfp_mask & __GFP_WRITE);
        int nr_fair_skipped = 0;
        bool zonelist_rescan;
  
@@@ -2493,9 -2524,6 +2524,6 @@@ zonelist_scan
                                                                ac->nodemask) {
                unsigned long mark;
  
-               if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
-                       !zlc_zone_worth_trying(zonelist, z, allowednodes))
-                               continue;
                if (cpusets_enabled() &&
                        (alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed(zone, gfp_mask))
                 *
                 * XXX: For now, allow allocations to potentially
                 * exceed the per-zone dirty limit in the slowpath
-                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * (spread_dirty_pages unset) before going into reclaim,
                 * which is important when on a NUMA setup the allowed
                 * zones are together not big enough to reach the
                 * global limit.  The proper fix for these situations
                 * will require awareness of zones in the
                 * dirty-throttling and the flusher threads.
                 */
-               if (consider_zone_dirty && !zone_dirty_ok(zone))
+               if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
                        continue;
  
                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
  
-                       if (IS_ENABLED(CONFIG_NUMA) &&
-                                       !did_zlc_setup && nr_online_nodes > 1) {
-                               /*
-                                * we do zlc_setup if there are multiple nodes
-                                * and before considering the first zone allowed
-                                * by the cpuset.
-                                */
-                               allowednodes = zlc_setup(zonelist, alloc_flags);
-                               zlc_active = 1;
-                               did_zlc_setup = 1;
-                       }
                        if (zone_reclaim_mode == 0 ||
                            !zone_allows_reclaim(ac->preferred_zone, zone))
-                               goto this_zone_full;
-                       /*
-                        * As we may have just activated ZLC, check if the first
-                        * eligible zone has failed zone_reclaim recently.
-                        */
-                       if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
-                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
                                continue;
  
                        ret = zone_reclaim(zone, gfp_mask, order);
                                                ac->classzone_idx, alloc_flags))
                                        goto try_this_zone;
  
-                               /*
-                                * Failed to reclaim enough to meet watermark.
-                                * Only mark the zone full if checking the min
-                                * watermark or if we failed to reclaim just
-                                * 1<<order pages or else the page allocator
-                                * fastpath will prematurely mark zones full
-                                * when the watermark is between the low and
-                                * min watermarks.
-                                */
-                               if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
-                                   ret == ZONE_RECLAIM_SOME)
-                                       goto this_zone_full;
                                continue;
                        }
                }
  
  try_this_zone:
                page = buffered_rmqueue(ac->preferred_zone, zone, order,
-                                               gfp_mask, ac->migratetype);
+                               gfp_mask, alloc_flags, ac->migratetype);
                if (page) {
                        if (prep_new_page(page, order, gfp_mask, alloc_flags))
                                goto try_this_zone;
+                       /*
+                        * If this is a high-order atomic allocation then check
+                        * if the pageblock should be reserved for the future
+                        */
+                       if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+                               reserve_highatomic_pageblock(page, zone, order);
                        return page;
                }
- this_zone_full:
-               if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
-                       zlc_mark_zone_full(zonelist, z);
        }
  
        /*
                        zonelist_rescan = true;
        }
  
-       if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
-               /* Disable zlc cache for second zonelist scan */
-               zlc_active = 0;
-               zonelist_rescan = true;
-       }
        if (zonelist_rescan)
                goto zonelist_scan;
  
@@@ -2669,7 -2663,7 +2663,7 @@@ static DEFINE_RATELIMIT_STATE(nopage_rs
                DEFAULT_RATELIMIT_INTERVAL,
                DEFAULT_RATELIMIT_BURST);
  
- void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
  {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
  
                if (test_thread_flag(TIF_MEMDIE) ||
                    (current->flags & (PF_MEMALLOC | PF_EXITING)))
                        filter &= ~SHOW_MEM_FILTER_NODES;
-       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+       if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                filter &= ~SHOW_MEM_FILTER_NODES;
  
        if (fmt) {
                va_end(args);
        }
  
-       pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+       pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
                current->comm, order, gfp_mask);
  
        dump_stack();
@@@ -2889,19 -2883,17 +2883,17 @@@ __alloc_pages_direct_reclaim(gfp_t gfp_
        if (unlikely(!(*did_some_progress)))
                return NULL;
  
-       /* After successful reclaim, reconsider all zones for allocation */
-       if (IS_ENABLED(CONFIG_NUMA))
-               zlc_clear_zones_full(ac->zonelist);
  retry:
        page = get_page_from_freelist(gfp_mask, order,
                                        alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
  
        /*
         * If an allocation failed after direct reclaim, it could be because
-        * pages are pinned on the per-cpu lists. Drain them and try again
+        * pages are pinned on the per-cpu lists or in high alloc reserves.
+        * Shrink them and try again
         */
        if (!page && !drained) {
+               unreserve_highatomic_pageblock(ac);
                drain_all_pages(NULL);
                drained = true;
                goto retry;
@@@ -2946,7 -2938,6 +2938,6 @@@ static inline in
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
  
        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
        BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
         * The caller may dip into page reserves a bit more if the caller
         * cannot run direct reclaim, or if the caller has realtime scheduling
         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
         */
        alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
-       if (atomic) {
+       if (gfp_mask & __GFP_ATOMIC) {
                /*
                 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
                 * if it can't schedule.
@@@ -2996,11 -2987,16 +2987,16 @@@ bool gfp_pfmemalloc_allowed(gfp_t gfp_m
        return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
  }
  
+ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+ {
+       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+ }
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                                struct alloc_context *ac)
  {
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
        struct page *page = NULL;
        int alloc_flags;
        unsigned long pages_reclaimed = 0;
                return NULL;
        }
  
+       /*
+        * We also sanity check to catch abuse of atomic reserves being used by
+        * callers that are not in atomic context.
+        */
+       if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+                               (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+               gfp_mask &= ~__GFP_ATOMIC;
        /*
         * If this allocation cannot block and it is for a specific node, then
         * fail early.  There's no need to wakeup kswapd or retry for a
         * speculative node-specific allocation.
         */
-       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
+       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
                goto nopage;
  
  retry:
-       if (!(gfp_mask & __GFP_NO_KSWAPD))
+       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                wake_all_kswapds(order, ac);
  
        /*
                }
        }
  
-       /* Atomic allocations - we can't balance anything */
-       if (!wait) {
+       /* Caller is not willing to reclaim, we can't balance anything */
+       if (!can_direct_reclaim) {
                /*
                 * All existing users of the deprecated __GFP_NOFAIL are
                 * blockable, so warn of any new users that actually allow this
                goto got_pg;
  
        /* Checks for THP-specific high-order allocations */
-       if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+       if (is_thp_gfp_mask(gfp_mask)) {
                /*
                 * If compaction is deferred for high-order allocations, it is
                 * because sync compaction recently failed. If this is the case
         * fault, so use asynchronous memory compaction for THP unless it is
         * khugepaged trying to collapse.
         */
-       if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
-                                               (current->flags & PF_KTHREAD))
+       if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
                migration_mode = MIGRATE_SYNC_LIGHT;
  
        /* Try direct reclaim and then allocating */
@@@ -3210,7 -3213,7 +3213,7 @@@ __alloc_pages_nodemask(gfp_t gfp_mask, 
  
        lockdep_trace_alloc(gfp_mask);
  
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
  
        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;
@@@ -3231,6 -3234,10 +3234,10 @@@ retry_cpuset
  
        /* We set it here, as __alloc_pages_slowpath might have changed it */
        ac.zonelist = zonelist;
+       /* Dirty zone balancing only done in the fast path */
+       ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
        /* The preferred zone is used for statistics later */
        preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                                ac.nodemask ? : &cpuset_current_mems_allowed,
                 * complete.
                 */
                alloc_mask = memalloc_noio_flags(gfp_mask);
+               ac.spread_dirty_pages = false;
  
                page = __alloc_pages_slowpath(alloc_mask, order, &ac);
        }
@@@ -3428,24 -3436,24 +3436,24 @@@ EXPORT_SYMBOL(__free_page_frag)
  struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
  {
        struct page *page;
-       struct mem_cgroup *memcg = NULL;
  
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
        page = alloc_pages(gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
        return page;
  }
  
  struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
  {
        struct page *page;
-       struct mem_cgroup *memcg = NULL;
  
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
        page = alloc_pages_node(nid, gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
        return page;
  }
  
   */
  void __free_kmem_pages(struct page *page, unsigned int order)
  {
-       memcg_kmem_uncharge_pages(page, order);
+       memcg_kmem_uncharge(page, order);
        __free_pages(page, order);
  }
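
For illustration, a hypothetical consumer pairing the charged kmem page helpers touched above; example_kmem_buffer() and example_kmem_buffer_free() are made up for this sketch.

#include <linux/gfp.h>
#include <linux/mm.h>

static void *example_kmem_buffer(unsigned int order)
{
        struct page *page = alloc_kmem_pages(GFP_KERNEL, order);

        return page ? page_address(page) : NULL;
}

static void example_kmem_buffer_free(void *buf, unsigned int order)
{
        if (buf)
                free_kmem_pages((unsigned long)buf, order);
}
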
  
@@@ -3467,7 -3475,8 +3475,8 @@@ void free_kmem_pages(unsigned long addr
        }
  }
  
- static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+ static void *make_alloc_exact(unsigned long addr, unsigned int order,
+               size_t size)
  {
        if (addr) {
                unsigned long alloc_end = addr + (PAGE_SIZE << order);
@@@ -3517,7 -3526,7 +3526,7 @@@ EXPORT_SYMBOL(alloc_pages_exact)
   */
  void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
  {
-       unsigned order = get_order(size);
+       unsigned int order = get_order(size);
        struct page *p = alloc_pages_node(nid, gfp_mask, order);
        if (!p)
                return NULL;
@@@ -3666,7 -3675,6 +3675,6 @@@ static void show_migration_types(unsign
                [MIGRATE_UNMOVABLE]     = 'U',
                [MIGRATE_RECLAIMABLE]   = 'E',
                [MIGRATE_MOVABLE]       = 'M',
-               [MIGRATE_RESERVE]       = 'R',
  #ifdef CONFIG_CMA
                [MIGRATE_CMA]           = 'C',
  #endif
@@@ -3819,7 -3827,8 +3827,8 @@@ void show_free_areas(unsigned int filte
        }
  
        for_each_populated_zone(zone) {
-               unsigned long nr[MAX_ORDER], flags, order, total = 0;
+               unsigned int order;
+               unsigned long nr[MAX_ORDER], flags, total = 0;
                unsigned char types[MAX_ORDER];
  
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@@ -4168,7 -4177,7 +4177,7 @@@ static void build_zonelists(pg_data_t *
        nodemask_t used_mask;
        int local_node, prev_node;
        struct zonelist *zonelist;
-       int order = current_zonelist_order;
+       unsigned int order = current_zonelist_order;
  
        /* initialize zonelists */
        for (i = 0; i < MAX_ZONELISTS; i++) {
        build_thisnode_zonelists(pgdat);
  }
  
- /* Construct the zonelist performance cache - see further mmzone.h */
- static void build_zonelist_cache(pg_data_t *pgdat)
- {
-       struct zonelist *zonelist;
-       struct zonelist_cache *zlc;
-       struct zoneref *z;
-       zonelist = &pgdat->node_zonelists[0];
-       zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
-       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-       for (z = zonelist->_zonerefs; z->zone; z++)
-               zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
- }
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * Return node id of node used for "local" allocations.
@@@ -4286,12 -4281,6 +4281,6 @@@ static void build_zonelists(pg_data_t *
        zonelist->_zonerefs[j].zone_idx = 0;
  }
  
- /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
- static void build_zonelist_cache(pg_data_t *pgdat)
- {
-       pgdat->node_zonelists[0].zlcache_ptr = NULL;
- }
  #endif        /* CONFIG_NUMA */
  
  /*
@@@ -4332,14 -4321,12 +4321,12 @@@ static int __build_all_zonelists(void *
  
        if (self && !node_online(self->node_id)) {
                build_zonelists(self);
-               build_zonelist_cache(self);
        }
  
        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
  
                build_zonelists(pgdat);
-               build_zonelist_cache(pgdat);
        }
  
        /*
                /*
                 * We now know the "local memory node" for each node--
                 * i.e., the node of the first zone in the generic zonelist.
-                * Set up numa_mem percpu variable for on-line cpus.  During
-                * boot, only the boot cpu should be on-line;  we'll init the
-                * secondary cpus' numa_mem as they come on-line.  During
-                * node/memory hotplug, we'll fixup all on-line cpus.
+                * Set up numa_mem percpu variable for all possible cpus
+                * if associated node has been onlined.
                 */
-               if (cpu_online(cpu))
+               if (node_online(cpu_to_node(cpu)))
                        set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+               else
+                       set_cpu_numa_mem(cpu, NUMA_NO_NODE);
  #endif
        }
  
@@@ -4498,120 -4485,6 +4485,6 @@@ static inline unsigned long wait_table_
        return ffz(~size);
  }
  
- /*
-  * Check if a pageblock contains reserved pages
-  */
- static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
- {
-       unsigned long pfn;
-       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
-                       return 1;
-       }
-       return 0;
- }
- /*
-  * Mark a number of pageblocks as MIGRATE_RESERVE. The number
-  * of blocks reserved is based on min_wmark_pages(zone). The memory within
-  * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
-  * higher will lead to a bigger reserve which will get freed as contiguous
-  * blocks as reclaim kicks in
-  */
- static void setup_zone_migrate_reserve(struct zone *zone)
- {
-       unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
-       struct page *page;
-       unsigned long block_migratetype;
-       int reserve;
-       int old_reserve;
-       /*
-        * Get the start pfn, end pfn and the number of blocks to reserve
-        * We have to be careful to be aligned to pageblock_nr_pages to
-        * make sure that we always check pfn_valid for the first page in
-        * the block.
-        */
-       start_pfn = zone->zone_start_pfn;
-       end_pfn = zone_end_pfn(zone);
-       start_pfn = roundup(start_pfn, pageblock_nr_pages);
-       reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
-                                                       pageblock_order;
-       /*
-        * Reserve blocks are generally in place to help high-order atomic
-        * allocations that are short-lived. A min_free_kbytes value that
-        * would result in more than 2 reserve blocks for atomic allocations
-        * is assumed to be in place to help anti-fragmentation for the
-        * future allocation of hugepages at runtime.
-        */
-       reserve = min(2, reserve);
-       old_reserve = zone->nr_migrate_reserve_block;
-       /* When memory hot-add, we almost always need to do nothing */
-       if (reserve == old_reserve)
-               return;
-       zone->nr_migrate_reserve_block = reserve;
-       for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
-               if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
-                       return;
-               if (!pfn_valid(pfn))
-                       continue;
-               page = pfn_to_page(pfn);
-               /* Watch out for overlapping nodes */
-               if (page_to_nid(page) != zone_to_nid(zone))
-                       continue;
-               block_migratetype = get_pageblock_migratetype(page);
-               /* Only test what is necessary when the reserves are not met */
-               if (reserve > 0) {
-                       /*
-                        * Blocks with reserved pages will never free, skip
-                        * them.
-                        */
-                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-                       if (pageblock_is_reserved(pfn, block_end_pfn))
-                               continue;
-                       /* If this block is reserved, account for it */
-                       if (block_migratetype == MIGRATE_RESERVE) {
-                               reserve--;
-                               continue;
-                       }
-                       /* Suitable for reserving if this block is movable */
-                       if (block_migratetype == MIGRATE_MOVABLE) {
-                               set_pageblock_migratetype(page,
-                                                       MIGRATE_RESERVE);
-                               move_freepages_block(zone, page,
-                                                       MIGRATE_RESERVE);
-                               reserve--;
-                               continue;
-                       }
-               } else if (!old_reserve) {
-                       /*
-                        * At boot time we don't need to scan the whole zone
-                        * for turning off MIGRATE_RESERVE.
-                        */
-                       break;
-               }
-               /*
-                * If the reserve is met and this is a previous reserved block,
-                * take it back
-                */
-               if (block_migratetype == MIGRATE_RESERVE) {
-                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-                       move_freepages_block(zone, page, MIGRATE_MOVABLE);
-               }
-       }
- }
  /*
   * Initially all pages are reserved - free ones are freed
   * up by free_all_bootmem() once the early boot process is
@@@ -4651,9 -4524,7 +4524,7 @@@ void __meminit memmap_init_zone(unsigne
                 * movable at startup. This will force kernel allocations
                 * to reserve their blocks rather than leaking throughout
                 * the address space during boot when many long-lived
-                * kernel allocations are made. Later some blocks near
-                * the start are marked MIGRATE_RESERVE by
-                * setup_zone_migrate_reserve()
+                * kernel allocations are made.
                 *
                 * bitmap is created for zone's valid pfn range. but memmap
                 * can be created for invalid pages (for alignment)
@@@ -4900,8 -4771,7 +4771,7 @@@ static __meminit void zone_pcp_init(str
  
  int __meminit init_currently_empty_zone(struct zone *zone,
                                        unsigned long zone_start_pfn,
-                                       unsigned long size,
-                                       enum memmap_context context)
+                                       unsigned long size)
  {
        struct pglist_data *pgdat = zone->zone_pgdat;
        int ret;
@@@ -5413,8 -5283,7 +5283,7 @@@ static void __paginginit free_area_init
  
                set_pageblock_order();
                setup_usemap(pgdat, zone, zone_start_pfn, size);
-               ret = init_currently_empty_zone(zone, zone_start_pfn,
-                                               size, MEMMAP_EARLY);
+               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
                BUG_ON(ret);
                memmap_init(size, nid, j, zone_start_pfn);
                zone_start_pfn += size;
  
  static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
  {
+       unsigned long __maybe_unused offset = 0;
        /* Skip empty nodes */
        if (!pgdat->node_spanned_pages)
                return;
                 * for the buddy allocator to function correctly.
                 */
                start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+               offset = pgdat->node_start_pfn - start;
                end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
                if (!map)
                        map = memblock_virt_alloc_node_nopanic(size,
                                                               pgdat->node_id);
-               pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
+               pgdat->node_mem_map = map + offset;
        }
  #ifndef CONFIG_NEED_MULTIPLE_NODES
        /*
         */
        if (pgdat == NODE_DATA(0)) {
                mem_map = NODE_DATA(0)->node_mem_map;
- #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
-                       mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+                       mem_map -= offset;
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
        }
  #endif
@@@ -5668,13 -5540,17 +5540,17 @@@ static void __init find_zone_movable_pf
                 */
                required_movablecore =
                        roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+               required_movablecore = min(totalpages, required_movablecore);
                corepages = totalpages - required_movablecore;
  
                required_kernelcore = max(required_kernelcore, corepages);
        }
  
-       /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-       if (!required_kernelcore)
+       /*
+        * If kernelcore was not specified or kernelcore size is larger
+        * than totalpages, there is no ZONE_MOVABLE.
+        */
+       if (!required_kernelcore || required_kernelcore >= totalpages)
                goto out;
  
        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
@@@ -6209,7 -6085,6 +6085,6 @@@ static void __setup_per_zone_wmarks(voi
                        high_wmark_pages(zone) - low_wmark_pages(zone) -
                        atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
  
-               setup_zone_migrate_reserve(zone);
                spin_unlock_irqrestore(&zone->lock, flags);
        }
  
@@@ -6831,7 -6706,8 +6706,8 @@@ int alloc_contig_range(unsigned long st
                       unsigned migratetype)
  {
        unsigned long outer_start, outer_end;
-       int ret = 0, order;
+       unsigned int order;
+       int ret = 0;
  
        struct compact_control cc = {
                .nr_migratepages = 0,
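
The find_zone_movable_pfns_for_nodes() hunk above clamps movablecore= to the number of
available pages before computing corepages, so the unsigned subtraction cannot wrap when a
user requests more movable memory than exists. A minimal userspace sketch of just that
arithmetic, with hypothetical page counts (this is not the kernel function, only its
clamping logic lifted out for illustration):

#include <stdio.h>

/* Hypothetical values for illustration only. */
#define MAX_ORDER_NR_PAGES 1024UL

static unsigned long roundup_pages(unsigned long x, unsigned long align)
{
        return ((x + align - 1) / align) * align;
}

int main(void)
{
        unsigned long totalpages = 1UL << 20;            /* ~4GB of 4K pages */
        unsigned long required_movablecore = 3UL << 20;  /* more than exists */
        unsigned long required_kernelcore = 0;
        unsigned long corepages;

        required_movablecore = roundup_pages(required_movablecore,
                                             MAX_ORDER_NR_PAGES);
        /* The new clamp: without it, corepages below wraps around. */
        if (required_movablecore > totalpages)
                required_movablecore = totalpages;
        corepages = totalpages - required_movablecore;
        if (corepages > required_kernelcore)
                required_kernelcore = corepages;

        /* Mirrors the new "kernelcore >= totalpages" bail-out above. */
        if (!required_kernelcore || required_kernelcore >= totalpages)
                printf("no ZONE_MOVABLE\n");
        else
                printf("kernelcore=%lu pages, movable=%lu pages\n",
                       required_kernelcore,
                       totalpages - required_kernelcore);
        return 0;
}
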
diff --combined mm/pgtable-generic.c
index 7d3db0247983b22b121290c2203ba2c2fb544ec0,89b150f8c920f200310641ed6a70e56150d06a6c..69261d4c774dd3d9894447bdbf6342aac746ad34
@@@ -57,59 -57,35 +57,59 @@@ int ptep_set_access_flags(struct vm_are
  }
  #endif
  
 +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
 +int ptep_clear_flush_young(struct vm_area_struct *vma,
 +                         unsigned long address, pte_t *ptep)
 +{
 +      int young;
 +      young = ptep_test_and_clear_young(vma, address, ptep);
 +      if (young)
 +              flush_tlb_page(vma, address);
 +      return young;
 +}
 +#endif
 +
 +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
 +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 +                     pte_t *ptep)
 +{
 +      struct mm_struct *mm = vma->vm_mm;
 +      pte_t pte;
 +      pte = ptep_get_and_clear(mm, address, ptep);
 +      if (pte_accessible(mm, pte))
 +              flush_tlb_page(vma, address);
 +      return pte;
 +}
 +#endif
 +
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +
 +#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
 +
 +/*
 + * ARCHes with special requirements for evicting THP backing TLB entries can
 + * implement this. Otherwise, it can also help optimize the normal TLB flush
 + * in the THP regime: the stock flush_tlb_range() typically has an
 + * optimization to nuke the entire TLB if the flush span is greater than a
 + * threshold, which will likely be true for a single huge page. Thus a single
 + * THP flush will invalidate the entire TLB, which is not desirable.
 + * e.g. see arch/arc: flush_pmd_tlb_range
 + */
 +#define flush_pmd_tlb_range(vma, addr, end)   flush_tlb_range(vma, addr, end)
 +#endif
 +
  #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
  int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
  {
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        int changed = !pmd_same(*pmdp, entry);
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (changed) {
                set_pmd_at(vma->vm_mm, address, pmdp, entry);
 -              flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +              flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }
        return changed;
 -#else /* CONFIG_TRANSPARENT_HUGEPAGE */
 -      BUG();
 -      return 0;
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 -}
 -#endif
 -
 -#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
 -int ptep_clear_flush_young(struct vm_area_struct *vma,
 -                         unsigned long address, pte_t *ptep)
 -{
 -      int young;
 -      young = ptep_test_and_clear_young(vma, address, ptep);
 -      if (young)
 -              flush_tlb_page(vma, address);
 -      return young;
  }
  #endif
  
@@@ -118,15 -94,33 +118,15 @@@ int pmdp_clear_flush_young(struct vm_ar
                           unsigned long address, pmd_t *pmdp)
  {
        int young;
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 -#else
 -      BUG();
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
 -              flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +              flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return young;
  }
  #endif
  
 -#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
 -pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 -                     pte_t *ptep)
 -{
 -      struct mm_struct *mm = (vma)->vm_mm;
 -      pte_t pte;
 -      pte = ptep_get_and_clear(mm, address, ptep);
 -      if (pte_accessible(mm, pte))
 -              flush_tlb_page(vma, address);
 -      return pte;
 -}
 -#endif
 -
  #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp)
  {
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(!pmd_trans_huge(*pmdp));
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 -      flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +      flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
- #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
- {
-       pmd_t pmd = pmd_mksplitting(*pmdp);
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-       /* tlb flush only to serialize against gup-fast */
-       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- }
- #endif
  #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
  {
                list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
        pmd_huge_pte(mm, pmdp) = pgtable;
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /* no "address" argument so destroys page coloring of some arch */
  pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
  {
        }
        return pgtable;
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef __HAVE_ARCH_PMDP_INVALIDATE
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
  {
        pmd_t entry = *pmdp;
        set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
 -      flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +      flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef pmdp_collapse_flush
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
  {
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 -      flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +      flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
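
The flush_pmd_tlb_range() fallback introduced above lets an architecture with special THP
TLB-eviction requirements supply its own implementation instead of the generic
flush_tlb_range(), as the comment's arch/arc reference suggests. A hedged sketch of what
such an override might look like — my_arch_flush_huge_tlb_entry() and the file placement
are placeholders, not the actual ARC code:

/*
 * In the architecture's pgtable header, before the generic code is pulled
 * in: announce the override so mm/pgtable-generic.c skips its #define.
 */
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
void flush_pmd_tlb_range(struct vm_area_struct *vma,
                         unsigned long start, unsigned long end);

/*
 * In the architecture's TLB code: flush one huge-page entry per PMD
 * instead of letting a wide flush_tlb_range() wipe the whole TLB.
 */
#include <linux/mm.h>
#include <linux/huge_mm.h>

void flush_pmd_tlb_range(struct vm_area_struct *vma,
                         unsigned long start, unsigned long end)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += HPAGE_PMD_SIZE)
                my_arch_flush_huge_tlb_entry(vma->vm_mm, addr); /* placeholder */
}
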
diff --combined mm/vmscan.c
index e7057af54b6e267558a99749fac80dc77dd7855f,f9cbe39d020bdbdf0c87055ea09236fb9a45cdc0..9b52ecf9119420bef8ce4ea2d503eb3ef4754c7f
@@@ -175,7 -175,7 +175,7 @@@ static bool sane_reclaim(struct scan_co
        if (!memcg)
                return true;
  #ifdef CONFIG_CGROUP_WRITEBACK
 -      if (cgroup_on_dfl(memcg->css.cgroup))
 +      if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return true;
  #endif
        return false;
@@@ -194,7 -194,7 +194,7 @@@ static bool sane_reclaim(struct scan_co
  
  static unsigned long zone_reclaimable_pages(struct zone *zone)
  {
-       int nr;
+       unsigned long nr;
  
        nr = zone_page_state(zone, NR_ACTIVE_FILE) +
             zone_page_state(zone, NR_INACTIVE_FILE);
@@@ -796,6 -796,8 +796,8 @@@ static enum page_references page_check_
        int referenced_ptes, referenced_page;
        unsigned long vm_flags;
  
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
        referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
                                          &vm_flags);
        referenced_page = TestClearPageReferenced(page);
@@@ -906,6 -908,7 +908,7 @@@ static unsigned long shrink_page_list(s
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
+               bool freeable = false;
  
                cond_resched();
  
                                goto keep_locked;
                        if (!add_to_swap(page, page_list))
                                goto activate_locked;
+                       freeable = true;
                        may_enter_fs = 1;
                        /* Adding to swap updated mapping */
                        mapping = page_mapping(page);
                }
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page,
-                                       ttu_flags|TTU_BATCH_FLUSH)) {
+                       switch (try_to_unmap(page, freeable ?
+                                       ttu_flags | TTU_BATCH_FLUSH | TTU_FREE :
+                                       ttu_flags | TTU_BATCH_FLUSH)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
                 * we obviously don't have to worry about waking up a process
                 * waiting on the page lock, because there are no references.
                 */
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
  free_it:
+               if (freeable && !PageDirty(page))
+                       count_vm_event(PGLAZYFREED);
                nr_reclaimed++;
  
                /*
@@@ -1476,7 -1483,7 +1483,7 @@@ static int too_many_isolated(struct zon
         * won't get blocked by normal direct-reclaimers, forming a circular
         * deadlock.
         */
-       if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+       if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                inactive >>= 3;
  
        return isolated > inactive;
@@@ -1859,17 -1866,14 +1866,14 @@@ static void shrink_active_list(unsigne
  }
  
  #ifdef CONFIG_SWAP
- static int inactive_anon_is_low_global(struct zone *zone)
+ static bool inactive_anon_is_low_global(struct zone *zone)
  {
        unsigned long active, inactive;
  
        active = zone_page_state(zone, NR_ACTIVE_ANON);
        inactive = zone_page_state(zone, NR_INACTIVE_ANON);
  
-       if (inactive * zone->inactive_ratio < active)
-               return 1;
-       return 0;
+       return inactive * zone->inactive_ratio < active;
  }
  
  /**
   * Returns true if the zone does not have enough inactive anon pages,
   * meaning some active anon pages need to be deactivated.
   */
- static int inactive_anon_is_low(struct lruvec *lruvec)
+ static bool inactive_anon_is_low(struct lruvec *lruvec)
  {
        /*
         * If we don't have swap space, anonymous page deactivation
         * is pointless.
         */
        if (!total_swap_pages)
-               return 0;
+               return false;
  
        if (!mem_cgroup_disabled())
                return mem_cgroup_inactive_anon_is_low(lruvec);
        return inactive_anon_is_low_global(lruvec_zone(lruvec));
  }
  #else
- static inline int inactive_anon_is_low(struct lruvec *lruvec)
+ static inline bool inactive_anon_is_low(struct lruvec *lruvec)
  {
-       return 0;
+       return false;
  }
  #endif
  
   * This uses a different ratio than the anonymous pages, because
   * the page cache uses a use-once replacement algorithm.
   */
- static int inactive_file_is_low(struct lruvec *lruvec)
+ static bool inactive_file_is_low(struct lruvec *lruvec)
  {
        unsigned long inactive;
        unsigned long active;
        return active > inactive;
  }
  
- static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
+ static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
  {
        if (is_file_lru(lru))
                return inactive_file_is_low(lruvec);
@@@ -2480,7 -2484,7 +2484,7 @@@ static inline bool compaction_ready(str
        balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
                        zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
        watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
  
        /*
         * If compaction is deferred, reclaim up to a point where
@@@ -2963,7 -2967,7 +2967,7 @@@ static bool zone_balanced(struct zone *
                          unsigned long balance_gap, int classzone_idx)
  {
        if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-                                   balance_gap, classzone_idx, 0))
+                                   balance_gap, classzone_idx))
                return false;
  
        if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
@@@ -3696,10 -3700,10 +3700,10 @@@ static inline unsigned long zone_unmapp
  }
  
  /* Work out how many page cache pages we can reclaim in this reclaim_mode */
- static long zone_pagecache_reclaimable(struct zone *zone)
+ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
  {
-       long nr_pagecache_reclaimable;
-       long delta = 0;
+       unsigned long nr_pagecache_reclaimable;
+       unsigned long delta = 0;
  
        /*
         * If RECLAIM_UNMAP is set, then all file pages are considered
@@@ -3794,7 -3798,7 +3798,7 @@@ int zone_reclaim(struct zone *zone, gfp
        /*
         * Do not scan if the allocation should not be delayed.
         */
-       if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+       if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
                return ZONE_RECLAIM_NOSCAN;
  
        /*
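
The zone_reclaim() change above is one instance of a conversion repeated throughout this
merge (see also net/core/sock.c, net/netlink, net/rds and net/rxrpc below): the old
__GFP_WAIT test becomes either gfpflags_allow_blocking() or an explicit
__GFP_DIRECT_RECLAIM check. A minimal sketch of the helper as it is commonly described —
a thin test of __GFP_DIRECT_RECLAIM; treat the exact body as an assumption rather than a
quote of include/linux/gfp.h:

/* "May this allocation sleep to perform direct reclaim?" */
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
        return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}

Callers that previously masked off __GFP_WAIT to stay non-blocking now clear
__GFP_DIRECT_RECLAIM instead, which still leaves the kswapd-wakeup bit set — the pattern
spelled out by the "Avoid direct reclaim but allow kswapd to wake" comment in
skb_page_frag_refill() below.
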
diff --combined net/core/sock.c
index 0ef30aa90132c7a1a04971c773d4de8ed4ac146b,8cab9d90b0185cfe216d75ecf932b75b31b0a222..a2040bb09916e8f54f7a30e1990944116ba255c8
@@@ -422,25 -422,13 +422,25 @@@ static void sock_warn_obsolete_bsdism(c
        }
  }
  
 +static bool sock_needs_netstamp(const struct sock *sk)
 +{
 +      switch (sk->sk_family) {
 +      case AF_UNSPEC:
 +      case AF_UNIX:
 +              return false;
 +      default:
 +              return true;
 +      }
 +}
 +
  #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
  
  static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
  {
        if (sk->sk_flags & flags) {
                sk->sk_flags &= ~flags;
 -              if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 +              if (sock_needs_netstamp(sk) &&
 +                  !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
                        net_disable_timestamp();
        }
  }
@@@ -1000,10 -988,6 +1000,10 @@@ set_rcvbuf
                                         sk->sk_max_pacing_rate);
                break;
  
 +      case SO_INCOMING_CPU:
 +              sk->sk_incoming_cpu = val;
 +              break;
 +
        default:
                ret = -ENOPROTOOPT;
                break;
@@@ -1594,8 -1578,7 +1594,8 @@@ struct sock *sk_clone_lock(const struc
                if (newsk->sk_prot->sockets_allocated)
                        sk_sockets_allocated_inc(newsk);
  
 -              if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
 +              if (sock_needs_netstamp(sk) &&
 +                  newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                        net_enable_timestamp();
        }
  out:
@@@ -1869,32 -1852,6 +1869,32 @@@ struct sk_buff *sock_alloc_send_skb(str
  }
  EXPORT_SYMBOL(sock_alloc_send_skb);
  
 +int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 +                 struct sockcm_cookie *sockc)
 +{
 +      struct cmsghdr *cmsg;
 +
 +      for_each_cmsghdr(cmsg, msg) {
 +              if (!CMSG_OK(msg, cmsg))
 +                      return -EINVAL;
 +              if (cmsg->cmsg_level != SOL_SOCKET)
 +                      continue;
 +              switch (cmsg->cmsg_type) {
 +              case SO_MARK:
 +                      if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 +                              return -EPERM;
 +                      if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
 +                              return -EINVAL;
 +                      sockc->mark = *(u32 *)CMSG_DATA(cmsg);
 +                      break;
 +              default:
 +                      return -EINVAL;
 +              }
 +      }
 +      return 0;
 +}
 +EXPORT_SYMBOL(sock_cmsg_send);
 +
  /* On 32bit arches, an skb frag is limited to 2^15 */
  #define SKB_FRAG_PAGE_ORDER   get_order(32768)
  
@@@ -1922,8 -1879,10 +1922,10 @@@ bool skb_page_frag_refill(unsigned int 
  
        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER) {
-               pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
-                                         __GFP_NOWARN | __GFP_NORETRY,
+               /* Avoid direct reclaim but allow kswapd to wake */
+               pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+                                         __GFP_COMP | __GFP_NOWARN |
+                                         __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
@@@ -2396,7 -2355,6 +2398,7 @@@ void sock_init_data(struct socket *sock
  
        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
 +      sk->sk_incoming_cpu = -1;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
@@@ -2523,8 -2481,7 +2525,8 @@@ void sock_enable_timestamp(struct sock 
                 * time stamping, but time stamping might have been on
                 * already because of the other one
                 */
 -              if (!(previous_flags & SK_FLAGS_TIMESTAMP))
 +              if (sock_needs_netstamp(sk) &&
 +                  !(previous_flags & SK_FLAGS_TIMESTAMP))
                        net_enable_timestamp();
        }
  }
@@@ -2803,7 -2760,7 +2805,7 @@@ static int req_prot_init(const struct p
  
        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
 -                                         0, NULL);
 +                                         prot->slab_flags, NULL);
  
        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
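
sock_cmsg_send(), added and exported above, walks the SOL_SOCKET control messages of a
sendmsg() call and currently understands only SO_MARK, filling a struct sockcm_cookie for
the caller. A hedged sketch of how a protocol sendmsg path could consume it —
example_sendmsg() is a made-up name, not code from this series:

static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        struct sockcm_cookie sockc = { .mark = sk->sk_mark };
        int err;

        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (err)
                        return err;   /* -EINVAL or -EPERM from the helper */
        }

        /*
         * Build and queue the skb as usual, then stamp it with the
         * per-call mark instead of the socket-wide sk->sk_mark:
         *
         *      skb->mark = sockc.mark;
         */
        return len;
}
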
diff --combined net/netlink/af_netlink.c
index fafe33bdb61989e680dc4b26dbe99dcc1d4064b5,ab061cca59d2265d018fc109c2becd2ca626306c..59651af8cc2705b39f3ad1ea71ab0b161668af02
@@@ -2116,7 -2116,7 +2116,7 @@@ int netlink_broadcast_filtered(struct s
        consume_skb(info.skb2);
  
        if (info.delivered) {
-               if (info.congested && (allocation & __GFP_WAIT))
+               if (info.congested && gfpflags_allow_blocking(allocation))
                        yield();
                return 0;
        }
@@@ -2371,7 -2371,7 +2371,7 @@@ static int netlink_getsockopt(struct so
                int pos, idx, shift;
  
                err = 0;
 -              netlink_table_grab();
 +              netlink_lock_table();
                for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
                        if (len - pos < sizeof(u32))
                                break;
                }
                if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen))
                        err = -EFAULT;
 -              netlink_table_ungrab();
 +              netlink_unlock_table();
                break;
        }
        case NETLINK_CAP_ACK:
diff --combined net/openvswitch/flow.c
index 0ea128eeeab2f835221b2068b1098a81fe1d731d,95cd5fd3a78046b232454431c3632403239b51ec..619f1d710eac0d9f9aab37d5d2340f91fa92b639
@@@ -71,7 -71,7 +71,7 @@@ void ovs_flow_stats_update(struct sw_fl
                           const struct sk_buff *skb)
  {
        struct flow_stats *stats;
-       int node = numa_node_id();
+       int node = numa_mem_id();
        int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
  
        stats = rcu_dereference(flow->stats[node]);
@@@ -698,7 -698,8 +698,7 @@@ int ovs_flow_key_extract(const struct i
  {
        /* Extract metadata from packet. */
        if (tun_info) {
 -              if (ip_tunnel_info_af(tun_info) != AF_INET)
 -                      return -EINVAL;
 +              key->tun_proto = ip_tunnel_info_af(tun_info);
                memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
  
                if (tun_info->options_len) {
                        key->tun_opts_len = 0;
                }
        } else  {
 +              key->tun_proto = 0;
                key->tun_opts_len = 0;
                memset(&key->tun_key, 0, sizeof(key->tun_key));
        }
diff --combined net/rds/ib_recv.c
index 96744b75db9387aa2ef3b28d8ea103d81997a9ab,dcfb59775acc2bccdce963a06998a1b2d1fa70a0..977fb86065b75dbef916bd0acb9b94876c0f5c04
@@@ -305,7 -305,7 +305,7 @@@ static int rds_ib_recv_refill_one(struc
        gfp_t slab_mask = GFP_NOWAIT;
        gfp_t page_mask = GFP_NOWAIT;
  
-       if (gfp & __GFP_WAIT) {
+       if (gfp & __GFP_DIRECT_RECLAIM) {
                slab_mask = GFP_KERNEL;
                page_mask = GFP_HIGHUSER;
        }
@@@ -379,7 -379,7 +379,7 @@@ void rds_ib_recv_refill(struct rds_conn
        struct ib_recv_wr *failed_wr;
        unsigned int posted = 0;
        int ret = 0;
-       bool can_wait = !!(gfp & __GFP_WAIT);
+       bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
        u32 pos;
  
        /* the goal here is to just make sure that someone, somewhere
@@@ -596,7 -596,8 +596,7 @@@ void rds_ib_recv_init_ack(struct rds_ib
   * wr_id and avoids working with the ring in that case.
   */
  #ifndef KERNEL_HAS_ATOMIC64
 -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
 -                              int ack_required)
 +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
  {
        unsigned long flags;
  
@@@ -621,7 -622,8 +621,7 @@@ static u64 rds_ib_get_ack(struct rds_ib
        return seq;
  }
  #else
 -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
 -                              int ack_required)
 +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
  {
        atomic64_set(&ic->i_ack_next, seq);
        if (ack_required) {
@@@ -828,6 -830,20 +828,6 @@@ static void rds_ib_cong_recv(struct rds
        rds_cong_map_updated(map, uncongested);
  }
  
 -/*
 - * Rings are posted with all the allocations they'll need to queue the
 - * incoming message to the receiving socket so this can't fail.
 - * All fragments start with a header, so we can make sure we're not receiving
 - * garbage, and we can tell a small 8 byte fragment from an ACK frame.
 - */
 -struct rds_ib_ack_state {
 -      u64             ack_next;
 -      u64             ack_recv;
 -      unsigned int    ack_required:1;
 -      unsigned int    ack_next_valid:1;
 -      unsigned int    ack_recv_valid:1;
 -};
 -
  static void rds_ib_process_recv(struct rds_connection *conn,
                                struct rds_ib_recv_work *recv, u32 data_len,
                                struct rds_ib_ack_state *state)
        }
  }
  
 -/*
 - * Plucking the oldest entry from the ring can be done concurrently with
 - * the thread refilling the ring.  Each ring operation is protected by
 - * spinlocks and the transient state of refilling doesn't change the
 - * recording of which entry is oldest.
 - *
 - * This relies on IB only calling one cq comp_handler for each cq so that
 - * there will only be one caller of rds_recv_incoming() per RDS connection.
 - */
 -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
 -{
 -      struct rds_connection *conn = context;
 -      struct rds_ib_connection *ic = conn->c_transport_data;
 -
 -      rdsdebug("conn %p cq %p\n", conn, cq);
 -
 -      rds_ib_stats_inc(s_ib_rx_cq_call);
 -
 -      tasklet_schedule(&ic->i_recv_tasklet);
 -}
 -
 -static inline void rds_poll_cq(struct rds_ib_connection *ic,
 -                             struct rds_ib_ack_state *state)
 +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
 +                           struct ib_wc *wc,
 +                           struct rds_ib_ack_state *state)
  {
        struct rds_connection *conn = ic->conn;
 -      struct ib_wc wc;
        struct rds_ib_recv_work *recv;
  
 -      while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
 -              rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
 -                       (unsigned long long)wc.wr_id, wc.status,
 -                       ib_wc_status_msg(wc.status), wc.byte_len,
 -                       be32_to_cpu(wc.ex.imm_data));
 -              rds_ib_stats_inc(s_ib_rx_cq_event);
 +      rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
 +               (unsigned long long)wc->wr_id, wc->status,
 +               ib_wc_status_msg(wc->status), wc->byte_len,
 +               be32_to_cpu(wc->ex.imm_data));
  
 -              recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 +      rds_ib_stats_inc(s_ib_rx_cq_event);
 +      recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 +      ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1,
 +                      DMA_FROM_DEVICE);
  
 -              ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 -
 -              /*
 -               * Also process recvs in connecting state because it is possible
 -               * to get a recv completion _before_ the rdmacm ESTABLISHED
 -               * event is processed.
 -               */
 -              if (wc.status == IB_WC_SUCCESS) {
 -                      rds_ib_process_recv(conn, recv, wc.byte_len, state);
 -              } else {
 -                      /* We expect errors as the qp is drained during shutdown */
 -                      if (rds_conn_up(conn) || rds_conn_connecting(conn))
 -                              rds_ib_conn_error(conn, "recv completion on %pI4 had "
 -                                                "status %u (%s), disconnecting and "
 -                                                "reconnecting\n", &conn->c_faddr,
 -                                                wc.status,
 -                                                ib_wc_status_msg(wc.status));
 -              }
 -
 -              /*
 -               * rds_ib_process_recv() doesn't always consume the frag, and
 -               * we might not have called it at all if the wc didn't indicate
 -               * success. We already unmapped the frag's pages, though, and
 -               * the following rds_ib_ring_free() call tells the refill path
 -               * that it will not find an allocated frag here. Make sure we
 -               * keep that promise by freeing a frag that's still on the ring.
 -               */
 -              if (recv->r_frag) {
 -                      rds_ib_frag_free(ic, recv->r_frag);
 -                      recv->r_frag = NULL;
 -              }
 -              rds_ib_ring_free(&ic->i_recv_ring, 1);
 +      /* Also process recvs in connecting state because it is possible
 +       * to get a recv completion _before_ the rdmacm ESTABLISHED
 +       * event is processed.
 +       */
 +      if (wc->status == IB_WC_SUCCESS) {
 +              rds_ib_process_recv(conn, recv, wc->byte_len, state);
 +      } else {
 +              /* We expect errors as the qp is drained during shutdown */
 +              if (rds_conn_up(conn) || rds_conn_connecting(conn))
 +                      rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
 +                                        &conn->c_faddr,
 +                                        wc->status,
 +                                        ib_wc_status_msg(wc->status));
        }
 -}
  
 -void rds_ib_recv_tasklet_fn(unsigned long data)
 -{
 -      struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
 -      struct rds_connection *conn = ic->conn;
 -      struct rds_ib_ack_state state = { 0, };
 -
 -      rds_poll_cq(ic, &state);
 -      ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
 -      rds_poll_cq(ic, &state);
 -
 -      if (state.ack_next_valid)
 -              rds_ib_set_ack(ic, state.ack_next, state.ack_required);
 -      if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
 -              rds_send_drop_acked(conn, state.ack_recv, NULL);
 -              ic->i_ack_recv = state.ack_recv;
 +      /* rds_ib_process_recv() doesn't always consume the frag, and
 +       * we might not have called it at all if the wc didn't indicate
 +       * success. We already unmapped the frag's pages, though, and
 +       * the following rds_ib_ring_free() call tells the refill path
 +       * that it will not find an allocated frag here. Make sure we
 +       * keep that promise by freeing a frag that's still on the ring.
 +       */
 +      if (recv->r_frag) {
 +              rds_ib_frag_free(ic, recv->r_frag);
 +              recv->r_frag = NULL;
        }
 -      if (rds_conn_up(conn))
 -              rds_ib_attempt_ack(ic);
 +      rds_ib_ring_free(&ic->i_recv_ring, 1);
  
        /* If we ever end up with a really empty receive ring, we're
         * in deep trouble, as the sender will definitely see RNR
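
The ib_recv.c refactor above folds the old tasklet/rds_poll_cq() loop into a
per-completion handler, rds_ib_recv_cqe_handler(), which a caller invokes once per polled
work completion together with an accumulated rds_ib_ack_state. A hedged sketch of the
shape of such a caller, reassembled from the removed tasklet body — the real dispatch
code added by this series lives outside this file, and example_drain_recv_cq() is a
placeholder name:

static void example_drain_recv_cq(struct rds_ib_connection *ic)
{
        struct rds_ib_ack_state state = { 0 };
        struct ib_wc wc;

        /* Feed each receive completion to the new per-CQE handler. */
        while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0)
                rds_ib_recv_cqe_handler(ic, &wc, &state);

        /* Apply the accumulated ack state once, as the tasklet used to. */
        if (state.ack_next_valid)
                rds_ib_set_ack(ic, state.ack_next, state.ack_required);
        if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
                rds_send_drop_acked(ic->conn, state.ack_recv, NULL);
                ic->i_ack_recv = state.ack_recv;
        }
        if (rds_conn_up(ic->conn))
                rds_ib_attempt_ack(ic);
}
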
diff --combined net/rxrpc/ar-connection.c
index 692b3e67fb54418ffb143491b9e2f1dc82d8d503,3b5de4b86058334b4d762ef965e86f2877503932..6c71ed1caf16727a587c90ff81dcd6a7abd3d10b
@@@ -500,7 -500,7 +500,7 @@@ int rxrpc_connect_call(struct rxrpc_soc
                if (bundle->num_conns >= 20) {
                        _debug("too many conns");
  
-                       if (!(gfp & __GFP_WAIT)) {
+                       if (!gfpflags_allow_blocking(gfp)) {
                                _leave(" = -EAGAIN");
                                return -EAGAIN;
                        }
@@@ -808,7 -808,7 +808,7 @@@ void rxrpc_put_connection(struct rxrpc_
  
        ASSERTCMP(atomic_read(&conn->usage), >, 0);
  
 -      conn->put_time = get_seconds();
 +      conn->put_time = ktime_get_seconds();
        if (atomic_dec_and_test(&conn->usage)) {
                _debug("zombie");
                rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
@@@ -852,7 -852,7 +852,7 @@@ static void rxrpc_connection_reaper(str
  
        _enter("");
  
 -      now = get_seconds();
 +      now = ktime_get_seconds();
        earliest = ULONG_MAX;
  
        write_lock_bh(&rxrpc_connection_lock);
diff --combined tools/testing/selftests/Makefile
index 2458288a8287861c87bc868296ea068016efb673,4b4957b8df4e879a0d19098fdacce49658fce24a..c8edff6803d1db0b9b36585746c3ecaf45b6681c
@@@ -6,6 -6,7 +6,7 @@@ TARGETS += firmwar
  TARGETS += ftrace
  TARGETS += futex
  TARGETS += kcmp
+ TARGETS += lib
  TARGETS += membarrier
  TARGETS += memfd
  TARGETS += memory-hotplug
@@@ -13,7 -14,6 +14,7 @@@ TARGETS += moun
  TARGETS += mqueue
  TARGETS += net
  TARGETS += powerpc
 +TARGETS += pstore
  TARGETS += ptrace
  TARGETS += seccomp
  TARGETS += size
@@@ -66,9 -66,6 +67,9 @@@ clean_hotplug
                make -C $$TARGET clean; \
        done;
  
 +run_pstore_crash:
 +      make -C pstore run_crash
 +
  INSTALL_PATH ?= install
  INSTALL_PATH := $(abspath $(INSTALL_PATH))
  ALL_SCRIPT := $(INSTALL_PATH)/run_kselftest.sh