git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'akpm-current/current'
author    Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 2 Nov 2015 03:45:18 +0000 (14:45 +1100)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Mon, 2 Nov 2015 03:45:26 +0000 (14:45 +1100)
99 files changed:
Documentation/filesystems/proc.txt
Documentation/kernel-parameters.txt
MAINTAINERS
arch/arc/mm/cache.c
arch/arm/mm/dma-mapping.c
arch/arm/xen/mm.c
arch/arm64/include/asm/pgtable.h
arch/arm64/mm/dma-mapping.c
arch/mips/mm/tlbex.c
arch/powerpc/include/asm/pgtable-ppc64.h
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/numa.c
arch/powerpc/sysdev/fsl_pci.c
arch/s390/include/asm/pgtable.h
arch/x86/Kconfig
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_types.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/smpboot.c
arch/x86/mm/gup.c
block/blk-core.c
block/blk-mq-tag.c
block/blk-mq.c
block/genhd.c
drivers/block/drbd/drbd_bitmap.c
drivers/block/nbd.c
drivers/block/pktcdvd.c
drivers/gpu/drm/drm_gem.c
drivers/gpu/drm/i915/i915_gem.c
drivers/infiniband/core/sa_query.c
drivers/iommu/amd_iommu.c
drivers/iommu/intel-iommu.c
drivers/md/dm-crypt.c
drivers/media/pci/solo6x10/solo6x10-v4l2-enc.c
drivers/media/pci/solo6x10/solo6x10-v4l2.c
drivers/media/pci/tw68/tw68-video.c
drivers/misc/vmw_balloon.c
drivers/mtd/mtdcore.c
drivers/nvme/host/pci.c
drivers/staging/android/ion/ion_system_heap.c
drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
drivers/staging/rdma/hfi1/init.c
drivers/staging/rdma/ipath/ipath_file_ops.c
drivers/usb/gadget/function/f_mass_storage.c
drivers/usb/host/u132-hcd.c
fs/9p/vfs_file.c
fs/cifs/file.c
fs/coredump.c
fs/direct-io.c
fs/ext4/inode.c
fs/ext4/readpage.c
fs/ext4/super.c
fs/fs-writeback.c
fs/jffs2/wbuf.c
fs/mpage.c
fs/namei.c
fs/nfs/file.c
fs/ocfs2/cluster/heartbeat.c
fs/proc/array.c
fs/proc/task_mmu.c
fs/xfs/xfs_qm.c
include/asm-generic/pgtable.h
include/drm/drmP.h
include/linux/compiler-gcc.h
include/linux/compiler.h
include/linux/fs.h
include/linux/hugetlb_cgroup.h
include/linux/memcontrol.h
include/linux/sched.h
include/linux/skbuff.h
include/net/sock.h
kernel/audit.c
kernel/cgroup.c
kernel/cpuset.c
kernel/fork.c
kernel/futex.c
kernel/kexec_core.c
kernel/params.c
kernel/sysctl.c
lib/Kconfig.debug
lib/Makefile
lib/dma-debug.c
lib/kobject.c
mm/backing-dev.c
mm/failslab.c
mm/huge_memory.c
mm/memcontrol.c
mm/memory_hotplug.c
mm/page_alloc.c
mm/pgtable-generic.c
mm/vmscan.c
net/core/sock.c
net/netlink/af_netlink.c
net/openvswitch/flow.c
net/rds/ib_recv.c
net/rxrpc/ar-connection.c
tools/testing/selftests/Makefile

diff --combined Documentation/filesystems/proc.txt
index 3a9d65c912e780977c12102d7719a0374241b962,d6f259eaa5efb384cb503d312808ec06660efba5..55ffd0820feba09105555137efa4eb54abc39be1
@@@ -140,8 -140,7 +140,8 @@@ Table 1-1: Process specific entries in 
   stat         Process status
   statm                Process memory status information
   status               Process status in human readable form
 - wchan                If CONFIG_KALLSYMS is set, a pre-decoded wchan
 + wchan                Present with CONFIG_KALLSYMS=y: it shows the kernel function
 +              symbol the task is blocked in - or "0" if not blocked.
   pagemap      Page table
   stack                Report full stack trace, enable via CONFIG_STACKTRACE
  smaps               an extension based on maps, showing the memory consumption of
@@@ -175,6 -174,7 +175,7 @@@ read the file /proc/PID/status
    VmLib:      1412 kB
    VmPTE:        20 kb
    VmSwap:        0 kB
+   HugetlbPages:          0 kB
    Threads:        1
    SigQ:   0/28578
    SigPnd: 0000000000000000
@@@ -238,6 -238,7 +239,7 @@@ Table 1-2: Contents of the status file
   VmPTE                       size of page table entries
   VmPMD                       size of second level page tables
   VmSwap                      size of swap usage (the number of referred swapents)
+  HugetlbPages                size of hugetlb memory portions
   Threads                     number of threads
   SigQ                        number of signals queued/max. number for queue
   SigPnd                      bitmap of pending signals for the thread
@@@ -311,7 -312,7 +313,7 @@@ Table 1-4: Contents of the stat files (
    blocked       bitmap of blocked signals
    sigign        bitmap of ignored signals
    sigcatch      bitmap of caught signals
 -  wchan         address where process went to sleep
 +  0           (place holder, used to be the wchan address, use /proc/PID/wchan instead)
    0             (place holder)
    0             (place holder)
    exit_signal   signal to send to parent thread on exit
@@@ -424,6 -425,9 +426,9 @@@ Private_Clean:         0 k
  Private_Dirty:         0 kB
  Referenced:          892 kB
  Anonymous:             0 kB
+ AnonHugePages:         0 kB
+ Shared_Hugetlb:        0 kB
+ Private_Hugetlb:        0 kB
  Swap:                  0 kB
  SwapPss:               0 kB
  KernelPageSize:        4 kB
@@@ -452,6 -456,11 +457,11 @@@ and a page is modified, the file page i
  "Swap" shows how much would-be-anonymous memory is also used, but out on
  swap.
  "SwapPss" shows proportional swap share of this mapping.
+ "AnonHugePages" shows the ammount of memory backed by transparent hugepage.
+ "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by
+ hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
+ reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
  "VmFlags" field deserves a separate description. This member represents the kernel
  flags associated with the particular virtual memory area in two letter encoded
  manner. The codes are the following:
      ac  - area is accountable
      nr  - swap space is not reserved for the area
      ht  - area uses huge tlb pages
-     nl  - non-linear mapping
      ar  - architecture specific flag
      dd  - do not include area into core dump
      sd  - soft-dirty flag
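
[Editor's note, not part of the patch] The new HugetlbPages field in /proc/PID/status (and the Shared_Hugetlb/Private_Hugetlb fields in smaps) follow the same "Name: value kB" layout as the existing entries, so they can be read with plain text parsing. A minimal user-space sketch that looks the field up for the current process; the field is simply absent on kernels without this series:

#include <stdio.h>

/*
 * Minimal sketch: scan /proc/self/status for the HugetlbPages line
 * documented above. If the field is missing (kernel without this
 * series), say so and exit with a distinct status.
 */
int main(void)
{
	char line[256];
	unsigned long kb;
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "HugetlbPages: %lu kB", &kb) == 1) {
			printf("HugetlbPages: %lu kB\n", kb);
			fclose(f);
			return 0;
		}
	}

	fclose(f);
	fprintf(stderr, "HugetlbPages: field not present on this kernel\n");
	return 2;
}
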
diff --combined Documentation/kernel-parameters.txt
index 101573c07788945b4423e3d29dbbed9275ab5cda,b4af96e01b06fc24d9636b278ff23b111f9173da..f8aae632f02f678000f9de7a8d507a5ea0bb9404
@@@ -167,8 -167,7 +167,8 @@@ bytes respectively. Such letter suffixe
  
        acpi=           [HW,ACPI,X86,ARM64]
                        Advanced Configuration and Power Interface
 -                      Format: { force | off | strict | noirq | rsdt }
 +                      Format: { force | off | strict | noirq | rsdt |
 +                                copy_dsdt }
                        force -- enable ACPI if default was off
                        off -- disable ACPI if default was on
                        noirq -- do not use ACPI for IRQ routing
                        is passed, kernel could allocate physical memory region
                        above 4G, that cause second kernel crash on system
                        that require some amount of low memory, e.g. swiotlb
 -                      requires at least 64M+32K low memory.  Kernel would
 -                      try to allocate 72M below 4G automatically.
 +                      requires at least 64M+32K low memory, also enough extra
 +                      low memory is needed to make sure DMA buffers for 32-bit
 +                      devices won't run out. Kernel would try to allocate
 +                      at least 256M below 4G automatically.
                        This one let user to specify own low range under 4G
                        for second kernel instead.
                        0: to disable low allocation.
                        The filter can be disabled or changed to another
                        driver later using sysfs.
  
 -      drm_kms_helper.edid_firmware=[<connector>:]<file>
 -                      Broken monitors, graphic adapters and KVMs may
 -                      send no or incorrect EDID data sets. This parameter
 -                      allows to specify an EDID data set in the
 -                      /lib/firmware directory that is used instead.
 +      drm_kms_helper.edid_firmware=[<connector>:]<file>[,[<connector>:]<file>]
 +                      Broken monitors, graphic adapters, KVMs and EDIDless
 +                      panels may send no or incorrect EDID data sets.
 +                      This parameter allows specifying EDID data sets
 +                      in the /lib/firmware directory that are used instead.
                        Generic built-in EDID data sets are used, if one of
                        edid/1024x768.bin, edid/1280x1024.bin,
                        edid/1680x1050.bin, or edid/1920x1080.bin is given
                        available in Documentation/EDID/HOWTO.txt. An EDID
                        data set will only be used for a particular connector,
                        if its name and a colon are prepended to the EDID
 -                      name.
 +                      name. Each connector may use a unique EDID data
 +                      set by separating the files with a comma.  An EDID
 +                      data set with no connector name will be used for
 +                      any connectors not explicitly specified.
  
        dscc4.setup=    [NET]
  
  
        earlycon=       [KNL] Output early console device and options.
  
 +                      When used with no options, the early console is
 +                      determined by the stdout-path property in device
 +                      tree's chosen node.
 +
                cdns,<addr>
                        Start an early, polled-mode console on a cadence serial
                        port at the specified address. The cadence serial port
                        serial port must already be setup and configured.
                        Options are not yet supported.
  
 +              lpuart,<addr>
 +              lpuart32,<addr>
 +                      Use early console provided by Freescale LP UART driver
 +                      found on Freescale Vybrid and QorIQ LS1021A processors.
 +                      A valid base address must be provided, and the serial
 +                      port must already be setup and configured.
 +
        earlyprintk=    [X86,SH,BLACKFIN,ARM,M68k]
                        earlyprintk=vga
                        earlyprintk=efi
                        you are really sure that your UEFI does sane gc and
                        fulfills the spec otherwise your board may brick.
  
 +      efi_fake_mem=   nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86]
 +                      Add an arbitrary attribute to a specific memory range by
 +                      updating the original EFI memory map.
 +                      The attribute aa is added to the region of memory from
 +                      ss to ss+nn.
 +                      If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000
 +                      is specified, EFI_MEMORY_MORE_RELIABLE(0x10000)
 +                      attribute is added to range 0x100000000-0x180000000 and
 +                      0x10a0000000-0x1120000000.
 +
 +                      Using this parameter you can debug EFI memmap related
 +                      features. For example, you can debug the Address Range
 +                      Mirroring feature even if your box doesn't support it.
 +
        eisa_irq_edge=  [PARISC,HW]
                        See header of drivers/parisc/eisa.c.
  
                        Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
                        Default: 1024
  
+       hardlockup_all_cpu_backtrace=
+                       [KNL] Should the hard-lockup detector generate
+                       backtraces on all cpus.
+                       Format: <integer>
        hashdist=       [KNL,NUMA] Large hashes allocated during boot
                        are distributed across NUMA nodes.  Defaults on
                        for 64-bit NUMA, off otherwise.
                hwp_only
                        Only load intel_pstate on systems which support
                        hardware P state control (HWP) if available.
 +              no_acpi
 +                      Don't use the limits specified by the ACPI processor
 +                      performance control objects _PSS and _PPC.
  
        intremap=       [X86-64, Intel-IOMMU]
                        on      enable Interrupt Remapping (default)
                        nosid   disable Source ID checking
                        no_x2apic_optout
                                BIOS x2APIC opt-out request will be ignored
 +                      nopost  disable Interrupt Posting
  
        iomem=          Disable strict checking of access to MMIO memory
                strict  regions from userspace.
        nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
                        Format: [panic,][nopanic,][num]
                        Valid num: 0 or 1
 -                      0 - turn nmi_watchdog off
 -                      1 - turn nmi_watchdog on
 +                      0 - turn hardlockup detector in nmi_watchdog off
 +                      1 - turn hardlockup detector in nmi_watchdog on
                        When panic is specified, panic when an NMI watchdog
                        timeout occurs (or 'nopanic' to override the opposite
 -                      default).
 +                      default). To disable both hard and soft lockup detectors,
 +                      please see 'nowatchdog'.
                        This is useful when you use a panic=... timeout and
                        need the box quickly up again.
  
                        cache-to-cache transfer latencies.
  
        rcutree.rcu_fanout_leaf= [KNL]
 -                      Increase the number of CPUs assigned to each
 -                      leaf rcu_node structure.  Useful for very large
 -                      systems.
 +                      Change the number of CPUs assigned to each
 +                      leaf rcu_node structure.  Useful for very
 +                      large systems, which will choose the value 64,
 +                      and for NUMA systems with large remote-access
 +                      latencies, which will choose a value aligned
 +                      with the appropriate hardware boundaries.
  
        rcutree.jiffies_till_sched_qs= [KNL]
                        Set required age in jiffies for a
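
[Editor's note, not part of the patch] For the efi_fake_mem= format documented above, the attribute aa applies to the physical range [ss, ss+nn). A small user-space sketch that expands the first entry of the documented example; parse_size() is a hypothetical stand-in for the kernel's memparse()-style suffix handling, reimplemented here only for illustration:

#include <stdio.h>
#include <stdlib.h>

/* Expand one nn[KMG]@ss[KMG]:aa entry the way the documentation above
 * describes it: attribute aa covers the physical range [ss, ss+nn). */
static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': v <<= 30; (*end)++; break;
	case 'M': case 'm': v <<= 20; (*end)++; break;
	case 'K': case 'k': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	const char *arg = "2G@4G:0x10000";	/* first entry of the example above */
	char *p;
	unsigned long long nn, ss, aa;

	nn = parse_size(arg, &p);		/* size */
	ss = parse_size(p + 1, &p);		/* start, after '@' */
	aa = strtoull(p + 1, NULL, 0);		/* attribute, after ':' */

	printf("attr 0x%llx applied to 0x%llx-0x%llx\n", aa, ss, ss + nn);
	return 0;
}

The printed range, 0x100000000-0x180000000 with attribute 0x10000, matches the figures quoted in the parameter description above.
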
diff --combined MAINTAINERS
index 7016beec7f93a157694bd97981722472f6bf815a,5f36f5a294439f4fd8ce948deab9d14745e10b29..c16f3f95db26998179fcc942bd2051290fc69200
@@@ -240,12 -240,6 +240,12 @@@ L:       lm-sensors@lm-sensors.or
  S:    Maintained
  F:    drivers/hwmon/abituguru3.c
  
 +ACCES 104-IDIO-16 GPIO DRIVER
 +M:    "William Breathitt Gray" <vilhelm.gray@gmail.com>
 +L:    linux-gpio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/gpio/gpio-104-idio-16.c
 +
  ACENIC DRIVER
  M:    Jes Sorensen <jes@trained-monkey.org>
  L:    linux-acenic@sunsite.dk
@@@ -660,6 -654,11 +660,6 @@@ F:        drivers/gpu/drm/radeon/radeon_kfd.
  F:    drivers/gpu/drm/radeon/radeon_kfd.h
  F:    include/uapi/linux/kfd_ioctl.h
  
 -AMD MICROCODE UPDATE SUPPORT
 -M:    Borislav Petkov <bp@alien8.de>
 -S:    Maintained
 -F:    arch/x86/kernel/cpu/microcode/amd*
 -
  AMD XGBE DRIVER
  M:    Tom Lendacky <thomas.lendacky@amd.com>
  L:    netdev@vger.kernel.org
@@@ -789,11 -788,6 +789,11 @@@ S:       Maintaine
  F:    drivers/net/appletalk/
  F:    net/appletalk/
  
 +APPLIED MICRO (APM) X-GENE DEVICE TREE SUPPORT
 +M:    Duc Dang <dhdang@apm.com>
 +S:    Supported
 +F:    arch/arm64/boot/dts/apm/
 +
  APPLIED MICRO (APM) X-GENE SOC ETHERNET DRIVER
  M:    Iyappan Subramanian <isubramanian@apm.com>
  M:    Keyur Chudgar <kchudgar@apm.com>
@@@ -828,13 -822,12 +828,13 @@@ F:      arch/arm/include/asm/floppy.
  
  ARM PMU PROFILING AND DEBUGGING
  M:    Will Deacon <will.deacon@arm.com>
 +R:    Mark Rutland <mark.rutland@arm.com>
  S:    Maintained
 -F:    arch/arm/kernel/perf_*
 +F:    arch/arm*/kernel/perf_*
  F:    arch/arm/oprofile/common.c
 -F:    arch/arm/kernel/hw_breakpoint.c
 -F:    arch/arm/include/asm/hw_breakpoint.h
 -F:    arch/arm/include/asm/perf_event.h
 +F:    arch/arm*/kernel/hw_breakpoint.c
 +F:    arch/arm*/include/asm/hw_breakpoint.h
 +F:    arch/arm*/include/asm/perf_event.h
  F:    drivers/perf/arm_pmu.c
  F:    include/linux/perf/arm_pmu.h
  
@@@ -901,12 -894,11 +901,12 @@@ M:      Lennert Buytenhek <kernel@wantstofly
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  
 -ARM/Allwinner A1X SoC support
 +ARM/Allwinner sunXi SoC support
  M:    Maxime Ripard <maxime.ripard@free-electrons.com>
 +M:    Chen-Yu Tsai <wens@csie.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
 -N:    sun[x4567]i
 +N:    sun[x456789]i
  
  ARM/Allwinner SoC Clock Support
  M:    Emilio López <emilio@elopez.com.ar>
@@@ -925,7 -917,7 +925,7 @@@ M: Tsahee Zidenberg <tsahee@annapurnala
  S:    Maintained
  F:    arch/arm/mach-alpine/
  
 -ARM/ATMEL AT91RM9200 AND AT91SAM ARM ARCHITECTURES
 +ARM/ATMEL AT91RM9200, AT91SAM9 AND SAMA5 SOC SUPPORT
  M:    Nicolas Ferre <nicolas.ferre@atmel.com>
  M:    Alexandre Belloni <alexandre.belloni@free-electrons.com>
  M:    Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
@@@ -1238,13 -1230,6 +1238,13 @@@ ARM/LPC18XX ARCHITECTUR
  M:    Joachim Eastwood <manabian@gmail.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
 +F:    arch/arm/boot/dts/lpc43*
 +F:    drivers/clk/nxp/clk-lpc18xx*
 +F:    drivers/clocksource/time-lpc32xx.c
 +F:    drivers/i2c/busses/i2c-lpc2k.c
 +F:    drivers/memory/pl172.c
 +F:    drivers/mtd/spi-nor/nxp-spifi.c
 +F:    drivers/rtc/rtc-lpc24xx.c
  N:    lpc18xx
  
  ARM/MAGICIAN MACHINE SUPPORT
@@@ -1312,13 -1297,6 +1312,13 @@@ F:    arch/arm/mach-mediatek
  N:    mtk
  K:    mediatek
  
 +ARM/Mediatek USB3 PHY DRIVER
 +M:    Chunfeng Yun <chunfeng.yun@mediatek.com>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +L:    linux-mediatek@lists.infradead.org (moderated for non-subscribers)
 +S:    Maintained
 +F:    drivers/phy/phy-mt65xx-usb3.c
 +
  ARM/MICREL KS8695 ARCHITECTURE
  M:    Greg Ungerer <gerg@uclinux.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -1466,12 -1444,7 +1466,12 @@@ F:    arch/arm/mach-exynos*
  F:    drivers/*/*s3c2410*
  F:    drivers/*/*/*s3c2410*
  F:    drivers/spi/spi-s3c*
 +F:    drivers/soc/samsung/*
  F:    sound/soc/samsung/*
 +F:    Documentation/arm/Samsung/
 +F:    Documentation/devicetree/bindings/arm/samsung/
 +F:    Documentation/devicetree/bindings/sram/samsung-sram.txt
 +F:    Documentation/devicetree/bindings/power/pd-samsung.txt
  N:    exynos
  
  ARM/SAMSUNG MOBILE MACHINE SUPPORT
@@@ -1506,14 -1479,6 +1506,14 @@@ L:    linux-media@vger.kernel.or
  S:    Maintained
  F:    drivers/media/platform/s5p-tv/
  
 +ARM/SAMSUNG S5P SERIES JPEG CODEC SUPPORT
 +M:    Andrzej Pietrasiewicz <andrzej.p@samsung.com>
 +M:    Jacek Anaszewski <j.anaszewski@samsung.com>
 +L:    linux-arm-kernel@lists.infradead.org
 +L:    linux-media@vger.kernel.org
 +S:    Maintained
 +F:    drivers/media/platform/s5p-jpeg/
 +
  ARM/SHMOBILE ARM ARCHITECTURE
  M:    Simon Horman <horms@verge.net.au>
  M:    Magnus Damm <magnus.damm@gmail.com>
@@@ -1526,6 -1491,8 +1526,6 @@@ F:      arch/arm/boot/dts/emev2
  F:    arch/arm/boot/dts/r7s*
  F:    arch/arm/boot/dts/r8a*
  F:    arch/arm/boot/dts/sh*
 -F:    arch/arm/configs/bockw_defconfig
 -F:    arch/arm/configs/marzen_defconfig
  F:    arch/arm/configs/shmobile_defconfig
  F:    arch/arm/include/debug/renesas-scif.S
  F:    arch/arm/mach-shmobile/
@@@ -1560,7 -1527,6 +1560,7 @@@ W:      http://www.stlinux.co
  S:    Maintained
  F:    arch/arm/mach-sti/
  F:    arch/arm/boot/dts/sti*
 +F:    drivers/char/hw_random/st-rng.c
  F:    drivers/clocksource/arm_global_timer.c
  F:    drivers/clocksource/clksrc_st_lpc.c
  F:    drivers/i2c/busses/i2c-st.c
@@@ -1640,10 -1606,7 +1640,10 @@@ M:    Masahiro Yamada <yamada.masahiro@soc
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
  F:    arch/arm/boot/dts/uniphier*
 +F:    arch/arm/include/asm/hardware/cache-uniphier.h
  F:    arch/arm/mach-uniphier/
 +F:    arch/arm/mm/cache-uniphier.c
 +F:    drivers/i2c/busses/i2c-uniphier*
  F:    drivers/pinctrl/uniphier/
  F:    drivers/tty/serial/8250/8250_uniphier.c
  N:    uniphier
@@@ -1816,14 -1779,6 +1816,14 @@@ S:    Supporte
  F:    Documentation/aoe/
  F:    drivers/block/aoe/
  
 +ATHEROS 71XX/9XXX GPIO DRIVER
 +M:    Alban Bedel <albeu@free.fr>
 +W:    https://github.com/AlbanBedel/linux
 +T:    git git://github.com/AlbanBedel/linux
 +S:    Maintained
 +F:    drivers/gpio/gpio-ath79.c
 +F:    Documentation/devicetree/bindings/gpio/gpio-ath79.txt
 +
  ATHEROS ATH GENERIC UTILITIES
  M:    "Luis R. Rodriguez" <mcgrof@do-not-panic.com>
  L:    linux-wireless@vger.kernel.org
@@@ -2405,27 -2360,19 +2405,27 @@@ L:   linux-scsi@vger.kernel.or
  S:    Supported
  F:    drivers/scsi/bnx2i/
  
 -BROADCOM CYGNUS/IPROC ARM ARCHITECTURE
 +BROADCOM IPROC ARM ARCHITECTURE
  M:    Ray Jui <rjui@broadcom.com>
  M:    Scott Branden <sbranden@broadcom.com>
 +M:    Jon Mason <jonmason@broadcom.com>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  L:    bcm-kernel-feedback-list@broadcom.com
  T:    git git://github.com/broadcom/cygnus-linux.git
  S:    Maintained
  N:    iproc
  N:    cygnus
 +N:    nsp
  N:    bcm9113*
  N:    bcm9583*
 -N:    bcm583*
 +N:    bcm9585*
 +N:    bcm9586*
 +N:    bcm988312
  N:    bcm113*
 +N:    bcm583*
 +N:    bcm585*
 +N:    bcm586*
 +N:    bcm88312
  
  BROADCOM BRCMSTB GPIO DRIVER
  M:    Gregory Fong <gregory.0xf0@gmail.com>
@@@ -2783,10 -2730,9 +2783,10 @@@ S:    Supporte
  F:    drivers/net/ethernet/cisco/enic/
  
  CISCO VIC LOW LATENCY NIC DRIVER
 -M:    Upinder Malhi <umalhi@cisco.com>
 +M:    Christian Benvenuti <benve@cisco.com>
 +M:    Dave Goodell <dgoodell@cisco.com>
  S:    Supported
 -F:    drivers/infiniband/hw/usnic
 +F:    drivers/infiniband/hw/usnic/
  
  CIRRUS LOGIC EP93XX ETHERNET DRIVER
  M:    Hartley Sweeten <hsweeten@visionengravers.com>
@@@ -3421,7 -3367,6 +3421,7 @@@ M:      Support Opensource <support.opensour
  W:    http://www.dialog-semiconductor.com/products
  S:    Supported
  F:    Documentation/hwmon/da90??
 +F:    Documentation/devicetree/bindings/sound/da[79]*.txt
  F:    drivers/gpio/gpio-da90??.c
  F:    drivers/hwmon/da90??-hwmon.c
  F:    drivers/iio/adc/da91??-*.c
@@@ -3556,15 -3501,13 +3556,15 @@@ M:   Jonathan Corbet <corbet@lwn.net
  L:    linux-doc@vger.kernel.org
  S:    Maintained
  F:    Documentation/
 +F:    scripts/docproc.c
 +F:    scripts/kernel-doc*
  X:    Documentation/ABI/
  X:    Documentation/devicetree/
  X:    Documentation/acpi
  X:    Documentation/power
  X:    Documentation/spi
  X:    Documentation/DocBook/media
 -T:    git git://git.lwn.net/linux-2.6.git docs-next
 +T:    git git://git.lwn.net/linux.git docs-next
  
  DOUBLETALK DRIVER
  M:    "James R. Van Zandt" <jrv@vanzandt.mv.com>
@@@ -3641,7 -3584,6 +3641,7 @@@ M:      Daniel Vetter <daniel.vetter@intel.c
  M:    Jani Nikula <jani.nikula@linux.intel.com>
  L:    intel-gfx@lists.freedesktop.org
  L:    dri-devel@lists.freedesktop.org
 +W:    https://01.org/linuxgraphics/
  Q:    http://patchwork.freedesktop.org/project/intel-gfx/
  T:    git git://anongit.freedesktop.org/drm-intel
  S:    Supported
@@@ -4394,13 -4336,6 +4394,13 @@@ F:    include/linux/fmc*.
  F:    include/linux/ipmi-fru.h
  K:    fmc_d.*register
  
 +FPGA MANAGER FRAMEWORK
 +M:    Alan Tull <atull@opensource.altera.com>
 +S:    Maintained
 +F:    drivers/fpga/
 +F:    include/linux/fpga/fpga-mgr.h
 +W:    http://www.rocketboards.org
 +
  FPU EMULATOR
  M:    Bill Metzenthen <billm@melbpc.org.au>
  W:    http://floatingpoint.sourceforge.net/emulator/index.html
@@@ -4492,14 -4427,6 +4492,14 @@@ L:    linuxppc-dev@lists.ozlabs.or
  S:    Maintained
  F:    drivers/net/ethernet/freescale/ucc_geth*
  
 +FREESCALE eTSEC ETHERNET DRIVER (GIANFAR)
 +M:    Claudiu Manoil <claudiu.manoil@freescale.com>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    drivers/net/ethernet/freescale/gianfar*
 +X:    drivers/net/ethernet/freescale/gianfar_ptp.c
 +F:    Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
 +
  FREESCALE QUICC ENGINE UCC UART DRIVER
  M:    Timur Tabi <timur@tabi.org>
  L:    linuxppc-dev@lists.ozlabs.org
@@@ -5177,7 -5104,6 +5177,7 @@@ S:      Maintaine
  F:    Documentation/devicetree/bindings/i2c/
  F:    Documentation/i2c/
  F:    drivers/i2c/
 +F:    drivers/i2c/*/
  F:    include/linux/i2c.h
  F:    include/linux/i2c-*.h
  F:    include/uapi/linux/i2c.h
@@@ -5519,6 -5445,12 +5519,6 @@@ W:     https://01.org/linux-acp
  S:    Supported
  F:    drivers/platform/x86/intel_menlow.c
  
 -INTEL IA32 MICROCODE UPDATE SUPPORT
 -M:    Borislav Petkov <bp@alien8.de>
 -S:    Maintained
 -F:    arch/x86/kernel/cpu/microcode/core*
 -F:    arch/x86/kernel/cpu/microcode/intel*
 -
  INTEL I/OAT DMA DRIVER
  M:    Dave Jiang <dave.jiang@intel.com>
  R:    Dan Williams <dan.j.williams@intel.com>
@@@ -5598,12 -5530,6 +5598,12 @@@ F:    Documentation/networking/README.ipw2
  F:    Documentation/networking/README.ipw2200
  F:    drivers/net/wireless/ipw2x00/
  
 +INTEL(R) TRACE HUB
 +M:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
 +S:    Supported
 +F:    Documentation/trace/intel_th.txt
 +F:    drivers/hwtracing/intel_th/
 +
  INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT)
  M:    Richard L Maliszewski <richard.l.maliszewski@intel.com>
  M:    Gang Wei <gang.wei@intel.com>
@@@ -5635,7 -5561,7 +5635,7 @@@ F:      drivers/net/wireless/iwlegacy
  INTEL WIRELESS WIFI LINK (iwlwifi)
  M:    Johannes Berg <johannes.berg@intel.com>
  M:    Emmanuel Grumbach <emmanuel.grumbach@intel.com>
 -M:    Intel Linux Wireless <ilw@linux.intel.com>
 +M:    Intel Linux Wireless <linuxwifi@intel.com>
  L:    linux-wireless@vger.kernel.org
  W:    http://intellinuxwireless.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi.git
@@@ -5651,22 -5577,6 +5651,22 @@@ F:    include/linux/mei_cl_bus.
  F:    drivers/misc/mei/*
  F:    Documentation/misc-devices/mei/*
  
 +INTEL MIC DRIVERS (mic)
 +M:    Sudeep Dutt <sudeep.dutt@intel.com>
 +M:    Ashutosh Dixit <ashutosh.dixit@intel.com>
 +S:    Supported
 +W:    https://github.com/sudeepdutt/mic
 +W:    http://software.intel.com/en-us/mic-developer
 +F:    include/linux/mic_bus.h
 +F:    include/linux/scif.h
 +F:    include/uapi/linux/mic_common.h
 +F:    include/uapi/linux/mic_ioctl.h
 +F:   include/uapi/linux/scif_ioctl.h
 +F:    drivers/misc/mic/
 +F:    drivers/dma/mic_x100_dma.c
 +F:    drivers/dma/mic_x100_dma.h
 +F:   Documentation/mic/
 +
  INTEL PMC IPC DRIVER
  M:    Zha Qipeng<qipeng.zha@intel.com>
  L:    platform-driver-x86@vger.kernel.org
@@@ -6198,13 -6108,6 +6198,13 @@@ F:    Documentation/auxdisplay/ks010
  F:    drivers/auxdisplay/ks0108.c
  F:    include/linux/ks0108.h
  
 +L3MDEV
 +M:    David Ahern <dsa@cumulusnetworks.com>
 +L:    netdev@vger.kernel.org
 +S:    Maintained
 +F:    net/l3mdev
 +F:    include/net/l3mdev.h
 +
  LAPB module
  L:    linux-x25@vger.kernel.org
  S:    Orphan
@@@ -6355,14 -6258,6 +6355,14 @@@ F:    drivers/nvdimm/pmem.
  F:    include/linux/pmem.h
  F:    arch/*/include/asm/pmem.h
  
 +LIGHTNVM PLATFORM SUPPORT
 +M:    Matias Bjorling <mb@lightnvm.io>
 +W:    http://github/OpenChannelSSD
 +S:    Maintained
 +F:    drivers/lightnvm/
 +F:    include/linux/lightnvm.h
 +F:    include/uapi/linux/lightnvm.h
 +
  LINUX FOR IBM pSERIES (RS/6000)
  M:    Paul Mackerras <paulus@au.ibm.com>
  W:    http://www.ibm.com/linux/ltc/projects/ppc
@@@ -6680,13 -6575,6 +6680,13 @@@ M:    Guenter Roeck <linux@roeck-us.net
  S:    Maintained
  F:    drivers/net/dsa/mv88e6352.c
  
 +MARVELL CRYPTO DRIVER
 +M:    Boris Brezillon <boris.brezillon@free-electrons.com>
 +M:    Arnaud Ebalard <arno@natisbad.org>
 +F:    drivers/crypto/marvell/
 +S:    Maintained
 +L:    linux-crypto@vger.kernel.org
 +
  MARVELL GIGABIT ETHERNET DRIVERS (skge/sky2)
  M:    Mirko Lindner <mlindner@marvell.com>
  M:    Stephen Hemminger <stephen@networkplumber.org>
@@@ -6809,12 -6697,6 +6809,12 @@@ W:    http://linuxtv.or
  S:    Maintained
  F:    drivers/media/radio/radio-maxiradio*
  
 +MCP4531 MICROCHIP DIGITAL POTENTIOMETER DRIVER
 +M:    Peter Rosin <peda@axentia.se>
 +L:    linux-iio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/iio/potentiometer/mcp4531.c
 +
  MEDIA DRIVERS FOR RENESAS - VSP1
  M:    Laurent Pinchart <laurent.pinchart@ideasonboard.com>
  L:    linux-media@vger.kernel.org
@@@ -7050,13 -6932,6 +7050,13 @@@ S:    Supporte
  F:    include/linux/mlx5/
  F:    drivers/infiniband/hw/mlx5/
  
 +MELEXIS MLX90614 DRIVER
 +M:    Crt Mori <cmo@melexis.com>
 +L:    linux-iio@vger.kernel.org
 +W:    http://www.melexis.com
 +S:    Supported
 +F:    drivers/iio/temperature/mlx90614.c
 +
  MN88472 MEDIA DRIVER
  M:    Antti Palosaari <crope@iki.fi>
  L:    linux-media@vger.kernel.org
@@@ -7110,7 -6985,6 +7110,7 @@@ M:      Alan Ott <alan@signal11.us
  L:    linux-wpan@vger.kernel.org
  S:    Maintained
  F:    drivers/net/ieee802154/mrf24j40.c
 +F:    Documentation/devicetree/bindings/net/ieee802154/mrf24j40.txt
  
  MSI LAPTOP SUPPORT
  M:    "Lee, Chun-Yi" <jlee@suse.com>
@@@ -7183,6 -7057,7 +7183,6 @@@ F:      drivers/media/i2c/mt9v032.
  F:    include/media/mt9v032.h
  
  MULTIFUNCTION DEVICES (MFD)
 -M:    Samuel Ortiz <sameo@linux.intel.com>
  M:    Lee Jones <lee.jones@linaro.org>
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git
  S:    Supported
@@@ -7444,6 -7319,7 +7444,6 @@@ S:      Odd Fixe
  F:    drivers/net/
  F:    include/linux/if_*
  F:    include/linux/netdevice.h
 -F:    include/linux/arcdevice.h
  F:    include/linux/etherdevice.h
  F:    include/linux/fcdevice.h
  F:    include/linux/fddidevice.h
@@@ -7509,6 -7385,7 +7509,7 @@@ S:      Supporte
  F:    Documentation/filesystems/nilfs2.txt
  F:    fs/nilfs2/
  F:    include/linux/nilfs2_fs.h
+ F:    include/trace/events/nilfs2.h
  
  NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER
  M:    YOKOTA Hiroshi <yokota@netlab.is.tsukuba.ac.jp>
@@@ -7536,10 -7413,10 +7537,10 @@@ NOKIA N900 POWER SUPPLY DRIVER
  M:    Pali Rohár <pali.rohar@gmail.com>
  S:    Maintained
  F:    include/linux/power/bq2415x_charger.h
 -F:    include/linux/power/bq27x00_battery.h
 +F:    include/linux/power/bq27xxx_battery.h
  F:    include/linux/power/isp1704_charger.h
  F:    drivers/power/bq2415x_charger.c
 -F:    drivers/power/bq27x00_battery.c
 +F:    drivers/power/bq27xxx_battery.c
  F:    drivers/power/isp1704_charger.c
  F:    drivers/power/rx51_battery.c
  
@@@ -7582,13 -7459,11 +7583,13 @@@ F:   drivers/video/fbdev/riva
  F:    drivers/video/fbdev/nvidia/
  
  NVM EXPRESS DRIVER
 -M:    Matthew Wilcox <willy@linux.intel.com>
 +M:    Keith Busch <keith.busch@intel.com>
 +M:    Jens Axboe <axboe@fb.com>
  L:    linux-nvme@lists.infradead.org
 -T:    git git://git.infradead.org/users/willy/linux-nvme.git
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 +W:    https://kernel.googlesource.com/pub/scm/linux/kernel/git/axboe/linux-block/
  S:    Supported
 -F:    drivers/block/nvme*
 +F:    drivers/nvme/host/
  F:    include/linux/nvme.h
  
  NVMEM FRAMEWORK
@@@ -8083,14 -7958,6 +8084,14 @@@ F:    include/linux/pci
  F:    arch/x86/pci/
  F:    arch/x86/kernel/quirks.c
  
 +PCI DRIVER FOR ALTERA PCIE IP
 +M:    Ley Foon Tan <lftan@altera.com>
 +L:    rfi@lists.rocketboards.org (moderated for non-subscribers)
 +L:    linux-pci@vger.kernel.org
 +S:    Supported
 +F:    Documentation/devicetree/bindings/pci/altera-pcie.txt
 +F:    drivers/pci/host/pcie-altera.c
 +
  PCI DRIVER FOR ARM VERSATILE PLATFORM
  M:    Rob Herring <robh@kernel.org>
  L:    linux-pci@vger.kernel.org
@@@ -8192,14 -8059,6 +8193,14 @@@ L:    linux-pci@vger.kernel.or
  S:    Maintained
  F:    drivers/pci/host/*spear*
  
 +PCI MSI DRIVER FOR ALTERA MSI IP
 +M:    Ley Foon Tan <lftan@altera.com>
 +L:    rfi@lists.rocketboards.org (moderated for non-subscribers)
 +L:    linux-pci@vger.kernel.org
 +S:    Supported
 +F:    Documentation/devicetree/bindings/pci/altera-pcie-msi.txt
 +F:    drivers/pci/host/pcie-altera-msi.c
 +
  PCI MSI DRIVER FOR APPLIEDMICRO XGENE
  M:    Duc Dang <dhdang@apm.com>
  L:    linux-pci@vger.kernel.org
@@@ -8208,13 -8067,6 +8209,13 @@@ S:    Maintaine
  F:    Documentation/devicetree/bindings/pci/xgene-pci-msi.txt
  F:    drivers/pci/host/pci-xgene-msi.c
  
 +PCIE DRIVER FOR HISILICON
 +M:    Zhou Wang <wangzhou1@hisilicon.com>
 +L:    linux-pci@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
 +F:    drivers/pci/host/pcie-hisi.c
 +
  PCMCIA SUBSYSTEM
  P:    Linux PCMCIA Team
  L:    linux-pcmcia@lists.infradead.org
@@@ -8321,13 -8173,6 +8322,13 @@@ L:    linux-arm-kernel@lists.infradead.or
  S:    Maintained
  F:    drivers/pinctrl/pinctrl-at91.*
  
 +PIN CONTROLLER - ATMEL AT91 PIO4
 +M:    Ludovic Desroches <ludovic.desroches@atmel.com>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +L:    linux-gpio@vger.kernel.org
 +S:    Supported
 +F:    drivers/pinctrl/pinctrl-at91-pio4.*
 +
  PIN CONTROLLER - INTEL
  M:    Mika Westerberg <mika.westerberg@linux.intel.com>
  M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
@@@ -8431,6 -8276,12 +8432,6 @@@ M:     "Rafael J. Wysocki" <rafael.j.wysock
  S:    Maintained
  F:    drivers/pnp/
  
 -PNXxxxx I2C DRIVER
 -M:    Vitaly Wool <vitalywool@gmail.com>
 -L:    linux-i2c@vger.kernel.org
 -S:    Maintained
 -F:    drivers/i2c/busses/i2c-pnx.c
 -
  PPP PROTOCOL DRIVERS AND COMPRESSORS
  M:    Paul Mackerras <paulus@samba.org>
  L:    linux-ppp@vger.kernel.org
@@@ -8683,16 -8534,6 +8684,16 @@@ L:    netdev@vger.kernel.or
  S:    Supported
  F:    drivers/net/ethernet/qlogic/qlge/
  
 +QLOGIC QL4xxx ETHERNET DRIVER
 +M:    Yuval Mintz <Yuval.Mintz@qlogic.com>
 +M:    Ariel Elior <Ariel.Elior@qlogic.com>
 +M:    everest-linux-l2@qlogic.com
 +L:    netdev@vger.kernel.org
 +S:    Supported
 +F:    drivers/net/ethernet/qlogic/qed/
 +F:    include/linux/qed/
 +F:    drivers/net/ethernet/qlogic/qede/
 +
  QNX4 FILESYSTEM
  M:    Anders Larsen <al@alarsen.net>
  W:    http://www.alarsen.net/linux/qnx4fs/
@@@ -9044,13 -8885,6 +9045,13 @@@ S:    Maintaine
  F:    drivers/net/wireless/rtlwifi/
  F:    drivers/net/wireless/rtlwifi/rtl8192ce/
  
 +RTL8XXXU WIRELESS DRIVER (rtl8xxxu)
 +M:    Jes Sorensen <Jes.Sorensen@redhat.com>
 +L:    linux-wireless@vger.kernel.org
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jes/linux.git rtl8723au-mac80211
 +S:    Maintained
 +F:    drivers/net/wireless/realtek/rtl8xxxu/
 +
  S3 SAVAGE FRAMEBUFFER DRIVER
  M:    Antonino Daplas <adaplas@gmail.com>
  L:    linux-fbdev@vger.kernel.org
@@@ -9124,13 -8958,6 +9125,13 @@@ F:    drivers/s390/net/*iucv
  F:    include/net/iucv/
  F:    net/iucv/
  
 +S390 IOMMU (PCI)
 +M:    Gerald Schaefer <gerald.schaefer@de.ibm.com>
 +L:    linux-s390@vger.kernel.org
 +W:    http://www.ibm.com/developerworks/linux/linux390/
 +S:    Supported
 +F:    drivers/iommu/s390-iommu.c
 +
  S3C24XX SD/MMC Driver
  M:    Ben Dooks <ben-linux@fluff.org>
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@@ -9306,14 -9133,6 +9307,14 @@@ S:    Maintaine
  F:    include/linux/mmc/dw_mmc.h
  F:    drivers/mmc/host/dw_mmc*
  
 +SYSTEM TRACE MODULE CLASS
 +M:    Alexander Shishkin <alexander.shishkin@linux.intel.com>
 +S:    Maintained
 +F:    Documentation/trace/stm.txt
 +F:    drivers/hwtracing/stm/
 +F:    include/linux/stm.h
 +F:    include/uapi/linux/stm.h
 +
  THUNDERBOLT DRIVER
  M:    Andreas Noever <andreas.noever@gmail.com>
  S:    Maintained
@@@ -9358,16 -9177,6 +9359,16 @@@ W:    http://www.sunplus.co
  S:    Supported
  F:    arch/score/
  
 +SYSTEM CONTROL & POWER INTERFACE (SCPI) Message Protocol drivers
 +M:    Sudeep Holla <sudeep.holla@arm.com>
 +L:    linux-arm-kernel@lists.infradead.org
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/arm/arm,scpi.txt
 +F:    drivers/clk/clk-scpi.c
 +F:    drivers/cpufreq/scpi-cpufreq.c
 +F:    drivers/firmware/arm_scpi.c
 +F:    include/linux/scpi_protocol.h
 +
  SCSI CDROM DRIVER
  M:    Jens Axboe <axboe@kernel.dk>
  L:    linux-scsi@vger.kernel.org
@@@ -9529,8 -9338,8 +9530,8 @@@ F:      include/uapi/linux/phantom.
  
  SERVER ENGINES 10Gbps iSCSI - BladeEngine 2 DRIVER
  M:    Jayamohan Kallickal <jayamohan.kallickal@avagotech.com>
 -M:    Minh Tran <minh.tran@avagotech.com>
 -M:    John Soni Jose <sony.john-n@avagotech.com>
 +M:    Ketan Mukadam <ketan.mukadam@avagotech.com>
 +M:    John Soni Jose <sony.john@avagotech.com>
  L:    linux-scsi@vger.kernel.org
  W:    http://www.avagotech.com
  S:    Supported
@@@ -10192,11 -10001,9 +10193,11 @@@ F: drivers/staging/vt665?
  
  STAGING - WILC1000 WIFI DRIVER
  M:    Johnny Kim <johnny.kim@atmel.com>
 -M:    Rachel Kim <rachel.kim@atmel.com>
 -M:    Dean Lee <dean.lee@atmel.com>
 +M:    Austin Shin <austin.shin@atmel.com>
  M:    Chris Park <chris.park@atmel.com>
 +M:    Tony Cho <tony.cho@atmel.com>
 +M:    Glen Lee <glen.lee@atmel.com>
 +M:    Leo Kim <leo.kim@atmel.com>
  L:    linux-wireless@vger.kernel.org
  S:    Supported
  F:    drivers/staging/wilc1000/
@@@ -10285,7 -10092,6 +10286,7 @@@ F:   include/net/switchdev.
  
  SYNOPSYS ARC ARCHITECTURE
  M:    Vineet Gupta <vgupta@synopsys.com>
 +L:    linux-snps-arc@lists.infraded.org
  S:    Supported
  F:    arch/arc/
  F:    Documentation/devicetree/bindings/arc/*
@@@ -10746,12 -10552,6 +10747,12 @@@ L: platform-driver-x86@vger.kernel.or
  S:    Maintained
  F:    drivers/platform/x86/toshiba_haps.c
  
 +TOSHIBA WMI HOTKEYS DRIVER
 +M:    Azael Avalos <coproscefalo@gmail.com>
 +L:    platform-driver-x86@vger.kernel.org
 +S:    Maintained
 +F:    drivers/platform/x86/toshiba-wmi.c
 +
  TOSHIBA SMM DRIVER
  M:    Jonathan Buzzard <jonathan@buzzard.org.uk>
  W:    http://www.buzzard.org.uk/toshiba/
@@@ -10809,7 -10609,6 +10810,7 @@@ F:   drivers/media/pci/tw68
  TPM DEVICE DRIVER
  M:    Peter Huewe <peterhuewe@gmx.de>
  M:    Marcel Selhorst <tpmdd@selhorst.net>
 +M:    Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
  R:    Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
  W:    http://tpmdd.sourceforge.net
  L:    tpmdd-devel@lists.sourceforge.net (moderated for non-subscribers)
@@@ -11294,12 -11093,6 +11295,12 @@@ S: Maintaine
  F:    Documentation/fb/uvesafb.txt
  F:    drivers/video/fbdev/uvesafb.*
  
 +VF610 NAND DRIVER
 +M:    Stefan Agner <stefan@agner.ch>
 +L:    linux-mtd@lists.infradead.org
 +S:    Supported
 +F:    drivers/mtd/nand/vf610_nfc.c
 +
  VFAT/FAT/MSDOS FILESYSTEM
  M:    OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
  S:    Maintained
@@@ -11330,12 -11123,6 +11331,12 @@@ S: Maintaine
  F:    drivers/media/v4l2-core/videobuf2-*
  F:    include/media/videobuf2-*
  
 +VIRTUAL SERIO DEVICE DRIVER
 +M:    Stephen Chandler Paul <thatslyude@gmail.com>
 +S:    Maintained
 +F:    drivers/input/serio/userio.c
 +F:    include/uapi/linux/userio.h
 +
  VIRTIO CONSOLE DRIVER
  M:    Amit Shah <amit.shah@redhat.com>
  L:    virtualization@lists.linux-foundation.org
@@@ -11413,13 -11200,6 +11414,13 @@@ L: netdev@vger.kernel.or
  S:    Maintained
  F:    drivers/net/ethernet/via/via-velocity.*
  
 +VIRT LIB
 +M:    Alex Williamson <alex.williamson@redhat.com>
 +M:    Paolo Bonzini <pbonzini@redhat.com>
 +L:    kvm@vger.kernel.org
 +S:    Supported
 +F:    virt/lib/
 +
  VIVID VIRTUAL VIDEO DRIVER
  M:    Hans Verkuil <hverkuil@xs4all.nl>
  L:    linux-media@vger.kernel.org
@@@ -11508,6 -11288,7 +11509,6 @@@ M:   Shrijeet Mukherjee <shm@cumulusnetwo
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/vrf.c
 -F:    include/net/vrf.h
  F:    Documentation/networking/vrf.txt
  
  VT1211 HARDWARE MONITOR DRIVER
@@@ -11626,9 -11407,6 +11627,9 @@@ T:   git https://github.com/CirrusLogic/l
  W:    https://github.com/CirrusLogic/linux-drivers/wiki
  S:    Supported
  F:    Documentation/hwmon/wm83??
 +F:    Documentation/devicetree/bindings/extcon/extcon-arizona.txt
 +F:    Documentation/devicetree/bindings/regulator/arizona-regulator.txt
 +F:    Documentation/devicetree/bindings/mfd/arizona.txt
  F:    arch/arm/mach-s3c64xx/mach-crag6410*
  F:    drivers/clk/clk-wm83*.c
  F:    drivers/extcon/extcon-arizona.c
@@@ -11689,7 -11467,6 +11690,7 @@@ L:   platform-driver-x86@vger.kernel.or
  T:    git git://git.infradead.org/users/dvhart/linux-platform-drivers-x86.git
  S:    Maintained
  F:    drivers/platform/x86/
 +F:    drivers/platform/olpc/
  
  X86 MCE INFRASTRUCTURE
  M:    Tony Luck <tony.luck@intel.com>
@@@ -11698,11 -11475,6 +11699,11 @@@ L: linux-edac@vger.kernel.or
  S:    Maintained
  F:    arch/x86/kernel/cpu/mcheck/*
  
 +X86 MICROCODE UPDATE SUPPORT
 +M:    Borislav Petkov <bp@alien8.de>
 +S:    Maintained
 +F:    arch/x86/kernel/cpu/microcode/*
 +
  X86 VDSO
  M:    Andy Lutomirski <luto@amacapital.net>
  L:    linux-kernel@vger.kernel.org
diff --combined arch/arc/mm/cache.c
index ff7ff6cbb8112408c05a38a2f8e001265d5d3726,875ac2e918c55d7fcc17b738c8dcfdeafacb8d79..b65f797e9ad6723abd7c38bba09e382df52450b4
@@@ -25,7 -25,7 +25,7 @@@ static int l2_line_sz
  int ioc_exists;
  volatile int slc_enable = 1, ioc_enable = 1;
  
 -void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr,
 +void (*_cache_line_loop_ic_fn)(phys_addr_t paddr, unsigned long vaddr,
                               unsigned long sz, const int cacheop);
  
  void (*__dma_cache_wback_inv)(unsigned long start, unsigned long sz);
@@@ -37,6 -37,7 +37,6 @@@ char *arc_cache_mumbojumbo(int c, char 
        int n = 0;
        struct cpuinfo_arc_cache *p;
  
 -#define IS_USED_RUN(v)                ((v) ? "" : "(disabled) ")
  #define PR_CACHE(p, cfg, str)                                         \
        if (!(p)->ver)                                                  \
                n += scnprintf(buf + n, len - n, str"\t\t: N/A\n");     \
@@@ -46,7 -47,7 +46,7 @@@
                        (p)->sz_k, (p)->assoc, (p)->line_len,           \
                        (p)->vipt ? "VIPT" : "PIPT",                    \
                        (p)->alias ? " aliasing" : "",                  \
 -                      IS_ENABLED(cfg) ? "" : " (not used)");
 +                      IS_USED_CFG(cfg));
  
        PR_CACHE(&cpuinfo_arc700[c].icache, CONFIG_ARC_HAS_ICACHE, "I-Cache");
        PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache");
@@@ -62,7 -63,7 +62,7 @@@
  
        if (ioc_exists)
                n += scnprintf(buf + n, len - n, "IOC\t\t:%s\n",
 -                              IS_USED_RUN(ioc_enable));
 +                              IS_DISABLED_RUN(ioc_enable));
  
        return buf;
  }
@@@ -216,7 -217,7 +216,7 @@@ slc_chk
   */
  
  static inline
 -void __cache_line_loop_v2(unsigned long paddr, unsigned long vaddr,
 +void __cache_line_loop_v2(phys_addr_t paddr, unsigned long vaddr,
                          unsigned long sz, const int op)
  {
        unsigned int aux_cmd;
        }
  }
  
 +/*
 + * For ARC700 MMUv3 I-cache and D-cache flushes
 + * Also reused for HS38 aliasing I-cache configuration
 + */
  static inline
 -void __cache_line_loop_v3(unsigned long paddr, unsigned long vaddr,
 +void __cache_line_loop_v3(phys_addr_t paddr, unsigned long vaddr,
                          unsigned long sz, const int op)
  {
        unsigned int aux_cmd, aux_tag;
        if (full_page)
                write_aux_reg(aux_tag, paddr);
  
 +      /*
 +       * This is technically for MMU v4, using the MMU v3 programming model
 +       * Special work for HS38 aliasing I-cache configuration with PAE40
 +       *   - upper 8 bits of paddr need to be written into PTAG_HI
 +       *   - (and needs to be written before the lower 32 bits)
 +       * Note that PTAG_HI is hoisted outside the line loop
 +       */
 +      if (is_pae40_enabled() && op == OP_INV_IC)
 +              write_aux_reg(ARC_REG_IC_PTAG_HI, (u64)paddr >> 32);
 +
        while (num_lines-- > 0) {
                if (!full_page) {
                        write_aux_reg(aux_tag, paddr);
  }
  
  /*
 - * In HS38x (MMU v4), although icache is VIPT, only paddr is needed for cache
 - * maintenance ops (in IVIL reg), as long as icache doesn't alias.
 + * In HS38x (MMU v4), I-cache is VIPT (can alias), D-cache is PIPT
 + * Here's how cache ops are implemented
 + *
 + *  - D-cache: only paddr needed (in DC_IVDL/DC_FLDL)
 + *  - I-cache Non Aliasing: Despite VIPT, only paddr needed (in IC_IVIL)
 + *  - I-cache Aliasing: Both vaddr and paddr needed (in IC_IVIL, IC_PTAG
 + *    respectively, similar to MMU v3 programming model, hence
 + *    __cache_line_loop_v3() is used)
   *
 - * For Aliasing icache, vaddr is also needed (in IVIL), while paddr is
 - * specified in PTAG (similar to MMU v3)
 + * If PAE40 is enabled, independent of aliasing considerations, the higher bits
 + * needs to be written into PTAG_HI
   */
  static inline
 -void __cache_line_loop_v4(unsigned long paddr, unsigned long vaddr,
 +void __cache_line_loop_v4(phys_addr_t paddr, unsigned long vaddr,
                          unsigned long sz, const int cacheop)
  {
        unsigned int aux_cmd;
  
        num_lines = DIV_ROUND_UP(sz, L1_CACHE_BYTES);
  
 +      /*
 +       * For HS38 PAE40 configuration
 +       *   - upper 8 bits of paddr need to be written into PTAG_HI
 +       *   - (and needs to be written before the lower 32 bits)
 +       */
 +      if (is_pae40_enabled()) {
 +              if (cacheop == OP_INV_IC)
 +                      /*
 +                       * Non aliasing I-cache in HS38,
 +                       * aliasing I-cache handled in __cache_line_loop_v3()
 +                       */
 +                      write_aux_reg(ARC_REG_IC_PTAG_HI, (u64)paddr >> 32);
 +              else
 +                      write_aux_reg(ARC_REG_DC_PTAG_HI, (u64)paddr >> 32);
 +      }
 +
        while (num_lines-- > 0) {
                write_aux_reg(aux_cmd, paddr);
                paddr += L1_CACHE_BYTES;
@@@ -448,7 -413,7 +448,7 @@@ static inline void __dc_entire_op(cons
  /*
   * D-Cache Line ops: Per Line INV (discard or wback+discard) or FLUSH (wback)
   */
 -static inline void __dc_line_op(unsigned long paddr, unsigned long vaddr,
 +static inline void __dc_line_op(phys_addr_t paddr, unsigned long vaddr,
                                unsigned long sz, const int op)
  {
        unsigned long flags;
@@@ -481,7 -446,7 +481,7 @@@ static inline void __ic_entire_inv(void
  }
  
  static inline void
 -__ic_line_inv_vaddr_local(unsigned long paddr, unsigned long vaddr,
 +__ic_line_inv_vaddr_local(phys_addr_t paddr, unsigned long vaddr,
                          unsigned long sz)
  {
        unsigned long flags;
  #else
  
  struct ic_inv_args {
 -      unsigned long paddr, vaddr;
 +      phys_addr_t paddr, vaddr;
        int sz;
  };
  
@@@ -509,7 -474,7 +509,7 @@@ static void __ic_line_inv_vaddr_helper(
          __ic_line_inv_vaddr_local(ic_inv->paddr, ic_inv->vaddr, ic_inv->sz);
  }
  
 -static void __ic_line_inv_vaddr(unsigned long paddr, unsigned long vaddr,
 +static void __ic_line_inv_vaddr(phys_addr_t paddr, unsigned long vaddr,
                                unsigned long sz)
  {
        struct ic_inv_args ic_inv = {
  
  #endif /* CONFIG_ARC_HAS_ICACHE */
  
 -noinline void slc_op(unsigned long paddr, unsigned long sz, const int op)
 +noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
  {
  #ifdef CONFIG_ISA_ARCV2
        /*
@@@ -617,10 -582,10 +617,10 @@@ void flush_dcache_page(struct page *pag
         */
        if (!mapping_mapped(mapping)) {
                clear_bit(PG_dc_clean, &page->flags);
-       } else if (page_mapped(page)) {
+       } else if (page_mapcount(page)) {
  
                /* kernel reading from page with U-mapping */
 -              unsigned long paddr = (unsigned long)page_address(page);
 +              phys_addr_t paddr = (unsigned long)page_address(page);
                unsigned long vaddr = page->index << PAGE_CACHE_SHIFT;
  
                if (addr_not_cache_congruent(paddr, vaddr))
@@@ -768,14 -733,14 +768,14 @@@ EXPORT_SYMBOL(flush_icache_range)
   *    builtin kernel page will not have any virtual mappings.
   *    kprobe on loadable module will be kernel vaddr.
   */
 -void __sync_icache_dcache(unsigned long paddr, unsigned long vaddr, int len)
 +void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len)
  {
        __dc_line_op(paddr, vaddr, len, OP_FLUSH_N_INV);
        __ic_line_inv_vaddr(paddr, vaddr, len);
  }
  
  /* wrapper to compile time eliminate alignment checks in flush loop */
 -void __inv_icache_page(unsigned long paddr, unsigned long vaddr)
 +void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr)
  {
        __ic_line_inv_vaddr(paddr, vaddr, PAGE_SIZE);
  }
   * wrapper to clearout kernel or userspace mappings of a page
   * For kernel mappings @vaddr == @paddr
   */
 -void __flush_dcache_page(unsigned long paddr, unsigned long vaddr)
 +void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr)
  {
        __dc_line_op(paddr, vaddr & PAGE_MASK, PAGE_SIZE, OP_FLUSH_N_INV);
  }
@@@ -842,8 -807,8 +842,8 @@@ void flush_anon_page(struct vm_area_str
  void copy_user_highpage(struct page *to, struct page *from,
        unsigned long u_vaddr, struct vm_area_struct *vma)
  {
 -      unsigned long kfrom = (unsigned long)page_address(from);
 -      unsigned long kto = (unsigned long)page_address(to);
 +      void *kfrom = kmap_atomic(from);
 +      void *kto = kmap_atomic(to);
        int clean_src_k_mappings = 0;
  
        /*
         *
         * Note that while @u_vaddr refers to DST page's userspace vaddr, it is
         * equally valid for SRC page as well
 +       *
 +       * For !VIPT cache, all of this gets compiled out as
 +       * addr_not_cache_congruent() is 0
         */
-       if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+       if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
 -              __flush_dcache_page(kfrom, u_vaddr);
 +              __flush_dcache_page((unsigned long)kfrom, u_vaddr);
                clean_src_k_mappings = 1;
        }
  
 -      copy_page((void *)kto, (void *)kfrom);
 +      copy_page(kto, kfrom);
  
        /*
         * Mark DST page K-mapping as dirty for a later finalization by
         * sync the kernel mapping back to physical page
         */
        if (clean_src_k_mappings) {
 -              __flush_dcache_page(kfrom, kfrom);
 +              __flush_dcache_page((unsigned long)kfrom, (unsigned long)kfrom);
                set_bit(PG_dc_clean, &from->flags);
        } else {
                clear_bit(PG_dc_clean, &from->flags);
        }
 +
 +      kunmap_atomic(kto);
 +      kunmap_atomic(kfrom);
  }
  
  void clear_user_page(void *to, unsigned long u_vaddr, struct page *page)
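
[Editor's note, not part of the patch] The new comments in __cache_line_loop_v3()/__cache_line_loop_v4() above describe the PAE40 case: the upper 8 bits of a 40-bit physical address go to the *_PTAG_HI aux register before the per-line op consumes the lower 32 bits. A tiny user-space illustration of that split; the example address is made up, and the actual ARC_REG_*_PTAG_HI writes are of course kernel-only:

#include <stdio.h>
#include <stdint.h>

/* Arithmetic illustration only: split a 40-bit PAE40 physical address
 * into the value written to PTAG_HI and the low 32 bits used by the
 * per-line cache operation. */
int main(void)
{
	uint64_t paddr = 0x12345678ULL | (0xabULL << 32);	/* example 40-bit paddr */
	uint32_t ptag_hi = (uint32_t)(paddr >> 32);		/* upper 8 bits -> PTAG_HI */
	uint32_t lo      = (uint32_t)paddr;			/* lower 32 bits -> line op */

	printf("paddr=0x%010llx PTAG_HI=0x%02x low=0x%08x\n",
	       (unsigned long long)paddr, (unsigned)ptag_hi, (unsigned)lo);
	return 0;
}
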
diff --combined arch/arm/mm/dma-mapping.c
index ad4eb2d26e1697fc6a16f47a8805e532e198a693,38307d8312ac6b7a51bb155980f52bc7fca8ea7e..e62400e5fb99fdbf864af966e718a98decf85e29
@@@ -651,12 -651,12 +651,12 @@@ static void *__dma_alloc(struct device 
  
        if (nommu())
                addr = __alloc_simple_buffer(dev, size, gfp, &page);
-       else if (dev_get_cma_area(dev) && (gfp & __GFP_WAIT))
+       else if (dev_get_cma_area(dev) && (gfp & __GFP_DIRECT_RECLAIM))
                addr = __alloc_from_contiguous(dev, size, prot, &page,
                                               caller, want_vaddr);
        else if (is_coherent)
                addr = __alloc_simple_buffer(dev, size, gfp, &page);
-       else if (!(gfp & __GFP_WAIT))
+       else if (!gfpflags_allow_blocking(gfp))
                addr = __alloc_from_pool(size, &page);
        else
                addr = __alloc_remap_buffer(dev, size, gfp, prot, &page,
@@@ -1363,7 -1363,7 +1363,7 @@@ static void *arm_iommu_alloc_attrs(stru
        *handle = DMA_ERROR_CODE;
        size = PAGE_ALIGN(size);
  
-       if (!(gfp & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(gfp))
                return __iommu_alloc_atomic(dev, size, handle);
  
        /*
@@@ -1407,19 -1407,12 +1407,19 @@@ static int arm_iommu_mmap_attrs(struct 
        unsigned long uaddr = vma->vm_start;
        unsigned long usize = vma->vm_end - vma->vm_start;
        struct page **pages = __iommu_get_pages(cpu_addr, attrs);
 +      unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
 +      unsigned long off = vma->vm_pgoff;
  
        vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot);
  
        if (!pages)
                return -ENXIO;
  
 +      if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off)
 +              return -ENXIO;
 +
 +      pages += off;
 +
        do {
                int ret = vm_insert_page(vma, uaddr, *pages++);
                if (ret) {
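
[Editor's note, not part of the patch] The ARM DMA hunks above replace open-coded `gfp & __GFP_WAIT` tests with gfpflags_allow_blocking(); judging from the parallel conversion to `gfp & __GFP_DIRECT_RECLAIM` in __dma_alloc(), the helper boils down to asking whether direct reclaim (and hence sleeping) is allowed. A user-space sketch of that semantic, using stand-in flag bits that are not the real gfp.h layout:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in flag values for illustration only, not the kernel's real
 * gfp.h bit layout. The conversion above asks "may this allocation
 * block?" via gfpflags_allow_blocking(), which is true exactly when
 * direct reclaim is permitted. */
typedef unsigned int gfp_t;

#define __GFP_DIRECT_RECLAIM	0x01u	/* stand-in bit */
#define __GFP_KSWAPD_RECLAIM	0x02u	/* stand-in bit */
#define GFP_KERNEL		(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
#define GFP_ATOMIC		(__GFP_KSWAPD_RECLAIM)

static bool gfpflags_allow_blocking(gfp_t gfp)
{
	return !!(gfp & __GFP_DIRECT_RECLAIM);
}

int main(void)
{
	printf("GFP_KERNEL may block: %d\n", gfpflags_allow_blocking(GFP_KERNEL));	/* 1 */
	printf("GFP_ATOMIC may block: %d\n", gfpflags_allow_blocking(GFP_ATOMIC));	/* 0 */
	return 0;
}
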
diff --combined arch/arm/xen/mm.c
index 7c34f7126b046abe9d61637a1716a2ae5139bfe5,99eec9063f68afd075dfe87e5a2f0d0ef30502cf..c5f9a9e3d1f393daa161ce8349c820462870a942
@@@ -25,7 -25,7 +25,7 @@@
  unsigned long xen_get_swiotlb_free_pages(unsigned int order)
  {
        struct memblock_region *reg;
-       gfp_t flags = __GFP_NOWARN;
+       gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
  
        for_each_memblock(memory, reg) {
                if (reg->base < (phys_addr_t)0xffffffff) {
@@@ -48,22 -48,22 +48,22 @@@ static void dma_cache_maint(dma_addr_t 
        size_t size, enum dma_data_direction dir, enum dma_cache_op op)
  {
        struct gnttab_cache_flush cflush;
 -      unsigned long pfn;
 +      unsigned long xen_pfn;
        size_t left = size;
  
 -      pfn = (handle >> PAGE_SHIFT) + offset / PAGE_SIZE;
 -      offset %= PAGE_SIZE;
 +      xen_pfn = (handle >> XEN_PAGE_SHIFT) + offset / XEN_PAGE_SIZE;
 +      offset %= XEN_PAGE_SIZE;
  
        do {
                size_t len = left;
        
                /* buffers in highmem or foreign pages cannot cross page
                 * boundaries */
 -              if (len + offset > PAGE_SIZE)
 -                      len = PAGE_SIZE - offset;
 +              if (len + offset > XEN_PAGE_SIZE)
 +                      len = XEN_PAGE_SIZE - offset;
  
                cflush.op = 0;
 -              cflush.a.dev_bus_addr = pfn << PAGE_SHIFT;
 +              cflush.a.dev_bus_addr = xen_pfn << XEN_PAGE_SHIFT;
                cflush.offset = offset;
                cflush.length = len;
  
@@@ -79,7 -79,7 +79,7 @@@
                        HYPERVISOR_grant_table_op(GNTTABOP_cache_flush, &cflush, 1);
  
                offset = 0;
 -              pfn++;
 +              xen_pfn++;
                left -= len;
        } while (left);
  }
@@@ -138,29 -138,10 +138,29 @@@ void __xen_dma_sync_single_for_device(s
  }
  
  bool xen_arch_need_swiotlb(struct device *dev,
 -                         unsigned long pfn,
 -                         unsigned long bfn)
 +                         phys_addr_t phys,
 +                         dma_addr_t dev_addr)
  {
 -      return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev));
 +      unsigned int xen_pfn = XEN_PFN_DOWN(phys);
 +      unsigned int bfn = XEN_PFN_DOWN(dev_addr);
 +
 +      /*
 +       * The swiotlb buffer should be used if
 +       *      - Xen doesn't have the cache flush hypercall
 +       *      - The Linux page refers to foreign memory
 +       *      - The device doesn't support coherent DMA request
 +       *
 +       * The Linux page may span multiple Xen pages, although
 +       * it's not possible to have a mix of local and foreign Xen page.
 +       * Furthermore, range_straddles_page_boundary is already checking
 +       * if buffer is physically contiguous in the host RAM.
 +       *
 +       * Therefore we only need to check the first Xen page to know if we
 +       * require a bounce buffer because the device doesn't support coherent
 +       * memory and we are not able to flush the cache.
 +       */
 +      return (!hypercall_cflush && (xen_pfn != bfn) &&
 +              !is_device_dma_coherent(dev));
  }
  
  int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
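
[Editor's note, not part of the patch] The xen/mm.c hunks above convert dma_cache_maint() and xen_arch_need_swiotlb() from Linux-page to Xen-page units, since a Linux page (64K on some arm64 configurations) can span several 4K Xen pages. A user-space sketch of the arithmetic, assuming those two granule sizes; the handle and offset values are made up:

#include <stdio.h>
#include <stdint.h>

/* Arithmetic illustration only (not the Xen headers). Assumed granules:
 * a 4K Xen page and a 64K Linux page, as on an arm64 64K-pages kernel. */
#define XEN_PAGE_SHIFT	12
#define XEN_PAGE_SIZE	(1UL << XEN_PAGE_SHIFT)
#define PAGE_SHIFT	16
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	uint64_t handle = 0x80010000;	/* example bus address */
	unsigned long offset = 0x1800;	/* example offset into the buffer */

	/* Same computation as the new dma_cache_maint() code above. */
	unsigned long xen_pfn = (unsigned long)(handle >> XEN_PAGE_SHIFT)
				+ offset / XEN_PAGE_SIZE;

	printf("one %luK Linux page spans %lu Xen pages\n",
	       PAGE_SIZE >> 10, PAGE_SIZE / XEN_PAGE_SIZE);
	printf("first Xen pfn touched: 0x%lx\n", xen_pfn);
	return 0;
}
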
diff --combined arch/arm64/include/asm/pgtable.h
index f3acf421ded4f55616abd7b68a7dcf83081e6f38,0e98f9bc0674401a6e64934348a66254fece2031..5687caf59dd565183213bc920b390bb0232158fa
   *    fixed mappings and modules
   */
  #define VMEMMAP_SIZE          ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
 -#define VMALLOC_START         (UL(0xffffffffffffffff) << VA_BITS)
 +
 +#ifndef CONFIG_KASAN
 +#define VMALLOC_START         (VA_START)
 +#else
 +#include <asm/kasan.h>
 +#define VMALLOC_START         (KASAN_SHADOW_END + SZ_64K)
 +#endif
 +
  #define VMALLOC_END           (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
  
  #define vmemmap                       ((struct page *)(VMALLOC_END + SZ_64K))
@@@ -67,10 -60,8 +67,10 @@@ extern void __pgd_error(const char *fil
  #define PROT_DEFAULT          (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
  #define PROT_SECT_DEFAULT     (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
  
 +#define PROT_DEVICE_nGnRnE    (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
  #define PROT_DEVICE_nGnRE     (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE))
  #define PROT_NORMAL_NC                (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC))
 +#define PROT_NORMAL_WT                (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_WT))
  #define PROT_NORMAL           (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL))
  
  #define PROT_SECT_DEVICE_nGnRE        (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE))
@@@ -81,7 -72,6 +81,7 @@@
  
  #define PAGE_KERNEL           __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
  #define PAGE_KERNEL_EXEC      __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
 +#define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
  
  #define PAGE_HYP              __pgprot(_PAGE_DEFAULT | PTE_HYP)
  #define PAGE_HYP_DEVICE               __pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
@@@ -150,7 -140,6 +150,7 @@@ extern struct page *empty_zero_page
  #define pte_special(pte)      (!!(pte_val(pte) & PTE_SPECIAL))
  #define pte_write(pte)                (!!(pte_val(pte) & PTE_WRITE))
  #define pte_exec(pte)         (!(pte_val(pte) & PTE_UXN))
 +#define pte_cont(pte)         (!!(pte_val(pte) & PTE_CONT))
  
  #ifdef CONFIG_ARM64_HW_AFDBM
  #define pte_hw_dirty(pte)     (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
@@@ -213,16 -202,6 +213,16 @@@ static inline pte_t pte_mkspecial(pte_
        return set_pte_bit(pte, __pgprot(PTE_SPECIAL));
  }
  
 +static inline pte_t pte_mkcont(pte_t pte)
 +{
 +      return set_pte_bit(pte, __pgprot(PTE_CONT));
 +}
 +
 +static inline pte_t pte_mknoncont(pte_t pte)
 +{
 +      return clear_pte_bit(pte, __pgprot(PTE_CONT));
 +}
 +
  static inline void set_pte(pte_t *ptep, pte_t pte)
  {
        *ptep = pte;
@@@ -331,21 -310,15 +331,15 @@@ static inline pgprot_t mk_sect_prot(pgp
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  #define pmd_trans_huge(pmd)   (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
- #define pmd_trans_splitting(pmd)      pte_special(pmd_pte(pmd))
- #ifdef CONFIG_HAVE_RCU_TABLE_FREE
- #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- struct vm_area_struct;
- void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp);
- #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
  #define pmd_dirty(pmd)                pte_dirty(pmd_pte(pmd))
  #define pmd_young(pmd)                pte_young(pmd_pte(pmd))
  #define pmd_wrprotect(pmd)    pte_pmd(pte_wrprotect(pmd_pte(pmd)))
- #define pmd_mksplitting(pmd)  pte_pmd(pte_mkspecial(pmd_pte(pmd)))
  #define pmd_mkold(pmd)                pte_pmd(pte_mkold(pmd_pte(pmd)))
  #define pmd_mkwrite(pmd)      pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+ #define pmd_mkclean(pmd)      pte_pmd(pte_mkclean(pmd_pte(pmd)))
  #define pmd_mkdirty(pmd)      pte_pmd(pte_mkdirty(pmd_pte(pmd)))
  #define pmd_mkyoung(pmd)      pte_pmd(pte_mkyoung(pmd_pte(pmd)))
  #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
@@@ -667,17 -640,14 +661,17 @@@ static inline void update_mmu_cache(str
                                    unsigned long addr, pte_t *ptep)
  {
        /*
 -       * set_pte() does not have a DSB for user mappings, so make sure that
 -       * the page table write is visible.
 +       * We don't do anything here, so there's a very small chance of
 +       * us retaking a user fault which we just fixed up. The alternative
 +       * is doing a dsb(ishst), but that penalises the fastpath.
         */
 -      dsb(ishst);
  }
  
  #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)
  
 +#define kc_vaddr_to_offset(v) ((v) & ~VA_START)
 +#define kc_offset_to_vaddr(o) ((o) | VA_START)
 +
  #endif /* !__ASSEMBLY__ */
  
  #endif /* __ASM_PGTABLE_H */
index 6320361d8d4c703cf4072ba2e47fdf7884c7f779,478234383c2cd8c90087eca0031b425a3d01b606..131a199114b405e8403f05137e560a2b317f4941
@@@ -100,7 -100,7 +100,7 @@@ static void *__dma_alloc_coherent(struc
        if (IS_ENABLED(CONFIG_ZONE_DMA) &&
            dev->coherent_dma_mask <= DMA_BIT_MASK(32))
                flags |= GFP_DMA;
-       if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) {
+       if (dev_get_cma_area(dev) && gfpflags_allow_blocking(flags)) {
                struct page *page;
                void *addr;
  
@@@ -148,7 -148,7 +148,7 @@@ static void *__dma_alloc(struct device 
  
        size = PAGE_ALIGN(size);
  
-       if (!coherent && !(flags & __GFP_WAIT)) {
+       if (!coherent && !gfpflags_allow_blocking(flags)) {
                struct page *page = NULL;
                void *addr = __alloc_from_pool(size, &page, flags);
  
@@@ -533,460 -533,3 +533,460 @@@ static int __init dma_debug_do_init(voi
        return 0;
  }
  fs_initcall(dma_debug_do_init);
-       if (gfp & __GFP_WAIT) {
 +
 +
 +#ifdef CONFIG_IOMMU_DMA
 +#include <linux/dma-iommu.h>
 +#include <linux/platform_device.h>
 +#include <linux/amba/bus.h>
 +
 +/* Thankfully, all cache ops are by VA so we can ignore phys here */
 +static void flush_page(struct device *dev, const void *virt, phys_addr_t phys)
 +{
 +      __dma_flush_range(virt, virt + PAGE_SIZE);
 +}
 +
 +static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 +                               dma_addr_t *handle, gfp_t gfp,
 +                               struct dma_attrs *attrs)
 +{
 +      bool coherent = is_device_dma_coherent(dev);
 +      int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
 +      void *addr;
 +
 +      if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
 +              return NULL;
 +      /*
 +       * Some drivers rely on this, and we probably don't want the
 +       * possibility of stale kernel data being read by devices anyway.
 +       */
 +      gfp |= __GFP_ZERO;
 +
++      if (gfpflags_allow_blocking(gfp)) {
 +              struct page **pages;
 +              pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
 +
 +              pages = iommu_dma_alloc(dev, size, gfp, ioprot, handle,
 +                                      flush_page);
 +              if (!pages)
 +                      return NULL;
 +
 +              addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
 +                                            __builtin_return_address(0));
 +              if (!addr)
 +                      iommu_dma_free(dev, pages, size, handle);
 +      } else {
 +              struct page *page;
 +              /*
 +               * In atomic context we can't remap anything, so we'll only
 +               * get the virtually contiguous buffer we need by way of a
 +               * physically contiguous allocation.
 +               */
 +              if (coherent) {
 +                      page = alloc_pages(gfp, get_order(size));
 +                      addr = page ? page_address(page) : NULL;
 +              } else {
 +                      addr = __alloc_from_pool(size, &page, gfp);
 +              }
 +              if (!addr)
 +                      return NULL;
 +
 +              *handle = iommu_dma_map_page(dev, page, 0, size, ioprot);
 +              if (iommu_dma_mapping_error(dev, *handle)) {
 +                      if (coherent)
 +                              __free_pages(page, get_order(size));
 +                      else
 +                              __free_from_pool(addr, size);
 +                      addr = NULL;
 +              }
 +      }
 +      return addr;
 +}
 +
 +static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
 +                             dma_addr_t handle, struct dma_attrs *attrs)
 +{
 +      /*
 +       * @cpu_addr will be one of 3 things depending on how it was allocated:
 +       * - A remapped array of pages from iommu_dma_alloc(), for all
 +       *   non-atomic allocations.
 +       * - A non-cacheable alias from the atomic pool, for atomic
 +       *   allocations by non-coherent devices.
 +       * - A normal lowmem address, for atomic allocations by
 +       *   coherent devices.
 +       * Hence how dodgy the below logic looks...
 +       */
 +      if (__in_atomic_pool(cpu_addr, size)) {
 +              iommu_dma_unmap_page(dev, handle, size, 0, NULL);
 +              __free_from_pool(cpu_addr, size);
 +      } else if (is_vmalloc_addr(cpu_addr)) {
 +              struct vm_struct *area = find_vm_area(cpu_addr);
 +
 +              if (WARN_ON(!area || !area->pages))
 +                      return;
 +              iommu_dma_free(dev, area->pages, size, &handle);
 +              dma_common_free_remap(cpu_addr, size, VM_USERMAP);
 +      } else {
 +              iommu_dma_unmap_page(dev, handle, size, 0, NULL);
 +              __free_pages(virt_to_page(cpu_addr), get_order(size));
 +      }
 +}
 +
 +static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
 +                            void *cpu_addr, dma_addr_t dma_addr, size_t size,
 +                            struct dma_attrs *attrs)
 +{
 +      struct vm_struct *area;
 +      int ret;
 +
 +      vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot,
 +                                           is_device_dma_coherent(dev));
 +
 +      if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
 +              return ret;
 +
 +      area = find_vm_area(cpu_addr);
 +      if (WARN_ON(!area || !area->pages))
 +              return -ENXIO;
 +
 +      return iommu_dma_mmap(area->pages, size, vma);
 +}
 +
 +static int __iommu_get_sgtable(struct device *dev, struct sg_table *sgt,
 +                             void *cpu_addr, dma_addr_t dma_addr,
 +                             size_t size, struct dma_attrs *attrs)
 +{
 +      unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 +      struct vm_struct *area = find_vm_area(cpu_addr);
 +
 +      if (WARN_ON(!area || !area->pages))
 +              return -ENXIO;
 +
 +      return sg_alloc_table_from_pages(sgt, area->pages, count, 0, size,
 +                                       GFP_KERNEL);
 +}
 +
 +static void __iommu_sync_single_for_cpu(struct device *dev,
 +                                      dma_addr_t dev_addr, size_t size,
 +                                      enum dma_data_direction dir)
 +{
 +      phys_addr_t phys;
 +
 +      if (is_device_dma_coherent(dev))
 +              return;
 +
 +      phys = iommu_iova_to_phys(iommu_get_domain_for_dev(dev), dev_addr);
 +      __dma_unmap_area(phys_to_virt(phys), size, dir);
 +}
 +
 +static void __iommu_sync_single_for_device(struct device *dev,
 +                                         dma_addr_t dev_addr, size_t size,
 +                                         enum dma_data_direction dir)
 +{
 +      phys_addr_t phys;
 +
 +      if (is_device_dma_coherent(dev))
 +              return;
 +
 +      phys = iommu_iova_to_phys(iommu_get_domain_for_dev(dev), dev_addr);
 +      __dma_map_area(phys_to_virt(phys), size, dir);
 +}
 +
 +static dma_addr_t __iommu_map_page(struct device *dev, struct page *page,
 +                                 unsigned long offset, size_t size,
 +                                 enum dma_data_direction dir,
 +                                 struct dma_attrs *attrs)
 +{
 +      bool coherent = is_device_dma_coherent(dev);
 +      int prot = dma_direction_to_prot(dir, coherent);
 +      dma_addr_t dev_addr = iommu_dma_map_page(dev, page, offset, size, prot);
 +
 +      if (!iommu_dma_mapping_error(dev, dev_addr) &&
 +          !dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 +              __iommu_sync_single_for_device(dev, dev_addr, size, dir);
 +
 +      return dev_addr;
 +}
 +
 +static void __iommu_unmap_page(struct device *dev, dma_addr_t dev_addr,
 +                             size_t size, enum dma_data_direction dir,
 +                             struct dma_attrs *attrs)
 +{
 +      if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 +              __iommu_sync_single_for_cpu(dev, dev_addr, size, dir);
 +
 +      iommu_dma_unmap_page(dev, dev_addr, size, dir, attrs);
 +}
 +
 +static void __iommu_sync_sg_for_cpu(struct device *dev,
 +                                  struct scatterlist *sgl, int nelems,
 +                                  enum dma_data_direction dir)
 +{
 +      struct scatterlist *sg;
 +      int i;
 +
 +      if (is_device_dma_coherent(dev))
 +              return;
 +
 +      for_each_sg(sgl, sg, nelems, i)
 +              __dma_unmap_area(sg_virt(sg), sg->length, dir);
 +}
 +
 +static void __iommu_sync_sg_for_device(struct device *dev,
 +                                     struct scatterlist *sgl, int nelems,
 +                                     enum dma_data_direction dir)
 +{
 +      struct scatterlist *sg;
 +      int i;
 +
 +      if (is_device_dma_coherent(dev))
 +              return;
 +
 +      for_each_sg(sgl, sg, nelems, i)
 +              __dma_map_area(sg_virt(sg), sg->length, dir);
 +}
 +
 +static int __iommu_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
 +                              int nelems, enum dma_data_direction dir,
 +                              struct dma_attrs *attrs)
 +{
 +      bool coherent = is_device_dma_coherent(dev);
 +
 +      if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 +              __iommu_sync_sg_for_device(dev, sgl, nelems, dir);
 +
 +      return iommu_dma_map_sg(dev, sgl, nelems,
 +                      dma_direction_to_prot(dir, coherent));
 +}
 +
 +static void __iommu_unmap_sg_attrs(struct device *dev,
 +                                 struct scatterlist *sgl, int nelems,
 +                                 enum dma_data_direction dir,
 +                                 struct dma_attrs *attrs)
 +{
 +      if (!dma_get_attr(DMA_ATTR_SKIP_CPU_SYNC, attrs))
 +              __iommu_sync_sg_for_cpu(dev, sgl, nelems, dir);
 +
 +      iommu_dma_unmap_sg(dev, sgl, nelems, dir, attrs);
 +}
 +
 +static struct dma_map_ops iommu_dma_ops = {
 +      .alloc = __iommu_alloc_attrs,
 +      .free = __iommu_free_attrs,
 +      .mmap = __iommu_mmap_attrs,
 +      .get_sgtable = __iommu_get_sgtable,
 +      .map_page = __iommu_map_page,
 +      .unmap_page = __iommu_unmap_page,
 +      .map_sg = __iommu_map_sg_attrs,
 +      .unmap_sg = __iommu_unmap_sg_attrs,
 +      .sync_single_for_cpu = __iommu_sync_single_for_cpu,
 +      .sync_single_for_device = __iommu_sync_single_for_device,
 +      .sync_sg_for_cpu = __iommu_sync_sg_for_cpu,
 +      .sync_sg_for_device = __iommu_sync_sg_for_device,
 +      .dma_supported = iommu_dma_supported,
 +      .mapping_error = iommu_dma_mapping_error,
 +};
 +
 +/*
 + * TODO: Right now __iommu_setup_dma_ops() gets called too early to do
 + * everything it needs to - the device is only partially created and the
 + * IOMMU driver hasn't seen it yet, so it can't have a group. Thus we
 + * need this delayed attachment dance. Once IOMMU probe ordering is sorted
 + * to move the arch_setup_dma_ops() call later, all the notifier bits below
 + * become unnecessary, and will go away.
 + */
 +struct iommu_dma_notifier_data {
 +      struct list_head list;
 +      struct device *dev;
 +      const struct iommu_ops *ops;
 +      u64 dma_base;
 +      u64 size;
 +};
 +static LIST_HEAD(iommu_dma_masters);
 +static DEFINE_MUTEX(iommu_dma_notifier_lock);
 +
 +/*
 + * Temporarily "borrow" a domain feature flag to to tell if we had to resort
 + * to creating our own domain here, in case we need to clean it up again.
 + */
 +#define __IOMMU_DOMAIN_FAKE_DEFAULT           (1U << 31)
 +
 +static bool do_iommu_attach(struct device *dev, const struct iommu_ops *ops,
 +                         u64 dma_base, u64 size)
 +{
 +      struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 +
 +      /*
 +       * Best case: The device is either part of a group which was
 +       * already attached to a domain in a previous call, or it's
 +       * been put in a default DMA domain by the IOMMU core.
 +       */
 +      if (!domain) {
 +              /*
 +               * Urgh. The IOMMU core isn't going to do default domains
 +               * for non-PCI devices anyway, until it has some means of
 +               * abstracting the entirely implementation-specific
 +               * sideband data/SoC topology/unicorn dust that may or
 +               * may not differentiate upstream masters.
 +               * So until then, HORRIBLE HACKS!
 +               */
 +              domain = ops->domain_alloc(IOMMU_DOMAIN_DMA);
 +              if (!domain)
 +                      goto out_no_domain;
 +
 +              domain->ops = ops;
 +              domain->type = IOMMU_DOMAIN_DMA | __IOMMU_DOMAIN_FAKE_DEFAULT;
 +
 +              if (iommu_attach_device(domain, dev))
 +                      goto out_put_domain;
 +      }
 +
 +      if (iommu_dma_init_domain(domain, dma_base, size))
 +              goto out_detach;
 +
 +      dev->archdata.dma_ops = &iommu_dma_ops;
 +      return true;
 +
 +out_detach:
 +      iommu_detach_device(domain, dev);
 +out_put_domain:
 +      if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT)
 +              iommu_domain_free(domain);
 +out_no_domain:
 +      pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
 +              dev_name(dev));
 +      return false;
 +}
 +
 +static void queue_iommu_attach(struct device *dev, const struct iommu_ops *ops,
 +                            u64 dma_base, u64 size)
 +{
 +      struct iommu_dma_notifier_data *iommudata;
 +
 +      iommudata = kzalloc(sizeof(*iommudata), GFP_KERNEL);
 +      if (!iommudata)
 +              return;
 +
 +      iommudata->dev = dev;
 +      iommudata->ops = ops;
 +      iommudata->dma_base = dma_base;
 +      iommudata->size = size;
 +
 +      mutex_lock(&iommu_dma_notifier_lock);
 +      list_add(&iommudata->list, &iommu_dma_masters);
 +      mutex_unlock(&iommu_dma_notifier_lock);
 +}
 +
 +static int __iommu_attach_notifier(struct notifier_block *nb,
 +                                 unsigned long action, void *data)
 +{
 +      struct iommu_dma_notifier_data *master, *tmp;
 +
 +      if (action != BUS_NOTIFY_ADD_DEVICE)
 +              return 0;
 +
 +      mutex_lock(&iommu_dma_notifier_lock);
 +      list_for_each_entry_safe(master, tmp, &iommu_dma_masters, list) {
 +              if (do_iommu_attach(master->dev, master->ops,
 +                              master->dma_base, master->size)) {
 +                      list_del(&master->list);
 +                      kfree(master);
 +              }
 +      }
 +      mutex_unlock(&iommu_dma_notifier_lock);
 +      return 0;
 +}
 +
 +static int register_iommu_dma_ops_notifier(struct bus_type *bus)
 +{
 +      struct notifier_block *nb = kzalloc(sizeof(*nb), GFP_KERNEL);
 +      int ret;
 +
 +      if (!nb)
 +              return -ENOMEM;
 +      /*
 +       * The device must be attached to a domain before the driver probe
 +       * routine gets a chance to start allocating DMA buffers. However,
 +       * the IOMMU driver also needs a chance to configure the iommu_group
 +       * via its add_device callback first, so we need to make the attach
 +       * happen between those two points. Since the IOMMU core uses a bus
 +       * notifier with default priority for add_device, do the same but
 +       * with a lower priority to ensure the appropriate ordering.
 +       */
 +      nb->notifier_call = __iommu_attach_notifier;
 +      nb->priority = -100;
 +
 +      ret = bus_register_notifier(bus, nb);
 +      if (ret) {
 +              pr_warn("Failed to register DMA domain notifier; IOMMU DMA ops unavailable on bus '%s'\n",
 +                      bus->name);
 +              kfree(nb);
 +      }
 +      return ret;
 +}
 +
 +static int __init __iommu_dma_init(void)
 +{
 +      int ret;
 +
 +      ret = iommu_dma_init();
 +      if (!ret)
 +              ret = register_iommu_dma_ops_notifier(&platform_bus_type);
 +      if (!ret)
 +              ret = register_iommu_dma_ops_notifier(&amba_bustype);
 +      return ret;
 +}
 +arch_initcall(__iommu_dma_init);
 +
 +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 +                                const struct iommu_ops *ops)
 +{
 +      struct iommu_group *group;
 +
 +      if (!ops)
 +              return;
 +      /*
 +       * TODO: As a concession to the future, we're ready to handle being
 +       * called both early and late (i.e. after bus_add_device). Once all
 +       * the platform bus code is reworked to call us late and the notifier
 +       * junk above goes away, move the body of do_iommu_attach here.
 +       */
 +      group = iommu_group_get(dev);
 +      if (group) {
 +              do_iommu_attach(dev, ops, dma_base, size);
 +              iommu_group_put(group);
 +      } else {
 +              queue_iommu_attach(dev, ops, dma_base, size);
 +      }
 +}
 +
 +void arch_teardown_dma_ops(struct device *dev)
 +{
 +      struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 +
 +      if (domain) {
 +              iommu_detach_device(domain, dev);
 +              if (domain->type & __IOMMU_DOMAIN_FAKE_DEFAULT)
 +                      iommu_domain_free(domain);
 +      }
 +
 +      dev->archdata.dma_ops = NULL;
 +}
 +
 +#else
 +
 +static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 +                                struct iommu_ops *iommu)
 +{ }
 +
 +#endif  /* CONFIG_IOMMU_DMA */
 +
 +void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 +                      struct iommu_ops *iommu, bool coherent)
 +{
 +      if (!acpi_disabled && !dev->archdata.dma_ops)
 +              dev->archdata.dma_ops = dma_ops;
 +
 +      dev->archdata.dma_coherent = coherent;
 +      __iommu_setup_dma_ops(dev, dma_base, size, iommu);
 +}
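
The __GFP_WAIT tests replaced in this file now go through gfpflags_allow_blocking(), which keys off __GFP_DIRECT_RECLAIM in the reworked gfp-flags scheme. The following user-space sketch shows only that decision between the blocking and atomic allocation paths; the flag values are simplified stand-ins, not the real <linux/gfp.h> definitions.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in gfp bits; the real values live in <linux/gfp.h>. */
typedef unsigned int gfp_t;
#define ___GFP_KSWAPD_RECLAIM   0x01u
#define ___GFP_DIRECT_RECLAIM   0x02u
#define GFP_ATOMIC              (___GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL              (___GFP_KSWAPD_RECLAIM | ___GFP_DIRECT_RECLAIM)

/* Mirrors the kernel helper: blocking is allowed iff direct reclaim is. */
static bool gfpflags_allow_blocking(gfp_t gfp)
{
        return gfp & ___GFP_DIRECT_RECLAIM;
}

static void alloc_buffer(size_t size, gfp_t gfp)
{
        if (gfpflags_allow_blocking(gfp))
                printf("%zu bytes: blocking path (remap/CMA/IOMMU allocation)\n", size);
        else
                printf("%zu bytes: atomic path (atomic-pool fallback)\n", size);
}

int main(void)
{
        alloc_buffer(4096, GFP_KERNEL);   /* may block: takes the full path   */
        alloc_buffer(4096, GFP_ATOMIC);   /* must not block: atomic pool path */
        return 0;
}

The predicate is the whole point: GFP_KERNEL-style masks permit direct reclaim and may sleep, so the remapping and iommu_dma_alloc() paths are allowed, while GFP_ATOMIC-style masks must not sleep and fall back to the atomic pool, matching the hunks above.
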
diff --combined arch/mips/mm/tlbex.c
index 32e0be27673fefbeca6839929e61a581c8980902,b190ae9fe909fc4fa8663e250ac14ad13d8b2288..482192cc8f2b88ae89f4cf1495c0dd6055a5bda6
@@@ -240,7 -240,6 +240,6 @@@ static void output_pgtable_bits_defines
        pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
  #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
        pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
-       pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
  #endif
  #ifdef CONFIG_CPU_MIPSR2
        if (cpu_has_rixi) {
@@@ -311,7 -310,6 +310,7 @@@ static struct uasm_label labels[128]
  static struct uasm_reloc relocs[128];
  
  static int check_for_high_segbits;
 +static bool fill_includes_sw_bits;
  
  static unsigned int kscratch_used_mask;
  
@@@ -631,14 -629,8 +630,14 @@@ static void build_tlb_write_entry(u32 *
  static __maybe_unused void build_convert_pte_to_entrylo(u32 **p,
                                                        unsigned int reg)
  {
 -      if (cpu_has_rixi) {
 -              UASM_i_ROTR(p, reg, reg, ilog2(_PAGE_GLOBAL));
 +      if (cpu_has_rixi && _PAGE_NO_EXEC) {
 +              if (fill_includes_sw_bits) {
 +                      UASM_i_ROTR(p, reg, reg, ilog2(_PAGE_GLOBAL));
 +              } else {
 +                      UASM_i_SRL(p, reg, reg, ilog2(_PAGE_NO_EXEC));
 +                      UASM_i_ROTR(p, reg, reg,
 +                                  ilog2(_PAGE_GLOBAL) - ilog2(_PAGE_NO_EXEC));
 +              }
        } else {
  #ifdef CONFIG_PHYS_ADDR_T_64BIT
                uasm_i_dsrl_safe(p, reg, reg, ilog2(_PAGE_GLOBAL));
@@@ -1012,7 -1004,21 +1011,7 @@@ static void build_update_entries(u32 **
         * 64bit address support (36bit on a 32bit CPU) in a 32bit
         * Kernel is a special case. Only a few CPUs use it.
         */
 -#ifdef CONFIG_PHYS_ADDR_T_64BIT
 -      if (cpu_has_64bits) {
 -              uasm_i_ld(p, tmp, 0, ptep); /* get even pte */
 -              uasm_i_ld(p, ptep, sizeof(pte_t), ptep); /* get odd pte */
 -              if (cpu_has_rixi) {
 -                      UASM_i_ROTR(p, tmp, tmp, ilog2(_PAGE_GLOBAL));
 -                      UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 -                      UASM_i_ROTR(p, ptep, ptep, ilog2(_PAGE_GLOBAL));
 -              } else {
 -                      uasm_i_dsrl_safe(p, tmp, tmp, ilog2(_PAGE_GLOBAL)); /* convert to entrylo0 */
 -                      UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 -                      uasm_i_dsrl_safe(p, ptep, ptep, ilog2(_PAGE_GLOBAL)); /* convert to entrylo1 */
 -              }
 -              UASM_i_MTC0(p, ptep, C0_ENTRYLO1); /* load it */
 -      } else {
 +      if (config_enabled(CONFIG_PHYS_ADDR_T_64BIT) && !cpu_has_64bits) {
                int pte_off_even = sizeof(pte_t) / 2;
                int pte_off_odd = pte_off_even + sizeof(pte_t);
  #ifdef CONFIG_XPA
                uasm_i_mthc0(p, tmp, C0_ENTRYLO0);
                uasm_i_mthc0(p, ptep, C0_ENTRYLO1);
  #endif
 +              return;
        }
 -#else
 +
        UASM_i_LW(p, tmp, 0, ptep); /* get even pte */
        UASM_i_LW(p, ptep, sizeof(pte_t), ptep); /* get odd pte */
        if (r45k_bvahwbug())
                build_tlb_probe_entry(p);
 -      if (cpu_has_rixi) {
 -              UASM_i_ROTR(p, tmp, tmp, ilog2(_PAGE_GLOBAL));
 -              if (r4k_250MHZhwbug())
 -                      UASM_i_MTC0(p, 0, C0_ENTRYLO0);
 -              UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 -              UASM_i_ROTR(p, ptep, ptep, ilog2(_PAGE_GLOBAL));
 -      } else {
 -              UASM_i_SRL(p, tmp, tmp, ilog2(_PAGE_GLOBAL)); /* convert to entrylo0 */
 -              if (r4k_250MHZhwbug())
 -                      UASM_i_MTC0(p, 0, C0_ENTRYLO0);
 -              UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 -              UASM_i_SRL(p, ptep, ptep, ilog2(_PAGE_GLOBAL)); /* convert to entrylo1 */
 -              if (r45k_bvahwbug())
 -                      uasm_i_mfc0(p, tmp, C0_INDEX);
 -      }
 +      build_convert_pte_to_entrylo(p, tmp);
 +      if (r4k_250MHZhwbug())
 +              UASM_i_MTC0(p, 0, C0_ENTRYLO0);
 +      UASM_i_MTC0(p, tmp, C0_ENTRYLO0); /* load it */
 +      build_convert_pte_to_entrylo(p, ptep);
 +      if (r45k_bvahwbug())
 +              uasm_i_mfc0(p, tmp, C0_INDEX);
        if (r4k_250MHZhwbug())
                UASM_i_MTC0(p, 0, C0_ENTRYLO1);
        UASM_i_MTC0(p, ptep, C0_ENTRYLO1); /* load it */
 -#endif
  }
  
  struct mips_huge_tlb_info {
@@@ -2284,10 -2298,6 +2283,10 @@@ static void config_htw_params(void
        /* re-initialize the PTI field including the even/odd bit */
        pwfield &= ~MIPS_PWFIELD_PTI_MASK;
        pwfield |= PAGE_SHIFT << MIPS_PWFIELD_PTI_SHIFT;
 +      if (CONFIG_PGTABLE_LEVELS >= 3) {
 +              pwfield &= ~MIPS_PWFIELD_MDI_MASK;
 +              pwfield |= PMD_SHIFT << MIPS_PWFIELD_MDI_SHIFT;
 +      }
        /* Set the PTEI right shift */
        ptei = _PAGE_GLOBAL_SHIFT << MIPS_PWFIELD_PTEI_SHIFT;
        pwfield |= ptei;
  
        pwsize = ilog2(PTRS_PER_PGD) << MIPS_PWSIZE_GDW_SHIFT;
        pwsize |= ilog2(PTRS_PER_PTE) << MIPS_PWSIZE_PTW_SHIFT;
 +      if (CONFIG_PGTABLE_LEVELS >= 3)
 +              pwsize |= ilog2(PTRS_PER_PMD) << MIPS_PWSIZE_MDW_SHIFT;
  
        /* If XPA has been enabled, PTEs are 64-bit in size. */
 -      if (read_c0_pagegrain() & PG_ELPA)
 +      if (config_enabled(CONFIG_64BIT) || (read_c0_pagegrain() & PG_ELPA))
                pwsize |= 1;
  
        write_c0_pwsize(pwsize);
@@@ -2351,41 -2359,6 +2350,41 @@@ static void config_xpa_params(void
  #endif
  }
  
 +static void check_pabits(void)
 +{
 +      unsigned long entry;
 +      unsigned pabits, fillbits;
 +
 +      if (!cpu_has_rixi || !_PAGE_NO_EXEC) {
 +              /*
 +               * We'll only be making use of the fact that we can rotate bits
 +               * into the fill if the CPU supports RIXI, so don't bother
 +               * probing this for CPUs which don't.
 +               */
 +              return;
 +      }
 +
 +      write_c0_entrylo0(~0ul);
 +      back_to_back_c0_hazard();
 +      entry = read_c0_entrylo0();
 +
 +      /* clear all non-PFN bits */
 +      entry &= ~((1 << MIPS_ENTRYLO_PFN_SHIFT) - 1);
 +      entry &= ~(MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI);
 +
 +      /* find a lower bound on PABITS, and upper bound on fill bits */
 +      pabits = fls_long(entry) + 6;
 +      fillbits = max_t(int, (int)BITS_PER_LONG - pabits, 0);
 +
 +      /* minus the RI & XI bits */
 +      fillbits -= min_t(unsigned, fillbits, 2);
 +
 +      if (fillbits >= ilog2(_PAGE_NO_EXEC))
 +              fill_includes_sw_bits = true;
 +
 +      pr_debug("Entry* registers contain %u fill bits\n", fillbits);
 +}
 +
  void build_tlb_refill_handler(void)
  {
        /*
        static int run_once = 0;
  
        output_pgtable_bits_defines();
 +      check_pabits();
  
  #ifdef CONFIG_64BIT
        check_for_high_segbits = current_cpu_data.vmbits > (PGDIR_SHIFT + PGD_ORDER + PAGE_SHIFT - 3);
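
check_pabits() in the hunk above probes the width of the EntryLo fill field by writing all-ones to EntryLo0 and reading it back, then decides whether software-only PTE bits can simply be rotated into the fill. The arithmetic is easy to misread, so here is a user-space sketch of just that computation; the read-back value and the MIPS_ENTRYLO_*/_PAGE_NO_EXEC constants are hypothetical stand-ins for the kernel definitions.

#include <stdio.h>

#define BITS_PER_LONG           64
#define MIPS_ENTRYLO_PFN_SHIFT  6                       /* stand-in constant */
#define MIPS_ENTRYLO_RI         (1ul << (BITS_PER_LONG - 1))
#define MIPS_ENTRYLO_XI         (1ul << (BITS_PER_LONG - 2))
#define _PAGE_NO_EXEC_ILOG2     27                      /* hypothetical ilog2(_PAGE_NO_EXEC) */

static unsigned int fls_long(unsigned long x)
{
        return x ? BITS_PER_LONG - __builtin_clzl(x) : 0;
}

int main(void)
{
        /* Pretend this is what EntryLo0 reads back after writing ~0ul. */
        unsigned long entry = 0x3fffffc0ul;
        unsigned int pabits, fillbits;

        /* clear all non-PFN bits */
        entry &= ~((1ul << MIPS_ENTRYLO_PFN_SHIFT) - 1);
        entry &= ~(MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI);

        /* lower bound on PABITS, upper bound on fill bits */
        pabits = fls_long(entry) + 6;
        fillbits = pabits < BITS_PER_LONG ? BITS_PER_LONG - pabits : 0;

        /* the RI and XI positions are not usable as fill */
        fillbits -= fillbits < 2 ? fillbits : 2;

        printf("pabits >= %u, usable fill bits = %u\n", pabits, fillbits);
        printf("fill includes software bits: %s\n",
               fillbits >= _PAGE_NO_EXEC_ILOG2 ? "yes" : "no");
        return 0;
}

With the example read-back value the CPU exposes at least 36 physical-address bits, leaving 26 usable fill bits once the RI and XI positions are excluded.
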
index 3245f2d96d4f59e5140348b8c4dddbe836c5dda6,3c3a45632a953d30b21aa9c703819605b4b5143f..21d961bbac0e1e284b6f9cdb76cb472c8be48227
@@@ -373,11 -373,6 +373,6 @@@ void pgtable_cache_add(unsigned shift, 
  void pgtable_cache_init(void);
  #endif /* __ASSEMBLY__ */
  
- /*
-  * THP pages can't be special. So use the _PAGE_SPECIAL
-  */
- #define _PAGE_SPLITTING _PAGE_SPECIAL
  /*
   * We need to differentiate between explicit huge page and THP huge
   * page, since THP huge page also need to track real subpage details
  /*
   * set of bits not changed in pmd_modify.
   */
- #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS |             \
-                        _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \
-                        _PAGE_THP_HUGE)
+ #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
+                        _PAGE_ACCESSED | _PAGE_THP_HUGE)
  
  #ifndef __ASSEMBLY__
  /*
@@@ -437,9 -431,9 +431,9 @@@ static inline char *get_hpte_slot_array
  
  }
  
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
                                   pmd_t *pmdp, unsigned long old_pmd);
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
  extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
  extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
@@@ -471,22 -465,7 +465,15 @@@ static inline int pmd_trans_huge(pmd_t 
        return (pmd_val(pmd) & 0x3) && (pmd_val(pmd) & _PAGE_THP_HUGE);
  }
  
- static inline int pmd_trans_splitting(pmd_t pmd)
- {
-       if (pmd_trans_huge(pmd))
-               return pmd_val(pmd) & _PAGE_SPLITTING;
-       return 0;
- }
  extern int has_transparent_hugepage(void);
 +#else
 +static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
 +                                        unsigned long addr, pmd_t *pmdp,
 +                                        unsigned long old_pmd)
 +{
 +      WARN(1, "%s called with THP disabled\n", __func__);
 +}
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
  static inline int pmd_large(pmd_t pmd)
@@@ -515,9 -494,11 +502,11 @@@ static inline pte_t *pmdp_ptep(pmd_t *p
  #define pmd_pfn(pmd)          pte_pfn(pmd_pte(pmd))
  #define pmd_dirty(pmd)                pte_dirty(pmd_pte(pmd))
  #define pmd_young(pmd)                pte_young(pmd_pte(pmd))
  #define pmd_mkold(pmd)                pte_pmd(pte_mkold(pmd_pte(pmd)))
  #define pmd_wrprotect(pmd)    pte_pmd(pte_wrprotect(pmd_pte(pmd)))
  #define pmd_mkdirty(pmd)      pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+ #define pmd_mkclean(pmd)      pte_pmd(pte_mkclean(pmd_pte(pmd)))
  #define pmd_mkyoung(pmd)      pte_pmd(pte_mkyoung(pmd_pte(pmd)))
  #define pmd_mkwrite(pmd)      pte_pmd(pte_mkwrite(pmd_pte(pmd)))
  
@@@ -536,12 -517,6 +525,6 @@@ static inline pmd_t pmd_mknotpresent(pm
        return pmd;
  }
  
- static inline pmd_t pmd_mksplitting(pmd_t pmd)
- {
-       pmd_val(pmd) |= _PAGE_SPLITTING;
-       return pmd;
- }
  #define __HAVE_ARCH_PMD_SAME
  static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
  {
@@@ -592,10 -567,6 +575,6 @@@ static inline void pmdp_set_wrprotect(s
        pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
  }
  
- #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
  extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
  #define pmdp_collapse_flush pmdp_collapse_flush
index 9833fee493ec414be50c241153889d7ac4259402,8e01e4121fac870d2a3ee03acfa4b8dcfebe647f..cd2d82efe1cd15b1fe003b2eceee8bee33e0a072
@@@ -89,25 -89,6 +89,25 @@@ int pgd_huge(pgd_t pgd
         */
        return ((pgd_val(pgd) & 0x3) != 0x0);
  }
 +
 +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM)
 +/*
 + * This enables us to catch the wrong page directory format
 + * Moved here so that we can use WARN() in the call.
 + */
 +int hugepd_ok(hugepd_t hpd)
 +{
 +      bool is_hugepd;
 +
 +      /*
 +       * We should not find this format in page directory, warn otherwise.
 +       */
 +      is_hugepd = (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
 +      WARN(is_hugepd, "Found wrong page directory format\n");
 +      return 0;
 +}
 +#endif
 +
  #else
  int pmd_huge(pmd_t pmd)
  {
@@@ -128,7 -109,7 +128,7 @@@ int pgd_huge(pgd_t pgd
  pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
  {
        /* Only called for hugetlbfs pages, hence can ignore THP */
 -      return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
 +      return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
  }
  
  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@@ -703,14 -684,13 +703,14 @@@ void hugetlb_free_pgd_range(struct mmu_
  struct page *
  follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
  {
 +      bool is_thp;
        pte_t *ptep, pte;
        unsigned shift;
        unsigned long mask, flags;
        struct page *page = ERR_PTR(-EINVAL);
  
        local_irq_save(flags);
 -      ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 +      ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
        if (!ptep)
                goto no_page;
        pte = READ_ONCE(*ptep);
         * Transparent hugepages are handled by generic code. We can skip them
         * here.
         */
 -      if (!shift || pmd_trans_huge(__pmd(pte_val(pte))))
 +      if (!shift || is_thp)
                goto no_page;
  
        if (!pte_present(pte)) {
@@@ -976,7 -956,7 +976,7 @@@ void flush_dcache_icache_hugepage(struc
   */
  
  pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 -                                 unsigned *shift)
 +                                 bool *is_thp, unsigned *shift)
  {
        pgd_t pgd, *pgdp;
        pud_t pud, *pudp;
        if (shift)
                *shift = 0;
  
 +      if (is_thp)
 +              *is_thp = false;
 +
        pgdp = pgdir + pgd_index(ea);
        pgd  = READ_ONCE(*pgdp);
        /*
                        /*
                         * A hugepage collapse is captured by pmd_none, because
                         * it mark the pmd none and do a hpte invalidate.
-                        *
-                        * We don't worry about pmd_trans_splitting here, The
-                        * caller if it needs to handle the splitting case
-                        * should check for that.
                         */
                        if (pmd_none(pmd))
                                return NULL;
  
 -                      if (pmd_huge(pmd) || pmd_large(pmd)) {
 +                      if (pmd_trans_huge(pmd)) {
 +                              if (is_thp)
 +                                      *is_thp = true;
 +                              ret_pte = (pte_t *) pmdp;
 +                              goto out;
 +                      }
 +
 +                      if (pmd_huge(pmd)) {
                                ret_pte = (pte_t *) pmdp;
                                goto out;
                        } else if (is_hugepd(__hugepd(pmd_val(pmd))))
@@@ -1071,7 -1037,7 +1067,7 @@@ int gup_hugepte(pte_t *ptep, unsigned l
  {
        unsigned long mask;
        unsigned long pte_end;
-       struct page *head, *page, *tail;
+       struct page *head, *page;
        pte_t pte;
        int refs;
  
        head = pte_page(pte);
  
        page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
-       tail = page;
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                return 0;
        }
  
-       /*
-        * Any tail page need their mapcount reference taken before we
-        * return.
-        */
-       while (refs--) {
-               if (PageTail(tail))
-                       get_huge_page_tail(tail);
-               tail++;
-       }
        return 1;
  }
diff --combined arch/powerpc/mm/numa.c
index b85d44271c3b9a1591e1fc5c3cd75db9501a5133,8d8a541211d0dc2164d9f48437a4e21fd3fe13e7..669a15e7fa76a07ad57c3d2b82712766520a049a
@@@ -80,7 -80,7 +80,7 @@@ static void __init setup_node_to_cpumas
                setup_nr_node_ids();
  
        /* allocate the map */
-       for (node = 0; node < nr_node_ids; node++)
+       for_each_node(node)
                alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
  
        /* cpumask_of_node() will now work */
@@@ -276,6 -276,7 +276,6 @@@ static int of_node_to_nid_single(struc
  /* Walk the device tree upwards, looking for an associativity id */
  int of_node_to_nid(struct device_node *device)
  {
 -      struct device_node *tmp;
        int nid = -1;
  
        of_node_get(device);
                if (nid != -1)
                        break;
  
 -              tmp = device;
 -              device = of_get_parent(tmp);
 -              of_node_put(tmp);
 +              device = of_get_next_parent(device);
        }
        of_node_put(device);
  
index 1c65ef92768dbb553563506373094338ad92e7f9,13b9bcf5485e5f70f340f98c72a51f6e2af4b62b..610f472f91d14c25cef49a118ecc7f1d9eaf73b1
@@@ -179,19 -179,6 +179,19 @@@ static int setup_one_atmu(struct ccsr_p
        return i;
  }
  
 +static bool is_kdump(void)
 +{
 +      struct device_node *node;
 +
 +      node = of_find_node_by_type(NULL, "memory");
 +      if (!node) {
 +              WARN_ON_ONCE(1);
 +              return false;
 +      }
 +
 +      return of_property_read_bool(node, "linux,usable-memory");
 +}
 +
  /* atmu setup for fsl pci/pcie controller */
  static void setup_pci_atmu(struct pci_controller *hose)
  {
        const char *name = hose->dn->full_name;
        const u64 *reg;
        int len;
 +      bool setup_inbound;
 +
 +      /*
 +       * If this is kdump, we don't want to trigger a bunch of PCI
 +       * errors by closing the window on in-flight DMA.
 +       *
 +       * We still run most of the function's logic so that things like
 +       * hose->dma_window_size still get set.
 +       */
 +      setup_inbound = !is_kdump();
  
        if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
                if (in_be32(&pci->block_rev1) >= PCIE_IP_REV_2_2) {
        /* Disable all windows (except powar0 since it's ignored) */
        for(i = 1; i < 5; i++)
                out_be32(&pci->pow[i].powar, 0);
 -      for (i = start_idx; i < end_idx; i++)
 -              out_be32(&pci->piw[i].piwar, 0);
 +
 +      if (setup_inbound) {
 +              for (i = start_idx; i < end_idx; i++)
 +                      out_be32(&pci->piw[i].piwar, 0);
 +      }
  
        /* Setup outbound MEM window */
        for(i = 0, j = 1; i < 3; i++) {
  
        /* Setup inbound mem window */
        mem = memblock_end_of_DRAM();
 +      pr_info("%s: end of DRAM %llx\n", __func__, mem);
  
        /*
         * The msi-address-64 property, if it exists, indicates the physical
  
                piwar |= ((mem_log - 1) & PIWAR_SZ_MASK);
  
 -              /* Setup inbound memory window */
 -              out_be32(&pci->piw[win_idx].pitar,  0x00000000);
 -              out_be32(&pci->piw[win_idx].piwbar, 0x00000000);
 -              out_be32(&pci->piw[win_idx].piwar,  piwar);
 -              win_idx--;
 +              if (setup_inbound) {
 +                      /* Setup inbound memory window */
 +                      out_be32(&pci->piw[win_idx].pitar,  0x00000000);
 +                      out_be32(&pci->piw[win_idx].piwbar, 0x00000000);
 +                      out_be32(&pci->piw[win_idx].piwar,  piwar);
 +              }
  
 +              win_idx--;
                hose->dma_window_base_cur = 0x00000000;
                hose->dma_window_size = (resource_size_t)sz;
  
  
                        piwar = (piwar & ~PIWAR_SZ_MASK) | (mem_log - 1);
  
 -                      /* Setup inbound memory window */
 -                      out_be32(&pci->piw[win_idx].pitar,  0x00000000);
 -                      out_be32(&pci->piw[win_idx].piwbear,
 -                                      pci64_dma_offset >> 44);
 -                      out_be32(&pci->piw[win_idx].piwbar,
 -                                      pci64_dma_offset >> 12);
 -                      out_be32(&pci->piw[win_idx].piwar,  piwar);
 +                      if (setup_inbound) {
 +                              /* Setup inbound memory window */
 +                              out_be32(&pci->piw[win_idx].pitar,  0x00000000);
 +                              out_be32(&pci->piw[win_idx].piwbear,
 +                                              pci64_dma_offset >> 44);
 +                              out_be32(&pci->piw[win_idx].piwbar,
 +                                              pci64_dma_offset >> 12);
 +                              out_be32(&pci->piw[win_idx].piwar,  piwar);
 +                      }
  
                        /*
                         * install our own dma_set_mask handler to fixup dma_ops
        } else {
                u64 paddr = 0;
  
 -              /* Setup inbound memory window */
 -              out_be32(&pci->piw[win_idx].pitar,  paddr >> 12);
 -              out_be32(&pci->piw[win_idx].piwbar, paddr >> 12);
 -              out_be32(&pci->piw[win_idx].piwar,  (piwar | (mem_log - 1)));
 -              win_idx--;
 +              if (setup_inbound) {
 +                      /* Setup inbound memory window */
 +                      out_be32(&pci->piw[win_idx].pitar,  paddr >> 12);
 +                      out_be32(&pci->piw[win_idx].piwbar, paddr >> 12);
 +                      out_be32(&pci->piw[win_idx].piwar,
 +                               (piwar | (mem_log - 1)));
 +              }
  
 +              win_idx--;
                paddr += 1ull << mem_log;
                sz -= 1ull << mem_log;
  
                        mem_log = ilog2(sz);
                        piwar |= (mem_log - 1);
  
 -                      out_be32(&pci->piw[win_idx].pitar,  paddr >> 12);
 -                      out_be32(&pci->piw[win_idx].piwbar, paddr >> 12);
 -                      out_be32(&pci->piw[win_idx].piwar,  piwar);
 -                      win_idx--;
 +                      if (setup_inbound) {
 +                              out_be32(&pci->piw[win_idx].pitar,
 +                                       paddr >> 12);
 +                              out_be32(&pci->piw[win_idx].piwbar,
 +                                       paddr >> 12);
 +                              out_be32(&pci->piw[win_idx].piwar, piwar);
 +                      }
  
 +                      win_idx--;
                        paddr += 1ull << mem_log;
                }
  
@@@ -1037,10 -999,10 +1037,10 @@@ int fsl_pci_mcheck_exception(struct pt_
                        ret = get_user(regs->nip, &inst);
                        pagefault_enable();
                } else {
-                       ret = probe_kernel_address(regs->nip, inst);
+                       ret = probe_kernel_address((void *)regs->nip, inst);
                }
  
 -              if (mcheck_handle_load(regs, inst)) {
 +              if (!ret && mcheck_handle_load(regs, inst)) {
                        regs->nip += 4;
                        return 1;
                }
index 024f85f947aec50ea93c881e56a73ba3a5591d3c,5690abafe13ea240109441cd83e309b828a48796..64ead80912488b476e19a004eaf01924dbdc6b4c
@@@ -193,15 -193,9 +193,15 @@@ static inline int is_module_addr(void *
  #define _PAGE_UNUSED  0x080           /* SW bit for pgste usage state */
  #define __HAVE_ARCH_PTE_SPECIAL
  
 +#ifdef CONFIG_MEM_SOFT_DIRTY
 +#define _PAGE_SOFT_DIRTY 0x002                /* SW pte soft dirty bit */
 +#else
 +#define _PAGE_SOFT_DIRTY 0x000
 +#endif
 +
  /* Set of bits not changed in pte_modify */
  #define _PAGE_CHG_MASK                (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \
 -                               _PAGE_YOUNG)
 +                               _PAGE_YOUNG | _PAGE_SOFT_DIRTY)
  
  /*
   * handle_pte_fault uses pte_present and pte_none to find out the pte type
  
  #define _SEGMENT_ENTRY_DIRTY  0x2000  /* SW segment dirty bit */
  #define _SEGMENT_ENTRY_YOUNG  0x1000  /* SW segment young bit */
- #define _SEGMENT_ENTRY_SPLIT  0x0800  /* THP splitting bit */
  #define _SEGMENT_ENTRY_LARGE  0x0400  /* STE-format control, large page */
  #define _SEGMENT_ENTRY_READ   0x0002  /* SW segment read bit */
  #define _SEGMENT_ENTRY_WRITE  0x0001  /* SW segment write bit */
  
 +#ifdef CONFIG_MEM_SOFT_DIRTY
 +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */
 +#else
 +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */
 +#endif
 +
  /*
   * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
   *                            dy..R...I...wr
   * SW-bits: y young, d dirty, r read, w write
   */
  
- #define _SEGMENT_ENTRY_SPLIT_BIT 11   /* THP splitting bit number */
  /* Page status table bits for virtualization */
  #define PGSTE_ACC_BITS        0xf000000000000000UL
  #define PGSTE_FP_BIT  0x0800000000000000UL
@@@ -523,10 -508,6 +520,6 @@@ static inline int pmd_bad(pmd_t pmd
        return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
  }
  
- #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long addr, pmd_t *pmdp);
  #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
  extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
@@@ -601,43 -582,6 +594,43 @@@ static inline int pmd_protnone(pmd_t pm
  }
  #endif
  
 +static inline int pte_soft_dirty(pte_t pte)
 +{
 +      return pte_val(pte) & _PAGE_SOFT_DIRTY;
 +}
 +#define pte_swp_soft_dirty pte_soft_dirty
 +
 +static inline pte_t pte_mksoft_dirty(pte_t pte)
 +{
 +      pte_val(pte) |= _PAGE_SOFT_DIRTY;
 +      return pte;
 +}
 +#define pte_swp_mksoft_dirty pte_mksoft_dirty
 +
 +static inline pte_t pte_clear_soft_dirty(pte_t pte)
 +{
 +      pte_val(pte) &= ~_PAGE_SOFT_DIRTY;
 +      return pte;
 +}
 +#define pte_swp_clear_soft_dirty pte_clear_soft_dirty
 +
 +static inline int pmd_soft_dirty(pmd_t pmd)
 +{
 +      return pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY;
 +}
 +
 +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
 +{
 +      pmd_val(pmd) |= _SEGMENT_ENTRY_SOFT_DIRTY;
 +      return pmd;
 +}
 +
 +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 +{
 +      pmd_val(pmd) &= ~_SEGMENT_ENTRY_SOFT_DIRTY;
 +      return pmd;
 +}
 +
  static inline pgste_t pgste_get_lock(pte_t *ptep)
  {
        unsigned long new = 0;
@@@ -938,7 -882,7 +931,7 @@@ static inline pte_t pte_mkclean(pte_t p
  
  static inline pte_t pte_mkdirty(pte_t pte)
  {
 -      pte_val(pte) |= _PAGE_DIRTY;
 +      pte_val(pte) |= _PAGE_DIRTY | _PAGE_SOFT_DIRTY;
        if (pte_val(pte) & _PAGE_WRITE)
                pte_val(pte) &= ~_PAGE_PROTECT;
        return pte;
@@@ -1267,10 -1211,8 +1260,10 @@@ static inline int ptep_set_access_flags
                                        pte_t entry, int dirty)
  {
        pgste_t pgste;
 +      pte_t oldpte;
  
 -      if (pte_same(*ptep, entry))
 +      oldpte = *ptep;
 +      if (pte_same(oldpte, entry))
                return 0;
        if (mm_has_pgste(vma->vm_mm)) {
                pgste = pgste_get_lock(ptep);
        ptep_flush_direct(vma->vm_mm, address, ptep);
  
        if (mm_has_pgste(vma->vm_mm)) {
 -              pgste_set_key(ptep, pgste, entry, vma->vm_mm);
 +              if (pte_val(oldpte) & _PAGE_INVALID)
 +                      pgste_set_key(ptep, pgste, entry, vma->vm_mm);
                pgste = pgste_set_pte(ptep, pgste, entry);
                pgste_set_unlock(ptep, pgste);
        } else
@@@ -1392,8 -1333,7 +1385,8 @@@ static inline pmd_t pmd_mkclean(pmd_t p
  static inline pmd_t pmd_mkdirty(pmd_t pmd)
  {
        if (pmd_large(pmd)) {
 -              pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY;
 +              pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY |
 +                              _SEGMENT_ENTRY_SOFT_DIRTY;
                if (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE)
                        pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
        }
@@@ -1424,8 -1364,7 +1417,7 @@@ static inline pmd_t pmd_modify(pmd_t pm
        if (pmd_large(pmd)) {
                pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
                        _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
-                       _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT |
-                       _SEGMENT_ENTRY_SOFT_DIRTY;
 -                      _SEGMENT_ENTRY_LARGE;
++                      _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY;
                pmd_val(pmd) |= massage_pgprot_pmd(newprot);
                if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
                        pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
@@@ -1533,12 -1472,6 +1525,6 @@@ extern void pgtable_trans_huge_deposit(
  #define __HAVE_ARCH_PGTABLE_WITHDRAW
  extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
  
- static inline int pmd_trans_splitting(pmd_t pmd)
- {
-       return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) &&
-               (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT);
- }
  static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t entry)
  {
diff --combined arch/x86/Kconfig
index c22df590e7e7463c71ac80a3f7e795656f1d33c2,9e079f013c07e08d495d3ffff09cf68575b3b263..0f1ccc3b3d2b06729230b9ce4c44252bc283980b
@@@ -636,7 -636,7 +636,7 @@@ config X86_32_IRI
  
  config SCHED_OMIT_FRAME_POINTER
        def_bool y
 -      prompt "Single-depth WCHAN output"
 +      prompt "Single-depth WCHAN output" if !LTO && !FRAME_POINTER
        depends on X86
        ---help---
          Calculate simpler /proc/<PID>/wchan values. If this option
@@@ -1123,10 -1123,8 +1123,10 @@@ config X86_REBOOTFIXUP
          Say N otherwise.
  
  config MICROCODE
 -      tristate "CPU microcode loading support"
 +      bool "CPU microcode loading support"
 +      default y
        depends on CPU_SUP_AMD || CPU_SUP_INTEL
 +      depends on BLK_DEV_INITRD
        select FW_LOADER
        ---help---
  
@@@ -1168,6 -1166,24 +1168,6 @@@ config MICROCODE_OLD_INTERFAC
        def_bool y
        depends on MICROCODE
  
 -config MICROCODE_INTEL_EARLY
 -      bool
 -
 -config MICROCODE_AMD_EARLY
 -      bool
 -
 -config MICROCODE_EARLY
 -      bool "Early load microcode"
 -      depends on MICROCODE=y && BLK_DEV_INITRD
 -      select MICROCODE_INTEL_EARLY if MICROCODE_INTEL
 -      select MICROCODE_AMD_EARLY if MICROCODE_AMD
 -      default y
 -      help
 -        This option provides functionality to read additional microcode data
 -        at the beginning of initrd image. The data tells kernel to load
 -        microcode to CPU's as early as possible. No functional change if no
 -        microcode data is glued to the initrd, therefore it's safe to say Y.
 -
  config X86_MSR
        tristate "/dev/cpu/*/msr - Model-specific register support"
        ---help---
@@@ -2027,55 -2043,6 +2027,55 @@@ config COMPAT_VDS
          If unsure, say N: if you are compiling your own kernel, you
          are unlikely to be using a buggy version of glibc.
  
 +choice
 +      prompt "vsyscall table for legacy applications"
 +      depends on X86_64
 +      default LEGACY_VSYSCALL_EMULATE
 +      help
 +        Legacy user code that does not know how to find the vDSO expects
 +        to be able to issue three syscalls by calling fixed addresses in
 +        kernel space. Since this location is not randomized with ASLR,
 +        it can be used to assist security vulnerability exploitation.
 +
 +        This setting can be changed at boot time via the kernel command
 +        line parameter vsyscall=[native|emulate|none].
 +
 +        On a system with recent enough glibc (2.14 or newer) and no
 +        static binaries, you can say None without a performance penalty
 +        to improve security.
 +
 +        If unsure, select "Emulate".
 +
 +      config LEGACY_VSYSCALL_NATIVE
 +              bool "Native"
 +              help
 +                Actual executable code is located in the fixed vsyscall
 +                address mapping, implementing time() efficiently. Since
 +                this makes the mapping executable, it can be used during
 +                security vulnerability exploitation (traditionally as
 +                ROP gadgets). This configuration is not recommended.
 +
 +      config LEGACY_VSYSCALL_EMULATE
 +              bool "Emulate"
 +              help
 +                The kernel traps and emulates calls into the fixed
 +                vsyscall address mapping. This makes the mapping
 +                non-executable, but it still contains known contents,
 +                which could be used in certain rare security vulnerability
 +                exploits. This configuration is recommended when userspace
 +                still uses the vsyscall area.
 +
 +      config LEGACY_VSYSCALL_NONE
 +              bool "None"
 +              help
 +                There will be no vsyscall mapping at all. This will
 +                eliminate any risk of ASLR bypass due to the vsyscall
 +                fixed address mapping. Attempts to use the vsyscalls
 +                will be reported to dmesg, so that either old or
 +                malicious userspace programs can be identified.
 +
 +endchoice
 +
  config CMDLINE_BOOL
        bool "Built-in kernel command line"
        ---help---
@@@ -2151,6 -2118,9 +2151,9 @@@ config USE_PERCPU_NUMA_NODE_I
        def_bool y
        depends on NUMA
  
+ config HAVE_MEMORYLESS_NODES
+       def_bool NUMA
+
  config ARCH_ENABLE_SPLIT_PMD_PTLOCK
        def_bool y
        depends on X86_64 || X86_PAE
index caa2c712d1e70c5895d92cf856b91a02d9123083,143ef9f37932691c7af983924a7213e9a7824301..f17705e1332cc3b81dc9a3a7551ece5d1848d5db
@@@ -8,7 -8,7 +8,7 @@@
  #
  0     i386    restart_syscall         sys_restart_syscall
  1     i386    exit                    sys_exit
 -2     i386    fork                    sys_fork                        stub32_fork
 +2     i386    fork                    sys_fork                        sys_fork
  3     i386    read                    sys_read
  4     i386    write                   sys_write
  5     i386    open                    sys_open                        compat_sys_open
@@@ -17,7 -17,7 +17,7 @@@
  8     i386    creat                   sys_creat
  9     i386    link                    sys_link
  10    i386    unlink                  sys_unlink
 -11    i386    execve                  sys_execve                      stub32_execve
 +11    i386    execve                  sys_execve                      compat_sys_execve
  12    i386    chdir                   sys_chdir
  13    i386    time                    sys_time                        compat_sys_time
  14    i386    mknod                   sys_mknod
  116   i386    sysinfo                 sys_sysinfo                     compat_sys_sysinfo
  117   i386    ipc                     sys_ipc                         compat_sys_ipc
  118   i386    fsync                   sys_fsync
 -119   i386    sigreturn               sys_sigreturn                   stub32_sigreturn
 +119   i386    sigreturn               sys_sigreturn                   sys32_sigreturn
  120   i386    clone                   sys_clone                       stub32_clone
  121   i386    setdomainname           sys_setdomainname
  122   i386    uname                   sys_newuname
  170   i386    setresgid               sys_setresgid16
  171   i386    getresgid               sys_getresgid16
  172   i386    prctl                   sys_prctl
 -173   i386    rt_sigreturn            sys_rt_sigreturn                stub32_rt_sigreturn
 +173   i386    rt_sigreturn            sys_rt_sigreturn                sys32_rt_sigreturn
  174   i386    rt_sigaction            sys_rt_sigaction                compat_sys_rt_sigaction
  175   i386    rt_sigprocmask          sys_rt_sigprocmask
  176   i386    rt_sigpending           sys_rt_sigpending               compat_sys_rt_sigpending
  187   i386    sendfile                sys_sendfile                    compat_sys_sendfile
  188   i386    getpmsg
  189   i386    putpmsg
 -190   i386    vfork                   sys_vfork                       stub32_vfork
 +190   i386    vfork                   sys_vfork                       sys_vfork
  191   i386    ugetrlimit              sys_getrlimit                   compat_sys_getrlimit
  192   i386    mmap2                   sys_mmap_pgoff
  193   i386    truncate64              sys_truncate64                  sys32_truncate64
  355   i386    getrandom               sys_getrandom
  356   i386    memfd_create            sys_memfd_create
  357   i386    bpf                     sys_bpf
 -358   i386    execveat                sys_execveat                    stub32_execveat
 +358   i386    execveat                sys_execveat                    compat_sys_execveat
  359   i386    socket                  sys_socket
  360   i386    socketpair              sys_socketpair
  361   i386    bind                    sys_bind
  373   i386    shutdown                sys_shutdown
  374   i386    userfaultfd             sys_userfaultfd
  375   i386    membarrier              sys_membarrier
+ 376   i386    mlock2                  sys_mlock2
index 6ec0c8b2e9df5b1d4c7702fd7f1d96c2c24db5d4,a8fdfe0d72775fa14d59189b5b3b48308e93ad9a..9ff592003afda8b9d1d2bf1d3353ae8f04625d60
  #include <asm/x86_init.h>
  
  void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 +void ptdump_walk_pgd_level_checkwx(void);
 +
 +#ifdef CONFIG_DEBUG_WX
 +#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
 +#else
 +#define debug_checkwx() do { } while (0)
 +#endif
  
  /*
   * ZERO_PAGE is a global shared page that is always zero: used
@@@ -149,12 -142,12 +149,12 @@@ static inline unsigned long pte_pfn(pte
  
  static inline unsigned long pmd_pfn(pmd_t pmd)
  {
 -      return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
 +      return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
  }
  
  static inline unsigned long pud_pfn(pud_t pud)
  {
 -      return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT;
 +      return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
  }
  
  #define pte_page(pte) pfn_to_page(pte_pfn(pte))
@@@ -165,11 -158,6 +165,6 @@@ static inline int pmd_large(pmd_t pte
  }
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
- static inline int pmd_trans_splitting(pmd_t pmd)
- {
-       return pmd_val(pmd) & _PAGE_SPLITTING;
- }
  static inline int pmd_trans_huge(pmd_t pmd)
  {
        return pmd_val(pmd) & _PAGE_PSE;
@@@ -274,6 -262,11 +269,11 @@@ static inline pmd_t pmd_mkold(pmd_t pmd
        return pmd_clear_flags(pmd, _PAGE_ACCESSED);
  }
  
+ static inline pmd_t pmd_mkclean(pmd_t pmd)
+ {
+       return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ }
  static inline pmd_t pmd_wrprotect(pmd_t pmd)
  {
        return pmd_clear_flags(pmd, _PAGE_RW);
@@@ -325,16 -318,6 +325,16 @@@ static inline pmd_t pmd_mksoft_dirty(pm
        return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
  }
  
 +static inline pte_t pte_clear_soft_dirty(pte_t pte)
 +{
 +      return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
 +}
 +
 +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 +{
 +      return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
 +}
 +
  #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
  
  /*
@@@ -396,9 -379,7 +396,9 @@@ static inline pgprot_t pgprot_modify(pg
        return __pgprot(preservebits | addbits);
  }
  
 -#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
 +#define pte_pgprot(x) __pgprot(pte_flags(x))
 +#define pmd_pgprot(x) __pgprot(pmd_flags(x))
 +#define pud_pgprot(x) __pgprot(pud_flags(x))
  
  #define canon_pgprot(p) __pgprot(massage_pgprot(p))
  
@@@ -521,15 -502,14 +521,15 @@@ static inline int pmd_none(pmd_t pmd
  
  static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  {
 -      return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
 +      return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd));
  }
  
  /*
   * Currently stuck as a macro due to indirect forward reference to
   * linux/mmzone.h's __section_mem_map_addr() definition:
   */
 -#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
 +#define pmd_page(pmd)         \
 +      pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
  
  /*
   * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
@@@ -590,15 -570,14 +590,15 @@@ static inline int pud_present(pud_t pud
  
  static inline unsigned long pud_page_vaddr(pud_t pud)
  {
 -      return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
 +      return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud));
  }
  
  /*
   * Currently stuck as a macro due to indirect forward reference to
   * linux/mmzone.h's __section_mem_map_addr() definition:
   */
 -#define pud_page(pud)         pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
 +#define pud_page(pud)         \
 +      pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
  
  /* Find an entry in the second-level page table.. */
  static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@@ -816,10 -795,6 +816,6 @@@ extern int pmdp_clear_flush_young(struc
                                  unsigned long address, pmd_t *pmdp);
  
  
- #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long addr, pmd_t *pmdp);
  #define __HAVE_ARCH_PMD_WRITE
  static inline int pmd_write(pmd_t pmd)
  {
index dd5b0aa9dd2f93a01b554029ebc243aa56ae91d3,d173197cfd9e0cfb4051a566ac96aaa663aacd9d..116fc4ee586f3750da9bb2d3d9d7867e71fbea09
@@@ -22,7 -22,6 +22,6 @@@
  #define _PAGE_BIT_PAT_LARGE   12      /* On 2MB or 1GB pages */
  #define _PAGE_BIT_SPECIAL     _PAGE_BIT_SOFTW1
  #define _PAGE_BIT_CPA_TEST    _PAGE_BIT_SOFTW1
- #define _PAGE_BIT_SPLITTING   _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
  #define _PAGE_BIT_HIDDEN      _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
  #define _PAGE_BIT_SOFT_DIRTY  _PAGE_BIT_SOFTW3 /* software dirty tracking */
  #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
@@@ -46,7 -45,6 +45,6 @@@
  #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
  #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
  #define _PAGE_CPA_TEST        (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
- #define _PAGE_SPLITTING       (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
  #define __HAVE_ARCH_PTE_SPECIAL
  
  #ifdef CONFIG_KMEMCHECK
@@@ -209,10 -207,10 +207,10 @@@ enum page_cache_mode 
  
  #include <linux/types.h>
  
 -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
 +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
  #define PTE_PFN_MASK          ((pteval_t)PHYSICAL_PAGE_MASK)
  
 -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
 +/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */
  #define PTE_FLAGS_MASK                (~PTE_PFN_MASK)
  
  typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
@@@ -276,46 -274,14 +274,46 @@@ static inline pmdval_t native_pmd_val(p
  }
  #endif
  
 +static inline pudval_t pud_pfn_mask(pud_t pud)
 +{
 +      if (native_pud_val(pud) & _PAGE_PSE)
 +              return PUD_PAGE_MASK & PHYSICAL_PAGE_MASK;
 +      else
 +              return PTE_PFN_MASK;
 +}
 +
 +static inline pudval_t pud_flags_mask(pud_t pud)
 +{
 +      if (native_pud_val(pud) & _PAGE_PSE)
 +              return ~(PUD_PAGE_MASK & (pudval_t)PHYSICAL_PAGE_MASK);
 +      else
 +              return ~PTE_PFN_MASK;
 +}
 +
  static inline pudval_t pud_flags(pud_t pud)
  {
 -      return native_pud_val(pud) & PTE_FLAGS_MASK;
 +      return native_pud_val(pud) & pud_flags_mask(pud);
 +}
 +
 +static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
 +{
 +      if (native_pmd_val(pmd) & _PAGE_PSE)
 +              return PMD_PAGE_MASK & PHYSICAL_PAGE_MASK;
 +      else
 +              return PTE_PFN_MASK;
 +}
 +
 +static inline pmdval_t pmd_flags_mask(pmd_t pmd)
 +{
 +      if (native_pmd_val(pmd) & _PAGE_PSE)
 +              return ~(PMD_PAGE_MASK & (pmdval_t)PHYSICAL_PAGE_MASK);
 +      else
 +              return ~PTE_PFN_MASK;
  }
  
  static inline pmdval_t pmd_flags(pmd_t pmd)
  {
 -      return native_pmd_val(pmd) & PTE_FLAGS_MASK;
 +      return native_pmd_val(pmd) & pmd_flags_mask(pmd);
  }
  
  static inline pte_t native_make_pte(pteval_t val)
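A self-contained sketch of why the new pmd_pfn_mask()/pud_pfn_mask() helpers above must look at _PAGE_PSE (simplified constants, 46 physical address bits assumed; illustrative userspace code, not the kernel's definitions): a 2MB leaf entry keeps flag bits in the low 21 bits, so its PFN mask has to be 2MB-aligned, while a 4KB-style entry keeps using the ordinary PTE_PFN_MASK.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PMD_SHIFT       21
#define PHYS_MASK       ((1ULL << 46) - 1)              /* assumed physical bits */
#define PTE_PFN_MASK    (PHYS_MASK & ~((1ULL << PAGE_SHIFT) - 1))
#define PMD_PAGE_MASK   (~((1ULL << PMD_SHIFT) - 1))
#define _PAGE_PSE       (1ULL << 7)

/* Mirrors the shape of the kernel helper: the mask depends on the PSE bit. */
static uint64_t pmd_pfn_mask(uint64_t pmd)
{
        return (pmd & _PAGE_PSE) ? (PMD_PAGE_MASK & PHYS_MASK) : PTE_PFN_MASK;
}

int main(void)
{
        uint64_t pmd_4k = 0x12345000ULL | 0x63;               /* points to a page table */
        uint64_t pmd_2m = 0x12200000ULL | 0x63 | _PAGE_PSE;   /* 2MB leaf mapping */

        printf("4K-style pfn: %#llx\n",
               (unsigned long long)((pmd_4k & pmd_pfn_mask(pmd_4k)) >> PAGE_SHIFT));
        printf("2MB-style pfn: %#llx\n",
               (unsigned long long)((pmd_2m & pmd_pfn_mask(pmd_2m)) >> PAGE_SHIFT));
        return 0;
}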
index e75907601a41c349e05c8dfe63047dc460ccdf30,b98f1f915357de77f21d22ed74f9836fa4df1e06..3625ac798821366bbdd2680f9db172a081550fea
@@@ -705,8 -705,14 +705,14 @@@ static void acpi_map_cpu2node(acpi_hand
  
        nid = acpi_get_node(handle);
        if (nid != -1) {
+               if (try_online_node(nid)) {
+                       pr_warn("failed to online node%d for CPU%d, use node%d instead.\n",
+                               nid, cpu, first_node(node_online_map));
+                       nid = first_node(node_online_map);
+               }
                set_apicid_to_node(physid, nid);
                numa_set_node(cpu, nid);
+               set_cpu_numa_mem(cpu, local_memory_node(nid));
        }
  #endif
  }
@@@ -733,9 -739,10 +739,10 @@@ int acpi_unmap_cpu(int cpu
  {
  #ifdef CONFIG_ACPI_NUMA
        set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+       set_cpu_numa_mem(cpu, NUMA_NO_NODE);
  #endif
  
-       per_cpu(x86_cpu_to_apicid, cpu) = -1;
+       per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
        set_cpu_present(cpu, false);
        num_processors--;
  
@@@ -976,8 -983,6 +983,8 @@@ static int __init acpi_parse_madt_lapic
  {
        int count;
        int x2count = 0;
 +      int ret;
 +      struct acpi_subtable_proc madt_proc[2];
  
        if (!cpu_has_apic)
                return -ENODEV;
                                      acpi_parse_sapic, MAX_LOCAL_APIC);
  
        if (!count) {
 -              x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
 -                                      acpi_parse_x2apic, MAX_LOCAL_APIC);
 -              count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
 -                                      acpi_parse_lapic, MAX_LOCAL_APIC);
 +              memset(madt_proc, 0, sizeof(madt_proc));
 +              madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
 +              madt_proc[0].handler = acpi_parse_lapic;
 +              madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
 +              madt_proc[1].handler = acpi_parse_x2apic;
 +              ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
 +                              sizeof(struct acpi_table_madt),
 +                              madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
 +              if (ret < 0) {
 +                      printk(KERN_ERR PREFIX
 +                                      "Error parsing LAPIC/X2APIC entries\n");
 +                      return ret;
 +              }
 +
 +              x2count = madt_proc[0].count;
 +              count = madt_proc[1].count;
        }
        if (!count && !x2count) {
                printk(KERN_ERR PREFIX "No LAPIC entries present\n");
index cd99433b8ba17597cbc9e91aba9c40eee7e05e4b,a8e618b16a66a25f07260dac1379998a79b8ac9a..6ba014c61d62d20a078dd260103f23465a47a8cd
@@@ -90,7 -90,7 +90,7 @@@ void *dma_generic_alloc_coherent(struc
  again:
        page = NULL;
        /* CMA can be used only in the context which permits sleeping */
-       if (flag & __GFP_WAIT) {
+       if (gfpflags_allow_blocking(flag)) {
                page = dma_alloc_from_contiguous(dev, count, get_order(size));
                if (page && page_to_phys(page) + size > dma_mask) {
                        dma_release_from_contiguous(dev, page, count);
@@@ -131,12 -131,11 +131,12 @@@ void dma_generic_free_coherent(struct d
  
  bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
  {
 +      if (!*dev)
 +              *dev = &x86_dma_fallback_dev;
 +
        *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
        *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
  
 -      if (!*dev)
 -              *dev = &x86_dma_fallback_dev;
        if (!is_device_dma_capable(*dev))
                return false;
        return true;
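Many hunks in this merge replace open-coded __GFP_WAIT tests with gfpflags_allow_blocking(). A compilable sketch of that test (the flag value below is illustrative; the kernel defines the real bits in include/linux/gfp.h): blocking is allowed exactly when __GFP_DIRECT_RECLAIM is set in the mask.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int gfp_t;

#define __GFP_DIRECT_RECLAIM    ((gfp_t)0x400000u)      /* illustrative bit value */
#define GFP_KERNEL              (__GFP_DIRECT_RECLAIM | 0x100u)
#define GFP_ATOMIC              ((gfp_t)0x20u)

static bool gfpflags_allow_blocking(gfp_t gfp_flags)
{
        return gfp_flags & __GFP_DIRECT_RECLAIM;
}

int main(void)
{
        printf("GFP_KERNEL may block: %d\n", gfpflags_allow_blocking(GFP_KERNEL)); /* 1 */
        printf("GFP_ATOMIC may block: %d\n", gfpflags_allow_blocking(GFP_ATOMIC)); /* 0 */
        return 0;
}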
index 892ee2e5ecbce417df506715f7b28d28c403ef91,a2a58e5337fbfbee38900fb0463c69f5f7449f1f..5ed24ea0e9455558931a414a38045cac44edb2ae
@@@ -155,6 -155,8 +155,8 @@@ static void smp_callin(void
         */
        phys_id = read_apic_id();
  
+       set_numa_mem(local_memory_node(cpu_to_node(cpuid)));
        /*
         * the boot CPU has finished the init stage and is spinning
         * on callin_map until we finish. We are free to set up this
@@@ -509,7 -511,7 +511,7 @@@ void __inquire_remote_apic(int apicid
   */
  #define UDELAY_10MS_DEFAULT 10000
  
 -static unsigned int init_udelay = UDELAY_10MS_DEFAULT;
 +static unsigned int init_udelay = INT_MAX;
  
  static int __init cpu_init_udelay(char *str)
  {
@@@ -522,16 -524,13 +524,16 @@@ early_param("cpu_init_udelay", cpu_init
  static void __init smp_quirk_init_udelay(void)
  {
        /* if cmdline changed it from default, leave it alone */
 -      if (init_udelay != UDELAY_10MS_DEFAULT)
 +      if (init_udelay != INT_MAX)
                return;
  
        /* if modern processor, use no delay */
        if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
            ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF)))
                init_udelay = 0;
 +
 +      /* else, use legacy delay */
 +      init_udelay = UDELAY_10MS_DEFAULT;
  }
  
  /*
@@@ -660,9 -659,7 +662,9 @@@ wakeup_secondary_cpu_via_init(int phys_
                /*
                 * Give the other CPU some time to accept the IPI.
                 */
 -              if (init_udelay)
 +              if (init_udelay == 0)
 +                      udelay(10);
 +              else
                        udelay(300);
  
                pr_debug("Startup point 1\n");
                /*
                 * Give the other CPU some time to accept the IPI.
                 */
 -              if (init_udelay)
 +              if (init_udelay == 0)
 +                      udelay(10);
 +              else
                        udelay(200);
  
                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
diff --combined arch/x86/mm/gup.c
index ae9a37bf13711460892584e67d02880291168f86,49bbbc57603b5e31326ff404cbbffe4d55f39a2b..f8cb3e8ac250ecc8ae288ec0135f5689ca7cc0b9
@@@ -118,26 -118,25 +118,24 @@@ static noinline int gup_huge_pmd(pmd_t 
                unsigned long end, int write, struct page **pages, int *nr)
  {
        unsigned long mask;
 -      pte_t pte = *(pte_t *)&pmd;
        struct page *head, *page;
        int refs;
  
        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
 -      if ((pte_flags(pte) & mask) != mask)
 +      if ((pmd_flags(pmd) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
 -      VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
 -      VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 +      VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
 +      VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
  
        refs = 0;
 -      head = pte_page(pte);
 +      head = pmd_page(pmd);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
@@@ -158,18 -157,7 +156,7 @@@ static int gup_pmd_range(pud_t pud, uns
                pmd_t pmd = *pmdp;
  
                next = pmd_addr_end(addr, end);
-               /*
-                * The pmd_trans_splitting() check below explains why
-                * pmdp_splitting_flush has to flush the tlb, to stop
-                * this gup-fast code from running while we set the
-                * splitting bit in the pmd. Returning zero will take
-                * the slow path that will call wait_split_huge_page()
-                * if the pmd is still in splitting state. gup-fast
-                * can't because it has irq disabled and
-                * wait_split_huge_page() would never return as the
-                * tlb flush IPI wouldn't run.
-                */
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
                        /*
@@@ -194,26 -182,25 +181,24 @@@ static noinline int gup_huge_pud(pud_t 
                unsigned long end, int write, struct page **pages, int *nr)
  {
        unsigned long mask;
 -      pte_t pte = *(pte_t *)&pud;
        struct page *head, *page;
        int refs;
  
        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
 -      if ((pte_flags(pte) & mask) != mask)
 +      if ((pud_flags(pud) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
 -      VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
 -      VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 +      VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
 +      VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
  
        refs = 0;
 -      head = pte_page(pte);
 +      head = pud_page(pud);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
-               if (PageTail(page))
-                       get_huge_page_tail(page);
                (*nr)++;
                page++;
                refs++;
diff --combined block/blk-core.c
index 89eec79658702a7e53712bc52178dae25bddcc22,0391206868e9a81dca0add4a72fc065c3b66d5c0..5dd1f54d793549e50180b0e4840f8667536351ad
@@@ -554,30 -554,29 +554,30 @@@ void blk_cleanup_queue(struct request_q
         * Drain all requests queued before DYING marking. Set DEAD flag to
         * prevent that q->request_fn() gets invoked after draining finished.
         */
 -      if (q->mq_ops) {
 -              blk_mq_freeze_queue(q);
 -              spin_lock_irq(lock);
 -      } else {
 -              spin_lock_irq(lock);
 +      blk_freeze_queue(q);
 +      spin_lock_irq(lock);
 +      if (!q->mq_ops)
                __blk_drain_queue(q, true);
 -      }
        queue_flag_set(QUEUE_FLAG_DEAD, q);
        spin_unlock_irq(lock);
  
 +      /* for synchronous bio-based driver finish in-flight integrity i/o */
 +      blk_flush_integrity();
 +
        /* @q won't process any more request, flush async actions */
        del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
        blk_sync_queue(q);
  
        if (q->mq_ops)
                blk_mq_free_queue(q);
 +      percpu_ref_exit(&q->q_usage_counter);
  
        spin_lock_irq(lock);
        if (q->queue_lock != &q->__queue_lock)
                q->queue_lock = &q->__queue_lock;
        spin_unlock_irq(lock);
  
 -      bdi_destroy(&q->backing_dev_info);
 +      bdi_unregister(&q->backing_dev_info);
  
        /* @q is and will stay empty, shutdown and put */
        blk_put_queue(q);
@@@ -630,40 -629,6 +630,40 @@@ struct request_queue *blk_alloc_queue(g
  }
  EXPORT_SYMBOL(blk_alloc_queue);
  
-               if (!(gfp & __GFP_WAIT))
 +int blk_queue_enter(struct request_queue *q, gfp_t gfp)
 +{
 +      while (true) {
 +              int ret;
 +
 +              if (percpu_ref_tryget_live(&q->q_usage_counter))
 +                      return 0;
 +
++              if (!gfpflags_allow_blocking(gfp))
 +                      return -EBUSY;
 +
 +              ret = wait_event_interruptible(q->mq_freeze_wq,
 +                              !atomic_read(&q->mq_freeze_depth) ||
 +                              blk_queue_dying(q));
 +              if (blk_queue_dying(q))
 +                      return -ENODEV;
 +              if (ret)
 +                      return ret;
 +      }
 +}
 +
 +void blk_queue_exit(struct request_queue *q)
 +{
 +      percpu_ref_put(&q->q_usage_counter);
 +}
 +
 +static void blk_queue_usage_counter_release(struct percpu_ref *ref)
 +{
 +      struct request_queue *q =
 +              container_of(ref, struct request_queue, q_usage_counter);
 +
 +      wake_up_all(&q->mq_freeze_wq);
 +}
 +
  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
  {
        struct request_queue *q;
  
        init_waitqueue_head(&q->mq_freeze_wq);
  
 -      if (blkcg_init_queue(q))
 +      /*
 +       * Init percpu_ref in atomic mode so that it's faster to shutdown.
 +       * See blk_register_queue() for details.
 +       */
 +      if (percpu_ref_init(&q->q_usage_counter,
 +                              blk_queue_usage_counter_release,
 +                              PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
                goto fail_bdi;
  
 +      if (blkcg_init_queue(q))
 +              goto fail_ref;
 +
        return q;
  
 +fail_ref:
 +      percpu_ref_exit(&q->q_usage_counter);
  fail_bdi:
        bdi_destroy(&q->backing_dev_info);
  fail_split:
@@@ -1206,8 -1160,8 +1206,8 @@@ rq_starved
   * @bio: bio to allocate request for (can be %NULL)
   * @gfp_mask: allocation mask
   *
-  * Get a free request from @q.  If %__GFP_WAIT is set in @gfp_mask, this
-  * function keeps retrying under memory pressure and fails iff @q is dead.
+  * Get a free request from @q.  If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+  * this function keeps retrying under memory pressure and fails iff @q is dead.
   *
   * Must be called with @q->queue_lock held and,
   * Returns ERR_PTR on failure, with @q->queue_lock held.
@@@ -1227,7 -1181,7 +1227,7 @@@ retry
        if (!IS_ERR(rq))
                return rq;
  
-       if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) {
+       if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
                blk_put_rl(rl);
                return rq;
        }
@@@ -1305,11 -1259,11 +1305,11 @@@ EXPORT_SYMBOL(blk_get_request)
   * BUG.
   *
   * WARNING: When allocating/cloning a bio-chain, careful consideration should be
-  * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
-  * anything but the first bio in the chain. Otherwise you risk waiting for IO
-  * completion of a bio that hasn't been submitted yet, thus resulting in a
-  * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
-  * of bio_alloc(), as that avoids the mempool deadlock.
+  * given to how you allocate bios. In particular, you cannot use
+  * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise
+  * you risk waiting for IO completion of a bio that hasn't been submitted yet,
+  * thus resulting in a deadlock. Alternatively bios should be allocated using
+  * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock.
   * If possible a big IO should be split into smaller parts when allocation
   * fails. Partial allocation should not be an error, or you risk a live-lock.
   */
        return ret;
  }
  
 +unsigned int blk_plug_queued_count(struct request_queue *q)
 +{
 +      struct blk_plug *plug;
 +      struct request *rq;
 +      struct list_head *plug_list;
 +      unsigned int ret = 0;
 +
 +      plug = current->plug;
 +      if (!plug)
 +              goto out;
 +
 +      if (q->mq_ops)
 +              plug_list = &plug->mq_list;
 +      else
 +              plug_list = &plug->list;
 +
 +      list_for_each_entry(rq, plug_list, queuelist) {
 +              if (rq->q == q)
 +                      ret++;
 +      }
 +out:
 +      return ret;
 +}
 +
  void init_request_from_bio(struct request *req, struct bio *bio)
  {
        req->cmd_type = REQ_TYPE_FS;
@@@ -1711,11 -1641,9 +1711,11 @@@ static void blk_queue_bio(struct reques
         * Check if we can merge with the plugged list before grabbing
         * any locks.
         */
 -      if (!blk_queue_nomerges(q) &&
 -          blk_attempt_plug_merge(q, bio, &request_count, NULL))
 -              return;
 +      if (!blk_queue_nomerges(q)) {
 +              if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
 +                      return;
 +      } else
 +              request_count = blk_plug_queued_count(q);
  
        spin_lock_irq(q->queue_lock);
  
@@@ -2038,19 -1966,9 +2038,19 @@@ void generic_make_request(struct bio *b
        do {
                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
  
-               if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) {
 -              q->make_request_fn(q, bio);
++              if (likely(blk_queue_enter(q, ___GFP_DIRECT_RECLAIM) == 0)) {
 +
 +                      q->make_request_fn(q, bio);
 +
 +                      blk_queue_exit(q);
  
 -              bio = bio_list_pop(current->bio_list);
 +                      bio = bio_list_pop(current->bio_list);
 +              } else {
 +                      struct bio *bio_next = bio_list_pop(current->bio_list);
 +
 +                      bio_io_error(bio);
 +                      bio = bio_next;
 +              }
        } while (bio);
        current->bio_list = NULL; /* deactivate */
  }
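A hedged userspace analog (pthreads; names only loosely follow the patch) of the blk_queue_enter()/blk_queue_exit() gate added above: submitters take a reference unless the queue is frozen or dying, non-blocking submitters get -EBUSY instead of sleeping, and unfreezing wakes the waiters, mirroring wake_up_all(&q->mq_freeze_wq).

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_queue {
        pthread_mutex_t lock;
        pthread_cond_t  unfrozen;
        int             users;
        bool            frozen;
        bool            dying;
};

static int queue_enter(struct fake_queue *q, bool may_block)
{
        pthread_mutex_lock(&q->lock);
        while (q->frozen && !q->dying) {
                if (!may_block) {               /* mirrors !gfpflags_allow_blocking(gfp) */
                        pthread_mutex_unlock(&q->lock);
                        return -EBUSY;
                }
                pthread_cond_wait(&q->unfrozen, &q->lock);
        }
        if (q->dying) {
                pthread_mutex_unlock(&q->lock);
                return -ENODEV;
        }
        q->users++;                             /* stands in for percpu_ref_tryget_live() */
        pthread_mutex_unlock(&q->lock);
        return 0;
}

static void queue_exit(struct fake_queue *q)
{
        pthread_mutex_lock(&q->lock);
        q->users--;                             /* stands in for percpu_ref_put() */
        pthread_mutex_unlock(&q->lock);
}

static void queue_unfreeze(struct fake_queue *q)
{
        pthread_mutex_lock(&q->lock);
        q->frozen = false;
        pthread_cond_broadcast(&q->unfrozen);   /* mirrors wake_up_all(&q->mq_freeze_wq) */
        pthread_mutex_unlock(&q->lock);
}

int main(void)
{
        struct fake_queue q = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .unfrozen = PTHREAD_COND_INITIALIZER,
        };

        if (!queue_enter(&q, false)) {
                printf("entered, users=%d\n", q.users);
                queue_exit(&q);
        }

        q.frozen = true;
        printf("frozen, non-blocking enter -> %d\n", queue_enter(&q, false));

        queue_unfreeze(&q);
        if (!queue_enter(&q, true)) {
                printf("unfrozen, blocking enter succeeded\n");
                queue_exit(&q);
        }
        return 0;
}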
diff --combined block/blk-mq-tag.c
index 60ac684c8b8c52f26fe8a83d290fbd8f75f76581,adbc577d83c481d4763848e9ed4b415afd4193c1..a07ca3488d96fb7a96159fbe48c0c9e5e95391ec
@@@ -75,10 -75,6 +75,10 @@@ void blk_mq_tag_wakeup_all(struct blk_m
        struct blk_mq_bitmap_tags *bt;
        int i, wake_index;
  
 +      /*
 +       * Make sure all changes prior to this are visible from other CPUs.
 +       */
 +      smp_mb();
        bt = &tags->bitmap_tags;
        wake_index = atomic_read(&bt->wake_index);
        for (i = 0; i < BT_WAIT_QUEUES; i++) {
@@@ -268,7 -264,7 +268,7 @@@ static int bt_get(struct blk_mq_alloc_d
        if (tag != -1)
                return tag;
  
-       if (!(data->gfp & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(data->gfp))
                return -1;
  
        bs = bt_wait_ptr(bt, hctx);
@@@ -645,7 -641,6 +645,7 @@@ void blk_mq_free_tags(struct blk_mq_tag
  {
        bt_free(&tags->bitmap_tags);
        bt_free(&tags->breserved_tags);
 +      free_cpumask_var(tags->cpumask);
        kfree(tags);
  }
  
diff --combined block/blk-mq.c
index 27bf3097532d02b0d43228d26bc696362b86cb28,3f3544edb941f169cae54042720c118c40a8563b..34e26163b73a434abf27bae5af8e6b21a6596c20
@@@ -9,7 -9,6 +9,7 @@@
  #include <linux/backing-dev.h>
  #include <linux/bio.h>
  #include <linux/blkdev.h>
 +#include <linux/kmemleak.h>
  #include <linux/mm.h>
  #include <linux/init.h>
  #include <linux/slab.h>
@@@ -78,13 -77,47 +78,13 @@@ static void blk_mq_hctx_clear_pending(s
        clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
  }
  
 -static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 -{
 -      while (true) {
 -              int ret;
 -
 -              if (percpu_ref_tryget_live(&q->mq_usage_counter))
 -                      return 0;
 -
 -              if (!gfpflags_allow_blocking(gfp))
 -                      return -EBUSY;
 -
 -              ret = wait_event_interruptible(q->mq_freeze_wq,
 -                              !atomic_read(&q->mq_freeze_depth) ||
 -                              blk_queue_dying(q));
 -              if (blk_queue_dying(q))
 -                      return -ENODEV;
 -              if (ret)
 -                      return ret;
 -      }
 -}
 -
 -static void blk_mq_queue_exit(struct request_queue *q)
 -{
 -      percpu_ref_put(&q->mq_usage_counter);
 -}
 -
 -static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 -{
 -      struct request_queue *q =
 -              container_of(ref, struct request_queue, mq_usage_counter);
 -
 -      wake_up_all(&q->mq_freeze_wq);
 -}
 -
  void blk_mq_freeze_queue_start(struct request_queue *q)
  {
        int freeze_depth;
  
        freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
        if (freeze_depth == 1) {
 -              percpu_ref_kill(&q->mq_usage_counter);
 +              percpu_ref_kill(&q->q_usage_counter);
                blk_mq_run_hw_queues(q, false);
        }
  }
@@@ -92,34 -125,18 +92,34 @@@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_s
  
  static void blk_mq_freeze_queue_wait(struct request_queue *q)
  {
 -      wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
 +      wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
  }
  
  /*
   * Guarantee no request is in use, so we can change any data structure of
   * the queue afterward.
   */
 -void blk_mq_freeze_queue(struct request_queue *q)
 +void blk_freeze_queue(struct request_queue *q)
  {
 +      /*
 +       * In the !blk_mq case we are only calling this to kill the
 +       * q_usage_counter, otherwise this increases the freeze depth
 +       * and waits for it to return to zero.  For this reason there is
 +       * no blk_unfreeze_queue(), and blk_freeze_queue() is not
 +       * exported to drivers as the only user for unfreeze is blk_mq.
 +       */
        blk_mq_freeze_queue_start(q);
        blk_mq_freeze_queue_wait(q);
  }
 +
 +void blk_mq_freeze_queue(struct request_queue *q)
 +{
 +      /*
 +       * ...just an alias to keep freeze and unfreeze actions balanced
 +       * in the blk_mq_* namespace
 +       */
 +      blk_freeze_queue(q);
 +}
  EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
  
  void blk_mq_unfreeze_queue(struct request_queue *q)
        freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
        WARN_ON_ONCE(freeze_depth < 0);
        if (!freeze_depth) {
 -              percpu_ref_reinit(&q->mq_usage_counter);
 +              percpu_ref_reinit(&q->q_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
        }
  }
@@@ -238,17 -255,17 +238,17 @@@ struct request *blk_mq_alloc_request(st
        struct blk_mq_alloc_data alloc_data;
        int ret;
  
 -      ret = blk_mq_queue_enter(q, gfp);
 +      ret = blk_queue_enter(q, gfp);
        if (ret)
                return ERR_PTR(ret);
  
        ctx = blk_mq_get_ctx(q);
        hctx = q->mq_ops->map_queue(q, ctx->cpu);
-       blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
+       blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
                        reserved, ctx, hctx);
  
        rq = __blk_mq_alloc_request(&alloc_data, rw);
-       if (!rq && (gfp & __GFP_WAIT)) {
+       if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
                __blk_mq_run_hw_queue(hctx);
                blk_mq_put_ctx(ctx);
  
        }
        blk_mq_put_ctx(ctx);
        if (!rq) {
 -              blk_mq_queue_exit(q);
 +              blk_queue_exit(q);
                return ERR_PTR(-EWOULDBLOCK);
        }
        return rq;
@@@ -280,7 -297,7 +280,7 @@@ static void __blk_mq_free_request(struc
  
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
        blk_mq_put_tag(hctx, tag, &ctx->last_tag);
 -      blk_mq_queue_exit(q);
 +      blk_queue_exit(q);
  }
  
  void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
@@@ -972,25 -989,18 +972,25 @@@ void blk_mq_delay_queue(struct blk_mq_h
  }
  EXPORT_SYMBOL(blk_mq_delay_queue);
  
 -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 -                                  struct request *rq, bool at_head)
 +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 +                                          struct blk_mq_ctx *ctx,
 +                                          struct request *rq,
 +                                          bool at_head)
  {
 -      struct blk_mq_ctx *ctx = rq->mq_ctx;
 -
        trace_block_rq_insert(hctx->queue, rq);
  
        if (at_head)
                list_add(&rq->queuelist, &ctx->rq_list);
        else
                list_add_tail(&rq->queuelist, &ctx->rq_list);
 +}
 +
 +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 +                                  struct request *rq, bool at_head)
 +{
 +      struct blk_mq_ctx *ctx = rq->mq_ctx;
  
 +      __blk_mq_insert_req_list(hctx, ctx, rq, at_head);
        blk_mq_hctx_mark_pending(hctx, ctx);
  }
  
@@@ -1046,9 -1056,8 +1046,9 @@@ static void blk_mq_insert_requests(stru
                rq = list_first_entry(list, struct request, queuelist);
                list_del_init(&rq->queuelist);
                rq->mq_ctx = ctx;
 -              __blk_mq_insert_request(hctx, rq, false);
 +              __blk_mq_insert_req_list(hctx, ctx, rq, false);
        }
 +      blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
  
        blk_mq_run_hw_queue(hctx, from_schedule);
@@@ -1130,7 -1139,7 +1130,7 @@@ static inline bool blk_mq_merge_queue_i
                                         struct blk_mq_ctx *ctx,
                                         struct request *rq, struct bio *bio)
  {
 -      if (!hctx_allow_merges(hctx)) {
 +      if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
                blk_mq_bio_to_request(rq, bio);
                spin_lock(&ctx->lock);
  insert_rq:
@@@ -1167,7 -1176,11 +1167,7 @@@ static struct request *blk_mq_map_reque
        int rw = bio_data_dir(bio);
        struct blk_mq_alloc_data alloc_data;
  
 -      if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
 -              bio_io_error(bio);
 -              return NULL;
 -      }
 -
 +      blk_queue_enter_live(q);
        ctx = blk_mq_get_ctx(q);
        hctx = q->mq_ops->map_queue(q, ctx->cpu);
  
                ctx = blk_mq_get_ctx(q);
                hctx = q->mq_ops->map_queue(q, ctx->cpu);
                blk_mq_set_alloc_data(&alloc_data, q,
-                               __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
+                               __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
                rq = __blk_mq_alloc_request(&alloc_data, rw);
                ctx = alloc_data.ctx;
                hctx = alloc_data.hctx;
@@@ -1254,12 -1267,9 +1254,12 @@@ static void blk_mq_make_request(struct 
  
        blk_queue_split(q, &bio, q->bio_split);
  
 -      if (!is_flush_fua && !blk_queue_nomerges(q) &&
 -          blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 -              return;
 +      if (!is_flush_fua && !blk_queue_nomerges(q)) {
 +              if (blk_attempt_plug_merge(q, bio, &request_count,
 +                                         &same_queue_rq))
 +                      return;
 +      } else
 +              request_count = blk_plug_queued_count(q);
  
        rq = blk_mq_map_request(q, bio, &data);
        if (unlikely(!rq))
@@@ -1366,7 -1376,7 +1366,7 @@@ static void blk_sq_make_request(struct 
        plug = current->plug;
        if (plug) {
                blk_mq_bio_to_request(rq, bio);
 -              if (list_empty(&plug->mq_list))
 +              if (!request_count)
                        trace_block_plug(q);
                else if (request_count >= BLK_MAX_REQUEST_COUNT) {
                        blk_flush_plug_list(plug, false);
@@@ -1420,11 -1430,6 +1420,11 @@@ static void blk_mq_free_rq_map(struct b
        while (!list_empty(&tags->page_list)) {
                page = list_first_entry(&tags->page_list, struct page, lru);
                list_del_init(&page->lru);
 +              /*
 +               * Remove kmemleak object previously allocated in
 +               * blk_mq_init_rq_map().
 +               */
 +              kmemleak_free(page_address(page));
                __free_pages(page, page->private);
        }
  
@@@ -1497,11 -1502,6 +1497,11 @@@ static struct blk_mq_tags *blk_mq_init_
                list_add_tail(&page->lru, &tags->page_list);
  
                p = page_address(page);
 +              /*
 +               * Allow kmemleak to scan these pages as they contain pointers
 +               * to additional allocations like via ops->init_request().
 +               */
 +              kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
                entries_per_page = order_to_size(this_order) / rq_size;
                to_do = min(entries_per_page, set->queue_depth - i);
                left -= to_do * rq_size;
@@@ -1989,6 -1989,14 +1989,6 @@@ struct request_queue *blk_mq_init_alloc
                hctxs[i]->queue_num = i;
        }
  
 -      /*
 -       * Init percpu_ref in atomic mode so that it's faster to shutdown.
 -       * See blk_register_queue() for details.
 -       */
 -      if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
 -                          PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
 -              goto err_hctxs;
 -
        setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
  
@@@ -2069,6 -2077,8 +2069,6 @@@ void blk_mq_free_queue(struct request_q
  
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
        blk_mq_free_hw_queues(q, set);
 -
 -      percpu_ref_exit(&q->mq_usage_counter);
  }
  
  /* Basically redo blk_mq_init_queue with queue frozen */
@@@ -2286,8 -2296,10 +2286,8 @@@ void blk_mq_free_tag_set(struct blk_mq_
        int i;
  
        for (i = 0; i < set->nr_hw_queues; i++) {
 -              if (set->tags[i]) {
 +              if (set->tags[i])
                        blk_mq_free_rq_map(set, set->tags[i], i);
 -                      free_cpumask_var(set->tags[i]->cpumask);
 -              }
        }
  
        kfree(set->tags);
diff --combined block/genhd.c
index e5cafa51567c9d589147523c8ab7b43504f9d725,3213b66515f00bcb446a9c9874d24536a2625bfe..ebb41feea35754525761edf815b7e3a994ab0a06
@@@ -630,7 -630,6 +630,7 @@@ void add_disk(struct gendisk *disk
        WARN_ON(retval);
  
        disk_add_events(disk);
 +      blk_integrity_add(disk);
  }
  EXPORT_SYMBOL(add_disk);
  
@@@ -639,7 -638,6 +639,7 @@@ void del_gendisk(struct gendisk *disk
        struct disk_part_iter piter;
        struct hd_struct *part;
  
 +      blk_integrity_del(disk);
        disk_del_events(disk);
  
        /* invalidate stuff */
@@@ -852,7 -850,7 +852,7 @@@ static int show_partition(struct seq_fi
        char buf[BDEVNAME_SIZE];
  
        /* Don't show non-partitionable removable devices or empty devices */
-       if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+       if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
                                   (sgp->flags & GENHD_FL_REMOVABLE)))
                return 0;
        if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
index d3d73d114a4615e124e89bd6d4196ba5be35f415,3dc53a16ed3aaf14dfd30311c16e0dc14f6623a0..9462d27528507d693d8e4efe0e6464597ab1768b
@@@ -29,7 -29,7 +29,7 @@@
  #include <linux/string.h>
  #include <linux/drbd.h>
  #include <linux/slab.h>
 -#include <asm/kmap_types.h>
 +#include <linux/highmem.h>
  
  #include "drbd_int.h"
  
@@@ -1007,7 -1007,7 +1007,7 @@@ static void bm_page_io_async(struct drb
        bm_set_page_unchanged(b->bm_pages[page_nr]);
  
        if (ctx->flags & BM_AIO_COPY_PAGES) {
-               page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+               page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM);
                copy_highpage(page, b->bm_pages[page_nr]);
                bm_store_page_idx(page, page_nr);
        } else
diff --combined drivers/block/nbd.c
index 1b87623381e2b1183b5c9d57c870b7c10924f65e,214de17d0659f54f21e2b3034a46d690f821a063..93b3f99b6865fe721f7124412553cadf3c328e7a
@@@ -60,7 -60,6 +60,7 @@@ struct nbd_device 
        bool disconnect; /* a disconnect has been requested by user */
  
        struct timer_list timeout_timer;
 +      spinlock_t tasks_lock;
        struct task_struct *task_recv;
        struct task_struct *task_send;
  
@@@ -141,23 -140,21 +141,23 @@@ static void sock_shutdown(struct nbd_de
  static void nbd_xmit_timeout(unsigned long arg)
  {
        struct nbd_device *nbd = (struct nbd_device *)arg;
 -      struct task_struct *task;
 +      unsigned long flags;
  
        if (list_empty(&nbd->queue_head))
                return;
  
        nbd->disconnect = true;
  
 -      task = READ_ONCE(nbd->task_recv);
 -      if (task)
 -              force_sig(SIGKILL, task);
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
 +
 +      if (nbd->task_recv)
 +              force_sig(SIGKILL, nbd->task_recv);
  
 -      task = READ_ONCE(nbd->task_send);
 -      if (task)
 +      if (nbd->task_send)
                force_sig(SIGKILL, nbd->task_send);
  
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 +
        dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n");
  }
  
@@@ -406,24 -403,17 +406,24 @@@ static int nbd_thread_recv(struct nbd_d
  {
        struct request *req;
        int ret;
 +      unsigned long flags;
  
        BUG_ON(nbd->magic != NBD_MAGIC);
  
        sk_set_memalloc(nbd->sock->sk);
  
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_recv = current;
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
  
        ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
        if (ret) {
                dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
 +
 +              spin_lock_irqsave(&nbd->tasks_lock, flags);
                nbd->task_recv = NULL;
 +              spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 +
                return ret;
        }
  
  
        device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
  
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_recv = NULL;
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
  
        if (signal_pending(current)) {
-               siginfo_t info;
-               ret = dequeue_signal_lock(current, &current->blocked, &info);
+               ret = kernel_dequeue_signal(NULL);
                dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
                         task_pid_nr(current), current->comm, ret);
                mutex_lock(&nbd->tx_lock);
@@@ -546,11 -532,8 +544,11 @@@ static int nbd_thread_send(void *data
  {
        struct nbd_device *nbd = data;
        struct request *req;
 +      unsigned long flags;
  
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_send = current;
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
  
        set_user_nice(current, MIN_NICE);
        while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
                                         !list_empty(&nbd->waiting_queue));
  
                if (signal_pending(current)) {
-                       siginfo_t info;
-                       int ret;
+                       int ret = kernel_dequeue_signal(NULL);
  
-                       ret = dequeue_signal_lock(current, &current->blocked,
-                                                 &info);
                        dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n",
                                 task_pid_nr(current), current->comm, ret);
                        mutex_lock(&nbd->tx_lock);
                nbd_handle_req(nbd, req);
        }
  
 +      spin_lock_irqsave(&nbd->tasks_lock, flags);
        nbd->task_send = NULL;
-       if (signal_pending(current)) {
-               siginfo_t info;
-               dequeue_signal_lock(current, &current->blocked, &info);
-       }
 +      spin_unlock_irqrestore(&nbd->tasks_lock, flags);
 +
 +      /* Clear any pending signals */
++      if (signal_pending(current))
++              kernel_dequeue_signal(NULL);
  
        return 0;
  }
@@@ -1075,7 -1047,6 +1068,7 @@@ static int __init nbd_init(void
                nbd_dev[i].magic = NBD_MAGIC;
                INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
                spin_lock_init(&nbd_dev[i].queue_lock);
 +              spin_lock_init(&nbd_dev[i].tasks_lock);
                INIT_LIST_HEAD(&nbd_dev[i].queue_head);
                mutex_init(&nbd_dev[i].tx_lock);
                init_timer(&nbd_dev[i].timeout_timer);
diff --combined drivers/block/pktcdvd.c
index cd813f9110bfc99314604fcf0ae93e58242ac413,5959c2981cc7d18efbbec56615db1d29b2698064..2f477d45d6cfa42d586080db8c293d41406055ae
@@@ -704,14 -704,14 +704,14 @@@ static int pkt_generic_packet(struct pk
        int ret = 0;
  
        rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
-                            WRITE : READ, __GFP_WAIT);
+                            WRITE : READ, __GFP_RECLAIM);
        if (IS_ERR(rq))
                return PTR_ERR(rq);
        blk_rq_set_block_pc(rq);
  
        if (cgc->buflen) {
                ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
-                                     __GFP_WAIT);
+                                     __GFP_RECLAIM);
                if (ret)
                        goto out;
        }
@@@ -2803,7 -2803,8 +2803,7 @@@ out_new_dev
  out_mem2:
        put_disk(disk);
  out_mem:
 -      if (pd->rb_pool)
 -              mempool_destroy(pd->rb_pool);
 +      mempool_destroy(pd->rb_pool);
        kfree(pd);
  out_mutex:
        mutex_unlock(&ctl_mutex);
index abeb9af31f9c5e7c02b3f8d64ddf34f01621cfd0,1d47d2e9487c49de9bf58db9d0902a94afbd9a7a..2e10bba4468b0c6b65aee0d07fa279295c52d2b2
@@@ -244,9 -244,8 +244,9 @@@ drm_gem_object_handle_unreference_unloc
   * @filp: drm file-private structure to use for the handle look up
   * @handle: userspace handle to delete
   *
 - * Removes the GEM handle from the @filp lookup table and if this is the last
 - * handle also cleans up linked resources like GEM names.
 + * Removes the GEM handle from the @filp lookup table which has been added with
 + * drm_gem_handle_create(). If this is the last handle also cleans up linked
 + * resources like GEM names.
   */
  int
  drm_gem_handle_delete(struct drm_file *filp, u32 handle)
@@@ -315,10 -314,6 +315,10 @@@ EXPORT_SYMBOL(drm_gem_dumb_destroy)
   * This expects the dev->object_name_lock to be held already and will drop it
   * before returning. Used to avoid races in establishing new handles when
   * importing an object from either an flink name or a dma-buf.
 + *
 + * Handles must be released again through drm_gem_handle_delete(). This is done
 + * when userspace closes @file_priv for all attached handles, or through the
 + * GEM_CLOSE ioctl for individual handles.
   */
  int
  drm_gem_handle_create_tail(struct drm_file *file_priv,
@@@ -496,7 -491,7 +496,7 @@@ struct page **drm_gem_get_pages(struct 
                 * __GFP_DMA32 to be set in mapping_gfp_mask(inode->i_mapping)
                 * so shmem can relocate pages during swapin if required.
                 */
-               BUG_ON((mapping_gfp_mask(mapping) & __GFP_DMA32) &&
+               BUG_ON(mapping_gfp_constraint(mapping, __GFP_DMA32) &&
                                (page_to_pfn(p) >= 0x00100000UL));
        }
  
@@@ -546,17 -541,7 +546,17 @@@ void drm_gem_put_pages(struct drm_gem_o
  }
  EXPORT_SYMBOL(drm_gem_put_pages);
  
 -/** Returns a reference to the object named by the handle. */
 +/**
 + * drm_gem_object_lookup - look up a GEM object from its handle
 + * @dev: DRM device
 + * @filp: DRM file private data
 + * @handle: userspace handle
 + *
 + * Returns:
 + *
 + * A reference to the object named by the handle if such exists on @filp, NULL
 + * otherwise.
 + */
  struct drm_gem_object *
  drm_gem_object_lookup(struct drm_device *dev, struct drm_file *filp,
                      u32 handle)
@@@ -778,8 -763,7 +778,8 @@@ EXPORT_SYMBOL(drm_gem_object_release)
  void
  drm_gem_object_free(struct kref *kref)
  {
 -      struct drm_gem_object *obj = (struct drm_gem_object *) kref;
 +      struct drm_gem_object *obj =
 +              container_of(kref, struct drm_gem_object, refcount);
        struct drm_device *dev = obj->dev;
  
        WARN_ON(!mutex_is_locked(&dev->struct_mutex));
  }
  EXPORT_SYMBOL(drm_gem_object_free);
  
 +/**
 + * drm_gem_vm_open - vma->ops->open implementation for GEM
 + * @vma: VM area structure
 + *
 + * This function implements the #vm_operations_struct open() callback for GEM
 + * drivers. This must be used together with drm_gem_vm_close().
 + */
  void drm_gem_vm_open(struct vm_area_struct *vma)
  {
        struct drm_gem_object *obj = vma->vm_private_data;
  }
  EXPORT_SYMBOL(drm_gem_vm_open);
  
 +/**
 + * drm_gem_vm_close - vma->ops->close implementation for GEM
 + * @vma: VM area structure
 + *
 + * This function implements the #vm_operations_struct close() callback for GEM
 + * drivers. This must be used together with drm_gem_vm_open().
 + */
  void drm_gem_vm_close(struct vm_area_struct *vma)
  {
        struct drm_gem_object *obj = vma->vm_private_data;
@@@ -840,6 -810,8 +840,6 @@@ EXPORT_SYMBOL(drm_gem_vm_close)
   * drm_gem_mmap() prevents unprivileged users from mapping random objects. So
   * callers must verify access restrictions before calling this helper.
   *
 - * NOTE: This function has to be protected with dev->struct_mutex
 - *
   * Return 0 on success or -EINVAL if the object size is smaller than the VMA
   * size, or if no gem_vm_ops are provided.
   */
@@@ -848,6 -820,8 +848,6 @@@ int drm_gem_mmap_obj(struct drm_gem_obj
  {
        struct drm_device *dev = obj->dev;
  
 -      lockdep_assert_held(&dev->struct_mutex);
 -
        /* Check for valid size. */
        if (obj_size < vma->vm_end - vma->vm_start)
                return -EINVAL;
@@@ -891,46 -865,30 +891,46 @@@ int drm_gem_mmap(struct file *filp, str
  {
        struct drm_file *priv = filp->private_data;
        struct drm_device *dev = priv->minor->dev;
 -      struct drm_gem_object *obj;
 +      struct drm_gem_object *obj = NULL;
        struct drm_vma_offset_node *node;
        int ret;
  
        if (drm_device_is_unplugged(dev))
                return -ENODEV;
  
 -      mutex_lock(&dev->struct_mutex);
 +      drm_vma_offset_lock_lookup(dev->vma_offset_manager);
 +      node = drm_vma_offset_exact_lookup_locked(dev->vma_offset_manager,
 +                                                vma->vm_pgoff,
 +                                                vma_pages(vma));
 +      if (likely(node)) {
 +              obj = container_of(node, struct drm_gem_object, vma_node);
 +              /*
 +               * When the object is being freed, after it hits 0-refcnt it
 +               * proceeds to tear down the object. In the process it will
 +               * attempt to remove the VMA offset and so acquire this
 +               * mgr->vm_lock.  Therefore if we find an object with a 0-refcnt
 +               * that matches our range, we know it is in the process of being
 +               * destroyed and will be freed as soon as we release the lock -
 +               * so we have to check for the 0-refcnted object and treat it as
 +               * invalid.
 +               */
 +              if (!kref_get_unless_zero(&obj->refcount))
 +                      obj = NULL;
 +      }
 +      drm_vma_offset_unlock_lookup(dev->vma_offset_manager);
  
 -      node = drm_vma_offset_exact_lookup(dev->vma_offset_manager,
 -                                         vma->vm_pgoff,
 -                                         vma_pages(vma));
 -      if (!node) {
 -              mutex_unlock(&dev->struct_mutex);
 +      if (!obj)
                return -EINVAL;
 -      } else if (!drm_vma_node_is_allowed(node, filp)) {
 -              mutex_unlock(&dev->struct_mutex);
 +
 +      if (!drm_vma_node_is_allowed(node, filp)) {
 +              drm_gem_object_unreference_unlocked(obj);
                return -EACCES;
        }
  
 -      obj = container_of(node, struct drm_gem_object, vma_node);
 -      ret = drm_gem_mmap_obj(obj, drm_vma_node_size(node) << PAGE_SHIFT, vma);
 +      ret = drm_gem_mmap_obj(obj, drm_vma_node_size(node) << PAGE_SHIFT,
 +                             vma);
  
 -      mutex_unlock(&dev->struct_mutex);
 +      drm_gem_object_unreference_unlocked(obj);
  
        return ret;
  }
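A hedged userspace analog of the lookup pattern drm_gem_mmap() switches to above: an object found under the lookup lock is only usable if its refcount can be raised from a non-zero value; a zero refcount means it is mid-teardown and must be treated as absent. The compare-and-swap helper below stands in for kref_get_unless_zero().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        atomic_int refcount;
};

/* Take a reference only if the count is still non-zero. */
static bool get_unless_zero(struct obj *o)
{
        int v = atomic_load(&o->refcount);

        while (v != 0) {
                if (atomic_compare_exchange_weak(&o->refcount, &v, v + 1))
                        return true;            /* reference taken */
        }
        return false;                           /* object is being destroyed */
}

int main(void)
{
        struct obj live  = { .refcount = 1 };
        struct obj dying = { .refcount = 0 };

        printf("live:  %s\n", get_unless_zero(&live)  ? "got ref" : "treat as absent");
        printf("dying: %s\n", get_unless_zero(&dying) ? "got ref" : "treat as absent");
        return 0;
}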
index e57061ac02191dd352d71f72ed0599f58c80b45b,399aab265db3cdd669bced4452d237ad78076b98..5cf4a1998273c3cfcc494c83210c0bc572f35c2e
                if (!needs_clflush_after &&
                    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
                        if (i915_gem_clflush_object(obj, obj->pin_display))
 -                              i915_gem_chipset_flush(dev);
 +                              needs_clflush_after = true;
                }
        }
  
        if (needs_clflush_after)
                i915_gem_chipset_flush(dev);
 +      else
 +              obj->cache_dirty = true;
  
        intel_fb_obj_flush(obj, false, ORIGIN_CPU);
        return ret;
@@@ -1713,8 -1711,8 +1713,8 @@@ i915_gem_mmap_ioctl(struct drm_device *
  
  /**
   * i915_gem_fault - fault a page into the GTT
 - * vma: VMA in question
 - * vmf: fault info
 + * @vma: VMA in question
 + * @vmf: fault info
   *
   * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
   * from userspace.  The fault handler takes care of binding the object to
@@@ -2216,9 -2214,8 +2216,8 @@@ i915_gem_object_get_pages_gtt(struct dr
         * Fail silently without starting the shrinker
         */
        mapping = file_inode(obj->base.filp)->i_mapping;
-       gfp = mapping_gfp_mask(mapping);
-       gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
-       gfp &= ~(__GFP_IO | __GFP_WAIT);
+       gfp = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM));
+       gfp |= __GFP_NORETRY | __GFP_NOWARN;
        sg = st->sgl;
        st->nents = 0;
        for (i = 0; i < page_count; i++) {
@@@ -3208,7 -3205,7 +3207,7 @@@ static void i915_gem_object_finish_gtt(
                                            old_write_domain);
  }
  
 -int i915_vma_unbind(struct i915_vma *vma)
 +static int __i915_vma_unbind(struct i915_vma *vma, bool wait)
  {
        struct drm_i915_gem_object *obj = vma->obj;
        struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
  
        BUG_ON(obj->pages == NULL);
  
 -      ret = i915_gem_object_wait_rendering(obj, false);
 -      if (ret)
 -              return ret;
 -      /* Continue on if we fail due to EIO, the GPU is hung so we
 -       * should be safe and we need to cleanup or else we might
 -       * cause memory corruption through use-after-free.
 -       */
 +      if (wait) {
 +              ret = i915_gem_object_wait_rendering(obj, false);
 +              if (ret)
 +                      return ret;
 +      }
  
        if (i915_is_ggtt(vma->vm) &&
            vma->ggtt_view.type == I915_GGTT_VIEW_NORMAL) {
        return 0;
  }
  
 +int i915_vma_unbind(struct i915_vma *vma)
 +{
 +      return __i915_vma_unbind(vma, true);
 +}
 +
 +int __i915_vma_unbind_no_wait(struct i915_vma *vma)
 +{
 +      return __i915_vma_unbind(vma, false);
 +}
 +
  int i915_gpu_idle(struct drm_device *dev)
  {
        struct drm_i915_private *dev_priv = dev->dev_private;
@@@ -3365,10 -3354,11 +3364,10 @@@ i915_gem_object_bind_to_vm(struct drm_i
  {
        struct drm_device *dev = obj->base.dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
 -      u32 size, fence_size, fence_alignment, unfenced_alignment;
 -      u64 start =
 -              flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
 -      u64 end =
 -              flags & PIN_MAPPABLE ? dev_priv->gtt.mappable_end : vm->total;
 +      u32 fence_alignment, unfenced_alignment;
 +      u32 search_flag, alloc_flag;
 +      u64 start, end;
 +      u64 size, fence_size;
        struct i915_vma *vma;
        int ret;
  
                size = flags & PIN_MAPPABLE ? fence_size : obj->base.size;
        }
  
 +      start = flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
 +      end = vm->total;
 +      if (flags & PIN_MAPPABLE)
 +              end = min_t(u64, end, dev_priv->gtt.mappable_end);
 +      if (flags & PIN_ZONE_4G)
 +              end = min_t(u64, end, (1ULL << 32));
 +
        if (alignment == 0)
                alignment = flags & PIN_MAPPABLE ? fence_alignment :
                                                unfenced_alignment;
         * attempt to find space.
         */
        if (size > end) {
 -              DRM_DEBUG("Attempting to bind an object (view type=%u) larger than the aperture: size=%u > %s aperture=%llu\n",
 +              DRM_DEBUG("Attempting to bind an object (view type=%u) larger than the aperture: size=%llu > %s aperture=%llu\n",
                          ggtt_view ? ggtt_view->type : 0,
                          size,
                          flags & PIN_MAPPABLE ? "mappable" : "total",
        if (IS_ERR(vma))
                goto err_unpin;
  
 +      if (flags & PIN_HIGH) {
 +              search_flag = DRM_MM_SEARCH_BELOW;
 +              alloc_flag = DRM_MM_CREATE_TOP;
 +      } else {
 +              search_flag = DRM_MM_SEARCH_DEFAULT;
 +              alloc_flag = DRM_MM_CREATE_DEFAULT;
 +      }
 +
  search_free:
        ret = drm_mm_insert_node_in_range_generic(&vm->mm, &vma->node,
                                                  size, alignment,
                                                  obj->cache_level,
                                                  start, end,
 -                                                DRM_MM_SEARCH_DEFAULT,
 -                                                DRM_MM_CREATE_DEFAULT);
 +                                                search_flag,
 +                                                alloc_flag);
        if (ret) {
                ret = i915_gem_evict_something(dev, vm, size, alignment,
                                               obj->cache_level,
@@@ -3657,117 -3632,59 +3656,117 @@@ i915_gem_object_set_to_gtt_domain(struc
        return 0;
  }
  
 +/**
 + * Changes the cache-level of an object across all VMA.
 + *
 + * After this function returns, the object will be in the new cache-level
 + * across all GTT and the contents of the backing storage will be coherent,
 + * with respect to the new cache-level. In order to keep the backing storage
 + * coherent for all users, we only allow a single cache level to be set
 + * globally on the object and prevent it from being changed whilst the
 + * hardware is reading from the object. That is, if the object is currently
 + * on the scanout it will be set to uncached (or equivalent display
 + * cache coherency) and all non-MOCS GPU access will also be uncached so
 + * that all direct access to the scanout remains coherent.
 + */
  int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
                                    enum i915_cache_level cache_level)
  {
        struct drm_device *dev = obj->base.dev;
        struct i915_vma *vma, *next;
 -      int ret;
 +      bool bound = false;
 +      int ret = 0;
  
        if (obj->cache_level == cache_level)
 -              return 0;
 -
 -      if (i915_gem_obj_is_pinned(obj)) {
 -              DRM_DEBUG("can not change the cache level of pinned objects\n");
 -              return -EBUSY;
 -      }
 +              goto out;
  
 +      /* Inspect the list of currently bound VMA and unbind any that would
 +       * be invalid given the new cache-level. This is principally to
 +       * catch the issue of the CS prefetch crossing page boundaries and
 +       * reading an invalid PTE on older architectures.
 +       */
        list_for_each_entry_safe(vma, next, &obj->vma_list, vma_link) {
 +              if (!drm_mm_node_allocated(&vma->node))
 +                      continue;
 +
 +              if (vma->pin_count) {
 +                      DRM_DEBUG("can not change the cache level of pinned objects\n");
 +                      return -EBUSY;
 +              }
 +
                if (!i915_gem_valid_gtt_space(vma, cache_level)) {
                        ret = i915_vma_unbind(vma);
                        if (ret)
                                return ret;
 -              }
 +              } else
 +                      bound = true;
        }
  
 -      if (i915_gem_obj_bound_any(obj)) {
 +      /* We can reuse the existing drm_mm nodes but need to change the
 +       * cache-level on the PTE. We could simply unbind them all and
 +       * rebind with the correct cache-level on next use. However since
 +       * we already have a valid slot, dma mapping, pages etc, we may as
 +       * we already have a valid slot, dma mapping, pages etc, we may as well
 +       * state and so involves less work.
 +       */
 +      if (bound) {
 +              /* Before we change the PTE, the GPU must not be accessing it.
 +               * If we wait upon the object, we know that all the bound
 +               * VMA are no longer active.
 +               */
                ret = i915_gem_object_wait_rendering(obj, false);
                if (ret)
                        return ret;
  
 -              i915_gem_object_finish_gtt(obj);
 -
 -              /* Before SandyBridge, you could not use tiling or fence
 -               * registers with snooped memory, so relinquish any fences
 -               * currently pointing to our region in the aperture.
 -               */
 -              if (INTEL_INFO(dev)->gen < 6) {
 +              if (!HAS_LLC(dev) && cache_level != I915_CACHE_NONE) {
 +                      /* Access to snoopable pages through the GTT is
 +                       * incoherent and on some machines causes a hard
 +                       * lockup. Relinquish the CPU mmapping to force
 +                       * userspace to refault in the pages and we can
 +                       * then double check if the GTT mapping is still
 +                       * valid for that pointer access.
 +                       */
 +                      i915_gem_release_mmap(obj);
 +
 +                      /* As we no longer need a fence for GTT access,
 +                       * we can relinquish it now (and so prevent having
 +                       * to steal a fence from someone else on the next
 +                       * fence request). Note GPU activity would have
 +                       * dropped the fence as all snoopable access is
 +                       * supposed to be linear.
 +                       */
                        ret = i915_gem_object_put_fence(obj);
                        if (ret)
                                return ret;
 +              } else {
 +                      /* We either have incoherent backing store and
 +                       * so no GTT access or the architecture is fully
 +                       * coherent. In such cases, existing GTT mmaps
 +                       * ignore the cache bit in the PTE and we can
 +                       * rewrite it without confusing the GPU or having
 +                       * to force userspace to fault back in its mmaps.
 +                       */
                }
  
 -              list_for_each_entry(vma, &obj->vma_list, vma_link)
 -                      if (drm_mm_node_allocated(&vma->node)) {
 -                              ret = i915_vma_bind(vma, cache_level,
 -                                                  PIN_UPDATE);
 -                              if (ret)
 -                                      return ret;
 -                      }
 +              list_for_each_entry(vma, &obj->vma_list, vma_link) {
 +                      if (!drm_mm_node_allocated(&vma->node))
 +                              continue;
 +
 +                      ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
 +                      if (ret)
 +                              return ret;
 +              }
        }
  
        list_for_each_entry(vma, &obj->vma_list, vma_link)
                vma->node.color = cache_level;
        obj->cache_level = cache_level;
  
 +out:
 +      /* Flush the dirty CPU caches to the backing storage so that the
 +       * object is now coherent at its new cache level (with respect
 +       * to the access domain).
 +       */
        if (obj->cache_dirty &&
            obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
            cpu_write_needs_clflush(obj)) {
@@@ -3820,15 -3737,6 +3819,15 @@@ int i915_gem_set_caching_ioctl(struct d
                level = I915_CACHE_NONE;
                break;
        case I915_CACHING_CACHED:
 +              /*
 +               * Due to a HW issue on BXT A stepping, GPU stores via a
 +               * snooped mapping may leave stale data in a corresponding CPU
 +               * cacheline, whereas normally such cachelines would get
 +               * invalidated.
 +               */
 +              if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0)
 +                      return -ENODEV;
 +
                level = I915_CACHE_LLC;
                break;
        case I915_CACHING_DISPLAY:
@@@ -4102,13 -4010,15 +4101,13 @@@ i915_gem_object_do_pin(struct drm_i915_
                        return -EBUSY;
  
                if (i915_vma_misplaced(vma, alignment, flags)) {
 -                      unsigned long offset;
 -                      offset = ggtt_view ? i915_gem_obj_ggtt_offset_view(obj, ggtt_view) :
 -                                           i915_gem_obj_offset(obj, vm);
                        WARN(vma->pin_count,
                             "bo is already pinned in %s with incorrect alignment:"
 -                           " offset=%lx, req.alignment=%x, req.map_and_fenceable=%d,"
 +                           " offset=%08x %08x, req.alignment=%x, req.map_and_fenceable=%d,"
                             " obj->map_and_fenceable=%d\n",
                             ggtt_view ? "ggtt" : "ppgtt",
 -                           offset,
 +                           upper_32_bits(vma->node.start),
 +                           lower_32_bits(vma->node.start),
                             alignment,
                             !!(flags & PIN_MAPPABLE),
                             obj->map_and_fenceable);
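For reference, the new "%08x %08x" pair is just the 64-bit node offset split with the linux/kernel.h helpers; a small illustration with a hypothetical offset:

        u64 offset = 0x0000000123456000ULL;     /* hypothetical GTT offset */
        u32 hi = upper_32_bits(offset);         /* 0x00000001 */
        u32 lo = lower_32_bits(offset);         /* 0x23456000 */
        /* the WARN above then prints "offset=00000001 23456000" */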
@@@ -4615,6 -4525,22 +4614,6 @@@ void i915_gem_init_swizzling(struct drm
                BUG();
  }
  
 -static bool
 -intel_enable_blt(struct drm_device *dev)
 -{
 -      if (!HAS_BLT(dev))
 -              return false;
 -
 -      /* The blitter was dysfunctional on early prototypes */
 -      if (IS_GEN6(dev) && dev->pdev->revision < 8) {
 -              DRM_INFO("BLT not supported on this pre-production hardware;"
 -                       " graphics performance will be degraded.\n");
 -              return false;
 -      }
 -
 -      return true;
 -}
 -
  static void init_unused_ring(struct drm_device *dev, u32 base)
  {
        struct drm_i915_private *dev_priv = dev->dev_private;
@@@ -4657,7 -4583,7 +4656,7 @@@ int i915_gem_init_rings(struct drm_devi
                        goto cleanup_render_ring;
        }
  
 -      if (intel_enable_blt(dev)) {
 +      if (HAS_BLT(dev)) {
                ret = intel_init_blt_ring_buffer(dev);
                if (ret)
                        goto cleanup_bsd_ring;
                        goto cleanup_vebox_ring;
        }
  
 -      ret = i915_gem_set_seqno(dev, ((u32)~0 - 0x1000));
 -      if (ret)
 -              goto cleanup_bsd2_ring;
 -
        return 0;
  
 -cleanup_bsd2_ring:
 -      intel_cleanup_ring_buffer(&dev_priv->ring[VCS2]);
  cleanup_vebox_ring:
        intel_cleanup_ring_buffer(&dev_priv->ring[VECS]);
  cleanup_blt_ring:
@@@ -4746,33 -4678,6 +4745,33 @@@ i915_gem_init_hw(struct drm_device *dev
                        goto out;
        }
  
 +      /* We can't enable contexts until all firmware is loaded */
 +      if (HAS_GUC_UCODE(dev)) {
 +              ret = intel_guc_ucode_load(dev);
 +              if (ret) {
 +                      /*
 +                       * If we got an error and GuC submission is enabled, map
 +                       * the error to -EIO so the GPU will be declared wedged.
 +                       * OTOH, if we didn't intend to use the GuC anyway, just
 +                       * discard the error and carry on.
 +                       */
 +                      DRM_ERROR("Failed to initialize GuC, error %d%s\n", ret,
 +                                i915.enable_guc_submission ? "" :
 +                                " (ignored)");
 +                      ret = i915.enable_guc_submission ? -EIO : 0;
 +                      if (ret)
 +                              goto out;
 +              }
 +      }
 +
 +      /*
 +       * Increment the next seqno by 0x100 so we have a visible break
 +       * on re-initialisation
 +       */
 +      ret = i915_gem_set_seqno(dev, dev_priv->next_seqno+0x100);
 +      if (ret)
 +              goto out;
 +
        /* Now it is safe to go back round and do everything else: */
        for_each_ring(ring, dev_priv, i) {
                struct drm_i915_gem_request *req;
@@@ -4910,6 -4815,18 +4909,6 @@@ init_ring_lists(struct intel_engine_cs 
        INIT_LIST_HEAD(&ring->request_list);
  }
  
 -void i915_init_vm(struct drm_i915_private *dev_priv,
 -                struct i915_address_space *vm)
 -{
 -      if (!i915_is_ggtt(vm))
 -              drm_mm_init(&vm->mm, vm->start, vm->total);
 -      vm->dev = dev_priv->dev;
 -      INIT_LIST_HEAD(&vm->active_list);
 -      INIT_LIST_HEAD(&vm->inactive_list);
 -      INIT_LIST_HEAD(&vm->global_link);
 -      list_add_tail(&vm->global_link, &dev_priv->vm_list);
 -}
 -
  void
  i915_gem_load(struct drm_device *dev)
  {
                                  NULL);
  
        INIT_LIST_HEAD(&dev_priv->vm_list);
 -      i915_init_vm(dev_priv, &dev_priv->gtt.base);
 -
        INIT_LIST_HEAD(&dev_priv->context_list);
        INIT_LIST_HEAD(&dev_priv->mm.unbound_list);
        INIT_LIST_HEAD(&dev_priv->mm.bound_list);
                dev_priv->num_fence_regs =
                                I915_READ(vgtif_reg(avail_rs.fence_num));
  
 +      /*
 +       * Set initial sequence number for requests.
 +       * Using this number allows the wraparound to happen early,
 +       * catching any obvious problems.
 +       */
 +      dev_priv->next_seqno = ((u32)~0 - 0x1100);
 +      dev_priv->last_seqno = ((u32)~0 - 0x1101);
 +
        /* Initialize fence registers to zero */
        INIT_LIST_HEAD(&dev_priv->mm.fence_list);
        i915_gem_restore_fences(dev);
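These two constants start the request counter just below the top of the u32 range, so the sequence number wraps almost immediately in normal use; a rough arithmetic sketch (values taken from the hunks above):

        u32 seqno = (u32)~0 - 0x1100;   /* 0xffffeeff, initial next_seqno */
        seqno += 0x100;                 /* the visible break added in init_hw */
        /* after roughly 0x1000 further requests the counter crosses
         * 0xffffffff and wraps, exercising the wraparound paths early */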
@@@ -5037,9 -4948,9 +5036,9 @@@ int i915_gem_open(struct drm_device *de
  
  /**
   * i915_gem_track_fb - update frontbuffer tracking
 - * old: current GEM buffer for the frontbuffer slots
 - * new: new GEM buffer for the frontbuffer slots
 - * frontbuffer_bits: bitmask of frontbuffer slots
 + * @old: current GEM buffer for the frontbuffer slots
 + * @new: new GEM buffer for the frontbuffer slots
 + * @frontbuffer_bits: bitmask of frontbuffer slots
   *
   * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
   * from @old and setting them in @new. Both @old and @new can be NULL.
@@@ -5062,8 -4973,9 +5061,8 @@@ void i915_gem_track_fb(struct drm_i915_
  }
  
  /* All the new VM stuff */
 -unsigned long
 -i915_gem_obj_offset(struct drm_i915_gem_object *o,
 -                  struct i915_address_space *vm)
 +u64 i915_gem_obj_offset(struct drm_i915_gem_object *o,
 +                      struct i915_address_space *vm)
  {
        struct drm_i915_private *dev_priv = o->base.dev->dev_private;
        struct i915_vma *vma;
        return -1;
  }
  
 -unsigned long
 -i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
 -                            const struct i915_ggtt_view *view)
 +u64 i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
 +                                const struct i915_ggtt_view *view)
  {
        struct i915_address_space *ggtt = i915_obj_to_ggtt(o);
        struct i915_vma *vma;
index dcdaa79e3f0faa0dcd9288a0f14cf9e4e996210c,59ab264c99c4f4a87c92cf654f14cc0806539cd6..2aba774f835b9caca8e9e1645d1efd6cf6f08bf9
@@@ -1007,29 -1007,26 +1007,29 @@@ int ib_init_ah_from_path(struct ib_devi
        force_grh = rdma_cap_eth_ah(device, port_num);
  
        if (rec->hop_limit > 1 || force_grh) {
 +              struct net_device *ndev = ib_get_ndev_from_path(rec);
 +
                ah_attr->ah_flags = IB_AH_GRH;
                ah_attr->grh.dgid = rec->dgid;
  
 -              ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
 +              ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num,
                                         &gid_index);
 -              if (ret)
 +              if (ret) {
 +                      if (ndev)
 +                              dev_put(ndev);
                        return ret;
 +              }
  
                ah_attr->grh.sgid_index    = gid_index;
                ah_attr->grh.flow_label    = be32_to_cpu(rec->flow_label);
                ah_attr->grh.hop_limit     = rec->hop_limit;
                ah_attr->grh.traffic_class = rec->traffic_class;
 +              if (ndev)
 +                      dev_put(ndev);
        }
        if (force_grh) {
                memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
 -              ah_attr->vlan_id = rec->vlan_id;
 -      } else {
 -              ah_attr->vlan_id = 0xffff;
        }
 -
        return 0;
  }
  EXPORT_SYMBOL(ib_init_ah_from_path);
@@@ -1086,7 -1083,7 +1086,7 @@@ static void init_mad(struct ib_sa_mad *
  
  static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
  {
-       bool preload = !!(gfp_mask & __GFP_WAIT);
+       bool preload = gfpflags_allow_blocking(gfp_mask);
        unsigned long flags;
        int ret, id;
  
@@@ -1153,9 -1150,9 +1153,9 @@@ static void ib_sa_path_rec_callback(str
  
                ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
                          mad->data, &rec);
 -              rec.vlan_id = 0xffff;
 +              rec.net = NULL;
 +              rec.ifindex = 0;
                memset(rec.dmac, 0, ETH_ALEN);
 -              memset(rec.smac, 0, ETH_ALEN);
                query->callback(status, &rec, query->context);
        } else
                query->callback(status, NULL, query->context);
index 0d533bba4ad18097e447be3dca3fa41e5d9abeab,3aa0116c843c050490b675498ac09e825ba78807..8b2be1e7714f8bb7aa1d62193d3b3320fab64414
@@@ -89,6 -89,8 +89,6 @@@ static struct dma_map_ops amd_iommu_dma
  struct iommu_dev_data {
        struct list_head list;            /* For domain->dev_list */
        struct list_head dev_data_list;   /* For global dev_data_list */
 -      struct list_head alias_list;      /* Link alias-groups together */
 -      struct iommu_dev_data *alias_data;/* The alias dev_data */
        struct protection_domain *domain; /* Domain the device is bound to */
        u16 devid;                        /* PCI Device ID */
        bool iommu_v2;                    /* Device can make use of IOMMUv2 */
@@@ -134,6 -136,8 +134,6 @@@ static struct iommu_dev_data *alloc_dev
        if (!dev_data)
                return NULL;
  
 -      INIT_LIST_HEAD(&dev_data->alias_list);
 -
        dev_data->devid = devid;
  
        spin_lock_irqsave(&dev_data_list_lock, flags);
        return dev_data;
  }
  
 -static void free_dev_data(struct iommu_dev_data *dev_data)
 -{
 -      unsigned long flags;
 -
 -      spin_lock_irqsave(&dev_data_list_lock, flags);
 -      list_del(&dev_data->dev_data_list);
 -      spin_unlock_irqrestore(&dev_data_list_lock, flags);
 -
 -      kfree(dev_data);
 -}
 -
  static struct iommu_dev_data *search_dev_data(u16 devid)
  {
        struct iommu_dev_data *dev_data;
@@@ -296,10 -311,73 +296,10 @@@ out
        iommu_group_put(group);
  }
  
 -static int __last_alias(struct pci_dev *pdev, u16 alias, void *data)
 -{
 -      *(u16 *)data = alias;
 -      return 0;
 -}
 -
 -static u16 get_alias(struct device *dev)
 -{
 -      struct pci_dev *pdev = to_pci_dev(dev);
 -      u16 devid, ivrs_alias, pci_alias;
 -
 -      devid = get_device_id(dev);
 -      ivrs_alias = amd_iommu_alias_table[devid];
 -      pci_for_each_dma_alias(pdev, __last_alias, &pci_alias);
 -
 -      if (ivrs_alias == pci_alias)
 -              return ivrs_alias;
 -
 -      /*
 -       * DMA alias showdown
 -       *
 -       * The IVRS is fairly reliable in telling us about aliases, but it
 -       * can't know about every screwy device.  If we don't have an IVRS
 -       * reported alias, use the PCI reported alias.  In that case we may
 -       * still need to initialize the rlookup and dev_table entries if the
 -       * alias is to a non-existent device.
 -       */
 -      if (ivrs_alias == devid) {
 -              if (!amd_iommu_rlookup_table[pci_alias]) {
 -                      amd_iommu_rlookup_table[pci_alias] =
 -                              amd_iommu_rlookup_table[devid];
 -                      memcpy(amd_iommu_dev_table[pci_alias].data,
 -                             amd_iommu_dev_table[devid].data,
 -                             sizeof(amd_iommu_dev_table[pci_alias].data));
 -              }
 -
 -              return pci_alias;
 -      }
 -
 -      pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d "
 -              "for device %s[%04x:%04x], kernel reported alias "
 -              "%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias),
 -              PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device,
 -              PCI_BUS_NUM(pci_alias), PCI_SLOT(pci_alias),
 -              PCI_FUNC(pci_alias));
 -
 -      /*
 -       * If we don't have a PCI DMA alias and the IVRS alias is on the same
 -       * bus, then the IVRS table may know about a quirk that we don't.
 -       */
 -      if (pci_alias == devid &&
 -          PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) {
 -              pdev->dev_flags |= PCI_DEV_FLAGS_DMA_ALIAS_DEVFN;
 -              pdev->dma_alias_devfn = ivrs_alias & 0xff;
 -              pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n",
 -                      PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias),
 -                      dev_name(dev));
 -      }
 -
 -      return ivrs_alias;
 -}
 -
  static int iommu_init_device(struct device *dev)
  {
        struct pci_dev *pdev = to_pci_dev(dev);
        struct iommu_dev_data *dev_data;
 -      u16 alias;
  
        if (dev->archdata.iommu)
                return 0;
        if (!dev_data)
                return -ENOMEM;
  
 -      alias = get_alias(dev);
 -
 -      if (alias != dev_data->devid) {
 -              struct iommu_dev_data *alias_data;
 -
 -              alias_data = find_dev_data(alias);
 -              if (alias_data == NULL) {
 -                      pr_err("AMD-Vi: Warning: Unhandled device %s\n",
 -                                      dev_name(dev));
 -                      free_dev_data(dev_data);
 -                      return -ENOTSUPP;
 -              }
 -              dev_data->alias_data = alias_data;
 -
 -              /* Add device to the alias_list */
 -              list_add(&dev_data->alias_list, &alias_data->alias_list);
 -      }
 -
        if (pci_iommuv2_capable(pdev)) {
                struct amd_iommu *iommu;
  
@@@ -349,6 -445,9 +349,6 @@@ static void iommu_uninit_device(struct 
  
        iommu_group_remove_device(dev);
  
 -      /* Unlink from alias, it may change if another device is re-plugged */
 -      dev_data->alias_data = NULL;
 -
        /* Remove dma-ops */
        dev->archdata.dma_ops = NULL;
  
@@@ -534,7 -633,7 +534,7 @@@ static void iommu_poll_events(struct am
  
        while (head != tail) {
                iommu_print_event(iommu, iommu->evt_buf + head);
 -              head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
 +              head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
        }
  
        writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
@@@ -684,7 -783,7 +684,7 @@@ static void copy_cmd_to_buffer(struct a
        u8 *target;
  
        target = iommu->cmd_buf + tail;
 -      tail   = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
 +      tail   = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
  
        /* Copy command to buffer */
        memcpy(target, cmd, sizeof(*cmd));
@@@ -851,13 -950,15 +851,13 @@@ static int iommu_queue_command_sync(str
        u32 left, tail, head, next_tail;
        unsigned long flags;
  
 -      WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
 -
  again:
        spin_lock_irqsave(&iommu->lock, flags);
  
        head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
        tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 -      next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
 -      left      = (head - next_tail) % iommu->cmd_buf_size;
 +      next_tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
 +      left      = (head - next_tail) % CMD_BUFFER_SIZE;
  
        if (left <= 2) {
                struct iommu_cmd sync_cmd;
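With the command queue now a fixed compile-time size, the free-space computation no longer needs the per-iommu cmd_buf_size. A standalone sketch of the same ring arithmetic (names are illustrative; it assumes the buffer size is a power of two, as CMD_BUFFER_SIZE is):

#define RING_SIZE       8192    /* stand-in for CMD_BUFFER_SIZE */

static unsigned int ring_space_left(unsigned int head, unsigned int tail,
                                    unsigned int entry_size)
{
        unsigned int next_tail = (tail + entry_size) % RING_SIZE;

        /* unsigned wrap-around plus the modulo yields the bytes left
         * between the new tail and the hardware's read pointer */
        return (head - next_tail) % RING_SIZE;
}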
@@@ -1013,15 -1114,11 +1013,15 @@@ static int device_flush_iotlb(struct io
  static int device_flush_dte(struct iommu_dev_data *dev_data)
  {
        struct amd_iommu *iommu;
 +      u16 alias;
        int ret;
  
        iommu = amd_iommu_rlookup_table[dev_data->devid];
 +      alias = amd_iommu_alias_table[dev_data->devid];
  
        ret = iommu_flush_dte(iommu, dev_data->devid);
 +      if (!ret && alias != dev_data->devid)
 +              ret = iommu_flush_dte(iommu, alias);
        if (ret)
                return ret;
  
@@@ -1877,8 -1974,8 +1877,8 @@@ static void set_dte_entry(u16 devid, st
  static void clear_dte_entry(u16 devid)
  {
        /* remove entry from the device table seen by the hardware */
 -      amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
 -      amd_iommu_dev_table[devid].data[1] = 0;
 +      amd_iommu_dev_table[devid].data[0]  = IOMMU_PTE_P | IOMMU_PTE_TV;
 +      amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;
  
        amd_iommu_apply_erratum_63(devid);
  }
@@@ -1887,33 -1984,27 +1887,33 @@@ static void do_attach(struct iommu_dev_
                      struct protection_domain *domain)
  {
        struct amd_iommu *iommu;
 +      u16 alias;
        bool ats;
  
        iommu = amd_iommu_rlookup_table[dev_data->devid];
 +      alias = amd_iommu_alias_table[dev_data->devid];
        ats   = dev_data->ats.enabled;
  
        /* Update data structures */
        dev_data->domain = domain;
        list_add(&dev_data->list, &domain->dev_list);
 -      set_dte_entry(dev_data->devid, domain, ats);
  
        /* Do reference counting */
        domain->dev_iommu[iommu->index] += 1;
        domain->dev_cnt                 += 1;
  
 -      /* Flush the DTE entry */
 +      /* Update device table */
 +      set_dte_entry(dev_data->devid, domain, ats);
 +      if (alias != dev_data->devid)
 +              set_dte_entry(alias, domain, ats);
 +
        device_flush_dte(dev_data);
  }
  
  static void do_detach(struct iommu_dev_data *dev_data)
  {
        struct amd_iommu *iommu;
 +      u16 alias;
  
        /*
         * First check if the device is still attached. It might already
                return;
  
        iommu = amd_iommu_rlookup_table[dev_data->devid];
 +      alias = amd_iommu_alias_table[dev_data->devid];
  
        /* decrease reference counters */
        dev_data->domain->dev_iommu[iommu->index] -= 1;
        dev_data->domain = NULL;
        list_del(&dev_data->list);
        clear_dte_entry(dev_data->devid);
 +      if (alias != dev_data->devid)
 +              clear_dte_entry(alias);
  
        /* Flush the DTE entry */
        device_flush_dte(dev_data);
  static int __attach_device(struct iommu_dev_data *dev_data,
                           struct protection_domain *domain)
  {
 -      struct iommu_dev_data *head, *entry;
        int ret;
  
 +      /*
 +       * Must be called with IRQs disabled. Warn here to detect early
 +       * when it's not.
 +       */
 +      WARN_ON(!irqs_disabled());
 +
        /* lock domain */
        spin_lock(&domain->lock);
  
 -      head = dev_data;
 -
 -      if (head->alias_data != NULL)
 -              head = head->alias_data;
 -
 -      /* Now we have the root of the alias group, if any */
 -
        ret = -EBUSY;
 -      if (head->domain != NULL)
 +      if (dev_data->domain != NULL)
                goto out_unlock;
  
        /* Attach alias group root */
 -      do_attach(head, domain);
 -
 -      /* Attach other devices in the alias group */
 -      list_for_each_entry(entry, &head->alias_list, alias_list)
 -              do_attach(entry, domain);
 +      do_attach(dev_data, domain);
  
        ret = 0;
  
@@@ -2115,24 -2209,26 +2115,24 @@@ static int attach_device(struct device 
   */
  static void __detach_device(struct iommu_dev_data *dev_data)
  {
 -      struct iommu_dev_data *head, *entry;
        struct protection_domain *domain;
 -      unsigned long flags;
  
 -      BUG_ON(!dev_data->domain);
 -
 -      domain = dev_data->domain;
 +      /*
 +       * Must be called with IRQs disabled. Warn here to detect early
 +       * when its not.
 +       * when it's not.
 +      WARN_ON(!irqs_disabled());
  
 -      spin_lock_irqsave(&domain->lock, flags);
 +      if (WARN_ON(!dev_data->domain))
 +              return;
  
 -      head = dev_data;
 -      if (head->alias_data != NULL)
 -              head = head->alias_data;
 +      domain = dev_data->domain;
  
 -      list_for_each_entry(entry, &head->alias_list, alias_list)
 -              do_detach(entry);
 +      spin_lock(&domain->lock);
  
 -      do_detach(head);
 +      do_detach(dev_data);
  
 -      spin_unlock_irqrestore(&domain->lock, flags);
 +      spin_unlock(&domain->lock);
  }
  
  /*
@@@ -2668,7 -2764,7 +2668,7 @@@ static void *alloc_coherent(struct devi
  
        page = alloc_pages(flag | __GFP_NOWARN,  get_order(size));
        if (!page) {
-               if (!(flag & __GFP_WAIT))
+               if (!gfpflags_allow_blocking(flag))
                        return NULL;
  
                page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
@@@ -3102,7 -3198,6 +3102,7 @@@ static const struct iommu_ops amd_iommu
        .iova_to_phys = amd_iommu_iova_to_phys,
        .add_device = amd_iommu_add_device,
        .remove_device = amd_iommu_remove_device,
 +      .device_group = pci_device_group,
        .get_dm_regions = amd_iommu_get_dm_regions,
        .put_dm_regions = amd_iommu_put_dm_regions,
        .pgsize_bitmap  = AMD_IOMMU_PGSIZES,
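Taken together, the amd_iommu changes above drop the linked alias-group bookkeeping: the alias is looked up from amd_iommu_alias_table[] wherever it is needed, and each device-table update is applied to both IDs. A condensed sketch of the pattern (mirroring do_attach/do_detach, not a drop-in function):

        u16 devid = dev_data->devid;
        u16 alias = amd_iommu_alias_table[devid];

        set_dte_entry(devid, domain, ats);
        if (alias != devid)
                set_dte_entry(alias, domain, ats);      /* keep the alias DTE in sync */

        device_flush_dte(dev_data);     /* flushes devid and, when different, the alias */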
index 7cf80c1a8a1607f2d2ea675b270600bed70c6ad7,0c139f63b8bedd5201370289ebba66ad742ed87c..f1042daef9ada83e931ae450623ce491ebd55959
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/mempool.h>
  #include <linux/memory.h>
  #include <linux/timer.h>
 +#include <linux/io.h>
  #include <linux/iova.h>
  #include <linux/iommu.h>
  #include <linux/intel-iommu.h>
@@@ -419,13 -418,10 +419,13 @@@ struct device_domain_info 
        struct list_head global; /* link to global list */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
 -      struct {
 -              u8 enabled:1;
 -              u8 qdep;
 -      } ats;                  /* ATS state */
 +      u8 pasid_supported:3;
 +      u8 pasid_enabled:1;
 +      u8 pri_supported:1;
 +      u8 pri_enabled:1;
 +      u8 ats_supported:1;
 +      u8 ats_enabled:1;
 +      u8 ats_qdep;
        struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
@@@ -501,37 -497,13 +501,37 @@@ static int dmar_forcedac
  static int intel_iommu_strict;
  static int intel_iommu_superpage = 1;
  static int intel_iommu_ecs = 1;
 +static int intel_iommu_pasid28;
 +static int iommu_identity_mapping;
 +
 +#define IDENTMAP_ALL          1
 +#define IDENTMAP_GFX          2
 +#define IDENTMAP_AZALIA               4
  
 -/* We only actually use ECS when PASID support (on the new bit 40)
 - * is also advertised. Some early implementations — the ones with
 - * PASID support on bit 28 — have issues even when we *only* use
 - * extended root/context tables. */
 +/* Broadwell and Skylake have broken ECS support — normal so-called "second
 + * level" translation of DMA requests-without-PASID doesn't actually happen
 + * unless you also set the NESTE bit in an extended context-entry. Which of
 + * course means that SVM doesn't work because it's trying to do nested
 + * translation of the physical addresses it finds in the process page tables,
 + * through the IOVA->phys mapping found in the "second level" page tables.
 + *
 + * The VT-d specification was retroactively changed to change the definition
 + * of the capability bits and pretend that Broadwell/Skylake never happened...
 + * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 + * for some reason it was the PASID capability bit which was redefined (from
 + * bit 28 on BDW/SKL to bit 40 in future).
 + *
 + * So our test for ECS needs to eschew those implementations which set the old
 + * PASID capability bit 28, since those are the ones on which ECS is broken.
 + * Unless we are working around the 'pasid28' limitations, that is, by putting
 + * the device into passthrough mode for normal DMA and thus masking the bug.
 + */
  #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 -                          ecap_pasid(iommu->ecap))
 +                          (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 +/* PASID support is thus enabled if ECS is enabled and *either* of the old
 + * or new capability bits are set. */
 +#define pasid_enabled(iommu) (ecs_enabled(iommu) &&                   \
 +                            (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
  
  int intel_iommu_gfx_mapped;
  EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
@@@ -594,11 -566,6 +594,11 @@@ static int __init intel_iommu_setup(cha
                        printk(KERN_INFO
                                "Intel-IOMMU: disable extended context table support\n");
                        intel_iommu_ecs = 0;
 +              } else if (!strncmp(str, "pasid28", 7)) {
 +                      printk(KERN_INFO
 +                              "Intel-IOMMU: enable pre-production PASID support\n");
 +                      intel_iommu_pasid28 = 1;
 +                      iommu_identity_mapping |= IDENTMAP_GFX;
                }
  
                str += strcspn(str, ",");
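The workaround is opt-in from the kernel command line; with this parser, something along these lines would enable it (illustrative, and per the code above it also forces identity mapping for the graphics device):

        intel_iommu=on,pasid28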
@@@ -1440,22 -1407,37 +1440,22 @@@ static struct device_domain_info 
  iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
                         u8 bus, u8 devfn)
  {
 -      bool found = false;
        struct device_domain_info *info;
 -      struct pci_dev *pdev;
  
        assert_spin_locked(&device_domain_lock);
  
 -      if (!ecap_dev_iotlb_support(iommu->ecap))
 -              return NULL;
 -
        if (!iommu->qi)
                return NULL;
  
        list_for_each_entry(info, &domain->devices, link)
                if (info->iommu == iommu && info->bus == bus &&
                    info->devfn == devfn) {
 -                      found = true;
 +                      if (info->ats_supported && info->dev)
 +                              return info;
                        break;
                }
  
 -      if (!found || !info->dev || !dev_is_pci(info->dev))
 -              return NULL;
 -
 -      pdev = to_pci_dev(info->dev);
 -
 -      if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
 -              return NULL;
 -
 -      if (!dmar_find_matched_atsr_unit(pdev))
 -              return NULL;
 -
 -      return info;
 +      return NULL;
  }
  
  static void iommu_enable_dev_iotlb(struct device_domain_info *info)
                return;
  
        pdev = to_pci_dev(info->dev);
 -      if (pci_enable_ats(pdev, VTD_PAGE_SHIFT))
 -              return;
  
 -      info->ats.enabled = 1;
 -      info->ats.qdep = pci_ats_queue_depth(pdev);
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      /* The PCIe spec, in its wisdom, declares that the behaviour of
 +         the device if you enable PASID support after ATS support is
 +         undefined. So always enable PASID support on devices which
 +         have it, even if we can't yet know if we're ever going to
 +         use it. */
 +      if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
 +              info->pasid_enabled = 1;
 +
 +      if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
 +              info->pri_enabled = 1;
 +#endif
 +      if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
 +              info->ats_enabled = 1;
 +              info->ats_qdep = pci_ats_queue_depth(pdev);
 +      }
  }
  
  static void iommu_disable_dev_iotlb(struct device_domain_info *info)
  {
 -      if (!info->ats.enabled)
 +      struct pci_dev *pdev;
 +
 +      if (!dev_is_pci(info->dev))
                return;
  
 -      pci_disable_ats(to_pci_dev(info->dev));
 -      info->ats.enabled = 0;
 +      pdev = to_pci_dev(info->dev);
 +
 +      if (info->ats_enabled) {
 +              pci_disable_ats(pdev);
 +              info->ats_enabled = 0;
 +      }
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      if (info->pri_enabled) {
 +              pci_disable_pri(pdev);
 +              info->pri_enabled = 0;
 +      }
 +      if (info->pasid_enabled) {
 +              pci_disable_pasid(pdev);
 +              info->pasid_enabled = 0;
 +      }
 +#endif
  }
  
  static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
  
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
 -              if (!info->ats.enabled)
 +              if (!info->ats_enabled)
                        continue;
  
                sid = info->bus << 8 | info->devfn;
 -              qdep = info->ats.qdep;
 +              qdep = info->ats_qdep;
                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);
@@@ -1713,14 -1667,6 +1713,14 @@@ static void free_dmar_iommu(struct inte
  
        /* free context mapping */
        free_context_table(iommu);
 +
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      if (pasid_enabled(iommu)) {
 +              if (ecap_prs(iommu->ecap))
 +                      intel_svm_finish_prq(iommu);
 +              intel_svm_free_pasid_tables(iommu);
 +      }
 +#endif
  }
  
  static struct dmar_domain *alloc_domain(int flags)
@@@ -1988,10 -1934,8 +1988,10 @@@ static int domain_context_mapping_one(s
                }
  
                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
 -              translation = info ? CONTEXT_TT_DEV_IOTLB :
 -                                   CONTEXT_TT_MULTI_LEVEL;
 +              if (info && info->ats_supported)
 +                      translation = CONTEXT_TT_DEV_IOTLB;
 +              else
 +                      translation = CONTEXT_TT_MULTI_LEVEL;
  
                context_set_address_root(context, virt_to_phys(pgd));
                context_set_address_width(context, iommu->agaw);
@@@ -2171,19 -2115,15 +2171,19 @@@ static int __domain_mapping(struct dmar
                                return -ENOMEM;
                        /* It is large page*/
                        if (largepage_lvl > 1) {
 +                              unsigned long nr_superpages, end_pfn;
 +
                                pteval |= DMA_PTE_LARGE_PAGE;
                                lvl_pages = lvl_to_nr_pages(largepage_lvl);
 +
 +                              nr_superpages = sg_res / lvl_pages;
 +                              end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
 +
                                /*
                                 * Ensure that old small page tables are
 -                               * removed to make room for superpage,
 -                               * if they exist.
 +                               * removed to make room for superpage(s).
                                 */
 -                              dma_pte_free_pagetable(domain, iov_pfn,
 -                                                     iov_pfn + lvl_pages - 1);
 +                              dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }
@@@ -2329,34 -2269,12 +2329,34 @@@ static struct dmar_domain *dmar_insert_
  
        info->bus = bus;
        info->devfn = devfn;
 -      info->ats.enabled = 0;
 -      info->ats.qdep = 0;
 +      info->ats_supported = info->pasid_supported = info->pri_supported = 0;
 +      info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
 +      info->ats_qdep = 0;
        info->dev = dev;
        info->domain = domain;
        info->iommu = iommu;
  
 +      if (dev && dev_is_pci(dev)) {
 +              struct pci_dev *pdev = to_pci_dev(info->dev);
 +
 +              if (ecap_dev_iotlb_support(iommu->ecap) &&
 +                  pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
 +                  dmar_find_matched_atsr_unit(pdev))
 +                      info->ats_supported = 1;
 +
 +              if (ecs_enabled(iommu)) {
 +                      if (pasid_enabled(iommu)) {
 +                              int features = pci_pasid_features(pdev);
 +                              if (features >= 0)
 +                                      info->pasid_supported = features | 1;
 +                      }
 +
 +                      if (info->ats_supported && ecap_prs(iommu->ecap) &&
 +                          pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
 +                              info->pri_supported = 1;
 +              }
 +      }
 +
        spin_lock_irqsave(&device_domain_lock, flags);
        if (dev)
                found = find_domain(dev);
@@@ -2482,6 -2400,11 +2482,6 @@@ found_domain
        return domain;
  }
  
 -static int iommu_identity_mapping;
 -#define IDENTMAP_ALL          1
 -#define IDENTMAP_GFX          2
 -#define IDENTMAP_AZALIA               4
 -
  static int iommu_domain_identity_map(struct dmar_domain *domain,
                                     unsigned long long start,
                                     unsigned long long end)
                                  DMA_PTE_READ|DMA_PTE_WRITE);
  }
  
 -static int iommu_prepare_identity_map(struct device *dev,
 -                                    unsigned long long start,
 -                                    unsigned long long end)
 +static int domain_prepare_identity_map(struct device *dev,
 +                                     struct dmar_domain *domain,
 +                                     unsigned long long start,
 +                                     unsigned long long end)
  {
 -      struct dmar_domain *domain;
 -      int ret;
 -
 -      domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 -      if (!domain)
 -              return -ENOMEM;
 -
        /* For _hardware_ passthrough, don't bother. But for software
           passthrough, we do it anyway -- it may indicate a memory
           range which is reserved in E820, so which didn't get set
                        dmi_get_system_info(DMI_BIOS_VENDOR),
                        dmi_get_system_info(DMI_BIOS_VERSION),
                     dmi_get_system_info(DMI_PRODUCT_VERSION));
 -              ret = -EIO;
 -              goto error;
 +              return -EIO;
        }
  
        if (end >> agaw_to_width(domain->agaw)) {
                     dmi_get_system_info(DMI_BIOS_VENDOR),
                     dmi_get_system_info(DMI_BIOS_VERSION),
                     dmi_get_system_info(DMI_PRODUCT_VERSION));
 -              ret = -EIO;
 -              goto error;
 +              return -EIO;
        }
  
 -      ret = iommu_domain_identity_map(domain, start, end);
 -      if (ret)
 -              goto error;
 +      return iommu_domain_identity_map(domain, start, end);
 +}
  
 -      return 0;
 +static int iommu_prepare_identity_map(struct device *dev,
 +                                    unsigned long long start,
 +                                    unsigned long long end)
 +{
 +      struct dmar_domain *domain;
 +      int ret;
 +
 +      domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 +      if (!domain)
 +              return -ENOMEM;
 +
 +      ret = domain_prepare_identity_map(dev, domain, start, end);
 +      if (ret)
 +              domain_exit(domain);
  
 - error:
 -      domain_exit(domain);
        return ret;
  }
  
@@@ -2887,18 -2808,18 +2887,18 @@@ static void intel_iommu_init_qi(struct 
  }
  
  static int copy_context_table(struct intel_iommu *iommu,
 -                            struct root_entry __iomem *old_re,
 +                            struct root_entry *old_re,
                              struct context_entry **tbl,
                              int bus, bool ext)
  {
        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
 -      struct context_entry __iomem *old_ce = NULL;
        struct context_entry *new_ce = NULL, ce;
 +      struct context_entry *old_ce = NULL;
        struct root_entry re;
        phys_addr_t old_ce_phys;
  
        tbl_idx = ext ? bus * 2 : bus;
 -      memcpy_fromio(&re, old_re, sizeof(re));
 +      memcpy(&re, old_re, sizeof(re));
  
        for (devfn = 0; devfn < 256; devfn++) {
                /* First calculate the correct index */
                        }
  
                        ret = -ENOMEM;
 -                      old_ce = ioremap_cache(old_ce_phys, PAGE_SIZE);
 +                      old_ce = memremap(old_ce_phys, PAGE_SIZE,
 +                                      MEMREMAP_WB);
                        if (!old_ce)
                                goto out;
  
                }
  
                /* Now copy the context entry */
 -              memcpy_fromio(&ce, old_ce + idx, sizeof(ce));
 +              memcpy(&ce, old_ce + idx, sizeof(ce));
  
                if (!__context_present(&ce))
                        continue;
        __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
  
  out_unmap:
 -      iounmap(old_ce);
 +      memunmap(old_ce);
  
  out:
        return ret;
  
  static int copy_translation_tables(struct intel_iommu *iommu)
  {
 -      struct root_entry __iomem *old_rt;
        struct context_entry **ctxt_tbls;
 +      struct root_entry *old_rt;
        phys_addr_t old_rt_phys;
        int ctxt_table_entries;
        unsigned long flags;
        if (!old_rt_phys)
                return -EINVAL;
  
 -      old_rt = ioremap_cache(old_rt_phys, PAGE_SIZE);
 +      old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
        if (!old_rt)
                return -ENOMEM;
  
        ret = 0;
  
  out_unmap:
 -      iounmap(old_rt);
 +      memunmap(old_rt);
  
        return ret;
  }
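The ioremap_cache()/memcpy_fromio() pairs become plain memremap()/memcpy() because the old kernel's root and context tables live in ordinary RAM rather than MMIO. A minimal usage sketch under that assumption (the helper name is hypothetical):

static int copy_old_root(phys_addr_t old_rt_phys, struct root_entry *re)
{
        void *old_tbl = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);

        if (!old_tbl)
                return -ENOMEM;

        memcpy(re, old_tbl, sizeof(*re));       /* plain loads, no __iomem accessors */
        memunmap(old_tbl);
        return 0;
}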
@@@ -3176,10 -3096,6 +3176,10 @@@ static int __init init_dmars(void
  
                if (!ecap_pass_through(iommu->ecap))
                        hw_pass_through = 0;
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +              if (pasid_enabled(iommu))
 +                      intel_svm_alloc_pasid_tables(iommu);
 +#endif
        }
  
        if (iommu_pass_through)
@@@ -3267,13 -3183,6 +3267,13 @@@ domains_done
  
                iommu_flush_write_buffer(iommu);
  
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +              if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
 +                      ret = intel_svm_enable_prq(iommu);
 +                      if (ret)
 +                              goto free_iommu;
 +              }
 +#endif
                ret = dmar_set_interrupt(iommu);
                if (ret)
                        goto free_iommu;
@@@ -3333,10 -3242,7 +3333,10 @@@ static struct iova *intel_alloc_iova(st
  
  static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
  {
 +      struct dmar_rmrr_unit *rmrr;
        struct dmar_domain *domain;
 +      struct device *i_dev;
 +      int i, ret;
  
        domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
        if (!domain) {
                return NULL;
        }
  
 +      /* We have a new domain - setup possible RMRRs for the device */
 +      rcu_read_lock();
 +      for_each_rmrr_units(rmrr) {
 +              for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
 +                                        i, i_dev) {
 +                      if (i_dev != dev)
 +                              continue;
 +
 +                      ret = domain_prepare_identity_map(dev, domain,
 +                                                        rmrr->base_address,
 +                                                        rmrr->end_address);
 +                      if (ret)
 +                              dev_err(dev, "Mapping reserved region failed\n");
 +              }
 +      }
 +      rcu_read_unlock();
 +
        return domain;
  }
  
@@@ -3647,7 -3536,7 +3647,7 @@@ static void *intel_alloc_coherent(struc
                        flags |= GFP_DMA32;
        }
  
-       if (flags & __GFP_WAIT) {
+       if (gfpflags_allow_blocking(flags)) {
                unsigned int count = size >> PAGE_SHIFT;
  
                page = dma_alloc_from_contiguous(dev, count, order);
@@@ -4222,11 -4111,6 +4222,11 @@@ static int intel_iommu_add(struct dmar_
        if (ret)
                goto out;
  
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      if (pasid_enabled(iommu))
 +              intel_svm_alloc_pasid_tables(iommu);
 +#endif
 +
        if (dmaru->ignored) {
                /*
                 * we always have to disable PMRs or DMA may fail on this device
  
        intel_iommu_init_qi(iommu);
        iommu_flush_write_buffer(iommu);
 +
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +      if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
 +              ret = intel_svm_enable_prq(iommu);
 +              if (ret)
 +                      goto disable_iommu;
 +      }
 +#endif
        ret = dmar_set_interrupt(iommu);
        if (ret)
                goto disable_iommu;
@@@ -4314,17 -4190,14 +4314,17 @@@ int dmar_find_matched_atsr_unit(struct 
        dev = pci_physfn(dev);
        for (bus = dev->bus; bus; bus = bus->parent) {
                bridge = bus->self;
 -              if (!bridge || !pci_is_pcie(bridge) ||
 +              /* If it's an integrated device, allow ATS */
 +              if (!bridge)
 +                      return 1;
 +              /* Connected via non-PCIe: no ATS */
 +              if (!pci_is_pcie(bridge) ||
                    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
                        return 0;
 +              /* If we found the root port, look it up in the ATSR */
                if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
                        break;
        }
 -      if (!bridge)
 -              return 0;
  
        rcu_read_lock();
        list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
@@@ -4988,114 -4861,6 +4988,114 @@@ static void intel_iommu_remove_device(s
        iommu_device_unlink(iommu->iommu_dev, dev);
  }
  
 +#ifdef CONFIG_INTEL_IOMMU_SVM
 +int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
 +{
 +      struct device_domain_info *info;
 +      struct context_entry *context;
 +      struct dmar_domain *domain;
 +      unsigned long flags;
 +      u64 ctx_lo;
 +      int ret;
 +
 +      domain = get_valid_domain_for_dev(sdev->dev);
 +      if (!domain)
 +              return -EINVAL;
 +
 +      spin_lock_irqsave(&device_domain_lock, flags);
 +      spin_lock(&iommu->lock);
 +
 +      ret = -EINVAL;
 +      info = sdev->dev->archdata.iommu;
 +      if (!info || !info->pasid_supported)
 +              goto out;
 +
 +      context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
 +      if (WARN_ON(!context))
 +              goto out;
 +
 +      ctx_lo = context[0].lo;
 +
 +      sdev->did = domain->iommu_did[iommu->seq_id];
 +      sdev->sid = PCI_DEVID(info->bus, info->devfn);
 +
 +      if (!(ctx_lo & CONTEXT_PASIDE)) {
 +              context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
 +              context[1].lo = (u64)virt_to_phys(iommu->pasid_table) | ecap_pss(iommu->ecap);
 +              wmb();
 +              /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
 +               * extended to permit requests-with-PASID if the PASIDE bit
 +               * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
 +               * however, the PASIDE bit is ignored and requests-with-PASID
 +               * are unconditionally blocked. Which makes less sense.
 +               * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
 +               * "guest mode" translation types depending on whether ATS
 +               * is available or not. Annoyingly, we can't use the new
 +               * modes *unless* PASIDE is set. */
 +              if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
 +                      ctx_lo &= ~CONTEXT_TT_MASK;
 +                      if (info->ats_supported)
 +                              ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
 +                      else
 +                              ctx_lo |= CONTEXT_TT_PT_PASID << 2;
 +              }
 +              ctx_lo |= CONTEXT_PASIDE;
 +              if (iommu->pasid_state_table)
 +                      ctx_lo |= CONTEXT_DINVE;
 +              if (info->pri_supported)
 +                      ctx_lo |= CONTEXT_PRS;
 +              context[0].lo = ctx_lo;
 +              wmb();
 +              iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
 +                                         DMA_CCMD_MASK_NOBIT,
 +                                         DMA_CCMD_DEVICE_INVL);
 +      }
 +
 +      /* Enable PASID support in the device, if it wasn't already */
 +      if (!info->pasid_enabled)
 +              iommu_enable_dev_iotlb(info);
 +
 +      if (info->ats_enabled) {
 +              sdev->dev_iotlb = 1;
 +              sdev->qdep = info->ats_qdep;
 +              if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
 +                      sdev->qdep = 0;
 +      }
 +      ret = 0;
 +
 + out:
 +      spin_unlock(&iommu->lock);
 +      spin_unlock_irqrestore(&device_domain_lock, flags);
 +
 +      return ret;
 +}
 +
 +struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
 +{
 +      struct intel_iommu *iommu;
 +      u8 bus, devfn;
 +
 +      if (iommu_dummy(dev)) {
 +              dev_warn(dev,
 +                       "No IOMMU translation for device; cannot enable SVM\n");
 +              return NULL;
 +      }
 +
 +      iommu = device_to_iommu(dev, &bus, &devfn);
 +      if (!iommu) {
 +              dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
 +              return NULL;
 +      }
 +
 +      if (!iommu->pasid_table) {
 +              dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
 +              return NULL;
 +      }
 +
 +      return iommu;
 +}
 +#endif /* CONFIG_INTEL_IOMMU_SVM */
 +
  static const struct iommu_ops intel_iommu_ops = {
        .capable        = intel_iommu_capable,
        .domain_alloc   = intel_iommu_domain_alloc,
        .iova_to_phys   = intel_iommu_iova_to_phys,
        .add_device     = intel_iommu_add_device,
        .remove_device  = intel_iommu_remove_device,
 +      .device_group   = pci_device_group,
        .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
  };
  
diff --combined drivers/md/dm-crypt.c
index 3729b394432c9d66c7c219a8a52ddbf9be987869,ca718c98bb9d666a7dd8e5306b049d7fe2fcd3f1..917d47e290ae08be08f4c964a3326f1f67acd077
@@@ -994,7 -994,7 +994,7 @@@ static struct bio *crypt_alloc_buffer(s
        struct bio_vec *bvec;
  
  retry:
-       if (unlikely(gfp_mask & __GFP_WAIT))
+       if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
                mutex_lock(&cc->bio_alloc_lock);
  
        clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
                if (!page) {
                        crypt_free_buffer_pages(cc, clone);
                        bio_put(clone);
-                       gfp_mask |= __GFP_WAIT;
+                       gfp_mask |= __GFP_DIRECT_RECLAIM;
                        goto retry;
                }
  
        }
  
  return_clone:
-       if (unlikely(gfp_mask & __GFP_WAIT))
+       if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
                mutex_unlock(&cc->bio_alloc_lock);
  
        return clone;
@@@ -1544,8 -1544,10 +1544,8 @@@ static void crypt_dtr(struct dm_target 
        if (cc->bs)
                bioset_free(cc->bs);
  
 -      if (cc->page_pool)
 -              mempool_destroy(cc->page_pool);
 -      if (cc->req_pool)
 -              mempool_destroy(cc->req_pool);
 +      mempool_destroy(cc->page_pool);
 +      mempool_destroy(cc->req_pool);
  
        if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
                cc->iv_gen_ops->dtr(cc);
index 1bd2fd47421fc43e02b7ad39c28546e55b3ea69e,fb2cb4bdc0c181258f9a44d7fc95f1f8a9fad263..4432fd69b7cbf86db946fb9aa6bfa753cb21434c
@@@ -458,12 -458,11 +458,12 @@@ static inline u32 vop_usec(const vop_he
  static int solo_fill_jpeg(struct solo_enc_dev *solo_enc,
                          struct vb2_buffer *vb, const vop_header *vh)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct solo_dev *solo_dev = solo_enc->solo_dev;
 -      struct sg_table *vbuf = vb2_dma_sg_plane_desc(vb, 0);
 +      struct sg_table *sgt = vb2_dma_sg_plane_desc(vb, 0);
        int frame_size;
  
 -      vb->v4l2_buf.flags |= V4L2_BUF_FLAG_KEYFRAME;
 +      vbuf->flags |= V4L2_BUF_FLAG_KEYFRAME;
  
        if (vb2_plane_size(vb, 0) < vop_jpeg_size(vh) + solo_enc->jpeg_len)
                return -EIO;
        frame_size = ALIGN(vop_jpeg_size(vh) + solo_enc->jpeg_len, DMA_ALIGN);
        vb2_set_plane_payload(vb, 0, vop_jpeg_size(vh) + solo_enc->jpeg_len);
  
 -      return solo_send_desc(solo_enc, solo_enc->jpeg_len, vbuf,
 +      return solo_send_desc(solo_enc, solo_enc->jpeg_len, sgt,
                             vop_jpeg_offset(vh) - SOLO_JPEG_EXT_ADDR(solo_dev),
                             frame_size, SOLO_JPEG_EXT_ADDR(solo_dev),
                             SOLO_JPEG_EXT_SIZE(solo_dev));
  static int solo_fill_mpeg(struct solo_enc_dev *solo_enc,
                struct vb2_buffer *vb, const vop_header *vh)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct solo_dev *solo_dev = solo_enc->solo_dev;
 -      struct sg_table *vbuf = vb2_dma_sg_plane_desc(vb, 0);
 +      struct sg_table *sgt = vb2_dma_sg_plane_desc(vb, 0);
        int frame_off, frame_size;
        int skip = 0;
  
                return -EIO;
  
        /* If this is a key frame, add extra header */
 -      vb->v4l2_buf.flags &= ~(V4L2_BUF_FLAG_KEYFRAME | V4L2_BUF_FLAG_PFRAME |
 +      vbuf->flags &= ~(V4L2_BUF_FLAG_KEYFRAME | V4L2_BUF_FLAG_PFRAME |
                V4L2_BUF_FLAG_BFRAME);
        if (!vop_type(vh)) {
                skip = solo_enc->vop_len;
 -              vb->v4l2_buf.flags |= V4L2_BUF_FLAG_KEYFRAME;
 +              vbuf->flags |= V4L2_BUF_FLAG_KEYFRAME;
                vb2_set_plane_payload(vb, 0, vop_mpeg_size(vh) +
                        solo_enc->vop_len);
        } else {
 -              vb->v4l2_buf.flags |= V4L2_BUF_FLAG_PFRAME;
 +              vbuf->flags |= V4L2_BUF_FLAG_PFRAME;
                vb2_set_plane_payload(vb, 0, vop_mpeg_size(vh));
        }
  
                sizeof(*vh)) % SOLO_MP4E_EXT_SIZE(solo_dev);
        frame_size = ALIGN(vop_mpeg_size(vh) + skip, DMA_ALIGN);
  
 -      return solo_send_desc(solo_enc, skip, vbuf, frame_off, frame_size,
 +      return solo_send_desc(solo_enc, skip, sgt, frame_off, frame_size,
                        SOLO_MP4E_EXT_ADDR(solo_dev),
                        SOLO_MP4E_EXT_SIZE(solo_dev));
  }
  static int solo_enc_fillbuf(struct solo_enc_dev *solo_enc,
                            struct vb2_buffer *vb, struct solo_enc_buf *enc_buf)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        const vop_header *vh = enc_buf->vh;
        int ret;
  
        }
  
        if (!ret) {
 -              vb->v4l2_buf.sequence = solo_enc->sequence++;
 -              vb->v4l2_buf.timestamp.tv_sec = vop_sec(vh);
 -              vb->v4l2_buf.timestamp.tv_usec = vop_usec(vh);
 +              vbuf->sequence = solo_enc->sequence++;
 +              vbuf->timestamp.tv_sec = vop_sec(vh);
 +              vbuf->timestamp.tv_usec = vop_usec(vh);
  
                /* Check for motion flags */
                if (solo_is_motion_on(solo_enc) && enc_buf->motion) {
                        struct v4l2_event ev = {
                                .type = V4L2_EVENT_MOTION_DET,
                                .u.motion_det = {
 -                                      .flags = V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ,
 -                                      .frame_sequence = vb->v4l2_buf.sequence,
 +                                      .flags
 +                                      = V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ,
 +                                      .frame_sequence = vbuf->sequence,
                                        .region_mask = enc_buf->motion ? 1 : 0,
                                },
                        };
@@@ -575,7 -571,7 +575,7 @@@ static void solo_enc_handle_one(struct 
        list_del(&vb->list);
        spin_unlock_irqrestore(&solo_enc->av_lock, flags);
  
 -      solo_enc_fillbuf(solo_enc, &vb->vb, enc_buf);
 +      solo_enc_fillbuf(solo_enc, &vb->vb.vb2_buf, enc_buf);
  unlock:
        mutex_unlock(&solo_enc->lock);
  }
@@@ -663,7 -659,7 +663,7 @@@ static int solo_ring_thread(void *data
  }
  
  static int solo_enc_queue_setup(struct vb2_queue *q,
 -                              const struct v4l2_format *fmt,
 +                              const void *parg,
                                unsigned int *num_buffers,
                                unsigned int *num_planes, unsigned int sizes[],
                                void *alloc_ctxs[])
  
  static void solo_enc_buf_queue(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct solo_enc_dev *solo_enc = vb2_get_drv_priv(vq);
        struct solo_vb2_buf *solo_vb =
 -              container_of(vb, struct solo_vb2_buf, vb);
 +              container_of(vbuf, struct solo_vb2_buf, vb);
  
        spin_lock(&solo_enc->av_lock);
        list_add_tail(&solo_vb->list, &solo_enc->vidq_active);
@@@ -739,26 -734,25 +739,26 @@@ static void solo_enc_stop_streaming(str
                                struct solo_vb2_buf, list);
  
                list_del(&buf->list);
 -              vb2_buffer_done(&buf->vb, VB2_BUF_STATE_ERROR);
 +              vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR);
        }
        spin_unlock_irqrestore(&solo_enc->av_lock, flags);
  }
  
  static void solo_enc_buf_finish(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct solo_enc_dev *solo_enc = vb2_get_drv_priv(vb->vb2_queue);
 -      struct sg_table *vbuf = vb2_dma_sg_plane_desc(vb, 0);
 +      struct sg_table *sgt = vb2_dma_sg_plane_desc(vb, 0);
  
        switch (solo_enc->fmt) {
        case V4L2_PIX_FMT_MPEG4:
        case V4L2_PIX_FMT_H264:
 -              if (vb->v4l2_buf.flags & V4L2_BUF_FLAG_KEYFRAME)
 -                      sg_copy_from_buffer(vbuf->sgl, vbuf->nents,
 +              if (vbuf->flags & V4L2_BUF_FLAG_KEYFRAME)
 +                      sg_copy_from_buffer(sgt->sgl, sgt->nents,
                                        solo_enc->vop, solo_enc->vop_len);
                break;
        default: /* V4L2_PIX_FMT_MJPEG */
 -              sg_copy_from_buffer(vbuf->sgl, vbuf->nents,
 +              sg_copy_from_buffer(sgt->sgl, sgt->nents,
                                solo_enc->jpeg_header, solo_enc->jpeg_len);
                break;
        }
@@@ -1297,7 -1291,7 +1297,7 @@@ static struct solo_enc_dev *solo_enc_al
        solo_enc->vidq.ops = &solo_enc_video_qops;
        solo_enc->vidq.mem_ops = &vb2_dma_sg_memops;
        solo_enc->vidq.drv_priv = solo_enc;
-       solo_enc->vidq.gfp_flags = __GFP_DMA32;
+       solo_enc->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        solo_enc->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
        solo_enc->vidq.buf_struct_size = sizeof(struct solo_vb2_buf);
        solo_enc->vidq.lock = &solo_enc->lock;
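
All of the solo6x10-v4l2-enc.c hunks above are one mechanical conversion: the v4l2-specific buffer state (flags, sequence, timestamp) moved from struct vb2_buffer into struct vb2_v4l2_buffer, so the driver buffer embeds the latter and callbacks recover it in two steps. A generic sketch with illustrative names (locking omitted):

#include <linux/list.h>
#include <media/videobuf2-v4l2.h>

struct demo_vb2_buf {
	struct vb2_v4l2_buffer vb;	/* must stay the first member */
	struct list_head list;
};

static LIST_HEAD(demo_active);		/* stands in for the driver's pending queue */

static void demo_buf_queue(struct vb2_buffer *vb)
{
	struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
	struct demo_vb2_buf *buf = container_of(vbuf, struct demo_vb2_buf, vb);

	/* v4l2 metadata now lives on vbuf ... */
	vbuf->flags |= V4L2_BUF_FLAG_KEYFRAME;
	/* ... while plane helpers and vb2_buffer_done() still take &vbuf->vb2_buf */
	list_add_tail(&buf->list, &demo_active);
}
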
index 26df903585d7dd02a54077bc24bbba8acda87b40,bde77b22340c4ee93591960f905af8b0d76ecce5..f7ce493b1feed72c27dd4f9d58e1ebe15bec5708
@@@ -26,7 -26,6 +26,7 @@@
  #include <media/v4l2-ioctl.h>
  #include <media/v4l2-common.h>
  #include <media/v4l2-event.h>
 +#include <media/videobuf2-v4l2.h>
  #include <media/videobuf2-dma-contig.h>
  
  #include "solo6x10.h"
@@@ -192,14 -191,13 +192,14 @@@ static int solo_v4l2_set_ch(struct solo
  static void solo_fillbuf(struct solo_dev *solo_dev,
                         struct vb2_buffer *vb)
  {
 -      dma_addr_t vbuf;
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
 +      dma_addr_t addr;
        unsigned int fdma_addr;
        int error = -1;
        int i;
  
 -      vbuf = vb2_dma_contig_plane_dma_addr(vb, 0);
 -      if (!vbuf)
 +      addr = vb2_dma_contig_plane_dma_addr(vb, 0);
 +      if (!addr)
                goto finish_buf;
  
        if (erase_off(solo_dev)) {
                fdma_addr = SOLO_DISP_EXT_ADDR + (solo_dev->old_write *
                                (SOLO_HW_BPL * solo_vlines(solo_dev)));
  
 -              error = solo_p2m_dma_t(solo_dev, 0, vbuf, fdma_addr,
 +              error = solo_p2m_dma_t(solo_dev, 0, addr, fdma_addr,
                                       solo_bytesperline(solo_dev),
                                       solo_vlines(solo_dev), SOLO_HW_BPL);
        }
@@@ -224,8 -222,8 +224,8 @@@ finish_buf
        if (!error) {
                vb2_set_plane_payload(vb, 0,
                        solo_vlines(solo_dev) * solo_bytesperline(solo_dev));
 -              vb->v4l2_buf.sequence = solo_dev->sequence++;
 -              v4l2_get_timestamp(&vb->v4l2_buf.timestamp);
 +              vbuf->sequence = solo_dev->sequence++;
 +              v4l2_get_timestamp(&vbuf->timestamp);
        }
  
        vb2_buffer_done(vb, error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE);
@@@ -258,7 -256,7 +258,7 @@@ static void solo_thread_try(struct solo
  
                spin_unlock(&solo_dev->slock);
  
 -              solo_fillbuf(solo_dev, &vb->vb);
 +              solo_fillbuf(solo_dev, &vb->vb.vb2_buf);
        }
  
        assert_spin_locked(&solo_dev->slock);
@@@ -313,7 -311,7 +313,7 @@@ static void solo_stop_thread(struct sol
        solo_dev->kthread = NULL;
  }
  
 -static int solo_queue_setup(struct vb2_queue *q, const struct v4l2_format *fmt,
 +static int solo_queue_setup(struct vb2_queue *q, const void *parg,
                           unsigned int *num_buffers, unsigned int *num_planes,
                           unsigned int sizes[], void *alloc_ctxs[])
  {
@@@ -347,11 -345,10 +347,11 @@@ static void solo_stop_streaming(struct 
  
  static void solo_buf_queue(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct solo_dev *solo_dev = vb2_get_drv_priv(vq);
        struct solo_vb2_buf *solo_vb =
 -              container_of(vb, struct solo_vb2_buf, vb);
 +              container_of(vbuf, struct solo_vb2_buf, vb);
  
        spin_lock(&solo_dev->slock);
        list_add_tail(&solo_vb->list, &solo_dev->vidq_active);
@@@ -678,7 -675,7 +678,7 @@@ int solo_v4l2_init(struct solo_dev *sol
        solo_dev->vidq.mem_ops = &vb2_dma_contig_memops;
        solo_dev->vidq.drv_priv = solo_dev;
        solo_dev->vidq.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
-       solo_dev->vidq.gfp_flags = __GFP_DMA32;
+       solo_dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        solo_dev->vidq.buf_struct_size = sizeof(struct solo_vb2_buf);
        solo_dev->vidq.lock = &solo_dev->lock;
        ret = vb2_queue_init(&solo_dev->vidq);
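
The gfp_flags updates in these drivers track the kernel-wide split of __GFP_WAIT into __GFP_DIRECT_RECLAIM and __GFP_KSWAPD_RECLAIM: waking kswapd is no longer implicit, so a caller that used a bare __GFP_DMA32 must now ask for it to keep the old behaviour. Roughly (values illustrative):

#include <linux/gfp.h>

/* may wake kswapd, never sleeps - what the vidq allocations above want */
static const gfp_t vidq_gfp       = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
/* __GFP_RECLAIM adds direct reclaim on top, so the allocation may sleep */
static const gfp_t vidq_gfp_sleep = __GFP_DMA32 | __GFP_RECLAIM;
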
index 4c3293dcddbcde3cd6b37584c6665d870fef1bc8,e556f989aaabd26bf6d2cebaa0b67fe97ac9aa84..46642ef9151b644413c3de4e33ee5e9279d2107d
@@@ -376,11 -376,10 +376,11 @@@ static int tw68_buffer_count(unsigned i
  /* ------------------------------------------------------------- */
  /* vb2 queue operations                                          */
  
 -static int tw68_queue_setup(struct vb2_queue *q, const struct v4l2_format *fmt,
 +static int tw68_queue_setup(struct vb2_queue *q, const void *parg,
                           unsigned int *num_buffers, unsigned int *num_planes,
                           unsigned int sizes[], void *alloc_ctxs[])
  {
 +      const struct v4l2_format *fmt = parg;
        struct tw68_dev *dev = vb2_get_drv_priv(q);
        unsigned tot_bufs = q->num_buffers + *num_buffers;
  
   */
  static void tw68_buf_queue(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct tw68_dev *dev = vb2_get_drv_priv(vq);
 -      struct tw68_buf *buf = container_of(vb, struct tw68_buf, vb);
 +      struct tw68_buf *buf = container_of(vbuf, struct tw68_buf, vb);
        struct tw68_buf *prev;
        unsigned long flags;
  
   */
  static int tw68_buf_prepare(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct tw68_dev *dev = vb2_get_drv_priv(vq);
 -      struct tw68_buf *buf = container_of(vb, struct tw68_buf, vb);
 +      struct tw68_buf *buf = container_of(vbuf, struct tw68_buf, vb);
        struct sg_table *dma = vb2_dma_sg_plane_desc(vb, 0);
        unsigned size, bpl;
  
  
  static void tw68_buf_finish(struct vb2_buffer *vb)
  {
 +      struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct vb2_queue *vq = vb->vb2_queue;
        struct tw68_dev *dev = vb2_get_drv_priv(vq);
 -      struct tw68_buf *buf = container_of(vb, struct tw68_buf, vb);
 +      struct tw68_buf *buf = container_of(vbuf, struct tw68_buf, vb);
  
        pci_free_consistent(dev->pci, buf->size, buf->cpu, buf->dma);
  }
@@@ -532,7 -528,7 +532,7 @@@ static void tw68_stop_streaming(struct 
                        container_of(dev->active.next, struct tw68_buf, list);
  
                list_del(&buf->list);
 -              vb2_buffer_done(&buf->vb, VB2_BUF_STATE_ERROR);
 +              vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR);
        }
  }
  
@@@ -979,7 -975,7 +979,7 @@@ int tw68_video_init2(struct tw68_dev *d
        dev->vidq.ops = &tw68_video_qops;
        dev->vidq.mem_ops = &vb2_dma_sg_memops;
        dev->vidq.drv_priv = dev;
-       dev->vidq.gfp_flags = __GFP_DMA32;
+       dev->vidq.gfp_flags = __GFP_DMA32 | __GFP_KSWAPD_RECLAIM;
        dev->vidq.buf_struct_size = sizeof(struct tw68_buf);
        dev->vidq.lock = &dev->lock;
        dev->vidq.min_buffers_needed = 2;
@@@ -1016,10 -1012,10 +1016,10 @@@ void tw68_irq_video_done(struct tw68_de
                buf = list_entry(dev->active.next, struct tw68_buf, list);
                list_del(&buf->list);
                spin_unlock(&dev->slock);
 -              v4l2_get_timestamp(&buf->vb.v4l2_buf.timestamp);
 -              buf->vb.v4l2_buf.field = dev->field;
 -              buf->vb.v4l2_buf.sequence = dev->seqnr++;
 -              vb2_buffer_done(&buf->vb, VB2_BUF_STATE_DONE);
 +              v4l2_get_timestamp(&buf->vb.timestamp);
 +              buf->vb.field = dev->field;
 +              buf->vb.sequence = dev->seqnr++;
 +              vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_DONE);
                status &= ~(TW68_DMAPI);
                if (0 == status)
                        return;
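
Both solo6x10 and tw68 also pick up the same vb2 interface change to queue_setup(): it now receives an opaque const void *parg instead of a typed v4l2_format pointer, and drivers that still need the format cast it back, as tw68 does above (it can be NULL when no format accompanies the request). A minimal sketch with illustrative names and sizes:

#include <linux/videodev2.h>
#include <media/videobuf2-v4l2.h>

#define DEMO_DEFAULT_SIZEIMAGE	(720 * 576 * 2)	/* assumed fallback size */

static int demo_queue_setup(struct vb2_queue *q, const void *parg,
			    unsigned int *num_buffers, unsigned int *num_planes,
			    unsigned int sizes[], void *alloc_ctxs[])
{
	const struct v4l2_format *fmt = parg;	/* NULL unless a format was passed in */

	*num_planes = 1;
	sizes[0] = fmt ? fmt->fmt.pix.sizeimage : DEMO_DEFAULT_SIZEIMAGE;
	return 0;
}
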
index 89300870fefb97a66291b96d52fca3aeb3714259,1b49e53463a268a35a5e75fe2bbc81c5c8c76119..1e688bfec56728c3d00ebc353031c26fde29f187
@@@ -1,7 -1,7 +1,7 @@@
  /*
   * VMware Balloon driver.
   *
 - * Copyright (C) 2000-2010, VMware, Inc. All Rights Reserved.
 + * Copyright (C) 2000-2014, VMware, Inc. All Rights Reserved.
   *
   * This program is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License as published by the
  #include <linux/types.h>
  #include <linux/kernel.h>
  #include <linux/mm.h>
 +#include <linux/vmalloc.h>
  #include <linux/sched.h>
  #include <linux/module.h>
  #include <linux/workqueue.h>
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
 +#include <linux/vmw_vmci_defs.h>
 +#include <linux/vmw_vmci_api.h>
  #include <asm/hypervisor.h>
  
  MODULE_AUTHOR("VMware, Inc.");
  MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
 -MODULE_VERSION("1.3.0.0-k");
 +MODULE_VERSION("1.5.0.0-k");
  MODULE_ALIAS("dmi:*:svnVMware*:*");
  MODULE_ALIAS("vmware_vmmemctl");
  MODULE_LICENSE("GPL");
   * measured in pages.
   */
  
 -/*
 - * Rate of allocating memory when there is no memory pressure
 - * (driver performs non-sleeping allocations).
 - */
 -#define VMW_BALLOON_NOSLEEP_ALLOC_MAX 16384U
 -
  /*
   * Rates of memory allocation when guest experiences memory pressure
   * (driver performs sleeping allocations).
  #define VMW_BALLOON_RATE_ALLOC_MAX    2048U
  #define VMW_BALLOON_RATE_ALLOC_INC    16U
  
 -/*
 - * Rates for releasing pages while deflating balloon.
 - */
 -#define VMW_BALLOON_RATE_FREE_MIN     512U
 -#define VMW_BALLOON_RATE_FREE_MAX     16384U
 -#define VMW_BALLOON_RATE_FREE_INC     16U
 -
  /*
   * When guest is under memory pressure, use a reduced page allocation
   * rate for next several cycles.
@@@ -75,7 -85,7 +75,7 @@@
  
  /*
   * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
-  * allow wait (__GFP_WAIT) for NOSLEEP page allocations. Use
+  * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
   * __GFP_NOWARN, to suppress page allocation failure warnings.
   */
  #define VMW_PAGE_ALLOC_NOSLEEP                (__GFP_HIGHMEM|__GFP_NOWARN)
@@@ -89,6 -99,9 +89,6 @@@
   */
  #define VMW_PAGE_ALLOC_CANSLEEP               (GFP_HIGHUSER)
  
 -/* Maximum number of page allocations without yielding processor */
 -#define VMW_BALLOON_YIELD_THRESHOLD   1024
 -
  /* Maximum number of refused pages we accumulate during inflation cycle */
  #define VMW_BALLOON_MAX_REFUSED               16
  
@@@ -103,45 -116,17 +103,45 @@@ enum vmwballoon_capabilities 
        /*
         * Bit 0 is reserved and not associated to any capability.
         */
 -      VMW_BALLOON_BASIC_CMDS          = (1 << 1),
 -      VMW_BALLOON_BATCHED_CMDS        = (1 << 2)
 +      VMW_BALLOON_BASIC_CMDS                  = (1 << 1),
 +      VMW_BALLOON_BATCHED_CMDS                = (1 << 2),
 +      VMW_BALLOON_BATCHED_2M_CMDS             = (1 << 3),
 +      VMW_BALLOON_SIGNALLED_WAKEUP_CMD        = (1 << 4),
  };
  
 -#define VMW_BALLOON_CAPABILITIES      (VMW_BALLOON_BASIC_CMDS)
 +#define VMW_BALLOON_CAPABILITIES      (VMW_BALLOON_BASIC_CMDS \
 +                                      | VMW_BALLOON_BATCHED_CMDS \
 +                                      | VMW_BALLOON_BATCHED_2M_CMDS \
 +                                      | VMW_BALLOON_SIGNALLED_WAKEUP_CMD)
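
For reference, with bit 0 reserved the combined capability mask above works out to:

/* (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4) == 0x1e */
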
 +
 +#define VMW_BALLOON_2M_SHIFT          (9)
 +#define VMW_BALLOON_NUM_PAGE_SIZES    (2)
 +
 +/*
 + * Backdoor commands availability:
 + *
 + * START, GET_TARGET and GUEST_ID are always available,
 + *
 + * VMW_BALLOON_BASIC_CMDS:
 + *    LOCK and UNLOCK commands,
 + * VMW_BALLOON_BATCHED_CMDS:
 + *    BATCHED_LOCK and BATCHED_UNLOCK commands.
 + * VMW_BALLOON_BATCHED_2M_CMDS:
 + *    BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
 + * VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
 + *    VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
 + */
 +#define VMW_BALLOON_CMD_START                 0
 +#define VMW_BALLOON_CMD_GET_TARGET            1
 +#define VMW_BALLOON_CMD_LOCK                  2
 +#define VMW_BALLOON_CMD_UNLOCK                        3
 +#define VMW_BALLOON_CMD_GUEST_ID              4
 +#define VMW_BALLOON_CMD_BATCHED_LOCK          6
 +#define VMW_BALLOON_CMD_BATCHED_UNLOCK                7
 +#define VMW_BALLOON_CMD_BATCHED_2M_LOCK               8
 +#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK     9
 +#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET     10
  
 -#define VMW_BALLOON_CMD_START         0
 -#define VMW_BALLOON_CMD_GET_TARGET    1
 -#define VMW_BALLOON_CMD_LOCK          2
 -#define VMW_BALLOON_CMD_UNLOCK                3
 -#define VMW_BALLOON_CMD_GUEST_ID      4
  
  /* error codes */
  #define VMW_BALLOON_SUCCESS                   0
  
  #define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES (0x03000000)
  
 -#define VMWARE_BALLOON_CMD(cmd, data, result)                 \
 +/* Batch page description */
 +
 +/*
 + * Layout of a page in the batch page:
 + *
 + * +-------------+----------+--------+
 + * |             |          |        |
 + * | Page number | Reserved | Status |
 + * |             |          |        |
 + * +-------------+----------+--------+
 + * 64  PAGE_SHIFT          6         0
 + *
 + * The reserved field should be set to 0.
 + */
 +#define VMW_BALLOON_BATCH_MAX_PAGES   (PAGE_SIZE / sizeof(u64))
 +#define VMW_BALLOON_BATCH_STATUS_MASK ((1UL << 5) - 1)
 +#define VMW_BALLOON_BATCH_PAGE_MASK   (~((1UL << PAGE_SHIFT) - 1))
 +
 +struct vmballoon_batch_page {
 +      u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
 +};
 +
 +static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
 +{
 +      return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
 +}
 +
 +static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
 +                              int idx)
 +{
 +      return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
 +}
 +
 +static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
 +                              u64 pa)
 +{
 +      batch->pages[idx] = pa;
 +}
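
A worked example of the batch-entry encoding these helpers implement (PAGE_SHIFT assumed to be 12; the concrete address is illustrative):

/*
 * pa = 0x123456000 (page aligned), so the driver stores it verbatim:
 *   entry = 0x0000000123456000
 * after the hypercall the host writes a status code into the low bits:
 *   entry & VMW_BALLOON_BATCH_PAGE_MASK   -> 0x0000000123456000 (the PA)
 *   entry & VMW_BALLOON_BATCH_STATUS_MASK -> e.g. VMW_BALLOON_SUCCESS (0)
 */
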
 +
 +
 +#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)           \
  ({                                                            \
 -      unsigned long __status, __dummy1, __dummy2;             \
 +      unsigned long __status, __dummy1, __dummy2, __dummy3;   \
        __asm__ __volatile__ ("inl %%dx" :                      \
                "=a"(__status),                                 \
                "=c"(__dummy1),                                 \
                "=d"(__dummy2),                                 \
 -              "=b"(result) :                                  \
 +              "=b"(result),                                   \
 +              "=S" (__dummy3) :                               \
                "0"(VMW_BALLOON_HV_MAGIC),                      \
                "1"(VMW_BALLOON_CMD_##cmd),                     \
                "2"(VMW_BALLOON_HV_PORT),                       \
 -              "3"(data) :                                     \
 +              "3"(arg1),                                      \
 +              "4" (arg2) :                                    \
                "memory");                                      \
        if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)     \
                result = __dummy1;                              \
  #ifdef CONFIG_DEBUG_FS
  struct vmballoon_stats {
        unsigned int timer;
 +      unsigned int doorbell;
  
        /* allocation statistics */
 -      unsigned int alloc;
 -      unsigned int alloc_fail;
 +      unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int sleep_alloc;
        unsigned int sleep_alloc_fail;
 -      unsigned int refused_alloc;
 -      unsigned int refused_free;
 -      unsigned int free;
 +      unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];
  
        /* monitor operations */
 -      unsigned int lock;
 -      unsigned int lock_fail;
 -      unsigned int unlock;
 -      unsigned int unlock_fail;
 +      unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
 +      unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
        unsigned int target;
        unsigned int target_fail;
        unsigned int start;
        unsigned int start_fail;
        unsigned int guest_type;
        unsigned int guest_type_fail;
 +      unsigned int doorbell_set;
 +      unsigned int doorbell_unset;
  };
  
  #define STATS_INC(stat) (stat)++
  #define STATS_INC(stat)
  #endif
  
 -struct vmballoon {
 +struct vmballoon;
  
 +struct vmballoon_ops {
 +      void (*add_page)(struct vmballoon *b, int idx, struct page *p);
 +      int (*lock)(struct vmballoon *b, unsigned int num_pages,
 +                      bool is_2m_pages, unsigned int *target);
 +      int (*unlock)(struct vmballoon *b, unsigned int num_pages,
 +                      bool is_2m_pages, unsigned int *target);
 +};
 +
 +struct vmballoon_page_size {
        /* list of reserved physical pages */
        struct list_head pages;
  
        /* transient list of non-balloonable pages */
        struct list_head refused_pages;
        unsigned int n_refused_pages;
 +};
 +
 +struct vmballoon {
 +      struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];
 +
 +      /* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
 +      unsigned supported_page_sizes;
  
        /* balloon size in pages */
        unsigned int size;
  
        /* adjustment rates (pages per second) */
        unsigned int rate_alloc;
 -      unsigned int rate_free;
  
        /* slowdown page allocations for next few cycles */
        unsigned int slow_allocation_cycles;
  
 +      unsigned long capabilities;
 +
 +      struct vmballoon_batch_page *batch_page;
 +      unsigned int batch_max_pages;
 +      struct page *page;
 +
 +      const struct vmballoon_ops *ops;
 +
  #ifdef CONFIG_DEBUG_FS
        /* statistics */
        struct vmballoon_stats stats;
        struct sysinfo sysinfo;
  
        struct delayed_work dwork;
 +
 +      struct vmci_handle vmci_doorbell;
  };
  
  static struct vmballoon balloon;
   * Send "start" command to the host, communicating supported version
   * of the protocol.
   */
 -static bool vmballoon_send_start(struct vmballoon *b)
 +static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
  {
 -      unsigned long status, capabilities;
 +      unsigned long status, capabilities, dummy = 0;
 +      bool success;
  
        STATS_INC(b->stats.start);
  
 -      status = VMWARE_BALLOON_CMD(START, VMW_BALLOON_CAPABILITIES,
 -                              capabilities);
 -      if (status == VMW_BALLOON_SUCCESS)
 -              return true;
 +      status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
  
 -      pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 -      STATS_INC(b->stats.start_fail);
 -      return false;
 +      switch (status) {
 +      case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
 +              b->capabilities = capabilities;
 +              success = true;
 +              break;
 +      case VMW_BALLOON_SUCCESS:
 +              b->capabilities = VMW_BALLOON_BASIC_CMDS;
 +              success = true;
 +              break;
 +      default:
 +              success = false;
 +      }
 +
 +      if (b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS)
 +              b->supported_page_sizes = 2;
 +      else
 +              b->supported_page_sizes = 1;
 +
 +      if (!success) {
 +              pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 +              STATS_INC(b->stats.start_fail);
 +      }
 +      return success;
  }
  
  static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
   */
  static bool vmballoon_send_guest_id(struct vmballoon *b)
  {
 -      unsigned long status, dummy;
 +      unsigned long status, dummy = 0;
  
 -      status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy);
 +      status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
 +                              dummy);
  
        STATS_INC(b->stats.guest_type);
  
        return false;
  }
  
 +static u16 vmballoon_page_size(bool is_2m_page)
 +{
 +      if (is_2m_page)
 +              return 1 << VMW_BALLOON_2M_SHIFT;
 +
 +      return 1;
 +}
 +
  /*
   * Retrieve desired balloon size from the host.
   */
@@@ -407,7 -295,6 +407,7 @@@ static bool vmballoon_send_get_target(s
        unsigned long status;
        unsigned long target;
        unsigned long limit;
 +      unsigned long dummy = 0;
        u32 limit32;
  
        /*
        /* update stats */
        STATS_INC(b->stats.target);
  
 -      status = VMWARE_BALLOON_CMD(GET_TARGET, limit, target);
 +      status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
        if (vmballoon_check_status(b, status)) {
                *new_target = target;
                return true;
   * check the return value and maybe submit a different page.
   */
  static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
 -                                   unsigned int *hv_status)
 +                              unsigned int *hv_status, unsigned int *target)
  {
 -      unsigned long status, dummy;
 +      unsigned long status, dummy = 0;
        u32 pfn32;
  
        pfn32 = (u32)pfn;
        if (pfn32 != pfn)
                return -1;
  
 -      STATS_INC(b->stats.lock);
 +      STATS_INC(b->stats.lock[false]);
  
 -      *hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy);
 +      *hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
        if (vmballoon_check_status(b, status))
                return 0;
  
        pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
 -      STATS_INC(b->stats.lock_fail);
 +      STATS_INC(b->stats.lock_fail[false]);
 +      return 1;
 +}
 +
 +static int vmballoon_send_batched_lock(struct vmballoon *b,
 +              unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 +{
 +      unsigned long status;
 +      unsigned long pfn = page_to_pfn(b->page);
 +
 +      STATS_INC(b->stats.lock[is_2m_pages]);
 +
 +      if (is_2m_pages)
 +              status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
 +                              *target);
 +      else
 +              status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
 +                              *target);
 +
 +      if (vmballoon_check_status(b, status))
 +              return 0;
 +
 +      pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
 +      STATS_INC(b->stats.lock_fail[is_2m_pages]);
        return 1;
  }
  
   * Notify the host that guest intends to release given page back into
   * the pool of available (to the guest) pages.
   */
 -static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn)
 +static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
 +                                                      unsigned int *target)
  {
 -      unsigned long status, dummy;
 +      unsigned long status, dummy = 0;
        u32 pfn32;
  
        pfn32 = (u32)pfn;
        if (pfn32 != pfn)
                return false;
  
 -      STATS_INC(b->stats.unlock);
 +      STATS_INC(b->stats.unlock[false]);
  
 -      status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy);
 +      status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
        if (vmballoon_check_status(b, status))
                return true;
  
        pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
 -      STATS_INC(b->stats.unlock_fail);
 +      STATS_INC(b->stats.unlock_fail[false]);
 +      return false;
 +}
 +
 +static bool vmballoon_send_batched_unlock(struct vmballoon *b,
 +              unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 +{
 +      unsigned long status;
 +      unsigned long pfn = page_to_pfn(b->page);
 +
 +      STATS_INC(b->stats.unlock[is_2m_pages]);
 +
 +      if (is_2m_pages)
 +              status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
 +                              *target);
 +      else
 +              status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
 +                              *target);
 +
 +      if (vmballoon_check_status(b, status))
 +              return true;
 +
 +      pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
 +      STATS_INC(b->stats.unlock_fail[is_2m_pages]);
        return false;
  }
  
 +static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
 +{
 +      if (is_2m_page)
 +              return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
 +
 +      return alloc_page(flags);
 +}
 +
 +static void vmballoon_free_page(struct page *page, bool is_2m_page)
 +{
 +      if (is_2m_page)
 +              __free_pages(page, VMW_BALLOON_2M_SHIFT);
 +      else
 +              __free_page(page);
 +}
 +
  /*
   * Quickly release all pages allocated for the balloon. This function is
   * called when host decides to "reset" balloon for one reason or another.
  static void vmballoon_pop(struct vmballoon *b)
  {
        struct page *page, *next;
 -      unsigned int count = 0;
 -
 -      list_for_each_entry_safe(page, next, &b->pages, lru) {
 -              list_del(&page->lru);
 -              __free_page(page);
 -              STATS_INC(b->stats.free);
 -              b->size--;
 -
 -              if (++count >= b->rate_free) {
 -                      count = 0;
 +      unsigned is_2m_pages;
 +
 +      for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
 +                      is_2m_pages++) {
 +              struct vmballoon_page_size *page_size =
 +                              &b->page_sizes[is_2m_pages];
 +              u16 size_per_page = vmballoon_page_size(is_2m_pages);
 +
 +              list_for_each_entry_safe(page, next, &page_size->pages, lru) {
 +                      list_del(&page->lru);
 +                      vmballoon_free_page(page, is_2m_pages);
 +                      STATS_INC(b->stats.free[is_2m_pages]);
 +                      b->size -= size_per_page;
                        cond_resched();
                }
        }
 -}
  
 -/*
 - * Perform standard reset sequence by popping the balloon (in case it
 - * is not  empty) and then restarting protocol. This operation normally
 - * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
 - */
 -static void vmballoon_reset(struct vmballoon *b)
 -{
 -      /* free all pages, skipping monitor unlock */
 -      vmballoon_pop(b);
 +      if (b->batch_page) {
 +              vunmap(b->batch_page);
 +              b->batch_page = NULL;
 +      }
  
 -      if (vmballoon_send_start(b)) {
 -              b->reset_required = false;
 -              if (!vmballoon_send_guest_id(b))
 -                      pr_err("failed to send guest ID to the host\n");
 +      if (b->page) {
 +              __free_page(b->page);
 +              b->page = NULL;
        }
  }
  
   * refuse list, those refused page are then released at the end of the
   * inflation cycle.
   */
 -static int vmballoon_lock_page(struct vmballoon *b, struct page *page)
 +static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
 +                              bool is_2m_pages, unsigned int *target)
  {
        int locked, hv_status;
 +      struct page *page = b->page;
 +      struct vmballoon_page_size *page_size = &b->page_sizes[false];
 +
 +      /* is_2m_pages can never happen as 2m pages support implies batching */
  
 -      locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status);
 +      locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
 +                                                              target);
        if (locked > 0) {
 -              STATS_INC(b->stats.refused_alloc);
 +              STATS_INC(b->stats.refused_alloc[false]);
  
                if (hv_status == VMW_BALLOON_ERROR_RESET ||
                                hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED) {
 -                      __free_page(page);
 +                      vmballoon_free_page(page, false);
                        return -EIO;
                }
  
                 * and retry allocation, unless we already accumulated
                 * too many of them, in which case take a breather.
                 */
 -              if (b->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
 -                      b->n_refused_pages++;
 -                      list_add(&page->lru, &b->refused_pages);
 +              if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
 +                      page_size->n_refused_pages++;
 +                      list_add(&page->lru, &page_size->refused_pages);
                } else {
 -                      __free_page(page);
 +                      vmballoon_free_page(page, false);
                }
                return -EIO;
        }
  
        /* track allocated page */
 -      list_add(&page->lru, &b->pages);
 +      list_add(&page->lru, &page_size->pages);
  
        /* update balloon size */
        b->size++;
        return 0;
  }
  
 +static int vmballoon_lock_batched_page(struct vmballoon *b,
 +              unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 +{
 +      int locked, i;
 +      u16 size_per_page = vmballoon_page_size(is_2m_pages);
 +
 +      locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
 +                      target);
 +      if (locked > 0) {
 +              for (i = 0; i < num_pages; i++) {
 +                      u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 +                      struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 +
 +                      vmballoon_free_page(p, is_2m_pages);
 +              }
 +
 +              return -EIO;
 +      }
 +
 +      for (i = 0; i < num_pages; i++) {
 +              u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 +              struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 +              struct vmballoon_page_size *page_size =
 +                              &b->page_sizes[is_2m_pages];
 +
 +              locked = vmballoon_batch_get_status(b->batch_page, i);
 +
 +              switch (locked) {
 +              case VMW_BALLOON_SUCCESS:
 +                      list_add(&p->lru, &page_size->pages);
 +                      b->size += size_per_page;
 +                      break;
 +              case VMW_BALLOON_ERROR_PPN_PINNED:
 +              case VMW_BALLOON_ERROR_PPN_INVALID:
 +                      if (page_size->n_refused_pages
 +                                      < VMW_BALLOON_MAX_REFUSED) {
 +                              list_add(&p->lru, &page_size->refused_pages);
 +                              page_size->n_refused_pages++;
 +                              break;
 +                      }
 +                      /* Fallthrough */
 +              case VMW_BALLOON_ERROR_RESET:
 +              case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
 +                      vmballoon_free_page(p, is_2m_pages);
 +                      break;
 +              default:
 +                      /* This should never happen */
 +                      WARN_ON_ONCE(true);
 +              }
 +      }
 +
 +      return 0;
 +}
 +
  /*
   * Release the page allocated for the balloon. Note that we first notify
   * the host so it can make sure the page will be available for the guest
   * to use, if needed.
   */
 -static int vmballoon_release_page(struct vmballoon *b, struct page *page)
 +static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
 +              bool is_2m_pages, unsigned int *target)
  {
 -      if (!vmballoon_send_unlock_page(b, page_to_pfn(page)))
 -              return -EIO;
 +      struct page *page = b->page;
 +      struct vmballoon_page_size *page_size = &b->page_sizes[false];
 +
 +      /* is_2m_pages can never happen as 2m pages support implies batching */
  
 -      list_del(&page->lru);
 +      if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
 +              list_add(&page->lru, &page_size->pages);
 +              return -EIO;
 +      }
  
        /* deallocate page */
 -      __free_page(page);
 -      STATS_INC(b->stats.free);
 +      vmballoon_free_page(page, false);
 +      STATS_INC(b->stats.free[false]);
  
        /* update balloon size */
        b->size--;
        return 0;
  }
  
 +static int vmballoon_unlock_batched_page(struct vmballoon *b,
 +                              unsigned int num_pages, bool is_2m_pages,
 +                              unsigned int *target)
 +{
 +      int locked, i, ret = 0;
 +      bool hv_success;
 +      u16 size_per_page = vmballoon_page_size(is_2m_pages);
 +
 +      hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
 +                      target);
 +      if (!hv_success)
 +              ret = -EIO;
 +
 +      for (i = 0; i < num_pages; i++) {
 +              u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 +              struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 +              struct vmballoon_page_size *page_size =
 +                              &b->page_sizes[is_2m_pages];
 +
 +              locked = vmballoon_batch_get_status(b->batch_page, i);
 +              if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
 +                      /*
 +                       * That page wasn't successfully unlocked by the
 +                       * hypervisor, re-add it to the list of pages owned by
 +                       * the balloon driver.
 +                       */
 +                      list_add(&p->lru, &page_size->pages);
 +              } else {
 +                      /* deallocate page */
 +                      vmballoon_free_page(p, is_2m_pages);
 +                      STATS_INC(b->stats.free[is_2m_pages]);
 +
 +                      /* update balloon size */
 +                      b->size -= size_per_page;
 +              }
 +      }
 +
 +      return ret;
 +}
 +
  /*
   * Release pages that were allocated while attempting to inflate the
   * balloon but were refused by the host for one reason or another.
   */
 -static void vmballoon_release_refused_pages(struct vmballoon *b)
 +static void vmballoon_release_refused_pages(struct vmballoon *b,
 +              bool is_2m_pages)
  {
        struct page *page, *next;
 +      struct vmballoon_page_size *page_size =
 +                      &b->page_sizes[is_2m_pages];
  
 -      list_for_each_entry_safe(page, next, &b->refused_pages, lru) {
 +      list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
                list_del(&page->lru);
 -              __free_page(page);
 -              STATS_INC(b->stats.refused_free);
 +              vmballoon_free_page(page, is_2m_pages);
 +              STATS_INC(b->stats.refused_free[is_2m_pages]);
        }
  
 -      b->n_refused_pages = 0;
 +      page_size->n_refused_pages = 0;
 +}
 +
 +static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
 +{
 +      b->page = p;
 +}
 +
 +static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
 +                              struct page *p)
 +{
 +      vmballoon_batch_set_pa(b->batch_page, idx,
 +                      (u64)page_to_pfn(p) << PAGE_SHIFT);
  }
  
  /*
   */
  static void vmballoon_inflate(struct vmballoon *b)
  {
 -      unsigned int goal;
 -      unsigned int rate;
 -      unsigned int i;
 +      unsigned rate;
        unsigned int allocations = 0;
 +      unsigned int num_pages = 0;
        int error = 0;
        gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
 +      bool is_2m_pages;
  
        pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
  
         * slowdown page allocations considerably.
         */
  
 -      goal = b->target - b->size;
        /*
         * Start with no sleep allocation rate which may be higher
         * than sleeping allocation rate.
         */
 -      rate = b->slow_allocation_cycles ?
 -                      b->rate_alloc : VMW_BALLOON_NOSLEEP_ALLOC_MAX;
 +      if (b->slow_allocation_cycles) {
 +              rate = b->rate_alloc;
 +              is_2m_pages = false;
 +      } else {
 +              rate = UINT_MAX;
 +              is_2m_pages =
 +                      b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;
 +      }
  
 -      pr_debug("%s - goal: %d, no-sleep rate: %d, sleep rate: %d\n",
 -               __func__, goal, rate, b->rate_alloc);
 +      pr_debug("%s - goal: %d, no-sleep rate: %u, sleep rate: %d\n",
 +               __func__, b->target - b->size, rate, b->rate_alloc);
  
 -      for (i = 0; i < goal; i++) {
 +      while (!b->reset_required &&
 +              b->size + num_pages * vmballoon_page_size(is_2m_pages)
 +              < b->target) {
                struct page *page;
  
                if (flags == VMW_PAGE_ALLOC_NOSLEEP)
 -                      STATS_INC(b->stats.alloc);
 +                      STATS_INC(b->stats.alloc[is_2m_pages]);
                else
                        STATS_INC(b->stats.sleep_alloc);
  
 -              page = alloc_page(flags);
 +              page = vmballoon_alloc_page(flags, is_2m_pages);
                if (!page) {
 +                      STATS_INC(b->stats.alloc_fail[is_2m_pages]);
 +
 +                      if (is_2m_pages) {
 +                              b->ops->lock(b, num_pages, true, &b->target);
 +
 +                              /*
 +                               * ignore errors from locking as we now switch
 +                               * to 4k pages and we might get different
 +                               * errors.
 +                               */
 +
 +                              num_pages = 0;
 +                              is_2m_pages = false;
 +                              continue;
 +                      }
 +
                        if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
                                /*
                                 * CANSLEEP page allocation failed, so guest
                                STATS_INC(b->stats.sleep_alloc_fail);
                                break;
                        }
 -                      STATS_INC(b->stats.alloc_fail);
  
                        /*
                         * NOSLEEP page allocation failed, so the guest is
                         */
                        b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;
  
 -                      if (i >= b->rate_alloc)
 +                      if (allocations >= b->rate_alloc)
                                break;
  
                        flags = VMW_PAGE_ALLOC_CANSLEEP;
                        continue;
                }
  
 -              error = vmballoon_lock_page(b, page);
 -              if (error)
 -                      break;
 -
 -              if (++allocations > VMW_BALLOON_YIELD_THRESHOLD) {
 -                      cond_resched();
 -                      allocations = 0;
 +              b->ops->add_page(b, num_pages++, page);
 +              if (num_pages == b->batch_max_pages) {
 +                      error = b->ops->lock(b, num_pages, is_2m_pages,
 +                                      &b->target);
 +                      num_pages = 0;
 +                      if (error)
 +                              break;
                }
  
 -              if (i >= rate) {
 +              cond_resched();
 +
 +              if (allocations >= rate) {
                        /* We allocated enough pages, let's take a break. */
                        break;
                }
        }
  
 +      if (num_pages > 0)
 +              b->ops->lock(b, num_pages, is_2m_pages, &b->target);
 +
        /*
         * We reached our goal without failures so try increasing
         * allocation rate.
         */
 -      if (error == 0 && i >= b->rate_alloc) {
 -              unsigned int mult = i / b->rate_alloc;
 +      if (error == 0 && allocations >= b->rate_alloc) {
 +              unsigned int mult = allocations / b->rate_alloc;
  
                b->rate_alloc =
                        min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
                            VMW_BALLOON_RATE_ALLOC_MAX);
        }
  
 -      vmballoon_release_refused_pages(b);
 +      vmballoon_release_refused_pages(b, true);
 +      vmballoon_release_refused_pages(b, false);
  }
  
  /*
   */
  static void vmballoon_deflate(struct vmballoon *b)
  {
 -      struct page *page, *next;
 -      unsigned int i = 0;
 -      unsigned int goal;
 -      int error;
 +      unsigned is_2m_pages;
  
        pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
  
 -      /* limit deallocation rate */
 -      goal = min(b->size - b->target, b->rate_free);
 +      /* free pages to reach target */
 +      for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
 +                      is_2m_pages++) {
 +              struct page *page, *next;
 +              unsigned int num_pages = 0;
 +              struct vmballoon_page_size *page_size =
 +                              &b->page_sizes[is_2m_pages];
 +
 +              list_for_each_entry_safe(page, next, &page_size->pages, lru) {
 +                      if (b->reset_required ||
 +                              (b->target > 0 &&
 +                                      b->size - num_pages
 +                                      * vmballoon_page_size(is_2m_pages)
 +                              < b->target + vmballoon_page_size(true)))
 +                              break;
 +
 +                      list_del(&page->lru);
 +                      b->ops->add_page(b, num_pages++, page);
  
 -      pr_debug("%s - goal: %d, rate: %d\n", __func__, goal, b->rate_free);
 +                      if (num_pages == b->batch_max_pages) {
 +                              int error;
  
 -      /* free pages to reach target */
 -      list_for_each_entry_safe(page, next, &b->pages, lru) {
 -              error = vmballoon_release_page(b, page);
 -              if (error) {
 -                      /* quickly decrease rate in case of error */
 -                      b->rate_free = max(b->rate_free / 2,
 -                                         VMW_BALLOON_RATE_FREE_MIN);
 -                      return;
 +                              error = b->ops->unlock(b, num_pages,
 +                                              is_2m_pages, &b->target);
 +                              num_pages = 0;
 +                              if (error)
 +                                      return;
 +                      }
 +
 +                      cond_resched();
                }
  
 -              if (++i >= goal)
 -                      break;
 +              if (num_pages > 0)
 +                      b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
 +      }
 +}
 +
 +static const struct vmballoon_ops vmballoon_basic_ops = {
 +      .add_page = vmballoon_add_page,
 +      .lock = vmballoon_lock_page,
 +      .unlock = vmballoon_unlock_page
 +};
 +
 +static const struct vmballoon_ops vmballoon_batched_ops = {
 +      .add_page = vmballoon_add_batched_page,
 +      .lock = vmballoon_lock_batched_page,
 +      .unlock = vmballoon_unlock_batched_page
 +};
 +
 +static bool vmballoon_init_batching(struct vmballoon *b)
 +{
 +      b->page = alloc_page(VMW_PAGE_ALLOC_NOSLEEP);
 +      if (!b->page)
 +              return false;
 +
 +      b->batch_page = vmap(&b->page, 1, VM_MAP, PAGE_KERNEL);
 +      if (!b->batch_page) {
 +              __free_page(b->page);
 +              return false;
 +      }
 +
 +      return true;
 +}
 +
 +/*
 + * Receive notification and resize balloon
 + */
 +static void vmballoon_doorbell(void *client_data)
 +{
 +      struct vmballoon *b = client_data;
 +
 +      STATS_INC(b->stats.doorbell);
 +
 +      mod_delayed_work(system_freezable_wq, &b->dwork, 0);
 +}
 +
 +/*
 + * Clean up vmci doorbell
 + */
 +static void vmballoon_vmci_cleanup(struct vmballoon *b)
 +{
 +      int error;
 +
 +      VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
 +                      VMCI_INVALID_ID, error);
 +      STATS_INC(b->stats.doorbell_unset);
 +
 +      if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
 +              vmci_doorbell_destroy(b->vmci_doorbell);
 +              b->vmci_doorbell = VMCI_INVALID_HANDLE;
 +      }
 +}
 +
 +/*
 + * Initialize vmci doorbell, to get notified as soon as balloon changes
 + */
 +static int vmballoon_vmci_init(struct vmballoon *b)
 +{
 +      int error = 0;
 +
 +      if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) != 0) {
 +              error = vmci_doorbell_create(&b->vmci_doorbell,
 +                              VMCI_FLAG_DELAYED_CB,
 +                              VMCI_PRIVILEGE_FLAG_RESTRICTED,
 +                              vmballoon_doorbell, b);
 +
 +              if (error == VMCI_SUCCESS) {
 +                      VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET,
 +                                      b->vmci_doorbell.context,
 +                                      b->vmci_doorbell.resource, error);
 +                      STATS_INC(b->stats.doorbell_set);
 +              }
 +      }
 +
 +      if (error != 0) {
 +              vmballoon_vmci_cleanup(b);
 +
 +              return -EIO;
        }
  
 -      /* slowly increase rate if there were no errors */
 -      b->rate_free = min(b->rate_free + VMW_BALLOON_RATE_FREE_INC,
 -                         VMW_BALLOON_RATE_FREE_MAX);
 +      return 0;
 +}
 +
 +/*
 + * Perform standard reset sequence by popping the balloon (in case it
 + * is not empty) and then restarting protocol. This operation normally
 + * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
 + */
 +static void vmballoon_reset(struct vmballoon *b)
 +{
 +      int error;
 +
 +      vmballoon_vmci_cleanup(b);
 +
 +      /* free all pages, skipping monitor unlock */
 +      vmballoon_pop(b);
 +
 +      if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
 +              return;
 +
 +      if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
 +              b->ops = &vmballoon_batched_ops;
 +              b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
 +              if (!vmballoon_init_batching(b)) {
 +                      /*
 +                       * We failed to initialize batching, inform the monitor
 +                       * about it by sending a null capability.
 +                       *
 +                       * The guest will retry in one second.
 +                       */
 +                      vmballoon_send_start(b, 0);
 +                      return;
 +              }
 +      } else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
 +              b->ops = &vmballoon_basic_ops;
 +              b->batch_max_pages = 1;
 +      }
 +
 +      b->reset_required = false;
 +
 +      error = vmballoon_vmci_init(b);
 +      if (error)
 +              pr_err("failed to initialize vmci doorbell\n");
 +
 +      if (!vmballoon_send_guest_id(b))
 +              pr_err("failed to send guest ID to the host\n");
  }
  
  /*
@@@ -1126,14 -664,13 +1126,14 @@@ static void vmballoon_work(struct work_
        if (b->slow_allocation_cycles > 0)
                b->slow_allocation_cycles--;
  
 -      if (vmballoon_send_get_target(b, &target)) {
 +      if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
                /* update target, adjust size */
                b->target = target;
  
                if (b->size < target)
                        vmballoon_inflate(b);
 -              else if (b->size > target)
 +              else if (target == 0 ||
 +                              b->size > target + vmballoon_page_size(true))
                        vmballoon_deflate(b);
        }
  
@@@ -1155,14 -692,6 +1155,14 @@@ static int vmballoon_debug_show(struct 
        struct vmballoon *b = f->private;
        struct vmballoon_stats *stats = &b->stats;
  
 +      /* format capabilities info */
 +      seq_printf(f,
 +                 "balloon capabilities:   %#4x\n"
 +                 "used capabilities:      %#4lx\n"
 +                 "is resetting:           %c\n",
 +                 VMW_BALLOON_CAPABILITIES, b->capabilities,
 +                 b->reset_required ? 'y' : 'n');
 +
        /* format size info */
        seq_printf(f,
                   "target:             %8d pages\n"
  
        /* format rate info */
        seq_printf(f,
 -                 "rateNoSleepAlloc:   %8d pages/sec\n"
 -                 "rateSleepAlloc:     %8d pages/sec\n"
 -                 "rateFree:           %8d pages/sec\n",
 -                 VMW_BALLOON_NOSLEEP_ALLOC_MAX,
 -                 b->rate_alloc, b->rate_free);
 +                 "rateSleepAlloc:     %8d pages/sec\n",
 +                 b->rate_alloc);
  
        seq_printf(f,
                   "\n"
                   "timer:              %8u\n"
 +                 "doorbell:           %8u\n"
                   "start:              %8u (%4u failed)\n"
                   "guestType:          %8u (%4u failed)\n"
 +                 "2m-lock:            %8u (%4u failed)\n"
                   "lock:               %8u (%4u failed)\n"
 +                 "2m-unlock:          %8u (%4u failed)\n"
                   "unlock:             %8u (%4u failed)\n"
                   "target:             %8u (%4u failed)\n"
 +                 "prim2mAlloc:        %8u (%4u failed)\n"
                   "primNoSleepAlloc:   %8u (%4u failed)\n"
                   "primCanSleepAlloc:  %8u (%4u failed)\n"
 +                 "prim2mFree:         %8u\n"
                   "primFree:           %8u\n"
 +                 "err2mAlloc:         %8u\n"
                   "errAlloc:           %8u\n"
 -                 "errFree:            %8u\n",
 +                 "err2mFree:          %8u\n"
 +                 "errFree:            %8u\n"
 +                 "doorbellSet:        %8u\n"
 +                 "doorbellUnset:      %8u\n",
                   stats->timer,
 +                 stats->doorbell,
                   stats->start, stats->start_fail,
                   stats->guest_type, stats->guest_type_fail,
 -                 stats->lock,  stats->lock_fail,
 -                 stats->unlock, stats->unlock_fail,
 +                 stats->lock[true],  stats->lock_fail[true],
 +                 stats->lock[false],  stats->lock_fail[false],
 +                 stats->unlock[true], stats->unlock_fail[true],
 +                 stats->unlock[false], stats->unlock_fail[false],
                   stats->target, stats->target_fail,
 -                 stats->alloc, stats->alloc_fail,
 +                 stats->alloc[true], stats->alloc_fail[true],
 +                 stats->alloc[false], stats->alloc_fail[false],
                   stats->sleep_alloc, stats->sleep_alloc_fail,
 -                 stats->free,
 -                 stats->refused_alloc, stats->refused_free);
 +                 stats->free[true],
 +                 stats->free[false],
 +                 stats->refused_alloc[true], stats->refused_alloc[false],
 +                 stats->refused_free[true], stats->refused_free[false],
 +                 stats->doorbell_set, stats->doorbell_unset);
  
        return 0;
  }
@@@ -1266,7 -782,7 +1266,7 @@@ static inline void vmballoon_debugfs_ex
  static int __init vmballoon_init(void)
  {
        int error;
 -
 +      unsigned is_2m_pages;
        /*
         * Check if we are running on VMware's hypervisor and bail out
         * if we are not.
        if (x86_hyper != &x86_hyper_vmware)
                return -ENODEV;
  
 -      INIT_LIST_HEAD(&balloon.pages);
 -      INIT_LIST_HEAD(&balloon.refused_pages);
 +      for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
 +                      is_2m_pages++) {
 +              INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
 +              INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
 +      }
  
        /* initialize rates */
        balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
 -      balloon.rate_free = VMW_BALLOON_RATE_FREE_MAX;
  
        INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
  
 -      /*
 -       * Start balloon.
 -       */
 -      if (!vmballoon_send_start(&balloon)) {
 -              pr_err("failed to send start command to the host\n");
 -              return -EIO;
 -      }
 -
 -      if (!vmballoon_send_guest_id(&balloon)) {
 -              pr_err("failed to send guest ID to the host\n");
 -              return -EIO;
 -      }
 -
        error = vmballoon_debugfs_init(&balloon);
        if (error)
                return error;
  
 +      balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
 +      balloon.batch_page = NULL;
 +      balloon.page = NULL;
 +      balloon.reset_required = true;
 +
        queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);
  
        return 0;
@@@ -1302,7 -824,6 +1302,7 @@@ module_init(vmballoon_init)
  
  static void __exit vmballoon_exit(void)
  {
 +      vmballoon_vmci_cleanup(&balloon);
        cancel_delayed_work_sync(&balloon.dwork);
  
        vmballoon_debugfs_exit(&balloon);
         * Reset connection before deallocating memory to avoid potential for
         * additional spurious resets from guest touching deallocated pages.
         */
 -      vmballoon_send_start(&balloon);
 +      vmballoon_send_start(&balloon, 0);
        vmballoon_pop(&balloon);
  }
  module_exit(vmballoon_exit);
diff --combined drivers/mtd/mtdcore.c
index b1eea48c501d11cdb58e862ac73a0b29eb015aba,2dfb291a47c6da6e64caa1788f1329305658b9c5..a9a15c22ef24649ff4ef82385a1128dfef8d7818
@@@ -387,14 -387,6 +387,14 @@@ int add_mtd_device(struct mtd_info *mtd
        struct mtd_notifier *not;
        int i, error;
  
 +      /*
 +       * May occur, for instance, on buggy drivers which call
 +       * mtd_device_parse_register() multiple times on the same master MTD,
 +       * especially with CONFIG_MTD_PARTITIONED_MASTER=y.
 +       */
 +      if (WARN_ONCE(mtd->backing_dev_info, "MTD already registered\n"))
 +              return -EEXIST;
 +
        mtd->backing_dev_info = &mtd_bdi;
  
        BUG_ON(mtd->writesize == 0);
        mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1;
        mtd->writesize_mask = (1 << mtd->writesize_shift) - 1;
  
 +      if (mtd->dev.parent) {
 +              if (!mtd->owner && mtd->dev.parent->driver)
 +                      mtd->owner = mtd->dev.parent->driver->owner;
 +              if (!mtd->name)
 +                      mtd->name = dev_name(mtd->dev.parent);
 +      } else {
 +              pr_debug("mtd device won't show a device symlink in sysfs\n");
 +      }
 +
        /* Some chips always power up locked. Unlock them now */
        if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK)) {
                error = mtd_unlock(mtd, 0, mtd->size);
        }
  
        /* Caller should have set dev.parent to match the
 -       * physical device.
 +       * physical device, if appropriate.
         */
        mtd->dev.type = &mtd_devtype;
        mtd->dev.class = &mtd_class;
@@@ -596,17 -579,9 +596,17 @@@ int mtd_device_parse_register(struct mt
                else
                        ret = nr_parts;
        }
 +      /* Didn't come up with either parsed OR fallback partitions */
 +      if (ret < 0) {
 +              pr_info("mtd: failed to find partitions; one or more parsers reports errors (%d)\n",
 +                      ret);
 +              /* Don't abort on errors; we can still use unpartitioned MTD */
 +              ret = 0;
 +      }
  
 -      if (ret >= 0)
 -              ret = mtd_add_device_partitions(mtd, real_parts, ret);
 +      ret = mtd_add_device_partitions(mtd, real_parts, ret);
 +      if (ret)
 +              goto out;
  
        /*
         * FIXME: some drivers unfortunately call this function more than once.
         * does cause problems with parse_mtd_partitions() above (e.g.,
         * cmdlineparts will register partitions more than once).
         */
 +      WARN_ONCE(mtd->reboot_notifier.notifier_call, "MTD already registered\n");
        if (mtd->_reboot && !mtd->reboot_notifier.notifier_call) {
                mtd->reboot_notifier.notifier_call = mtd_reboot_notifier;
                register_reboot_notifier(&mtd->reboot_notifier);
        }
  
 +out:
        kfree(real_parts);
        return ret;
  }
@@@ -1215,8 -1188,7 +1215,7 @@@ EXPORT_SYMBOL_GPL(mtd_writev)
   */
  void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
  {
-       gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
-                      __GFP_NORETRY | __GFP_NO_KSWAPD;
+       gfp_t flags = __GFP_NOWARN | __GFP_DIRECT_RECLAIM | __GFP_NORETRY;
        size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
        void *kbuf;
  
@@@ -1328,7 -1300,6 +1327,7 @@@ static void __exit cleanup_mtd(void
                remove_proc_entry("mtd", NULL);
        class_unregister(&mtd_class);
        bdi_destroy(&mtd_bdi);
 +      idr_destroy(&mtd_idr);
  }
  
  module_init(init_mtd);
diff --combined drivers/nvme/host/pci.c
index c4bb85934aa2692cd6f46a12666ee2b6eff1c2e6,e917cf304ad0c53b470399a80fb95cca8b1ed2ed..34fae2804e0b4e75a8d6159677a9506aefca343c
@@@ -12,6 -12,7 +12,6 @@@
   * more details.
   */
  
 -#include <linux/nvme.h>
  #include <linux/bitops.h>
  #include <linux/blkdev.h>
  #include <linux/blk-mq.h>
  #include <linux/slab.h>
  #include <linux/t10-pi.h>
  #include <linux/types.h>
 +#include <linux/pr.h>
  #include <scsi/sg.h>
 -#include <asm-generic/io-64-nonatomic-lo-hi.h>
 +#include <linux/io-64-nonatomic-lo-hi.h>
 +#include <asm/unaligned.h>
 +
 +#include <uapi/linux/nvme_ioctl.h>
 +#include "nvme.h"
  
  #define NVME_MINORS           (1U << MINORBITS)
  #define NVME_Q_DEPTH          1024
@@@ -88,10 -84,9 +88,10 @@@ static wait_queue_head_t nvme_kthread_w
  
  static struct class *nvme_class;
  
 -static void nvme_reset_failed_dev(struct work_struct *ws);
 +static int __nvme_reset(struct nvme_dev *dev);
  static int nvme_reset(struct nvme_dev *dev);
  static int nvme_process_cq(struct nvme_queue *nvmeq);
 +static void nvme_dead_ctrl(struct nvme_dev *dev);
  
  struct async_cmd_info {
        struct kthread_work work;
@@@ -540,7 -535,7 +540,7 @@@ static void nvme_dif_remap(struct reque
        virt = bip_get_seed(bip);
        phys = nvme_block_nr(ns, blk_rq_pos(req));
        nlb = (blk_rq_bytes(req) >> ns->lba_shift);
 -      ts = ns->disk->integrity->tuple_size;
 +      ts = ns->disk->queue->integrity.tuple_size;
  
        for (i = 0; i < nlb; i++, virt++, phys++) {
                pi = (struct t10_pi_tuple *)p;
        kunmap_atomic(pmap);
  }
  
 -static int nvme_noop_verify(struct blk_integrity_iter *iter)
 -{
 -      return 0;
 -}
 -
 -static int nvme_noop_generate(struct blk_integrity_iter *iter)
 -{
 -      return 0;
 -}
 -
 -struct blk_integrity nvme_meta_noop = {
 -      .name                   = "NVME_META_NOOP",
 -      .generate_fn            = nvme_noop_generate,
 -      .verify_fn              = nvme_noop_verify,
 -};
 -
  static void nvme_init_integrity(struct nvme_ns *ns)
  {
        struct blk_integrity integrity;
  
        switch (ns->pi_type) {
        case NVME_NS_DPS_PI_TYPE3:
 -              integrity = t10_pi_type3_crc;
 +              integrity.profile = &t10_pi_type3_crc;
                break;
        case NVME_NS_DPS_PI_TYPE1:
        case NVME_NS_DPS_PI_TYPE2:
 -              integrity = t10_pi_type1_crc;
 +              integrity.profile = &t10_pi_type1_crc;
                break;
        default:
 -              integrity = nvme_meta_noop;
 +              integrity.profile = NULL;
                break;
        }
        integrity.tuple_size = ns->ms;
@@@ -592,31 -603,27 +592,31 @@@ static void req_completion(struct nvme_
        struct nvme_iod *iod = ctx;
        struct request *req = iod_get_private(iod);
        struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 -
        u16 status = le16_to_cpup(&cqe->status) >> 1;
 +      bool requeue = false;
 +      int error = 0;
  
        if (unlikely(status)) {
                if (!(status & NVME_SC_DNR || blk_noretry_request(req))
                    && (jiffies - req->start_time) < req->timeout) {
                        unsigned long flags;
  
 +                      requeue = true;
                        blk_mq_requeue_request(req);
                        spin_lock_irqsave(req->q->queue_lock, flags);
                        if (!blk_queue_stopped(req->q))
                                blk_mq_kick_requeue_list(req->q);
                        spin_unlock_irqrestore(req->q->queue_lock, flags);
 -                      return;
 +                      goto release_iod;
                }
  
                if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
                        if (cmd_rq->ctx == CMD_CTX_CANCELLED)
 -                              status = -EINTR;
 +                              error = -EINTR;
 +                      else
 +                              error = status;
                } else {
 -                      status = nvme_error_status(status);
 +                      error = nvme_error_status(status);
                }
        }
  
        if (cmd_rq->aborted)
                dev_warn(nvmeq->dev->dev,
                        "completing aborted command with status:%04x\n",
 -                      status);
 +                      error);
  
 +release_iod:
        if (iod->nents) {
                dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
                        rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
        }
        nvme_free_iod(nvmeq->dev, iod);
  
 -      blk_mq_complete_request(req, status);
 +      if (likely(!requeue))
 +              blk_mq_complete_request(req, error);
  }
  
  /* length is in bytes.  gfp flags indicates whether we may sleep. */
@@@ -1025,11 -1030,11 +1025,11 @@@ int __nvme_submit_sync_cmd(struct reque
        req->special = (void *)0;
  
        if (buffer && bufflen) {
-               ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+               ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_RECLAIM);
                if (ret)
                        goto out;
        } else if (ubuffer && bufflen) {
-               ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+               ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_RECLAIM);
                if (ret)
                        goto out;
                bio = req->bio;
@@@ -1272,13 -1277,18 +1272,13 @@@ static void nvme_abort_req(struct reque
        struct nvme_command cmd;
  
        if (!nvmeq->qid || cmd_rq->aborted) {
 -              unsigned long flags;
 -
 -              spin_lock_irqsave(&dev_list_lock, flags);
 -              if (work_busy(&dev->reset_work))
 -                      goto out;
 -              list_del_init(&dev->node);
 -              dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
 -                                                      req->tag, nvmeq->qid);
 -              dev->reset_workfn = nvme_reset_failed_dev;
 -              queue_work(nvme_workq, &dev->reset_work);
 - out:
 -              spin_unlock_irqrestore(&dev_list_lock, flags);
 +              spin_lock(&dev_list_lock);
 +              if (!__nvme_reset(dev)) {
 +                      dev_warn(dev->dev,
 +                               "I/O %d QID %d timeout, reset controller\n",
 +                               req->tag, nvmeq->qid);
 +              }
 +              spin_unlock(&dev_list_lock);
                return;
        }
  
@@@ -1794,7 -1804,7 +1794,7 @@@ static int nvme_submit_io(struct nvme_n
  
        length = (io.nblocks + 1) << ns->lba_shift;
        meta_len = (io.nblocks + 1) * ns->ms;
 -      metadata = (void __user *)(unsigned long)io.metadata;
 +      metadata = (void __user *)(uintptr_t)io.metadata;
        write = io.opcode & 1;
  
        if (ns->ext) {
        c.rw.metadata = cpu_to_le64(meta_dma);
  
        status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
 -                      (void __user *)io.addr, length, NULL, 0);
 +                      (void __user *)(uintptr_t)io.addr, length, NULL, 0);
   unmap:
        if (meta) {
                if (status == NVME_SC_SUCCESS && !write) {
@@@ -1876,7 -1886,7 +1876,7 @@@ static int nvme_user_cmd(struct nvme_de
                timeout = msecs_to_jiffies(cmd.timeout_ms);
  
        status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
 -                      NULL, (void __user *)cmd.addr, cmd.data_len,
 +                      NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
                        &cmd.result, timeout);
        if (status >= 0) {
                if (put_user(cmd.result, &ucmd->result))
@@@ -1933,23 -1943,6 +1933,23 @@@ static int nvme_compat_ioctl(struct blo
  #define nvme_compat_ioctl     NULL
  #endif
  
 +static void nvme_free_dev(struct kref *kref);
 +static void nvme_free_ns(struct kref *kref)
 +{
 +      struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
 +
 +      if (ns->type == NVME_NS_LIGHTNVM)
 +              nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
 +
 +      spin_lock(&dev_list_lock);
 +      ns->disk->private_data = NULL;
 +      spin_unlock(&dev_list_lock);
 +
 +      kref_put(&ns->dev->kref, nvme_free_dev);
 +      put_disk(ns->disk);
 +      kfree(ns);
 +}
 +
  static int nvme_open(struct block_device *bdev, fmode_t mode)
  {
        int ret = 0;
        ns = bdev->bd_disk->private_data;
        if (!ns)
                ret = -ENXIO;
 -      else if (!kref_get_unless_zero(&ns->dev->kref))
 +      else if (!kref_get_unless_zero(&ns->kref))
                ret = -ENXIO;
        spin_unlock(&dev_list_lock);
  
        return ret;
  }
  
 -static void nvme_free_dev(struct kref *kref);
 -
  static void nvme_release(struct gendisk *disk, fmode_t mode)
  {
        struct nvme_ns *ns = disk->private_data;
 -      struct nvme_dev *dev = ns->dev;
 -
 -      kref_put(&dev->kref, nvme_free_dev);
 +      kref_put(&ns->kref, nvme_free_ns);
  }
  
  static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
@@@ -2010,16 -2007,6 +2010,16 @@@ static int nvme_revalidate_disk(struct 
                return -ENODEV;
        }
  
 +      if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
 +              if (nvme_nvm_register(ns->queue, disk->disk_name)) {
 +                      dev_warn(dev->dev,
 +                              "%s: LightNVM init failure\n", __func__);
 +                      kfree(id);
 +                      return -ENODEV;
 +              }
 +              ns->type = NVME_NS_LIGHTNVM;
 +      }
 +
        old_ms = ns->ms;
        lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
        ns->lba_shift = id->lbaf[lbaf].ds;
        pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
                                        id->dps & NVME_NS_DPS_PI_MASK : 0;
  
 +      blk_mq_freeze_queue(disk->queue);
        if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
                                ns->ms != old_ms ||
                                bs != queue_logical_block_size(disk->queue) ||
        ns->pi_type = pi_type;
        blk_queue_logical_block_size(ns->queue, bs);
  
 -      if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) &&
 -                                                              !ns->ext)
 +      if (ns->ms && !ns->ext)
                nvme_init_integrity(ns);
  
 -      if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
 +      if ((ns->ms && !(ns->ms == 8 && ns->pi_type) &&
 +                                              !blk_get_integrity(disk)) ||
 +                                              ns->type == NVME_NS_LIGHTNVM)
                set_capacity(disk, 0);
        else
                set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
  
        if (dev->oncs & NVME_CTRL_ONCS_DSM)
                nvme_config_discard(ns);
 +      blk_mq_unfreeze_queue(disk->queue);
  
        kfree(id);
        return 0;
  }
  
 +static char nvme_pr_type(enum pr_type type)
 +{
 +      switch (type) {
 +      case PR_WRITE_EXCLUSIVE:
 +              return 1;
 +      case PR_EXCLUSIVE_ACCESS:
 +              return 2;
 +      case PR_WRITE_EXCLUSIVE_REG_ONLY:
 +              return 3;
 +      case PR_EXCLUSIVE_ACCESS_REG_ONLY:
 +              return 4;
 +      case PR_WRITE_EXCLUSIVE_ALL_REGS:
 +              return 5;
 +      case PR_EXCLUSIVE_ACCESS_ALL_REGS:
 +              return 6;
 +      default:
 +              return 0;
 +      }
 +};
 +
 +static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
 +                              u64 key, u64 sa_key, u8 op)
 +{
 +      struct nvme_ns *ns = bdev->bd_disk->private_data;
 +      struct nvme_command c;
 +      u8 data[16] = { 0, };
 +
 +      put_unaligned_le64(key, &data[0]);
 +      put_unaligned_le64(sa_key, &data[8]);
 +
 +      memset(&c, 0, sizeof(c));
 +      c.common.opcode = op;
 +      c.common.nsid = cpu_to_le32(ns->ns_id);
 +      c.common.cdw10[0] = cpu_to_le32(cdw10);
 +
 +      return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
 +}
 +
 +static int nvme_pr_register(struct block_device *bdev, u64 old,
 +              u64 new, unsigned flags)
 +{
 +      u32 cdw10;
 +
 +      if (flags & ~PR_FL_IGNORE_KEY)
 +              return -EOPNOTSUPP;
 +
 +      cdw10 = old ? 2 : 0;
 +      cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
 +      cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
 +      return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
 +}
 +
 +static int nvme_pr_reserve(struct block_device *bdev, u64 key,
 +              enum pr_type type, unsigned flags)
 +{
 +      u32 cdw10;
 +
 +      if (flags & ~PR_FL_IGNORE_KEY)
 +              return -EOPNOTSUPP;
 +
 +      cdw10 = nvme_pr_type(type) << 8;
 +      cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
 +      return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
 +}
 +
 +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
 +              enum pr_type type, bool abort)
 +{
 +      u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
 +      return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
 +}
 +
 +static int nvme_pr_clear(struct block_device *bdev, u64 key)
 +{
 +      u32 cdw10 = 1 | (key ? 1 << 3 : 0);
 +      return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
 +}
 +
 +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
 +{
 +      u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
 +      return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
 +}
 +
 +static const struct pr_ops nvme_pr_ops = {
 +      .pr_register    = nvme_pr_register,
 +      .pr_reserve     = nvme_pr_reserve,
 +      .pr_release     = nvme_pr_release,
 +      .pr_preempt     = nvme_pr_preempt,
 +      .pr_clear       = nvme_pr_clear,
 +};
 +
  static const struct block_device_operations nvme_fops = {
        .owner          = THIS_MODULE,
        .ioctl          = nvme_ioctl,
        .release        = nvme_release,
        .getgeo         = nvme_getgeo,
        .revalidate_disk= nvme_revalidate_disk,
 +      .pr_ops         = &nvme_pr_ops,
  };
  
  static int nvme_kthread(void *data)
  
                        if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
                                                        csts & NVME_CSTS_CFS) {
 -                              if (work_busy(&dev->reset_work))
 -                                      continue;
 -                              list_del_init(&dev->node);
 -                              dev_warn(dev->dev,
 -                                      "Failed status: %x, reset controller\n",
 -                                      readl(&dev->bar->csts));
 -                              dev->reset_workfn = nvme_reset_failed_dev;
 -                              queue_work(nvme_workq, &dev->reset_work);
 +                              if (!__nvme_reset(dev)) {
 +                                      dev_warn(dev->dev,
 +                                              "Failed status: %x, reset controller\n",
 +                                              readl(&dev->bar->csts));
 +                              }
                                continue;
                        }
                        for (i = 0; i < dev->queue_count; i++) {
@@@ -2232,7 -2126,6 +2232,7 @@@ static void nvme_alloc_ns(struct nvme_d
        if (!disk)
                goto out_free_queue;
  
 +      kref_init(&ns->kref);
        ns->ns_id = nsid;
        ns->disk = disk;
        ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
        if (nvme_revalidate_disk(ns->disk))
                goto out_free_disk;
  
 -      add_disk(ns->disk);
 -      if (ns->ms) {
 -              struct block_device *bd = bdget_disk(ns->disk, 0);
 -              if (!bd)
 -                      return;
 -              if (blkdev_get(bd, FMODE_READ, NULL)) {
 -                      bdput(bd);
 -                      return;
 +      kref_get(&dev->kref);
 +      if (ns->type != NVME_NS_LIGHTNVM) {
 +              add_disk(ns->disk);
 +              if (ns->ms) {
 +                      struct block_device *bd = bdget_disk(ns->disk, 0);
 +                      if (!bd)
 +                              return;
 +                      if (blkdev_get(bd, FMODE_READ, NULL)) {
 +                              bdput(bd);
 +                              return;
 +                      }
 +                      blkdev_reread_part(bd);
 +                      blkdev_put(bd, FMODE_READ);
                }
 -              blkdev_reread_part(bd);
 -              blkdev_put(bd, FMODE_READ);
        }
        return;
   out_free_disk:
        kfree(ns);
  }
  
 +/*
 + * Create I/O queues.  Failing to create an I/O queue is not an issue,
 + * we can continue with fewer than the desired number of queues, and
 + * even a controller without I/O queues can still be used to issue
 + * admin commands.  This might be useful to upgrade a buggy firmware
 + * for example.
 + */
  static void nvme_create_io_queues(struct nvme_dev *dev)
  {
        unsigned i;
                        break;
  
        for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
 -              if (nvme_create_queue(dev->queues[i], i))
 +              if (nvme_create_queue(dev->queues[i], i)) {
 +                      nvme_free_queues(dev, i);
                        break;
 +              }
  }
  
  static int set_queue_count(struct nvme_dev *dev, int count)
@@@ -2476,6 -2357,18 +2476,6 @@@ static int nvme_setup_io_queues(struct 
        return result;
  }
  
 -static void nvme_free_namespace(struct nvme_ns *ns)
 -{
 -      list_del(&ns->list);
 -
 -      spin_lock(&dev_list_lock);
 -      ns->disk->private_data = NULL;
 -      spin_unlock(&dev_list_lock);
 -
 -      put_disk(ns->disk);
 -      kfree(ns);
 -}
 -
  static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
  {
        struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
@@@ -2509,14 -2402,15 +2509,14 @@@ static void nvme_ns_remove(struct nvme_
  
        if (kill)
                blk_set_queue_dying(ns->queue);
 -      if (ns->disk->flags & GENHD_FL_UP) {
 -              if (blk_get_integrity(ns->disk))
 -                      blk_integrity_unregister(ns->disk);
 +      if (ns->disk->flags & GENHD_FL_UP)
                del_gendisk(ns->disk);
 -      }
        if (kill || !blk_queue_dying(ns->queue)) {
                blk_mq_abort_requeue_list(ns->queue);
                blk_cleanup_queue(ns->queue);
 -        }
 +      }
 +      list_del_init(&ns->list);
 +      kref_put(&ns->kref, nvme_free_ns);
  }
  
  static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
        for (i = 1; i <= nn; i++) {
                ns = nvme_find_ns(dev, i);
                if (ns) {
 -                      if (revalidate_disk(ns->disk)) {
 +                      if (revalidate_disk(ns->disk))
                                nvme_ns_remove(ns);
 -                              nvme_free_namespace(ns);
 -                      }
                } else
                        nvme_alloc_ns(dev, i);
        }
        list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
 -              if (ns->ns_id > nn) {
 +              if (ns->ns_id > nn)
                        nvme_ns_remove(ns);
 -                      nvme_free_namespace(ns);
 -              }
        }
        list_sort(NULL, &dev->namespaces, ns_cmp);
  }
@@@ -2924,9 -2822,9 +2924,9 @@@ static void nvme_dev_shutdown(struct nv
  
  static void nvme_dev_remove(struct nvme_dev *dev)
  {
 -      struct nvme_ns *ns;
 +      struct nvme_ns *ns, *next;
  
 -      list_for_each_entry(ns, &dev->namespaces, list)
 +      list_for_each_entry_safe(ns, next, &dev->namespaces, list)
                nvme_ns_remove(ns);
  }
  
@@@ -2982,12 -2880,21 +2982,12 @@@ static void nvme_release_instance(struc
        spin_unlock(&dev_list_lock);
  }
  
 -static void nvme_free_namespaces(struct nvme_dev *dev)
 -{
 -      struct nvme_ns *ns, *next;
 -
 -      list_for_each_entry_safe(ns, next, &dev->namespaces, list)
 -              nvme_free_namespace(ns);
 -}
 -
  static void nvme_free_dev(struct kref *kref)
  {
        struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
  
        put_device(dev->dev);
        put_device(dev->device);
 -      nvme_free_namespaces(dev);
        nvme_release_instance(dev);
        if (dev->tagset.tags)
                blk_mq_free_tag_set(&dev->tagset);
@@@ -3061,15 -2968,14 +3061,15 @@@ static const struct file_operations nvm
        .compat_ioctl   = nvme_dev_ioctl,
  };
  
 -static int nvme_dev_start(struct nvme_dev *dev)
 +static void nvme_probe_work(struct work_struct *work)
  {
 -      int result;
 +      struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
        bool start_thread = false;
 +      int result;
  
        result = nvme_dev_map(dev);
        if (result)
 -              return result;
 +              goto out;
  
        result = nvme_configure_admin_queue(dev);
        if (result)
                goto free_tags;
  
        dev->event_limit = 1;
 -      return result;
 +
 +      /*
 +       * Keep the controller around but remove all namespaces if we don't have
 +       * any working I/O queue.
 +       */
 +      if (dev->online_queues < 2) {
 +              dev_warn(dev->dev, "IO queues not created\n");
 +              nvme_dev_remove(dev);
 +      } else {
 +              nvme_unfreeze_queues(dev);
 +              nvme_dev_add(dev);
 +      }
 +
 +      return;
  
   free_tags:
        nvme_dev_remove_admin(dev);
        nvme_dev_list_remove(dev);
   unmap:
        nvme_dev_unmap(dev);
 -      return result;
 + out:
 +      if (!work_busy(&dev->reset_work))
 +              nvme_dead_ctrl(dev);
  }
  
  static int nvme_remove_dead_ctrl(void *arg)
        return 0;
  }
  
 -static void nvme_remove_disks(struct work_struct *ws)
 -{
 -      struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
 -
 -      nvme_free_queues(dev, 1);
 -      nvme_dev_remove(dev);
 -}
 -
 -static int nvme_dev_resume(struct nvme_dev *dev)
 -{
 -      int ret;
 -
 -      ret = nvme_dev_start(dev);
 -      if (ret)
 -              return ret;
 -      if (dev->online_queues < 2) {
 -              spin_lock(&dev_list_lock);
 -              dev->reset_workfn = nvme_remove_disks;
 -              queue_work(nvme_workq, &dev->reset_work);
 -              spin_unlock(&dev_list_lock);
 -      } else {
 -              nvme_unfreeze_queues(dev);
 -              nvme_dev_add(dev);
 -      }
 -      return 0;
 -}
 -
  static void nvme_dead_ctrl(struct nvme_dev *dev)
  {
        dev_warn(dev->dev, "Device failed to resume\n");
        }
  }
  
 -static void nvme_dev_reset(struct nvme_dev *dev)
 +static void nvme_reset_work(struct work_struct *ws)
  {
 +      struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
        bool in_probe = work_busy(&dev->probe_work);
  
        nvme_dev_shutdown(dev);
        schedule_work(&dev->probe_work);
  }
  
 -static void nvme_reset_failed_dev(struct work_struct *ws)
 +static int __nvme_reset(struct nvme_dev *dev)
  {
 -      struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
 -      nvme_dev_reset(dev);
 -}
 -
 -static void nvme_reset_workfn(struct work_struct *work)
 -{
 -      struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
 -      dev->reset_workfn(work);
 +      if (work_pending(&dev->reset_work))
 +              return -EBUSY;
 +      list_del_init(&dev->node);
 +      queue_work(nvme_workq, &dev->reset_work);
 +      return 0;
  }
  
  static int nvme_reset(struct nvme_dev *dev)
  {
 -      int ret = -EBUSY;
 +      int ret;
  
        if (!dev->admin_q || blk_queue_dying(dev->admin_q))
                return -ENODEV;
  
        spin_lock(&dev_list_lock);
 -      if (!work_pending(&dev->reset_work)) {
 -              dev->reset_workfn = nvme_reset_failed_dev;
 -              queue_work(nvme_workq, &dev->reset_work);
 -              ret = 0;
 -      }
 +      ret = __nvme_reset(dev);
        spin_unlock(&dev_list_lock);
  
        if (!ret) {
@@@ -3223,6 -3147,7 +3223,6 @@@ static ssize_t nvme_sysfs_reset(struct 
  }
  static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
  
 -static void nvme_async_probe(struct work_struct *work);
  static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  {
        int node, result = -ENOMEM;
                goto free;
  
        INIT_LIST_HEAD(&dev->namespaces);
 -      dev->reset_workfn = nvme_reset_failed_dev;
 -      INIT_WORK(&dev->reset_work, nvme_reset_workfn);
 +      INIT_WORK(&dev->reset_work, nvme_reset_work);
        dev->dev = get_device(&pdev->dev);
        pci_set_drvdata(pdev, dev);
        result = nvme_set_instance(dev);
  
        INIT_LIST_HEAD(&dev->node);
        INIT_WORK(&dev->scan_work, nvme_dev_scan);
 -      INIT_WORK(&dev->probe_work, nvme_async_probe);
 +      INIT_WORK(&dev->probe_work, nvme_probe_work);
        schedule_work(&dev->probe_work);
        return 0;
  
        return result;
  }
  
 -static void nvme_async_probe(struct work_struct *work)
 -{
 -      struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
 -
 -      if (nvme_dev_resume(dev) && !work_busy(&dev->reset_work))
 -              nvme_dead_ctrl(dev);
 -}
 -
  static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
  {
        struct nvme_dev *dev = pci_get_drvdata(pdev);
        if (prepare)
                nvme_dev_shutdown(dev);
        else
 -              nvme_dev_resume(dev);
 +              schedule_work(&dev->probe_work);
  }
  
  static void nvme_shutdown(struct pci_dev *pdev)
@@@ -3354,7 -3288,10 +3354,7 @@@ static int nvme_resume(struct device *d
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);
  
 -      if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
 -              ndev->reset_workfn = nvme_reset_failed_dev;
 -              queue_work(nvme_workq, &ndev->reset_work);
 -      }
 +      schedule_work(&ndev->probe_work);
        return 0;
  }
  #endif
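
The nvme_pr_ops wired up above are not called directly; they are reached from user space through the block layer's persistent-reservation ioctls. Below is a minimal sketch of such a caller. It assumes the <linux/pr.h> uapi header from the same series, a hypothetical /dev/nvme0n1 namespace and an arbitrary example key, so treat it as an illustration rather than part of this merge.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/pr.h>		/* IOC_PR_*, struct pr_registration/pr_reservation */

int main(void)
{
	/* hypothetical namespace block device */
	int fd = open("/dev/nvme0n1", O_RDWR);
	struct pr_registration reg = { .old_key = 0, .new_key = 0x123abc };
	struct pr_reservation rsv = { .key = 0x123abc,
				      .type = PR_WRITE_EXCLUSIVE };

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* routed to ->pr_register(), i.e. an NVMe Reservation Register command */
	if (ioctl(fd, IOC_PR_REGISTER, &reg))
		perror("IOC_PR_REGISTER");
	/* routed to ->pr_reserve(), i.e. an NVMe Reservation Acquire command */
	if (ioctl(fd, IOC_PR_RESERVE, &rsv))
		perror("IOC_PR_RESERVE");
	return 0;
}
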
index ada724aab3d586ebb81193909a2518a1e68872fb,d4cdbf28dbb6701fae141493dbad5516eb5d4d8e..d4c3e5512dd54dbcf0b3f6d9a927a1e8bbb3fc5a
@@@ -27,7 -27,7 +27,7 @@@
  #include "ion_priv.h"
  
  static gfp_t high_order_gfp_flags = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN |
-                                    __GFP_NORETRY) & ~__GFP_WAIT;
+                                    __GFP_NORETRY) & ~__GFP_DIRECT_RECLAIM;
  static gfp_t low_order_gfp_flags  = (GFP_HIGHUSER | __GFP_ZERO | __GFP_NOWARN);
  static const unsigned int orders[] = {8, 4, 0};
  static const int num_orders = ARRAY_SIZE(orders);
@@@ -185,11 -185,8 +185,11 @@@ static void ion_system_heap_free(struc
        struct scatterlist *sg;
        int i;
  
 -      /* uncached pages come from the page pools, zero them before returning
 -         for security purposes (other allocations are zerod at alloc time */
 +      /*
 +       * Uncached pages come from the page pools; zero them before
 +       * returning for security purposes (other allocations are
 +       * zeroed at alloc time).
 +       */
        if (!cached && !(buffer->private_flags & ION_PRIV_FLAG_SHRINKER_FREE))
                ion_heap_buffer_zero(buffer);
  
index 6af733de69caddbf7672b80e75e1827304e7b237,78bde2c11b50ba44928d0a27a2d50d2a2d9e6d05..f0b0423a716bd09252b74f5f9d22506fddc7c4dd
@@@ -79,7 -79,7 +79,7 @@@ do {                                                                  
  
  #define KLASSERT(e) LASSERT(e)
  
 -void lbug_with_loc(struct libcfs_debug_msg_data *)__attribute__((noreturn));
 +void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *);
  
  #define LBUG()                                                          \
  do {                                                              \
@@@ -95,7 -95,7 +95,7 @@@
  do {                                                                      \
        LASSERT(!in_interrupt() ||                                          \
                ((size) <= LIBCFS_VMALLOC_SIZE &&                           \
-                ((mask) & __GFP_WAIT) == 0));                              \
+                !gfpflags_allow_blocking(mask)));                          \
  } while (0)
  
  #define LIBCFS_ALLOC_POST(ptr, size)                                      \
index 47a1202fcbdf5117e66a8c78da95c80bbffdf459,29fff7f2a45a1f59334c3783d0f0c7ecc54001ec..8666f3ad24e9960bfb85b6814de6c277ae37b522
@@@ -134,8 -134,11 +134,8 @@@ int hfi1_create_ctxts(struct hfi1_devda
        dd->assigned_node_id = local_node_id;
  
        dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL);
 -      if (!dd->rcd) {
 -              dd_dev_err(dd,
 -                      "Unable to allocate receive context array, failing\n");
 +      if (!dd->rcd)
                goto nomem;
 -      }
  
        /* create one or more kernel contexts */
        for (i = 0; i < dd->first_user_ctxt; ++i) {
@@@ -290,14 -293,12 +290,14 @@@ struct hfi1_ctxtdata *hfi1_create_ctxtd
                 * The resulting value will be rounded down to the closest
                 * multiple of dd->rcv_entries.group_size.
                 */
 -              rcd->egrbufs.buffers = kzalloc(sizeof(*rcd->egrbufs.buffers) *
 -                                             rcd->egrbufs.count, GFP_KERNEL);
 +              rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
 +                                             sizeof(*rcd->egrbufs.buffers),
 +                                             GFP_KERNEL);
                if (!rcd->egrbufs.buffers)
                        goto bail;
 -              rcd->egrbufs.rcvtids = kzalloc(sizeof(*rcd->egrbufs.rcvtids) *
 -                                             rcd->egrbufs.count, GFP_KERNEL);
 +              rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
 +                                             sizeof(*rcd->egrbufs.rcvtids),
 +                                             GFP_KERNEL);
                if (!rcd->egrbufs.rcvtids)
                        goto bail;
                rcd->egrbufs.size = eager_buffer_size;
                if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
                        rcd->opstats = kzalloc(sizeof(*rcd->opstats),
                                GFP_KERNEL);
 -                      if (!rcd->opstats) {
 -                              dd_dev_err(dd,
 -                                         "ctxt%u: Unable to allocate per ctxt stats buffer\n",
 -                                         rcd->ctxt);
 +                      if (!rcd->opstats)
                                goto bail;
 -                      }
                }
        }
        return rcd;
@@@ -413,7 -418,6 +413,7 @@@ static enum hrtimer_restart cca_timer_f
        int sl;
        u16 ccti, ccti_timer, ccti_min;
        struct cc_state *cc_state;
 +      unsigned long flags;
  
        cca_timer = container_of(t, struct cca_timer, hrtimer);
        ppd = cca_timer->ppd;
        ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
        ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
  
 -      spin_lock(&ppd->cca_timer_lock);
 +      spin_lock_irqsave(&ppd->cca_timer_lock, flags);
  
        ccti = cca_timer->ccti;
  
                set_link_ipg(ppd);
        }
  
 -      spin_unlock(&ppd->cca_timer_lock);
 +      spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
  
        rcu_read_unlock();
  
@@@ -1046,8 -1050,8 +1046,8 @@@ struct hfi1_devdata *hfi1_alloc_devdata
        if (!hfi1_cpulist_count) {
                u32 count = num_online_cpus();
  
 -              hfi1_cpulist = kzalloc(BITS_TO_LONGS(count) *
 -                                    sizeof(long), GFP_KERNEL);
 +              hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
 +                                     GFP_KERNEL);
                if (hfi1_cpulist)
                        hfi1_cpulist_count = count;
                else
@@@ -1560,7 -1564,7 +1560,7 @@@ int hfi1_setup_eagerbufs(struct hfi1_ct
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
-       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
  
        /*
         * The minimum size of the eager buffers is a groups of MTU-sized
index 5d9b9dbd8fc44804fbf7a4256782ecc7d4f7630f,c11f6c58ce534df07c91f01c57a2ddbb8f1ba1f2..13c3cd11ab92a5c610ef7a5e32795a0b0a0931c9
@@@ -825,13 -825,13 +825,13 @@@ static void ipath_clean_part_key(struc
                                ipath_stats.sps_pkeys[j] =
                                        dd->ipath_pkeys[j] = 0;
                                pchanged++;
 +                      } else {
 +                              ipath_cdbg(VERBOSE, "p%u key %x matches #%d, "
 +                                         "but ref still %d\n", pd->port_port,
 +                                         pd->port_pkeys[i], j,
 +                                         atomic_read(&dd->ipath_pkeyrefs[j]));
 +                              break;
                        }
 -                      else ipath_cdbg(
 -                              VERBOSE, "p%u key %x matches #%d, "
 -                              "but ref still %d\n", pd->port_port,
 -                              pd->port_pkeys[i], j,
 -                              atomic_read(&dd->ipath_pkeyrefs[j]));
 -                      break;
                }
                pd->port_pkeys[i] = 0;
        }
@@@ -905,7 -905,7 +905,7 @@@ static int ipath_create_user_egr(struc
         * heavy filesystem activity makes these fail, and we can
         * use compound pages.
         */
-       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+       gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
  
        egrcnt = dd->ipath_rcvegrcnt;
        /* TID number offset for this port */
@@@ -2046,6 -2046,7 +2046,6 @@@ static void unlock_expected_tids(struc
  
  static int ipath_close(struct inode *in, struct file *fp)
  {
 -      int ret = 0;
        struct ipath_filedata *fd;
        struct ipath_portdata *pd;
        struct ipath_devdata *dd;
  
  bail:
        kfree(fd);
 -      return ret;
 +      return 0;
  }
  
  static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
index c69b650b7bba67ef5f1a885027b17f36f44dd3d7,9d1e3b3d39ca977381cdddd0c2502a08d44ba1d5..223ccf89d2263fb6483a1a22116a3743c9d7ed8e
@@@ -2258,10 -2258,12 +2258,10 @@@ reset
                /* Disable the endpoints */
                if (fsg->bulk_in_enabled) {
                        usb_ep_disable(fsg->bulk_in);
 -                      fsg->bulk_in->driver_data = NULL;
                        fsg->bulk_in_enabled = 0;
                }
                if (fsg->bulk_out_enabled) {
                        usb_ep_disable(fsg->bulk_out);
 -                      fsg->bulk_out->driver_data = NULL;
                        fsg->bulk_out_enabled = 0;
                }
  
@@@ -2345,7 -2347,6 +2345,6 @@@ static void fsg_disable(struct usb_func
  
  static void handle_exception(struct fsg_common *common)
  {
-       siginfo_t               info;
        int                     i;
        struct fsg_buffhd       *bh;
        enum fsg_state          old_state;
         * into a high-priority EXIT exception.
         */
        for (;;) {
-               int sig =
-                       dequeue_signal_lock(current, &current->blocked, &info);
+               int sig = kernel_dequeue_signal(NULL);
                if (!sig)
                        break;
                if (sig != SIGUSR1) {
@@@ -2660,12 -2660,10 +2658,12 @@@ EXPORT_SYMBOL_GPL(fsg_common_put)
  /* check if fsg_num_buffers is within a valid range */
  static inline int fsg_num_buffers_validate(unsigned int fsg_num_buffers)
  {
 -      if (fsg_num_buffers >= 2 && fsg_num_buffers <= 4)
 +#define FSG_MAX_NUM_BUFFERS   32
 +
 +      if (fsg_num_buffers >= 2 && fsg_num_buffers <= FSG_MAX_NUM_BUFFERS)
                return 0;
        pr_err("fsg_num_buffers %u is out of range (%d to %d)\n",
 -             fsg_num_buffers, 2, 4);
 +             fsg_num_buffers, 2, FSG_MAX_NUM_BUFFERS);
        return -EINVAL;
  }
  
@@@ -3070,11 -3068,13 +3068,11 @@@ static int fsg_bind(struct usb_configur
        ep = usb_ep_autoconfig(gadget, &fsg_fs_bulk_in_desc);
        if (!ep)
                goto autoconf_fail;
 -      ep->driver_data = fsg->common;  /* claim the endpoint */
        fsg->bulk_in = ep;
  
        ep = usb_ep_autoconfig(gadget, &fsg_fs_bulk_out_desc);
        if (!ep)
                goto autoconf_fail;
 -      ep->driver_data = fsg->common;  /* claim the endpoint */
        fsg->bulk_out = ep;
  
        /* Assume endpoint addresses are the same for both speeds */
@@@ -3142,6 -3142,9 +3140,6 @@@ static inline struct fsg_opts *to_fsg_o
                            func_inst.group);
  }
  
 -CONFIGFS_ATTR_STRUCT(fsg_lun_opts);
 -CONFIGFS_ATTR_OPS(fsg_lun_opts);
 -
  static void fsg_lun_attr_release(struct config_item *item)
  {
        struct fsg_lun_opts *lun_opts;
  
  static struct configfs_item_operations fsg_lun_item_ops = {
        .release                = fsg_lun_attr_release,
 -      .show_attribute         = fsg_lun_opts_attr_show,
 -      .store_attribute        = fsg_lun_opts_attr_store,
  };
  
 -static ssize_t fsg_lun_opts_file_show(struct fsg_lun_opts *opts, char *page)
 +static ssize_t fsg_lun_opts_file_show(struct config_item *item, char *page)
  {
 -      struct fsg_opts *fsg_opts;
 -
 -      fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
 +      struct fsg_lun_opts *opts = to_fsg_lun_opts(item);
 +      struct fsg_opts *fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
  
        return fsg_show_file(opts->lun, &fsg_opts->common->filesem, page);
  }
  
 -static ssize_t fsg_lun_opts_file_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_file_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      struct fsg_opts *fsg_opts;
 -
 -      fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
 +      struct fsg_lun_opts *opts = to_fsg_lun_opts(item);
 +      struct fsg_opts *fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
  
        return fsg_store_file(opts->lun, &fsg_opts->common->filesem, page, len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_file =
 -      __CONFIGFS_ATTR(file, S_IRUGO | S_IWUSR, fsg_lun_opts_file_show,
 -                      fsg_lun_opts_file_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, file);
  
 -static ssize_t fsg_lun_opts_ro_show(struct fsg_lun_opts *opts, char *page)
 +static ssize_t fsg_lun_opts_ro_show(struct config_item *item, char *page)
  {
 -      return fsg_show_ro(opts->lun, page);
 +      return fsg_show_ro(to_fsg_lun_opts(item)->lun, page);
  }
  
 -static ssize_t fsg_lun_opts_ro_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_ro_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      struct fsg_opts *fsg_opts;
 -
 -      fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
 +      struct fsg_lun_opts *opts = to_fsg_lun_opts(item);
 +      struct fsg_opts *fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
  
        return fsg_store_ro(opts->lun, &fsg_opts->common->filesem, page, len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_ro =
 -      __CONFIGFS_ATTR(ro, S_IRUGO | S_IWUSR, fsg_lun_opts_ro_show,
 -                      fsg_lun_opts_ro_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, ro);
  
 -static ssize_t fsg_lun_opts_removable_show(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_removable_show(struct config_item *item,
                                           char *page)
  {
 -      return fsg_show_removable(opts->lun, page);
 +      return fsg_show_removable(to_fsg_lun_opts(item)->lun, page);
  }
  
 -static ssize_t fsg_lun_opts_removable_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_removable_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      return fsg_store_removable(opts->lun, page, len);
 +      return fsg_store_removable(to_fsg_lun_opts(item)->lun, page, len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_removable =
 -      __CONFIGFS_ATTR(removable, S_IRUGO | S_IWUSR,
 -                      fsg_lun_opts_removable_show,
 -                      fsg_lun_opts_removable_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, removable);
  
 -static ssize_t fsg_lun_opts_cdrom_show(struct fsg_lun_opts *opts, char *page)
 +static ssize_t fsg_lun_opts_cdrom_show(struct config_item *item, char *page)
  {
 -      return fsg_show_cdrom(opts->lun, page);
 +      return fsg_show_cdrom(to_fsg_lun_opts(item)->lun, page);
  }
  
 -static ssize_t fsg_lun_opts_cdrom_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_cdrom_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      struct fsg_opts *fsg_opts;
 -
 -      fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
 +      struct fsg_lun_opts *opts = to_fsg_lun_opts(item);
 +      struct fsg_opts *fsg_opts = to_fsg_opts(opts->group.cg_item.ci_parent);
  
        return fsg_store_cdrom(opts->lun, &fsg_opts->common->filesem, page,
                               len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_cdrom =
 -      __CONFIGFS_ATTR(cdrom, S_IRUGO | S_IWUSR, fsg_lun_opts_cdrom_show,
 -                      fsg_lun_opts_cdrom_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, cdrom);
  
 -static ssize_t fsg_lun_opts_nofua_show(struct fsg_lun_opts *opts, char *page)
 +static ssize_t fsg_lun_opts_nofua_show(struct config_item *item, char *page)
  {
 -      return fsg_show_nofua(opts->lun, page);
 +      return fsg_show_nofua(to_fsg_lun_opts(item)->lun, page);
  }
  
 -static ssize_t fsg_lun_opts_nofua_store(struct fsg_lun_opts *opts,
 +static ssize_t fsg_lun_opts_nofua_store(struct config_item *item,
                                       const char *page, size_t len)
  {
 -      return fsg_store_nofua(opts->lun, page, len);
 +      return fsg_store_nofua(to_fsg_lun_opts(item)->lun, page, len);
  }
  
 -static struct fsg_lun_opts_attribute fsg_lun_opts_nofua =
 -      __CONFIGFS_ATTR(nofua, S_IRUGO | S_IWUSR, fsg_lun_opts_nofua_show,
 -                      fsg_lun_opts_nofua_store);
 +CONFIGFS_ATTR(fsg_lun_opts_, nofua);
  
  static struct configfs_attribute *fsg_lun_attrs[] = {
 -      &fsg_lun_opts_file.attr,
 -      &fsg_lun_opts_ro.attr,
 -      &fsg_lun_opts_removable.attr,
 -      &fsg_lun_opts_cdrom.attr,
 -      &fsg_lun_opts_nofua.attr,
 +      &fsg_lun_opts_attr_file,
 +      &fsg_lun_opts_attr_ro,
 +      &fsg_lun_opts_attr_removable,
 +      &fsg_lun_opts_attr_cdrom,
 +      &fsg_lun_opts_attr_nofua,
        NULL,
  };
  
@@@ -3330,6 -3350,9 +3328,6 @@@ static void fsg_lun_drop(struct config_
        config_item_put(item);
  }
  
 -CONFIGFS_ATTR_STRUCT(fsg_opts);
 -CONFIGFS_ATTR_OPS(fsg_opts);
 -
  static void fsg_attr_release(struct config_item *item)
  {
        struct fsg_opts *opts = to_fsg_opts(item);
  
  static struct configfs_item_operations fsg_item_ops = {
        .release                = fsg_attr_release,
 -      .show_attribute         = fsg_opts_attr_show,
 -      .store_attribute        = fsg_opts_attr_store,
  };
  
 -static ssize_t fsg_opts_stall_show(struct fsg_opts *opts, char *page)
 +static ssize_t fsg_opts_stall_show(struct config_item *item, char *page)
  {
 +      struct fsg_opts *opts = to_fsg_opts(item);
        int result;
  
        mutex_lock(&opts->lock);
        return result;
  }
  
 -static ssize_t fsg_opts_stall_store(struct fsg_opts *opts, const char *page,
 +static ssize_t fsg_opts_stall_store(struct config_item *item, const char *page,
                                    size_t len)
  {
 +      struct fsg_opts *opts = to_fsg_opts(item);
        int ret;
        bool stall;
  
        return ret;
  }
  
 -static struct fsg_opts_attribute fsg_opts_stall =
 -      __CONFIGFS_ATTR(stall, S_IRUGO | S_IWUSR, fsg_opts_stall_show,
 -                      fsg_opts_stall_store);
 +CONFIGFS_ATTR(fsg_opts_, stall);
  
  #ifdef CONFIG_USB_GADGET_DEBUG_FILES
 -static ssize_t fsg_opts_num_buffers_show(struct fsg_opts *opts, char *page)
 +static ssize_t fsg_opts_num_buffers_show(struct config_item *item, char *page)
  {
 +      struct fsg_opts *opts = to_fsg_opts(item);
        int result;
  
        mutex_lock(&opts->lock);
        return result;
  }
  
 -static ssize_t fsg_opts_num_buffers_store(struct fsg_opts *opts,
 +static ssize_t fsg_opts_num_buffers_store(struct config_item *item,
                                          const char *page, size_t len)
  {
 +      struct fsg_opts *opts = to_fsg_opts(item);
        int ret;
        u8 num;
  
        return ret;
  }
  
 -static struct fsg_opts_attribute fsg_opts_num_buffers =
 -      __CONFIGFS_ATTR(num_buffers, S_IRUGO | S_IWUSR,
 -                      fsg_opts_num_buffers_show,
 -                      fsg_opts_num_buffers_store);
 -
 +CONFIGFS_ATTR(fsg_opts_, num_buffers);
  #endif
  
  static struct configfs_attribute *fsg_attrs[] = {
 -      &fsg_opts_stall.attr,
 +      &fsg_opts_attr_stall,
  #ifdef CONFIG_USB_GADGET_DEBUG_FILES
 -      &fsg_opts_num_buffers.attr,
 +      &fsg_opts_attr_num_buffers,
  #endif
        NULL,
  };
index 0a94895a358d47e8e51cf32722afbdce0d6104d9,67b3b9d9dfd13c471ac31908737a0d30e9284a78..692ccc69345e4a9998246a53b6af817a56a5d435
@@@ -2244,8 -2244,9 +2244,8 @@@ static int u132_urb_enqueue(struct usb_
  {
        struct u132 *u132 = hcd_to_u132(hcd);
        if (irqs_disabled()) {
-               if (__GFP_WAIT & mem_flags) {
+               if (gfpflags_allow_blocking(mem_flags)) {
 -                      printk(KERN_ERR "invalid context for function that migh"
 -                              "t sleep\n");
 +                      printk(KERN_ERR "invalid context for function that might sleep\n");
                        return -EINVAL;
                }
        }
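
Several hunks above (u132-hcd here, the lustre LIBCFS check, and the GFP masks in ion, mtd, nvme, hfi1 and ipath) replace open-coded tests of __GFP_WAIT with gfpflags_allow_blocking() or with the new __GFP_DIRECT_RECLAIM/__GFP_RECLAIM names. For reference, the helper they switch to boils down to a single flag test; the sketch below mirrors the include/linux/gfp.h definition from the gfp rework this merge sits on top of, and is included only as an illustration, not as part of this merge.

/*
 * Sketch, assuming the __GFP_WAIT -> __GFP_RECLAIM rework: an allocation
 * may block iff direct reclaim is permitted.  __GFP_RECLAIM itself is the
 * union of direct and kswapd reclaim.
 */
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
	return (bool)(gfp_flags & __GFP_DIRECT_RECLAIM);
}
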
diff --combined fs/9p/vfs_file.c
index f23fd86697ea5ed4234ff96a8dd236b8884e6026,6b747394f6f566dfbd7e2608d1690f88f81e4206..7bf835f85bc822ef1119b639be82619af066d326
@@@ -161,7 -161,7 +161,7 @@@ static int v9fs_file_do_lock(struct fil
        if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
                BUG();
  
 -      res = posix_lock_file_wait(filp, fl);
 +      res = locks_lock_file_wait(filp, fl);
        if (res < 0)
                goto out;
  
@@@ -231,7 -231,8 +231,8 @@@ out_unlock
        if (res < 0 && fl->fl_type != F_UNLCK) {
                fl_type = fl->fl_type;
                fl->fl_type = F_UNLCK;
-               res = locks_lock_file_wait(filp, fl);
+               /* Even if this fails we want to return the remote error */
 -              posix_lock_file_wait(filp, fl);
++              locks_lock_file_wait(filp, fl);
                fl->fl_type = fl_type;
        }
  out:
diff --combined fs/cifs/file.c
index 47c5c97e2dd31c20663f1fa0584da2c3f87722bd,2d319e66b8f84ea0e78c37e5ffdf5782bdd3aa4e..0a2752b79e72cc2b7a083894843a8b3ae1dea23d
@@@ -1553,7 -1553,7 +1553,7 @@@ cifs_setlk(struct file *file, struct fi
  
  out:
        if (flock->fl_flags & FL_POSIX && !rc)
 -              rc = posix_lock_file_wait(file, flock);
 +              rc = locks_lock_file_wait(file, flock);
        return rc;
  }
  
@@@ -3380,7 -3380,7 +3380,7 @@@ readpages_get_pages(struct address_spac
        struct page *page, *tpage;
        unsigned int expected_index;
        int rc;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+       gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
  
        INIT_LIST_HEAD(tmplist);
  
         * should have access to this page, we're safe to simply set
         * PG_locked without checking it first.
         */
-       __set_page_locked(page);
+       __SetPageLocked(page);
        rc = add_to_page_cache_locked(page, mapping,
                                      page->index, gfp);
  
        /* give up if we can't stick it in the cache */
        if (rc) {
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
                return rc;
        }
  
                if (*bytes + PAGE_CACHE_SIZE > rsize)
                        break;
  
-               __set_page_locked(page);
+               __SetPageLocked(page);
                if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
-                       __clear_page_locked(page);
+                       __ClearPageLocked(page);
                        break;
                }
                list_move_tail(&page->lru, tmplist);
diff --combined fs/coredump.c
index 53d7d46c55c82c58c321895225faaad8e9a2e6e5,1777331eee767fa323cb864fb95131983eaad588..b3c153ca435d24fdbdfcb909228b6b4787bb63f2
@@@ -32,7 -32,6 +32,7 @@@
  #include <linux/pipe_fs_i.h>
  #include <linux/oom.h>
  #include <linux/compat.h>
 +#include <linux/timekeeping.h>
  
  #include <asm/uaccess.h>
  #include <asm/mmu_context.h>
@@@ -233,10 -232,9 +233,10 @@@ static int format_corename(struct core_
                                break;
                        /* UNIX time of coredump */
                        case 't': {
 -                              struct timeval tv;
 -                              do_gettimeofday(&tv);
 -                              err = cn_printf(cn, "%lu", tv.tv_sec);
 +                              time64_t time;
 +
 +                              time = ktime_get_real_seconds();
 +                              err = cn_printf(cn, "%lld", time);
                                break;
                        }
                        /* hostname */
@@@ -282,23 -280,24 +282,24 @@@ out
        return ispipe;
  }
  
- static int zap_process(struct task_struct *start, int exit_code)
+ static int zap_process(struct task_struct *start, int exit_code, int flags)
  {
        struct task_struct *t;
        int nr = 0;
  
+       /* ignore all signals except SIGKILL, see prepare_signal() */
+       start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
        start->signal->group_exit_code = exit_code;
        start->signal->group_stop_count = 0;
  
-       t = start;
-       do {
+       for_each_thread(start, t) {
                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                if (t != current && t->mm) {
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
                        nr++;
                }
-       } while_each_thread(start, t);
+       }
  
        return nr;
  }
@@@ -313,10 -312,8 +314,8 @@@ static int zap_threads(struct task_stru
        spin_lock_irq(&tsk->sighand->siglock);
        if (!signal_group_exit(tsk->signal)) {
                mm->core_state = core_state;
-               nr = zap_process(tsk, exit_code);
                tsk->signal->group_exit_task = tsk;
-               /* ignore all signals except SIGKILL, see prepare_signal() */
-               tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+               nr = zap_process(tsk, exit_code, 0);
                clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
        }
        spin_unlock_irq(&tsk->sighand->siglock);
                        continue;
                if (g->flags & PF_KTHREAD)
                        continue;
-               p = g;
-               do {
-                       if (p->mm) {
-                               if (unlikely(p->mm == mm)) {
-                                       lock_task_sighand(p, &flags);
-                                       nr += zap_process(p, exit_code);
-                                       p->signal->flags = SIGNAL_GROUP_EXIT;
-                                       unlock_task_sighand(p, &flags);
-                               }
-                               break;
+               for_each_thread(g, p) {
+                       if (unlikely(!p->mm))
+                               continue;
+                       if (unlikely(p->mm == mm)) {
+                               lock_task_sighand(p, &flags);
+                               nr += zap_process(p, exit_code,
+                                                       SIGNAL_GROUP_EXIT);
+                               unlock_task_sighand(p, &flags);
                        }
-               } while_each_thread(g, p);
+                       break;
+               }
        }
        rcu_read_unlock();
  done:
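
The coredump hunks above fold the SIGNAL_GROUP_COREDUMP/SIGNAL_GROUP_EXIT flag assignment into zap_process() and replace the do/while_each_thread() loops with for_each_thread(); the %t corename specifier also moves from do_gettimeofday() to the y2038-safe ktime_get_real_seconds(). The thread-walk idiom, reduced to a minimal sketch (hypothetical helper, not the real zap_process()):

    /*
     * Count the other threads in start's group that still own an mm,
     * using the plain for_each_thread() iterator the hunks switch to.
     */
    static int count_other_threads_with_mm(struct task_struct *start)
    {
            struct task_struct *t;
            int nr = 0;

            for_each_thread(start, t) {
                    if (t != current && t->mm)
                            nr++;
            }
            return nr;
    }
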
diff --combined fs/direct-io.c
index 3ae0e0427191c7849fc70301e58792515f5f181c,dbb94a2d6c504a08d0349d891ccbc2d8acc2f6a1..18e7554cf94cac57d1eb3dc17ba4001e75cef5f6
@@@ -120,7 -120,6 +120,7 @@@ struct dio 
        int page_errors;                /* errno from get_user_pages() */
        int is_async;                   /* is IO async ? */
        bool defer_completion;          /* defer AIO completion to workqueue? */
 +      bool should_dirty;              /* if pages should be dirtied */
        int io_error;                   /* IO error in completion path */
        unsigned long refcount;         /* direct_io_worker() and bios */
        struct bio *bio_list;           /* singly linked via bi_private */
@@@ -361,7 -360,7 +361,7 @@@ dio_bio_alloc(struct dio *dio, struct d
  
        /*
         * bio_alloc() is guaranteed to return a bio when called with
-        * __GFP_WAIT and we request a valid number of vectors.
+        * __GFP_RECLAIM and we request a valid number of vectors.
         */
        bio = bio_alloc(GFP_KERNEL, nr_vecs);
  
@@@ -394,7 -393,7 +394,7 @@@ static inline void dio_bio_submit(struc
        dio->refcount++;
        spin_unlock_irqrestore(&dio->bio_lock, flags);
  
 -      if (dio->is_async && dio->rw == READ)
 +      if (dio->is_async && dio->rw == READ && dio->should_dirty)
                bio_set_pages_dirty(bio);
  
        if (sdio->submit_io)
@@@ -465,15 -464,14 +465,15 @@@ static int dio_bio_complete(struct dio 
        if (bio->bi_error)
                dio->io_error = -EIO;
  
 -      if (dio->is_async && dio->rw == READ) {
 +      if (dio->is_async && dio->rw == READ && dio->should_dirty) {
                bio_check_pages_dirty(bio);     /* transfers ownership */
                err = bio->bi_error;
        } else {
                bio_for_each_segment_all(bvec, bio, i) {
                        struct page *page = bvec->bv_page;
  
 -                      if (dio->rw == READ && !PageCompound(page))
 +                      if (dio->rw == READ && !PageCompound(page) &&
 +                                      dio->should_dirty)
                                set_page_dirty_lock(page);
                        page_cache_release(page);
                }
@@@ -1221,7 -1219,6 +1221,7 @@@ do_blockdev_direct_IO(struct kiocb *ioc
        spin_lock_init(&dio->bio_lock);
        dio->refcount = 1;
  
 +      dio->should_dirty = (iter->type == ITER_IOVEC);
        sdio.iter = iter;
        sdio.final_block_in_request =
                (offset + iov_iter_count(iter)) >> blkbits;
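
The new dio->should_dirty flag above restricts page dirtying to user-backed reads: only ITER_IOVEC iterators get the bio_set_pages_dirty()/set_page_dirty_lock() treatment, while kernel-backed ITER_BVEC/ITER_KVEC callers (the loop driver's direct-I/O path, for example) keep control of their own pages. The gate, condensed into a hypothetical helper for illustration:

    /*
     * Sketch only: a read needs its pages dirtied on completion only when
     * the data landed in user-supplied iovec pages.
     */
    static bool dio_should_dirty_pages(const struct iov_iter *iter, int rw)
    {
            return rw == READ && iter->type == ITER_IOVEC;
    }
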
diff --combined fs/ext4/inode.c
index e8d620a484f6a86bb684ce432888c329a429e948,60aaecd5598b32798cb114cbd473d5818af7c692..7d1aad1d9313155f3780cde3923710fe7f60ea1c
@@@ -378,7 -378,7 +378,7 @@@ static int __check_block_validity(struc
                                 "lblock %lu mapped to illegal pblock "
                                 "(length %d)", (unsigned long) map->m_lblk,
                                 map->m_len);
 -              return -EIO;
 +              return -EFSCORRUPTED;
        }
        return 0;
  }
@@@ -480,7 -480,7 +480,7 @@@ int ext4_map_blocks(handle_t *handle, s
  
        /* We can handle the block number less than EXT_MAX_BLOCKS */
        if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
 -              return -EIO;
 +              return -EFSCORRUPTED;
  
        /* Lookup extent status tree firstly */
        if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
@@@ -965,7 -965,7 +965,7 @@@ static int ext4_block_write_begin(struc
        if (unlikely(err))
                page_zero_new_buffers(page, from, to);
        else if (decrypt)
 -              err = ext4_decrypt_one(inode, page);
 +              err = ext4_decrypt(page);
        return err;
  }
  #endif
@@@ -1181,38 -1181,6 +1181,38 @@@ errout
        return ret ? ret : copied;
  }
  
 +/*
 + * This is a private version of page_zero_new_buffers() which doesn't
 + * set the buffer to be dirty, since in data=journalled mode we need
 + * to call ext4_handle_dirty_metadata() instead.
 + */
 +static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
 +{
 +      unsigned int block_start = 0, block_end;
 +      struct buffer_head *head, *bh;
 +
 +      bh = head = page_buffers(page);
 +      do {
 +              block_end = block_start + bh->b_size;
 +              if (buffer_new(bh)) {
 +                      if (block_end > from && block_start < to) {
 +                              if (!PageUptodate(page)) {
 +                                      unsigned start, size;
 +
 +                                      start = max(from, block_start);
 +                                      size = min(to, block_end) - start;
 +
 +                                      zero_user(page, start, size);
 +                                      set_buffer_uptodate(bh);
 +                              }
 +                              clear_buffer_new(bh);
 +                      }
 +              }
 +              block_start = block_end;
 +              bh = bh->b_this_page;
 +      } while (bh != head);
 +}
 +
  static int ext4_journalled_write_end(struct file *file,
                                     struct address_space *mapping,
                                     loff_t pos, unsigned len, unsigned copied,
                if (copied < len) {
                        if (!PageUptodate(page))
                                copied = 0;
 -                      page_zero_new_buffers(page, from+copied, to);
 +                      zero_new_buffers(page, from+copied, to);
                }
  
                ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
@@@ -1847,22 -1815,11 +1847,22 @@@ static int ext4_writepage(struct page *
         * the page. But we may reach here when we do a journal commit via
         * journal_submit_inode_data_buffers() and in that case we must write
         * allocated buffers to achieve data=ordered mode guarantees.
 +       *
 +       * Also, if there is only one buffer per page (the fs block
 +       * size == the page size), if one buffer needs block
 +       * allocation or needs to modify the extent tree to clear the
 +       * unwritten flag, we know that the page can't be written at
 +       * all, so we might as well refuse the write immediately.
 +       * Unfortunately if the block size != page size, we can't as
 +       * easily detect this case using ext4_walk_page_buffers(), but
 +       * for the extremely common case, this is an optimization that
 +       * skips a useless round trip through ext4_bio_write_page().
         */
        if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
                                   ext4_bh_delay_or_unwritten)) {
                redirty_page_for_writepage(wbc, page);
 -              if (current->flags & PF_MEMALLOC) {
 +              if ((current->flags & PF_MEMALLOC) ||
 +                  (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
                        /*
                         * For memory cleaning there's no point in writing only
                         * some buffers. So just bail out. Warn if we came here
@@@ -2642,7 -2599,8 +2642,7 @@@ static int ext4_nonda_switch(struct sup
  /* We always reserve for an inode update; the superblock could be there too */
  static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
  {
 -      if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
 -                              EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
 +      if (likely(ext4_has_feature_large_file(inode->i_sb)))
                return 1;
  
        if (pos + len <= 0x7fffffffULL)
@@@ -3386,7 -3344,7 +3386,7 @@@ static int __ext4_block_zero_page_range
        int err = 0;
  
        page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
-                                  mapping_gfp_mask(mapping) & ~__GFP_FS);
+                                  mapping_gfp_constraint(mapping, ~__GFP_FS));
        if (!page)
                return -ENOMEM;
  
                        /* We expect the key to be set. */
                        BUG_ON(!ext4_has_encryption_key(inode));
                        BUG_ON(blocksize != PAGE_CACHE_SIZE);
 -                      WARN_ON_ONCE(ext4_decrypt_one(inode, page));
 +                      WARN_ON_ONCE(ext4_decrypt(page));
                }
        }
        if (ext4_should_journal_data(inode)) {
@@@ -3862,7 -3820,7 +3862,7 @@@ static int __ext4_get_inode_loc(struct 
  
        iloc->bh = NULL;
        if (!ext4_valid_inum(sb, inode->i_ino))
 -              return -EIO;
 +              return -EFSCORRUPTED;
  
        iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
        gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
@@@ -4048,7 -4006,8 +4048,7 @@@ static blkcnt_t ext4_inode_blocks(struc
        struct inode *inode = &(ei->vfs_inode);
        struct super_block *sb = inode->i_sb;
  
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                              EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
 +      if (ext4_has_feature_huge_file(sb)) {
                /* we are using combined 48 bit field */
                i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
                                        le32_to_cpu(raw_inode->i_blocks_lo);
@@@ -4109,7 -4068,7 +4109,7 @@@ struct inode *ext4_iget(struct super_bl
                        EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
                                EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
                                EXT4_INODE_SIZE(inode->i_sb));
 -                      ret = -EIO;
 +                      ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
        } else
  
        if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
                EXT4_ERROR_INODE(inode, "checksum invalid");
 -              ret = -EIO;
 +              ret = -EFSBADCRC;
                goto bad_inode;
        }
  
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
 +      if (ext4_has_feature_64bit(sb))
                ei->i_file_acl |=
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
        inode->i_size = ext4_isize(raw_inode);
            !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
                EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
                                 ei->i_file_acl);
 -              ret = -EIO;
 +              ret = -EFSCORRUPTED;
                goto bad_inode;
        } else if (!ext4_has_inline_data(inode)) {
                if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
        } else if (ino == EXT4_BOOT_LOADER_INO) {
                make_bad_inode(inode);
        } else {
 -              ret = -EIO;
 +              ret = -EFSCORRUPTED;
                EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
                goto bad_inode;
        }
@@@ -4313,7 -4272,7 +4313,7 @@@ bad_inode
  struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
  {
        if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
 -              return ERR_PTR(-EIO);
 +              return ERR_PTR(-EFSCORRUPTED);
        return ext4_iget(sb, ino);
  }
  
@@@ -4335,7 -4294,7 +4335,7 @@@ static int ext4_inode_blocks_set(handle
                ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                return 0;
        }
 -      if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
 +      if (!ext4_has_feature_huge_file(sb))
                return -EFBIG;
  
        if (i_blocks <= 0xffffffffffffULL) {
@@@ -4496,7 -4455,8 +4496,7 @@@ static int ext4_do_update_inode(handle_
                need_datasync = 1;
        }
        if (ei->i_disksize > 0x7fffffffULL) {
 -              if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                              EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
 +              if (!ext4_has_feature_large_file(sb) ||
                                EXT4_SB(sb)->s_es->s_rev_level ==
                    cpu_to_le32(EXT4_GOOD_OLD_REV))
                        set_large_file = 1;
                if (err)
                        goto out_brelse;
                ext4_update_dynamic_rev(sb);
 -              EXT4_SET_RO_COMPAT_FEATURE(sb,
 -                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 +              ext4_set_feature_large_file(sb);
                ext4_handle_sync(handle);
                err = ext4_handle_dirty_super(handle, sb);
        }
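
Two themes run through the fs/ext4/inode.c hunks above: feature tests move from the EXT4_HAS_*_FEATURE() macros to per-feature ext4_has_feature_*() helpers, and corruption-type failures return -EFSCORRUPTED or -EFSBADCRC (aliases of EUCLEAN and EBADMSG) instead of a blanket -EIO, so the cause is distinguishable without changing what userspace sees. The helpers are presumably thin wrappers over the same superblock flag tests, roughly:

    /* Assumed shape of one generated helper, mirroring the macro it replaces. */
    static inline bool ext4_has_feature_huge_file(struct super_block *sb)
    {
            return (EXT4_SB(sb)->s_es->s_feature_ro_compat &
                    cpu_to_le32(EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) != 0;
    }
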
diff --combined fs/ext4/readpage.c
index d94af71a4e7fcabd1783f28cccb86363473fe3ae,1061611ae14dd6c66878584820528d757cfea399..5dc5e95063de2a7e42749a94464f00f7c50be4b8
@@@ -62,7 -62,7 +62,7 @@@ static void completion_pages(struct wor
        bio_for_each_segment_all(bv, bio, i) {
                struct page *page = bv->bv_page;
  
 -              int ret = ext4_decrypt(ctx, page);
 +              int ret = ext4_decrypt(page);
                if (ret) {
                        WARN_ON_ONCE(1);
                        SetPageError(page);
@@@ -166,7 -166,7 +166,7 @@@ int ext4_mpage_readpages(struct address
                        page = list_entry(pages->prev, struct page, lru);
                        list_del(&page->lru);
                        if (add_to_page_cache_lru(page, mapping, page->index,
-                                       GFP_KERNEL & mapping_gfp_mask(mapping)))
+                                 mapping_gfp_constraint(mapping, GFP_KERNEL)))
                                goto next_page;
                }
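
The readpages hunk above applies the same gfp conversion as in cifs. The helper is small enough to restate; its assumed definition simply intersects the caller's mask with what the mapping allows:

    static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                                               gfp_t gfp_mask)
    {
            return mapping_gfp_mask(mapping) & gfp_mask;
    }
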
  
diff --combined fs/ext4/super.c
index 04d0f1b334096525030674c6d818e124a1bd81f3,49f6c78ee3afe26aa45941d443c5ea242ef95676..753f4e68b820da0dd78fc7a7e3a66e529846ea0b
@@@ -34,6 -34,7 +34,6 @@@
  #include <linux/namei.h>
  #include <linux/quotaops.h>
  #include <linux/seq_file.h>
 -#include <linux/proc_fs.h>
  #include <linux/ctype.h>
  #include <linux/log2.h>
  #include <linux/crc16.h>
  #define CREATE_TRACE_POINTS
  #include <trace/events/ext4.h>
  
 -static struct proc_dir_entry *ext4_proc_root;
 -static struct kset *ext4_kset;
  static struct ext4_lazy_init *ext4_li_info;
  static struct mutex ext4_li_mtx;
 -static struct ext4_features *ext4_feat;
  static int ext4_mballoc_ready;
  static struct ratelimit_state ext4_mount_msg_ratelimit;
  
@@@ -79,6 -83,7 +79,6 @@@ static int ext4_feature_set_ok(struct s
  static void ext4_destroy_lazyinit_thread(void);
  static void ext4_unregister_li_request(struct super_block *sb);
  static void ext4_clear_request_list(void);
 -static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
  
  #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
  static struct file_system_type ext2_fs_type = {
@@@ -110,7 -115,8 +110,7 @@@ MODULE_ALIAS("ext3")
  static int ext4_verify_csum_type(struct super_block *sb,
                                 struct ext4_super_block *es)
  {
 -      if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
 +      if (!ext4_has_feature_metadata_csum(sb))
                return 1;
  
        return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
@@@ -388,13 -394,9 +388,13 @@@ static void ext4_handle_error(struct su
                smp_wmb();
                sb->s_flags |= MS_RDONLY;
        }
 -      if (test_opt(sb, ERRORS_PANIC))
 +      if (test_opt(sb, ERRORS_PANIC)) {
 +              if (EXT4_SB(sb)->s_journal &&
 +                !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 +                      return;
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
 +      }
  }
  
  #define ext4_error_ratelimit(sb)                                      \
@@@ -493,12 -495,6 +493,12 @@@ const char *ext4_decode_error(struct su
        char *errstr = NULL;
  
        switch (errno) {
 +      case -EFSCORRUPTED:
 +              errstr = "Corrupt filesystem";
 +              break;
 +      case -EFSBADCRC:
 +              errstr = "Filesystem failed CRC";
 +              break;
        case -EIO:
                errstr = "IO failure";
                break;
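
The errors=panic changes above and in __ext4_abort() below defer the panic until jbd2 has had a chance to record the error in the journal superblock, signalled by JBD2_REC_ERR in j_flags, so the failure remains visible after the reboot. The assumed jbd2-side counterpart, sketched:

    /*
     * Sketch (assumed, not shown in this diff): record the errno in the
     * journal superblock, then mark the journal so the panic paths above
     * are allowed to proceed.
     */
    static void journal_record_err_sketch(journal_t *journal)
    {
            jbd2_journal_update_sb_errno(journal);
            write_lock(&journal->j_state_lock);
            journal->j_flags |= JBD2_REC_ERR;
            write_unlock(&journal->j_state_lock);
    }
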
@@@ -589,12 -585,8 +589,12 @@@ void __ext4_abort(struct super_block *s
                        jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
                save_error_info(sb, function, line);
        }
 -      if (test_opt(sb, ERRORS_PANIC))
 +      if (test_opt(sb, ERRORS_PANIC)) {
 +              if (EXT4_SB(sb)->s_journal &&
 +                !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 +                      return;
                panic("EXT4-fs panic from previous error\n");
 +      }
  }
  
  void __ext4_msg(struct super_block *sb,
@@@ -808,7 -800,6 +808,7 @@@ static void ext4_put_super(struct super
                        ext4_abort(sb, "Couldn't clean up the journal");
        }
  
 +      ext4_unregister_sysfs(sb);
        ext4_es_unregister_shrinker(sbi);
        del_timer_sync(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_xattr_put_super(sb);
  
        if (!(sb->s_flags & MS_RDONLY)) {
 -              EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_clear_feature_journal_needs_recovery(sb);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
        }
        if (!(sb->s_flags & MS_RDONLY))
                ext4_commit_super(sb, 1);
  
 -      if (sbi->s_proc) {
 -              remove_proc_entry("options", sbi->s_proc);
 -              remove_proc_entry(sb->s_id, ext4_proc_root);
 -      }
 -      kobject_del(&sbi->s_kobj);
 -
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(sbi->s_group_desc[i]);
        kvfree(sbi->s_group_desc);
@@@ -1061,7 -1058,7 +1061,7 @@@ static int bdev_try_to_free_page(struc
                return 0;
        if (journal)
                return jbd2_journal_try_to_free_buffers(journal, page,
-                                                       wait & ~__GFP_WAIT);
+                                               wait & ~__GFP_DIRECT_RECLAIM);
        return try_to_free_buffers(page);
  }
  
@@@ -1291,7 -1288,7 +1291,7 @@@ static int set_qf_name(struct super_blo
                        "quota options when quota turned on");
                return -1;
        }
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
 +      if (ext4_has_feature_quota(sb)) {
                ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
                         "when QUOTA feature is enabled");
                return -1;
@@@ -1384,10 -1381,10 +1384,10 @@@ static const struct mount_opts 
        {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
 -       MOPT_EXT4_ONLY | MOPT_SET},
 +       MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
                                    EXT4_MOUNT_JOURNAL_CHECKSUM),
 -       MOPT_EXT4_ONLY | MOPT_SET},
 +       MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
        {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
        {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
@@@ -1516,14 -1513,8 +1516,14 @@@ static int handle_mount_opt(struct supe
                return -1;
        if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
                return -1;
 -      if (m->flags & MOPT_EXPLICIT)
 -              set_opt2(sb, EXPLICIT_DELALLOC);
 +      if (m->flags & MOPT_EXPLICIT) {
 +              if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
 +                      set_opt2(sb, EXPLICIT_DELALLOC);
 +              } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
 +                      set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
 +              } else
 +                      return -1;
 +      }
        if (m->flags & MOPT_CLEAR_ERR)
                clear_opt(sb, ERRORS_MASK);
        if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
                                 "quota options when quota turned on");
                        return -1;
                }
 -              if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                             EXT4_FEATURE_RO_COMPAT_QUOTA)) {
 +              if (ext4_has_feature_quota(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "Cannot set journaled quota options "
                                 "when QUOTA feature is enabled");
@@@ -1715,7 -1707,7 +1715,7 @@@ static int parse_options(char *options
                        return 0;
        }
  #ifdef CONFIG_QUOTA
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
 +      if (ext4_has_feature_quota(sb) &&
            (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
                ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
                         "feature is enabled");
@@@ -1888,7 -1880,7 +1888,7 @@@ static int ext4_show_options(struct seq
        return _ext4_show_options(seq, root->d_sb, 0);
  }
  
 -static int options_seq_show(struct seq_file *seq, void *offset)
 +int ext4_seq_options_show(struct seq_file *seq, void *offset)
  {
        struct super_block *sb = seq->private;
        int rc;
        return rc;
  }
  
 -static int options_open_fs(struct inode *inode, struct file *file)
 -{
 -      return single_open(file, options_seq_show, PDE_DATA(inode));
 -}
 -
 -static const struct file_operations ext4_seq_options_fops = {
 -      .owner = THIS_MODULE,
 -      .open = options_open_fs,
 -      .read = seq_read,
 -      .llseek = seq_lseek,
 -      .release = single_release,
 -};
 -
  static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                            int read_only)
  {
        es->s_mtime = cpu_to_le32(get_seconds());
        ext4_update_dynamic_rev(sb);
        if (sbi->s_journal)
 -              EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_set_feature_journal_needs_recovery(sb);
  
        ext4_commit_super(sb, 1);
  done:
@@@ -2022,13 -2027,12 +2022,13 @@@ failed
        return 0;
  }
  
 -static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
 +static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
                                   struct ext4_group_desc *gdp)
  {
        int offset;
        __u16 crc = 0;
        __le32 le_group = cpu_to_le32(block_group);
 +      struct ext4_sb_info *sbi = EXT4_SB(sb);
  
        if (ext4_has_metadata_csum(sbi->s_sb)) {
                /* Use new metadata_csum algorithm */
        }
  
        /* old crc16 code */
 -      if (!(sbi->s_es->s_feature_ro_compat &
 -            cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
 +      if (!ext4_has_feature_gdt_csum(sb))
                return 0;
  
        offset = offsetof(struct ext4_group_desc, bg_checksum);
        crc = crc16(crc, (__u8 *)gdp, offset);
        offset += sizeof(gdp->bg_checksum); /* skip checksum */
        /* for checksum of struct ext4_group_desc do the rest...*/
 -      if ((sbi->s_es->s_feature_incompat &
 -           cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
 +      if (ext4_has_feature_64bit(sb) &&
            offset < le16_to_cpu(sbi->s_es->s_desc_size))
                crc = crc16(crc, (__u8 *)gdp + offset,
                            le16_to_cpu(sbi->s_es->s_desc_size) -
@@@ -2072,7 -2078,8 +2072,7 @@@ int ext4_group_desc_csum_verify(struct 
                                struct ext4_group_desc *gdp)
  {
        if (ext4_has_group_desc_csum(sb) &&
 -          (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
 -                                                    block_group, gdp)))
 +          (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
                return 0;
  
        return 1;
@@@ -2083,7 -2090,7 +2083,7 @@@ void ext4_group_desc_csum_set(struct su
  {
        if (!ext4_has_group_desc_csum(sb))
                return;
 -      gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
 +      gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
  }
  
  /* Called at mount-time, super-block is locked */
@@@ -2099,7 -2106,7 +2099,7 @@@ static int ext4_check_descriptors(struc
        int flexbg_flag = 0;
        ext4_group_t i, grp = sbi->s_groups_count;
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 +      if (ext4_has_feature_flex_bg(sb))
                flexbg_flag = 1;
  
        ext4_debug("Checking group descriptors");
                if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Checksum for group %u failed (%u!=%u)",
 -                               i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 +                               i, le16_to_cpu(ext4_group_desc_csum(sb, i,
                                     gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!(sb->s_flags & MS_RDONLY)) {
                                ext4_unlock_group(sb, i);
@@@ -2406,7 -2413,8 +2406,7 @@@ static ext4_fsblk_t descriptor_loc(stru
  
        first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
  
 -      if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
 -          nr < first_meta_bg)
 +      if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
                return logical_sb_block + nr + 1;
        bg = sbi->s_desc_per_block * nr;
        if (ext4_bg_has_super(sb, bg))
@@@ -2462,6 -2470,335 +2462,6 @@@ static unsigned long ext4_get_stripe_si
        return ret;
  }
  
 -/* sysfs supprt */
 -
 -struct ext4_attr {
 -      struct attribute attr;
 -      ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
 -      ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 -                       const char *, size_t);
 -      union {
 -              int offset;
 -              int deprecated_val;
 -      } u;
 -};
 -
 -static int parse_strtoull(const char *buf,
 -              unsigned long long max, unsigned long long *value)
 -{
 -      int ret;
 -
 -      ret = kstrtoull(skip_spaces(buf), 0, value);
 -      if (!ret && *value > max)
 -              ret = -EINVAL;
 -      return ret;
 -}
 -
 -static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
 -                                            struct ext4_sb_info *sbi,
 -                                            char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "%llu\n",
 -              (s64) EXT4_C2B(sbi,
 -                      percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
 -}
 -
 -static ssize_t session_write_kbytes_show(struct ext4_attr *a,
 -                                       struct ext4_sb_info *sbi, char *buf)
 -{
 -      struct super_block *sb = sbi->s_buddy_cache->i_sb;
 -
 -      if (!sb->s_bdev->bd_part)
 -              return snprintf(buf, PAGE_SIZE, "0\n");
 -      return snprintf(buf, PAGE_SIZE, "%lu\n",
 -                      (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 -                       sbi->s_sectors_written_start) >> 1);
 -}
 -
 -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 -                                        struct ext4_sb_info *sbi, char *buf)
 -{
 -      struct super_block *sb = sbi->s_buddy_cache->i_sb;
 -
 -      if (!sb->s_bdev->bd_part)
 -              return snprintf(buf, PAGE_SIZE, "0\n");
 -      return snprintf(buf, PAGE_SIZE, "%llu\n",
 -                      (unsigned long long)(sbi->s_kbytes_written +
 -                      ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 -                        EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 -}
 -
 -static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 -                                        struct ext4_sb_info *sbi,
 -                                        const char *buf, size_t count)
 -{
 -      unsigned long t;
 -      int ret;
 -
 -      ret = kstrtoul(skip_spaces(buf), 0, &t);
 -      if (ret)
 -              return ret;
 -
 -      if (t && (!is_power_of_2(t) || t > 0x40000000))
 -              return -EINVAL;
 -
 -      sbi->s_inode_readahead_blks = t;
 -      return count;
 -}
 -
 -static ssize_t sbi_ui_show(struct ext4_attr *a,
 -                         struct ext4_sb_info *sbi, char *buf)
 -{
 -      unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 -
 -      return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 -}
 -
 -static ssize_t sbi_ui_store(struct ext4_attr *a,
 -                          struct ext4_sb_info *sbi,
 -                          const char *buf, size_t count)
 -{
 -      unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 -      unsigned long t;
 -      int ret;
 -
 -      ret = kstrtoul(skip_spaces(buf), 0, &t);
 -      if (ret)
 -              return ret;
 -      *ui = t;
 -      return count;
 -}
 -
 -static ssize_t es_ui_show(struct ext4_attr *a,
 -                         struct ext4_sb_info *sbi, char *buf)
 -{
 -
 -      unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
 -                         a->u.offset);
 -
 -      return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 -}
 -
 -static ssize_t reserved_clusters_show(struct ext4_attr *a,
 -                                struct ext4_sb_info *sbi, char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "%llu\n",
 -              (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
 -}
 -
 -static ssize_t reserved_clusters_store(struct ext4_attr *a,
 -                                 struct ext4_sb_info *sbi,
 -                                 const char *buf, size_t count)
 -{
 -      unsigned long long val;
 -      int ret;
 -
 -      if (parse_strtoull(buf, -1ULL, &val))
 -              return -EINVAL;
 -      ret = ext4_reserve_clusters(sbi, val);
 -
 -      return ret ? ret : count;
 -}
 -
 -static ssize_t trigger_test_error(struct ext4_attr *a,
 -                                struct ext4_sb_info *sbi,
 -                                const char *buf, size_t count)
 -{
 -      int len = count;
 -
 -      if (!capable(CAP_SYS_ADMIN))
 -              return -EPERM;
 -
 -      if (len && buf[len-1] == '\n')
 -              len--;
 -
 -      if (len)
 -              ext4_error(sbi->s_sb, "%.*s", len, buf);
 -      return count;
 -}
 -
 -static ssize_t sbi_deprecated_show(struct ext4_attr *a,
 -                                 struct ext4_sb_info *sbi, char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
 -}
 -
 -#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 -static struct ext4_attr ext4_attr_##_name = {                 \
 -      .attr = {.name = __stringify(_name), .mode = _mode },   \
 -      .show   = _show,                                        \
 -      .store  = _store,                                       \
 -      .u = {                                                  \
 -              .offset = offsetof(struct ext4_sb_info, _elname),\
 -      },                                                      \
 -}
 -
 -#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname)         \
 -static struct ext4_attr ext4_attr_##_name = {                         \
 -      .attr = {.name = __stringify(_name), .mode = _mode },           \
 -      .show   = _show,                                                \
 -      .store  = _store,                                               \
 -      .u = {                                                          \
 -              .offset = offsetof(struct ext4_super_block, _elname),   \
 -      },                                                              \
 -}
 -
 -#define EXT4_ATTR(name, mode, show, store) \
 -static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 -
 -#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
 -#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
 -#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
 -
 -#define EXT4_RO_ATTR_ES_UI(name, elname)      \
 -      EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
 -#define EXT4_RW_ATTR_SBI_UI(name, elname)     \
 -      EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
 -
 -#define ATTR_LIST(name) &ext4_attr_##name.attr
 -#define EXT4_DEPRECATED_ATTR(_name, _val)     \
 -static struct ext4_attr ext4_attr_##_name = {                 \
 -      .attr = {.name = __stringify(_name), .mode = 0444 },    \
 -      .show   = sbi_deprecated_show,                          \
 -      .u = {                                                  \
 -              .deprecated_val = _val,                         \
 -      },                                                      \
 -}
 -
 -EXT4_RO_ATTR(delayed_allocation_blocks);
 -EXT4_RO_ATTR(session_write_kbytes);
 -EXT4_RO_ATTR(lifetime_write_kbytes);
 -EXT4_RW_ATTR(reserved_clusters);
 -EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 -               inode_readahead_blks_store, s_inode_readahead_blks);
 -EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 -EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 -EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 -EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 -EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 -EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
 -EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 -EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
 -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
 -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
 -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
 -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
 -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
 -EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 -EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
 -EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
 -
 -static struct attribute *ext4_attrs[] = {
 -      ATTR_LIST(delayed_allocation_blocks),
 -      ATTR_LIST(session_write_kbytes),
 -      ATTR_LIST(lifetime_write_kbytes),
 -      ATTR_LIST(reserved_clusters),
 -      ATTR_LIST(inode_readahead_blks),
 -      ATTR_LIST(inode_goal),
 -      ATTR_LIST(mb_stats),
 -      ATTR_LIST(mb_max_to_scan),
 -      ATTR_LIST(mb_min_to_scan),
 -      ATTR_LIST(mb_order2_req),
 -      ATTR_LIST(mb_stream_req),
 -      ATTR_LIST(mb_group_prealloc),
 -      ATTR_LIST(max_writeback_mb_bump),
 -      ATTR_LIST(extent_max_zeroout_kb),
 -      ATTR_LIST(trigger_fs_error),
 -      ATTR_LIST(err_ratelimit_interval_ms),
 -      ATTR_LIST(err_ratelimit_burst),
 -      ATTR_LIST(warning_ratelimit_interval_ms),
 -      ATTR_LIST(warning_ratelimit_burst),
 -      ATTR_LIST(msg_ratelimit_interval_ms),
 -      ATTR_LIST(msg_ratelimit_burst),
 -      ATTR_LIST(errors_count),
 -      ATTR_LIST(first_error_time),
 -      ATTR_LIST(last_error_time),
 -      NULL,
 -};
 -
 -/* Features this copy of ext4 supports */
 -EXT4_INFO_ATTR(lazy_itable_init);
 -EXT4_INFO_ATTR(batched_discard);
 -EXT4_INFO_ATTR(meta_bg_resize);
 -EXT4_INFO_ATTR(encryption);
 -
 -static struct attribute *ext4_feat_attrs[] = {
 -      ATTR_LIST(lazy_itable_init),
 -      ATTR_LIST(batched_discard),
 -      ATTR_LIST(meta_bg_resize),
 -      ATTR_LIST(encryption),
 -      NULL,
 -};
 -
 -static ssize_t ext4_attr_show(struct kobject *kobj,
 -                            struct attribute *attr, char *buf)
 -{
 -      struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 -                                              s_kobj);
 -      struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
 -
 -      return a->show ? a->show(a, sbi, buf) : 0;
 -}
 -
 -static ssize_t ext4_attr_store(struct kobject *kobj,
 -                             struct attribute *attr,
 -                             const char *buf, size_t len)
 -{
 -      struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 -                                              s_kobj);
 -      struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
 -
 -      return a->store ? a->store(a, sbi, buf, len) : 0;
 -}
 -
 -static void ext4_sb_release(struct kobject *kobj)
 -{
 -      struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
 -                                              s_kobj);
 -      complete(&sbi->s_kobj_unregister);
 -}
 -
 -static const struct sysfs_ops ext4_attr_ops = {
 -      .show   = ext4_attr_show,
 -      .store  = ext4_attr_store,
 -};
 -
 -static struct kobj_type ext4_ktype = {
 -      .default_attrs  = ext4_attrs,
 -      .sysfs_ops      = &ext4_attr_ops,
 -      .release        = ext4_sb_release,
 -};
 -
 -static void ext4_feat_release(struct kobject *kobj)
 -{
 -      complete(&ext4_feat->f_kobj_unregister);
 -}
 -
 -static ssize_t ext4_feat_show(struct kobject *kobj,
 -                            struct attribute *attr, char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "supported\n");
 -}
 -
 -/*
 - * We can not use ext4_attr_show/store because it relies on the kobject
 - * being embedded in the ext4_sb_info structure which is definitely not
 - * true in this case.
 - */
 -static const struct sysfs_ops ext4_feat_ops = {
 -      .show   = ext4_feat_show,
 -      .store  = NULL,
 -};
 -
 -static struct kobj_type ext4_feat_ktype = {
 -      .default_attrs  = ext4_feat_attrs,
 -      .sysfs_ops      = &ext4_feat_ops,
 -      .release        = ext4_feat_release,
 -};
 -
  /*
   * Check whether this filesystem can be mounted based on
   * the features present and the RDONLY/RDWR mount requested.
   */
  static int ext4_feature_set_ok(struct super_block *sb, int readonly)
  {
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
 +      if (ext4_has_unknown_ext4_incompat_features(sb)) {
                ext4_msg(sb, KERN_ERR,
                        "Couldn't mount because of "
                        "unsupported optional features (%x)",
        if (readonly)
                return 1;
  
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
 +      if (ext4_has_feature_readonly(sb)) {
                ext4_msg(sb, KERN_INFO, "filesystem is read-only");
                sb->s_flags |= MS_RDONLY;
                return 1;
        }
  
        /* Check that feature set is OK for a read-write mount */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
 +      if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
                ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
                         "unsupported optional features (%x)",
                         (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
         * Large file size enabled file system can only be mounted
         * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
         */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
 +      if (ext4_has_feature_huge_file(sb)) {
                if (sizeof(blkcnt_t) < sizeof(u64)) {
                        ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
                                 "cannot be mounted RDWR without "
                        return 0;
                }
        }
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
 -          !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 +      if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "Can't support bigalloc feature without "
                         "extents feature\n");
        }
  
  #ifndef CONFIG_QUOTA
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
 -          !readonly) {
 +      if (ext4_has_feature_quota(sb) && !readonly) {
                ext4_msg(sb, KERN_ERR,
                         "Filesystem with quota feature cannot be mounted RDWR "
                         "without CONFIG_QUOTA");
@@@ -2973,7 -3312,7 +2973,7 @@@ static int count_overhead(struct super_
        ext4_group_t            i, ngroups = ext4_get_groups_count(sb);
        int                     s, j, count = 0;
  
 -      if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
 +      if (!ext4_has_feature_bigalloc(sb))
                return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
                        sbi->s_itb_per_group + 2);
  
@@@ -3064,10 -3403,10 +3064,10 @@@ int ext4_calculate_overhead(struct supe
        return 0;
  }
  
 -
 -static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
 +static void ext4_set_resv_clusters(struct super_block *sb)
  {
        ext4_fsblk_t resv_clusters;
 +      struct ext4_sb_info *sbi = EXT4_SB(sb);
  
        /*
         * There's no need to reserve anything when we aren't using extents.
         * hole punching doesn't need new metadata... This is needed especially
         * to keep ext2/3 backward compatibility.
         */
 -      if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
 -              return 0;
 +      if (!ext4_has_feature_extents(sb))
 +              return;
        /*
         * By default we reserve 2% or 4096 clusters, whichever is smaller.
         * This should cover the situations where we can not afford to run
         * allocation would require 1, or 2 blocks, higher numbers are
         * very rare.
         */
 -      resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
 -                      EXT4_SB(sb)->s_cluster_bits;
 +      resv_clusters = (ext4_blocks_count(sbi->s_es) >>
 +                       sbi->s_cluster_bits);
  
        do_div(resv_clusters, 50);
        resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
  
 -      return resv_clusters;
 -}
 -
 -
 -static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
 -{
 -      ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
 -                              sbi->s_cluster_bits;
 -
 -      if (count >= clusters)
 -              return -EINVAL;
 -
 -      atomic64_set(&sbi->s_resv_clusters, count);
 -      return 0;
 +      atomic64_set(&sbi->s_resv_clusters, resv_clusters);
  }
  
  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
  
        /* Warn if metadata_csum and gdt_csum are both set. */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
 -          EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
 +      if (ext4_has_feature_metadata_csum(sb) &&
 +          ext4_has_feature_gdt_csum(sb))
                ext4_warning(sb, "metadata_csum and uninit_bg are "
                             "redundant flags; please run fsck.");
  
        }
  
        /* Load the checksum driver */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                     EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
 +      if (ext4_has_feature_metadata_csum(sb)) {
                sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
                if (IS_ERR(sbi->s_chksum_driver)) {
                        ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
                ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
                         "invalid superblock checksum.  Run e2fsck?");
                silent = 1;
 +              ret = -EFSBADCRC;
                goto cantfind_ext4;
        }
  
        /* Precompute checksum seed for all metadata */
 -      if (ext4_has_metadata_csum(sb))
 +      if (ext4_has_feature_csum_seed(sb))
 +              sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
 +      else if (ext4_has_metadata_csum(sb))
                sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
                                               sizeof(es->s_uuid));
  
                (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
  
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
 -          (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
 -           EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 -           EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
 +          (ext4_has_compat_features(sb) ||
 +           ext4_has_ro_compat_features(sb) ||
 +           ext4_has_incompat_features(sb)))
                ext4_msg(sb, KERN_WARNING,
                       "feature flags set on rev 0 fs, "
                       "running e2fsck is recommended");
  
        if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
                set_opt2(sb, HURD_COMPAT);
 -              if (EXT4_HAS_INCOMPAT_FEATURE(sb,
 -                                            EXT4_FEATURE_INCOMPAT_64BIT)) {
 +              if (ext4_has_feature_64bit(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "The Hurd can't support 64-bit file systems");
                        goto failed_mount;
                }
        }
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
 -          es->s_encryption_level) {
 +      if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
                ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
                         es->s_encryption_level);
                goto failed_mount;
                }
        }
  
 -      has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                              EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
 +      has_huge_files = ext4_has_feature_huge_file(sb);
        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
                                                      has_huge_files);
        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
        }
  
        sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
 +      if (ext4_has_feature_64bit(sb)) {
                if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
                    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
                    !is_power_of_2(sbi->s_desc_size)) {
        for (i = 0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
 -      if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
 +      if (ext4_has_feature_dir_index(sb)) {
                i = le32_to_cpu(es->s_flags);
                if (i & EXT2_FLAGS_UNSIGNED_HASH)
                        sbi->s_hash_unsigned = 3;
  
        /* Handle clustersize */
        clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
 -      has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                              EXT4_FEATURE_RO_COMPAT_BIGALLOC);
 +      has_bigalloc = ext4_has_feature_bigalloc(sb);
        if (has_bigalloc) {
                if (clustersize < blocksize) {
                        ext4_msg(sb, KERN_ERR,
                goto failed_mount;
        }
  
 -      if (ext4_proc_root)
 -              sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
 -
 -      if (sbi->s_proc)
 -              proc_create_data("options", S_IRUGO, sbi->s_proc,
 -                               &ext4_seq_options_fops, sb);
 -
        bgl_lock_init(sbi->s_blockgroup_lock);
  
        for (i = 0; i < db_count; i++) {
        }
        if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
 +              ret = -EFSCORRUPTED;
                goto failed_mount2;
        }
  
        sb->s_xattr = ext4_xattr_handlers;
  #ifdef CONFIG_QUOTA
        sb->dq_op = &ext4_quota_operations;
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
 +      if (ext4_has_feature_quota(sb))
                sb->s_qcop = &dquot_quotactl_sysfile_ops;
        else
                sb->s_qcop = &ext4_qctl_operations;
        sb->s_root = NULL;
  
        needs_recovery = (es->s_last_orphan != 0 ||
 -                        EXT4_HAS_INCOMPAT_FEATURE(sb,
 -                                  EXT4_FEATURE_INCOMPAT_RECOVER));
 +                        ext4_has_feature_journal_needs_recovery(sb));
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
 -          !(sb->s_flags & MS_RDONLY))
 +      if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY))
                if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
                        goto failed_mount3a;
  
         * The first inode we look at is the journal inode.  Don't try
         * root first: it may be modified in the journal!
         */
 -      if (!test_opt(sb, NOLOAD) &&
 -          EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
 +      if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
                if (ext4_load_journal(sb, es, journal_devnum))
                        goto failed_mount3a;
        } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 -            EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 +                 ext4_has_feature_journal_needs_recovery(sb)) {
                ext4_msg(sb, KERN_ERR, "required journal recovery "
                       "suppressed and not mounted read-only");
                goto failed_mount_wq;
        } else {
 +              /* Nojournal mode, all journal mount options are illegal */
 +              if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                               "journal_checksum, fs mounted w/o journal");
 +                      goto failed_mount_wq;
 +              }
 +              if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                               "journal_async_commit, fs mounted w/o journal");
 +                      goto failed_mount_wq;
 +              }
 +              if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                               "commit=%lu, fs mounted w/o journal",
 +                               sbi->s_commit_interval / HZ);
 +                      goto failed_mount_wq;
 +              }
 +              if (EXT4_MOUNT_DATA_FLAGS &
 +                  (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
 +                      ext4_msg(sb, KERN_ERR, "can't mount with "
 +                               "data=, fs mounted w/o journal");
 +                      goto failed_mount_wq;
 +              }
 +              sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
 +              clear_opt(sb, JOURNAL_CHECKSUM);
                clear_opt(sb, DATA_FLAGS);
                sbi->s_journal = NULL;
                needs_recovery = 0;
                goto no_journal;
        }
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
 +      if (ext4_has_feature_64bit(sb) &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
@@@ -3762,16 -4101,18 +3762,16 @@@ no_journal
                }
        }
  
 -      if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
 -           EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
 +      if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
            (blocksize != PAGE_CACHE_SIZE)) {
                ext4_msg(sb, KERN_ERR,
                         "Unsupported blocksize for fs encryption");
                goto failed_mount_wq;
        }
  
 -      if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
 -          !(sb->s_flags & MS_RDONLY) &&
 -          !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
 -              EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
 +      if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) &&
 +          !ext4_has_feature_encrypt(sb)) {
 +              ext4_set_feature_encrypt(sb);
                ext4_commit_super(sb, 1);
        }
  
        if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
                                                     EXT4_GOOD_OLD_INODE_SIZE;
 -              if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                     EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
 +              if (ext4_has_feature_extra_isize(sb)) {
                        if (sbi->s_want_extra_isize <
                            le16_to_cpu(es->s_want_extra_isize))
                                sbi->s_want_extra_isize =
                         "available");
        }
  
 -      err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
 -      if (err) {
 -              ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
 -                       "reserved pool", ext4_calculate_resv_clusters(sb));
 -              goto failed_mount4a;
 -      }
 +      ext4_set_resv_clusters(sb);
  
        err = ext4_setup_system_zone(sb);
        if (err) {
                goto failed_mount6;
        }
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 +      if (ext4_has_feature_flex_bg(sb))
                if (!ext4_fill_flex_info(sb)) {
                        ext4_msg(sb, KERN_ERR,
                               "unable to initialize "
        if (err)
                goto failed_mount6;
  
 -      sbi->s_kobj.kset = ext4_kset;
 -      init_completion(&sbi->s_kobj_unregister);
 -      err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
 -                                 "%s", sb->s_id);
 +      err = ext4_register_sysfs(sb);
        if (err)
                goto failed_mount7;
  
  #ifdef CONFIG_QUOTA
        /* Enable quota usage during mount. */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
 -          !(sb->s_flags & MS_RDONLY)) {
 +      if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) {
                err = ext4_enable_quotas(sb);
                if (err)
                        goto failed_mount8;
@@@ -3962,7 -4313,7 +3962,7 @@@ cantfind_ext4
  
  #ifdef CONFIG_QUOTA
  failed_mount8:
 -      kobject_del(&sbi->s_kobj);
 +      ext4_unregister_sysfs(sb);
  #endif
  failed_mount7:
        ext4_unregister_li_request(sb);
@@@ -4002,6 -4353,10 +4002,6 @@@ failed_mount2
  failed_mount:
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);
 -      if (sbi->s_proc) {
 -              remove_proc_entry("options", sbi->s_proc);
 -              remove_proc_entry(sb->s_id, ext4_proc_root);
 -      }
  #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(sbi->s_qf_names[i]);
@@@ -4048,7 -4403,7 +4048,7 @@@ static journal_t *ext4_get_journal(stru
        struct inode *journal_inode;
        journal_t *journal;
  
 -      BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 +      BUG_ON(!ext4_has_feature_journal(sb));
  
        /* First, test for the existence of a valid inode on disk.  Bad
         * things happen if we iget() an unused inode, as the subsequent
@@@ -4098,7 -4453,7 +4098,7 @@@ static journal_t *ext4_get_dev_journal(
        struct ext4_super_block *es;
        struct block_device *bdev;
  
 -      BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 +      BUG_ON(!ext4_has_feature_journal(sb));
  
        bdev = ext4_blkdev_get(j_dev, sb);
        if (bdev == NULL)
@@@ -4190,7 -4545,7 +4190,7 @@@ static int ext4_load_journal(struct sup
        int err = 0;
        int really_read_only;
  
 -      BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 +      BUG_ON(!ext4_has_feature_journal(sb));
  
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
         * crash?  For recovery, we need to check in advance whether we
         * can get read-write access to the device.
         */
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 +      if (ext4_has_feature_journal_needs_recovery(sb)) {
                if (sb->s_flags & MS_RDONLY) {
                        ext4_msg(sb, KERN_INFO, "INFO: recovery "
                                        "required on readonly filesystem");
        if (!(journal->j_flags & JBD2_BARRIER))
                ext4_msg(sb, KERN_INFO, "barriers disabled");
  
 -      if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
 +      if (!ext4_has_feature_journal_needs_recovery(sb))
                err = jbd2_journal_wipe(journal, !really_read_only);
        if (!err) {
                char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
@@@ -4352,7 -4707,7 +4352,7 @@@ static void ext4_mark_recovery_complete
  {
        journal_t *journal = EXT4_SB(sb)->s_journal;
  
 -      if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
 +      if (!ext4_has_feature_journal(sb)) {
                BUG_ON(journal != NULL);
                return;
        }
        if (jbd2_journal_flush(journal) < 0)
                goto out;
  
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
 +      if (ext4_has_feature_journal_needs_recovery(sb) &&
            sb->s_flags & MS_RDONLY) {
 -              EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_clear_feature_journal_needs_recovery(sb);
                ext4_commit_super(sb, 1);
        }
  
@@@ -4382,7 -4737,7 +4382,7 @@@ static void ext4_clear_journal_err(stru
        int j_errno;
        const char *errstr;
  
 -      BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 +      BUG_ON(!ext4_has_feature_journal(sb));
  
        journal = EXT4_SB(sb)->s_journal;
  
@@@ -4497,7 -4852,7 +4497,7 @@@ static int ext4_freeze(struct super_blo
                        goto out;
  
                /* Journal blocked and flushed, clear needs_recovery flag. */
 -              EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_clear_feature_journal_needs_recovery(sb);
        }
  
        error = ext4_commit_super(sb, 1);
@@@ -4519,7 -4874,7 +4519,7 @@@ static int ext4_unfreeze(struct super_b
  
        if (EXT4_SB(sb)->s_journal) {
                /* Reset the needs_recovery flag before the fs is unlocked. */
 -              EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 +              ext4_set_feature_journal_needs_recovery(sb);
        }
  
        ext4_commit_super(sb, 1);
@@@ -4672,7 -5027,8 +4672,7 @@@ static int ext4_remount(struct super_bl
                                ext4_mark_recovery_complete(sb, es);
                } else {
                        /* Make sure we can mount this feature set readwrite */
 -                      if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                      EXT4_FEATURE_RO_COMPAT_READONLY) ||
 +                      if (ext4_has_feature_readonly(sb) ||
                            !ext4_feature_set_ok(sb, 0)) {
                                err = -EROFS;
                                goto restore_opts;
                                if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
                                        ext4_msg(sb, KERN_ERR,
               "ext4_remount: Checksum for group %u failed (%u!=%u)",
 -              g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
 +              g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
 -                                      err = -EINVAL;
 +                                      err = -EFSBADCRC;
                                        goto restore_opts;
                                }
                        }
                        sbi->s_mount_state = le16_to_cpu(es->s_state);
                        if (!ext4_setup_super(sb, es, 0))
                                sb->s_flags &= ~MS_RDONLY;
 -                      if (EXT4_HAS_INCOMPAT_FEATURE(sb,
 -                                                   EXT4_FEATURE_INCOMPAT_MMP))
 +                      if (ext4_has_feature_mmp(sb))
                                if (ext4_multi_mount_protect(sb,
                                                le64_to_cpu(es->s_mmp_block))) {
                                        err = -EROFS;
        if (enable_quota) {
                if (sb_any_quota_suspended(sb))
                        dquot_resume(sb, -1);
 -              else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 -                                      EXT4_FEATURE_RO_COMPAT_QUOTA)) {
 +              else if (ext4_has_feature_quota(sb)) {
                        err = ext4_enable_quotas(sb);
                        if (err)
                                goto restore_opts;
@@@ -4897,7 -5255,7 +4897,7 @@@ static int ext4_mark_dquot_dirty(struc
        struct ext4_sb_info *sbi = EXT4_SB(sb);
  
        /* Are we journaling quotas? */
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
 +      if (ext4_has_feature_quota(sb) ||
            sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
                dquot_mark_dquot_dirty(dquot);
                return ext4_write_dquot(dquot);
@@@ -4985,7 -5343,7 +4985,7 @@@ static int ext4_quota_enable(struct sup
                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
        };
  
 -      BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
 +      BUG_ON(!ext4_has_feature_quota(sb));
  
        if (!qf_inums[type])
                return -EPERM;
@@@ -5179,11 -5537,11 +5179,11 @@@ static inline void unregister_as_ext2(v
  
  static inline int ext2_feature_set_ok(struct super_block *sb)
  {
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
 +      if (ext4_has_unknown_ext2_incompat_features(sb))
                return 0;
        if (sb->s_flags & MS_RDONLY)
                return 1;
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
 +      if (ext4_has_unknown_ext2_ro_compat_features(sb))
                return 0;
        return 1;
  }
@@@ -5208,13 -5566,13 +5208,13 @@@ static inline void unregister_as_ext3(v
  
  static inline int ext3_feature_set_ok(struct super_block *sb)
  {
 -      if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
 +      if (ext4_has_unknown_ext3_incompat_features(sb))
                return 0;
 -      if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
 +      if (!ext4_has_feature_journal(sb))
                return 0;
        if (sb->s_flags & MS_RDONLY)
                return 1;
 -      if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
 +      if (ext4_has_unknown_ext3_ro_compat_features(sb))
                return 0;
        return 1;
  }
@@@ -5228,6 -5586,37 +5228,6 @@@ static struct file_system_type ext4_fs_
  };
  MODULE_ALIAS_FS("ext4");
  
 -static int __init ext4_init_feat_adverts(void)
 -{
 -      struct ext4_features *ef;
 -      int ret = -ENOMEM;
 -
 -      ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
 -      if (!ef)
 -              goto out;
 -
 -      ef->f_kobj.kset = ext4_kset;
 -      init_completion(&ef->f_kobj_unregister);
 -      ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
 -                                 "features");
 -      if (ret) {
 -              kfree(ef);
 -              goto out;
 -      }
 -
 -      ext4_feat = ef;
 -      ret = 0;
 -out:
 -      return ret;
 -}
 -
 -static void ext4_exit_feat_adverts(void)
 -{
 -      kobject_put(&ext4_feat->f_kobj);
 -      wait_for_completion(&ext4_feat->f_kobj_unregister);
 -      kfree(ext4_feat);
 -}
 -
  /* Shared across all ext4 file systems */
  wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
  struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
@@@ -5254,15 -5643,21 +5254,15 @@@ static int __init ext4_init_fs(void
  
        err = ext4_init_pageio();
        if (err)
 -              goto out7;
 +              goto out5;
  
        err = ext4_init_system_zone();
        if (err)
 -              goto out6;
 -      ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
 -      if (!ext4_kset) {
 -              err = -ENOMEM;
 -              goto out5;
 -      }
 -      ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 +              goto out4;
  
 -      err = ext4_init_feat_adverts();
 +      err = ext4_init_sysfs();
        if (err)
 -              goto out4;
 +              goto out3;
  
        err = ext4_init_mballoc();
        if (err)
@@@ -5287,12 -5682,16 +5287,12 @@@ out1
        ext4_mballoc_ready = 0;
        ext4_exit_mballoc();
  out2:
 -      ext4_exit_feat_adverts();
 -out4:
 -      if (ext4_proc_root)
 -              remove_proc_entry("fs/ext4", NULL);
 -      kset_unregister(ext4_kset);
 -out5:
 +      ext4_exit_sysfs();
 +out3:
        ext4_exit_system_zone();
 -out6:
 +out4:
        ext4_exit_pageio();
 -out7:
 +out5:
        ext4_exit_es();
  
        return err;
@@@ -5307,7 -5706,9 +5307,7 @@@ static void __exit ext4_exit_fs(void
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
        ext4_exit_mballoc();
 -      ext4_exit_feat_adverts();
 -      remove_proc_entry("fs/ext4", NULL);
 -      kset_unregister(ext4_kset);
 +      ext4_exit_sysfs();
        ext4_exit_system_zone();
        ext4_exit_pageio();
        ext4_exit_es();
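
The ext4/super.c hunks above are almost entirely a mechanical conversion: open-coded EXT4_HAS_{COMPAT,RO_COMPAT,INCOMPAT}_FEATURE() / EXT4_SET_* / EXT4_CLEAR_* invocations become per-feature helpers (ext4_has_feature_64bit(), ext4_set_feature_encrypt(), ext4_clear_feature_journal_needs_recovery(), ...), the old kset/feat-adverts/proc plumbing is folded into ext4_register_sysfs()/ext4_init_sysfs(), and the no-journal path now clears JOURNAL_CHECKSUM explicitly. The helpers are generated from the feature flag names; a sketch of what such a generator looks like for the incompat group follows (the exact macro in fs/ext4/ext4.h may differ in detail):

	/* Sketch of a per-feature helper generator; see fs/ext4/ext4.h for
	 * the authoritative version.  One invocation per feature flag
	 * produces the has/set/clear trio used throughout super.c above. */
	#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname)			\
	static inline bool ext4_has_feature_##name(struct super_block *sb)	\
	{									\
		return ((EXT4_SB(sb)->s_es->s_feature_incompat &		\
			 cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0);	\
	}									\
	static inline void ext4_set_feature_##name(struct super_block *sb)	\
	{									\
		EXT4_SB(sb)->s_es->s_feature_incompat |=			\
			cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname);		\
	}									\
	static inline void ext4_clear_feature_##name(struct super_block *sb)	\
	{									\
		EXT4_SB(sb)->s_es->s_feature_incompat &=			\
			~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname);		\
	}

	EXT4_FEATURE_INCOMPAT_FUNCS(64bit,		64BIT)
	EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
	EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER)
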
diff --combined fs/fs-writeback.c
index 7378169e90be6ed485ac48d0cf633c8e37c4c3d2,e82e1194b1d84a889ec3743333f3c526f1291d54..206a68b1db1ab1b1f5a6ed6a611723f957e1e58b
@@@ -778,24 -778,19 +778,24 @@@ static void bdi_split_work_to_wbs(struc
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
  {
 -      int next_memcg_id = 0;
 -      struct bdi_writeback *wb;
 -      struct wb_iter iter;
 +      struct bdi_writeback *last_wb = NULL;
 +      struct bdi_writeback *wb = list_entry(&bdi->wb_list,
 +                                            struct bdi_writeback, bdi_node);
  
        might_sleep();
  restart:
        rcu_read_lock();
 -      bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
 +      list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;
  
 +              if (last_wb) {
 +                      wb_put(last_wb);
 +                      last_wb = NULL;
 +              }
 +
                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
  
                wb_queue_work(wb, work);
  
 -              next_memcg_id = wb->memcg_css->id + 1;
 +              /*
 +               * Pin @wb so that it stays on @bdi->wb_list.  This allows
 +               * continuing iteration from @wb after dropping and
 +               * regrabbing rcu read lock.
 +               */
 +              wb_get(wb);
 +              last_wb = wb;
 +
                rcu_read_unlock();
                wb_wait_for_completion(bdi, &fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();
 +
 +      if (last_wb)
 +              wb_put(last_wb);
  }
  
  #else /* CONFIG_CGROUP_WRITEBACK */
@@@ -1872,11 -1857,12 +1872,11 @@@ void wakeup_flusher_threads(long nr_pag
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
 -              struct wb_iter iter;
  
                if (!bdi_has_dirty_io(bdi))
                        continue;
  
 -              bdi_for_each_wb(wb, bdi, &iter, 0)
 +              list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                        wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
                                           false, reason);
        }
@@@ -1908,10 -1894,11 +1908,10 @@@ static void wakeup_dirtytime_writeback(
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
 -              struct wb_iter iter;
  
 -              bdi_for_each_wb(wb, bdi, &iter, 0)
 -                      if (!list_empty(&bdi->wb.b_dirty_time))
 -                              wb_wakeup(&bdi->wb);
 +              list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
 +                      if (!list_empty(&wb->b_dirty_time))
 +                              wb_wakeup(wb);
        }
        rcu_read_unlock();
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@@ -2149,7 -2136,12 +2149,12 @@@ static void wait_sb_inodes(struct super
                iput(old_inode);
                old_inode = inode;
  
-               filemap_fdatawait(mapping);
+               /*
+                * We keep the error status of individual mapping so that
+                * applications can catch the writeback error using fsync(2).
+                * See filemap_fdatawait_keep_errors() for details.
+                */
+               filemap_fdatawait_keep_errors(mapping);
  
                cond_resched();
  
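
In fs/fs-writeback.c, bdi_split_work_to_wbs() no longer walks the per-bdi writeback structures with the memcg-id based wb_iter cursor; it walks bdi->wb_list with list_for_each_entry_continue_rcu() and, before sleeping, pins the current entry with wb_get() so the walk can resume from the same node once the RCU read lock is re-acquired. The wait_sb_inodes() hunk is independent: it switches to filemap_fdatawait_keep_errors() so per-mapping error bits survive for a later fsync(2). Condensed to its essentials, the resumable-walk pattern looks like this (the writeback-specific work setup is elided):

	struct bdi_writeback *last_wb = NULL;
	/* Point at the list head cast as an entry so that the
	 * _continue_ iterator starts with the first real node. */
	struct bdi_writeback *wb = list_entry(&bdi->wb_list,
					      struct bdi_writeback, bdi_node);

	restart:
	rcu_read_lock();
	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
		if (last_wb) {			/* drop the previous pin */
			wb_put(last_wb);
			last_wb = NULL;
		}

		/* ... issue work against @wb ... */

		wb_get(wb);			/* keep @wb on the list */
		last_wb = wb;
		rcu_read_unlock();
		/* ... sleep (wait for completion), then resume ... */
		goto restart;
	}
	rcu_read_unlock();

	if (last_wb)
		wb_put(last_wb);
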
diff --combined fs/jffs2/wbuf.c
index 63f31c0733c51e5e1e8cc0b51425dc07bb59cc3e,955da626ba6b7f74d02a7d7dd5badb5e2a031430..f3a4857ff0718794b967836e796b8f9c9e345ae7
@@@ -1264,7 -1264,7 +1264,7 @@@ int jffs2_dataflash_setup(struct jffs2_
        if ((c->flash_size % c->sector_size) != 0) {
                c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
                pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
-       };
+       }
  
        c->wbuf_ofs = 0xFFFFFFFF;
        c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
  #ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
        c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
        if (!c->wbuf_verify) {
 -              kfree(c->oobbuf);
                kfree(c->wbuf);
                return -ENOMEM;
        }
diff --combined fs/mpage.c
index 09abba7653aa8db8189d05d7c2094b77ef1998a9,7d29c863c05214509afc5986a02820b719ab7e6e..1480d3a180370fe3922a7724e613d09b896f9d00
@@@ -361,7 -361,7 +361,7 @@@ mpage_readpages(struct address_space *m
        sector_t last_block_in_bio = 0;
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+       gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
  
        map_bh.b_state = 0;
        map_bh.b_size = 0;
@@@ -397,7 -397,7 +397,7 @@@ int mpage_readpage(struct page *page, g
        sector_t last_block_in_bio = 0;
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-       gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping);
+       gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
  
        map_bh.b_state = 0;
        map_bh.b_size = 0;
@@@ -485,7 -485,6 +485,7 @@@ static int __mpage_writepage(struct pag
        struct buffer_head map_bh;
        loff_t i_size = i_size_read(inode);
        int ret = 0;
 +      int wr = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
  
        if (page_has_buffers(page)) {
                struct buffer_head *head = page_buffers(page);
@@@ -594,7 -593,7 +594,7 @@@ page_is_mapped
         * This page will go to BIO.  Do we need to send this BIO off first?
         */
        if (bio && mpd->last_block_in_bio != blocks[0] - 1)
 -              bio = mpage_bio_submit(WRITE, bio);
 +              bio = mpage_bio_submit(wr, bio);
  
  alloc_new:
        if (bio == NULL) {
        wbc_account_io(wbc, page, PAGE_SIZE);
        length = first_unmapped << blkbits;
        if (bio_add_page(bio, page, length, 0) < length) {
 -              bio = mpage_bio_submit(WRITE, bio);
 +              bio = mpage_bio_submit(wr, bio);
                goto alloc_new;
        }
  
        set_page_writeback(page);
        unlock_page(page);
        if (boundary || (first_unmapped != blocks_per_page)) {
 -              bio = mpage_bio_submit(WRITE, bio);
 +              bio = mpage_bio_submit(wr, bio);
                if (boundary_block) {
                        write_boundary_block(boundary_bdev,
                                        boundary_block, 1 << blkbits);
  
  confused:
        if (bio)
 -              bio = mpage_bio_submit(WRITE, bio);
 +              bio = mpage_bio_submit(wr, bio);
  
        if (mpd->use_writepage) {
                ret = mapping->a_ops->writepage(page, wbc);
@@@ -699,11 -698,8 +699,11 @@@ mpage_writepages(struct address_space *
                };
  
                ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
 -              if (mpd.bio)
 -                      mpage_bio_submit(WRITE, mpd.bio);
 +              if (mpd.bio) {
 +                      int wr = (wbc->sync_mode == WB_SYNC_ALL ?
 +                                WRITE_SYNC : WRITE);
 +                      mpage_bio_submit(wr, mpd.bio);
 +              }
        }
        blk_finish_plug(&plug);
        return ret;
@@@ -720,11 -716,8 +720,11 @@@ int mpage_writepage(struct page *page, 
                .use_writepage = 0,
        };
        int ret = __mpage_writepage(page, wbc, &mpd);
 -      if (mpd.bio)
 -              mpage_bio_submit(WRITE, mpd.bio);
 +      if (mpd.bio) {
 +              int wr = (wbc->sync_mode == WB_SYNC_ALL ?
 +                        WRITE_SYNC : WRITE);
 +              mpage_bio_submit(wr, mpd.bio);
 +      }
        return ret;
  }
  EXPORT_SYMBOL(mpage_writepage);
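
Two independent cleanups run through the mpage hunks. Read paths derive their allocation mask with mapping_gfp_constraint() instead of open-coding the AND against mapping_gfp_mask(), and the write path submits bios with WRITE_SYNC when writeback runs in WB_SYNC_ALL mode. The helper is essentially a named intersection; roughly (the authoritative definition lives in include/linux/pagemap.h):

	/* Restrict a caller-supplied gfp mask to what the address_space
	 * permits; roughly what include/linux/pagemap.h provides. */
	static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
						   gfp_t gfp_mask)
	{
		return mapping_gfp_mask(mapping) & gfp_mask;
	}

Call sites then read, for example, gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); which is equivalent to the old expression but keeps the intent ("GFP_KERNEL, minus whatever this mapping forbids") in one place.
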
diff --combined fs/namei.c
index 2b729d253715ba183e912fae98cfec1e68665e24,c86ea9e89f7d573c2d735e72915ad0eb47fbe9f3..174ef4f106cd2ac9696db8cf153f40dd7ef0ecaa
@@@ -955,23 -955,26 +955,23 @@@ static bool safe_hardlink_source(struc
   *  - sysctl_protected_hardlinks enabled
   *  - fsuid does not match inode
   *  - hardlink source is unsafe (see safe_hardlink_source() above)
 - *  - not CAP_FOWNER
 + *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
   *
   * Returns 0 if successful, -ve on error.
   */
  static int may_linkat(struct path *link)
  {
 -      const struct cred *cred;
        struct inode *inode;
  
        if (!sysctl_protected_hardlinks)
                return 0;
  
 -      cred = current_cred();
        inode = link->dentry->d_inode;
  
        /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
         * otherwise, it must be a safe source.
         */
 -      if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
 -          capable(CAP_FOWNER))
 +      if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
                return 0;
  
        audit_log_link_denied("linkat", link);
@@@ -1966,7 -1969,7 +1966,7 @@@ OK
                if (err) {
                        const char *s = get_link(nd);
  
 -                      if (unlikely(IS_ERR(s)))
 +                      if (IS_ERR(s))
                                return PTR_ERR(s);
                        err = 0;
                        if (unlikely(!s)) {
@@@ -2279,8 -2282,6 +2279,8 @@@ EXPORT_SYMBOL(vfs_path_lookup)
   *
   * Note that this routine is purely a helper for filesystem usage and should
   * not be called by generic code.
 + *
 + * The caller must hold base->i_mutex.
   */
  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
  {
  }
  EXPORT_SYMBOL(lookup_one_len);
  
 +/**
 + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 + * @name:     pathname component to lookup
 + * @base:     base directory to lookup from
 + * @len:      maximum length @len should be interpreted to
 + *
 + * Note that this routine is purely a helper for filesystem usage and should
 + * not be called by generic code.
 + *
 + * Unlike lookup_one_len, it should be called without the parent
 + * i_mutex held, and will take the i_mutex itself if necessary.
 + */
 +struct dentry *lookup_one_len_unlocked(const char *name,
 +                                     struct dentry *base, int len)
 +{
 +      struct qstr this;
 +      unsigned int c;
 +      int err;
 +      struct dentry *ret;
 +
 +      this.name = name;
 +      this.len = len;
 +      this.hash = full_name_hash(name, len);
 +      if (!len)
 +              return ERR_PTR(-EACCES);
 +
 +      if (unlikely(name[0] == '.')) {
 +              if (len < 2 || (len == 2 && name[1] == '.'))
 +                      return ERR_PTR(-EACCES);
 +      }
 +
 +      while (len--) {
 +              c = *(const unsigned char *)name++;
 +              if (c == '/' || c == '\0')
 +                      return ERR_PTR(-EACCES);
 +      }
 +      /*
 +       * See if the low-level filesystem might want
 +       * to use its own hash..
 +       */
 +      if (base->d_flags & DCACHE_OP_HASH) {
 +              int err = base->d_op->d_hash(base, &this);
 +              if (err < 0)
 +                      return ERR_PTR(err);
 +      }
 +
 +      err = inode_permission(base->d_inode, MAY_EXEC);
 +      if (err)
 +              return ERR_PTR(err);
 +
 +      ret = __d_lookup(base, &this);
 +      if (ret)
 +              return ret;
 +      /*
 +       * __d_lookup() is used to try to get a quick answer and avoid the
 +       * mutex.  A false-negative does no harm.
 +       */
 +      ret = __d_lookup(base, &this);
 +      if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
 +              dput(ret);
 +              ret = NULL;
 +      }
 +      if (ret)
 +              return ret;
 +
 +      mutex_lock(&base->d_inode->i_mutex);
 +      ret =  __lookup_hash(&this, base, 0);
 +      mutex_unlock(&base->d_inode->i_mutex);
 +      return ret;
 +}
 +EXPORT_SYMBOL(lookup_one_len_unlocked);
 +
  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
  {
@@@ -3454,7 -3383,7 +3454,7 @@@ struct file *do_file_open_root(struct d
                return ERR_PTR(-ELOOP);
  
        filename = getname_kernel(name);
 -      if (unlikely(IS_ERR(filename)))
 +      if (IS_ERR(filename))
                return ERR_CAST(filename);
  
        set_nameidata(&nd, -1, filename);
@@@ -4678,7 -4607,7 +4678,7 @@@ EXPORT_SYMBOL(__page_symlink)
  int page_symlink(struct inode *inode, const char *symname, int len)
  {
        return __page_symlink(inode, symname, len,
-                       !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+                       !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
  }
  EXPORT_SYMBOL(page_symlink);
  
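
The namei.c hunks do three things: may_linkat() now relies on inode_owner_or_capable(), so CAP_FOWNER only counts in a user namespace where the inode owner is mapped; two IS_ERR() checks drop a redundant unlikely(); and a new exported helper, lookup_one_len_unlocked(), performs a single-component lookup without requiring the caller to hold the parent's i_mutex, trying a lockless __d_lookup() first and only taking i_mutex around __lookup_hash() as a fallback. A hypothetical caller might look like this (dir and the name are placeholders, not taken from the patch):

	/* Hypothetical use: resolve "child" under @dir without holding
	 * dir->d_inode->i_mutex; the helper locks internally if needed. */
	struct dentry *child;

	child = lookup_one_len_unlocked("child", dir, strlen("child"));
	if (IS_ERR(child))
		return PTR_ERR(child);
	if (d_really_is_negative(child)) {
		dput(child);		/* no such entry */
		return -ENOENT;
	}
	/* ... use d_inode(child) ... */
	dput(child);
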
diff --combined fs/nfs/file.c
index 37f639d50af580396bf016a2fb40c2b427c7b1dd,17d3417c8a74375a2974af6e1c4f13219b188476..93e236429c5d785a1711d643d0b4676dfe4396cf
@@@ -473,8 -473,8 +473,8 @@@ static int nfs_release_page(struct pag
        dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
  
        /* Always try to initiate a 'commit' if relevant, but only
-        * wait for it if __GFP_WAIT is set.  Even then, only wait 1
-        * second and only if the 'bdi' is not congested.
+        * wait for it if the caller allows blocking.  Even then,
+        * only wait 1 second and only if the 'bdi' is not congested.
         * Waiting indefinitely can cause deadlocks when the NFS
         * server is on this machine, when a new TCP connection is
         * needed and in other rare cases.  There is no particular
        if (mapping) {
                struct nfs_server *nfss = NFS_SERVER(mapping->host);
                nfs_commit_inode(mapping->host, 0);
-               if ((gfp & __GFP_WAIT) &&
+               if (gfpflags_allow_blocking(gfp) &&
                    !bdi_write_congested(&nfss->backing_dev_info)) {
                        wait_on_page_bit_killable_timeout(page, PG_private,
                                                          HZ);
@@@ -738,7 -738,18 +738,7 @@@ out_noconflict
  
  static int do_vfs_lock(struct file *file, struct file_lock *fl)
  {
 -      int res = 0;
 -      switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
 -              case FL_POSIX:
 -                      res = posix_lock_file_wait(file, fl);
 -                      break;
 -              case FL_FLOCK:
 -                      res = flock_lock_file_wait(file, fl);
 -                      break;
 -              default:
 -                      BUG();
 -      }
 -      return res;
 +      return locks_lock_file_wait(file, fl);
  }
  
  static int
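
fs/nfs/file.c picks up two of this cycle's API consolidations: do_vfs_lock() collapses into locks_lock_file_wait(), which does the FL_POSIX/FL_FLOCK dispatch itself, and nfs_release_page() asks gfpflags_allow_blocking() instead of testing the now-defunct __GFP_WAIT bit (the xfs_qm shrinker further down makes the matching __GFP_DIRECT_RECLAIM substitution by hand). The predicate is a one-liner; roughly (see include/linux/gfp.h):

	/* A caller may block iff the allocation context allows direct
	 * reclaim; roughly the helper added in include/linux/gfp.h. */
	static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
	{
		return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
	}
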
diff --combined fs/ocfs2/cluster/heartbeat.c
index e404386bd93e8535a32152fcc5936ede9ac5fa41,ddddef0021a0f8cd94f456969099bc660ec1fdb5..709fbbd44c65366ce1e31aebce0904b5966d53a6
@@@ -219,7 -219,8 +219,8 @@@ struct o2hb_region 
        unsigned                hr_unclean_stop:1,
                                hr_aborted_start:1,
                                hr_item_pinned:1,
-                               hr_item_dropped:1;
+                               hr_item_dropped:1,
+                               hr_node_deleted:1;
  
        /* protected by the hr_callback_sem */
        struct task_struct      *hr_task;
@@@ -1078,7 -1079,13 +1079,13 @@@ static int o2hb_thread(void *data
        set_user_nice(current, MIN_NICE);
  
        /* Pin node */
-       o2nm_depend_this_node();
+       ret = o2nm_depend_this_node();
+       if (ret) {
+               mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+               reg->hr_node_deleted = 1;
+               wake_up(&o2hb_steady_queue);
+               return 0;
+       }
  
        while (!kthread_should_stop() &&
               !reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@@ -1473,17 -1480,16 +1480,17 @@@ static int o2hb_read_block_input(struc
        return 0;
  }
  
 -static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
 +static ssize_t o2hb_region_block_bytes_show(struct config_item *item,
                                            char *page)
  {
 -      return sprintf(page, "%u\n", reg->hr_block_bytes);
 +      return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes);
  }
  
 -static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
 +static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
                                             const char *page,
                                             size_t count)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        int status;
        unsigned long block_bytes;
        unsigned int block_bits;
        return count;
  }
  
 -static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
 +static ssize_t o2hb_region_start_block_show(struct config_item *item,
                                            char *page)
  {
 -      return sprintf(page, "%llu\n", reg->hr_start_block);
 +      return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block);
  }
  
 -static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
 +static ssize_t o2hb_region_start_block_store(struct config_item *item,
                                             const char *page,
                                             size_t count)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        unsigned long long tmp;
        char *p = (char *)page;
  
        return count;
  }
  
 -static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
 -                                     char *page)
 +static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page)
  {
 -      return sprintf(page, "%d\n", reg->hr_blocks);
 +      return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks);
  }
  
 -static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
 +static ssize_t o2hb_region_blocks_store(struct config_item *item,
                                        const char *page,
                                        size_t count)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        unsigned long tmp;
        char *p = (char *)page;
  
        return count;
  }
  
 -static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
 -                                  char *page)
 +static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
  {
        unsigned int ret = 0;
  
 -      if (reg->hr_bdev)
 -              ret = sprintf(page, "%s\n", reg->hr_dev_name);
 +      if (to_o2hb_region(item)->hr_bdev)
 +              ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name);
  
        return ret;
  }
  }
  
  /* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
 -static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 +static ssize_t o2hb_region_dev_store(struct config_item *item,
                                     const char *page,
                                     size_t count)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        struct task_struct *hb_task;
        long fd;
        int sectsize;
        spin_unlock(&o2hb_live_lock);
  
        ret = wait_event_interruptible(o2hb_steady_queue,
-                               atomic_read(&reg->hr_steady_iterations) == 0);
+                               atomic_read(&reg->hr_steady_iterations) == 0 ||
+                               reg->hr_node_deleted);
        if (ret) {
                atomic_set(&reg->hr_steady_iterations, 0);
                reg->hr_aborted_start = 1;
                goto out3;
        }
  
+       if (reg->hr_node_deleted) {
+               ret = -EINVAL;
+               goto out3;
+       }
        /* Ok, we were woken.  Make sure it wasn't by drop_item() */
        spin_lock(&o2hb_live_lock);
        hb_task = reg->hr_task;
@@@ -1830,9 -1841,9 +1843,9 @@@ out
        return ret;
  }
  
 -static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
 -                                      char *page)
 +static ssize_t o2hb_region_pid_show(struct config_item *item, char *page)
  {
 +      struct o2hb_region *reg = to_o2hb_region(item);
        pid_t pid = 0;
  
        spin_lock(&o2hb_live_lock);
        return sprintf(page, "%u\n", pid);
  }
  
 -struct o2hb_region_attribute {
 -      struct configfs_attribute attr;
 -      ssize_t (*show)(struct o2hb_region *, char *);
 -      ssize_t (*store)(struct o2hb_region *, const char *, size_t);
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "block_bytes",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_region_block_bytes_read,
 -      .store  = o2hb_region_block_bytes_write,
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_start_block = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "start_block",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_region_start_block_read,
 -      .store  = o2hb_region_start_block_write,
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_blocks = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "blocks",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_region_blocks_read,
 -      .store  = o2hb_region_blocks_write,
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_dev = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "dev",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_region_dev_read,
 -      .store  = o2hb_region_dev_write,
 -};
 -
 -static struct o2hb_region_attribute o2hb_region_attr_pid = {
 -       .attr   = { .ca_owner = THIS_MODULE,
 -                   .ca_name = "pid",
 -                   .ca_mode = S_IRUGO | S_IRUSR },
 -       .show   = o2hb_region_pid_read,
 -};
 +CONFIGFS_ATTR(o2hb_region_, block_bytes);
 +CONFIGFS_ATTR(o2hb_region_, start_block);
 +CONFIGFS_ATTR(o2hb_region_, blocks);
 +CONFIGFS_ATTR(o2hb_region_, dev);
 +CONFIGFS_ATTR_RO(o2hb_region_, pid);
  
  static struct configfs_attribute *o2hb_region_attrs[] = {
 -      &o2hb_region_attr_block_bytes.attr,
 -      &o2hb_region_attr_start_block.attr,
 -      &o2hb_region_attr_blocks.attr,
 -      &o2hb_region_attr_dev.attr,
 -      &o2hb_region_attr_pid.attr,
 +      &o2hb_region_attr_block_bytes,
 +      &o2hb_region_attr_start_block,
 +      &o2hb_region_attr_blocks,
 +      &o2hb_region_attr_dev,
 +      &o2hb_region_attr_pid,
        NULL,
  };
  
 -static ssize_t o2hb_region_show(struct config_item *item,
 -                              struct configfs_attribute *attr,
 -                              char *page)
 -{
 -      struct o2hb_region *reg = to_o2hb_region(item);
 -      struct o2hb_region_attribute *o2hb_region_attr =
 -              container_of(attr, struct o2hb_region_attribute, attr);
 -      ssize_t ret = 0;
 -
 -      if (o2hb_region_attr->show)
 -              ret = o2hb_region_attr->show(reg, page);
 -      return ret;
 -}
 -
 -static ssize_t o2hb_region_store(struct config_item *item,
 -                               struct configfs_attribute *attr,
 -                               const char *page, size_t count)
 -{
 -      struct o2hb_region *reg = to_o2hb_region(item);
 -      struct o2hb_region_attribute *o2hb_region_attr =
 -              container_of(attr, struct o2hb_region_attribute, attr);
 -      ssize_t ret = -EINVAL;
 -
 -      if (o2hb_region_attr->store)
 -              ret = o2hb_region_attr->store(reg, page, count);
 -      return ret;
 -}
 -
  static struct configfs_item_operations o2hb_region_item_ops = {
        .release                = o2hb_region_release,
 -      .show_attribute         = o2hb_region_show,
 -      .store_attribute        = o2hb_region_store,
  };
  
  static struct config_item_type o2hb_region_type = {
@@@ -2057,14 -2137,49 +2070,14 @@@ unlock
        spin_unlock(&o2hb_live_lock);
  }
  
 -struct o2hb_heartbeat_group_attribute {
 -      struct configfs_attribute attr;
 -      ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
 -      ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
 -};
 -
 -static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
 -                                       struct configfs_attribute *attr,
 -                                       char *page)
 -{
 -      struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
 -      struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
 -              container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
 -      ssize_t ret = 0;
 -
 -      if (o2hb_heartbeat_group_attr->show)
 -              ret = o2hb_heartbeat_group_attr->show(reg, page);
 -      return ret;
 -}
 -
 -static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
 -                                        struct configfs_attribute *attr,
 -                                        const char *page, size_t count)
 -{
 -      struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
 -      struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
 -              container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
 -      ssize_t ret = -EINVAL;
 -
 -      if (o2hb_heartbeat_group_attr->store)
 -              ret = o2hb_heartbeat_group_attr->store(reg, page, count);
 -      return ret;
 -}
 -
 -static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
 -                                                   char *page)
 +static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
 +              char *page)
  {
        return sprintf(page, "%u\n", o2hb_dead_threshold);
  }
  
 -static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
 -                                                  const char *page,
 -                                                  size_t count)
 +static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
 +              const char *page, size_t count)
  {
        unsigned long tmp;
        char *p = (char *)page;
        return count;
  }
  
 -static
 -ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
 -                                     char *page)
 +static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item,
 +              char *page)
  {
        return sprintf(page, "%s\n",
                       o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
  }
  
 -static
 -ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
 -                                      const char *page, size_t count)
 +static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
 +              const char *page, size_t count)
  {
        unsigned int i;
        int ret;
  
  }
  
 -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -                  .ca_name = "dead_threshold",
 -                  .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_heartbeat_group_threshold_show,
 -      .store  = o2hb_heartbeat_group_threshold_store,
 -};
 -
 -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
 -      .attr   = { .ca_owner = THIS_MODULE,
 -              .ca_name = "mode",
 -              .ca_mode = S_IRUGO | S_IWUSR },
 -      .show   = o2hb_heartbeat_group_mode_show,
 -      .store  = o2hb_heartbeat_group_mode_store,
 -};
 +CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
 +CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
  
  static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
 -      &o2hb_heartbeat_group_attr_threshold.attr,
 -      &o2hb_heartbeat_group_attr_mode.attr,
 +      &o2hb_heartbeat_group_attr_threshold,
 +      &o2hb_heartbeat_group_attr_mode,
        NULL,
  };
  
 -static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
 -      .show_attribute         = o2hb_heartbeat_group_show,
 -      .store_attribute        = o2hb_heartbeat_group_store,
 -};
 -
  static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
        .make_item      = o2hb_heartbeat_group_make_item,
        .drop_item      = o2hb_heartbeat_group_drop_item,
  
  static struct config_item_type o2hb_heartbeat_group_type = {
        .ct_group_ops   = &o2hb_heartbeat_group_group_ops,
 -      .ct_item_ops    = &o2hb_heartbeat_group_item_ops,
        .ct_attrs       = o2hb_heartbeat_group_attrs,
        .ct_owner       = THIS_MODULE,
  };
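
Two separate changes land in the o2hb heartbeat code above. First, the heartbeat thread now checks the return value of o2nm_depend_this_node(); if the node has been deleted it sets hr_node_deleted, wakes o2hb_steady_queue, and the configfs store path fails region setup with -EINVAL instead of waiting forever. Second, the attribute boilerplate is converted to the generic CONFIGFS_ATTR()/CONFIGFS_ATTR_RO() helpers, which is why the o2hb-specific attribute wrapper structs and the show_attribute/store_attribute item ops disappear. The helpers expand roughly as follows (see include/linux/configfs.h for the authoritative macros), which also explains the generated o2hb_region_attr_* names and the _show/_store naming the functions were renamed to:

	/* Roughly how the configfs helpers expand; the real macros live in
	 * include/linux/configfs.h. */
	#define CONFIGFS_ATTR(_pfx, _name)				\
	static struct configfs_attribute _pfx##attr_##_name = {	\
		.ca_name	= __stringify(_name),			\
		.ca_mode	= S_IRUGO | S_IWUSR,			\
		.ca_owner	= THIS_MODULE,				\
		.show		= _pfx##_name##_show,			\
		.store		= _pfx##_name##_store,			\
	}

	#define CONFIGFS_ATTR_RO(_pfx, _name)				\
	static struct configfs_attribute _pfx##attr_##_name = {	\
		.ca_name	= __stringify(_name),			\
		.ca_mode	= S_IRUGO,				\
		.ca_owner	= THIS_MODULE,				\
		.show		= _pfx##_name##_show,			\
	}

So CONFIGFS_ATTR(o2hb_region_, dev) wires up o2hb_region_dev_show()/o2hb_region_dev_store() and defines o2hb_region_attr_dev, matching the entries in o2hb_region_attrs[].
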
diff --combined fs/proc/array.c
index eed2050db9be9c7795acd2153f976d4742e2fe82,ff2ce1ab064d7ab94fd5b15d7b02e091f58a8426..d73291f5f0fcbfb0cd2cff2bb1b628a72f754f6e
  static inline void task_name(struct seq_file *m, struct task_struct *p)
  {
        char *buf;
+       size_t size;
        char tcomm[sizeof(p->comm)];
+       int ret;
  
        get_task_comm(tcomm, p);
  
        seq_puts(m, "Name:\t");
-       buf = m->buf + m->count;
  
-       /* Ignore error for now */
-       buf += string_escape_str(tcomm, buf, m->size - m->count,
-                                ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+       size = seq_get_buf(m, &buf);
+       ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+       seq_commit(m, ret < size ? ret : -1);
  
-       m->count = buf - m->buf;
        seq_putc(m, '\n');
  }
  
@@@ -375,7 -375,7 +375,7 @@@ int proc_pid_status(struct seq_file *m
  static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task, int whole)
  {
 -      unsigned long vsize, eip, esp, wchan = ~0UL;
 +      unsigned long vsize, eip, esp, wchan = 0;
        int priority, nice;
        int tty_pgrp = -1, tty_nr = 0;
        sigset_t sigign, sigcatch;
        seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
        seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
        seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
 -      seq_put_decimal_ull(m, ' ', wchan);
 +
 +      /*
 +       * We used to output the absolute kernel address, but that's an
 +       * information leak - so instead we show a 0/1 flag here, to signal
 +       * to user-space whether there's a wchan field in /proc/PID/wchan.
 +       *
 +       * This works with older implementations of procps as well.
 +       */
 +      if (wchan)
 +              seq_puts(m, " 1");
 +      else
 +              seq_puts(m, " 0");
 +
        seq_put_decimal_ull(m, ' ', 0);
        seq_put_decimal_ull(m, ' ', 0);
        seq_put_decimal_ll(m, ' ', task->exit_signal);
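
fs/proc/array.c stops reaching into seq_file internals: task_name() now escapes the command name through the seq_get_buf()/seq_commit() pair, and do_task_stat() reports wchan as a bare 0/1 "blocked" flag rather than a raw kernel text address, closing an address leak while staying parseable by existing procps. The buffer pattern generalises to any seq_file producer; a minimal sketch (the wrapper function is hypothetical, the helpers are the ones used above):

	/* Hypothetical wrapper showing the seq_get_buf()/seq_commit()
	 * pattern: write directly into the unused tail of the seq_file
	 * buffer, then either commit the bytes written or flag an
	 * overflow with -1 so the core retries with a larger buffer. */
	static void seq_escape_comm(struct seq_file *m, const char *comm)
	{
		char *buf;
		size_t size = seq_get_buf(m, &buf);
		int ret = string_escape_str(comm, buf, size,
					    ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");

		seq_commit(m, ret < size ? ret : -1);
	}
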
diff --combined fs/proc/task_mmu.c
index b029d426c55892544afcd3bf2b8a5965f6e0e5ee,c00cb0ae24f7de274fc4bc4485156b93af395b0d..9ca699b05e78906167519fa17ccb3acdbde510ec
@@@ -70,6 -70,7 +70,7 @@@ void task_mem(struct seq_file *m, struc
                ptes >> 10,
                pmds >> 10,
                swap << (PAGE_SHIFT-10));
+       hugetlb_report_usage(m, mm);
  }
  
  unsigned long task_vsize(struct mm_struct *mm)
@@@ -446,14 -447,17 +447,17 @@@ struct mem_size_stats 
        unsigned long anonymous;
        unsigned long anonymous_thp;
        unsigned long swap;
+       unsigned long shared_hugetlb;
+       unsigned long private_hugetlb;
        u64 pss;
        u64 swap_pss;
  };
  
  static void smaps_account(struct mem_size_stats *mss, struct page *page,
-               unsigned long size, bool young, bool dirty)
+               bool compound, bool young, bool dirty)
  {
-       int mapcount;
+       int i, nr = compound ? HPAGE_PMD_NR : 1;
+       unsigned long size = nr * PAGE_SIZE;
  
        if (PageAnon(page))
                mss->anonymous += size;
        /* Accumulate the size in pages that have been accessed. */
        if (young || page_is_young(page) || PageReferenced(page))
                mss->referenced += size;
-       mapcount = page_mapcount(page);
-       if (mapcount >= 2) {
-               u64 pss_delta;
  
-               if (dirty || PageDirty(page))
-                       mss->shared_dirty += size;
-               else
-                       mss->shared_clean += size;
-               pss_delta = (u64)size << PSS_SHIFT;
-               do_div(pss_delta, mapcount);
-               mss->pss += pss_delta;
-       } else {
+       /*
+        * page_count(page) == 1 guarantees the page is mapped exactly once.
+        * If any subpage of the compound page mapped with PTE it would elevate
+        * page_count().
+        */
+       if (page_count(page) == 1) {
                if (dirty || PageDirty(page))
                        mss->private_dirty += size;
                else
                        mss->private_clean += size;
                mss->pss += (u64)size << PSS_SHIFT;
+               return;
+       }
+       for (i = 0; i < nr; i++, page++) {
+               int mapcount = page_mapcount(page);
+               if (mapcount >= 2) {
+                       if (dirty || PageDirty(page))
+                               mss->shared_dirty += PAGE_SIZE;
+                       else
+                               mss->shared_clean += PAGE_SIZE;
+                       mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+               } else {
+                       if (dirty || PageDirty(page))
+                               mss->private_dirty += PAGE_SIZE;
+                       else
+                               mss->private_clean += PAGE_SIZE;
+                       mss->pss += PAGE_SIZE << PSS_SHIFT;
+               }
        }
  }
  
@@@ -513,7 -531,8 +531,8 @@@ static void smaps_pte_entry(pte_t *pte
  
        if (!page)
                return;
-       smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+       smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
  }
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@@ -529,8 -548,7 +548,7 @@@ static void smaps_pmd_entry(pmd_t *pmd
        if (IS_ERR_OR_NULL(page))
                return;
        mss->anonymous_thp += HPAGE_PMD_SIZE;
-       smaps_account(mss, page, HPAGE_PMD_SIZE,
-                       pmd_young(*pmd), pmd_dirty(*pmd));
+       smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
  }
  #else
  static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@@ -546,7 -564,7 +564,7 @@@ static int smaps_pte_range(pmd_t *pmd, 
        pte_t *pte;
        spinlock_t *ptl;
  
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                smaps_pmd_entry(pmd, addr, walk);
                spin_unlock(ptl);
                return 0;
@@@ -625,12 -643,44 +643,44 @@@ static void show_smap_vma_flags(struct 
        seq_putc(m, '\n');
  }
  
+ #ifdef CONFIG_HUGETLB_PAGE
+ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+                                unsigned long addr, unsigned long end,
+                                struct mm_walk *walk)
+ {
+       struct mem_size_stats *mss = walk->private;
+       struct vm_area_struct *vma = walk->vma;
+       struct page *page = NULL;
+       if (pte_present(*pte)) {
+               page = vm_normal_page(vma, addr, *pte);
+       } else if (is_swap_pte(*pte)) {
+               swp_entry_t swpent = pte_to_swp_entry(*pte);
+               if (is_migration_entry(swpent))
+                       page = migration_entry_to_page(swpent);
+       }
+       if (page) {
+               int mapcount = page_mapcount(page);
+               if (mapcount >= 2)
+                       mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+               else
+                       mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+       }
+       return 0;
+ }
+ #endif /* HUGETLB_PAGE */
  static int show_smap(struct seq_file *m, void *v, int is_pid)
  {
        struct vm_area_struct *vma = v;
        struct mem_size_stats mss;
        struct mm_walk smaps_walk = {
                .pmd_entry = smaps_pte_range,
+ #ifdef CONFIG_HUGETLB_PAGE
+               .hugetlb_entry = smaps_hugetlb_range,
+ #endif
                .mm = vma->vm_mm,
                .private = &mss,
        };
                   "Referenced:     %8lu kB\n"
                   "Anonymous:      %8lu kB\n"
                   "AnonHugePages:  %8lu kB\n"
+                  "Shared_Hugetlb: %8lu kB\n"
+                  "Private_Hugetlb: %8lu kB\n"
                   "Swap:           %8lu kB\n"
                   "SwapPss:        %8lu kB\n"
                   "KernelPageSize: %8lu kB\n"
                   mss.referenced >> 10,
                   mss.anonymous >> 10,
                   mss.anonymous_thp >> 10,
+                  mss.shared_hugetlb >> 10,
+                  mss.private_hugetlb >> 10,
                   mss.swap >> 10,
                   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
                   vma_kernel_pagesize(vma) >> 10,
@@@ -753,36 -807,37 +807,37 @@@ static inline void clear_soft_dirty(str
        pte_t ptent = *pte;
  
        if (pte_present(ptent)) {
+               ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
                ptent = pte_wrprotect(ptent);
 -              ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
 +              ptent = pte_clear_soft_dirty(ptent);
+               ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
        } else if (is_swap_pte(ptent)) {
                ptent = pte_swp_clear_soft_dirty(ptent);
+               set_pte_at(vma->vm_mm, addr, pte, ptent);
        }
-       set_pte_at(vma->vm_mm, addr, pte, ptent);
  }
+ #else
+ static inline void clear_soft_dirty(struct vm_area_struct *vma,
+               unsigned long addr, pte_t *pte)
+ {
+ }
+ #endif
  
+ #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
  static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmdp)
  {
-       pmd_t pmd = *pmdp;
+       pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
  
        pmd = pmd_wrprotect(pmd);
 -      pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
 +      pmd = pmd_clear_soft_dirty(pmd);
  
        if (vma->vm_flags & VM_SOFTDIRTY)
                vma->vm_flags &= ~VM_SOFTDIRTY;
  
        set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
  }
  #else
- static inline void clear_soft_dirty(struct vm_area_struct *vma,
-               unsigned long addr, pte_t *pte)
- {
- }
  static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmdp)
  {
@@@ -798,7 -853,7 +853,7 @@@ static int clear_refs_pte_range(pmd_t *
        spinlock_t *ptl;
        struct page *page;
  
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
                        clear_soft_dirty_pmd(vma, addr, pmd);
                        goto out;
@@@ -1072,7 -1127,7 +1127,7 @@@ static int pagemap_pmd_range(pmd_t *pmd
        int err = 0;
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmdp, vma, &ptl)) {
                u64 flags = 0, frame = 0;
                pmd_t pmd = *pmdp;
  
@@@ -1404,7 -1459,7 +1459,7 @@@ static int gather_pte_stats(pmd_t *pmd
        pte_t *orig_pte;
        pte_t *pte;
  
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                pte_t huge_pte = *(pte_t *)pmd;
                struct page *page;
  
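
The task_mmu.c changes extend smaps in two directions: smaps_account() now takes a compound flag and walks every subpage of a THP so Pss reflects per-subpage map counts (with the page_count() == 1 shortcut for pages mapped exactly once), and hugetlb mappings get their own accounting via a .hugetlb_entry walker feeding the new Shared_Hugetlb/Private_Hugetlb lines, plus hugetlb_report_usage() in task_mem(). The proportional weighting itself is unchanged; isolated, it is just this (PSS_SHIFT is the fixed-point shift defined in task_mmu.c, and the helper below is illustrative, not part of the patch):

	/* Illustrative only: per-subpage proportional-set-size contribution
	 * of a compound page, mirroring the loop in smaps_account() above. */
	static u64 pss_of_compound(struct page *head, int nr_subpages)
	{
		u64 pss = 0;
		int i;

		for (i = 0; i < nr_subpages; i++) {
			int mapcount = page_mapcount(head + i);

			/* each mapper is charged PAGE_SIZE / mapcount, kept
			 * in fixed point until the final report in kB */
			pss += (u64)(PAGE_SIZE << PSS_SHIFT) / max(mapcount, 1);
		}
		return pss;	/* reported as pss >> (10 + PSS_SHIFT) kB */
	}
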
diff --combined fs/xfs/xfs_qm.c
index 7af7648c06c63bd63ec21b150cc2713914f28c08,587174fd4f2c216637eda70b9986848a0980fd09..532ab79d38fe376c14a5463a97195b59a61d8f84
@@@ -184,7 -184,7 +184,7 @@@ xfs_qm_dqpurge
         */
        ASSERT(!list_empty(&dqp->q_lru));
        list_lru_del(&qi->qi_lru, &dqp->q_lru);
 -      XFS_STATS_DEC(xs_qm_dquot_unused);
 +      XFS_STATS_DEC(mp, xs_qm_dquot_unused);
  
        xfs_qm_dqdestroy(dqp);
        return 0;
@@@ -448,11 -448,11 +448,11 @@@ xfs_qm_dquot_isolate
         */
        if (dqp->q_nrefs) {
                xfs_dqunlock(dqp);
 -              XFS_STATS_INC(xs_qm_dqwants);
 +              XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
  
                trace_xfs_dqreclaim_want(dqp);
                list_lru_isolate(lru, &dqp->q_lru);
 -              XFS_STATS_DEC(xs_qm_dquot_unused);
 +              XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
                return LRU_REMOVED;
        }
  
  
        ASSERT(dqp->q_nrefs == 0);
        list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
 -      XFS_STATS_DEC(xs_qm_dquot_unused);
 +      XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
        trace_xfs_dqreclaim_done(dqp);
 -      XFS_STATS_INC(xs_qm_dqreclaims);
 +      XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
        return LRU_REMOVED;
  
  out_miss_busy:
        trace_xfs_dqreclaim_busy(dqp);
 -      XFS_STATS_INC(xs_qm_dqreclaim_misses);
 +      XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
        return LRU_SKIP;
  
  out_unlock_dirty:
        trace_xfs_dqreclaim_busy(dqp);
 -      XFS_STATS_INC(xs_qm_dqreclaim_misses);
 +      XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
        xfs_dqunlock(dqp);
        spin_lock(lru_lock);
        return LRU_RETRY;
@@@ -525,7 -525,7 +525,7 @@@ xfs_qm_shrink_scan
        unsigned long           freed;
        int                     error;
  
-       if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+       if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
                return 0;
  
        INIT_LIST_HEAD(&isol.buffers);
diff --combined include/asm-generic/pgtable.h
index 14b0ff32fb9f16c6ce30e0e54c3f3b4885216699,010a7e3f6ad142dee0b702ddf763f8a181a9f16a..63abda1ac06dbf74793130ca2859b6cc553e6cfc
@@@ -30,19 -30,9 +30,19 @@@ extern int ptep_set_access_flags(struc
  #endif
  
  #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
 +#else
 +static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 +                                      unsigned long address, pmd_t *pmdp,
 +                                      pmd_t entry, int dirty)
 +{
 +      BUILD_BUG();
 +      return 0;
 +}
 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@@ -74,12 -64,12 +74,12 @@@ static inline int pmdp_test_and_clear_y
                set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
        return r;
  }
 -#else /* CONFIG_TRANSPARENT_HUGEPAGE */
 +#else
  static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
  {
 -      BUG();
 +      BUILD_BUG();
        return 0;
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@@ -91,21 -81,8 +91,21 @@@ int ptep_clear_flush_young(struct vm_ar
  #endif
  
  #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
 -int pmdp_clear_flush_young(struct vm_area_struct *vma,
 -                         unsigned long address, pmd_t *pmdp);
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 +                                unsigned long address, pmd_t *pmdp);
 +#else
 +/*
 + * Despite relevant to THP only, this API is called from generic rmap code
 + * under PageTransHuge(), hence needs a dummy implementation for !THP
 + */
 +static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 +                                       unsigned long address, pmd_t *pmdp)
 +{
 +      BUILD_BUG();
 +      return 0;
 +}
 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
@@@ -198,20 -175,15 +198,15 @@@ static inline void pmdp_set_wrprotect(s
        pmd_t old_pmd = *pmdp;
        set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
  }
 -#else /* CONFIG_TRANSPARENT_HUGEPAGE */
 +#else
  static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
  {
 -      BUG();
 +      BUILD_BUG();
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
- #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-                                unsigned long address, pmd_t *pmdp);
- #endif
  #ifndef pmdp_collapse_flush
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
@@@ -271,7 -243,7 +266,7 @@@ static inline int pmd_same(pmd_t pmd_a
  #else /* CONFIG_TRANSPARENT_HUGEPAGE */
  static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
  {
 -      BUG();
 +      BUILD_BUG();
        return 0;
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@@ -505,16 -477,6 +500,16 @@@ static inline pmd_t pmd_mksoft_dirty(pm
        return pmd;
  }
  
 +static inline pte_t pte_clear_soft_dirty(pte_t pte)
 +{
 +      return pte;
 +}
 +
 +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
 +{
 +      return pmd;
 +}
 +
  static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
  {
        return pte;
@@@ -619,10 -581,6 +614,6 @@@ static inline int pmd_trans_huge(pmd_t 
  {
        return 0;
  }
- static inline int pmd_trans_splitting(pmd_t pmd)
- {
-       return 0;
- }
  #ifndef __HAVE_ARCH_PMD_WRITE
  static inline int pmd_write(pmd_t pmd)
  {
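
As background on the BUG() -> BUILD_BUG() conversions in this file, a sketch of the stub pattern with a hypothetical helper: when CONFIG_TRANSPARENT_HUGEPAGE is off, every call site of the inline stub should be eliminated by the compiler, so BUILD_BUG() turns any surviving reference into a build failure instead of a runtime crash.

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int example_thp_op(struct vm_area_struct *vma, pmd_t *pmdp);
#else
static inline int example_thp_op(struct vm_area_struct *vma, pmd_t *pmdp)
{
	BUILD_BUG();	/* any reachable call becomes a compile-time error */
	return 0;
}
#endif
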
diff --combined include/drm/drmP.h
index 4d3b842f4319586fef51493cfebae68397d2b0b4,f56cdcecc1c97bd4e6e7d884845ed4551a41a673..0b921ae06cd83585e1d1cf2adb6baf665f203013
@@@ -107,9 -107,6 +107,9 @@@ struct dma_buf_attachment
   * ATOMIC: used in the atomic code.
   *      This is the category used by the DRM_DEBUG_ATOMIC() macro.
   *
 + * VBL: used for verbose debug messages in the vblank code
 + *      This is the category used by the DRM_DEBUG_VBL() macro.
 + *
   * Enabling verbose debug messages is done through the drm.debug parameter,
   * each category being enabled by a bit.
   *
   * drm.debug=0x2 will enable DRIVER messages
   * drm.debug=0x3 will enable CORE and DRIVER messages
   * ...
 - * drm.debug=0xf will enable all messages
 + * drm.debug=0x3f will enable all messages
   *
   * An interesting feature is that it's possible to enable verbose logging at
   * run-time by echoing the debug value in its sysfs node:
  #define DRM_UT_KMS            0x04
  #define DRM_UT_PRIME          0x08
  #define DRM_UT_ATOMIC         0x10
 +#define DRM_UT_VBL            0x20
  
  extern __printf(2, 3)
  void drm_ut_debug_printk(const char *function_name,
@@@ -221,11 -217,6 +221,11 @@@ void drm_err(const char *format, ...)
                if (unlikely(drm_debug & DRM_UT_ATOMIC))                \
                        drm_ut_debug_printk(__func__, fmt, ##args);     \
        } while (0)
 +#define DRM_DEBUG_VBL(fmt, args...)                                   \
 +      do {                                                            \
 +              if (unlikely(drm_debug & DRM_UT_VBL))                   \
 +                      drm_ut_debug_printk(__func__, fmt, ##args);     \
 +      } while (0)
  
  /*@}*/
  
@@@ -421,7 -412,7 +421,7 @@@ struct drm_driver 
        /**
         * get_vblank_counter - get raw hardware vblank counter
         * @dev: DRM device
 -       * @crtc: counter to fetch
 +       * @pipe: counter to fetch
         *
         * Driver callback for fetching a raw hardware vblank counter for @crtc.
         * If a device doesn't have a hardware counter, the driver can simply
         * RETURNS
         * Raw vblank counter value.
         */
 -      u32 (*get_vblank_counter) (struct drm_device *dev, int crtc);
 +      u32 (*get_vblank_counter) (struct drm_device *dev, unsigned int pipe);
  
        /**
         * enable_vblank - enable vblank interrupt events
         * @dev: DRM device
 -       * @crtc: which irq to enable
 +       * @pipe: which irq to enable
         *
         * Enable vblank interrupts for @crtc.  If the device doesn't have
         * a hardware vblank counter, this routine should be a no-op, since
         * Zero on success, appropriate errno if the given @crtc's vblank
         * interrupt cannot be enabled.
         */
 -      int (*enable_vblank) (struct drm_device *dev, int crtc);
 +      int (*enable_vblank) (struct drm_device *dev, unsigned int pipe);
  
        /**
         * disable_vblank - disable vblank interrupt events
         * @dev: DRM device
 -       * @crtc: which irq to enable
 +       * @pipe: which irq to enable
         *
         * Disable vblank interrupts for @crtc.  If the device doesn't have
         * a hardware vblank counter, this routine should be a no-op, since
         * interrupts will have to stay on to keep the count accurate.
         */
 -      void (*disable_vblank) (struct drm_device *dev, int crtc);
 +      void (*disable_vblank) (struct drm_device *dev, unsigned int pipe);
  
        /**
         * Called by \c drm_device_is_agp.  Typically used to determine if a
         * optional accurate ktime_get timestamp of when position was measured.
         *
         * \param dev  DRM device.
 -       * \param crtc Id of the crtc to query.
 +       * \param pipe Id of the crtc to query.
         * \param flags Flags from the caller (DRM_CALLED_FROM_VBLIRQ or 0).
         * \param *vpos Target location for current vertical scanout position.
         * \param *hpos Target location for current horizontal scanout position.
         *               scanout position query. Can be NULL to skip timestamp.
         * \param *etime Target location for timestamp taken immediately after
         *               scanout position query. Can be NULL to skip timestamp.
 +       * \param mode Current display timings.
         *
         * Returns vpos as a positive number while in active scanout area.
         * Returns vpos as a negative number inside vblank, counting the number
         * but unknown small number of scanlines wrt. real scanout position.
         *
         */
 -      int (*get_scanout_position) (struct drm_device *dev, int crtc,
 -                                   unsigned int flags,
 -                                   int *vpos, int *hpos, ktime_t *stime,
 -                                   ktime_t *etime);
 +      int (*get_scanout_position) (struct drm_device *dev, unsigned int pipe,
 +                                   unsigned int flags, int *vpos, int *hpos,
 +                                   ktime_t *stime, ktime_t *etime,
 +                                   const struct drm_display_mode *mode);
  
        /**
         * Called by \c drm_get_last_vbltimestamp. Should return a precise
         * to the OpenML OML_sync_control extension specification.
         *
         * \param dev dev DRM device handle.
 -       * \param crtc crtc for which timestamp should be returned.
 +       * \param pipe crtc for which timestamp should be returned.
         * \param *max_error Maximum allowable timestamp error in nanoseconds.
         *                   Implementation should strive to provide timestamp
         *                   with an error of at most *max_error nanoseconds.
         * negative number on failure. A positive status code on success,
         * which describes how the vblank_time timestamp was computed.
         */
 -      int (*get_vblank_timestamp) (struct drm_device *dev, int crtc,
 +      int (*get_vblank_timestamp) (struct drm_device *dev, unsigned int pipe,
                                     int *max_error,
                                     struct timeval *vblank_time,
                                     unsigned flags);
@@@ -711,8 -701,6 +711,8 @@@ struct drm_vblank_crtc 
        u32 last_wait;                  /* Last vblank seqno waited per CRTC */
        unsigned int inmodeset;         /* Display driver is setting mode */
        unsigned int pipe;              /* crtc index */
 +      int framedur_ns;                /* frame/field duration in ns */
 +      int linedur_ns;                 /* line duration in ns */
        bool enabled;                   /* so we don't call enable more than
                                           once per disable */
  };
@@@ -834,7 -822,6 +834,6 @@@ struct drm_device 
  
        struct drm_sg_mem *sg;  /**< Scatter gather memory */
        unsigned int num_crtcs;                  /**< Number of CRTCs on this device */
-       sigset_t sigmask;
  
        struct {
                int context;
@@@ -918,8 -905,6 +917,8 @@@ extern unsigned int drm_poll(struct fil
  /* Misc. IOCTL support (drm_ioctl.c) */
  int drm_noop(struct drm_device *dev, void *data,
             struct drm_file *file_priv);
 +int drm_invalid_op(struct drm_device *dev, void *data,
 +                 struct drm_file *file_priv);
  
  /* Cache management (drm_cache.c) */
  void drm_clflush_pages(struct page *pages[], unsigned long num_pages);
@@@ -938,12 -923,10 +937,12 @@@ extern int drm_irq_uninstall(struct drm
  extern int drm_vblank_init(struct drm_device *dev, unsigned int num_crtcs);
  extern int drm_wait_vblank(struct drm_device *dev, void *data,
                           struct drm_file *filp);
 -extern u32 drm_vblank_count(struct drm_device *dev, int pipe);
 +extern u32 drm_vblank_count(struct drm_device *dev, unsigned int pipe);
  extern u32 drm_crtc_vblank_count(struct drm_crtc *crtc);
  extern u32 drm_vblank_count_and_time(struct drm_device *dev, unsigned int pipe,
                                     struct timeval *vblanktime);
 +extern u32 drm_crtc_vblank_count_and_time(struct drm_crtc *crtc,
 +                                        struct timeval *vblanktime);
  extern void drm_send_vblank_event(struct drm_device *dev, unsigned int pipe,
                                  struct drm_pending_vblank_event *e);
  extern void drm_crtc_send_vblank_event(struct drm_crtc *crtc,
@@@ -962,12 -945,12 +961,12 @@@ extern void drm_crtc_vblank_off(struct 
  extern void drm_crtc_vblank_reset(struct drm_crtc *crtc);
  extern void drm_crtc_vblank_on(struct drm_crtc *crtc);
  extern void drm_vblank_cleanup(struct drm_device *dev);
 +extern u32 drm_vblank_no_hw_counter(struct drm_device *dev, unsigned int pipe);
  
  extern int drm_calc_vbltimestamp_from_scanoutpos(struct drm_device *dev,
                                                 unsigned int pipe, int *max_error,
                                                 struct timeval *vblank_time,
                                                 unsigned flags,
 -                                               const struct drm_crtc *refcrtc,
                                                 const struct drm_display_mode *mode);
  extern void drm_calc_timestamping_constants(struct drm_crtc *crtc,
                                            const struct drm_display_mode *mode);
index 8efb40e61d6e48021d68f93635eea8d3ab3e8c0b,dc3d6b7ce1ebafe159f4b65b7d1dc732caae4af9..22ab246feed34c104038d3f94e1401ea9a587f8f
  
  #if GCC_VERSION >= 40600
  /*
-  * Tell the optimizer that something else uses this function or variable.
+  * When used with Link Time Optimization, gcc can optimize away C functions or
+  * variables which are referenced only from assembly code.  __visible tells the
+  * optimizer that something else uses this function or variable, thus preventing
+  * this.
   */
  #define __visible     __attribute__((externally_visible))
  #endif
  
+ #if GCC_VERSION >= 40900 && !defined(__CHECKER__)
+ /*
+  * __assume_aligned(n, k): Tell the optimizer that the returned
+  * pointer can be assumed to be k modulo n. The second argument is
+  * optional (default 0), so we use a variadic macro to make the
+  * shorthand.
+  *
+  * Beware: Do not apply this to functions which may return
+  * ERR_PTRs. Also, it is probably unwise to apply it to functions
+  * returning extra information in the low bits (but in that case the
+  * compiler should see some alignment anyway, when the return value is
+  * massaged by 'flags = ptr & 3; ptr &= ~3;').
+  */
+ #define __assume_aligned(a, ...) __attribute__((__assume_aligned__(a, ## __VA_ARGS__)))
+ #endif
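
Two hypothetical declarations showing the shorthand (neither function exists in the tree); the one- and two-argument forms map directly onto gcc's assume_aligned attribute.

/* Returned pointer is promised to be 64-byte aligned. */
void *example_alloc_cacheline(size_t size) __assume_aligned(64);

/* Returned pointer is 8 bytes past a 16-byte boundary (k = 8 modulo n = 16). */
void *example_alloc_off8(size_t size) __assume_aligned(16, 8);
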
  /*
   * GCC 'asm goto' miscompiles certain code sequences:
   *
  #define KASAN_ABI_VERSION 3
  #endif
  
 +#if GCC_VERSION >= 40902
 +/*
 + * Tell the compiler that address safety instrumentation (KASAN)
 + * should not be applied to that function.
 + * Conflicts with inlining: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
 + */
 +#define __no_sanitize_address __attribute__((no_sanitize_address))
 +#endif
 +
  #endif        /* gcc version >= 40000 specific checks */
  
  #if !defined(__noclone)
  #define __noclone     /* not needed */
  #endif
  
 +#if !defined(__no_sanitize_address)
 +#define __no_sanitize_address
 +#endif
 +
  /*
   * A trick to suppress uninitialized variable warning without generating any
   * code
diff --combined include/linux/compiler.h
index 8807e4f1b0e6b1878c845a7301f7aded28b4707b,6167ca663ad954726540431b72a6aca5a8e98de6..f108e5222dad0ea50ae685173eaceb486f6eab94
@@@ -56,7 -56,7 +56,7 @@@ extern void __chk_io_ptr(const volatil
  #include <linux/compiler-gcc.h>
  #endif
  
 -#ifdef CC_USING_HOTPATCH
 +#if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__)
  #define notrace __attribute__((hotpatch(0,0)))
  #else
  #define notrace __attribute__((no_instrument_function))
@@@ -198,45 -198,19 +198,45 @@@ void ftrace_likely_update(struct ftrace
  
  #include <uapi/linux/types.h>
  
 -static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
 +#define __READ_ONCE_SIZE                                              \
 +({                                                                    \
 +      switch (size) {                                                 \
 +      case 1: *(__u8 *)res = *(volatile __u8 *)p; break;              \
 +      case 2: *(__u16 *)res = *(volatile __u16 *)p; break;            \
 +      case 4: *(__u32 *)res = *(volatile __u32 *)p; break;            \
 +      case 8: *(__u64 *)res = *(volatile __u64 *)p; break;            \
 +      default:                                                        \
 +              barrier();                                              \
 +              __builtin_memcpy((void *)res, (const void *)p, size);   \
 +              barrier();                                              \
 +      }                                                               \
 +})
 +
 +static __always_inline
 +void __read_once_size(const volatile void *p, void *res, int size)
  {
 -      switch (size) {
 -      case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
 -      case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
 -      case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
 -      case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
 -      default:
 -              barrier();
 -              __builtin_memcpy((void *)res, (const void *)p, size);
 -              barrier();
 -      }
 +      __READ_ONCE_SIZE;
 +}
 +
 +#ifdef CONFIG_KASAN
 +/*
 + * This function is not 'inline' because __no_sanitize_address conflicts
 + * with inlining. Attempting to inline it may cause a build failure.
 + *    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
 + * '__maybe_unused' allows us to avoid defined-but-not-used warnings.
 + */
 +static __no_sanitize_address __maybe_unused
 +void __read_once_size_nocheck(const volatile void *p, void *res, int size)
 +{
 +      __READ_ONCE_SIZE;
 +}
 +#else
 +static __always_inline
 +void __read_once_size_nocheck(const volatile void *p, void *res, int size)
 +{
 +      __READ_ONCE_SIZE;
  }
 +#endif
  
  static __always_inline void __write_once_size(volatile void *p, void *res, int size)
  {
   * required ordering.
   */
  
 -#define READ_ONCE(x) \
 -      ({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 +#define __READ_ONCE(x, check)                                         \
 +({                                                                    \
 +      union { typeof(x) __val; char __c[1]; } __u;                    \
 +      if (check)                                                      \
 +              __read_once_size(&(x), __u.__c, sizeof(x));             \
 +      else                                                            \
 +              __read_once_size_nocheck(&(x), __u.__c, sizeof(x));     \
 +      __u.__val;                                                      \
 +})
 +#define READ_ONCE(x) __READ_ONCE(x, 1)
 +
 +/*
 + * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need
 + * to hide memory access from KASAN.
 + */
 +#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0)
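
A minimal, hypothetical example of the intended use (the helper name is made up): code that deliberately performs racy or out-of-object reads, such as a stack unwinder peeking at another task's stack, goes through READ_ONCE_NOCHECK() so KASAN does not report it.

static unsigned long example_peek_stack_word(const unsigned long *sp)
{
	/* May race with the owning task; intentionally hidden from KASAN. */
	return READ_ONCE_NOCHECK(*sp);
}
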
  
  #define WRITE_ONCE(x, val) \
  ({                                                    \
  #define __visible
  #endif
  
+ /*
+  * Assume alignment of return value.
+  */
+ #ifndef __assume_aligned
+ #define __assume_aligned(a, ...)
+ #endif
  /* Are two types/vars the same type (ignoring qualifiers)? */
  #ifndef __same_type
  # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
diff --combined include/linux/fs.h
index f2325998cd20cd551d445072bbcd0fb15c17732c,9355f377fd468914e891214874f62ca646876b80..f78dd76f682817dbdbc1d357f8ffd8126857d3c2
@@@ -1042,7 -1042,6 +1042,7 @@@ extern int fcntl_setlease(unsigned int 
  extern int fcntl_getlease(struct file *filp);
  
  /* fs/locks.c */
 +extern struct srcu_notifier_head      lease_notifier_chain;
  void locks_free_lock_context(struct file_lock_context *ctx);
  void locks_free_lock(struct file_lock *fl);
  extern void locks_init_lock(struct file_lock *);
@@@ -1054,11 -1053,12 +1054,11 @@@ extern void locks_remove_file(struct fi
  extern void locks_release_private(struct file_lock *);
  extern void posix_test_lock(struct file *, struct file_lock *);
  extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
 -extern int posix_lock_inode_wait(struct inode *, struct file_lock *);
  extern int posix_unblock_lock(struct file_lock *);
  extern int vfs_test_lock(struct file *, struct file_lock *);
  extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
  extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
 -extern int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl);
 +extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
  extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
  extern void lease_get_mtime(struct inode *, struct timespec *time);
  extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
@@@ -1144,6 -1144,12 +1144,6 @@@ static inline int posix_lock_file(struc
        return -ENOLCK;
  }
  
 -static inline int posix_lock_inode_wait(struct inode *inode,
 -                                      struct file_lock *fl)
 -{
 -      return -ENOLCK;
 -}
 -
  static inline int posix_unblock_lock(struct file_lock *waiter)
  {
        return -ENOENT;
@@@ -1165,7 -1171,8 +1165,7 @@@ static inline int vfs_cancel_lock(struc
        return 0;
  }
  
 -static inline int flock_lock_inode_wait(struct inode *inode,
 -                                      struct file_lock *request)
 +static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
  {
        return -ENOLCK;
  }
@@@ -1208,9 -1215,14 +1208,9 @@@ static inline struct inode *file_inode(
        return f->f_inode;
  }
  
 -static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 -{
 -      return posix_lock_inode_wait(file_inode(filp), fl);
 -}
 -
 -static inline int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
 +static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
  {
 -      return flock_lock_inode_wait(file_inode(filp), fl);
 +      return locks_lock_inode_wait(file_inode(filp), fl);
  }
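
A hypothetical caller, sketched to show the consolidation: where a filesystem ->flock() implementation previously picked between posix_lock_file_wait() and flock_lock_file_wait(), both lock types now go through locks_lock_file_wait(), which dispatches on fl->fl_flags.

static int example_fs_flock(struct file *filp, int cmd, struct file_lock *fl)
{
	/* Handles both FL_POSIX and FL_FLOCK requests after the rename. */
	return locks_lock_file_wait(filp, fl);
}
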
  
  struct fasync_struct {
@@@ -2410,6 -2422,7 +2410,7 @@@ extern int write_inode_now(struct inod
  extern int filemap_fdatawrite(struct address_space *);
  extern int filemap_flush(struct address_space *);
  extern int filemap_fdatawait(struct address_space *);
+ extern void filemap_fdatawait_keep_errors(struct address_space *);
  extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
                                   loff_t lend);
  extern int filemap_write_and_wait(struct address_space *mapping);
index 7edd305152983af1ab6aee93f470dd99289046e8,75e34b90074894847eb63aff72d33d2639cf7337..24154c26d469c60984020b5a0441fcb6dde3fcb0
@@@ -32,7 -32,7 +32,7 @@@ static inline struct hugetlb_cgroup *hu
  
        if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
                return NULL;
-       return (struct hugetlb_cgroup *)page[2].lru.next;
+       return (struct hugetlb_cgroup *)page[2].private;
  }
  
  static inline
@@@ -42,13 -42,15 +42,13 @@@ int set_hugetlb_cgroup(struct page *pag
  
        if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
                return -1;
-       page[2].lru.next = (void *)h_cg;
+       page[2].private = (unsigned long)h_cg;
        return 0;
  }
  
  static inline bool hugetlb_cgroup_disabled(void)
  {
 -      if (hugetlb_cgrp_subsys.disabled)
 -              return true;
 -      return false;
 +      return !cgroup_subsys_enabled(hugetlb_cgrp_subsys);
  }
  
  extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
index 27251ed428f7db8adaf54c58b7f9e41deda9048d,a11d9f5d559531c227aa58bd862ae35787c9eb2e..ffc5460ed9e55b54f473bbacd90b62f30f93e978
@@@ -213,9 -213,6 +213,9 @@@ struct mem_cgroup 
        /* OOM-Killer disable */
        int             oom_kill_disable;
  
 +      /* handle for "memory.events" */
 +      struct cgroup_file events_file;
 +
        /* protect arrays of thresholds */
        struct mutex thresholds_lock;
  
@@@ -288,21 -285,21 +288,22 @@@ static inline void mem_cgroup_events(st
                       unsigned int nr)
  {
        this_cpu_add(memcg->stat->events[idx], nr);
 +      cgroup_file_notify(&memcg->events_file);
  }
  
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
  
  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
-                         gfp_t gfp_mask, struct mem_cgroup **memcgp);
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound);
  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             bool lrucare);
- void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+                             bool lrucare, bool compound);
+ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+               bool compound);
  void mem_cgroup_uncharge(struct page *page);
  void mem_cgroup_uncharge_list(struct list_head *page_list);
  
- void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-                       bool lrucare);
+ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage);
  
  struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
  struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
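
An illustrative charge cycle with the new 'compound' argument (example_charge_new_page() and example_map_page() are hypothetical): a THP is charged as one unit, committed on success and cancelled on failure.

static int example_charge_new_page(struct page *page, struct mm_struct *mm,
				   gfp_t gfp, bool is_thp)
{
	struct mem_cgroup *memcg;
	int err;

	err = mem_cgroup_try_charge(page, mm, gfp, &memcg, is_thp);
	if (err)
		return err;

	if (example_map_page(page)) {		/* hypothetical failure path */
		mem_cgroup_cancel_charge(page, memcg, is_thp);
		return -ENOMEM;
	}

	mem_cgroup_commit_charge(page, memcg, false, is_thp);
	return 0;
}
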
@@@ -350,7 -347,9 +351,7 @@@ ino_t page_cgroup_ino(struct page *page
  
  static inline bool mem_cgroup_disabled(void)
  {
 -      if (memory_cgrp_subsys.disabled)
 -              return true;
 -      return false;
 +      return !cgroup_subsys_enabled(memory_cgrp_subsys);
  }
  
  /*
@@@ -384,7 -383,7 +385,7 @@@ unsigned long mem_cgroup_get_lru_size(s
        return mz->lru_size[lru];
  }
  
- static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+ static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
  {
        unsigned long inactive_ratio;
        unsigned long inactive;
        return inactive * inactive_ratio < active;
  }
  
+ void mem_cgroup_handle_over_high(void);
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
                                struct task_struct *p);
  
  static inline void mem_cgroup_oom_enable(void)
  {
-       WARN_ON(current->memcg_oom.may_oom);
-       current->memcg_oom.may_oom = 1;
+       WARN_ON(current->memcg_may_oom);
+       current->memcg_may_oom = 1;
  }
  
  static inline void mem_cgroup_oom_disable(void)
  {
-       WARN_ON(!current->memcg_oom.may_oom);
-       current->memcg_oom.may_oom = 0;
+       WARN_ON(!current->memcg_may_oom);
+       current->memcg_may_oom = 0;
  }
  
  static inline bool task_in_memcg_oom(struct task_struct *p)
  {
-       return p->memcg_oom.memcg;
+       return p->memcg_in_oom;
  }
  
  bool mem_cgroup_oom_synchronize(bool wait);
@@@ -512,7 -513,8 +515,8 @@@ static inline bool mem_cgroup_low(struc
  
  static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask,
-                                       struct mem_cgroup **memcgp)
+                                       struct mem_cgroup **memcgp,
+                                       bool compound)
  {
        *memcgp = NULL;
        return 0;
  
  static inline void mem_cgroup_commit_charge(struct page *page,
                                            struct mem_cgroup *memcg,
-                                           bool lrucare)
+                                           bool lrucare, bool compound)
  {
  }
  
  static inline void mem_cgroup_cancel_charge(struct page *page,
-                                           struct mem_cgroup *memcg)
+                                           struct mem_cgroup *memcg,
+                                           bool compound)
  {
  }
  
@@@ -537,9 -540,7 +542,7 @@@ static inline void mem_cgroup_uncharge_
  {
  }
  
- static inline void mem_cgroup_migrate(struct page *oldpage,
-                                     struct page *newpage,
-                                     bool lrucare)
+ static inline void mem_cgroup_replace_page(struct page *old, struct page *new)
  {
  }
  
@@@ -585,10 -586,10 +588,10 @@@ static inline bool mem_cgroup_disabled(
        return true;
  }
  
- static inline int
+ static inline bool
  mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
  {
-       return 1;
+       return true;
  }
  
  static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
@@@ -622,6 -623,10 +625,10 @@@ static inline void mem_cgroup_end_page_
  {
  }
  
+ static inline void mem_cgroup_handle_over_high(void)
+ {
+ }
  static inline void mem_cgroup_oom_enable(void)
  {
  }
@@@ -678,9 -683,8 +685,9 @@@ enum 
  
  struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback);
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback);
  
  #else /* CONFIG_CGROUP_WRITEBACK */
  
@@@ -690,8 -694,7 +697,8 @@@ static inline struct wb_domain *mem_cgr
  }
  
  static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
 -                                     unsigned long *pavail,
 +                                     unsigned long *pfilepages,
 +                                     unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
  {
@@@ -748,11 -751,10 +755,10 @@@ static inline bool memcg_kmem_is_active
   * conditions, but because they are pretty simple, they are expected to be
   * fast.
   */
- bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
-                                       int order);
- void __memcg_kmem_commit_charge(struct page *page,
-                                      struct mem_cgroup *memcg, int order);
- void __memcg_kmem_uncharge_pages(struct page *page, int order);
+ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+                             struct mem_cgroup *memcg);
+ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+ void __memcg_kmem_uncharge(struct page *page, int order);
  
  /*
   * helper for accessing a memcg's index. It will be used as an index in the
@@@ -767,77 -769,42 +773,42 @@@ static inline int memcg_cache_id(struc
  struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
  void __memcg_kmem_put_cache(struct kmem_cache *cachep);
  
- struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
- int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                     unsigned long nr_pages);
- void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
- /**
-  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
-  * @gfp: the gfp allocation flags.
-  * @memcg: a pointer to the memcg this was charged against.
-  * @order: allocation order.
-  *
-  * returns true if the memcg where the current task belongs can hold this
-  * allocation.
-  *
-  * We return true automatically if this allocation is not to be accounted to
-  * any memcg.
-  */
- static inline bool
- memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+ static inline bool __memcg_kmem_bypass(gfp_t gfp)
  {
        if (!memcg_kmem_enabled())
                return true;
        if (gfp & __GFP_NOACCOUNT)
                return true;
-       /*
-        * __GFP_NOFAIL allocations will move on even if charging is not
-        * possible. Therefore we don't even try, and have this allocation
-        * unaccounted. We could in theory charge it forcibly, but we hope
-        * those allocations are rare, and won't be worth the trouble.
-        */
-       if (gfp & __GFP_NOFAIL)
-               return true;
        if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
                return true;
-       /* If the test is dying, just let it go. */
-       if (unlikely(fatal_signal_pending(current)))
-               return true;
-       return __memcg_kmem_newpage_charge(gfp, memcg, order);
+       return false;
  }
  
  /**
-  * memcg_kmem_uncharge_pages: uncharge pages from memcg
-  * @page: pointer to struct page being freed
-  * @order: allocation order.
+  * memcg_kmem_charge: charge a kmem page
+  * @page: page to charge
+  * @gfp: reclaim mode
+  * @order: allocation order
+  *
+  * Returns 0 on success, an error code on failure.
   */
- static inline void
- memcg_kmem_uncharge_pages(struct page *page, int order)
+ static __always_inline int memcg_kmem_charge(struct page *page,
                                           gfp_t gfp, int order)
  {
-       if (memcg_kmem_enabled())
-               __memcg_kmem_uncharge_pages(page, order);
+       if (__memcg_kmem_bypass(gfp))
+               return 0;
+       return __memcg_kmem_charge(page, gfp, order);
  }
  
  /**
-  * memcg_kmem_commit_charge: embeds correct memcg in a page
-  * @page: pointer to struct page recently allocated
-  * @memcg: the memcg structure we charged against
-  * @order: allocation order.
-  *
-  * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
-  * failure of the allocation. if @page is NULL, this function will revert the
-  * charges. Otherwise, it will commit @page to @memcg.
+  * memcg_kmem_uncharge: uncharge a kmem page
+  * @page: page to uncharge
+  * @order: allocation order
   */
- static inline void
- memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+ static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
  {
-       if (memcg_kmem_enabled() && memcg)
-               __memcg_kmem_commit_charge(page, memcg, order);
+       if (memcg_kmem_enabled())
+               __memcg_kmem_uncharge(page, order);
  }
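
A sketch of the simplified kmem accounting interface (the two wrappers below are hypothetical): charge once after allocation, uncharge once before freeing, instead of the old newpage_charge/commit_charge/uncharge_pages triple.

static struct page *example_alloc_accounted(gfp_t gfp, int order)
{
	struct page *page = alloc_pages(gfp, order);

	if (page && memcg_kmem_charge(page, gfp, order)) {
		__free_pages(page, order);
		return NULL;
	}
	return page;
}

static void example_free_accounted(struct page *page, int order)
{
	memcg_kmem_uncharge(page, order);
	__free_pages(page, order);
}
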
  
  /**
  static __always_inline struct kmem_cache *
  memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
  {
-       if (!memcg_kmem_enabled())
+       if (__memcg_kmem_bypass(gfp))
                return cachep;
-       if (gfp & __GFP_NOACCOUNT)
-               return cachep;
-       if (gfp & __GFP_NOFAIL)
-               return cachep;
-       if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
-               return cachep;
-       if (unlikely(fatal_signal_pending(current)))
-               return cachep;
        return __memcg_kmem_get_cache(cachep);
  }
  
@@@ -869,13 -827,6 +831,6 @@@ static __always_inline void memcg_kmem_
        if (memcg_kmem_enabled())
                __memcg_kmem_put_cache(cachep);
  }
- static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
- {
-       if (!memcg_kmem_enabled())
-               return NULL;
-       return __mem_cgroup_from_kmem(ptr);
- }
  #else
  #define for_each_memcg_cache_index(_idx)      \
        for (; NULL; )
@@@ -890,18 -841,12 +845,12 @@@ static inline bool memcg_kmem_is_active
        return false;
  }
  
- static inline bool
- memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+ static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
  {
-       return true;
+       return 0;
  }
  
- static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
- {
- }
- static inline void
- memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+ static inline void memcg_kmem_uncharge(struct page *page, int order)
  {
  }
  
@@@ -927,11 -872,5 +876,5 @@@ memcg_kmem_get_cache(struct kmem_cache 
  static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
  {
  }
- static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
- {
-       return NULL;
- }
  #endif /* CONFIG_MEMCG_KMEM */
  #endif /* _LINUX_MEMCONTROL_H */
diff --combined include/linux/sched.h
index 4effb1025fbb1555bc9c3ce6f80d98db004271a2,02b63957a721a587d2f1db37e8628f869350d63c..4069febaa34af9e93be8bb98e807db04a67c4380
@@@ -384,6 -384,7 +384,7 @@@ extern int proc_dowatchdog_thresh(struc
                                  void __user *buffer,
                                  size_t *lenp, loff_t *ppos);
  extern unsigned int  softlockup_panic;
+ extern unsigned int  hardlockup_panic;
  void lockup_detector_init(void);
  #else
  static inline void touch_softlockup_watchdog(void)
@@@ -599,42 -600,33 +600,42 @@@ struct task_cputime_atomic 
                .sum_exec_runtime = ATOMIC64_INIT(0),           \
        }
  
 -#ifdef CONFIG_PREEMPT_COUNT
 -#define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
 -#else
 -#define PREEMPT_DISABLED      PREEMPT_ENABLED
 -#endif
 +#define PREEMPT_DISABLED      (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /*
 - * Disable preemption until the scheduler is running.
 - * Reset by start_kernel()->sched_init()->init_idle().
 + * Disable preemption until the scheduler is running -- use an unconditional
 + * value so that it also works on !PREEMPT_COUNT kernels.
   *
 - * We include PREEMPT_ACTIVE to avoid cond_resched() from working
 - * before the scheduler is active -- see should_resched().
 + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
   */
 -#define INIT_PREEMPT_COUNT    (PREEMPT_DISABLED + PREEMPT_ACTIVE)
 +#define INIT_PREEMPT_COUNT    PREEMPT_OFFSET
 +
 +/*
 + * Initial preempt_count value; reflects the preempt_count schedule invariant
 + * which states that during context switches:
 + *
 + *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 + *
 + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 + * Note: See finish_task_switch().
 + */
 +#define FORK_PREEMPT_COUNT    (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
   * @cputime_atomic:   atomic thread group interval timers.
 - * @running:          non-zero when there are timers running and
 - *                    @cputime receives updates.
 + * @running:          true when there are timers running and
 + *                    @cputime_atomic receives updates.
 + * @checking_timer:   true when a thread in the group is in the
 + *                    process of checking for thread group timers.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
 -      int running;
 +      bool running;
 +      bool checking_timer;
  };
  
  #include <linux/rwsem.h>
@@@ -771,6 -763,18 +772,6 @@@ struct signal_struct 
        unsigned audit_tty_log_passwd;
        struct tty_audit_buf *tty_audit_buf;
  #endif
 -#ifdef CONFIG_CGROUPS
 -      /*
 -       * group_rwsem prevents new tasks from entering the threadgroup and
 -       * member tasks from exiting,a more specifically, setting of
 -       * PF_EXITING.  fork and exit paths are protected with this rwsem
 -       * using threadgroup_change_begin/end().  Users which require
 -       * threadgroup to remain stable should use threadgroup_[un]lock()
 -       * which also takes care of exec path.  Currently, cgroup is the
 -       * only user.
 -       */
 -      struct rw_semaphore group_rwsem;
 -#endif
  
        oom_flags_t oom_flags;
        short oom_score_adj;            /* OOM kill score adjustment */
@@@ -837,7 -841,7 +838,7 @@@ struct user_struct 
        struct hlist_node uidhash_node;
        kuid_t uid;
  
 -#ifdef CONFIG_PERF_EVENTS
 +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
        atomic_long_t locked_vm;
  #endif
  };
@@@ -1136,6 -1140,8 +1137,6 @@@ struct sched_domain_topology_level 
  #endif
  };
  
 -extern struct sched_domain_topology_level *sched_domain_topology;
 -
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@@ -1184,10 -1190,10 +1185,10 @@@ struct load_weight 
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
 - * 1) load_avg factors the amount of time that a sched_entity is
 - * runnable on a rq into its weight. For cfs_rq, it is the aggregated
 - * such weights of all runnable and blocked sched_entities.
 - * 2) util_avg factors frequency scaling into the amount of time
 + * 1) load_avg factors frequency scaling into the amount of time that a
 + * sched_entity is runnable on a rq, scaled by its weight. For cfs_rq, it is the
 + * aggregated such weights of all runnable and blocked sched_entities.
 + * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
@@@ -1337,12 -1343,10 +1338,12 @@@ struct sched_dl_entity 
  
  union rcu_special {
        struct {
 -              bool blocked;
 -              bool need_qs;
 -      } b;
 -      short s;
 +              u8 blocked;
 +              u8 need_qs;
 +              u8 exp_need_qs;
 +              u8 pad; /* Otherwise the compiler can store garbage here. */
 +      } b; /* Bits. */
 +      u32 s; /* Set of bits. */
  };
  struct rcu_node;
  
@@@ -1460,7 -1464,9 +1461,9 @@@ struct task_struct 
        unsigned sched_reset_on_fork:1;
        unsigned sched_contributes_to_load:1;
        unsigned sched_migrated:1;
+ #ifdef CONFIG_MEMCG
+       unsigned memcg_may_oom:1;
+ #endif
  #ifdef CONFIG_MEMCG_KMEM
        unsigned memcg_kmem_skip_account:1;
  #endif
  
        unsigned long sas_ss_sp;
        size_t sas_ss_size;
-       int (*notifier)(void *priv);
-       void *notifier_data;
-       sigset_t *notifier_mask;
        struct callback_head *task_works;
  
        struct audit_context *audit_context;
        unsigned long trace_recursion;
  #endif /* CONFIG_TRACING */
  #ifdef CONFIG_MEMCG
-       struct memcg_oom_info {
-               struct mem_cgroup *memcg;
-               gfp_t gfp_mask;
-               int order;
-               unsigned int may_oom:1;
-       } memcg_oom;
+       struct mem_cgroup *memcg_in_oom;
+       gfp_t memcg_oom_gfp_mask;
+       int memcg_oom_order;
+       /* number of pages to reclaim on returning to userland */
+       unsigned int memcg_nr_pages_over_high;
  #endif
  #ifdef CONFIG_UPROBES
        struct uprobe_task *utask;
@@@ -2461,21 -2465,29 +2462,29 @@@ extern void ignore_signals(struct task_
  extern void flush_signal_handlers(struct task_struct *, int force_default);
  extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
  
- static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
+ static inline int kernel_dequeue_signal(siginfo_t *info)
  {
-       unsigned long flags;
+       struct task_struct *tsk = current;
+       siginfo_t __info;
        int ret;
  
-       spin_lock_irqsave(&tsk->sighand->siglock, flags);
-       ret = dequeue_signal(tsk, mask, info);
-       spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+       spin_lock_irq(&tsk->sighand->siglock);
+       ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
+       spin_unlock_irq(&tsk->sighand->siglock);
  
        return ret;
  }
  
- extern void block_all_signals(int (*notifier)(void *priv), void *priv,
-                             sigset_t *mask);
- extern void unblock_all_signals(void);
+ static inline void kernel_signal_stop(void)
+ {
+       spin_lock_irq(&current->sighand->siglock);
+       if (current->jobctl & JOBCTL_STOP_DEQUEUED)
+               __set_current_state(TASK_STOPPED);
+       spin_unlock_irq(&current->sighand->siglock);
+       schedule();
+ }
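
A hypothetical kernel thread using the two new helpers (the loop is illustrative only): pending signals are drained with kernel_dequeue_signal(), and a dequeued stop signal parks the thread via kernel_signal_stop() until SIGCONT.

static int example_signal_aware_kthread(void *unused)
{
	allow_signal(SIGKILL);
	allow_signal(SIGSTOP);

	while (!kthread_should_stop()) {
		if (signal_pending(current)) {
			int sig = kernel_dequeue_signal(NULL);

			if (sig == SIGSTOP)
				kernel_signal_stop();
			else if (sig == SIGKILL)
				break;
		}
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
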
  extern void release_task(struct task_struct * p);
  extern int send_sig_info(int, struct siginfo *, struct task_struct *);
  extern int force_sigsegv(int, struct task_struct *);
diff --combined include/linux/skbuff.h
index 24f4dfd94c517b3b387682509180dee161e0912d,4d82b886af839af39f92325da3952567eb4464c6..4355129fff91b6f188136af2a499d6100f2e5bfd
@@@ -463,15 -463,6 +463,15 @@@ static inline u32 skb_mstamp_us_delta(c
        return delta_us;
  }
  
 +static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
 +                                  const struct skb_mstamp *t0)
 +{
 +      s32 diff = t1->stamp_jiffies - t0->stamp_jiffies;
 +
 +      if (!diff)
 +              diff = t1->stamp_us - t0->stamp_us;
 +      return diff > 0;
 +}
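
A small, hypothetical comparison helper showing the intended use: skb_mstamp_after() orders two timestamps by jiffies first and falls back to the microsecond part when both fall in the same jiffy.

static bool example_sample_is_newer(const struct skb_mstamp *sample,
				    const struct skb_mstamp *reference)
{
	return skb_mstamp_after(sample, reference);
}
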
  
  /** 
   *    struct sk_buff - socket buffer
@@@ -1224,7 -1215,7 +1224,7 @@@ static inline int skb_cloned(const stru
  
  static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
  {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
  
        if (skb_cloned(skb))
                return pskb_expand_head(skb, 0, 0, pri);
@@@ -1308,7 -1299,7 +1308,7 @@@ static inline int skb_shared(const stru
   */
  static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
  {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, pri);
  
  static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
                                          gfp_t pri)
  {
-       might_sleep_if(pri & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(pri));
        if (skb_cloned(skb)) {
                struct sk_buff *nskb = skb_copy(skb, pri);
  
diff --combined include/net/sock.h
index aeed5c95f3caedcdb4c10668c67764d8557e9369,509694740bede696abd69910aee0d1e87a68caf5..59a71965b47682edadc1b37b69d112c8b347a95a
@@@ -150,10 -150,6 +150,10 @@@ typedef __u64 __bitwise __addrpair
   *    @skc_node: main hash linkage for various protocol lookup tables
   *    @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
   *    @skc_tx_queue_mapping: tx queue number for this connection
 + *    @skc_flags: place holder for sk_flags
 + *            %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 + *            %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 + *    @skc_incoming_cpu: record/match cpu processing incoming packets
   *    @skc_refcnt: reference count
   *
   *    This is the minimal network layer representation of sockets, the header
@@@ -204,16 -200,6 +204,16 @@@ struct sock_common 
  
        atomic64_t              skc_cookie;
  
 +      /* following fields are padding to force
 +       * offset(struct sock, sk_refcnt) == 128 on 64bit arches
 +       * assuming IPV6 is enabled. We use this padding differently
 +       * for different kinds of 'sockets'
 +       */
 +      union {
 +              unsigned long   skc_flags;
 +              struct sock     *skc_listener; /* request_sock */
 +              struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
 +      };
        /*
         * fields between dontcopy_begin/dontcopy_end
         * are not copied in sock_copy()
                struct hlist_nulls_node skc_nulls_node;
        };
        int                     skc_tx_queue_mapping;
 +      union {
 +              int             skc_incoming_cpu;
 +              u32             skc_rcv_wnd;
 +              u32             skc_tw_rcv_nxt; /* struct tcp_timewait_sock  */
 +      };
 +
        atomic_t                skc_refcnt;
        /* private: */
        int                     skc_dontcopy_end[0];
 +      union {
 +              u32             skc_rxhash;
 +              u32             skc_window_clamp;
 +              u32             skc_tw_snd_nxt; /* struct tcp_timewait_sock */
 +      };
        /* public: */
  };
  
@@@ -268,6 -243,8 +268,6 @@@ struct cg_proto
    *   @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
    *   @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
    *   @sk_sndbuf: size of send buffer in bytes
 -  *   @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 -  *              %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
    *   @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
    *   @sk_no_check_rx: allow zero checksum in RX packets
    *   @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
    *   @sk_rcvlowat: %SO_RCVLOWAT setting
    *   @sk_rcvtimeo: %SO_RCVTIMEO setting
    *   @sk_sndtimeo: %SO_SNDTIMEO setting
 -  *   @sk_rxhash: flow hash received from netif layer
 -  *   @sk_incoming_cpu: record cpu processing incoming packets
    *   @sk_txhash: computed flow hash for use on transmit
    *   @sk_filter: socket filtering instructions
    *   @sk_timer: sock cleanup timer
@@@ -352,9 -331,6 +352,9 @@@ struct sock 
  #define sk_v6_daddr           __sk_common.skc_v6_daddr
  #define sk_v6_rcv_saddr       __sk_common.skc_v6_rcv_saddr
  #define sk_cookie             __sk_common.skc_cookie
 +#define sk_incoming_cpu               __sk_common.skc_incoming_cpu
 +#define sk_flags              __sk_common.skc_flags
 +#define sk_rxhash             __sk_common.skc_rxhash
  
        socket_lock_t           sk_lock;
        struct sk_buff_head     sk_receive_queue;
        } sk_backlog;
  #define sk_rmem_alloc sk_backlog.rmem_alloc
        int                     sk_forward_alloc;
 -#ifdef CONFIG_RPS
 -      __u32                   sk_rxhash;
 -#endif
 -      u16                     sk_incoming_cpu;
 -      /* 16bit hole
 -       * Warned : sk_incoming_cpu can be set from softirq,
 -       * Do not use this hole without fully understanding possible issues.
 -       */
  
        __u32                   sk_txhash;
  #ifdef CONFIG_NET_RX_BUSY_POLL
  #ifdef CONFIG_XFRM
        struct xfrm_policy      *sk_policy[2];
  #endif
 -      unsigned long           sk_flags;
        struct dst_entry        *sk_rx_dst;
        struct dst_entry __rcu  *sk_dst_cache;
        spinlock_t              sk_dst_lock;
@@@ -774,7 -759,7 +774,7 @@@ static inline int sk_memalloc_socks(voi
  
  #endif
  
 -static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask)
 +static inline gfp_t sk_gfp_atomic(const struct sock *sk, gfp_t gfp_mask)
  {
        return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC);
  }
@@@ -1537,13 -1522,6 +1537,13 @@@ void sock_kfree_s(struct sock *sk, voi
  void sock_kzfree_s(struct sock *sk, void *mem, int size);
  void sk_send_sigurg(struct sock *sk);
  
 +struct sockcm_cookie {
 +      u32 mark;
 +};
 +
 +int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 +                 struct sockcm_cookie *sockc);
 +
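
A hypothetical send-path snippet (the function is illustrative): the cookie starts from the socket's default mark and sock_cmsg_send() overrides it from any SO_MARK control message supplied with this call.

static int example_mark_from_cmsg(struct sock *sk, struct msghdr *msg, u32 *mark)
{
	struct sockcm_cookie sockc = { .mark = sk->sk_mark };
	int err;

	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (err)
			return err;
	}

	*mark = sockc.mark;
	return 0;
}
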
  /*
   * Functions to fill in entries in struct proto_ops when a protocol
   * does not implement a particular function.
@@@ -1684,16 -1662,12 +1684,16 @@@ static inline void sock_graft(struct so
  kuid_t sock_i_uid(struct sock *sk);
  unsigned long sock_i_ino(struct sock *sk);
  
 -static inline void sk_set_txhash(struct sock *sk)
 +static inline u32 net_tx_rndhash(void)
  {
 -      sk->sk_txhash = prandom_u32();
 +      u32 v = prandom_u32();
  
 -      if (unlikely(!sk->sk_txhash))
 -              sk->sk_txhash = 1;
 +      return v ?: 1;
 +}
 +
 +static inline void sk_set_txhash(struct sock *sk)
 +{
 +      sk->sk_txhash = net_tx_rndhash();
  }
  
  static inline void sk_rethink_txhash(struct sock *sk)
@@@ -2054,7 -2028,7 +2054,7 @@@ struct sk_buff *sk_stream_alloc_skb(str
   */
  static inline struct page_frag *sk_page_frag(struct sock *sk)
  {
-       if (sk->sk_allocation & __GFP_WAIT)
+       if (gfpflags_allow_blocking(sk->sk_allocation))
                return &current->task_frag;
  
        return &sk->sk_frag;
@@@ -2231,14 -2205,6 +2231,14 @@@ static inline bool sk_fullsock(const st
        return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
  }
  
 +/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV
 + * SYNACK messages can be attached to either one (depending on SYNCOOKIE)
 + */
 +static inline bool sk_listener(const struct sock *sk)
 +{
 +      return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
 +}
 +
  void sock_enable_timestamp(struct sock *sk, int flag);
  int sock_get_timestamp(struct sock *, struct timeval __user *);
  int sock_get_timestampns(struct sock *, struct timespec __user *);
diff --combined kernel/audit.c
index 8a056a32ded7d2b4560612af7d84ac317f7a17b3,6ae6e2b62e3e5aa567526b0b24dc622e5de33e98..5ffcbd354a520b88781ed2d66c7839a7aaa7f86d
@@@ -407,33 -407,16 +407,33 @@@ static void audit_printk_skb(struct sk_
  static void kauditd_send_skb(struct sk_buff *skb)
  {
        int err;
 +      int attempts = 0;
 +#define AUDITD_RETRIES 5
 +
 +restart:
        /* take a reference in case we can't send it and we want to hold it */
        skb_get(skb);
        err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
        if (err < 0) {
 -              BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 +              pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
 +                     audit_pid, err);
                if (audit_pid) {
 -                      pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
 -                      audit_log_lost("auditd disappeared");
 -                      audit_pid = 0;
 -                      audit_sock = NULL;
 +                      if (err == -ECONNREFUSED || err == -EPERM
 +                          || ++attempts >= AUDITD_RETRIES) {
 +                              char s[32];
 +
 +                              snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
 +                              audit_log_lost(s);
 +                              audit_pid = 0;
 +                              audit_sock = NULL;
 +                      } else {
 +                              pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
 +                                      attempts, audit_pid);
 +                              set_current_state(TASK_INTERRUPTIBLE);
 +                              schedule();
 +                              __set_current_state(TASK_RUNNING);
 +                              goto restart;
 +                      }
                }
                /* we might get lucky and get this in the next auditd */
                audit_hold_skb(skb);
@@@ -701,22 -684,25 +701,22 @@@ static int audit_netlink_ok(struct sk_b
        return err;
  }
  
 -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
 +static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
  {
 -      int rc = 0;
        uid_t uid = from_kuid(&init_user_ns, current_uid());
        pid_t pid = task_tgid_nr(current);
  
        if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
                *ab = NULL;
 -              return rc;
 +              return;
        }
  
        *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
        if (unlikely(!*ab))
 -              return rc;
 +              return;
        audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
        audit_log_session_info(*ab);
        audit_log_task_context(*ab);
 -
 -      return rc;
  }
  
  int is_audit_feature_set(int i)
@@@ -1371,16 -1357,16 +1371,16 @@@ struct audit_buffer *audit_log_start(st
        if (unlikely(audit_filter_type(type)))
                return NULL;
  
-       if (gfp_mask & __GFP_WAIT) {
+       if (gfp_mask & __GFP_DIRECT_RECLAIM) {
                if (audit_pid && audit_pid == current->pid)
-                       gfp_mask &= ~__GFP_WAIT;
+                       gfp_mask &= ~__GFP_DIRECT_RECLAIM;
                else
                        reserve = 0;
        }
  
        while (audit_backlog_limit
               && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
-               if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+               if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
                        long sleep_time;
  
                        sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
@@@ -1580,14 -1566,14 +1580,14 @@@ void audit_log_n_string(struct audit_bu
   * @string: string to be checked
   * @len: max length of the string to check
   */
 -int audit_string_contains_control(const char *string, size_t len)
 +bool audit_string_contains_control(const char *string, size_t len)
  {
        const unsigned char *p;
        for (p = string; p < (const unsigned char *)string + len; p++) {
                if (*p == '"' || *p < 0x21 || *p > 0x7e)
 -                      return 1;
 +                      return true;
        }
 -      return 0;
 +      return false;
  }
  
  /**
diff --combined kernel/cgroup.c
index b9d0cce3f9ce54937fea988b531d0cc7bf52f692,311b00c30a889e02eec2fbbffd0b0565a347943e..f1603c153890d2b9dbd37a5c687fd297c6137f24
@@@ -45,7 -45,7 +45,7 @@@
  #include <linux/sched.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>
 -#include <linux/rwsem.h>
 +#include <linux/percpu-rwsem.h>
  #include <linux/string.h>
  #include <linux/sort.h>
  #include <linux/kmod.h>
@@@ -75,7 -75,7 +75,7 @@@
   * cgroup_mutex is the master lock.  Any modification to cgroup or its
   * hierarchy must be performed while holding it.
   *
 - * css_set_rwsem protects task->cgroups pointer, the list of css_set
 + * css_set_lock protects task->cgroups pointer, the list of css_set
   * objects, and the chain of tasks off each css_set.
   *
   * These locks are exported if CONFIG_PROVE_RCU so that accessors in
   */
  #ifdef CONFIG_PROVE_RCU
  DEFINE_MUTEX(cgroup_mutex);
 -DECLARE_RWSEM(css_set_rwsem);
 +DEFINE_SPINLOCK(css_set_lock);
  EXPORT_SYMBOL_GPL(cgroup_mutex);
 -EXPORT_SYMBOL_GPL(css_set_rwsem);
 +EXPORT_SYMBOL_GPL(css_set_lock);
  #else
  static DEFINE_MUTEX(cgroup_mutex);
 -static DECLARE_RWSEM(css_set_rwsem);
 +static DEFINE_SPINLOCK(css_set_lock);
  #endif
  
  /*
@@@ -103,8 -103,6 +103,8 @@@ static DEFINE_SPINLOCK(cgroup_idr_lock)
   */
  static DEFINE_SPINLOCK(release_agent_path_lock);
  
 +struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
 +
  #define cgroup_assert_mutex_or_rcu_locked()                           \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
                           !lockdep_is_held(&cgroup_mutex),             \
@@@ -138,27 -136,6 +138,27 @@@ static const char *cgroup_subsys_name[
  };
  #undef SUBSYS
  
 +/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
 +#define SUBSYS(_x)                                                            \
 +      DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                 \
 +      DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                  \
 +      EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                      \
 +      EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
 +#include <linux/cgroup_subsys.h>
 +#undef SUBSYS
 +
 +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
 +static struct static_key_true *cgroup_subsys_enabled_key[] = {
 +#include <linux/cgroup_subsys.h>
 +};
 +#undef SUBSYS
 +
 +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
 +static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 +#include <linux/cgroup_subsys.h>
 +};
 +#undef SUBSYS
 +
  /*
   * The default hierarchy, reserved for the subsystems that are otherwise
   * unattached - it never has more than a single cgroup, and all tasks are
@@@ -173,6 -150,12 +173,6 @@@ EXPORT_SYMBOL_GPL(cgrp_dfl_root)
   */
  static bool cgrp_dfl_root_visible;
  
 -/*
 - * Set by the boot param of the same name and makes subsystems with NULL
 - * ->dfl_files to use ->legacy_files on the default hierarchy.
 - */
 -static bool cgroup_legacy_files_on_dfl;
 -
  /* some controllers are not supported in the default hierarchy */
  static unsigned long cgrp_dfl_root_inhibit_ss_mask;
  
@@@ -200,7 -183,6 +200,7 @@@ static u64 css_serial_nr_next = 1
   */
  static unsigned long have_fork_callback __read_mostly;
  static unsigned long have_exit_callback __read_mostly;
 +static unsigned long have_free_callback __read_mostly;
  
  /* Ditto for the can_fork callback. */
  static unsigned long have_canfork_callback __read_mostly;
@@@ -210,87 -192,14 +210,87 @@@ static struct cftype cgroup_legacy_base
  
  static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned long ss_mask);
 +static void css_task_iter_advance(struct css_task_iter *it);
  static int cgroup_destroy_locked(struct cgroup *cgrp);
  static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
                      bool visible);
  static void css_release(struct percpu_ref *ref);
  static void kill_css(struct cgroup_subsys_state *css);
 -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 +static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 +                            struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);
  
 +/**
 + * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 + * @ssid: subsys ID of interest
 + *
 + * cgroup_subsys_enabled() can only be used with literal subsys names, which
 + * is fine for individual subsystems but unsuitable for cgroup core.  This
 + * is a slower, static_key_enabled()-based test indexed by @ssid.
 + */
 +static bool cgroup_ssid_enabled(int ssid)
 +{
 +      return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 +}
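The SUBSYS() blocks above expand <linux/cgroup_subsys.h> several times to stamp out one static key per controller plus ID-indexed lookup arrays, and cgroup_ssid_enabled() is the index-based test built on top. A user-space sketch of the same X-macro pattern, with a made-up three-entry subsystem list and plain bools standing in for static keys:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for <linux/cgroup_subsys.h>: the same list expanded repeatedly. */
#define SUBSYS_LIST(x) x(cpu) x(memory) x(io)

/* Expansion 1: enum of subsystem IDs. */
#define SUBSYS(_x) _x##_cgrp_id,
enum cgroup_subsys_id { SUBSYS_LIST(SUBSYS) CGROUP_SUBSYS_COUNT };
#undef SUBSYS

/* Expansion 2: one "enabled" flag per subsystem (static keys in the kernel). */
#define SUBSYS(_x) static bool _x##_cgrp_subsys_enabled_key = true;
SUBSYS_LIST(SUBSYS)
#undef SUBSYS

/* Expansion 3: ID-indexed array pointing at those flags. */
#define SUBSYS(_x) [_x##_cgrp_id] = &_x##_cgrp_subsys_enabled_key,
static bool *cgroup_subsys_enabled_key[] = { SUBSYS_LIST(SUBSYS) };
#undef SUBSYS

/* The slower, index-based test used by cgroup core. */
static bool cgroup_ssid_enabled(int ssid)
{
        return *cgroup_subsys_enabled_key[ssid];
}

int main(void)
{
        memory_cgrp_subsys_enabled_key = false; /* e.g. disabled on the command line */
        for (int ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++)
                printf("ssid %d enabled: %d\n", ssid, cgroup_ssid_enabled(ssid));
        return 0;
}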
 +
 +/**
 + * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 + * @cgrp: the cgroup of interest
 + *
 + * The default hierarchy is the v2 interface of cgroup and this function
 + * can be used to test whether a cgroup is on the default hierarchy for
 + * cases where a subsystem should behave differently depending on the
 + * interface version.
 + *
 + * The set of behaviors which change on the default hierarchy is still
 + * being determined and the mount option is prefixed with __DEVEL__.
 + *
 + * List of changed behaviors:
 + *
 + * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 + *   and "name" are disallowed.
 + *
 + * - When mounting an existing superblock, mount options should match.
 + *
 + * - Remount is disallowed.
 + *
 + * - rename(2) is disallowed.
 + *
 + * - "tasks" is removed.  Everything should be at process granularity.  Use
 + *   "cgroup.procs" instead.
 + *
 + * - "cgroup.procs" is not sorted.  PIDs will be unique unless they get
 + *   recycled in between reads.
 + *
 + * - "release_agent" and "notify_on_release" are removed.  Replacement
 + *   notification mechanism will be implemented.
 + *
 + * - "cgroup.clone_children" is removed.
 + *
 + * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 + *   and its descendants contain no task; otherwise, 1.  The file also
 + *   generates kernfs notification which can be monitored through poll and
 + *   [di]notify when the value of the file changes.
 + *
 + * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 + *   take masks of ancestors with non-empty cpus/mems, instead of being
 + *   moved to an ancestor.
 + *
 + * - cpuset: a task can be moved into an empty cpuset, and again it takes
 + *   masks of ancestors.
 + *
 + * - memcg: use_hierarchy is on by default and the cgroup file for the flag
 + *   is not created.
 + *
 + * - blkcg: blk-throttle becomes properly hierarchical.
 + *
 + * - debug: disallowed on the default hierarchy.
 + */
 +static bool cgroup_on_dfl(const struct cgroup *cgrp)
 +{
 +      return cgrp->root == &cgrp_dfl_root;
 +}
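The test itself reduces to a root-pointer comparison. A toy model, with stand-in struct layouts rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct cgroup_root { const char *name; };
struct cgroup { struct cgroup_root *root; };

static struct cgroup_root cgrp_dfl_root = { "default (v2)" };
static struct cgroup_root legacy_root   = { "legacy (v1)" };

static bool cgroup_on_dfl(const struct cgroup *cgrp)
{
        return cgrp->root == &cgrp_dfl_root; /* identity of the root decides */
}

int main(void)
{
        struct cgroup a = { &cgrp_dfl_root };
        struct cgroup b = { &legacy_root };

        printf("a on default hierarchy: %d\n", cgroup_on_dfl(&a)); /* 1 */
        printf("b on default hierarchy: %d\n", cgroup_on_dfl(&b)); /* 0 */
        return 0;
}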
 +
  /* IDR wrappers which synchronize using cgroup_idr_lock */
  static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
                            gfp_t gfp_mask)
  
        idr_preload(gfp_mask);
        spin_lock_bh(&cgroup_idr_lock);
-       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
+       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
        spin_unlock_bh(&cgroup_idr_lock);
        idr_preload_end();
        return ret;
@@@ -423,22 -332,6 +423,22 @@@ static inline bool cgroup_is_dead(cons
        return !(cgrp->self.flags & CSS_ONLINE);
  }
  
 +static void cgroup_get(struct cgroup *cgrp)
 +{
 +      WARN_ON_ONCE(cgroup_is_dead(cgrp));
 +      css_get(&cgrp->self);
 +}
 +
 +static bool cgroup_tryget(struct cgroup *cgrp)
 +{
 +      return css_tryget(&cgrp->self);
 +}
 +
 +static void cgroup_put(struct cgroup *cgrp)
 +{
 +      css_put(&cgrp->self);
 +}
 +
  struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
  {
        struct cgroup *cgrp = of->kn->parent->priv;
@@@ -588,31 -481,19 +588,31 @@@ struct css_set init_css_set = 
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
 +      .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
  };
  
  static int css_set_count      = 1;    /* 1 for init_css_set */
  
 +/**
 + * css_set_populated - does a css_set contain any tasks?
 + * @cset: target css_set
 + */
 +static bool css_set_populated(struct css_set *cset)
 +{
 +      lockdep_assert_held(&css_set_lock);
 +
 +      return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
 +}
 +
  /**
   * cgroup_update_populated - updated populated count of a cgroup
   * @cgrp: the target cgroup
   * @populated: inc or dec populated count
   *
 - * @cgrp is either getting the first task (css_set) or losing the last.
 - * Update @cgrp->populated_cnt accordingly.  The count is propagated
 - * towards root so that a given cgroup's populated_cnt is zero iff the
 - * cgroup and all its descendants are empty.
 + * One of the css_sets associated with @cgrp is either getting its first
 + * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
 + * count is propagated towards root so that a given cgroup's populated_cnt
 + * is zero iff the cgroup and all its descendants don't contain any tasks.
   *
   * @cgrp's interface file "cgroup.populated" is zero if
   * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
   */
  static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
  {
 -      lockdep_assert_held(&css_set_rwsem);
 +      lockdep_assert_held(&css_set_lock);
  
        do {
                bool trigger;
                if (!trigger)
                        break;
  
 -              if (cgrp->populated_kn)
 -                      kernfs_notify(cgrp->populated_kn);
 +              check_for_release(cgrp);
 +              cgroup_file_notify(&cgrp->events_file);
 +
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
  }
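As the comment describes, the walk adjusts populated_cnt on each ancestor and stops at the first one whose count does not cross zero, so release checks and notifications fire only on 0 <-> non-zero transitions. A standalone toy model of that propagation (field and helper names are invented; check_for_release() and the kernfs notification are reduced to a printf):

#include <stdbool.h>
#include <stdio.h>

struct cg {
        const char *name;
        struct cg *parent;
        int populated_cnt;
};

static void update_populated(struct cg *cgrp, bool populated)
{
        do {
                bool trigger;

                if (populated)
                        trigger = !cgrp->populated_cnt++; /* 0 -> 1 transition? */
                else
                        trigger = !--cgrp->populated_cnt; /* 1 -> 0 transition? */

                if (!trigger)
                        break;

                printf("%s: populated flipped to %d\n", cgrp->name, populated);
                cgrp = cgrp->parent;
        } while (cgrp);
}

int main(void)
{
        struct cg root = { "root", NULL, 0 };
        struct cg a = { "a", &root, 0 };
        struct cg b = { "b", &a, 0 };

        update_populated(&b, true);  /* flips b, a and root */
        update_populated(&a, true);  /* only bumps a's count; root unchanged */
        update_populated(&b, false); /* flips b back; a stays populated */
        return 0;
}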
  
 +/**
 + * css_set_update_populated - update populated state of a css_set
 + * @cset: target css_set
 + * @populated: whether @cset is populated or depopulated
 + *
 + * @cset is either getting the first task or losing the last.  Update the
 + * ->populated_cnt of all associated cgroups accordingly.
 + */
 +static void css_set_update_populated(struct css_set *cset, bool populated)
 +{
 +      struct cgrp_cset_link *link;
 +
 +      lockdep_assert_held(&css_set_lock);
 +
 +      list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 +              cgroup_update_populated(link->cgrp, populated);
 +}
 +
 +/**
 + * css_set_move_task - move a task from one css_set to another
 + * @task: task being moved
 + * @from_cset: css_set @task currently belongs to (may be NULL)
 + * @to_cset: new css_set @task is being moved to (may be NULL)
 + * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 + *
 + * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 + * css_set, @from_cset can be NULL.  If @task is being disassociated
 + * instead of moved, @to_cset can be NULL.
 + *
 + * This function automatically handles populated_cnt updates and
 + * css_task_iter adjustments but the caller is responsible for managing
 + * @from_cset and @to_cset's reference counts.
 + */
 +static void css_set_move_task(struct task_struct *task,
 +                            struct css_set *from_cset, struct css_set *to_cset,
 +                            bool use_mg_tasks)
 +{
 +      lockdep_assert_held(&css_set_lock);
 +
 +      if (from_cset) {
 +              struct css_task_iter *it, *pos;
 +
 +              WARN_ON_ONCE(list_empty(&task->cg_list));
 +
 +              /*
 +               * @task is leaving, advance task iterators which are
 +               * pointing to it so that they can resume at the next
 +               * position.  Advancing an iterator might remove it from
 +               * the list, use safe walk.  See css_task_iter_advance*()
 +               * the list, so use a safe walk.  See css_task_iter_advance*()
 +               */
 +              list_for_each_entry_safe(it, pos, &from_cset->task_iters,
 +                                       iters_node)
 +                      if (it->task_pos == &task->cg_list)
 +                              css_task_iter_advance(it);
 +
 +              list_del_init(&task->cg_list);
 +              if (!css_set_populated(from_cset))
 +                      css_set_update_populated(from_cset, false);
 +      } else {
 +              WARN_ON_ONCE(!list_empty(&task->cg_list));
 +      }
 +
 +      if (to_cset) {
 +              /*
 +               * We are synchronized through cgroup_threadgroup_rwsem
 +               * against PF_EXITING setting such that we can't race
 +               * against cgroup_exit() changing the css_set to
 +               * init_css_set and dropping the old one.
 +               */
 +              WARN_ON_ONCE(task->flags & PF_EXITING);
 +
 +              if (!css_set_populated(to_cset))
 +                      css_set_update_populated(to_cset, true);
 +              rcu_assign_pointer(task->cgroups, to_cset);
 +              list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 +                                                           &to_cset->tasks);
 +      }
 +}
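One subtlety above: any css_task_iter currently parked on the departing task is advanced before the task is unlinked, so in-flight iterations resume at the next entry instead of dangling. A self-contained sketch of that idea on an ordinary singly linked list (no relation to the kernel's list_head API beyond the shape):

#include <stdio.h>

struct node { int id; struct node *next; };
struct iter { struct node *pos; };

/* Before unlinking @victim, nudge every iterator sitting on it forward,
 * the same way css_set_move_task() advances css_task_iters before
 * list_del_init(&task->cg_list). */
static void remove_node(struct node **head, struct node *victim,
                        struct iter *iters, int nr_iters)
{
        struct node **pp;

        for (int i = 0; i < nr_iters; i++)
                if (iters[i].pos == victim)
                        iters[i].pos = victim->next;

        for (pp = head; *pp; pp = &(*pp)->next) {
                if (*pp == victim) {
                        *pp = victim->next;
                        break;
                }
        }
}

int main(void)
{
        struct node c = { 3, NULL };
        struct node b = { 2, &c };
        struct node a = { 1, &b };
        struct node *head = &a;
        struct iter it = { &b };        /* a walk currently sitting on node 2 */

        remove_node(&head, &b, &it, 1); /* node 2 leaves mid-walk */

        for (struct node *n = it.pos; n; n = n->next)
                printf("iterator resumes at %d\n", n->id); /* prints 3 */
        return 0;
}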
 +
  /*
   * hash table for cgroup groups. This improves the performance to find
   * an existing css_set. This hash doesn't (currently) take into
@@@ -749,7 -549,7 +749,7 @@@ static void put_css_set_locked(struct c
        struct cgroup_subsys *ss;
        int ssid;
  
 -      lockdep_assert_held(&css_set_rwsem);
 +      lockdep_assert_held(&css_set_lock);
  
        if (!atomic_dec_and_test(&cset->refcount))
                return;
        css_set_count--;
  
        list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 -              struct cgroup *cgrp = link->cgrp;
 -
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
 -
 -              /* @cgrp can't go away while we're holding css_set_rwsem */
 -              if (list_empty(&cgrp->cset_links)) {
 -                      cgroup_update_populated(cgrp, false);
 -                      check_for_release(cgrp);
 -              }
 -
 +              if (cgroup_parent(link->cgrp))
 +                      cgroup_put(link->cgrp);
                kfree(link);
        }
  
@@@ -781,9 -588,9 +781,9 @@@ static void put_css_set(struct css_set 
        if (atomic_add_unless(&cset->refcount, -1, 1))
                return;
  
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        put_css_set_locked(cset);
 -      up_write(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  }
  
  /*
@@@ -972,15 -779,15 +972,15 @@@ static void link_css_set(struct list_he
        link->cset = cset;
        link->cgrp = cgrp;
  
 -      if (list_empty(&cgrp->cset_links))
 -              cgroup_update_populated(cgrp, true);
 -      list_move(&link->cset_link, &cgrp->cset_links);
 -
        /*
 -       * Always add links to the tail of the list so that the list
 -       * is sorted by order of hierarchy creation
 +       * Always add links to the tail of the lists so that the lists are
 +       * in chronological order.
         */
 +      list_move_tail(&link->cset_link, &cgrp->cset_links);
        list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 +
 +      if (cgroup_parent(cgrp))
 +              cgroup_get(cgrp);
  }
  
  /**
@@@ -1006,11 -813,11 +1006,11 @@@ static struct css_set *find_css_set(str
  
        /* First see if we already have a cgroup group that matches
         * the desired set */
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        cset = find_existing_css_set(old_cset, cgrp, template);
        if (cset)
                get_css_set(cset);
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        if (cset)
                return cset;
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->mg_preload_node);
        INIT_LIST_HEAD(&cset->mg_node);
 +      INIT_LIST_HEAD(&cset->task_iters);
        INIT_HLIST_NODE(&cset->hlist);
  
        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
        memcpy(cset->subsys, template, sizeof(cset->subsys));
  
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        /* Add reference counts and links from the new css_set. */
        list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;
                list_add_tail(&cset->e_cset_node[ssid],
                              &cset->subsys[ssid]->cgroup->e_csets[ssid]);
  
 -      up_write(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        return cset;
  }
  
 -void cgroup_threadgroup_change_begin(struct task_struct *tsk)
 -{
 -      down_read(&tsk->signal->group_rwsem);
 -}
 -
 -void cgroup_threadgroup_change_end(struct task_struct *tsk)
 -{
 -      up_read(&tsk->signal->group_rwsem);
 -}
 -
 -/**
 - * threadgroup_lock - lock threadgroup
 - * @tsk: member task of the threadgroup to lock
 - *
 - * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
 - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
 - * change ->group_leader/pid.  This is useful for cases where the threadgroup
 - * needs to stay stable across blockable operations.
 - *
 - * fork and exit explicitly call threadgroup_change_{begin|end}() for
 - * synchronization.  While held, no new task will be added to threadgroup
 - * and no existing live task will have its PF_EXITING set.
 - *
 - * de_thread() does threadgroup_change_{begin|end}() when a non-leader
 - * sub-thread becomes a new leader.
 - */
 -static void threadgroup_lock(struct task_struct *tsk)
 -{
 -      down_write(&tsk->signal->group_rwsem);
 -}
 -
 -/**
 - * threadgroup_unlock - unlock threadgroup
 - * @tsk: member task of the threadgroup to unlock
 - *
 - * Reverse threadgroup_lock().
 - */
 -static inline void threadgroup_unlock(struct task_struct *tsk)
 -{
 -      up_write(&tsk->signal->group_rwsem);
 -}
 -
  static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
  {
        struct cgroup *root_cgrp = kf_root->kn->priv;
@@@ -1124,15 -972,14 +1124,15 @@@ static void cgroup_destroy_root(struct 
         * Release all the links from cset_links to this hierarchy's
         * root cgroup
         */
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
  
        list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                kfree(link);
        }
 -      up_write(&css_set_rwsem);
 +
 +      spin_unlock_bh(&css_set_lock);
  
        if (!list_empty(&root->root_list)) {
                list_del(&root->root_list);
@@@ -1154,7 -1001,7 +1154,7 @@@ static struct cgroup *cset_cgroup_from_
        struct cgroup *res = NULL;
  
        lockdep_assert_held(&cgroup_mutex);
 -      lockdep_assert_held(&css_set_rwsem);
 +      lockdep_assert_held(&css_set_lock);
  
        if (cset == &init_css_set) {
                res = &root->cgrp;
  
  /*
   * Return the cgroup for "task" from the given hierarchy. Must be
 - * called with cgroup_mutex and css_set_rwsem held.
 + * called with cgroup_mutex and css_set_lock held.
   */
  static struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                            struct cgroup_root *root)
   * update of a tasks cgroup pointer by cgroup_attach_task()
   */
  
 -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
  static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
  static const struct file_operations proc_cgroupstats_operations;
  
@@@ -1238,25 -1086,43 +1238,25 @@@ static char *cgroup_file_name(struct cg
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
   *
 - * returns cft->mode if ->mode is not 0
 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
 - * returns S_IRUGO if it has only a read handler
 - * returns S_IWUSR if it has only a write hander
 + * S_IRUGO for read, S_IWUSR for write.
   */
  static umode_t cgroup_file_mode(const struct cftype *cft)
  {
        umode_t mode = 0;
  
 -      if (cft->mode)
 -              return cft->mode;
 -
        if (cft->read_u64 || cft->read_s64 || cft->seq_show)
                mode |= S_IRUGO;
  
 -      if (cft->write_u64 || cft->write_s64 || cft->write)
 -              mode |= S_IWUSR;
 +      if (cft->write_u64 || cft->write_s64 || cft->write) {
 +              if (cft->flags & CFTYPE_WORLD_WRITABLE)
 +                      mode |= S_IWUGO;
 +              else
 +                      mode |= S_IWUSR;
 +      }
  
        return mode;
  }
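With cft->mode gone, the mode is derived solely from which handlers a cftype provides plus the new CFTYPE_WORLD_WRITABLE flag. A standalone sketch of that derivation; the pared-down cftype, the flag value and which files carry the flag are all illustrative here:

#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

/* Kernel-style aggregate mode bits (userspace <sys/stat.h> lacks them). */
#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)
#define S_IWUGO (S_IWUSR | S_IWGRP | S_IWOTH)

#define CFTYPE_WORLD_WRITABLE (1U << 0)         /* illustrative flag value */

struct cftype {
        const char *name;
        unsigned int flags;
        bool has_read;          /* stands in for read_u64/read_s64/seq_show */
        bool has_write;         /* stands in for write_u64/write_s64/write */
};

static unsigned int cgroup_file_mode(const struct cftype *cft)
{
        unsigned int mode = 0;

        if (cft->has_read)
                mode |= S_IRUGO;

        if (cft->has_write) {
                if (cft->flags & CFTYPE_WORLD_WRITABLE)
                        mode |= S_IWUGO;        /* delegatable: world-writable */
                else
                        mode |= S_IWUSR;
        }
        return mode;
}

int main(void)
{
        struct cftype procs  = { "cgroup.procs",  CFTYPE_WORLD_WRITABLE, true, true };
        struct cftype events = { "cgroup.events", 0, true, false };

        printf("%s -> %04o\n", procs.name,  cgroup_file_mode(&procs));  /* 0666 */
        printf("%s -> %04o\n", events.name, cgroup_file_mode(&events)); /* 0444 */
        return 0;
}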
  
 -static void cgroup_get(struct cgroup *cgrp)
 -{
 -      WARN_ON_ONCE(cgroup_is_dead(cgrp));
 -      css_get(&cgrp->self);
 -}
 -
 -static bool cgroup_tryget(struct cgroup *cgrp)
 -{
 -      return css_tryget(&cgrp->self);
 -}
 -
 -static void cgroup_put(struct cgroup *cgrp)
 -{
 -      css_put(&cgrp->self);
 -}
 -
  /**
   * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
   * @cgrp: the target cgroup
@@@ -1397,64 -1263,28 +1397,64 @@@ static void cgroup_rm_file(struct cgrou
  }
  
  /**
 - * cgroup_clear_dir - remove subsys files in a cgroup directory
 - * @cgrp: target cgroup
 - * @subsys_mask: mask of the subsystem ids whose files should be removed
 + * css_clear_dir - remove subsys files in a cgroup directory
 + * @css: target css
 + * @cgrp_override: specify if target cgroup is different from css->cgroup
   */
 -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 +static void css_clear_dir(struct cgroup_subsys_state *css,
 +                        struct cgroup *cgrp_override)
  {
 -      struct cgroup_subsys *ss;
 -      int i;
 +      struct cgroup *cgrp = cgrp_override ?: css->cgroup;
 +      struct cftype *cfts;
  
 -      for_each_subsys(ss, i) {
 -              struct cftype *cfts;
 +      list_for_each_entry(cfts, &css->ss->cfts, node)
 +              cgroup_addrm_files(css, cgrp, cfts, false);
 +}
  
 -              if (!(subsys_mask & (1 << i)))
 -                      continue;
 -              list_for_each_entry(cfts, &ss->cfts, node)
 -                      cgroup_addrm_files(cgrp, cfts, false);
 +/**
 + * css_populate_dir - create subsys files in a cgroup directory
 + * @css: target css
 + * @cgrp_override: specify if target cgroup is different from css->cgroup
 + *
 + * On failure, no file is added.
 + */
 +static int css_populate_dir(struct cgroup_subsys_state *css,
 +                          struct cgroup *cgrp_override)
 +{
 +      struct cgroup *cgrp = cgrp_override ?: css->cgroup;
 +      struct cftype *cfts, *failed_cfts;
 +      int ret;
 +
 +      if (!css->ss) {
 +              if (cgroup_on_dfl(cgrp))
 +                      cfts = cgroup_dfl_base_files;
 +              else
 +                      cfts = cgroup_legacy_base_files;
 +
 +              return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
 +      }
 +
 +      list_for_each_entry(cfts, &css->ss->cfts, node) {
 +              ret = cgroup_addrm_files(css, cgrp, cfts, true);
 +              if (ret < 0) {
 +                      failed_cfts = cfts;
 +                      goto err;
 +              }
        }
 +      return 0;
 +err:
 +      list_for_each_entry(cfts, &css->ss->cfts, node) {
 +              if (cfts == failed_cfts)
 +                      break;
 +              cgroup_addrm_files(css, cgrp, cfts, false);
 +      }
 +      return ret;
  }
  
  static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned long ss_mask)
  {
 +      struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
        unsigned long tmp_ss_mask;
        int ssid, i, ret;
        if (dst_root == &cgrp_dfl_root)
                tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
  
 -      ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
 -      if (ret) {
 -              if (dst_root != &cgrp_dfl_root)
 -                      return ret;
 +      for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
 +              struct cgroup *scgrp = &ss->root->cgrp;
 +              int tssid;
 +
 +              ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
 +              if (!ret)
 +                      continue;
  
                /*
                 * Rebinding back to the default root is not allowed to
                 * fail.  Using both default and non-default roots should
                 * be rare.  Moving subsystems back and forth even more so.
                 * Just warn about it and continue.
                 */
 -              if (cgrp_dfl_root_visible) {
 -                      pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
 -                              ret, ss_mask);
 -                      pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
 +              if (dst_root == &cgrp_dfl_root) {
 +                      if (cgrp_dfl_root_visible) {
 +                              pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
 +                                      ret, ss_mask);
 +                              pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
 +                      }
 +                      continue;
 +              }
 +
 +              for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
 +                      if (tssid == ssid)
 +                              break;
 +                      css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
                }
 +              return ret;
        }
  
        /*
         * Nothing can fail from this point on.  Remove files for the
         * removed subsystems and rebind each subsystem.
         */
 -      for_each_subsys_which(ss, ssid, &ss_mask)
 -              cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
 -
        for_each_subsys_which(ss, ssid, &ss_mask) {
 -              struct cgroup_root *src_root;
 -              struct cgroup_subsys_state *css;
 +              struct cgroup_root *src_root = ss->root;
 +              struct cgroup *scgrp = &src_root->cgrp;
 +              struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
                struct css_set *cset;
  
 -              src_root = ss->root;
 -              css = cgroup_css(&src_root->cgrp, ss);
 +              WARN_ON(!css || cgroup_css(dcgrp, ss));
  
 -              WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
 +              css_clear_dir(css, NULL);
  
 -              RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
 -              rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
 +              RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
 +              rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
 -              css->cgroup = &dst_root->cgrp;
 +              css->cgroup = dcgrp;
  
 -              down_write(&css_set_rwsem);
 +              spin_lock_bh(&css_set_lock);
                hash_for_each(css_set_table, i, cset, hlist)
                        list_move_tail(&cset->e_cset_node[ss->id],
 -                                     &dst_root->cgrp.e_csets[ss->id]);
 -              up_write(&css_set_rwsem);
 +                                     &dcgrp->e_csets[ss->id]);
 +              spin_unlock_bh(&css_set_lock);
  
                src_root->subsys_mask &= ~(1 << ssid);
 -              src_root->cgrp.subtree_control &= ~(1 << ssid);
 -              cgroup_refresh_child_subsys_mask(&src_root->cgrp);
 +              scgrp->subtree_control &= ~(1 << ssid);
 +              cgroup_refresh_child_subsys_mask(scgrp);
  
                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
 -              if (dst_root != &cgrp_dfl_root) {
 -                      dst_root->cgrp.subtree_control |= 1 << ssid;
 -                      cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
 +              if (dst_root == &cgrp_dfl_root) {
 +                      static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
 +              } else {
 +                      dcgrp->subtree_control |= 1 << ssid;
 +                      cgroup_refresh_child_subsys_mask(dcgrp);
 +                      static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }
  
                if (ss->bind)
                        ss->bind(css);
        }
  
 -      kernfs_activate(dst_root->cgrp.kn);
 +      kernfs_activate(dcgrp->kn);
        return 0;
  }
  
@@@ -1680,7 -1497,7 +1680,7 @@@ static int parse_cgroupfs_options(char 
                for_each_subsys(ss, i) {
                        if (strcmp(token, ss->legacy_name))
                                continue;
 -                      if (ss->disabled)
 +                      if (!cgroup_ssid_enabled(i))
                                continue;
  
                        /* Mutually exclusive option 'all' + subsystem name */
         */
        if (all_ss || (!one_ss && !opts->none && !opts->name))
                for_each_subsys(ss, i)
 -                      if (!ss->disabled)
 +                      if (cgroup_ssid_enabled(i))
                                opts->subsys_mask |= (1 << i);
  
        /*
@@@ -1807,7 -1624,7 +1807,7 @@@ static void cgroup_enable_task_cg_lists
  {
        struct task_struct *p, *g;
  
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
  
        if (use_task_css_set_links)
                goto out_unlock;
                if (!(p->flags & PF_EXITING)) {
                        struct css_set *cset = task_css_set(p);
  
 -                      list_add(&p->cg_list, &cset->tasks);
 +                      if (!css_set_populated(cset))
 +                              css_set_update_populated(cset, true);
 +                      list_add_tail(&p->cg_list, &cset->tasks);
                        get_css_set(cset);
                }
                spin_unlock_irq(&p->sighand->siglock);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
  out_unlock:
 -      up_write(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  }
  
  static void init_cgroup_housekeeping(struct cgroup *cgrp)
  
        INIT_LIST_HEAD(&cgrp->self.sibling);
        INIT_LIST_HEAD(&cgrp->self.children);
 +      INIT_LIST_HEAD(&cgrp->self.files);
        INIT_LIST_HEAD(&cgrp->cset_links);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
@@@ -1894,6 -1708,7 +1894,6 @@@ static int cgroup_setup_root(struct cgr
  {
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
 -      struct cftype *base_files;
        struct css_set *cset;
        int i, ret;
  
                goto out;
  
        /*
 -       * We're accessing css_set_count without locking css_set_rwsem here,
 +       * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us. The worst that can happen is that we
         * have some link structures left over
        }
        root_cgrp->kn = root->kf_root->kn;
  
 -      if (root == &cgrp_dfl_root)
 -              base_files = cgroup_dfl_base_files;
 -      else
 -              base_files = cgroup_legacy_base_files;
 -
 -      ret = cgroup_addrm_files(root_cgrp, base_files, true);
 +      ret = css_populate_dir(&root_cgrp->self, NULL);
        if (ret)
                goto destroy_root;
  
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
 -      down_write(&css_set_rwsem);
 -      hash_for_each(css_set_table, i, cset, hlist)
 +      spin_lock_bh(&css_set_lock);
 +      hash_for_each(css_set_table, i, cset, hlist) {
                link_css_set(&tmp_links, cset, root_cgrp);
 -      up_write(&css_set_rwsem);
 +              if (css_set_populated(cset))
 +                      cgroup_update_populated(root_cgrp, true);
 +      }
 +      spin_unlock_bh(&css_set_lock);
  
        BUG_ON(!list_empty(&root_cgrp->self.children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@@ -2191,7 -2008,7 +2191,7 @@@ char *task_cgroup_path(struct task_stru
        char *path = NULL;
  
        mutex_lock(&cgroup_mutex);
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
  
        root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
  
                        path = buf;
        }
  
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
        return path;
  }
@@@ -2232,49 -2049,6 +2232,49 @@@ struct cgroup_taskset 
        struct task_struct      *cur_task;
  };
  
 +#define CGROUP_TASKSET_INIT(tset)     (struct cgroup_taskset){        \
 +      .src_csets              = LIST_HEAD_INIT(tset.src_csets),       \
 +      .dst_csets              = LIST_HEAD_INIT(tset.dst_csets),       \
 +      .csets                  = &tset.src_csets,                      \
 +}
 +
 +/**
 + * cgroup_taskset_add - try to add a migration target task to a taskset
 + * @task: target task
 + * @tset: target taskset
 + *
 + * Add @task, which is a migration target, to @tset.  This function becomes
 + * a noop if @task doesn't need to be migrated.  @task's css_set should have
 + * been added as a migration source and @task->cg_list will be moved from
 + * the css_set's tasks list to its mg_tasks list.
 + */
 +static void cgroup_taskset_add(struct task_struct *task,
 +                             struct cgroup_taskset *tset)
 +{
 +      struct css_set *cset;
 +
 +      lockdep_assert_held(&css_set_lock);
 +
 +      /* @task either already exited or can't exit until the end */
 +      if (task->flags & PF_EXITING)
 +              return;
 +
 +      /* leave @task alone if post_fork() hasn't linked it yet */
 +      if (list_empty(&task->cg_list))
 +              return;
 +
 +      cset = task_css_set(task);
 +      if (!cset->mg_src_cgrp)
 +              return;
 +
 +      list_move_tail(&task->cg_list, &cset->mg_tasks);
 +      if (list_empty(&cset->mg_node))
 +              list_add_tail(&cset->mg_node, &tset->src_csets);
 +      if (list_empty(&cset->mg_dst_cset->mg_node))
 +              list_move_tail(&cset->mg_dst_cset->mg_node,
 +                             &tset->dst_csets);
 +}
 +
  /**
   * cgroup_taskset_first - reset taskset and return the first task
   * @tset: taskset of interest
@@@ -2322,86 -2096,47 +2322,86 @@@ struct task_struct *cgroup_taskset_next
  }
  
  /**
 - * cgroup_task_migrate - move a task from one cgroup to another.
 - * @old_cgrp: the cgroup @tsk is being migrated from
 - * @tsk: the task being migrated
 - * @new_cset: the new css_set @tsk is being attached to
 + * cgroup_taskset_migrate - migrate a taskset to a cgroup
 + * @tset: target taskset
 + * @dst_cgrp: destination cgroup
   *
 - * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
 + * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
 + * ->can_attach callbacks fails and guarantees that either all or none of
 + * the tasks in @tset are migrated.  @tset is consumed regardless of
 + * success.
   */
 -static void cgroup_task_migrate(struct cgroup *old_cgrp,
 -                              struct task_struct *tsk,
 -                              struct css_set *new_cset)
 +static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 +                                struct cgroup *dst_cgrp)
  {
 -      struct css_set *old_cset;
 -
 -      lockdep_assert_held(&cgroup_mutex);
 -      lockdep_assert_held(&css_set_rwsem);
 +      struct cgroup_subsys_state *css, *failed_css = NULL;
 +      struct task_struct *task, *tmp_task;
 +      struct css_set *cset, *tmp_cset;
 +      int i, ret;
  
 -      /*
 -       * We are synchronized through threadgroup_lock() against PF_EXITING
 -       * setting such that we can't race against cgroup_exit() changing the
 -       * css_set to init_css_set and dropping the old one.
 -       */
 -      WARN_ON_ONCE(tsk->flags & PF_EXITING);
 -      old_cset = task_css_set(tsk);
 +      /* methods shouldn't be called if no task is actually migrating */
 +      if (list_empty(&tset->src_csets))
 +              return 0;
  
 -      get_css_set(new_cset);
 -      rcu_assign_pointer(tsk->cgroups, new_cset);
 +      /* check that we can legitimately attach to the cgroup */
 +      for_each_e_css(css, i, dst_cgrp) {
 +              if (css->ss->can_attach) {
 +                      ret = css->ss->can_attach(css, tset);
 +                      if (ret) {
 +                              failed_css = css;
 +                              goto out_cancel_attach;
 +                      }
 +              }
 +      }
  
        /*
 -       * Use move_tail so that cgroup_taskset_first() still returns the
 -       * leader after migration.  This works because cgroup_migrate()
 -       * ensures that the dst_cset of the leader is the first on the
 -       * tset's dst_csets list.
 +       * Now that we're guaranteed success, proceed to move all tasks to
 +       * the new cgroup.  There are no failure cases after here, so this
 +       * is the commit point.
         */
 -      list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
 +      spin_lock_bh(&css_set_lock);
 +      list_for_each_entry(cset, &tset->src_csets, mg_node) {
 +              list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
 +                      struct css_set *from_cset = task_css_set(task);
 +                      struct css_set *to_cset = cset->mg_dst_cset;
 +
 +                      get_css_set(to_cset);
 +                      css_set_move_task(task, from_cset, to_cset, true);
 +                      put_css_set_locked(from_cset);
 +              }
 +      }
 +      spin_unlock_bh(&css_set_lock);
  
        /*
 -       * We just gained a reference on old_cset by taking it from the
 -       * task. As trading it for new_cset is protected by cgroup_mutex,
 -       * we're safe to drop it here; it will be freed under RCU.
 +       * Migration is committed; all target tasks are now on dst_csets.
 +       * Nothing is sensitive to fork() after this point.  Notify
 +       * controllers that migration is complete.
         */
 -      put_css_set_locked(old_cset);
 +      tset->csets = &tset->dst_csets;
 +
 +      for_each_e_css(css, i, dst_cgrp)
 +              if (css->ss->attach)
 +                      css->ss->attach(css, tset);
 +
 +      ret = 0;
 +      goto out_release_tset;
 +
 +out_cancel_attach:
 +      for_each_e_css(css, i, dst_cgrp) {
 +              if (css == failed_css)
 +                      break;
 +              if (css->ss->cancel_attach)
 +                      css->ss->cancel_attach(css, tset);
 +      }
 +out_release_tset:
 +      spin_lock_bh(&css_set_lock);
 +      list_splice_init(&tset->dst_csets, &tset->src_csets);
 +      list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
 +              list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 +              list_del_init(&cset->mg_node);
 +      }
 +      spin_unlock_bh(&css_set_lock);
 +      return ret;
  }
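cgroup_taskset_migrate() is a two-phase operation: every controller's ->can_attach() is polled first, the task moves are committed only once all of them agree, and a veto triggers ->cancel_attach() on the controllers already polled. A generic sketch of that all-or-nothing shape with a hypothetical controller table (not the kernel's css iteration):

#include <stdio.h>

struct controller {
        const char *name;
        int (*can_attach)(void);
        void (*attach)(void);
        void (*cancel_attach)(void);
};

static int migrate(struct controller *ctrls, int nr)
{
        int i, failed = -1, ret = 0;

        /* Phase 1: everyone gets a chance to veto. */
        for (i = 0; i < nr; i++) {
                if (ctrls[i].can_attach && (ret = ctrls[i].can_attach())) {
                        failed = i;
                        goto cancel;
                }
        }

        /* Phase 2: commit point, no failures allowed past here. */
        for (i = 0; i < nr; i++)
                if (ctrls[i].attach)
                        ctrls[i].attach();
        return 0;

cancel:
        /* Only controllers polled before the failing one are notified. */
        for (i = 0; i < failed; i++)
                if (ctrls[i].cancel_attach)
                        ctrls[i].cancel_attach();
        return ret;
}

static int ok(void)    { return 0; }
static int veto(void)  { return -1; }
static void did(void)  { printf("attached\n"); }
static void undo(void) { printf("cancelled\n"); }

int main(void)
{
        struct controller good[] = { { "cpu", ok, did, undo }, { "memory", ok,   did, undo } };
        struct controller bad[]  = { { "cpu", ok, did, undo }, { "memory", veto, did, undo } };

        printf("all agree:  %d\n", migrate(good, 2));   /* attaches both */
        printf("one vetoes: %d\n", migrate(bad, 2));    /* cancels "cpu" only */
        return 0;
}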
  
  /**
@@@ -2417,14 -2152,14 +2417,14 @@@ static void cgroup_migrate_finish(struc
  
        lockdep_assert_held(&cgroup_mutex);
  
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
                cset->mg_src_cgrp = NULL;
                cset->mg_dst_cset = NULL;
                list_del_init(&cset->mg_preload_node);
                put_css_set_locked(cset);
        }
 -      up_write(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  }
  
  /**
   * @src_cset and add it to @preloaded_csets, which should later be cleaned
   * up by cgroup_migrate_finish().
   *
 - * This function may be called without holding threadgroup_lock even if the
 - * target is a process.  Threads may be created and destroyed but as long
 - * as cgroup_mutex is not dropped, no new css_set can be put into play and
 - * the preloaded css_sets are guaranteed to cover all migrations.
 + * This function may be called without holding cgroup_threadgroup_rwsem
 + * even if the target is a process.  Threads may be created and destroyed
 + * but as long as cgroup_mutex is not dropped, no new css_set can be put
 + * into play and the preloaded css_sets are guaranteed to cover all
 + * migrations.
   */
  static void cgroup_migrate_add_src(struct css_set *src_cset,
                                   struct cgroup *dst_cgrp,
        struct cgroup *src_cgrp;
  
        lockdep_assert_held(&cgroup_mutex);
 -      lockdep_assert_held(&css_set_rwsem);
 +      lockdep_assert_held(&css_set_lock);
  
        src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
  
  
  /**
   * cgroup_migrate - migrate a process or task to a cgroup
 - * @cgrp: the destination cgroup
   * @leader: the leader of the process or the task to migrate
   * @threadgroup: whether @leader points to the whole process or a single task
 + * @cgrp: the destination cgroup
   *
   * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
 - * process, the caller must be holding threadgroup_lock of @leader.  The
 + * process, the caller must be holding cgroup_threadgroup_rwsem.  The
   * caller is also responsible for invoking cgroup_migrate_add_src() and
   * cgroup_migrate_prepare_dst() on the targets before invoking this
   * function and following up with cgroup_migrate_finish().
   * decided for all targets by invoking group_migrate_prepare_dst() before
   * actually starting migrating.
   */
 -static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
 -                        bool threadgroup)
 -{
 -      struct cgroup_taskset tset = {
 -              .src_csets      = LIST_HEAD_INIT(tset.src_csets),
 -              .dst_csets      = LIST_HEAD_INIT(tset.dst_csets),
 -              .csets          = &tset.src_csets,
 -      };
 -      struct cgroup_subsys_state *css, *failed_css = NULL;
 -      struct css_set *cset, *tmp_cset;
 -      struct task_struct *task, *tmp_task;
 -      int i, ret;
 +static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 +                        struct cgroup *cgrp)
 +{
 +      struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 +      struct task_struct *task;
  
        /*
         * Prevent freeing of tasks while we take a snapshot. Tasks that are
         * already PF_EXITING could be freed from underneath us unless we
         * take an rcu_read_lock.
         */
 -      down_write(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        task = leader;
        do {
 -              /* @task either already exited or can't exit until the end */
 -              if (task->flags & PF_EXITING)
 -                      goto next;
 -
 -              /* leave @task alone if post_fork() hasn't linked it yet */
 -              if (list_empty(&task->cg_list))
 -                      goto next;
 -
 -              cset = task_css_set(task);
 -              if (!cset->mg_src_cgrp)
 -                      goto next;
 -
 -              /*
 -               * cgroup_taskset_first() must always return the leader.
 -               * Take care to avoid disturbing the ordering.
 -               */
 -              list_move_tail(&task->cg_list, &cset->mg_tasks);
 -              if (list_empty(&cset->mg_node))
 -                      list_add_tail(&cset->mg_node, &tset.src_csets);
 -              if (list_empty(&cset->mg_dst_cset->mg_node))
 -                      list_move_tail(&cset->mg_dst_cset->mg_node,
 -                                     &tset.dst_csets);
 -      next:
 +              cgroup_taskset_add(task, &tset);
                if (!threadgroup)
                        break;
        } while_each_thread(leader, task);
        rcu_read_unlock();
 -      up_write(&css_set_rwsem);
 -
 -      /* methods shouldn't be called if no task is actually migrating */
 -      if (list_empty(&tset.src_csets))
 -              return 0;
 -
 -      /* check that we can legitimately attach to the cgroup */
 -      for_each_e_css(css, i, cgrp) {
 -              if (css->ss->can_attach) {
 -                      ret = css->ss->can_attach(css, &tset);
 -                      if (ret) {
 -                              failed_css = css;
 -                              goto out_cancel_attach;
 -                      }
 -              }
 -      }
 -
 -      /*
 -       * Now that we're guaranteed success, proceed to move all tasks to
 -       * the new cgroup.  There are no failure cases after here, so this
 -       * is the commit point.
 -       */
 -      down_write(&css_set_rwsem);
 -      list_for_each_entry(cset, &tset.src_csets, mg_node) {
 -              list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
 -                      cgroup_task_migrate(cset->mg_src_cgrp, task,
 -                                          cset->mg_dst_cset);
 -      }
 -      up_write(&css_set_rwsem);
 -
 -      /*
 -       * Migration is committed, all target tasks are now on dst_csets.
 -       * Nothing is sensitive to fork() after this point.  Notify
 -       * controllers that migration is complete.
 -       */
 -      tset.csets = &tset.dst_csets;
 -
 -      for_each_e_css(css, i, cgrp)
 -              if (css->ss->attach)
 -                      css->ss->attach(css, &tset);
 -
 -      ret = 0;
 -      goto out_release_tset;
 +      spin_unlock_bh(&css_set_lock);
  
 -out_cancel_attach:
 -      for_each_e_css(css, i, cgrp) {
 -              if (css == failed_css)
 -                      break;
 -              if (css->ss->cancel_attach)
 -                      css->ss->cancel_attach(css, &tset);
 -      }
 -out_release_tset:
 -      down_write(&css_set_rwsem);
 -      list_splice_init(&tset.dst_csets, &tset.src_csets);
 -      list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
 -              list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 -              list_del_init(&cset->mg_node);
 -      }
 -      up_write(&css_set_rwsem);
 -      return ret;
 +      return cgroup_taskset_migrate(&tset, cgrp);
  }
  
  /**
   * @leader: the task or the leader of the threadgroup to be attached
   * @threadgroup: attach the whole threadgroup?
   *
 - * Call holding cgroup_mutex and threadgroup_lock of @leader.
 + * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
   */
  static int cgroup_attach_task(struct cgroup *dst_cgrp,
                              struct task_struct *leader, bool threadgroup)
        int ret;
  
        /* look up all src csets */
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        task = leader;
        do {
                        break;
        } while_each_thread(leader, task);
        rcu_read_unlock();
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        /* prepare dst csets and commit */
        ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
        if (!ret)
 -              ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
 +              ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
  
        cgroup_migrate_finish(&preloaded_csets);
        return ret;
@@@ -2639,15 -2459,15 +2639,15 @@@ static int cgroup_procs_write_permissio
                struct cgroup *cgrp;
                struct inode *inode;
  
 -              down_read(&css_set_rwsem);
 +              spin_lock_bh(&css_set_lock);
                cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
 -              up_read(&css_set_rwsem);
 +              spin_unlock_bh(&css_set_lock);
  
                while (!cgroup_is_descendant(dst_cgrp, cgrp))
                        cgrp = cgroup_parent(cgrp);
  
                ret = -ENOMEM;
 -              inode = kernfs_get_inode(sb, cgrp->procs_kn);
 +              inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
                if (inode) {
                        ret = inode_permission(inode, MAY_WRITE);
                        iput(inode);
@@@ -2678,13 -2498,14 +2678,13 @@@ static ssize_t __cgroup_procs_write(str
        if (!cgrp)
                return -ENODEV;
  
 -retry_find_task:
 +      percpu_down_write(&cgroup_threadgroup_rwsem);
        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
 -                      rcu_read_unlock();
                        ret = -ESRCH;
 -                      goto out_unlock_cgroup;
 +                      goto out_unlock_rcu;
                }
        } else {
                tsk = current;
         */
        if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
                ret = -EINVAL;
 -              rcu_read_unlock();
 -              goto out_unlock_cgroup;
 +              goto out_unlock_rcu;
        }
  
        get_task_struct(tsk);
        rcu_read_unlock();
  
 -      threadgroup_lock(tsk);
 -      if (threadgroup) {
 -              if (!thread_group_leader(tsk)) {
 -                      /*
 -                       * a race with de_thread from another thread's exec()
 -                       * may strip us of our leadership, if this happens,
 -                       * there is no choice but to throw this task away and
 -                       * try again; this is
 -                       * "double-double-toil-and-trouble-check locking".
 -                       */
 -                      threadgroup_unlock(tsk);
 -                      put_task_struct(tsk);
 -                      goto retry_find_task;
 -              }
 -      }
 -
        ret = cgroup_procs_write_permission(tsk, cgrp, of);
        if (!ret)
                ret = cgroup_attach_task(cgrp, tsk, threadgroup);
  
 -      threadgroup_unlock(tsk);
 -
        put_task_struct(tsk);
 -out_unlock_cgroup:
 +      goto out_unlock_threadgroup;
 +
 +out_unlock_rcu:
 +      rcu_read_unlock();
 +out_unlock_threadgroup:
 +      percpu_up_write(&cgroup_threadgroup_rwsem);
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
  }
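The rewritten write path takes cgroup_threadgroup_rwsem for writing up front, which makes the old "retry if de_thread() stole the leadership" dance unnecessary, and unwinds through goto labels instead of duplicating unlock calls. A user-space sketch of that shape, with a pthread rwlock standing in for the percpu rwsem and invented helpers (build with -lpthread):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t threadgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER;

static int lookup_task(int pid)
{
        return pid == 42 ? 0 : -ESRCH;  /* pretend only pid 42 exists */
}

/* Hold the writer side for the whole attach; unwind through one label. */
static int procs_write(int pid)
{
        int ret;

        pthread_rwlock_wrlock(&threadgroup_rwsem);

        ret = lookup_task(pid);
        if (ret)
                goto out_unlock;

        printf("attaching pid %d\n", pid);      /* the real work goes here */

out_unlock:
        pthread_rwlock_unlock(&threadgroup_rwsem);
        return ret;
}

int main(void)
{
        printf("pid 42: %d\n", procs_write(42));        /* 0 */
        printf("pid 7:  %d\n", procs_write(7));         /* -ESRCH */
        return 0;
}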
@@@ -2738,9 -2573,9 +2738,9 @@@ int cgroup_attach_task_all(struct task_
                if (root == &cgrp_dfl_root)
                        continue;
  
 -              down_read(&css_set_rwsem);
 +              spin_lock_bh(&css_set_lock);
                from_cgrp = task_cgroup_from_root(from, root);
 -              up_read(&css_set_rwsem);
 +              spin_unlock_bh(&css_set_lock);
  
                retval = cgroup_attach_task(from_cgrp, tsk, false);
                if (retval)
@@@ -2855,17 -2690,14 +2855,17 @@@ static int cgroup_subtree_control_show(
  static int cgroup_update_dfl_csses(struct cgroup *cgrp)
  {
        LIST_HEAD(preloaded_csets);
 +      struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
        struct cgroup_subsys_state *css;
        struct css_set *src_cset;
        int ret;
  
        lockdep_assert_held(&cgroup_mutex);
  
 +      percpu_down_write(&cgroup_threadgroup_rwsem);
 +
        /* look up all csses currently attached to @cgrp's subtree */
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
                struct cgrp_cset_link *link;
  
                        cgroup_migrate_add_src(link->cset, cgrp,
                                               &preloaded_csets);
        }
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        /* NULL dst indicates self on default hierarchy */
        ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
        if (ret)
                goto out_finish;
  
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
 -              struct task_struct *last_task = NULL, *task;
 +              struct task_struct *task, *ntask;
  
                /* src_csets precede dst_csets, break on the first dst_cset */
                if (!src_cset->mg_src_cgrp)
                        break;
  
 -              /*
 -               * All tasks in src_cset need to be migrated to the
 -               * matching dst_cset.  Empty it process by process.  We
 -               * walk tasks but migrate processes.  The leader might even
 -               * belong to a different cset but such src_cset would also
 -               * be among the target src_csets because the default
 -               * hierarchy enforces per-process membership.
 -               */
 -              while (true) {
 -                      down_read(&css_set_rwsem);
 -                      task = list_first_entry_or_null(&src_cset->tasks,
 -                                              struct task_struct, cg_list);
 -                      if (task) {
 -                              task = task->group_leader;
 -                              WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
 -                              get_task_struct(task);
 -                      }
 -                      up_read(&css_set_rwsem);
 -
 -                      if (!task)
 -                              break;
 -
 -                      /* guard against possible infinite loop */
 -                      if (WARN(last_task == task,
 -                               "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
 -                              goto out_finish;
 -                      last_task = task;
 -
 -                      threadgroup_lock(task);
 -                      /* raced against de_thread() from another thread? */
 -                      if (!thread_group_leader(task)) {
 -                              threadgroup_unlock(task);
 -                              put_task_struct(task);
 -                              continue;
 -                      }
 -
 -                      ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 -
 -                      threadgroup_unlock(task);
 -                      put_task_struct(task);
 -
 -                      if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
 -                              goto out_finish;
 -              }
 +              /* all tasks in src_csets need to be migrated */
 +              list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
 +                      cgroup_taskset_add(task, &tset);
        }
 +      spin_unlock_bh(&css_set_lock);
  
 +      ret = cgroup_taskset_migrate(&tset, cgrp);
  out_finish:
        cgroup_migrate_finish(&preloaded_csets);
 +      percpu_up_write(&cgroup_threadgroup_rwsem);
        return ret;
  }
  
@@@ -2928,8 -2797,7 +2928,8 @@@ static ssize_t cgroup_subtree_control_w
                if (tok[0] == '\0')
                        continue;
                for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
 -                      if (ss->disabled || strcmp(tok + 1, ss->name))
 +                      if (!cgroup_ssid_enabled(ssid) ||
 +                          strcmp(tok + 1, ss->name))
                                continue;
  
                        if (*tok == '+') {
                                ret = create_css(child, ss,
                                        cgrp->subtree_control & (1 << ssid));
                        else
 -                              ret = cgroup_populate_dir(child, 1 << ssid);
 +                              ret = css_populate_dir(cgroup_css(child, ss),
 +                                                     NULL);
                        if (ret)
                                goto err_undo_css;
                }
                        if (css_disable & (1 << ssid)) {
                                kill_css(css);
                        } else {
 -                              cgroup_clear_dir(child, 1 << ssid);
 +                              css_clear_dir(css, NULL);
                                if (ss->css_reset)
                                        ss->css_reset(css);
                        }
@@@ -3135,16 -3002,15 +3135,16 @@@ err_undo_css
                        if (css_enable & (1 << ssid))
                                kill_css(css);
                        else
 -                              cgroup_clear_dir(child, 1 << ssid);
 +                              css_clear_dir(css, NULL);
                }
        }
        goto out_unlock;
  }
  
 -static int cgroup_populated_show(struct seq_file *seq, void *v)
 +static int cgroup_events_show(struct seq_file *seq, void *v)
  {
 -      seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
 +      seq_printf(seq, "populated %d\n",
 +                 cgroup_is_populated(seq_css(seq)->cgroup));
        return 0;
  }
  
@@@ -3287,8 -3153,7 +3287,8 @@@ static int cgroup_kn_set_ugid(struct ke
        return kernfs_setattr(kn, &iattr);
  }
  
 -static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 +static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
 +                         struct cftype *cft)
  {
        char name[CGROUP_FILE_NAME_MAX];
        struct kernfs_node *kn;
                return ret;
        }
  
 -      if (cft->write == cgroup_procs_write)
 -              cgrp->procs_kn = kn;
 -      else if (cft->seq_show == cgroup_populated_show)
 -              cgrp->populated_kn = kn;
 +      if (cft->file_offset) {
 +              struct cgroup_file *cfile = (void *)css + cft->file_offset;
 +
 +              kernfs_get(kn);
 +              cfile->kn = kn;
 +              list_add(&cfile->node, &css->files);
 +      }
 +
        return 0;
  }
  
  /**
   * cgroup_addrm_files - add or remove files to a cgroup directory
 - * @cgrp: the target cgroup
 + * @css: the target css
 + * @cgrp: the target cgroup (usually css->cgroup)
   * @cfts: array of cftypes to be added
   * @is_add: whether to add or remove
   *
   * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 - * For removals, this function never fails.  If addition fails, this
 - * function doesn't remove files already added.  The caller is responsible
 - * for cleaning up.
 + * For removals, this function never fails.
   */
 -static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 +static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 +                            struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add)
  {
 -      struct cftype *cft;
 +      struct cftype *cft, *cft_end = NULL;
        int ret;
  
        lockdep_assert_held(&cgroup_mutex);
  
 -      for (cft = cfts; cft->name[0] != '\0'; cft++) {
 +restart:
 +      for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
                if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
                        continue;
                        continue;
  
                if (is_add) {
 -                      ret = cgroup_add_file(cgrp, cft);
 +                      ret = cgroup_add_file(css, cgrp, cft);
                        if (ret) {
                                pr_warn("%s: failed to add %s, err=%d\n",
                                        __func__, cft->name, ret);
 -                              return ret;
 +                              cft_end = cft;
 +                              is_add = false;
 +                              goto restart;
                        }
                } else {
                        cgroup_rm_file(cgrp, cft);
@@@ -3385,7 -3243,7 +3385,7 @@@ static int cgroup_apply_cftypes(struct 
                if (cgroup_is_dead(cgrp))
                        continue;
  
 -              ret = cgroup_addrm_files(cgrp, cfts, is_add);
 +              ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
                if (ret)
                        break;
        }
@@@ -3497,7 -3355,7 +3497,7 @@@ static int cgroup_add_cftypes(struct cg
  {
        int ret;
  
 -      if (ss->disabled)
 +      if (!cgroup_ssid_enabled(ss->id))
                return 0;
  
        if (!cfts || cfts[0].name[0] == '\0')
@@@ -3547,8 -3405,17 +3547,8 @@@ int cgroup_add_legacy_cftypes(struct cg
  {
        struct cftype *cft;
  
 -      /*
 -       * If legacy_flies_on_dfl, we want to show the legacy files on the
 -       * dfl hierarchy but iff the target subsystem hasn't been updated
 -       * for the dfl hierarchy yet.
 -       */
 -      if (!cgroup_legacy_files_on_dfl ||
 -          ss->dfl_cftypes != ss->legacy_cftypes) {
 -              for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
 -                      cft->flags |= __CFTYPE_NOT_ON_DFL;
 -      }
 -
 +      for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
 +              cft->flags |= __CFTYPE_NOT_ON_DFL;
        return cgroup_add_cftypes(ss, cfts);
  }
  
@@@ -3563,10 -3430,10 +3563,10 @@@ static int cgroup_task_count(const stru
        int count = 0;
        struct cgrp_cset_link *link;
  
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                count += atomic_read(&link->cset->refcount);
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        return count;
  }
  
@@@ -3798,25 -3665,22 +3798,25 @@@ bool css_has_online_children(struct cgr
  }
  
  /**
 - * css_advance_task_iter - advance a task iterator to the next css_set
 + * css_task_iter_advance_css_set - advance a task iterator to the next css_set
   * @it: the iterator to advance
   *
   * Advance @it to the next css_set to walk.
   */
 -static void css_advance_task_iter(struct css_task_iter *it)
 +static void css_task_iter_advance_css_set(struct css_task_iter *it)
  {
        struct list_head *l = it->cset_pos;
        struct cgrp_cset_link *link;
        struct css_set *cset;
  
 +      lockdep_assert_held(&css_set_lock);
 +
        /* Advance to the next non-empty css_set */
        do {
                l = l->next;
                if (l == it->cset_head) {
                        it->cset_pos = NULL;
 +                      it->task_pos = NULL;
                        return;
                }
  
                        link = list_entry(l, struct cgrp_cset_link, cset_link);
                        cset = link->cset;
                }
 -      } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
 +      } while (!css_set_populated(cset));
  
        it->cset_pos = l;
  
  
        it->tasks_head = &cset->tasks;
        it->mg_tasks_head = &cset->mg_tasks;
 +
 +      /*
 +       * We don't keep css_sets locked across iteration steps and thus
 +       * need to take steps to ensure that iteration can be resumed after
 +       * the lock is re-acquired.  Iteration is performed at two levels -
 +       * css_sets and tasks in them.
 +       *
 +       * Once created, a css_set never leaves its cgroup lists, so a
 +       * pinned css_set is guaranteed to stay put and we can resume
 +       * iteration afterwards.
 +       *
 +       * Tasks may leave @cset across iteration steps.  This is resolved
 +       * by registering each iterator with the css_set currently being
 +       * walked and making css_set_move_task() advance iterators whose
 +       * next task is leaving.
 +       */
 +      if (it->cur_cset) {
 +              list_del(&it->iters_node);
 +              put_css_set_locked(it->cur_cset);
 +      }
 +      get_css_set(cset);
 +      it->cur_cset = cset;
 +      list_add(&it->iters_node, &cset->task_iters);
 +}
 +
 +static void css_task_iter_advance(struct css_task_iter *it)
 +{
 +      struct list_head *l = it->task_pos;
 +
 +      lockdep_assert_held(&css_set_lock);
 +      WARN_ON_ONCE(!l);
 +
 +      /*
 +       * Advance iterator to find next entry.  cset->tasks is consumed
 +       * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
 +       * next cset.
 +       */
 +      l = l->next;
 +
 +      if (l == it->tasks_head)
 +              l = it->mg_tasks_head->next;
 +
 +      if (l == it->mg_tasks_head)
 +              css_task_iter_advance_css_set(it);
 +      else
 +              it->task_pos = l;
  }
  
  /**
   * css_task_iter_next() to walk through the tasks until the function
   * returns NULL.  On completion of iteration, css_task_iter_end() must be
   * called.
 - *
 - * Note that this function acquires a lock which is released when the
 - * iteration finishes.  The caller can't sleep while iteration is in
 - * progress.
   */
  void css_task_iter_start(struct cgroup_subsys_state *css,
                         struct css_task_iter *it)
 -      __acquires(css_set_rwsem)
  {
        /* no one should try to iterate before mounting cgroups */
        WARN_ON_ONCE(!use_task_css_set_links);
  
 -      down_read(&css_set_rwsem);
 +      memset(it, 0, sizeof(*it));
 +
 +      spin_lock_bh(&css_set_lock);
  
        it->ss = css->ss;
  
  
        it->cset_head = it->cset_pos;
  
 -      css_advance_task_iter(it);
 +      css_task_iter_advance_css_set(it);
 +
 +      spin_unlock_bh(&css_set_lock);
  }
  
  /**
   */
  struct task_struct *css_task_iter_next(struct css_task_iter *it)
  {
 -      struct task_struct *res;
 -      struct list_head *l = it->task_pos;
 +      if (it->cur_task) {
 +              put_task_struct(it->cur_task);
 +              it->cur_task = NULL;
 +      }
  
 -      /* If the iterator cg is NULL, we have no tasks */
 -      if (!it->cset_pos)
 -              return NULL;
 -      res = list_entry(l, struct task_struct, cg_list);
 +      spin_lock_bh(&css_set_lock);
  
 -      /*
 -       * Advance iterator to find next entry.  cset->tasks is consumed
 -       * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
 -       * next cset.
 -       */
 -      l = l->next;
 +      if (it->task_pos) {
 +              it->cur_task = list_entry(it->task_pos, struct task_struct,
 +                                        cg_list);
 +              get_task_struct(it->cur_task);
 +              css_task_iter_advance(it);
 +      }
  
 -      if (l == it->tasks_head)
 -              l = it->mg_tasks_head->next;
 +      spin_unlock_bh(&css_set_lock);
  
 -      if (l == it->mg_tasks_head)
 -              css_advance_task_iter(it);
 -      else
 -              it->task_pos = l;
 -
 -      return res;
 +      return it->cur_task;
  }
  
  /**
   * Finish task iteration started by css_task_iter_start().
   */
  void css_task_iter_end(struct css_task_iter *it)
 -      __releases(css_set_rwsem)
  {
 -      up_read(&css_set_rwsem);
 +      if (it->cur_cset) {
 +              spin_lock_bh(&css_set_lock);
 +              list_del(&it->iters_node);
 +              put_css_set_locked(it->cur_cset);
 +              spin_unlock_bh(&css_set_lock);
 +      }
 +
 +      if (it->cur_task)
 +              put_task_struct(it->cur_task);
  }
  
  /**
@@@ -3990,10 -3809,10 +3990,10 @@@ int cgroup_transfer_tasks(struct cgrou
        mutex_lock(&cgroup_mutex);
  
        /* all tasks in @from are being moved, all csets are source */
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &from->cset_links, cset_link)
                cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
  
        ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
        if (ret)
                css_task_iter_end(&it);
  
                if (task) {
 -                      ret = cgroup_migrate(to, task, false);
 +                      ret = cgroup_migrate(task, false, to);
                        put_task_struct(task);
                }
        } while (task && !ret);
@@@ -4508,13 -4327,13 +4508,13 @@@ static int cgroup_clone_children_write(
  static struct cftype cgroup_dfl_base_files[] = {
        {
                .name = "cgroup.procs",
 +              .file_offset = offsetof(struct cgroup, procs_file),
                .seq_start = cgroup_pidlist_start,
                .seq_next = cgroup_pidlist_next,
                .seq_stop = cgroup_pidlist_stop,
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_PROCS,
                .write = cgroup_procs_write,
 -              .mode = S_IRUGO | S_IWUSR,
        },
        {
                .name = "cgroup.controllers",
                .write = cgroup_subtree_control_write,
        },
        {
 -              .name = "cgroup.populated",
 +              .name = "cgroup.events",
                .flags = CFTYPE_NOT_ON_ROOT,
 -              .seq_show = cgroup_populated_show,
 +              .file_offset = offsetof(struct cgroup, events_file),
 +              .seq_show = cgroup_events_show,
        },
        { }     /* terminate */
  };
@@@ -4550,6 -4368,7 +4550,6 @@@ static struct cftype cgroup_legacy_base
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_PROCS,
                .write = cgroup_procs_write,
 -              .mode = S_IRUGO | S_IWUSR,
        },
        {
                .name = "cgroup.clone_children",
                .seq_show = cgroup_pidlist_show,
                .private = CGROUP_FILE_TASKS,
                .write = cgroup_tasks_write,
 -              .mode = S_IRUGO | S_IWUSR,
        },
        {
                .name = "notify_on_release",
        { }     /* terminate */
  };
  
 -/**
 - * cgroup_populate_dir - create subsys files in a cgroup directory
 - * @cgrp: target cgroup
 - * @subsys_mask: mask of the subsystem ids whose files should be added
 - *
 - * On failure, no file is added.
 - */
 -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 -{
 -      struct cgroup_subsys *ss;
 -      int i, ret = 0;
 -
 -      /* process cftsets of each subsystem */
 -      for_each_subsys(ss, i) {
 -              struct cftype *cfts;
 -
 -              if (!(subsys_mask & (1 << i)))
 -                      continue;
 -
 -              list_for_each_entry(cfts, &ss->cfts, node) {
 -                      ret = cgroup_addrm_files(cgrp, cfts, true);
 -                      if (ret < 0)
 -                              goto err;
 -              }
 -      }
 -      return 0;
 -err:
 -      cgroup_clear_dir(cgrp, subsys_mask);
 -      return ret;
 -}
 -
  /*
   * css destruction is four-stage process.
   *
@@@ -4613,13 -4464,9 +4613,13 @@@ static void css_free_work_fn(struct wor
                container_of(work, struct cgroup_subsys_state, destroy_work);
        struct cgroup_subsys *ss = css->ss;
        struct cgroup *cgrp = css->cgroup;
 +      struct cgroup_file *cfile;
  
        percpu_ref_exit(&css->refcnt);
  
 +      list_for_each_entry(cfile, &css->files, node)
 +              kernfs_put(cfile->kn);
 +
        if (ss) {
                /* css free path */
                int id = css->id;
@@@ -4724,7 -4571,6 +4724,7 @@@ static void init_and_link_css(struct cg
        css->ss = ss;
        INIT_LIST_HEAD(&css->sibling);
        INIT_LIST_HEAD(&css->children);
 +      INIT_LIST_HEAD(&css->files);
        css->serial_nr = css_serial_nr_next++;
  
        if (cgroup_parent(cgrp)) {
@@@ -4807,7 -4653,7 +4807,7 @@@ static int create_css(struct cgroup *cg
        css->id = err;
  
        if (visible) {
 -              err = cgroup_populate_dir(cgrp, 1 << ss->id);
 +              err = css_populate_dir(css, NULL);
                if (err)
                        goto err_free_id;
        }
  
  err_list_del:
        list_del_rcu(&css->sibling);
 -      cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 +      css_clear_dir(css, NULL);
  err_free_id:
        cgroup_idr_remove(&ss->css_idr, css->id);
  err_free_percpu_ref:
@@@ -4850,6 -4696,7 +4850,6 @@@ static int cgroup_mkdir(struct kernfs_n
        struct cgroup_root *root;
        struct cgroup_subsys *ss;
        struct kernfs_node *kn;
 -      struct cftype *base_files;
        int ssid, ret;
  
        /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
        if (ret)
                goto out_destroy;
  
 -      if (cgroup_on_dfl(cgrp))
 -              base_files = cgroup_dfl_base_files;
 -      else
 -              base_files = cgroup_legacy_base_files;
 -
 -      ret = cgroup_addrm_files(cgrp, base_files, true);
 +      ret = css_populate_dir(&cgrp->self, NULL);
        if (ret)
                goto out_destroy;
  
@@@ -5012,7 -4864,7 +5012,7 @@@ static void kill_css(struct cgroup_subs
         * This must happen before css is disassociated with its cgroup.
         * See seq_css() for details.
         */
 -      cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 +      css_clear_dir(css, NULL);
  
        /*
         * Killing would put the base ref, but we need to keep it alive
@@@ -5061,15 -4913,19 +5061,15 @@@ static int cgroup_destroy_locked(struc
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
        struct cgroup_subsys_state *css;
 -      bool empty;
        int ssid;
  
        lockdep_assert_held(&cgroup_mutex);
  
        /*
 -       * css_set_rwsem synchronizes access to ->cset_links and prevents
 -       * @cgrp from being removed while put_css_set() is in progress.
 +       * Only migration can raise populated from zero and we're already
 +       * holding cgroup_mutex.
         */
 -      down_read(&css_set_rwsem);
 -      empty = list_empty(&cgrp->cset_links);
 -      up_read(&css_set_rwsem);
 -      if (!empty)
 +      if (cgroup_is_populated(cgrp))
                return -EBUSY;
  
        /*
@@@ -5167,7 -5023,6 +5167,7 @@@ static void __init cgroup_init_subsys(s
  
        have_fork_callback |= (bool)ss->fork << ss->id;
        have_exit_callback |= (bool)ss->exit << ss->id;
 +      have_free_callback |= (bool)ss->free << ss->id;
        have_canfork_callback |= (bool)ss->can_fork << ss->id;
  
        /* At system boot, before all subsystems have been
@@@ -5216,8 -5071,6 +5216,8 @@@ int __init cgroup_init_early(void
        return 0;
  }
  
 +static unsigned long cgroup_disable_mask __initdata;
 +
  /**
   * cgroup_init - cgroup initialization
   *
@@@ -5228,9 -5081,8 +5228,9 @@@ int __init cgroup_init(void
  {
        struct cgroup_subsys *ss;
        unsigned long key;
 -      int ssid, err;
 +      int ssid;
  
 +      BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
  
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
 -              if (ss->disabled)
 +              if (cgroup_disable_mask & (1 << ssid)) {
 +                      static_branch_disable(cgroup_subsys_enabled_key[ssid]);
 +                      printk(KERN_INFO "Disabling %s control group subsystem\n",
 +                             ss->name);
                        continue;
 +              }
  
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
  
 -              if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
 -                      ss->dfl_cftypes = ss->legacy_cftypes;
 -
                if (!ss->dfl_cftypes)
                        cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
  
                        ss->bind(init_css_set.subsys[ssid]);
        }
  
 -      err = sysfs_create_mount_point(fs_kobj, "cgroup");
 -      if (err)
 -              return err;
 +      WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
 +      WARN_ON(register_filesystem(&cgroup_fs_type));
 +      WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
  
 -      err = register_filesystem(&cgroup_fs_type);
 -      if (err < 0) {
 -              sysfs_remove_mount_point(fs_kobj, "cgroup");
 -              return err;
 -      }
 -
 -      proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
        return 0;
  }
  
@@@ -5337,7 -5195,7 +5337,7 @@@ int proc_cgroup_show(struct seq_file *m
                goto out;
  
        mutex_lock(&cgroup_mutex);
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
  
        for_each_root(root) {
                struct cgroup_subsys *ss;
                        seq_printf(m, "%sname=%s", count ? "," : "",
                                   root->name);
                seq_putc(m, ':');
 +
                cgrp = task_cgroup_from_root(tsk, root);
 -              path = cgroup_path(cgrp, buf, PATH_MAX);
 -              if (!path) {
 -                      retval = -ENAMETOOLONG;
 -                      goto out_unlock;
 +
 +              /*
 +               * On traditional hierarchies, all zombie tasks show up as
 +               * belonging to the root cgroup.  On the default hierarchy,
 +               * while a zombie doesn't show up in "cgroup.procs" and
 +               * thus can't be migrated, its /proc/PID/cgroup keeps
 +               * reporting the cgroup it belonged to before exiting.  If
 +               * the cgroup is removed before the zombie is reaped,
 +               * " (deleted)" is appended to the cgroup path.
 +               */
 +              if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
 +                      path = cgroup_path(cgrp, buf, PATH_MAX);
 +                      if (!path) {
 +                              retval = -ENAMETOOLONG;
 +                              goto out_unlock;
 +                      }
 +              } else {
 +                      path = "/";
                }
 +
                seq_puts(m, path);
 -              seq_putc(m, '\n');
 +
 +              if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
 +                      seq_puts(m, " (deleted)\n");
 +              else
 +                      seq_putc(m, '\n');
        }
  
        retval = 0;
  out_unlock:
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
        kfree(buf);
  out:
@@@ -5413,8 -5251,7 +5413,8 @@@ static int proc_cgroupstats_show(struc
        for_each_subsys(ss, i)
                seq_printf(m, "%s\t%d\t%d\t%d\n",
                           ss->legacy_name, ss->root->hierarchy_id,
 -                         atomic_read(&ss->root->nr_cgrps), !ss->disabled);
 +                         atomic_read(&ss->root->nr_cgrps),
 +                         cgroup_ssid_enabled(i));
  
        mutex_unlock(&cgroup_mutex);
        return 0;
@@@ -5535,7 -5372,7 +5535,7 @@@ void cgroup_post_fork(struct task_struc
         * @child during its iteration.
         *
         * If we won the race, @child is associated with %current's
 -       * css_set.  Grabbing css_set_rwsem guarantees both that the
 +       * css_set.  Grabbing css_set_lock guarantees both that the
         * association is stable, and, on completion of the parent's
         * migration, @child is visible in the source of migration or
         * already in the destination cgroup.  This guarantee is necessary
        if (use_task_css_set_links) {
                struct css_set *cset;
  
 -              down_write(&css_set_rwsem);
 +              spin_lock_bh(&css_set_lock);
                cset = task_css_set(current);
                if (list_empty(&child->cg_list)) {
 -                      rcu_assign_pointer(child->cgroups, cset);
 -                      list_add(&child->cg_list, &cset->tasks);
                        get_css_set(cset);
 +                      css_set_move_task(child, NULL, cset, false);
                }
 -              up_write(&css_set_rwsem);
 +              spin_unlock_bh(&css_set_lock);
        }
  
        /*
@@@ -5591,42 -5429,39 +5591,42 @@@ void cgroup_exit(struct task_struct *ts
  {
        struct cgroup_subsys *ss;
        struct css_set *cset;
 -      bool put_cset = false;
        int i;
  
        /*
         * Unlink from @tsk from its css_set.  As migration path can't race
 -       * with us, we can check cg_list without grabbing css_set_rwsem.
 +       * with us, we can check css_set and cg_list without synchronization.
         */
 +      cset = task_css_set(tsk);
 +
        if (!list_empty(&tsk->cg_list)) {
 -              down_write(&css_set_rwsem);
 -              list_del_init(&tsk->cg_list);
 -              up_write(&css_set_rwsem);
 -              put_cset = true;
 +              spin_lock_bh(&css_set_lock);
 +              css_set_move_task(tsk, cset, NULL, false);
 +              spin_unlock_bh(&css_set_lock);
 +      } else {
 +              get_css_set(cset);
        }
  
 -      /* Reassign the task to the init_css_set. */
 -      cset = task_css_set(tsk);
 -      RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 -
        /* see cgroup_post_fork() for details */
 -      for_each_subsys_which(ss, i, &have_exit_callback) {
 -              struct cgroup_subsys_state *old_css = cset->subsys[i];
 -              struct cgroup_subsys_state *css = task_css(tsk, i);
 +      for_each_subsys_which(ss, i, &have_exit_callback)
 +              ss->exit(tsk);
 +}
  
 -              ss->exit(css, old_css, tsk);
 -      }
 +void cgroup_free(struct task_struct *task)
 +{
 +      struct css_set *cset = task_css_set(task);
 +      struct cgroup_subsys *ss;
 +      int ssid;
  
 -      if (put_cset)
 -              put_css_set(cset);
 +      for_each_subsys_which(ss, ssid, &have_free_callback)
 +              ss->free(task);
 +
 +      put_css_set(cset);
  }
  
  static void check_for_release(struct cgroup *cgrp)
  {
 -      if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
 +      if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
            !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
                schedule_work(&cgrp->release_agent_work);
  }
@@@ -5705,13 -5540,25 +5705,13 @@@ static int __init cgroup_disable(char *
                        if (strcmp(token, ss->name) &&
                            strcmp(token, ss->legacy_name))
                                continue;
 -
 -                      ss->disabled = 1;
 -                      printk(KERN_INFO "Disabling %s control group subsystem\n",
 -                             ss->name);
 -                      break;
 +                      cgroup_disable_mask |= 1 << i;
                }
        }
        return 1;
  }
  __setup("cgroup_disable=", cgroup_disable);
  
 -static int __init cgroup_set_legacy_files_on_dfl(char *str)
 -{
 -      printk("cgroup: using legacy files on the default hierarchy\n");
 -      cgroup_legacy_files_on_dfl = true;
 -      return 0;
 -}
 -__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
 -
  /**
   * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
   * @dentry: directory dentry of interest
@@@ -5815,7 -5662,7 +5815,7 @@@ static int current_css_set_cg_links_rea
        if (!name_buf)
                return -ENOMEM;
  
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        rcu_read_lock();
        cset = rcu_dereference(current->cgroups);
        list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                           c->root->hierarchy_id, name_buf);
        }
        rcu_read_unlock();
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        kfree(name_buf);
        return 0;
  }
@@@ -5837,7 -5684,7 +5837,7 @@@ static int cgroup_css_links_read(struc
        struct cgroup_subsys_state *css = seq_css(seq);
        struct cgrp_cset_link *link;
  
 -      down_read(&css_set_rwsem);
 +      spin_lock_bh(&css_set_lock);
        list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
                struct css_set *cset = link->cset;
                struct task_struct *task;
        overflow:
                seq_puts(seq, "  ...\n");
        }
 -      up_read(&css_set_rwsem);
 +      spin_unlock_bh(&css_set_lock);
        return 0;
  }
  
  static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
  {
 -      return (!cgroup_has_tasks(css->cgroup) &&
 +      return (!cgroup_is_populated(css->cgroup) &&
                !css_has_online_children(&css->cgroup->self));
  }
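
The cgroup.c hunks above replace holding css_set_rwsem across a whole task walk with taking css_set_lock only inside css_task_iter_start/next/end; css_task_iter_next() now returns the task with a reference held and the lock dropped, so callers may sleep between iterations. A minimal kernel-context sketch of a caller, assuming only the signatures visible in the hunks; walk_css_tasks() and its pr_debug() body are hypothetical, not part of the patch:

    #include <linux/cgroup.h>
    #include <linux/sched.h>

    /*
     * Sketch of a css_task_iter user under the reworked locking:
     * css_task_iter_next() returns the task with a reference held and
     * css_set_lock released, so the loop body is allowed to sleep; the
     * reference is dropped by the next call or by css_task_iter_end().
     */
    static void walk_css_tasks(struct cgroup_subsys_state *css)
    {
            struct css_task_iter it;
            struct task_struct *task;

            css_task_iter_start(css, &it);
            while ((task = css_task_iter_next(&it)))
                    pr_debug("css %d: task %d\n", css->id, task->pid);
            css_task_iter_end(&it);
    }
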
  
diff --combined kernel/cpuset.c
index c02d677c541c68067f76f0053864ed176ba39ccc,9ef59a37c1907acb633581e4c9658479439b9a3a..c9ea63ff70a7433d02a7791aed66a681c5058227
@@@ -51,7 -51,6 +51,7 @@@
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/time.h>
 +#include <linux/time64.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
  
@@@ -69,7 -68,7 +69,7 @@@ struct static_key cpusets_enabled_key _
  struct fmeter {
        int cnt;                /* unprocessed events count */
        int val;                /* most recent output value */
 -      time_t time;            /* clock (secs) when val computed */
 +      time64_t time;          /* clock (secs) when val computed */
        spinlock_t lock;        /* guards read or write of above */
  };
  
@@@ -474,8 -473,7 +474,8 @@@ static int validate_change(struct cpuse
  
        /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
 -      if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
 +      if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 +          !is_cpuset_subset(trial, par))
                goto out;
  
        /*
         * be changed to have empty cpus_allowed or mems_allowed.
         */
        ret = -ENOSPC;
 -      if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
 +      if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
                if (!cpumask_empty(cur->cpus_allowed) &&
                    cpumask_empty(trial->cpus_allowed))
                        goto out;
@@@ -881,8 -879,7 +881,8 @@@ static void update_cpumasks_hier(struc
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs.
                 */
 -              if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
 +              if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 +                  cpumask_empty(new_cpus))
                        cpumask_copy(new_cpus, parent->effective_cpus);
  
                /* Skip the whole subtree if the cpumask remains the same. */
                cpumask_copy(cp->effective_cpus, new_cpus);
                spin_unlock_irq(&callback_lock);
  
 -              WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 +              WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
  
                update_tasks_cpumask(cp);
@@@ -1138,8 -1135,7 +1138,8 @@@ static void update_nodemasks_hier(struc
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some MEMs.
                 */
 -              if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
 +              if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 +                  nodes_empty(*new_mems))
                        *new_mems = parent->effective_mems;
  
                /* Skip the whole subtree if the nodemask remains the same. */
                cp->effective_mems = *new_mems;
                spin_unlock_irq(&callback_lock);
  
 -              WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 +              WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                        !nodes_equal(cp->mems_allowed, cp->effective_mems));
  
                update_tasks_nodemask(cp);
@@@ -1375,7 -1371,7 +1375,7 @@@ out
   */
  
  #define FM_COEF 933           /* coefficient for half-life of 10 secs */
 -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
 +#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
  #define FM_MAXCNT 1000000     /* limit cnt to avoid overflow */
  #define FM_SCALE 1000         /* faux fixed point scale */
  
@@@ -1391,11 -1387,8 +1391,11 @@@ static void fmeter_init(struct fmeter *
  /* Internal meter update - process cnt events and update value */
  static void fmeter_update(struct fmeter *fmp)
  {
 -      time_t now = get_seconds();
 -      time_t ticks = now - fmp->time;
 +      time64_t now;
 +      u32 ticks;
 +
 +      now = ktime_get_seconds();
 +      ticks = now - fmp->time;
  
        if (ticks == 0)
                return;
@@@ -1447,7 -1440,7 +1447,7 @@@ static int cpuset_can_attach(struct cgr
  
        /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
 -      if (!cgroup_on_dfl(css->cgroup) &&
 +      if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;
  
@@@ -1491,8 -1484,9 +1491,8 @@@ static void cpuset_attach(struct cgroup
  {
        /* static buf protected by cpuset_mutex */
        static nodemask_t cpuset_attach_nodemask_to;
 -      struct mm_struct *mm;
        struct task_struct *task;
 -      struct task_struct *leader = cgroup_taskset_first(tset);
 +      struct task_struct *leader;
        struct cpuset *cs = css_cs(css);
        struct cpuset *oldcs = cpuset_attach_old_cs;
  
        }
  
        /*
 -       * Change mm, possibly for multiple threads in a threadgroup. This is
 -       * expensive and may sleep.
 +       * Change mm for all threadgroup leaders. This is expensive and may
 +       * sleep and should be moved outside migration path proper.
         */
        cpuset_attach_nodemask_to = cs->effective_mems;
 -      mm = get_task_mm(leader);
 -      if (mm) {
 -              mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 -
 -              /*
 -               * old_mems_allowed is the same with mems_allowed here, except
 -               * if this task is being moved automatically due to hotplug.
 -               * In that case @mems_allowed has been updated and is empty,
 -               * so @old_mems_allowed is the right nodesets that we migrate
 -               * mm from.
 -               */
 -              if (is_memory_migrate(cs)) {
 -                      cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 -                                        &cpuset_attach_nodemask_to);
 +      cgroup_taskset_for_each_leader(leader, tset) {
 +              struct mm_struct *mm = get_task_mm(leader);
 +
 +              if (mm) {
 +                      mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 +
 +                      /*
 +                       * old_mems_allowed is the same with mems_allowed
 +                       * here, except if this task is being moved
 +                       * automatically due to hotplug.  In that case
 +                       * @mems_allowed has been updated and is empty, so
 +                       * @old_mems_allowed is the right nodesets that we
 +                       * migrate mm from.
 +                       */
 +                      if (is_memory_migrate(cs)) {
 +                              cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 +                                                &cpuset_attach_nodemask_to);
 +                      }
 +                      mmput(mm);
                }
 -              mmput(mm);
        }
  
        cs->old_mems_allowed = cpuset_attach_nodemask_to;
@@@ -1604,6 -1594,9 +1604,6 @@@ static int cpuset_write_u64(struct cgro
        case FILE_MEMORY_PRESSURE_ENABLED:
                cpuset_memory_pressure_enabled = !!val;
                break;
 -      case FILE_MEMORY_PRESSURE:
 -              retval = -EACCES;
 -              break;
        case FILE_SPREAD_PAGE:
                retval = update_flag(CS_SPREAD_PAGE, cs, val);
                break;
@@@ -1870,6 -1863,9 +1870,6 @@@ static struct cftype files[] = 
        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
 -              .write_u64 = cpuset_write_u64,
 -              .private = FILE_MEMORY_PRESSURE,
 -              .mode = S_IRUGO,
        },
  
        {
@@@ -1956,7 -1952,7 +1956,7 @@@ static int cpuset_css_online(struct cgr
        cpuset_inc();
  
        spin_lock_irq(&callback_lock);
 -      if (cgroup_on_dfl(cs->css.cgroup)) {
 +      if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
        }
@@@ -2033,7 -2029,7 +2033,7 @@@ static void cpuset_bind(struct cgroup_s
        mutex_lock(&cpuset_mutex);
        spin_lock_irq(&callback_lock);
  
 -      if (cgroup_on_dfl(root_css->cgroup)) {
 +      if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                top_cpuset.mems_allowed = node_possible_map;
        } else {
@@@ -2214,7 -2210,7 +2214,7 @@@ retry
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);
  
 -      if (cgroup_on_dfl(cs->css.cgroup))
 +      if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
@@@ -2245,7 -2241,7 +2245,7 @@@ static void cpuset_hotplug_workfn(struc
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
 -      bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
 +      bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
  
        mutex_lock(&cpuset_mutex);
  
@@@ -2602,22 -2598,22 +2602,22 @@@ int cpuset_mems_allowed_intersects(cons
  }
  
  /**
-  * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
-  * @tsk: pointer to task_struct of some task.
+  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
   *
-  * Description: Prints @task's name, cpuset name, and cached copy of its
+  * Description: Prints current's name, cpuset name, and cached copy of its
   * mems_allowed to the kernel log.
   */
- void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+ void cpuset_print_current_mems_allowed(void)
  {
        struct cgroup *cgrp;
  
        rcu_read_lock();
  
-       cgrp = task_cs(tsk)->css.cgroup;
-       pr_info("%s cpuset=", tsk->comm);
+       cgrp = task_cs(current)->css.cgroup;
+       pr_info("%s cpuset=", current->comm);
        pr_cont_cgroup_name(cgrp);
-       pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
+       pr_cont(" mems_allowed=%*pbl\n",
+               nodemask_pr_args(&current->mems_allowed));
  
        rcu_read_unlock();
  }
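
The fmeter hunks above switch the stored timestamp to time64_t via ktime_get_seconds() and clamp the elapsed ticks to a u32. The constants already defined in the file pin down the decay rate: FM_COEF/FM_SCALE = 933/1000, which is approximately 2^(-1/10), so ten one-second ticks halve the value, matching the "half-life of 10 secs" comment. A small sketch of that per-tick decay using the FM_* macros defined above; fmeter_decay() is my own illustration, not a copy of fmeter_update():

    #include <linux/types.h>

    /*
     * Exponential decay implied by FM_COEF: 933/1000 ~= 2^(-1/10), so
     * applying it once per elapsed second halves val roughly every 10s.
     */
    static int fmeter_decay(int val, u32 ticks)
    {
            while (ticks-- > 0)
                    val = (FM_COEF * val) / FM_SCALE;
            return val;
    }
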
diff --combined kernel/fork.c
index 825ecc32454d23f4e60216bedfb2de31fe504699,fe7968901bea81a8f9bc40ae0a28f473c28e832b..f97f2c449f5cf556ea6c54cb4aec6e894dd8bab5
@@@ -251,7 -251,6 +251,7 @@@ void __put_task_struct(struct task_stru
        WARN_ON(atomic_read(&tsk->usage));
        WARN_ON(tsk == current);
  
 +      cgroup_free(tsk);
        task_numa_free(tsk);
        security_task_free(tsk);
        exit_creds(tsk);
@@@ -455,7 -454,8 +455,8 @@@ static int dup_mmap(struct mm_struct *m
                tmp->vm_mm = mm;
                if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
+               tmp->vm_flags &=
+                       ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
                tmp->vm_next = tmp->vm_prev = NULL;
                tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                file = tmp->vm_file;
@@@ -1102,7 -1102,7 +1103,7 @@@ static void posix_cpu_timers_init_group
        cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
        if (cpu_limit != RLIM_INFINITY) {
                sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 -              sig->cputimer.running = 1;
 +              sig->cputimer.running = true;
        }
  
        /* The timer lists. */
@@@ -1150,6 -1150,10 +1151,6 @@@ static int copy_signal(unsigned long cl
        tty_audit_fork(sig);
        sched_autogroup_fork(sig);
  
 -#ifdef CONFIG_CGROUPS
 -      init_rwsem(&sig->group_rwsem);
 -#endif
 -
        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;
  
diff --combined kernel/futex.c
index 684d7549825a4300ced2002a3fbec0a5698a18d1,843b552ddd75264b02dcc87c662498482b47c0c8..470c06c3299a7feb5cc0598508715e453274c677
@@@ -255,18 -255,9 +255,18 @@@ struct futex_hash_bucket 
        struct plist_head chain;
  } ____cacheline_aligned_in_smp;
  
 -static unsigned long __read_mostly futex_hashsize;
 +/*
 + * The base of the bucket array and its size are always used together
 + * (after initialization only in hash_futex()), so ensure that they
 + * reside in the same cacheline.
 + */
 +static struct {
 +      struct futex_hash_bucket *queues;
 +      unsigned long            hashsize;
 +} __futex_data __read_mostly __aligned(2*sizeof(long));
 +#define futex_queues   (__futex_data.queues)
 +#define futex_hashsize (__futex_data.hashsize)
  
 -static struct futex_hash_bucket *futex_queues;
  
  /*
   * Fault injections for futexes.
  static struct {
        struct fault_attr attr;
  
 -      u32 ignore_private;
 +      bool ignore_private;
  } fail_futex = {
        .attr = FAULT_ATTR_INITIALIZER,
 -      .ignore_private = 0,
 +      .ignore_private = false,
  };
  
  static int __init setup_fail_futex(char *str)
@@@ -469,7 -460,7 +469,7 @@@ get_futex_key(u32 __user *uaddr, int fs
  {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
-       struct page *page, *page_head;
+       struct page *page;
        int err, ro = 0;
  
        /*
@@@ -519,46 -510,9 +519,9 @@@ again
        else
                err = 0;
  
- #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       page_head = page;
-       if (unlikely(PageTail(page))) {
-               put_page(page);
-               /* serialize against __split_huge_page_splitting() */
-               local_irq_disable();
-               if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
-                       page_head = compound_head(page);
-                       /*
-                        * page_head is valid pointer but we must pin
-                        * it before taking the PG_lock and/or
-                        * PG_compound_lock. The moment we re-enable
-                        * irqs __split_huge_page_splitting() can
-                        * return and the head page can be freed from
-                        * under us. We can't take the PG_lock and/or
-                        * PG_compound_lock on a page that could be
-                        * freed from under us.
-                        */
-                       if (page != page_head) {
-                               get_page(page_head);
-                               put_page(page);
-                       }
-                       local_irq_enable();
-               } else {
-                       local_irq_enable();
-                       goto again;
-               }
-       }
- #else
-       page_head = compound_head(page);
-       if (page != page_head) {
-               get_page(page_head);
-               put_page(page);
-       }
- #endif
-       lock_page(page_head);
+       lock_page(page);
        /*
-        * If page_head->mapping is NULL, then it cannot be a PageAnon
+        * If page->mapping is NULL, then it cannot be a PageAnon
         * page; but it might be the ZERO_PAGE or in the gate area or
         * in a special mapping (all cases which we are happy to fail);
         * or it may have been a good file page when get_user_pages_fast
         *
         * The case we do have to guard against is when memory pressure made
         * shmem_writepage move it from filecache to swapcache beneath us:
-        * an unlikely race, but we do need to retry for page_head->mapping.
+        * an unlikely race, but we do need to retry for page->mapping.
         */
-       if (!page_head->mapping) {
-               int shmem_swizzled = PageSwapCache(page_head);
-               unlock_page(page_head);
-               put_page(page_head);
+       if (!page->mapping) {
+               int shmem_swizzled = PageSwapCache(page);
+               unlock_page(page);
+               put_page(page);
                if (shmem_swizzled)
                        goto again;
                return -EFAULT;
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
         */
-       if (PageAnon(page_head)) {
+       if (PageAnon(page)) {
                /*
                 * A RO anonymous page will never change and thus doesn't make
                 * sense for futex operations.
                key->private.address = address;
        } else {
                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-               key->shared.inode = page_head->mapping->host;
+               key->shared.inode = page->mapping->host;
                key->shared.pgoff = basepage_index(page);
        }
  
        get_futex_key_refs(key); /* implies MB (B) */
  
  out:
-       unlock_page(page_head);
-       put_page(page_head);
+       unlock_page(page);
+       put_page(page);
        return err;
  }
  
diff --combined kernel/kexec_core.c
index bd9f8a03cefa4ef05c08d54a357910286487afd8,dd21c783e3dde44769fbadc526ad5b73b1ea1a62..11b64a63c0f88817b80a2c35117d70bcfe446fa1
@@@ -6,7 -6,7 +6,7 @@@
   * Version 2.  See the file COPYING for more details.
   */
  
- #define pr_fmt(fmt)   "kexec: " fmt
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  
  #include <linux/capability.h>
  #include <linux/mm.h>
@@@ -1027,7 -1027,7 +1027,7 @@@ static int __init crash_notes_memory_in
  
        crash_notes = __alloc_percpu(size, align);
        if (!crash_notes) {
-               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+               pr_warn("Memory allocation for saving cpu register states failed\n");
                return -ENOMEM;
        }
        return 0;
@@@ -1149,7 -1149,7 +1149,7 @@@ static int __init parse_crashkernel_sim
        if (*cur == '@')
                *crash_base = memparse(cur+1, &cur);
        else if (*cur != ' ' && *cur != '\0') {
 -              pr_warn("crashkernel: unrecognized char\n");
 +              pr_warn("crashkernel: unrecognized char: %c\n", *cur);
                return -EINVAL;
        }
  
@@@ -1186,12 -1186,12 +1186,12 @@@ static int __init parse_crashkernel_suf
  
        /* check with suffix */
        if (strncmp(cur, suffix, strlen(suffix))) {
 -              pr_warn("crashkernel: unrecognized char\n");
 +              pr_warn("crashkernel: unrecognized char: %c\n", *cur);
                return -EINVAL;
        }
        cur += strlen(suffix);
        if (*cur != ' ' && *cur != '\0') {
 -              pr_warn("crashkernel: unrecognized char\n");
 +              pr_warn("crashkernel: unrecognized char: %c\n", *cur);
                return -EINVAL;
        }
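
kexec_core.c now derives its pr_fmt() prefix from KBUILD_MODNAME instead of the hard-coded "kexec: ", which is why the explicit "Kexec:" in the allocation-failure message is dropped. A hedged illustration of what the macro does; the exact prefix depends on the object name the build assigns, presumably "kexec_core" for this file:

    /* pr_fmt() must be defined before printk.h is pulled in. */
    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
    #include <linux/printk.h>

    static void report_alloc_failure(void)
    {
            /*
             * Built as kexec_core.o, KBUILD_MODNAME is presumably
             * "kexec_core", so this prints roughly:
             *   kexec_core: Memory allocation for saving cpu register states failed
             */
            pr_warn("Memory allocation for saving cpu register states failed\n");
    }
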
  
diff --combined kernel/params.c
index ed1e0a1cffa7c7b78d750df0b72770a7769817b7,93a380a2345d71ae8c884a56006977eec959cbf8..a6d6149c0fe60df1ca38d9a66acef281b78ee79d
@@@ -223,7 -223,7 +223,7 @@@ char *parse_args(const char *doing
                 int (*unknown)(char *param, char *val,
                                const char *doing, void *arg))
  {
 -      char *param, *val;
 +      char *param, *val, *err = NULL;
  
        /* Chew leading spaces */
        args = skip_spaces(args);
                args = next_arg(args, &param, &val);
                /* Stop at -- */
                if (!val && strcmp(param, "--") == 0)
 -                      return args;
 +                      return err ?: args;
                irq_was_disabled = irqs_disabled();
                ret = parse_one(param, val, doing, params, num,
                                min_level, max_level, arg, unknown);
                                doing, param);
  
                switch (ret) {
 +              case 0:
 +                      continue;
                case -ENOENT:
                        pr_err("%s: Unknown parameter `%s'\n", doing, param);
 -                      return ERR_PTR(ret);
 +                      break;
                case -ENOSPC:
                        pr_err("%s: `%s' too large for parameter `%s'\n",
                               doing, val ?: "", param);
 -                      return ERR_PTR(ret);
 -              case 0:
                        break;
                default:
                        pr_err("%s: `%s' invalid for parameter `%s'\n",
                               doing, val ?: "", param);
 -                      return ERR_PTR(ret);
 +                      break;
                }
 +
 +              err = ERR_PTR(ret);
        }
  
 -      /* All parsed OK. */
 -      return NULL;
 +      return err;
  }
  
  /* Lazy bastard, eh? */
@@@ -326,10 -325,11 +326,11 @@@ int param_get_charp(char *buffer, cons
  }
  EXPORT_SYMBOL(param_get_charp);
  
 - static void param_free_charp(void *arg)
+ void param_free_charp(void *arg)
  {
        maybe_kfree_parameter(*((char **)arg));
  }
+ EXPORT_SYMBOL(param_free_charp);
  
  const struct kernel_param_ops param_ops_charp = {
        .set = param_set_charp,
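
With the parse_args() rework above, a bad parameter no longer aborts parsing: each failure is logged, remembered in err, and the remaining arguments are still processed, with the saved ERR_PTR() (or the "--" remainder, or NULL) returned at the end. A sketch of what a caller sees under that behaviour; handle_unknown(), parse_example() and the command line are hypothetical stand-ins:

    #include <linux/moduleparam.h>
    #include <linux/kernel.h>
    #include <linux/err.h>

    /* Hypothetical unknown-parameter handler; signature as in parse_args(). */
    static int handle_unknown(char *param, char *val, const char *doing, void *arg)
    {
            pr_info("%s: saw %s=%s\n", doing, param, val ?: "");
            return 0;
    }

    static void parse_example(char *cmdline)
    {
            char *after = parse_args("example", cmdline, NULL, 0, 0, 0,
                                     NULL, handle_unknown);

            if (IS_ERR(after))
                    pr_warn("example: at least one parameter was invalid\n");
            else if (after)
                    pr_info("example: arguments after '--': %s\n", after);
    }
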
diff --combined kernel/sysctl.c
index 96c856b040819e30f5e9d4dad4ac396569f0eba0,a3411175f7166f7cf87a962204355f9aead56691..dc6858d6639ed022d65129bdbb869ff7bcc05789
@@@ -64,7 -64,6 +64,7 @@@
  #include <linux/binfmts.h>
  #include <linux/sched/sysctl.h>
  #include <linux/kexec.h>
 +#include <linux/bpf.h>
  
  #include <asm/uaccess.h>
  #include <asm/processor.h>
@@@ -888,6 -887,17 +888,17 @@@ static struct ctl_table kern_table[] = 
                .extra1         = &zero,
                .extra2         = &one,
        },
+ #ifdef CONFIG_HARDLOCKUP_DETECTOR
+       {
+               .procname       = "hardlockup_panic",
+               .data           = &hardlockup_panic,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+ #endif
  #ifdef CONFIG_SMP
        {
                .procname       = "softlockup_all_cpu_backtrace",
                .extra1         = &zero,
                .extra2         = &one,
        },
+       {
+               .procname       = "hardlockup_all_cpu_backtrace",
+               .data           = &sysctl_hardlockup_all_cpu_backtrace,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
  #endif /* CONFIG_SMP */
  #endif
  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
                .mode           = 0644,
                .proc_handler   = timer_migration_handler,
        },
 +#endif
 +#ifdef CONFIG_BPF_SYSCALL
 +      {
 +              .procname       = "unprivileged_bpf_disabled",
 +              .data           = &sysctl_unprivileged_bpf_disabled,
 +              .maxlen         = sizeof(sysctl_unprivileged_bpf_disabled),
 +              .mode           = 0644,
 +              /* only handle a transition from default "0" to "1" */
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = &one,
 +              .extra2         = &one,
 +      },
  #endif
        { }
  };
diff --combined lib/Kconfig.debug
index 565783733cd013166b1edc367ba3bc919215d2bd,4d1b97b03b2f8ee27a000f66c4ab8c6cb5cae94d..526105c18566c16b0bdd6f0dc0d958b262724e0d
@@@ -216,7 -216,7 +216,7 @@@ config STRIP_ASM_SYM
  
  config READABLE_ASM
          bool "Generate readable assembler code"
 -        depends on DEBUG_KERNEL
 +        depends on DEBUG_KERNEL && !LTO
          help
            Disable some compiler optimizations that tend to generate human unreadable
            assembler output. This may make the kernel slightly slower, but it helps
@@@ -312,15 -312,6 +312,15 @@@ config DEBUG_SECTION_MISMATC
          - Enable verbose reporting from modpost in order to help resolve
            the section mismatches that are reported.
  
 +config SECTION_MISMATCH_WARN_ONLY
 +      bool "Make section mismatch errors non-fatal"
 +      default y
 +      help
 +        If you say N here, the build process will fail if there are any
 +        section mismatches, instead of just throwing warnings.
 +
 +        If unsure, say Y.
 +
  #
  # Select this config option from the architecture Kconfig, if it
  # is preferred to always offer frame pointers as a config
@@@ -580,6 -571,14 +580,14 @@@ config DEBUG_VM_R
  
          If unsure, say N.
  
+ config DEBUG_VM_PGFLAGS
+       bool "Debug page-flags operations"
+       depends on DEBUG_VM
+       help
+         Enables extra validation on page flags operations.
+         If unsure, say N.
  config DEBUG_VIRTUAL
        bool "Debug VM translations"
        depends on DEBUG_KERNEL && X86
@@@ -1695,6 -1694,9 +1703,9 @@@ config TEST_STRING_HELPER
  config TEST_KSTRTOX
        tristate "Test kstrto*() family of functions at runtime"
  
+ config TEST_PRINTF
+       tristate "Test printf() family of functions at runtime"
  config TEST_RHASHTABLE
        tristate "Perform selftest on resizable hash table"
        default n
@@@ -1762,6 -1764,16 +1773,16 @@@ config DMA_API_DEBU
  
          If unsure, say N.
  
+ config DMA_API_DEBUG_POISON
+       bool "Poison coherent DMA buffers"
+       depends on DMA_API_DEBUG && EXPERT
+       help
+         Poison DMA buffers returned by dma_alloc_coherent unless __GFP_ZERO
+         is explicitly specified, to catch drivers depending on zeroed buffers
+         without passing the correct flags.
+         Only say Y if you're prepared for almost everything to break.
  config TEST_LKM
        tristate "Test module loading with 'hello world' module"
        default n
diff --combined lib/Makefile
index 8de3b012eac77ed2c14160022d0cc6b9f75773c6,dbf6f3d6eefb41ac2919f874c6eef5570e6cc88e..8498a5c9815a3c0273d99fd45c3d10bdf42c3375
@@@ -26,8 -26,7 +26,8 @@@ obj-y += bcd.o div64.o sort.o parser.o 
         bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
         gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
         bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 -       percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
 +       percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
 +       once.o
  obj-y += string_helpers.o
  obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
  obj-y += hexdump.o
@@@ -42,6 -41,7 +42,7 @@@ obj-$(CONFIG_TEST_RHASHTABLE) += test_r
  obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
  obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o
  obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
+ obj-$(CONFIG_TEST_PRINTF) += test_printf.o
  
  ifeq ($(CONFIG_DEBUG_KOBJECT),y)
  CFLAGS_kobject.o += -DDEBUG
@@@ -82,6 -82,7 +83,7 @@@ obj-$(CONFIG_CRC32)   += crc32.
  obj-$(CONFIG_CRC7)    += crc7.o
  obj-$(CONFIG_LIBCRC32C)       += libcrc32c.o
  obj-$(CONFIG_CRC8)    += crc8.o
+ obj-$(CONFIG_CRC64_ECMA)      += crc64_ecma.o
  obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
  
  obj-$(CONFIG_842_COMPRESS) += 842/
diff --combined lib/dma-debug.c
index fcb65d2a0b947e85e335599229bd3a44a1c425d7,40514eddb67053562978de2c06a8f12451175a98..af6262b4e02c62c11bbf6295abc22c38ca4bc5a1
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/sched.h>
  #include <linux/ctype.h>
  #include <linux/list.h>
+ #include <linux/poison.h>
  #include <linux/slab.h>
  
  #include <asm/sections.h>
@@@ -100,7 -101,7 +101,7 @@@ static LIST_HEAD(free_entries)
  static DEFINE_SPINLOCK(free_entries_lock);
  
  /* Global disable flag - will be set in case of an error */
 -static u32 global_disable __read_mostly;
 +static bool global_disable __read_mostly;
  
  /* Early initialization disable flag, set at the end of dma_debug_init */
  static bool dma_debug_initialized __read_mostly;
@@@ -1249,6 -1250,14 +1250,14 @@@ static void check_sync(struct device *d
                                dir2name[entry->direction],
                                dir2name[ref->direction]);
  
+       if (ref->sg_call_ents && ref->type == dma_debug_sg &&
+           ref->sg_call_ents != entry->sg_call_ents) {
+               err_printk(ref->dev, entry, "DMA-API: device driver syncs "
+                          "DMA sg list with different entry count "
+                          "[map count=%d] [sync count=%d]\n",
+                          entry->sg_call_ents, ref->sg_call_ents);
+       }
  out:
        put_hash_bucket(bucket, &flags);
  }
@@@ -1439,7 -1448,7 +1448,7 @@@ void debug_dma_unmap_sg(struct device *
  EXPORT_SYMBOL(debug_dma_unmap_sg);
  
  void debug_dma_alloc_coherent(struct device *dev, size_t size,
-                             dma_addr_t dma_addr, void *virt)
+                             dma_addr_t dma_addr, void *virt, gfp_t flags)
  {
        struct dma_debug_entry *entry;
  
        if (unlikely(virt == NULL))
                return;
  
+       if (IS_ENABLED(CONFIG_DMA_API_DEBUG_POISON) && !(flags & __GFP_ZERO))
+               memset(virt, DMA_ALLOC_POISON, size);
        entry = dma_entry_alloc();
        if (!entry)
                return;
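
The dma-debug hunks thread the allocation's gfp flags into debug_dma_alloc_coherent(); with the new CONFIG_DMA_API_DEBUG_POISON the buffer is filled with DMA_ALLOC_POISON unless __GFP_ZERO was passed. A sketch of the driver-side consequence, with dev/size as placeholders and error handling omitted:

    #include <linux/dma-mapping.h>

    /*
     * With DMA_API_DEBUG_POISON=y only the second buffer is guaranteed
     * to start out zeroed; the first is poisoned, exposing drivers that
     * silently relied on zeroed coherent memory.
     */
    static void coherent_alloc_example(struct device *dev, size_t size)
    {
            dma_addr_t h1, h2;
            void *plain, *zeroed;

            plain  = dma_alloc_coherent(dev, size, &h1, GFP_KERNEL);
            zeroed = dma_alloc_coherent(dev, size, &h2, GFP_KERNEL | __GFP_ZERO);

            dma_free_coherent(dev, size, plain, h1);
            dma_free_coherent(dev, size, zeroed, h2);
    }
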
diff --combined lib/kobject.c
index 0554077462669074d4df0fd01e2d087d7f997837,fee2fd950306569573d5b2c6b164ccdb9fad3f15..7cbccd2b4c72042595484e32c2e11906948414b7
@@@ -257,18 -257,32 +257,32 @@@ static int kobject_add_internal(struct 
  int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
                                  va_list vargs)
  {
-       char *s;
+       const char *s;
  
        if (kobj->name && !fmt)
                return 0;
  
-       s = kvasprintf(GFP_KERNEL, fmt, vargs);
+       s = kvasprintf_const(GFP_KERNEL, fmt, vargs);
        if (!s)
                return -ENOMEM;
  
-       /* ewww... some of these buggers have '/' in the name ... */
-       strreplace(s, '/', '!');
-       kfree(kobj->name);
+       /*
+        * ewww... some of these buggers have '/' in the name ... If
+        * that's the case, we need to make sure we have an actual
+        * allocated copy to modify, since kvasprintf_const may have
+        * returned something from .rodata.
+        */
+       if (strchr(s, '/')) {
+               char *t;
+               t = kstrdup(s, GFP_KERNEL);
+               kfree_const(s);
+               if (!t)
+                       return -ENOMEM;
+               strreplace(t, '/', '!');
+               s = t;
+       }
+       kfree_const(kobj->name);
        kobj->name = s;
  
        return 0;
@@@ -466,7 -480,7 +480,7 @@@ int kobject_rename(struct kobject *kobj
        envp[0] = devpath_string;
        envp[1] = NULL;
  
-       name = dup_name = kstrdup(new_name, GFP_KERNEL);
+       name = dup_name = kstrdup_const(new_name, GFP_KERNEL);
        if (!name) {
                error = -ENOMEM;
                goto out;
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);
  
  out:
-       kfree(dup_name);
+       kfree_const(dup_name);
        kfree(devpath_string);
        kfree(devpath);
        kobject_put(kobj);
@@@ -568,7 -582,6 +582,7 @@@ void kobject_del(struct kobject *kobj
        kobject_put(kobj->parent);
        kobj->parent = NULL;
  }
 +EXPORT_SYMBOL(kobject_del);
  
  /**
   * kobject_get - increment refcount for object.
@@@ -585,7 -598,6 +599,7 @@@ struct kobject *kobject_get(struct kobj
        }
        return kobj;
  }
 +EXPORT_SYMBOL(kobject_get);
  
  static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
  {
@@@ -634,7 -646,7 +648,7 @@@ static void kobject_cleanup(struct kobj
        /* free name if we allocated it */
        if (name) {
                pr_debug("kobject: '%s': free name\n", name);
-               kfree(name);
+               kfree_const(name);
        }
  }
  
@@@ -677,7 -689,6 +691,7 @@@ void kobject_put(struct kobject *kobj
                kref_put(&kobj->kref, kobject_release);
        }
  }
 +EXPORT_SYMBOL(kobject_put);
  
  static void dynamic_kobj_release(struct kobject *kobj)
  {
@@@ -806,7 -817,6 +820,7 @@@ int kset_register(struct kset *k
        kobject_uevent(&k->kobj, KOBJ_ADD);
        return 0;
  }
 +EXPORT_SYMBOL(kset_register);
  
  /**
   * kset_unregister - remove a kset.
@@@ -819,7 -829,6 +833,7 @@@ void kset_unregister(struct kset *k
        kobject_del(&k->kobj);
        kobject_put(&k->kobj);
  }
 +EXPORT_SYMBOL(kset_unregister);
  
  /**
   * kset_find_obj - search for object in kset.
@@@ -1056,3 -1065,10 +1070,3 @@@ void kobj_ns_drop(enum kobj_ns_type typ
                kobj_ns_ops_tbl[type]->drop_ns(ns);
        spin_unlock(&kobj_ns_type_lock);
  }
 -
 -EXPORT_SYMBOL(kobject_get);
 -EXPORT_SYMBOL(kobject_put);
 -EXPORT_SYMBOL(kobject_del);
 -
 -EXPORT_SYMBOL(kset_register);
 -EXPORT_SYMBOL(kset_unregister);
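The kobject changes above lean on the kstrdup_const()/kvasprintf_const()/kfree_const() helpers, which may hand back a pointer into .rodata instead of a fresh allocation. A small sketch of that idiom (the save_label()/drop_label() wrappers are invented for illustration):

	#include <linux/slab.h>
	#include <linux/string.h>

	static const char *save_label(const char *src)
	{
		/* May return src itself if it lives in .rodata, so keep it const. */
		const char *label = kstrdup_const(src, GFP_KERNEL);

		return label;	/* NULL on allocation failure */
	}

	static void drop_label(const char *label)
	{
		/* Pairs with kstrdup_const(): no-op for .rodata, kfree() otherwise. */
		kfree_const(label);
	}

This is also why kobject_set_name_vargs() above has to take a real kstrdup() copy before calling strreplace() on names that contain '/'.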
diff --combined mm/backing-dev.c
index 619984fc07ec32792349c7fe8aec7c8b56e3d2a3,e7781eb35fd122507149fe8b8306948a95b32455..8ed2ffd963c53b910f91e1b60b04c56385a3129f
@@@ -480,10 -480,6 +480,10 @@@ static void cgwb_release_workfn(struct 
                                                release_work);
        struct backing_dev_info *bdi = wb->bdi;
  
 +      spin_lock_irq(&cgwb_lock);
 +      list_del_rcu(&wb->bdi_node);
 +      spin_unlock_irq(&cgwb_lock);
 +
        wb_shutdown(wb);
  
        css_put(wb->memcg_css);
@@@ -579,7 -575,6 +579,7 @@@ static int cgwb_create(struct backing_d
                ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
                if (!ret) {
                        atomic_inc(&bdi->usage_cnt);
 +                      list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
                        css_get(memcg_css);
@@@ -637,7 -632,7 +637,7 @@@ struct bdi_writeback *wb_get_create(str
  {
        struct bdi_writeback *wb;
  
-       might_sleep_if(gfp & __GFP_WAIT);
+       might_sleep_if(gfpflags_allow_blocking(gfp));
  
        if (!memcg_css->parent)
                return &bdi->wb;
@@@ -681,7 -676,7 +681,7 @@@ static int cgwb_bdi_init(struct backing
  static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
  {
        struct radix_tree_iter iter;
 -      struct bdi_writeback_congested *congested, *congested_n;
 +      struct rb_node *rbn;
        void **slot;
  
        WARN_ON(test_bit(WB_registered, &bdi->wb.state));
        radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
                cgwb_kill(*slot);
  
 -      rbtree_postorder_for_each_entry_safe(congested, congested_n,
 -                                      &bdi->cgwb_congested_tree, rb_node) {
 -              rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree);
 +      while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
 +              struct bdi_writeback_congested *congested =
 +                      rb_entry(rbn, struct bdi_writeback_congested, rb_node);
 +
 +              rb_erase(rbn, &bdi->cgwb_congested_tree);
                congested->bdi = NULL;  /* mark @congested unlinked */
        }
  
@@@ -771,22 -764,15 +771,22 @@@ static void cgwb_bdi_destroy(struct bac
  
  int bdi_init(struct backing_dev_info *bdi)
  {
 +      int ret;
 +
        bdi->dev = NULL;
  
        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
        INIT_LIST_HEAD(&bdi->bdi_list);
 +      INIT_LIST_HEAD(&bdi->wb_list);
        init_waitqueue_head(&bdi->wb_waitq);
  
 -      return cgwb_bdi_init(bdi);
 +      ret = cgwb_bdi_init(bdi);
 +
 +      list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
 +
 +      return ret;
  }
  EXPORT_SYMBOL(bdi_init);
  
@@@ -837,7 -823,7 +837,7 @@@ static void bdi_remove_from_list(struc
        synchronize_rcu_expedited();
  }
  
 -void bdi_destroy(struct backing_dev_info *bdi)
 +void bdi_unregister(struct backing_dev_info *bdi)
  {
        /* make sure nobody finds us on the bdi_list anymore */
        bdi_remove_from_list(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }
 +}
  
 +void bdi_exit(struct backing_dev_info *bdi)
 +{
 +      WARN_ON_ONCE(bdi->dev);
        wb_exit(&bdi->wb);
  }
 +
 +void bdi_destroy(struct backing_dev_info *bdi)
 +{
 +      bdi_unregister(bdi);
 +      bdi_exit(bdi);
 +}
  EXPORT_SYMBOL(bdi_destroy);
  
  /*
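One detail of the cgwb_bdi_destroy() hunk above: rbtree_postorder_for_each_entry_safe() is only safe while the tree is left untouched, so mixing it with rb_erase() (which rebalances the tree) is fragile; repeatedly erasing rb_first() is the robust way to drain an rbtree. A minimal illustration with an invented node type (demo_node and demo_drain() are not part of the patch, and the kfree() is only there to make the sketch self-contained; the kernel code above merely unlinks the congested entries):

	#include <linux/rbtree.h>
	#include <linux/slab.h>

	struct demo_node {
		struct rb_node rb;
		int key;
	};

	static void demo_drain(struct rb_root *root)
	{
		struct rb_node *rbn;

		/* Re-read the leftmost node each pass; rb_erase() rebalances the tree. */
		while ((rbn = rb_first(root))) {
			struct demo_node *n = rb_entry(rbn, struct demo_node, rb);

			rb_erase(rbn, root);
			kfree(n);
		}
	}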
diff --combined mm/failslab.c
index 98fb490311eb94386aebd2f4ceb77c729f4fa01e,35c876c82b9dc2f02cf22a5bc0af899d022bc742..79171b4a58269986491198403a322d6c2a7dc814
@@@ -3,12 -3,12 +3,12 @@@
  
  static struct {
        struct fault_attr attr;
-       bool ignore_gfp_wait;
 -      u32 ignore_gfp_reclaim;
 -      int cache_filter;
++      bool ignore_gfp_reclaim;
 +      bool cache_filter;
  } failslab = {
        .attr = FAULT_ATTR_INITIALIZER,
-       .ignore_gfp_wait = true,
 -      .ignore_gfp_reclaim = 1,
 -      .cache_filter = 0,
++      .ignore_gfp_reclaim = true,
 +      .cache_filter = false,
  };
  
  bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
@@@ -16,7 -16,7 +16,7 @@@
        if (gfpflags & __GFP_NOFAIL)
                return false;
  
-         if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+       if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
                return false;
  
        if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
@@@ -42,7 -42,7 +42,7 @@@ static int __init failslab_debugfs_init
                return PTR_ERR(dir);
  
        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &failslab.ignore_gfp_wait))
+                               &failslab.ignore_gfp_reclaim))
                goto fail;
        if (!debugfs_create_bool("cache-filter", mode, dir,
                                &failslab.cache_filter))
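The failslab change is part of the wider __GFP_WAIT removal in this merge (mm/backing-dev.c above switched to gfpflags_allow_blocking() for the same reason): callers now test the reclaim bits rather than the retired wait flag. A hedged sketch of the before/after check, assuming the __GFP_RECLAIM mask introduced by that series:

	#include <linux/gfp.h>

	/* Illustrative only; the real check is in should_failslab() above. */
	static bool demo_skip_fault_injection(gfp_t gfpflags, bool ignore_gfp_reclaim)
	{
		/* Old form: ignore_gfp_wait && (gfpflags & __GFP_WAIT) */
		return ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM);
	}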
diff --combined mm/huge_memory.c
index 440be97ad2bb0fcb0b3e8d83aaaa9ebc0696a008,cb34583d016cfcb073e97ddce31e8b3f5f1dd5dd..4b3420ade697b04659e6032ca602c7ac77149ad7
  #include <linux/hashtable.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/page_idle.h>
+ #include <linux/swapops.h>
  
  #include <asm/tlb.h>
  #include <asm/pgalloc.h>
  #include "internal.h"
  
+ enum scan_result {
+       SCAN_FAIL,
+       SCAN_SUCCEED,
+       SCAN_PMD_NULL,
+       SCAN_EXCEED_NONE_PTE,
+       SCAN_PTE_NON_PRESENT,
+       SCAN_PAGE_RO,
+       SCAN_NO_REFERENCED_PAGE,
+       SCAN_PAGE_NULL,
+       SCAN_SCAN_ABORT,
+       SCAN_PAGE_COUNT,
+       SCAN_PAGE_LRU,
+       SCAN_PAGE_LOCK,
+       SCAN_PAGE_ANON,
+       SCAN_PAGE_COMPOUND,
+       SCAN_ANY_PROCESS,
+       SCAN_VMA_NULL,
+       SCAN_VMA_CHECK,
+       SCAN_ADDRESS_RANGE,
+       SCAN_SWAP_CACHE_PAGE,
+       SCAN_DEL_PAGE_LRU,
+       SCAN_ALLOC_HUGE_PAGE_FAIL,
+       SCAN_CGROUP_CHARGE_FAIL,
+       SCAN_EXCEED_SWAP_PTE
+ };
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/huge_memory.h>
  /*
   * By default transparent hugepage support is disabled in order that avoid
   * to risk increase the memory footprint of applications without a guaranteed
@@@ -67,6 -97,7 +97,7 @@@ static DECLARE_WAIT_QUEUE_HEAD(khugepag
   * fault.
   */
  static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+ static unsigned int khugepaged_max_ptes_swap __read_mostly = HPAGE_PMD_NR/8;
  
  static int khugepaged(void *none);
  static int khugepaged_slab_init(void);
@@@ -106,6 -137,10 +137,10 @@@ static struct khugepaged_scan khugepage
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  };
  
+ static DEFINE_SPINLOCK(split_queue_lock);
+ static LIST_HEAD(split_queue);
+ static unsigned long split_queue_len;
+ static struct shrinker deferred_split_shrinker;
  
  static void set_recommended_min_free_kbytes(void)
  {
        for_each_populated_zone(zone)
                nr_zones++;
  
-       /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+       /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
        recommended_min = pageblock_nr_pages * nr_zones * 2;
  
        /*
@@@ -151,7 -186,7 +186,7 @@@ static int start_stop_khugepaged(void
                if (!khugepaged_thread)
                        khugepaged_thread = kthread_run(khugepaged, NULL,
                                                        "khugepaged");
 -              if (unlikely(IS_ERR(khugepaged_thread))) {
 +              if (IS_ERR(khugepaged_thread)) {
                        pr_err("khugepaged: kthread_run(khugepaged) failed\n");
                        err = PTR_ERR(khugepaged_thread);
                        khugepaged_thread = NULL;
@@@ -553,6 -588,33 +588,33 @@@ static struct kobj_attribute khugepaged
        __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
               khugepaged_max_ptes_none_store);
  
+ static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
+                                            struct kobj_attribute *attr,
+                                            char *buf)
+ {
+       return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+ }
+ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
+                                             struct kobj_attribute *attr,
+                                             const char *buf, size_t count)
+ {
+       int err;
+       unsigned long max_ptes_swap;
+       err  = kstrtoul(buf, 10, &max_ptes_swap);
+       if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+               return -EINVAL;
+       khugepaged_max_ptes_swap = max_ptes_swap;
+       return count;
+ }
+ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
+       __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
+              khugepaged_max_ptes_swap_store);
  static struct attribute *khugepaged_attr[] = {
        &khugepaged_defrag_attr.attr,
        &khugepaged_max_ptes_none_attr.attr,
        &full_scans_attr.attr,
        &scan_sleep_millisecs_attr.attr,
        &alloc_sleep_millisecs_attr.attr,
+       &khugepaged_max_ptes_swap_attr.attr,
        NULL,
  };
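The new max_ptes_swap attribute is registered alongside the existing khugepaged knobs, so it should appear with them in sysfs. A hypothetical userspace snippet for tuning it, assuming the usual /sys/kernel/mm/transparent_hugepage/khugepaged/ location:

	#include <stdio.h>

	int main(void)
	{
		const char *path =
			"/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap";
		FILE *f = fopen(path, "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* Allow at most 16 swapped-out PTEs per candidate huge page. */
		fprintf(f, "16\n");
		return fclose(f) ? 1 : 0;
	}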
  
@@@ -638,6 -701,9 +701,9 @@@ static int __init hugepage_init(void
        err = register_shrinker(&huge_zero_page_shrinker);
        if (err)
                goto err_hzp_shrinker;
+       err = register_shrinker(&deferred_split_shrinker);
+       if (err)
+               goto err_split_shrinker;
  
        /*
         * By default disable transparent hugepages on smaller systems,
  
        return 0;
  err_khugepaged:
+       unregister_shrinker(&deferred_split_shrinker);
+ err_split_shrinker:
        unregister_shrinker(&huge_zero_page_shrinker);
  err_hzp_shrinker:
        khugepaged_slab_exit();
@@@ -711,6 -779,27 +779,27 @@@ static inline pmd_t mk_huge_pmd(struct 
        return entry;
  }
  
+ static inline struct list_head *page_deferred_list(struct page *page)
+ {
+       /*
+        * ->lru in the tail pages is occupied by compound_head.
+        * Let's use ->mapping + ->index in the second tail page as list_head.
+        */
+       return (struct list_head *)&page[2].mapping;
+ }
+ void prep_transhuge_page(struct page *page)
+ {
+       /*
+        * we use page->mapping and page->index in the second tail page
+        * as list_head: assuming THP order >= 2
+        */
+       BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+       INIT_LIST_HEAD(page_deferred_list(page));
+       set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+ }
  static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmd,
  
        VM_BUG_ON_PAGE(!PageCompound(page), page);
  
-       if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+       if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
  
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable)) {
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, true);
                put_page(page);
                return VM_FAULT_OOM;
        }
        ptl = pmd_lock(mm, pmd);
        if (unlikely(!pmd_none(*pmd))) {
                spin_unlock(ptl);
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, true);
                put_page(page);
                pte_free(mm, pgtable);
        } else {
                        int ret;
  
                        spin_unlock(ptl);
-                       mem_cgroup_cancel_charge(page, memcg);
+                       mem_cgroup_cancel_charge(page, memcg, true);
                        put_page(page);
                        pte_free(mm, pgtable);
                        ret = handle_userfault(vma, address, flags,
  
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               page_add_new_anon_rmap(page, vma, haddr);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, haddr, true);
+               mem_cgroup_commit_charge(page, memcg, false, true);
                lru_cache_add_active_or_unevictable(page, vma);
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                set_pmd_at(mm, haddr, pmd, entry);
  
  static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
  {
-       return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+       return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
  }
  
  /* Caller must hold page table lock. */
@@@ -865,6 -954,7 +954,7 @@@ int do_huge_pmd_anonymous_page(struct m
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
+       prep_transhuge_page(page);
        return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
                                            flags);
  }
@@@ -956,19 -1046,10 +1046,10 @@@ int copy_huge_pmd(struct mm_struct *dst
                goto out_unlock;
        }
  
-       if (unlikely(pmd_trans_splitting(pmd))) {
-               /* split huge page running from under us */
-               spin_unlock(src_ptl);
-               spin_unlock(dst_ptl);
-               pte_free(dst_mm, pgtable);
-               wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
-               goto out;
-       }
        src_page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
        get_page(src_page);
-       page_dup_rmap(src_page);
+       page_dup_rmap(src_page, true);
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@@ -1008,37 -1089,6 +1089,6 @@@ unlock
        spin_unlock(ptl);
  }
  
- /*
-  * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
-  * during copy_user_huge_page()'s copy_page_rep(): in the case when
-  * the source page gets split and a tail freed before copy completes.
-  * Called under pmd_lock of checked pmd, so safe from splitting itself.
-  */
- static void get_user_huge_page(struct page *page)
- {
-       if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
-               struct page *endpage = page + HPAGE_PMD_NR;
-               atomic_add(HPAGE_PMD_NR, &page->_count);
-               while (++page < endpage)
-                       get_huge_page_tail(page);
-       } else {
-               get_page(page);
-       }
- }
- static void put_user_huge_page(struct page *page)
- {
-       if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
-               struct page *endpage = page + HPAGE_PMD_NR;
-               while (page < endpage)
-                       put_page(page++);
-       } else {
-               put_page(page);
-       }
- }
  static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address,
                                               vma, address, page_to_nid(page));
                if (unlikely(!pages[i] ||
                             mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
-                                                  &memcg))) {
+                                                  &memcg, false))) {
                        if (pages[i])
                                put_page(pages[i]);
                        while (--i >= 0) {
                                memcg = (void *)page_private(pages[i]);
                                set_page_private(pages[i], 0);
-                               mem_cgroup_cancel_charge(pages[i], memcg);
+                               mem_cgroup_cancel_charge(pages[i], memcg,
+                                               false);
                                put_page(pages[i]);
                        }
                        kfree(pages);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
-               page_add_new_anon_rmap(pages[i], vma, haddr);
-               mem_cgroup_commit_charge(pages[i], memcg, false);
+               page_add_new_anon_rmap(pages[i], vma, haddr, false);
+               mem_cgroup_commit_charge(pages[i], memcg, false, false);
                lru_cache_add_active_or_unevictable(pages[i], vma);
                pte = pte_offset_map(&_pmd, haddr);
                VM_BUG_ON(!pte_none(*pte));
  
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
-       page_remove_rmap(page);
+       page_remove_rmap(page, true);
        spin_unlock(ptl);
  
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@@ -1141,7 -1192,7 +1192,7 @@@ out_free_pages
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                memcg = (void *)page_private(pages[i]);
                set_page_private(pages[i], 0);
-               mem_cgroup_cancel_charge(pages[i], memcg);
+               mem_cgroup_cancel_charge(pages[i], memcg, false);
                put_page(pages[i]);
        }
        kfree(pages);
@@@ -1171,7 -1222,17 +1222,17 @@@ int do_huge_pmd_wp_page(struct mm_struc
  
        page = pmd_page(orig_pmd);
        VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-       if (page_mapcount(page) == 1) {
+       /*
+        * We can only reuse the page if nobody else maps the huge page or its
+        * part. We can do it by checking page_mapcount() on each sub-page, but
+        * it's expensive.
+        * The cheaper way is to check page_count() to be equal to 1: every
+        * mapcount takes a page reference, so this way we can guarantee that
+        * the PMD is the only mapping.
+        * This can give false negative if somebody pinned the page, but that's
+        * fine.
+        */
+       if (page_mapcount(page) == 1 && page_count(page) == 1) {
                pmd_t entry;
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                ret |= VM_FAULT_WRITE;
                goto out_unlock;
        }
-       get_user_huge_page(page);
+       get_page(page);
        spin_unlock(ptl);
  alloc:
        if (transparent_hugepage_enabled(vma) &&
        } else
                new_page = NULL;
  
-       if (unlikely(!new_page)) {
+       if (likely(new_page)) {
+               prep_transhuge_page(new_page);
+       } else {
                if (!page) {
-                       split_huge_page_pmd(vma, address, pmd);
+                       split_huge_pmd(vma, pmd, address);
                        ret |= VM_FAULT_FALLBACK;
                } else {
                        ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
                                        pmd, orig_pmd, page, haddr);
                        if (ret & VM_FAULT_OOM) {
-                               split_huge_page(page);
+                               split_huge_pmd(vma, pmd, address);
                                ret |= VM_FAULT_FALLBACK;
                        }
-                       put_user_huge_page(page);
+                       put_page(page);
                }
                count_vm_event(THP_FAULT_FALLBACK);
                goto out;
        }
  
-       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp,
+                                       &memcg, true))) {
                put_page(new_page);
                if (page) {
-                       split_huge_page(page);
-                       put_user_huge_page(page);
+                       split_huge_pmd(vma, pmd, address);
+                       put_page(page);
                } else
-                       split_huge_page_pmd(vma, address, pmd);
+                       split_huge_pmd(vma, pmd, address);
                ret |= VM_FAULT_FALLBACK;
                count_vm_event(THP_FAULT_FALLBACK);
                goto out;
  
        spin_lock(ptl);
        if (page)
-               put_user_huge_page(page);
+               put_page(page);
        if (unlikely(!pmd_same(*pmd, orig_pmd))) {
                spin_unlock(ptl);
-               mem_cgroup_cancel_charge(new_page, memcg);
+               mem_cgroup_cancel_charge(new_page, memcg, true);
                put_page(new_page);
                goto out_mn;
        } else {
                entry = mk_huge_pmd(new_page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-               page_add_new_anon_rmap(new_page, vma, haddr);
-               mem_cgroup_commit_charge(new_page, memcg, false);
+               page_add_new_anon_rmap(new_page, vma, haddr, true);
+               mem_cgroup_commit_charge(new_page, memcg, false, true);
                lru_cache_add_active_or_unevictable(new_page, vma);
                set_pmd_at(mm, haddr, pmd, entry);
                update_mmu_cache_pmd(vma, address, pmd);
                        put_huge_zero_page();
                } else {
                        VM_BUG_ON_PAGE(!PageHead(page), page);
-                       page_remove_rmap(page);
+                       page_remove_rmap(page, true);
                        put_page(page);
                }
                ret |= VM_FAULT_WRITE;
@@@ -1307,8 -1371,21 +1371,21 @@@ struct page *follow_trans_huge_pmd(stru
                                          pmd, _pmd,  1))
                        update_mmu_cache_pmd(vma, addr, pmd);
        }
-       if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
-               if (page->mapping && trylock_page(page)) {
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /*
+                * We don't mlock() pte-mapped THPs. This way we can avoid
+                * leaking mlocked pages into non-VM_LOCKED VMAs.
+                *
+                * In most cases the pmd is the only mapping of the page as we
+                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                * writable private mappings in populate_vma_page_range().
+                *
+                * The only scenario when we have the page shared here is if we
+                * mlocking read-only mapping shared over fork(). We skip
+                * mlocking such pages.
+                */
+               if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+                               page->mapping && trylock_page(page)) {
                        lru_add_drain();
                        if (page->mapping)
                                mlock_vma_page(page);
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        VM_BUG_ON_PAGE(!PageCompound(page), page);
        if (flags & FOLL_GET)
-               get_page_foll(page);
+               get_page(page);
  
  out:
        return page;
        return 0;
  }
  
+ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                pmd_t *pmd, unsigned long addr)
+ {
+       spinlock_t *ptl;
+       struct mm_struct *mm = tlb->mm;
+       int ret = 1;
+       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+               struct page *page;
+               pmd_t orig_pmd;
+               if (is_huge_zero_pmd(*pmd))
+                       goto out;
+               orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+               /* No hugepage in swapcache */
+               page = pmd_page(orig_pmd);
+               VM_BUG_ON_PAGE(PageSwapCache(page), page);
+               orig_pmd = pmd_mkold(orig_pmd);
+               orig_pmd = pmd_mkclean(orig_pmd);
+               set_pmd_at(mm, addr, pmd, orig_pmd);
+               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ out:
+               spin_unlock(ptl);
+               ret = 0;
+       }
+       return ret;
+ }
  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
  {
        pmd_t orig_pmd;
        spinlock_t *ptl;
  
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+       if (!__pmd_trans_huge_lock(pmd, vma, &ptl))
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
                put_huge_zero_page();
        } else {
                struct page *page = pmd_page(orig_pmd);
-               page_remove_rmap(page);
+               page_remove_rmap(page, true);
                VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
                add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                VM_BUG_ON_PAGE(!PageHead(page), page);
        return 1;
  }
  
- int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                  unsigned long old_addr,
                  unsigned long new_addr, unsigned long old_end,
                  pmd_t *old_pmd, pmd_t *new_pmd)
  {
        spinlock_t *old_ptl, *new_ptl;
-       int ret = 0;
        pmd_t pmd;
  
        struct mm_struct *mm = vma->vm_mm;
            (new_addr & ~HPAGE_PMD_MASK) ||
            old_end - old_addr < HPAGE_PMD_SIZE ||
            (new_vma->vm_flags & VM_NOHUGEPAGE))
-               goto out;
+               return false;
  
        /*
         * The destination pmd shouldn't be established, free_pgtables()
         */
        if (WARN_ON(!pmd_none(*new_pmd))) {
                VM_BUG_ON(pmd_trans_huge(*new_pmd));
-               goto out;
+               return false;
        }
  
        /*
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_sem prevents deadlock.
         */
-       ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
-       if (ret == 1) {
+       if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) {
                new_ptl = pmd_lockptr(mm, new_pmd);
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
                if (new_ptl != old_ptl)
                        spin_unlock(new_ptl);
                spin_unlock(old_ptl);
+               return true;
        }
- out:
-       return ret;
+       return false;
  }
  
  /*
@@@ -1558,7 -1667,7 +1667,7 @@@ int change_huge_pmd(struct vm_area_stru
        spinlock_t *ptl;
        int ret = 0;
  
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl)) {
                pmd_t entry;
                bool preserve_write = prot_numa && pmd_write(*pmd);
                ret = 1;
  }
  
  /*
-  * Returns 1 if a given pmd maps a stable (not under splitting) thp.
-  * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
+  * Returns true if a given pmd maps a thp, false otherwise.
   *
-  * Note that if it returns 1, this routine returns without unlocking page
-  * table locks. So callers must unlock them.
+  * Note that if it returns true, this routine returns without unlocking page
+  * table lock. So callers must unlock it.
   */
- int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
+ bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
                spinlock_t **ptl)
  {
        *ptl = pmd_lock(vma->vm_mm, pmd);
-       if (likely(pmd_trans_huge(*pmd))) {
-               if (unlikely(pmd_trans_splitting(*pmd))) {
-                       spin_unlock(*ptl);
-                       wait_split_huge_page(vma->anon_vma, pmd);
-                       return -1;
-               } else {
-                       /* Thp mapped by 'pmd' is stable, so we can
-                        * handle it as it is. */
-                       return 1;
-               }
-       }
+       if (likely(pmd_trans_huge(*pmd)))
+               return true;
        spin_unlock(*ptl);
-       return 0;
+       return false;
  }
  
  /*
  pmd_t *page_check_address_pmd(struct page *page,
                              struct mm_struct *mm,
                              unsigned long address,
-                             enum page_check_address_pmd_flag flag,
                              spinlock_t **ptl)
  {
        pgd_t *pgd;
                goto unlock;
        if (pmd_page(*pmd) != page)
                goto unlock;
-       /*
-        * split_vma() may create temporary aliased mappings. There is
-        * no risk as long as all huge pmd are found and have their
-        * splitting bit set before __split_huge_page_refcount
-        * runs. Finding the same huge pmd more than once during the
-        * same rmap walk is not a problem.
-        */
-       if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
-           pmd_trans_splitting(*pmd))
-               goto unlock;
-       if (pmd_trans_huge(*pmd)) {
-               VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
-                         !pmd_trans_splitting(*pmd));
+       if (pmd_trans_huge(*pmd))
                return pmd;
-       }
  unlock:
        spin_unlock(*ptl);
        return NULL;
  }
  
- static int __split_huge_page_splitting(struct page *page,
-                                      struct vm_area_struct *vma,
-                                      unsigned long address)
- {
-       struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
-       pmd_t *pmd;
-       int ret = 0;
-       /* For mmu_notifiers */
-       const unsigned long mmun_start = address;
-       const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       pmd = page_check_address_pmd(page, mm, address,
-                       PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
-       if (pmd) {
-               /*
-                * We can't temporarily set the pmd to null in order
-                * to split it, the pmd must remain marked huge at all
-                * times or the VM won't take the pmd_trans_huge paths
-                * and it won't wait on the anon_vma->root->rwsem to
-                * serialize against split_huge_page*.
-                */
-               pmdp_splitting_flush(vma, address, pmd);
-               ret = 1;
-               spin_unlock(ptl);
-       }
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-       return ret;
- }
- static void __split_huge_page_refcount(struct page *page,
-                                      struct list_head *list)
- {
-       int i;
-       struct zone *zone = page_zone(page);
-       struct lruvec *lruvec;
-       int tail_count = 0;
-       /* prevent PageLRU to go away from under us, and freeze lru stats */
-       spin_lock_irq(&zone->lru_lock);
-       lruvec = mem_cgroup_page_lruvec(page, zone);
-       compound_lock(page);
-       /* complete memcg works before add pages to LRU */
-       mem_cgroup_split_huge_fixup(page);
-       for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
-               struct page *page_tail = page + i;
-               /* tail_page->_mapcount cannot change */
-               BUG_ON(page_mapcount(page_tail) < 0);
-               tail_count += page_mapcount(page_tail);
-               /* check for overflow */
-               BUG_ON(tail_count < 0);
-               BUG_ON(atomic_read(&page_tail->_count) != 0);
-               /*
-                * tail_page->_count is zero and not changing from
-                * under us. But get_page_unless_zero() may be running
-                * from under us on the tail_page. If we used
-                * atomic_set() below instead of atomic_add(), we
-                * would then run atomic_set() concurrently with
-                * get_page_unless_zero(), and atomic_set() is
-                * implemented in C not using locked ops. spin_unlock
-                * on x86 sometime uses locked ops because of PPro
-                * errata 66, 92, so unless somebody can guarantee
-                * atomic_set() here would be safe on all archs (and
-                * not only on x86), it's safer to use atomic_add().
-                */
-               atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
-                          &page_tail->_count);
-               /* after clearing PageTail the gup refcount can be released */
-               smp_mb__after_atomic();
-               page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-               page_tail->flags |= (page->flags &
-                                    ((1L << PG_referenced) |
-                                     (1L << PG_swapbacked) |
-                                     (1L << PG_mlocked) |
-                                     (1L << PG_uptodate) |
-                                     (1L << PG_active) |
-                                     (1L << PG_unevictable)));
-               page_tail->flags |= (1L << PG_dirty);
-               /* clear PageTail before overwriting first_page */
-               smp_wmb();
-               if (page_is_young(page))
-                       set_page_young(page_tail);
-               if (page_is_idle(page))
-                       set_page_idle(page_tail);
-               /*
-                * __split_huge_page_splitting() already set the
-                * splitting bit in all pmd that could map this
-                * hugepage, that will ensure no CPU can alter the
-                * mapcount on the head page. The mapcount is only
-                * accounted in the head page and it has to be
-                * transferred to all tail pages in the below code. So
-                * for this code to be safe, the split the mapcount
-                * can't change. But that doesn't mean userland can't
-                * keep changing and reading the page contents while
-                * we transfer the mapcount, so the pmd splitting
-                * status is achieved setting a reserved bit in the
-                * pmd, not by clearing the present bit.
-               */
-               page_tail->_mapcount = page->_mapcount;
-               BUG_ON(page_tail->mapping);
-               page_tail->mapping = page->mapping;
-               page_tail->index = page->index + i;
-               page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-               BUG_ON(!PageAnon(page_tail));
-               BUG_ON(!PageUptodate(page_tail));
-               BUG_ON(!PageDirty(page_tail));
-               BUG_ON(!PageSwapBacked(page_tail));
-               lru_add_page_tail(page, page_tail, lruvec, list);
-       }
-       atomic_sub(tail_count, &page->_count);
-       BUG_ON(atomic_read(&page->_count) <= 0);
-       __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-       ClearPageCompound(page);
-       compound_unlock(page);
-       spin_unlock_irq(&zone->lru_lock);
-       for (i = 1; i < HPAGE_PMD_NR; i++) {
-               struct page *page_tail = page + i;
-               BUG_ON(page_count(page_tail) <= 0);
-               /*
-                * Tail pages may be freed if there wasn't any mapping
-                * like if add_to_swap() is running on a lru page that
-                * had its mapping zapped. And freeing these pages
-                * requires taking the lru_lock so we do the put_page
-                * of the tail pages after the split is complete.
-                */
-               put_page(page_tail);
-       }
-       /*
-        * Only the head page (now become a regular page) is required
-        * to be pinned by the caller.
-        */
-       BUG_ON(page_count(page) <= 0);
- }
- static int __split_huge_page_map(struct page *page,
-                                struct vm_area_struct *vma,
-                                unsigned long address)
- {
-       struct mm_struct *mm = vma->vm_mm;
-       spinlock_t *ptl;
-       pmd_t *pmd, _pmd;
-       int ret = 0, i;
-       pgtable_t pgtable;
-       unsigned long haddr;
-       pmd = page_check_address_pmd(page, mm, address,
-                       PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
-       if (pmd) {
-               pgtable = pgtable_trans_huge_withdraw(mm, pmd);
-               pmd_populate(mm, &_pmd, pgtable);
-               if (pmd_write(*pmd))
-                       BUG_ON(page_mapcount(page) != 1);
-               haddr = address;
-               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-                       pte_t *pte, entry;
-                       BUG_ON(PageCompound(page+i));
-                       /*
-                        * Note that NUMA hinting access restrictions are not
-                        * transferred to avoid any possibility of altering
-                        * permissions across VMAs.
-                        */
-                       entry = mk_pte(page + i, vma->vm_page_prot);
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-                       if (!pmd_write(*pmd))
-                               entry = pte_wrprotect(entry);
-                       if (!pmd_young(*pmd))
-                               entry = pte_mkold(entry);
-                       pte = pte_offset_map(&_pmd, haddr);
-                       BUG_ON(!pte_none(*pte));
-                       set_pte_at(mm, haddr, pte, entry);
-                       pte_unmap(pte);
-               }
-               smp_wmb(); /* make pte visible before pmd */
-               /*
-                * Up to this point the pmd is present and huge and
-                * userland has the whole access to the hugepage
-                * during the split (which happens in place). If we
-                * overwrite the pmd with the not-huge version
-                * pointing to the pte here (which of course we could
-                * if all CPUs were bug free), userland could trigger
-                * a small page size TLB miss on the small sized TLB
-                * while the hugepage TLB entry is still established
-                * in the huge TLB. Some CPU doesn't like that. See
-                * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
-                * Erratum 383 on page 93. Intel should be safe but is
-                * also warns that it's only safe if the permission
-                * and cache attributes of the two entries loaded in
-                * the two TLB is identical (which should be the case
-                * here). But it is generally safer to never allow
-                * small and huge TLB entries for the same virtual
-                * address to be loaded simultaneously. So instead of
-                * doing "pmd_populate(); flush_pmd_tlb_range();" we first
-                * mark the current pmd notpresent (atomically because
-                * here the pmd_trans_huge and pmd_trans_splitting
-                * must remain set at all times on the pmd until the
-                * split is complete for this pmd), then we flush the
-                * SMP TLB and finally we write the non-huge version
-                * of the pmd entry with pmd_populate.
-                */
-               pmdp_invalidate(vma, address, pmd);
-               pmd_populate(mm, pmd, pgtable);
-               ret = 1;
-               spin_unlock(ptl);
-       }
-       return ret;
- }
- /* must be called with anon_vma->root->rwsem held */
- static void __split_huge_page(struct page *page,
-                             struct anon_vma *anon_vma,
-                             struct list_head *list)
- {
-       int mapcount, mapcount2;
-       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-       struct anon_vma_chain *avc;
-       BUG_ON(!PageHead(page));
-       BUG_ON(PageTail(page));
-       mapcount = 0;
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-               struct vm_area_struct *vma = avc->vma;
-               unsigned long addr = vma_address(page, vma);
-               BUG_ON(is_vma_temporary_stack(vma));
-               mapcount += __split_huge_page_splitting(page, vma, addr);
-       }
-       /*
-        * It is critical that new vmas are added to the tail of the
-        * anon_vma list. This guarantes that if copy_huge_pmd() runs
-        * and establishes a child pmd before
-        * __split_huge_page_splitting() freezes the parent pmd (so if
-        * we fail to prevent copy_huge_pmd() from running until the
-        * whole __split_huge_page() is complete), we will still see
-        * the newly established pmd of the child later during the
-        * walk, to be able to set it as pmd_trans_splitting too.
-        */
-       if (mapcount != page_mapcount(page)) {
-               pr_err("mapcount %d page_mapcount %d\n",
-                       mapcount, page_mapcount(page));
-               BUG();
-       }
-       __split_huge_page_refcount(page, list);
-       mapcount2 = 0;
-       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
-               struct vm_area_struct *vma = avc->vma;
-               unsigned long addr = vma_address(page, vma);
-               BUG_ON(is_vma_temporary_stack(vma));
-               mapcount2 += __split_huge_page_map(page, vma, addr);
-       }
-       if (mapcount != mapcount2) {
-               pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
-                       mapcount, mapcount2, page_mapcount(page));
-               BUG();
-       }
- }
- /*
-  * Split a hugepage into normal pages. This doesn't change the position of head
-  * page. If @list is null, tail pages will be added to LRU list, otherwise, to
-  * @list. Both head page and tail pages will inherit mapping, flags, and so on
-  * from the hugepage.
-  * Return 0 if the hugepage is split successfully otherwise return 1.
-  */
- int split_huge_page_to_list(struct page *page, struct list_head *list)
- {
-       struct anon_vma *anon_vma;
-       int ret = 1;
-       BUG_ON(is_huge_zero_page(page));
-       BUG_ON(!PageAnon(page));
-       /*
-        * The caller does not necessarily hold an mmap_sem that would prevent
-        * the anon_vma disappearing so we first we take a reference to it
-        * and then lock the anon_vma for write. This is similar to
-        * page_lock_anon_vma_read except the write lock is taken to serialise
-        * against parallel split or collapse operations.
-        */
-       anon_vma = page_get_anon_vma(page);
-       if (!anon_vma)
-               goto out;
-       anon_vma_lock_write(anon_vma);
-       ret = 0;
-       if (!PageCompound(page))
-               goto out_unlock;
-       BUG_ON(!PageSwapBacked(page));
-       __split_huge_page(page, anon_vma, list);
-       count_vm_event(THP_SPLIT);
-       BUG_ON(PageCompound(page));
- out_unlock:
-       anon_vma_unlock_write(anon_vma);
-       put_anon_vma(anon_vma);
- out:
-       return ret;
- }
  #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
  
  int hugepage_madvise(struct vm_area_struct *vma,
@@@ -2199,26 -1961,33 +1961,33 @@@ static int __collapse_huge_page_isolate
                                        unsigned long address,
                                        pte_t *pte)
  {
-       struct page *page;
+       struct page *page = NULL;
        pte_t *_pte;
-       int none_or_zero = 0;
+       int none_or_zero = 0, result = 0;
        bool referenced = false, writable = false;
        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none)
+                           ++none_or_zero <= khugepaged_max_ptes_none) {
                                continue;
-                       else
+                       } else {
+                               result = SCAN_EXCEED_NONE_PTE;
                                goto out;
+                       }
                }
-               if (!pte_present(pteval))
+               if (!pte_present(pteval)) {
+                       result = SCAN_PTE_NON_PRESENT;
                        goto out;
+               }
                page = vm_normal_page(vma, address, pteval);
-               if (unlikely(!page))
+               if (unlikely(!page)) {
+                       result = SCAN_PAGE_NULL;
                        goto out;
+               }
  
                VM_BUG_ON_PAGE(PageCompound(page), page);
                VM_BUG_ON_PAGE(!PageAnon(page), page);
                 * is needed to serialize against split_huge_page
                 * when invoked from the VM.
                 */
-               if (!trylock_page(page))
+               if (!trylock_page(page)) {
+                       result = SCAN_PAGE_LOCK;
                        goto out;
+               }
  
                /*
                 * cannot use mapcount: can't collapse if there's a gup pin.
                 */
                if (page_count(page) != 1 + !!PageSwapCache(page)) {
                        unlock_page(page);
+                       result = SCAN_PAGE_COUNT;
                        goto out;
                }
                if (pte_write(pteval)) {
                } else {
                        if (PageSwapCache(page) && !reuse_swap_page(page)) {
                                unlock_page(page);
+                               result = SCAN_SWAP_CACHE_PAGE;
                                goto out;
                        }
                        /*
                 */
                if (isolate_lru_page(page)) {
                        unlock_page(page);
+                       result = SCAN_DEL_PAGE_LRU;
                        goto out;
                }
                /* 0 stands for page_is_file_cache(page) == false */
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
-       if (likely(referenced && writable))
-               return 1;
+       if (likely(writable)) {
+               if (likely(referenced)) {
+                       result = SCAN_SUCCEED;
+                       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                                                           referenced, writable, result);
+                       return 1;
+               }
+       } else {
+               result = SCAN_PAGE_RO;
+       }
  out:
        release_pte_pages(pte, _pte);
+       trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+                                           referenced, writable, result);
        return 0;
  }
  
@@@ -2322,7 -2107,7 +2107,7 @@@ static void __collapse_huge_page_copy(p
                         * superfluous.
                         */
                        pte_clear(vma->vm_mm, address, _pte);
-                       page_remove_rmap(src_page);
+                       page_remove_rmap(src_page, false);
                        spin_unlock(ptl);
                        free_page_and_swap_cache(src_page);
                }
@@@ -2433,6 -2218,7 +2218,7 @@@ khugepaged_alloc_page(struct page **hpa
                return NULL;
        }
  
+       prep_transhuge_page(*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return *hpage;
  }
@@@ -2444,8 -2230,12 +2230,12 @@@ static int khugepaged_find_target_node(
  
  static inline struct page *alloc_hugepage(int defrag)
  {
-       return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-                          HPAGE_PMD_ORDER);
+       struct page *page;
+       page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+       if (page)
+               prep_transhuge_page(page);
+       return page;
  }
  
  static struct page *khugepaged_alloc_hugepage(bool *wait)
@@@ -2496,7 -2286,6 +2286,6 @@@ static bool hugepage_vma_check(struct v
        if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
            (vma->vm_flags & VM_NOHUGEPAGE))
                return false;
        if (!vma->anon_vma || vma->vm_ops)
                return false;
        if (is_vma_temporary_stack(vma))
        return true;
  }
  
+ /*
+  * Bring missing pages in from swap, to complete THP collapse.
+  * Only done if khugepaged_scan_pmd believes it is worthwhile.
+  *
+  * Called and returns without pte mapped or spinlocks held,
+  * but with mmap_sem held to protect against vma changes.
+  */
+ static void __collapse_huge_page_swapin(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       unsigned long address, pmd_t *pmd)
+ {
+       unsigned long _address;
+       pte_t *pte, pteval;
+       int swapped_in = 0, ret = 0;
+       pte = pte_offset_map(pmd, address);
+       for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE;
+            pte++, _address += PAGE_SIZE) {
+               pteval = *pte;
+               if (!is_swap_pte(pteval))
+                       continue;
+               swapped_in++;
+               ret = do_swap_page(mm, vma, _address, pte, pmd,
+                                  FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_RETRY_NOWAIT,
+                                  pteval);
+               if (ret & VM_FAULT_ERROR) {
+                       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 0);
+                       return;
+               }
+               /* pte is unmapped now, we need to map it */
+               pte = pte_offset_map(pmd, _address);
+       }
+       pte--;
+       pte_unmap(pte);
+       trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
+ }
  static void collapse_huge_page(struct mm_struct *mm,
                                   unsigned long address,
                                   struct page **hpage,
        pgtable_t pgtable;
        struct page *new_page;
        spinlock_t *pmd_ptl, *pte_ptl;
-       int isolated;
+       int isolated = 0, result = 0;
        unsigned long hstart, hend;
        struct mem_cgroup *memcg;
        unsigned long mmun_start;       /* For mmu_notifiers */
  
        /* release the mmap_sem read lock. */
        new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
-       if (!new_page)
-               return;
+       if (!new_page) {
+               result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+               goto out_nolock;
+       }
  
-       if (unlikely(mem_cgroup_try_charge(new_page, mm,
-                                          gfp, &memcg)))
-               return;
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+               result = SCAN_CGROUP_CHARGE_FAIL;
+               goto out_nolock;
+       }
  
        /*
         * Prevent all access to pagetables with the exception of
         * handled by the anon_vma lock + PG_lock.
         */
        down_write(&mm->mmap_sem);
-       if (unlikely(khugepaged_test_exit(mm)))
+       if (unlikely(khugepaged_test_exit(mm))) {
+               result = SCAN_ANY_PROCESS;
                goto out;
+       }
  
        vma = find_vma(mm, address);
-       if (!vma)
+       if (!vma) {
+               result = SCAN_VMA_NULL;
                goto out;
+       }
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
-       if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+       if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+               result = SCAN_ADDRESS_RANGE;
                goto out;
-       if (!hugepage_vma_check(vma))
+       }
+       if (!hugepage_vma_check(vma)) {
+               result = SCAN_VMA_CHECK;
                goto out;
+       }
        pmd = mm_find_pmd(mm, address);
-       if (!pmd)
+       if (!pmd) {
+               result = SCAN_PMD_NULL;
                goto out;
+       }
+       __collapse_huge_page_swapin(mm, vma, address, pmd);
  
        anon_vma_lock_write(vma->anon_vma);
  
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
+               result = SCAN_FAIL;
                goto out;
        }
  
  
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
-       page_add_new_anon_rmap(new_page, vma, address);
-       mem_cgroup_commit_charge(new_page, memcg, false);
+       page_add_new_anon_rmap(new_page, vma, address, true);
+       mem_cgroup_commit_charge(new_page, memcg, false, true);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        *hpage = NULL;
  
        khugepaged_pages_collapsed++;
+       result = SCAN_SUCCEED;
  out_up_write:
        up_write(&mm->mmap_sem);
+ out_nolock:
+       trace_mm_collapse_huge_page(mm, isolated, result);
        return;
  out:
-       mem_cgroup_cancel_charge(new_page, memcg);
+       mem_cgroup_cancel_charge(new_page, memcg, true);
        goto out_up_write;
  }
  
@@@ -2648,39 -2493,62 +2493,62 @@@ static int khugepaged_scan_pmd(struct m
  {
        pmd_t *pmd;
        pte_t *pte, *_pte;
-       int ret = 0, none_or_zero = 0;
-       struct page *page;
+       int ret = 0, none_or_zero = 0, result = 0;
+       struct page *page = NULL;
        unsigned long _address;
        spinlock_t *ptl;
-       int node = NUMA_NO_NODE;
+       int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false, referenced = false;
  
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
  
        pmd = mm_find_pmd(mm, address);
-       if (!pmd)
+       if (!pmd) {
+               result = SCAN_PMD_NULL;
                goto out;
+       }
  
        memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
+               if (is_swap_pte(pteval)) {
+                       if (++unmapped <= khugepaged_max_ptes_swap) {
+                               continue;
+                       } else {
+                               result = SCAN_EXCEED_SWAP_PTE;
+                               goto out_unmap;
+                       }
+               }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        if (!userfaultfd_armed(vma) &&
-                           ++none_or_zero <= khugepaged_max_ptes_none)
+                           ++none_or_zero <= khugepaged_max_ptes_none) {
                                continue;
-                       else
+                       } else {
+                               result = SCAN_EXCEED_NONE_PTE;
                                goto out_unmap;
+                       }
                }
-               if (!pte_present(pteval))
+               if (!pte_present(pteval)) {
+                       result = SCAN_PTE_NON_PRESENT;
                        goto out_unmap;
+               }
                if (pte_write(pteval))
                        writable = true;
  
                page = vm_normal_page(vma, _address, pteval);
-               if (unlikely(!page))
+               if (unlikely(!page)) {
+                       result = SCAN_PAGE_NULL;
+                       goto out_unmap;
+               }
+               /* TODO: teach khugepaged to collapse THP mapped with pte */
+               if (PageCompound(page)) {
+                       result = SCAN_PAGE_COMPOUND;
                        goto out_unmap;
+               }
                /*
                 * Record which node the original page is from and save this
                 * information to khugepaged_node_load[].
                 * hit record.
                 */
                node = page_to_nid(page);
-               if (khugepaged_scan_abort(node))
+               if (khugepaged_scan_abort(node)) {
+                       result = SCAN_SCAN_ABORT;
                        goto out_unmap;
+               }
                khugepaged_node_load[node]++;
-               VM_BUG_ON_PAGE(PageCompound(page), page);
-               if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+               if (!PageLRU(page)) {
+                       result = SCAN_SCAN_ABORT;
+                       goto out_unmap;
+               }
+               if (PageLocked(page)) {
+                       result = SCAN_PAGE_LOCK;
+                       goto out_unmap;
+               }
+               if (!PageAnon(page)) {
+                       result = SCAN_PAGE_ANON;
                        goto out_unmap;
+               }
                /*
                 * cannot use mapcount: can't collapse if there's a gup pin.
                 * The page must only be referenced by the scanned process
                 * and page swap cache.
                 */
-               if (page_count(page) != 1 + !!PageSwapCache(page))
+               if (page_count(page) != 1 + !!PageSwapCache(page)) {
+                       result = SCAN_PAGE_COUNT;
                        goto out_unmap;
+               }
                if (pte_young(pteval) ||
                    page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
-       if (referenced && writable)
-               ret = 1;
+       if (writable) {
+               if (referenced) {
+                       result = SCAN_SUCCEED;
+                       ret = 1;
+               } else {
+                       result = SCAN_NO_REFERENCED_PAGE;
+               }
+       } else {
+               result = SCAN_PAGE_RO;
+       }
  out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (ret) {
                collapse_huge_page(mm, address, hpage, vma, node);
        }
  out:
+       trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+                                    none_or_zero, result, unmapped);
        return ret;
  }
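
khugepaged_scan_pmd() now tolerates a bounded number of swapped-out PTEs (khugepaged_max_ptes_swap) on top of the existing none/zero-PTE budget before it refuses to collapse the range. The self-contained model below shows only that counting; the thresholds are assumptions (the real values are sysfs tunables) and the PTE classification is simplified.

/* Self-contained model of the PTE budgeting only; the thresholds below are
 * assumptions (the real ones are sysfs tunables) and the PTE classification
 * is simplified. */
#include <stdio.h>

#define HPAGE_PMD_NR 512

enum pte_kind { PTE_PRESENT, PTE_NONE, PTE_SWAP };
enum scan_result { SCAN_SUCCEED, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_NONE_PTE };

static int max_ptes_swap = 64;                  /* assumed default */
static int max_ptes_none = HPAGE_PMD_NR - 1;    /* assumed default */

static enum scan_result scan_model(const enum pte_kind pte[HPAGE_PMD_NR])
{
	int none_or_zero = 0, unmapped = 0;

	for (int i = 0; i < HPAGE_PMD_NR; i++) {
		if (pte[i] == PTE_SWAP && ++unmapped > max_ptes_swap)
			return SCAN_EXCEED_SWAP_PTE;
		if (pte[i] == PTE_NONE && ++none_or_zero > max_ptes_none)
			return SCAN_EXCEED_NONE_PTE;
	}
	return SCAN_SUCCEED;    /* few enough holes: worth collapsing */
}

int main(void)
{
	enum pte_kind pte[HPAGE_PMD_NR] = { PTE_PRESENT };

	for (int i = 0; i < 100; i++)
		pte[i] = PTE_SWAP;                  /* 100 swapped-out PTEs > 64 */
	printf("result = %d\n", scan_model(pte));   /* 1: SCAN_EXCEED_SWAP_PTE */
	return 0;
}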
  
@@@ -2941,8 -2833,8 +2833,8 @@@ static void __split_huge_zero_page_pmd(
        pmd_t _pmd;
        int i;
  
-       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
  
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
        put_huge_zero_page();
  }
  
- void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
-               pmd_t *pmd)
+ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long haddr, bool freeze)
  {
-       spinlock_t *ptl;
-       struct page *page = NULL;
        struct mm_struct *mm = vma->vm_mm;
-       unsigned long haddr = address & HPAGE_PMD_MASK;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
+       struct page *page;
+       pgtable_t pgtable;
+       pmd_t _pmd;
+       bool young, write;
+       int i;
  
-       BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+       VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+       VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+       VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+       VM_BUG_ON(!pmd_trans_huge(*pmd));
+       count_vm_event(THP_SPLIT_PMD);
  
-       mmun_start = haddr;
-       mmun_end   = haddr + HPAGE_PMD_SIZE;
- again:
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-       ptl = pmd_lock(mm, pmd);
-       if (unlikely(!pmd_trans_huge(*pmd)))
-               goto unlock;
        if (vma_is_dax(vma)) {
                pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
                if (is_huge_zero_pmd(_pmd))
                        put_huge_zero_page();
+               return;
        } else if (is_huge_zero_pmd(*pmd)) {
-               __split_huge_zero_page_pmd(vma, haddr, pmd);
-       } else {
-               page = pmd_page(*pmd);
-               VM_BUG_ON_PAGE(!page_count(page), page);
-               get_page(page);
+               return __split_huge_zero_page_pmd(vma, haddr, pmd);
        }
-  unlock:
-       spin_unlock(ptl);
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
  
-       if (!page)
-               return;
+       page = pmd_page(*pmd);
+       VM_BUG_ON_PAGE(!page_count(page), page);
+       atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+       write = pmd_write(*pmd);
+       young = pmd_young(*pmd);
  
-       split_huge_page(page);
-       put_page(page);
+       /* leave pmd empty until pte is filled */
+       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+       pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+       pmd_populate(mm, &_pmd, pgtable);
+       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+               pte_t entry, *pte;
+               /*
+                * Note that NUMA hinting access restrictions are not
+                * transferred to avoid any possibility of altering
+                * permissions across VMAs.
+                */
+               if (freeze) {
+                       swp_entry_t swp_entry;
+                       swp_entry = make_migration_entry(page + i, write);
+                       entry = swp_entry_to_pte(swp_entry);
+               } else {
+                       entry = mk_pte(page + i, vma->vm_page_prot);
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+                       if (!write)
+                               entry = pte_wrprotect(entry);
+                       if (!young)
+                               entry = pte_mkold(entry);
+               }
+               pte = pte_offset_map(&_pmd, haddr);
+               BUG_ON(!pte_none(*pte));
+               set_pte_at(mm, haddr, pte, entry);
+               atomic_inc(&page[i]._mapcount);
+               pte_unmap(pte);
+       }
  
        /*
-        * We don't always have down_write of mmap_sem here: a racing
-        * do_huge_pmd_wp_page() might have copied-on-write to another
-        * huge page before our split_huge_page() got the anon_vma lock.
+        * Set PG_double_map before dropping compound_mapcount to avoid
+        * false-negative page_mapped().
         */
-       if (unlikely(pmd_trans_huge(*pmd)))
-               goto again;
+       if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+               for (i = 0; i < HPAGE_PMD_NR; i++)
+                       atomic_inc(&page[i]._mapcount);
+       }
+       if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+               /* Last compound_mapcount is gone. */
+               __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+               if (TestClearPageDoubleMap(page)) {
+                       /* No need in mapcount reference anymore */
+                       for (i = 0; i < HPAGE_PMD_NR; i++)
+                               atomic_dec(&page[i]._mapcount);
+               }
+       }
+       smp_wmb(); /* make pte visible before pmd */
+       pmd_populate(mm, pmd, pgtable);
  }
  
- void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
-               pmd_t *pmd)
+ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+               unsigned long address)
  {
-       struct vm_area_struct *vma;
+       spinlock_t *ptl;
+       struct mm_struct *mm = vma->vm_mm;
+       struct page *page = NULL;
+       unsigned long haddr = address & HPAGE_PMD_MASK;
  
-       vma = find_vma(mm, address);
-       BUG_ON(vma == NULL);
-       split_huge_page_pmd(vma, address, pmd);
+       mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       ptl = pmd_lock(mm, pmd);
+       if (unlikely(!pmd_trans_huge(*pmd)))
+               goto out;
+       page = pmd_page(*pmd);
+       __split_huge_pmd_locked(vma, pmd, haddr, false);
+       if (PageMlocked(page))
+               get_page(page);
+       else
+               page = NULL;
+ out:
+       spin_unlock(ptl);
+       mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       if (page) {
+               lock_page(page);
+               munlock_vma_page(page);
+               unlock_page(page);
+               put_page(page);
+       }
  }
  
- static void split_huge_page_address(struct mm_struct *mm,
+ static void split_huge_pmd_address(struct vm_area_struct *vma,
                                    unsigned long address)
  {
        pgd_t *pgd;
  
        VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
  
-       pgd = pgd_offset(mm, address);
+       pgd = pgd_offset(vma->vm_mm, address);
        if (!pgd_present(*pgd))
                return;
  
                return;
  
        pmd = pmd_offset(pud, address);
-       if (!pmd_present(*pmd))
+       if (!pmd_present(*pmd) || !pmd_trans_huge(*pmd))
                return;
        /*
         * Caller holds the mmap_sem write mode, so a huge pmd cannot
         * materialize from under us.
         */
-       split_huge_page_pmd_mm(mm, address, pmd);
+       split_huge_pmd(vma, pmd, address);
  }
  
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
        if (start & ~HPAGE_PMD_MASK &&
            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_page_address(vma->vm_mm, start);
+               split_huge_pmd_address(vma, start);
  
        /*
         * If the new end address isn't hpage aligned and it could
        if (end & ~HPAGE_PMD_MASK &&
            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_page_address(vma->vm_mm, end);
+               split_huge_pmd_address(vma, end);
  
        /*
         * If we're also updating the vma->vm_next->vm_start, if the new
                if (nstart & ~HPAGE_PMD_MASK &&
                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-                       split_huge_page_address(next->vm_mm, nstart);
+                       split_huge_pmd_address(next, nstart);
+       }
+ }
+ static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+ {
+       spinlock_t *ptl;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       int i;
+       pgd = pgd_offset(vma->vm_mm, address);
+       if (!pgd_present(*pgd))
+               return;
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return;
+       pmd = pmd_offset(pud, address);
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       if (!pmd_present(*pmd)) {
+               spin_unlock(ptl);
+               return;
+       }
+       if (pmd_trans_huge(*pmd)) {
+               if (page == pmd_page(*pmd))
+                       __split_huge_pmd_locked(vma, pmd, address, true);
+               spin_unlock(ptl);
+               return;
+       }
+       spin_unlock(ptl);
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+               pte_t entry, swp_pte;
+               swp_entry_t swp_entry;
+               if (!pte_present(pte[i]))
+                       continue;
+               if (page_to_pfn(page) != pte_pfn(pte[i]))
+                       continue;
+               flush_cache_page(vma, address, page_to_pfn(page));
+               entry = ptep_clear_flush(vma, address, pte + i);
+               swp_entry = make_migration_entry(page, pte_write(entry));
+               swp_pte = swp_entry_to_pte(swp_entry);
+               if (pte_soft_dirty(entry))
+                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+               set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+       }
+       pte_unmap_unlock(pte, ptl);
+ }
+ static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+ {
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+       VM_BUG_ON_PAGE(!PageHead(page), page);
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+                       pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long haddr;
+               haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
+               freeze_page_vma(avc->vma, page, haddr);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
+       }
+ }
+ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+               unsigned long address)
+ {
+       spinlock_t *ptl;
+       pmd_t *pmd;
+       pte_t *pte, entry;
+       swp_entry_t swp_entry;
+       int i;
+       pmd = mm_find_pmd(vma->vm_mm, address);
+       if (!pmd)
+               return;
+       pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+       for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
+               if (!page_mapped(page))
+                       continue;
+               if (!is_swap_pte(pte[i]))
+                       continue;
+               swp_entry = pte_to_swp_entry(pte[i]);
+               if (!is_migration_entry(swp_entry))
+                       continue;
+               if (migration_entry_to_page(swp_entry) != page)
+                       continue;
+               entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+               entry = pte_mkdirty(entry);
+               if (is_write_migration_entry(swp_entry))
+                       entry = maybe_mkwrite(entry, vma);
+               flush_dcache_page(page);
+               set_pte_at(vma->vm_mm, address, pte + i, entry);
+               /* No need to invalidate - it was non-present before */
+               update_mmu_cache(vma, address, pte + i);
+       }
+       pte_unmap_unlock(pte, ptl);
+ }
+ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+ {
+       struct anon_vma_chain *avc;
+       pgoff_t pgoff = page_to_pgoff(page);
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+                       pgoff, pgoff + HPAGE_PMD_NR - 1) {
+               unsigned long address = __vma_address(page, avc->vma);
+               mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
+               unfreeze_page_vma(avc->vma, page, address);
+               mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+                               address, address + HPAGE_PMD_SIZE);
        }
  }
+ static int total_mapcount(struct page *page)
+ {
+       int i, ret;
+       ret = compound_mapcount(page);
+       for (i = 0; i < HPAGE_PMD_NR; i++)
+               ret += atomic_read(&page[i]._mapcount) + 1;
+       if (PageDoubleMap(page))
+               ret -= HPAGE_PMD_NR;
+       return ret;
+ }
+ static int __split_huge_page_tail(struct page *head, int tail,
+               struct lruvec *lruvec, struct list_head *list)
+ {
+       int mapcount;
+       struct page *page_tail = head + tail;
+       mapcount = atomic_read(&page_tail->_mapcount) + 1;
+       VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+       /*
+        * tail_page->_count is zero and not changing from under us. But
+        * get_page_unless_zero() may be running from under us on the
+        * tail_page. If we used atomic_set() below instead of atomic_add(), we
+        * would then run atomic_set() concurrently with
+        * get_page_unless_zero(), and atomic_set() is implemented in C not
+        * using locked ops. spin_unlock on x86 sometimes uses locked ops
+        * because of PPro errata 66, 92, so unless somebody can guarantee
+        * atomic_set() here would be safe on all archs (and not only on x86),
+        * it's safer to use atomic_add().
+        */
+       atomic_add(mapcount + 1, &page_tail->_count);
+       /* after clearing PageTail the gup refcount can be released */
+       smp_mb__after_atomic();
+       page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+       page_tail->flags |= (head->flags &
+                       ((1L << PG_referenced) |
+                        (1L << PG_swapbacked) |
+                        (1L << PG_mlocked) |
+                        (1L << PG_uptodate) |
+                        (1L << PG_active) |
+                        (1L << PG_locked) |
+                        (1L << PG_unevictable)));
+       page_tail->flags |= (1L << PG_dirty);
+       clear_compound_head(page_tail);
+       if (page_is_young(head))
+               set_page_young(page_tail);
+       if (page_is_idle(head))
+               set_page_idle(page_tail);
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+       page_tail->index = head->index + tail;
+       page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+       lru_add_page_tail(head, page_tail, lruvec, list);
+       return mapcount;
+ }
+ static void __split_huge_page(struct page *page, struct list_head *list)
+ {
+       struct page *head = compound_head(page);
+       struct zone *zone = page_zone(head);
+       struct lruvec *lruvec;
+       int i, tail_mapcount;
+       /* prevent PageLRU to go away from under us, and freeze lru stats */
+       spin_lock_irq(&zone->lru_lock);
+       lruvec = mem_cgroup_page_lruvec(head, zone);
+       spin_lock(&split_queue_lock);
+       if (!list_empty(page_deferred_list(head))) {
+               split_queue_len--;
+               list_del(page_deferred_list(head));
+       }
+       spin_unlock(&split_queue_lock);
+       /* complete memcg works before add pages to LRU */
+       mem_cgroup_split_huge_fixup(head);
+       tail_mapcount = 0;
+       for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+               tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+       atomic_sub(tail_mapcount, &head->_count);
+       ClearPageCompound(head);
+       spin_unlock_irq(&zone->lru_lock);
+       unfreeze_page(page_anon_vma(head), head);
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               struct page *subpage = head + i;
+               if (subpage == page)
+                       continue;
+               unlock_page(subpage);
+               /*
+                * Subpages may be freed if there wasn't any mapping
+                * like if add_to_swap() is running on a lru page that
+                * had its mapping zapped. And freeing these pages
+                * requires taking the lru_lock so we do the put_page
+                * of the tail pages after the split is complete.
+                */
+               put_page(subpage);
+       }
+ }
+ /*
+  * This function splits huge page into normal pages. @page can point to any
+  * subpage of huge page to split. Split doesn't change the position of @page.
+  *
+  * The caller must hold the only pin on @page, otherwise split fails with -EBUSY.
+  * The huge page must be locked.
+  *
+  * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+  *
+  * Both head page and tail pages will inherit mapping, flags, and so on from
+  * the hugepage.
+  *
+  * The GUP pin and PG_locked are transferred to @page. The remaining subpages
+  * can be freed if they are not mapped.
+  *
+  * Returns 0 if the hugepage is split successfully.
+  * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+  * us.
+  */
+ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ {
+       struct page *head = compound_head(page);
+       struct anon_vma *anon_vma;
+       int count, mapcount, ret;
+       VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+       VM_BUG_ON_PAGE(!PageAnon(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+       VM_BUG_ON_PAGE(!PageCompound(page), page);
+       /*
+        * The caller does not necessarily hold an mmap_sem that would prevent
+        * the anon_vma disappearing, so we first take a reference to it
+        * and then lock the anon_vma for write. This is similar to
+        * page_lock_anon_vma_read except the write lock is taken to serialise
+        * against parallel split or collapse operations.
+        */
+       anon_vma = page_get_anon_vma(head);
+       if (!anon_vma) {
+               ret = -EBUSY;
+               goto out;
+       }
+       anon_vma_lock_write(anon_vma);
+       /*
+        * Racy check if we can split the page, before freeze_page() will
+        * split PMDs
+        */
+       if (total_mapcount(head) != page_count(head) - 1) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
+       freeze_page(anon_vma, head);
+       VM_BUG_ON_PAGE(compound_mapcount(head), head);
+       count = page_count(head);
+       mapcount = total_mapcount(head);
+       if (mapcount == count - 1) {
+               __split_huge_page(page, list);
+               ret = 0;
+       } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+               pr_alert("total_mapcount: %u, page_count(): %u\n",
+                               mapcount, count);
+               if (PageTail(page))
+                       dump_page(head, NULL);
+               dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+               BUG();
+       } else {
+               unfreeze_page(anon_vma, head);
+               ret = -EBUSY;
+       }
+ out_unlock:
+       anon_vma_unlock_write(anon_vma);
+       put_anon_vma(anon_vma);
+ out:
+       count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+       return ret;
+ }
+ void free_transhuge_page(struct page *page)
+ {
+       unsigned long flags;
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (!list_empty(page_deferred_list(page))) {
+               split_queue_len--;
+               list_del(page_deferred_list(page));
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       free_compound_page(page);
+ }
+ void deferred_split_huge_page(struct page *page)
+ {
+       unsigned long flags;
+       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+       spin_lock_irqsave(&split_queue_lock, flags);
+       if (list_empty(page_deferred_list(page))) {
+               list_add_tail(page_deferred_list(page), &split_queue);
+               split_queue_len++;
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+ }
+ static unsigned long deferred_split_count(struct shrinker *shrink,
+               struct shrink_control *sc)
+ {
+       /*
+        * Splitting a page from split_queue frees up at least one page and
+        * at most HPAGE_PMD_NR - 1. We don't track the exact number, so
+        * let's use HPAGE_PMD_NR / 2 as a ballpark.
+        */
+       return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+ }
+ static unsigned long deferred_split_scan(struct shrinker *shrink,
+               struct shrink_control *sc)
+ {
+       unsigned long flags;
+       LIST_HEAD(list), *pos, *next;
+       struct page *page;
+       int split = 0;
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_init(&split_queue, &list);
+       /* Take pin on all head pages to avoid freeing them under us */
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               page = compound_head(page);
+               /* race with put_compound_page() */
+               if (!get_page_unless_zero(page)) {
+                       list_del_init(page_deferred_list(page));
+                       split_queue_len--;
+               }
+       }
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       list_for_each_safe(pos, next, &list) {
+               page = list_entry((void *)pos, struct page, mapping);
+               lock_page(page);
+               /* split_huge_page() removes page from list on success */
+               if (!split_huge_page(page))
+                       split++;
+               unlock_page(page);
+               put_page(page);
+       }
+       spin_lock_irqsave(&split_queue_lock, flags);
+       list_splice_tail(&list, &split_queue);
+       spin_unlock_irqrestore(&split_queue_lock, flags);
+       return split * HPAGE_PMD_NR / 2;
+ }
+ static struct shrinker deferred_split_shrinker = {
+       .count_objects = deferred_split_count,
+       .scan_objects = deferred_split_scan,
+       .seeks = DEFAULT_SEEKS,
+ };
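
As documented in the comment above split_huge_page_to_list(), the split only proceeds when the caller's pin is the sole extra reference, which is checked as total_mapcount(head) == page_count(head) - 1. The toy model below illustrates that accounting with made-up reference counts; struct thp and the helpers are not kernel types.

/* Toy model of the split-eligibility check with made-up reference counts;
 * struct thp and the helpers are not kernel types. */
#include <stdbool.h>
#include <stdio.h>

struct thp {
	int compound_mapcount;  /* PMD-level mappings of the whole huge page */
	int tail_mapcounts;     /* sum of per-4k PTE mappings */
	int extra_pins;         /* the caller's pin, gup pins, ... */
};

static int total_mapcount(const struct thp *p)
{
	return p->compound_mapcount + p->tail_mapcounts;
}

static int page_count(const struct thp *p)
{
	/* every mapping holds a reference, plus whatever pins exist */
	return total_mapcount(p) + p->extra_pins;
}

static bool can_split(const struct thp *p)
{
	/* the caller's pin must be the only unaccounted reference */
	return total_mapcount(p) == page_count(p) - 1;
}

int main(void)
{
	struct thp only_caller = { .compound_mapcount = 1, .extra_pins = 1 };
	struct thp gup_pinned  = { .compound_mapcount = 1, .extra_pins = 2 };

	printf("only the caller's pin: %s\n", can_split(&only_caller) ? "split" : "-EBUSY");
	printf("extra gup pin:         %s\n", can_split(&gup_pinned) ? "split" : "-EBUSY");
	return 0;
}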
diff --combined mm/memcontrol.c
index b732edfddb767025185f27c8879903591c2b0c82,34cd0df82a8b57ac47e2916964384af01d162385..48735e7c617b3d9454b35ac5ea6a4f9ff5ef472c
@@@ -62,6 -62,7 +62,7 @@@
  #include <linux/oom.h>
  #include <linux/lockdep.h>
  #include <linux/file.h>
+ #include <linux/tracehook.h>
  #include "internal.h"
  #include <net/sock.h>
  #include <net/ip.h>
@@@ -434,7 -435,7 +435,7 @@@ struct cgroup_subsys_state *mem_cgroup_
  
        memcg = page->mem_cgroup;
  
 -      if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
 +      if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
                memcg = root_mem_cgroup;
  
        rcu_read_unlock();
@@@ -695,7 -696,7 +696,7 @@@ static unsigned long mem_cgroup_read_ev
  
  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
                                         struct page *page,
-                                        int nr_pages)
+                                        bool compound, int nr_pages)
  {
        /*
         * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
                                nr_pages);
  
-       if (PageTransHuge(page))
+       if (compound) {
+               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
                                nr_pages);
+       }
  
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
@@@ -1661,7 -1664,7 +1664,7 @@@ static void memcg_oom_recover(struct me
  
  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  {
-       if (!current->memcg_oom.may_oom)
+       if (!current->memcg_may_oom)
                return;
        /*
         * We are in the middle of the charge context here, so we
         * and when we know whether the fault was overall successful.
         */
        css_get(&memcg->css);
-       current->memcg_oom.memcg = memcg;
-       current->memcg_oom.gfp_mask = mask;
-       current->memcg_oom.order = order;
+       current->memcg_in_oom = memcg;
+       current->memcg_oom_gfp_mask = mask;
+       current->memcg_oom_order = order;
  }
  
  /**
   */
  bool mem_cgroup_oom_synchronize(bool handle)
  {
-       struct mem_cgroup *memcg = current->memcg_oom.memcg;
+       struct mem_cgroup *memcg = current->memcg_in_oom;
        struct oom_wait_info owait;
        bool locked;
  
        if (locked && !memcg->oom_kill_disable) {
                mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
-               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-                                        current->memcg_oom.order);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+                                        current->memcg_oom_order);
        } else {
                schedule();
                mem_cgroup_unmark_under_oom(memcg);
                memcg_oom_recover(memcg);
        }
  cleanup:
-       current->memcg_oom.memcg = NULL;
+       current->memcg_in_oom = NULL;
        css_put(&memcg->css);
        return true;
  }
@@@ -1972,6 -1975,31 +1975,31 @@@ static int memcg_cpu_hotplug_callback(s
        return NOTIFY_OK;
  }
  
+ /*
+  * Scheduled by try_charge() to be executed from the userland return path
+  * and reclaims memory over the high limit.
+  */
+ void mem_cgroup_handle_over_high(void)
+ {
+       unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       struct mem_cgroup *memcg, *pos;
+       if (likely(!nr_pages))
+               return;
+       pos = memcg = get_mem_cgroup_from_mm(current->mm);
+       do {
+               if (page_counter_read(&pos->memory) <= pos->high)
+                       continue;
+               mem_cgroup_events(pos, MEMCG_HIGH, 1);
+               try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+       } while ((pos = parent_mem_cgroup(pos)));
+       css_put(&memcg->css);
+       current->memcg_nr_pages_over_high = 0;
+ }
  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
  {
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;
-       int ret = 0;
  
        if (mem_cgroup_is_root(memcg))
-               goto done;
+               return 0;
  retry:
        if (consume_stock(memcg, nr_pages))
-               goto done;
+               return 0;
  
        if (!do_swap_account ||
            !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
        if (unlikely(test_thread_flag(TIF_MEMDIE) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
-               goto bypass;
+               goto force;
  
        if (unlikely(task_in_memcg_oom(current)))
                goto nomem;
  
-       if (!(gfp_mask & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(gfp_mask))
                goto nomem;
  
        mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
                goto retry;
  
        if (gfp_mask & __GFP_NOFAIL)
-               goto bypass;
+               goto force;
  
        if (fatal_signal_pending(current))
-               goto bypass;
+               goto force;
  
        mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
  
-       mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
+       mem_cgroup_oom(mem_over_limit, gfp_mask,
+                      get_order(nr_pages * PAGE_SIZE));
  nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
                return -ENOMEM;
- bypass:
-       return -EINTR;
+ force:
+       /*
+        * The allocation either can't fail or will lead to more memory
+        * being freed very soon.  Allow memory usage go over the limit
+        * temporarily by force charging it.
+        */
+       page_counter_charge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_charge(&memcg->memsw, nr_pages);
+       css_get_many(&memcg->css, nr_pages);
+       return 0;
  
  done_restock:
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
-       if (!(gfp_mask & __GFP_WAIT))
-               goto done;
        /*
-        * If the hierarchy is above the normal consumption range,
-        * make the charging task trim their excess contribution.
+        * If the hierarchy is above the normal consumption range, schedule
+        * reclaim on returning to userland.  We can perform reclaim here
+        * if __GFP_WAIT but let's always punt for simplicity and so that
+        * GFP_KERNEL can consistently be used during reclaim.  @memcg is
+        * not recorded as it most likely matches current's and won't
+        * change in the meantime.  As high limit is checked again before
+        * reclaim, the cost of mismatch is negligible.
         */
        do {
-               if (page_counter_read(&memcg->memory) <= memcg->high)
-                       continue;
-               mem_cgroup_events(memcg, MEMCG_HIGH, 1);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+               if (page_counter_read(&memcg->memory) > memcg->high) {
+                       current->memcg_nr_pages_over_high += nr_pages;
+                       set_notify_resume(current);
+                       break;
+               }
        } while ((memcg = parent_mem_cgroup(memcg)));
- done:
-       return ret;
+       return 0;
  }
  
  static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
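
As the hunk above explains, try_charge() no longer reclaims over the "high" limit inline: it records the overage in current->memcg_nr_pages_over_high, calls set_notify_resume(), and mem_cgroup_handle_over_high() reclaims on return to userland with GFP_KERNEL. A minimal user-space model of that deferral follows; the task struct, counters and reclaim step are mocks, not kernel state.

/* Minimal model of "punt high-limit reclaim to the userland return path".
 * The task struct, counters and reclaim step are mocks, not kernel state. */
#include <stdbool.h>
#include <stdio.h>

struct task {
	unsigned int nr_pages_over_high;
	bool notify_resume;               /* models set_notify_resume(current) */
};

static unsigned long usage = 1200, high = 1000;   /* pages, made up */

static void try_charge(struct task *t, unsigned int nr_pages)
{
	usage += nr_pages;
	if (usage > high) {               /* over the soft "high" limit */
		t->nr_pages_over_high += nr_pages;
		t->notify_resume = true;  /* reclaim later, with GFP_KERNEL */
	}
}

static void handle_over_high(struct task *t)   /* runs before returning to user */
{
	if (!t->nr_pages_over_high)
		return;
	printf("reclaiming %u pages over high\n", t->nr_pages_over_high);
	usage -= t->nr_pages_over_high;            /* pretend reclaim succeeded */
	t->nr_pages_over_high = 0;
	t->notify_resume = false;
}

int main(void)
{
	struct task t = { 0 };

	try_charge(&t, 64);               /* the charge path stays non-blocking */
	if (t.notify_resume)
		handle_over_high(&t);     /* deferred work at the syscall boundary */
	return 0;
}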
@@@ -2174,55 -2217,6 +2217,6 @@@ static void commit_charge(struct page *
  }
  
  #ifdef CONFIG_MEMCG_KMEM
- int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                     unsigned long nr_pages)
- {
-       struct page_counter *counter;
-       int ret = 0;
-       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
-       if (ret < 0)
-               return ret;
-       ret = try_charge(memcg, gfp, nr_pages);
-       if (ret == -EINTR)  {
-               /*
-                * try_charge() chose to bypass to root due to OOM kill or
-                * fatal signal.  Since our only options are to either fail
-                * the allocation or charge it to this cgroup, do it as a
-                * temporary condition. But we can't fail. From a kmem/slab
-                * perspective, the cache has already been selected, by
-                * mem_cgroup_kmem_get_cache(), so it is too late to change
-                * our minds.
-                *
-                * This condition will only trigger if the task entered
-                * memcg_charge_kmem in a sane state, but was OOM-killed
-                * during try_charge() above. Tasks that were already dying
-                * when the allocation triggers should have been already
-                * directed to the root cgroup in memcontrol.h
-                */
-               page_counter_charge(&memcg->memory, nr_pages);
-               if (do_swap_account)
-                       page_counter_charge(&memcg->memsw, nr_pages);
-               css_get_many(&memcg->css, nr_pages);
-               ret = 0;
-       } else if (ret)
-               page_counter_uncharge(&memcg->kmem, nr_pages);
-       return ret;
- }
- void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
- {
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       if (do_swap_account)
-               page_counter_uncharge(&memcg->memsw, nr_pages);
-       page_counter_uncharge(&memcg->kmem, nr_pages);
-       css_put_many(&memcg->css, nr_pages);
- }
  static int memcg_alloc_cache_id(void)
  {
        int id, size;
@@@ -2384,85 -2378,59 +2378,59 @@@ void __memcg_kmem_put_cache(struct kmem
                css_put(&cachep->memcg_params.memcg->css);
  }
  
- /*
-  * We need to verify if the allocation against current->mm->owner's memcg is
-  * possible for the given order. But the page is not allocated yet, so we'll
-  * need a further commit step to do the final arrangements.
-  *
-  * It is possible for the task to switch cgroups in this mean time, so at
-  * commit time, we can't rely on task conversion any longer.  We'll then use
-  * the handle argument to return to the caller which cgroup we should commit
-  * against. We could also return the memcg directly and avoid the pointer
-  * passing, but a boolean return value gives better semantics considering
-  * the compiled-out case as well.
-  *
-  * Returning true means the allocation is possible.
-  */
- bool
- __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+                             struct mem_cgroup *memcg)
  {
-       struct mem_cgroup *memcg;
-       int ret;
+       unsigned int nr_pages = 1 << order;
+       struct page_counter *counter;
+       int ret = 0;
  
-       *_memcg = NULL;
+       if (!memcg_kmem_is_active(memcg))
+               return 0;
  
-       memcg = get_mem_cgroup_from_mm(current->mm);
+       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
+       if (ret)
+               return ret;
  
-       if (!memcg_kmem_is_active(memcg)) {
-               css_put(&memcg->css);
-               return true;
+       ret = try_charge(memcg, gfp, nr_pages);
+       if (ret) {
+               page_counter_uncharge(&memcg->kmem, nr_pages);
+               return ret;
        }
  
-       ret = memcg_charge_kmem(memcg, gfp, 1 << order);
-       if (!ret)
-               *_memcg = memcg;
+       page->mem_cgroup = memcg;
  
-       css_put(&memcg->css);
-       return (ret == 0);
+       return 0;
  }
  
- void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             int order)
+ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
  {
-       VM_BUG_ON(mem_cgroup_is_root(memcg));
+       struct mem_cgroup *memcg;
+       int ret;
  
-       /* The page allocation failed. Revert */
-       if (!page) {
-               memcg_uncharge_kmem(memcg, 1 << order);
-               return;
-       }
-       page->mem_cgroup = memcg;
+       memcg = get_mem_cgroup_from_mm(current->mm);
+       ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+       css_put(&memcg->css);
+       return ret;
  }
  
- void __memcg_kmem_uncharge_pages(struct page *page, int order)
+ void __memcg_kmem_uncharge(struct page *page, int order)
  {
        struct mem_cgroup *memcg = page->mem_cgroup;
+       unsigned int nr_pages = 1 << order;
  
        if (!memcg)
                return;
  
        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
  
-       memcg_uncharge_kmem(memcg, 1 << order);
-       page->mem_cgroup = NULL;
- }
- struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
- {
-       struct mem_cgroup *memcg = NULL;
-       struct kmem_cache *cachep;
-       struct page *page;
-       page = virt_to_head_page(ptr);
-       if (PageSlab(page)) {
-               cachep = page->slab_cache;
-               if (!is_root_cache(cachep))
-                       memcg = cachep->memcg_params.memcg;
-       } else
-               /* page allocated by alloc_kmem_pages */
-               memcg = page->mem_cgroup;
+       page_counter_uncharge(&memcg->kmem, nr_pages);
+       page_counter_uncharge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_uncharge(&memcg->memsw, nr_pages);
  
-       return memcg;
+       page->mem_cgroup = NULL;
+       css_put_many(&memcg->css, nr_pages);
  }
  #endif /* CONFIG_MEMCG_KMEM */
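
The kmem path above now charges the kmem counter first and the main memory counter second, unwinding the kmem charge if the second step fails, instead of the old -EINTR bypass handling. A sketch of that ordering with fake counters follows; none of these helpers are the kernel API.

/* Sketch of the charge ordering only; counters, limits and helpers are fake. */
#include <errno.h>
#include <stdio.h>

static long kmem_used, mem_used;
static const long kmem_limit = 1 << 20, mem_limit = 1 << 20;   /* pages */

static int counter_try_charge(long *used, long limit, long nr_pages)
{
	if (*used + nr_pages > limit)
		return -ENOMEM;
	*used += nr_pages;
	return 0;
}

static int kmem_charge(int order)
{
	long nr_pages = 1L << order;    /* an order-N allocation is 2^N pages */
	int ret;

	ret = counter_try_charge(&kmem_used, kmem_limit, nr_pages);
	if (ret)
		return ret;             /* kmem limit hit: fail the allocation */

	ret = counter_try_charge(&mem_used, mem_limit, nr_pages);
	if (ret)
		kmem_used -= nr_pages;  /* unwind the kmem charge on failure */
	return ret;
}

int main(void)
{
	printf("order-3 charge: %d\n", kmem_charge(3));   /* 8 pages, prints 0 */
	return 0;
}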
  
  
  /*
   * Because tail pages are not marked as "used", set it. We're under
-  * zone->lru_lock, 'splitting on pmd' and compound_lock.
-  * charge/uncharge will be never happen and move_account() is done under
-  * compound_lock(), so we don't have to take care of races.
+  * zone->lru_lock and migration entries setup in all page mappings.
   */
  void mem_cgroup_split_huge_fixup(struct page *head)
  {
@@@ -2926,7 -2892,7 +2892,7 @@@ static int memcg_activate_kmem(struct m
         * of course permitted.
         */
        mutex_lock(&memcg_create_mutex);
 -      if (cgroup_has_tasks(memcg->css.cgroup) ||
 +      if (cgroup_is_populated(memcg->css.cgroup) ||
            (memcg->use_hierarchy && memcg_has_children(memcg)))
                err = -EBUSY;
        mutex_unlock(&memcg_create_mutex);
@@@ -3741,43 -3707,44 +3707,43 @@@ struct wb_domain *mem_cgroup_wb_domain(
  /**
   * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
   * @wb: bdi_writeback in question
 - * @pavail: out parameter for number of available pages
 + * @pfilepages: out parameter for number of file pages
 + * @pheadroom: out parameter for number of allocatable pages according to memcg
   * @pdirty: out parameter for number of dirty pages
   * @pwriteback: out parameter for number of pages under writeback
   *
 - * Determine the numbers of available, dirty, and writeback pages in @wb's
 - * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
 - * more involved.
 + * Determine the numbers of file, headroom, dirty, and writeback pages in
 + * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
 + * is a bit more involved.
   *
 - * A memcg's headroom is "min(max, high) - used".  The available memory is
 - * calculated as the lowest headroom of itself and the ancestors plus the
 - * number of pages already being used for file pages.  Note that this
 - * doesn't consider the actual amount of available memory in the system.
 - * The caller should further cap *@pavail accordingly.
 + * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
 + * headroom is calculated as the lowest headroom of itself and the
 + * ancestors.  Note that this doesn't consider the actual amount of
 + * available memory in the system.  The caller should further cap
 + * *@pheadroom accordingly.
   */
 -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
 -                       unsigned long *pdirty, unsigned long *pwriteback)
 +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 +                       unsigned long *pheadroom, unsigned long *pdirty,
 +                       unsigned long *pwriteback)
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
 -      unsigned long head_room = PAGE_COUNTER_MAX;
 -      unsigned long file_pages;
  
        *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
  
        /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
 +      *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 +                                                   (1 << LRU_ACTIVE_FILE));
 +      *pheadroom = PAGE_COUNTER_MAX;
  
 -      file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 -                                                  (1 << LRU_ACTIVE_FILE));
        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(memcg->memory.limit, memcg->high);
                unsigned long used = page_counter_read(&memcg->memory);
  
 -              head_room = min(head_room, ceiling - min(ceiling, used));
 +              *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
                memcg = parent;
        }
 -
 -      *pavail = file_pages + head_room;
  }
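
Per the updated mem_cgroup_wb_stats() comment, file pages and headroom are now reported separately: each cgroup's headroom is min(limit, high) - used, and the effective value is the minimum over the ancestor chain, with the root's own ceiling never applied. The standalone sketch below walks a faked parent chain to show the arithmetic; the struct and numbers are made up.

/* Standalone sketch of the headroom walk; the memcg chain is faked and the
 * numbers are made up.  Matching the kernel loop, the root's own ceiling is
 * never applied (the walk stops once a cgroup has no parent). */
#include <stdio.h>
#include <limits.h>

struct memcg {
	unsigned long limit, high, used;   /* all in pages */
	struct memcg *parent;
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long wb_headroom(struct memcg *memcg)
{
	unsigned long headroom = ULONG_MAX;        /* PAGE_COUNTER_MAX stand-in */

	for (; memcg->parent; memcg = memcg->parent) {
		unsigned long ceiling = min_ul(memcg->limit, memcg->high);
		unsigned long used = memcg->used;

		/* clamp so an over-ceiling cgroup contributes 0, not underflow */
		headroom = min_ul(headroom, ceiling - min_ul(ceiling, used));
	}
	return headroom;
}

int main(void)
{
	struct memcg root  = { .limit = 10000, .high = 10000, .used = 2000 };
	struct memcg child = { .limit = 4000, .high = 3000, .used = 2500,
			       .parent = &root };

	/* child: min(4000, 3000) - 2500 = 500, so headroom is 500 pages */
	printf("headroom = %lu pages\n", wb_headroom(&child));
	return 0;
}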
  
  #else /* CONFIG_CGROUP_WRITEBACK */
@@@ -4066,7 -4033,8 +4032,7 @@@ static struct cftype mem_cgroup_legacy_
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
 -              .flags = CFTYPE_NO_PREFIX,
 -              .mode = S_IWUGO,
 +              .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
@@@ -4400,28 -4368,16 +4366,16 @@@ static int mem_cgroup_do_precharge(unsi
  {
        int ret;
  
-       /* Try a single bulk charge without reclaim first */
-       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+       /* Try a single bulk charge without reclaim first, kswapd may wake */
+       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
        if (!ret) {
                mc.precharge += count;
                return ret;
        }
-       if (ret == -EINTR) {
-               cancel_charge(root_mem_cgroup, count);
-               return ret;
-       }
  
        /* Try charges one by one with reclaim */
        while (count--) {
                ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
-               /*
-                * In case of failure, any residual charges against
-                * mc.to will be dropped by mem_cgroup_clear_mc()
-                * later on.  However, cancel any charges that are
-                * bypassed to root right away or they'll be lost.
-                */
-               if (ret == -EINTR)
-                       cancel_charge(root_mem_cgroup, 1);
                if (ret)
                        return ret;
                mc.precharge++;
@@@ -4547,39 -4503,30 +4501,30 @@@ static struct page *mc_handle_file_pte(
   * @from: mem_cgroup which the page is moved from.
   * @to:       mem_cgroup which the page is moved to. @from != @to.
   *
-  * The caller must confirm following.
-  * - page is not on LRU (isolate_page() is useful.)
-  * - compound_lock is held when nr_pages > 1
+  * The caller must make sure the page is not on LRU (isolate_page() is useful.)
   *
   * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
   * from old cgroup.
   */
  static int mem_cgroup_move_account(struct page *page,
-                                  unsigned int nr_pages,
+                                  bool compound,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
  {
        unsigned long flags;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
        int ret;
        bool anon;
  
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
-       /*
-        * The page is isolated from LRU. So, collapse function
-        * will not handle this page. But page splitting can happen.
-        * Do this check under compound_page_lock(). The caller should
-        * hold it.
-        */
-       ret = -EBUSY;
-       if (nr_pages > 1 && !PageTransHuge(page))
-               goto out;
+       VM_BUG_ON(compound && !PageTransHuge(page));
  
        /*
-        * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
-        * of its source page while we change it: page migration takes
-        * both pages off the LRU, but page cache replacement doesn't.
+        * Prevent mem_cgroup_replace_page() from looking at
+        * page->mem_cgroup of its source page while we change it.
         */
+       ret = -EBUSY;
        if (!trylock_page(page))
                goto out;
  
        ret = 0;
  
        local_irq_disable();
-       mem_cgroup_charge_statistics(to, page, nr_pages);
+       mem_cgroup_charge_statistics(to, page, compound, nr_pages);
        memcg_check_events(to, page);
-       mem_cgroup_charge_statistics(from, page, -nr_pages);
+       mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
        memcg_check_events(from, page);
        local_irq_enable();
  out_unlock:
@@@ -4726,7 -4673,7 +4671,7 @@@ static int mem_cgroup_count_precharge_p
        pte_t *pte;
        spinlock_t *ptl;
  
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
@@@ -4833,7 -4780,7 +4778,7 @@@ static int mem_cgroup_can_attach(struc
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *from;
 -      struct task_struct *p;
 +      struct task_struct *leader, *p;
        struct mm_struct *mm;
        unsigned long move_flags;
        int ret = 0;
        if (!move_flags)
                return 0;
  
 -      p = cgroup_taskset_first(tset);
 +      /*
 +       * Multi-process migrations only happen on the default hierarchy
 +       * where charge immigration is not used.  Perform charge
 +       * immigration if @tset contains a leader and whine if there are
 +       * multiple.
 +       */
 +      p = NULL;
 +      cgroup_taskset_for_each_leader(leader, tset) {
 +              WARN_ON_ONCE(p);
 +              p = leader;
 +      }
 +      if (!p)
 +              return 0;
 +
        from = mem_cgroup_from_task(p);
  
        VM_BUG_ON(from == memcg);
@@@ -4910,17 -4844,7 +4855,7 @@@ static int mem_cgroup_move_charge_pte_r
        union mc_target target;
        struct page *page;
  
-       /*
-        * We don't take compound_lock() here but no race with splitting thp
-        * happens because:
-        *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
-        *    under splitting, which means there's no concurrent thp split,
-        *  - if another thread runs into split_huge_page() just after we
-        *    entered this if-block, the thread must wait for page table lock
-        *    to be unlocked in __split_huge_page_splitting(), where the main
-        *    part of thp split is not executed yet.
-        */
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+       if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
                if (mc.precharge < HPAGE_PMD_NR) {
                        spin_unlock(ptl);
                        return 0;
                if (target_type == MC_TARGET_PAGE) {
                        page = target.page;
                        if (!isolate_lru_page(page)) {
-                               if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+                               if (!mem_cgroup_move_account(page, true,
                                                             mc.from, mc.to)) {
                                        mc.precharge -= HPAGE_PMD_NR;
                                        mc.moved_charge += HPAGE_PMD_NR;
@@@ -4958,7 -4882,8 +4893,8 @@@ retry
                        page = target.page;
                        if (isolate_lru_page(page))
                                goto put;
-                       if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+                       if (!mem_cgroup_move_account(page, false,
+                                               mc.from, mc.to)) {
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
@@@ -5076,7 -5001,7 +5012,7 @@@ static void mem_cgroup_bind(struct cgro
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
 -      if (cgroup_on_dfl(root_css->cgroup))
 +      if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                root_mem_cgroup->use_hierarchy = true;
        else
                root_mem_cgroup->use_hierarchy = false;
  static u64 memory_current_read(struct cgroup_subsys_state *css,
                               struct cftype *cft)
  {
-       return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+       return page_counter_read(&mem_cgroup_from_css(css)->memory);
  }
  
  static int memory_low_show(struct seq_file *m, void *v)
@@@ -5197,6 -5122,7 +5133,7 @@@ static int memory_events_show(struct se
  static struct cftype memory_files[] = {
        {
                .name = "current",
+               .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_current_read,
        },
        {
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
 +              .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        { }     /* terminate */
@@@ -5296,10 -5221,11 +5233,11 @@@ bool mem_cgroup_low(struct mem_cgroup *
   * with mem_cgroup_cancel_charge() in case page instantiation fails.
   */
  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
-                         gfp_t gfp_mask, struct mem_cgroup **memcgp)
+                         gfp_t gfp_mask, struct mem_cgroup **memcgp,
+                         bool compound)
  {
        struct mem_cgroup *memcg = NULL;
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
        int ret = 0;
  
        if (mem_cgroup_disabled())
                }
        }
  
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
        if (!memcg)
                memcg = get_mem_cgroup_from_mm(mm);
  
        ret = try_charge(memcg, gfp_mask, nr_pages);
  
        css_put(&memcg->css);
-       if (ret == -EINTR) {
-               memcg = root_mem_cgroup;
-               ret = 0;
-       }
  out:
        *memcgp = memcg;
        return ret;
   * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
   */
  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             bool lrucare)
+                             bool lrucare, bool compound)
  {
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
  
        VM_BUG_ON_PAGE(!page->mapping, page);
        VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
  
        commit_charge(page, memcg, lrucare);
  
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
        local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, page, nr_pages);
+       mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
        memcg_check_events(memcg, page);
        local_irq_enable();
  
   *
   * Cancel a charge transaction started by mem_cgroup_try_charge().
   */
- void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+               bool compound)
  {
-       unsigned int nr_pages = 1;
+       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
  
        if (mem_cgroup_disabled())
                return;
        if (!memcg)
                return;
  
-       if (PageTransHuge(page)) {
-               nr_pages <<= compound_order(page);
-               VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-       }
        cancel_charge(memcg, nr_pages);
  }
  
@@@ -5559,7 -5466,7 +5478,7 @@@ void mem_cgroup_uncharge_list(struct li
  }
  
  /**
-  * mem_cgroup_migrate - migrate a charge to another page
+  * mem_cgroup_replace_page - migrate a charge to another page
   * @oldpage: currently charged page
   * @newpage: page to transfer the charge to
   * @lrucare: either or both pages might be on the LRU already
   *
   * Both pages must be locked, @newpage->mapping must be set up.
   */
- void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-                       bool lrucare)
+ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
  {
        struct mem_cgroup *memcg;
        int isolated;
  
        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
                       newpage);
        if (newpage->mem_cgroup)
                return;
  
-       /*
-        * Swapcache readahead pages can get migrated before being
-        * charged, and migration from compaction can happen to an
-        * uncharged page when the PFN walker finds a page that
-        * reclaim just put back on the LRU but has not released yet.
-        */
+       /* Swapcache readahead pages can get replaced before being charged */
        memcg = oldpage->mem_cgroup;
        if (!memcg)
                return;
  
-       if (lrucare)
-               lock_page_lru(oldpage, &isolated);
+       lock_page_lru(oldpage, &isolated);
        oldpage->mem_cgroup = NULL;
+       unlock_page_lru(oldpage, isolated);
  
-       if (lrucare)
-               unlock_page_lru(oldpage, isolated);
-       commit_charge(newpage, memcg, lrucare);
+       commit_charge(newpage, memcg, true);
  }
  
  /*
@@@ -5690,7 -5585,7 +5597,7 @@@ void mem_cgroup_swapout(struct page *pa
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
-       mem_cgroup_charge_statistics(memcg, page, -1);
+       mem_cgroup_charge_statistics(memcg, page, false, -1);
        memcg_check_events(memcg, page);
  }
  
diff --combined mm/memory_hotplug.c
index 0780d118d26e70d5cfc3a3e67ceca4915684a003,4b62bbac55125b07a14706797ad390b14f4a6e8a..67d488ab495e57b9018484932e135078c787903c
@@@ -339,8 -339,8 +339,8 @@@ static int __ref ensure_zone_is_initial
                        unsigned long start_pfn, unsigned long num_pages)
  {
        if (!zone_is_initialized(zone))
-               return init_currently_empty_zone(zone, start_pfn, num_pages,
-                                                MEMMAP_HOTPLUG);
+               return init_currently_empty_zone(zone, start_pfn, num_pages);
        return 0;
  }
  
@@@ -1232,21 -1232,23 +1232,21 @@@ int zone_for_memory(int nid, u64 start
  }
  
  /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
 -int __ref add_memory(int nid, u64 start, u64 size)
 +int __ref add_memory_resource(int nid, struct resource *res)
  {
 +      u64 start, size;
        pg_data_t *pgdat = NULL;
        bool new_pgdat;
        bool new_node;
 -      struct resource *res;
        int ret;
  
 +      start = res->start;
 +      size = resource_size(res);
 +
        ret = check_hotplug_memory_range(start, size);
        if (ret)
                return ret;
  
 -      res = register_memory_resource(start, size);
 -      ret = -EEXIST;
 -      if (!res)
 -              return ret;
 -
        {       /* Stupid hack to suppress address-never-null warning */
                void *p = NODE_DATA(nid);
                new_pgdat = !p;
@@@ -1298,28 -1300,13 +1298,28 @@@ error
        /* rollback pgdat allocation and others */
        if (new_pgdat)
                rollback_node_hotadd(nid, pgdat);
 -      release_memory_resource(res);
        memblock_remove(start, size);
  
  out:
        mem_hotplug_done();
        return ret;
  }
 +EXPORT_SYMBOL_GPL(add_memory_resource);
 +
 +int __ref add_memory(int nid, u64 start, u64 size)
 +{
 +      struct resource *res;
 +      int ret;
 +
 +      res = register_memory_resource(start, size);
 +      if (!res)
 +              return -EEXIST;
 +
 +      ret = add_memory_resource(nid, res);
 +      if (ret < 0)
 +              release_memory_resource(res);
 +      return ret;
 +}
  EXPORT_SYMBOL_GPL(add_memory);
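
For illustration, a minimal sketch of how a hot-add path that already owns a struct resource could call the new add_memory_resource() entry point directly; example_hotplug() is a made-up function, not part of this series.

#include <linux/ioport.h>
#include <linux/memory_hotplug.h>
#include <linux/printk.h>

/* Hypothetical driver path: the caller registered the resource itself. */
static int example_hotplug(int nid, struct resource *res)
{
        int ret;

        ret = add_memory_resource(nid, res);
        if (ret)
                pr_err("hot-add of %pR failed: %d\n", res, ret);
        return ret;
}
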
  
  #ifdef CONFIG_MEMORY_HOTREMOVE
diff --combined mm/page_alloc.c
index 805bbad2e24e1a84b383ebc90fc825527238957a,cef70104614c05ccef23f2020b4ea26d78f4c08c..d0499fff8c7fb1ee2f33a34bd9e424420939a654
@@@ -169,19 -169,19 +169,19 @@@ void pm_restrict_gfp_mask(void
        WARN_ON(!mutex_is_locked(&pm_mutex));
        WARN_ON(saved_gfp_mask);
        saved_gfp_mask = gfp_allowed_mask;
-       gfp_allowed_mask &= ~GFP_IOFS;
+       gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
  }
  
  bool pm_suspended_storage(void)
  {
-       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+       if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                return false;
        return true;
  }
  #endif /* CONFIG_PM_SLEEP */
  
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
- int pageblock_order __read_mostly;
+ unsigned int pageblock_order __read_mostly;
  #endif
  
  static void __free_pages_ok(struct page *page, unsigned int order);
@@@ -229,6 -229,17 +229,17 @@@ static char * const zone_names[MAX_NR_Z
  #endif
  };
  
+ compound_page_dtor * const compound_page_dtors[] = {
+       NULL,
+       free_compound_page,
+ #ifdef CONFIG_HUGETLB_PAGE
+       free_huge_page,
+ #endif
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       free_transhuge_page,
+ #endif
+ };
  int min_free_kbytes = 1024;
  int user_min_free_kbytes = -1;
  
@@@ -436,39 -447,38 +447,38 @@@ out
  /*
   * Higher-order pages are called "compound pages".  They are structured thusly:
   *
-  * The first PAGE_SIZE page is called the "head page".
+  * The first PAGE_SIZE page is called the "head page" and has PG_head set.
   *
-  * The remaining PAGE_SIZE pages are called "tail pages".
+  * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
+  * in bit 0 of page->compound_head. The rest of the bits point to the head page.
   *
-  * All pages have PG_compound set.  All tail pages have their ->first_page
-  * pointing at the head page.
+  * The first tail page's ->compound_dtor holds the offset into the array of compound
+  * page destructors. See compound_page_dtors.
   *
-  * The first tail page's ->lru.next holds the address of the compound page's
-  * put_page() function.  Its ->lru.prev holds the order of allocation.
+  * The first tail page's ->compound_order holds the order of allocation.
   * This usage means that zero-order pages may not be compound.
   */
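
For illustration, a simplified sketch of the bit-0 encoding described in the comment above; my_compound_head() and my_page_tail() are stand-ins written for this example, not the helpers added by the series.

#include <linux/mm_types.h>
#include <linux/compiler.h>

/* Tail pages store (head | 1) in ->compound_head; head pages store 0. */
static inline struct page *my_compound_head(struct page *page)
{
        unsigned long head = READ_ONCE(page->compound_head);

        if (head & 1)
                return (struct page *)(head - 1);       /* strip the tag bit */
        return page;                                    /* head or order-0 page */
}

static inline int my_page_tail(struct page *page)
{
        return READ_ONCE(page->compound_head) & 1;
}
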
  
- static void free_compound_page(struct page *page)
+ void free_compound_page(struct page *page)
  {
        __free_pages_ok(page, compound_order(page));
  }
  
- void prep_compound_page(struct page *page, unsigned long order)
+ void prep_compound_page(struct page *page, unsigned int order)
  {
        int i;
        int nr_pages = 1 << order;
  
-       set_compound_page_dtor(page, free_compound_page);
+       set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
        set_compound_order(page, order);
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
                set_page_count(p, 0);
-               p->first_page = page;
-               /* Make sure p->first_page is always valid for PageTail() */
-               smp_wmb();
-               __SetPageTail(p);
+               p->mapping = TAIL_MAPPING;
+               set_compound_head(p, page);
        }
+       atomic_set(compound_mapcount_ptr(page), -1);
  }
  
  #ifdef CONFIG_DEBUG_PAGEALLOC
@@@ -656,7 -666,7 +666,7 @@@ static inline void __free_one_page(stru
        unsigned long combined_idx;
        unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;
-       int max_order = MAX_ORDER;
+       unsigned int max_order = MAX_ORDER;
  
        VM_BUG_ON(!zone_is_initialized(zone));
        VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
                 * pageblock. Without this, pageblock isolation
                 * could cause incorrect freepage accounting.
                 */
-               max_order = min(MAX_ORDER, pageblock_order + 1);
+               max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
        } else {
                __mod_zone_freepage_state(zone, 1 << order, migratetype);
        }
@@@ -733,7 -743,7 +743,7 @@@ static inline int free_pages_check(stru
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
  
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@@ -817,7 -827,6 +827,6 @@@ static void free_pcppages_bulk(struct z
                        if (unlikely(has_isolate_pageblock(zone)))
                                mt = get_pageblock_migratetype(page);
  
-                       /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
                        __free_one_page(page, page_to_pfn(page), zone, 0, mt);
                        trace_mm_page_pcpu_drain(page, 0, mt);
                } while (--to_free && --batch_free && !list_empty(list));
@@@ -846,17 -855,52 +855,52 @@@ static void free_one_page(struct zone *
  
  static int free_tail_pages_check(struct page *head_page, struct page *page)
  {
-       if (!IS_ENABLED(CONFIG_DEBUG_VM))
-               return 0;
+       int ret = 1;
+       /*
+        * We rely on page->lru.next never having bit 0 set, unless the page
+        * is PageTail(). Let's make sure that's true even for a poisoned ->lru.
+        */
+       BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+       if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+               ret = 0;
+               goto out;
+       }
+       switch (page - head_page) {
+       case 1:
+               /* the first tail page: ->mapping is compound_mapcount() */
+               if (unlikely(compound_mapcount(page))) {
+                       bad_page(page, "nonzero compound_mapcount", 0);
+                       goto out;
+               }
+               break;
+       case 2:
+               /*
+                * the second tail page: ->mapping is
+                * page_deferred_list().next -- ignore value.
+                */
+               break;
+       default:
+               if (page->mapping != TAIL_MAPPING) {
+                       bad_page(page, "corrupted mapping in tail page", 0);
+                       goto out;
+               }
+               break;
+       }
        if (unlikely(!PageTail(page))) {
                bad_page(page, "PageTail not set", 0);
-               return 1;
+               goto out;
        }
-       if (unlikely(page->first_page != head_page)) {
-               bad_page(page, "first_page not consistent", 0);
-               return 1;
+       if (unlikely(compound_head(page) != head_page)) {
+               bad_page(page, "compound_head not consistent", 0);
+               goto out;
        }
-       return 0;
+       ret = 0;
+ out:
+       page->mapping = NULL;
+       clear_compound_head(page);
+       return ret;
  }
  
  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@@ -923,6 -967,10 +967,10 @@@ void __meminit reserve_bootmem_region(u
                        struct page *page = pfn_to_page(start_pfn);
  
                        init_reserved_page(start_pfn);
+                       /* Avoid false-positive PageTail() */
+                       INIT_LIST_HEAD(&page->lru);
                        SetPageReserved(page);
                }
        }
@@@ -1314,7 -1362,7 +1362,7 @@@ static inline int check_new_page(struc
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
  
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@@ -1417,15 -1465,14 +1465,14 @@@ struct page *__rmqueue_smallest(struct 
   * the free lists for the desirable migrate type are depleted
   */
  static int fallbacks[MIGRATE_TYPES][4] = {
-       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
-       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
-       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
+       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
+       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
+       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
  #ifdef CONFIG_CMA
-       [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
+       [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
  #endif
-       [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
  #ifdef CONFIG_MEMORY_ISOLATION
-       [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
+       [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
  #endif
  };
  
@@@ -1450,7 -1497,7 +1497,7 @@@ int move_freepages(struct zone *zone
                          int migratetype)
  {
        struct page *page;
-       unsigned long order;
+       unsigned int order;
        int pages_moved = 0;
  
  #ifndef CONFIG_HOLES_IN_ZONE
@@@ -1563,7 -1610,7 +1610,7 @@@ static bool can_steal_fallback(unsigne
  static void steal_suitable_fallback(struct zone *zone, struct page *page,
                                                          int start_type)
  {
-       int current_order = page_order(page);
+       unsigned int current_order = page_order(page);
        int pages;
  
        /* Take ownership for orders >= pageblock_order */
@@@ -1598,7 -1645,7 +1645,7 @@@ int find_suitable_fallback(struct free_
        *can_steal = false;
        for (i = 0;; i++) {
                fallback_mt = fallbacks[migratetype][i];
-               if (fallback_mt == MIGRATE_RESERVE)
+               if (fallback_mt == MIGRATE_TYPES)
                        break;
  
                if (list_empty(&area->free_list[fallback_mt]))
        return -1;
  }
  
+ /*
+  * Reserve a pageblock for exclusive use of high-order atomic allocations if
+  * there are no empty page blocks that contain a page with a suitable order
+  */
+ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
+                               unsigned int alloc_order)
+ {
+       int mt;
+       unsigned long max_managed, flags;
+       /*
+        * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+        * Check is race-prone but harmless.
+        */
+       max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+       if (zone->nr_reserved_highatomic >= max_managed)
+               return;
+       spin_lock_irqsave(&zone->lock, flags);
+       /* Recheck the nr_reserved_highatomic limit under the lock */
+       if (zone->nr_reserved_highatomic >= max_managed)
+               goto out_unlock;
+       /* Yoink! */
+       mt = get_pageblock_migratetype(page);
+       if (mt != MIGRATE_HIGHATOMIC &&
+                       !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+               zone->nr_reserved_highatomic += pageblock_nr_pages;
+               set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+               move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+       }
+ out_unlock:
+       spin_unlock_irqrestore(&zone->lock, flags);
+ }
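
As a rough worked example of the cap computed above (the numbers are made up for illustration): for a zone managing 4 GB of 4 KB pages with 2 MB pageblocks, at most about 21 pageblocks end up reserved.

/* Worked example of the max_managed cap above, with made-up numbers. */
static unsigned long example_highatomic_cap(void)
{
        unsigned long managed_pages = 1048576;  /* 4 GB zone of 4 KB pages */
        unsigned long pageblock_nr = 512;       /* 2 MB pageblocks */

        /* 1048576 / 100 + 512 = 10485 + 512 = 10997 pages, ~21 pageblocks */
        return managed_pages / 100 + pageblock_nr;
}
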
+ /*
+  * Used when an allocation is about to fail under memory pressure. This
+  * potentially hurts the reliability of high-order allocations when under
+  * intense memory pressure but failed atomic allocations should be easier
+  * to recover from than an OOM.
+  */
+ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+ {
+       struct zonelist *zonelist = ac->zonelist;
+       unsigned long flags;
+       struct zoneref *z;
+       struct zone *zone;
+       struct page *page;
+       int order;
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+                                                               ac->nodemask) {
+               /* Preserve at least one pageblock */
+               if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+                       continue;
+               spin_lock_irqsave(&zone->lock, flags);
+               for (order = 0; order < MAX_ORDER; order++) {
+                       struct free_area *area = &(zone->free_area[order]);
+                       if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+                               continue;
+                       page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
+                                               struct page, lru);
+                       /*
+                        * It should never happen but changes to locking could
+                        * inadvertently allow a per-cpu drain to add pages
+                        * to MIGRATE_HIGHATOMIC while unreserving so be safe
+                        * and watch for underflows.
+                        */
+                       zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
+                               zone->nr_reserved_highatomic);
+                       /*
+                        * Convert to ac->migratetype and avoid the normal
+                        * pageblock stealing heuristics. Minimally, the caller
+                        * is doing the work and needs the pages. More
+                        * importantly, if the block was always converted to
+                        * MIGRATE_UNMOVABLE or another type then the number
+                        * of pageblocks that cannot be completely freed
+                        * may increase.
+                        */
+                       set_pageblock_migratetype(page, ac->migratetype);
+                       move_freepages_block(zone, page, ac->migratetype);
+                       spin_unlock_irqrestore(&zone->lock, flags);
+                       return;
+               }
+               spin_unlock_irqrestore(&zone->lock, flags);
+       }
+ }
  /* Remove an element from the buddy allocator from the fallback list */
  static inline struct page *
  __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
   * Call me with the zone->lock already held.
   */
  static struct page *__rmqueue(struct zone *zone, unsigned int order,
-                                               int migratetype)
+                               int migratetype, gfp_t gfp_flags)
  {
        struct page *page;
  
- retry_reserve:
        page = __rmqueue_smallest(zone, order, migratetype);
-       if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
+       if (unlikely(!page)) {
                if (migratetype == MIGRATE_MOVABLE)
                        page = __rmqueue_cma_fallback(zone, order);
  
                if (!page)
                        page = __rmqueue_fallback(zone, order, migratetype);
-               /*
-                * Use MIGRATE_RESERVE rather than fail an allocation. goto
-                * is used because __rmqueue_smallest is an inline function
-                * and we want just one call site
-                */
-               if (!page) {
-                       migratetype = MIGRATE_RESERVE;
-                       goto retry_reserve;
-               }
        }
  
        trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@@ -1714,7 -1844,7 +1844,7 @@@ static int rmqueue_bulk(struct zone *zo
  
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
-               struct page *page = __rmqueue(zone, order, migratetype);
+               struct page *page = __rmqueue(zone, order, migratetype, 0);
                if (unlikely(page == NULL))
                        break;
  
@@@ -2086,7 -2216,7 +2216,7 @@@ int split_free_page(struct page *page
  static inline
  struct page *buffered_rmqueue(struct zone *preferred_zone,
                        struct zone *zone, unsigned int order,
-                       gfp_t gfp_flags, int migratetype)
+                       gfp_t gfp_flags, int alloc_flags, int migratetype)
  {
        unsigned long flags;
        struct page *page;
                        WARN_ON_ONCE(order > 1);
                }
                spin_lock_irqsave(&zone->lock, flags);
-               page = __rmqueue(zone, order, migratetype);
+               page = NULL;
+               if (alloc_flags & ALLOC_HARDER) {
+                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                       if (page)
+                               trace_mm_page_alloc_zone_locked(page, order, migratetype);
+               }
+               if (!page)
+                       page = __rmqueue(zone, order, migratetype, gfp_flags);
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
@@@ -2159,13 -2297,13 +2297,13 @@@ failed
  static struct {
        struct fault_attr attr;
  
 -      u32 ignore_gfp_highmem;
 -      u32 ignore_gfp_reclaim;
 +      bool ignore_gfp_highmem;
-       bool ignore_gfp_wait;
++      bool ignore_gfp_reclaim;
        u32 min_order;
  } fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
-       .ignore_gfp_wait = true,
 -      .ignore_gfp_reclaim = 1,
 -      .ignore_gfp_highmem = 1,
++      .ignore_gfp_reclaim = true,
 +      .ignore_gfp_highmem = true,
        .min_order = 1,
  };
  
@@@ -2183,7 -2321,8 +2321,8 @@@ static bool should_fail_alloc_page(gfp_
                return false;
        if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
                return false;
-       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+       if (fail_page_alloc.ignore_gfp_reclaim &&
+                       (gfp_mask & __GFP_DIRECT_RECLAIM))
                return false;
  
        return should_fail(&fail_page_alloc.attr, 1 << order);
@@@ -2202,7 -2341,7 +2341,7 @@@ static int __init fail_page_alloc_debug
                return PTR_ERR(dir);
  
        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &fail_page_alloc.ignore_gfp_wait))
+                               &fail_page_alloc.ignore_gfp_reclaim))
                goto fail;
        if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
                                &fail_page_alloc.ignore_gfp_highmem))
@@@ -2232,42 -2371,77 +2371,77 @@@ static inline bool should_fail_alloc_pa
  #endif /* CONFIG_FAIL_PAGE_ALLOC */
  
  /*
-  * Return true if free pages are above 'mark'. This takes into account the order
-  * of the allocation.
+  * Return true if free base pages are above 'mark'. For high-order checks it
+  * will return true if the order-0 watermark is reached and there is at least
+  * one free page of a suitable size. Checking now avoids taking the zone lock
+  * to check in the allocation paths if no pages are free.
   */
  static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                        unsigned long mark, int classzone_idx, int alloc_flags,
                        long free_pages)
  {
-       /* free_pages may go negative - that's OK */
        long min = mark;
        int o;
-       long free_cma = 0;
+       const int alloc_harder = (alloc_flags & ALLOC_HARDER);
  
+       /* free_pages may go negative - that's OK */
        free_pages -= (1 << order) - 1;
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
-       if (alloc_flags & ALLOC_HARDER)
+       /*
+        * If the caller does not have rights to ALLOC_HARDER then subtract
+        * the high-atomic reserves. This will over-estimate the size of the
+        * atomic reserve but it avoids a search.
+        */
+       if (likely(!alloc_harder))
+               free_pages -= z->nr_reserved_highatomic;
+       else
                min -= min / 4;
  #ifdef CONFIG_CMA
        /* If allocation can't use CMA areas don't use free CMA pages */
        if (!(alloc_flags & ALLOC_CMA))
-               free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+               free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
  #endif
  
-       if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+       /*
+        * Check watermarks for an order-0 allocation request. If these
+        * are not met, then a high-order request also cannot go ahead
+        * even if a suitable page happened to be free.
+        */
+       if (free_pages <= min + z->lowmem_reserve[classzone_idx])
                return false;
-       for (o = 0; o < order; o++) {
-               /* At the next order, this order's pages become unavailable */
-               free_pages -= z->free_area[o].nr_free << o;
  
-               /* Require fewer higher order pages to be free */
-               min >>= 1;
+       /* If this is an order-0 request then the watermark is fine */
+       if (!order)
+               return true;
+       /* For a high-order request, check at least one suitable page is free */
+       for (o = order; o < MAX_ORDER; o++) {
+               struct free_area *area = &z->free_area[o];
+               int mt;
+               if (!area->nr_free)
+                       continue;
+               if (alloc_harder)
+                       return true;
  
-               if (free_pages <= min)
-                       return false;
+               for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+                       if (!list_empty(&area->free_list[mt]))
+                               return true;
+               }
+ #ifdef CONFIG_CMA
+               if ((alloc_flags & ALLOC_CMA) &&
+                   !list_empty(&area->free_list[MIGRATE_CMA])) {
+                       return true;
+               }
+ #endif
        }
-       return true;
+       return false;
  }
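
A worked example of the order-0 part of the rewritten check, using made-up numbers; this sketch only mirrors the arithmetic above and ignores CMA and the highatomic reserve.

/*
 * Made-up numbers: order-2 request, min watermark of 1000 pages, 1200 free
 * pages, no ALLOC_HIGH or ALLOC_HARDER, empty highatomic reserve, and zero
 * lowmem_reserve for the classzone.
 */
static bool example_watermark_check(void)
{
        long free_pages = 1200;
        long min = 1000;
        unsigned int order = 2;

        free_pages -= (1 << order) - 1;         /* 1200 - 3 = 1197 */

        /* order-0 gate: 1197 > 1000, so go on to scan the free areas */
        return free_pages > min;
}
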
  
  bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  }
  
  bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-                       unsigned long mark, int classzone_idx, int alloc_flags)
+                       unsigned long mark, int classzone_idx)
  {
        long free_pages = zone_page_state(z, NR_FREE_PAGES);
  
        if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
                free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
  
-       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+       return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
                                                                free_pages);
  }
  
  #ifdef CONFIG_NUMA
- /*
-  * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
-  * skip over zones that are not allowed by the cpuset, or that have
-  * been recently (in last second) found to be nearly full.  See further
-  * comments in mmzone.h.  Reduces cache footprint of zonelist scans
-  * that have to skip over a lot of full or unallowed zones.
-  *
-  * If the zonelist cache is present in the passed zonelist, then
-  * returns a pointer to the allowed node mask (either the current
-  * tasks mems_allowed, or node_states[N_MEMORY].)
-  *
-  * If the zonelist cache is not available for this zonelist, does
-  * nothing and returns NULL.
-  *
-  * If the fullzones BITMAP in the zonelist cache is stale (more than
-  * a second since last zap'd) then we zap it out (clear its bits.)
-  *
-  * We hold off even calling zlc_setup, until after we've checked the
-  * first zone in the zonelist, on the theory that most allocations will
-  * be satisfied from that first zone, so best to examine that zone as
-  * quickly as we can.
-  */
- static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
- {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       nodemask_t *allowednodes;       /* zonelist_cache approximation */
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return NULL;
-       if (time_after(jiffies, zlc->last_full_zap + HZ)) {
-               bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-               zlc->last_full_zap = jiffies;
-       }
-       allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
-                                       &cpuset_current_mems_allowed :
-                                       &node_states[N_MEMORY];
-       return allowednodes;
- }
- /*
-  * Given 'z' scanning a zonelist, run a couple of quick checks to see
-  * if it is worth looking at further for free memory:
-  *  1) Check that the zone isn't thought to be full (doesn't have its
-  *     bit set in the zonelist_cache fullzones BITMAP).
-  *  2) Check that the zones node (obtained from the zonelist_cache
-  *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
-  * Return true (non-zero) if zone is worth looking at further, or
-  * else return false (zero) if it is not.
-  *
-  * This check -ignores- the distinction between various watermarks,
-  * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
-  * found to be full for any variation of these watermarks, it will
-  * be considered full for up to one second by all requests, unless
-  * we are so low on memory on all allowed nodes that we are forced
-  * into the second scan of the zonelist.
-  *
-  * In the second scan we ignore this zonelist cache and exactly
-  * apply the watermarks to all zones, even it is slower to do so.
-  * We are low on memory in the second scan, and should leave no stone
-  * unturned looking for a free page.
-  */
- static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
-                                               nodemask_t *allowednodes)
- {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       int i;                          /* index of *z in zonelist zones */
-       int n;                          /* node that zone *z is on */
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return 1;
-       i = z - zonelist->_zonerefs;
-       n = zlc->z_to_n[i];
-       /* This zone is worth trying if it is allowed but not full */
-       return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
- }
- /*
-  * Given 'z' scanning a zonelist, set the corresponding bit in
-  * zlc->fullzones, so that subsequent attempts to allocate a page
-  * from that zone don't waste time re-examining it.
-  */
- static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
- {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       int i;                          /* index of *z in zonelist zones */
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return;
-       i = z - zonelist->_zonerefs;
-       set_bit(i, zlc->fullzones);
- }
- /*
-  * clear all zones full, called after direct reclaim makes progress so that
-  * a zone that was recently full is not skipped over for up to a second
-  */
- static void zlc_clear_zones_full(struct zonelist *zonelist)
- {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return;
-       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
- }
  static bool zone_local(struct zone *local_zone, struct zone *zone)
  {
        return local_zone->node == zone->node;
@@@ -2416,28 -2474,7 +2474,7 @@@ static bool zone_allows_reclaim(struct 
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
                                RECLAIM_DISTANCE;
  }
  #else /* CONFIG_NUMA */
- static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
- {
-       return NULL;
- }
- static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
-                               nodemask_t *allowednodes)
- {
-       return 1;
- }
- static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
- {
- }
- static void zlc_clear_zones_full(struct zonelist *zonelist)
- {
- }
  static bool zone_local(struct zone *local_zone, struct zone *zone)
  {
        return true;
@@@ -2447,7 -2484,6 +2484,6 @@@ static bool zone_allows_reclaim(struct 
  {
        return true;
  }
  #endif        /* CONFIG_NUMA */
  
  static void reset_alloc_batches(struct zone *preferred_zone)
@@@ -2474,11 -2510,6 +2510,6 @@@ get_page_from_freelist(gfp_t gfp_mask, 
        struct zoneref *z;
        struct page *page = NULL;
        struct zone *zone;
-       nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
-       int zlc_active = 0;             /* set if using zonelist_cache */
-       int did_zlc_setup = 0;          /* just call zlc_setup() one time */
-       bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
-                               (gfp_mask & __GFP_WRITE);
        int nr_fair_skipped = 0;
        bool zonelist_rescan;
  
@@@ -2493,9 -2524,6 +2524,6 @@@ zonelist_scan
                                                                ac->nodemask) {
                unsigned long mark;
  
-               if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
-                       !zlc_zone_worth_trying(zonelist, z, allowednodes))
-                               continue;
                if (cpusets_enabled() &&
                        (alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed(zone, gfp_mask))
                 *
                 * XXX: For now, allow allocations to potentially
                 * exceed the per-zone dirty limit in the slowpath
-                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * (spread_dirty_pages unset) before going into reclaim,
                 * which is important when on a NUMA setup the allowed
                 * zones are together not big enough to reach the
                 * global limit.  The proper fix for these situations
                 * will require awareness of zones in the
                 * dirty-throttling and the flusher threads.
                 */
-               if (consider_zone_dirty && !zone_dirty_ok(zone))
+               if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
                        continue;
  
                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
  
-                       if (IS_ENABLED(CONFIG_NUMA) &&
-                                       !did_zlc_setup && nr_online_nodes > 1) {
-                               /*
-                                * we do zlc_setup if there are multiple nodes
-                                * and before considering the first zone allowed
-                                * by the cpuset.
-                                */
-                               allowednodes = zlc_setup(zonelist, alloc_flags);
-                               zlc_active = 1;
-                               did_zlc_setup = 1;
-                       }
                        if (zone_reclaim_mode == 0 ||
                            !zone_allows_reclaim(ac->preferred_zone, zone))
-                               goto this_zone_full;
-                       /*
-                        * As we may have just activated ZLC, check if the first
-                        * eligible zone has failed zone_reclaim recently.
-                        */
-                       if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
-                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
                                continue;
  
                        ret = zone_reclaim(zone, gfp_mask, order);
                                                ac->classzone_idx, alloc_flags))
                                        goto try_this_zone;
  
-                               /*
-                                * Failed to reclaim enough to meet watermark.
-                                * Only mark the zone full if checking the min
-                                * watermark or if we failed to reclaim just
-                                * 1<<order pages or else the page allocator
-                                * fastpath will prematurely mark zones full
-                                * when the watermark is between the low and
-                                * min watermarks.
-                                */
-                               if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
-                                   ret == ZONE_RECLAIM_SOME)
-                                       goto this_zone_full;
                                continue;
                        }
                }
  
  try_this_zone:
                page = buffered_rmqueue(ac->preferred_zone, zone, order,
-                                               gfp_mask, ac->migratetype);
+                               gfp_mask, alloc_flags, ac->migratetype);
                if (page) {
                        if (prep_new_page(page, order, gfp_mask, alloc_flags))
                                goto try_this_zone;
+                       /*
+                        * If this is a high-order atomic allocation then check
+                        * if the pageblock should be reserved for the future
+                        */
+                       if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+                               reserve_highatomic_pageblock(page, zone, order);
                        return page;
                }
- this_zone_full:
-               if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
-                       zlc_mark_zone_full(zonelist, z);
        }
  
        /*
                        zonelist_rescan = true;
        }
  
-       if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
-               /* Disable zlc cache for second zonelist scan */
-               zlc_active = 0;
-               zonelist_rescan = true;
-       }
        if (zonelist_rescan)
                goto zonelist_scan;
  
@@@ -2669,7 -2663,7 +2663,7 @@@ static DEFINE_RATELIMIT_STATE(nopage_rs
                DEFAULT_RATELIMIT_INTERVAL,
                DEFAULT_RATELIMIT_BURST);
  
- void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
  {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
  
                if (test_thread_flag(TIF_MEMDIE) ||
                    (current->flags & (PF_MEMALLOC | PF_EXITING)))
                        filter &= ~SHOW_MEM_FILTER_NODES;
-       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+       if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                filter &= ~SHOW_MEM_FILTER_NODES;
  
        if (fmt) {
                va_end(args);
        }
  
-       pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+       pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
                current->comm, order, gfp_mask);
  
        dump_stack();
@@@ -2889,19 -2883,17 +2883,17 @@@ __alloc_pages_direct_reclaim(gfp_t gfp_
        if (unlikely(!(*did_some_progress)))
                return NULL;
  
-       /* After successful reclaim, reconsider all zones for allocation */
-       if (IS_ENABLED(CONFIG_NUMA))
-               zlc_clear_zones_full(ac->zonelist);
  retry:
        page = get_page_from_freelist(gfp_mask, order,
                                        alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
  
        /*
         * If an allocation failed after direct reclaim, it could be because
-        * pages are pinned on the per-cpu lists. Drain them and try again
+        * pages are pinned on the per-cpu lists or in high alloc reserves.
+        * Shrink them and try again
         */
        if (!page && !drained) {
+               unreserve_highatomic_pageblock(ac);
                drain_all_pages(NULL);
                drained = true;
                goto retry;
@@@ -2946,7 -2938,6 +2938,6 @@@ static inline in
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
  
        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
        BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
         * The caller may dip into page reserves a bit more if the caller
         * cannot run direct reclaim, or if the caller has realtime scheduling
         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
         */
        alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
-       if (atomic) {
+       if (gfp_mask & __GFP_ATOMIC) {
                /*
                 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
                 * if it can't schedule.
@@@ -2996,11 -2987,16 +2987,16 @@@ bool gfp_pfmemalloc_allowed(gfp_t gfp_m
        return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
  }
  
+ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+ {
+       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+ }
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                                struct alloc_context *ac)
  {
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
        struct page *page = NULL;
        int alloc_flags;
        unsigned long pages_reclaimed = 0;
                return NULL;
        }
  
+       /*
+        * We also sanity check to catch abuse of atomic reserves being used by
+        * callers that are not in atomic context.
+        */
+       if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+                               (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+               gfp_mask &= ~__GFP_ATOMIC;
        /*
         * If this allocation cannot block and it is for a specific node, then
         * fail early.  There's no need to wakeup kswapd or retry for a
         * speculative node-specific allocation.
         */
-       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
+       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
                goto nopage;
  
  retry:
-       if (!(gfp_mask & __GFP_NO_KSWAPD))
+       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                wake_all_kswapds(order, ac);
  
        /*
                }
        }
  
-       /* Atomic allocations - we can't balance anything */
-       if (!wait) {
+       /* Caller is not willing to reclaim, we can't balance anything */
+       if (!can_direct_reclaim) {
                /*
                 * All existing users of the deprecated __GFP_NOFAIL are
                 * blockable, so warn of any new users that actually allow this
                goto got_pg;
  
        /* Checks for THP-specific high-order allocations */
-       if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+       if (is_thp_gfp_mask(gfp_mask)) {
                /*
                 * If compaction is deferred for high-order allocations, it is
                 * because sync compaction recently failed. If this is the case
         * fault, so use asynchronous memory compaction for THP unless it is
         * khugepaged trying to collapse.
         */
-       if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
-                                               (current->flags & PF_KTHREAD))
+       if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
                migration_mode = MIGRATE_SYNC_LIGHT;
  
        /* Try direct reclaim and then allocating */
@@@ -3210,7 -3213,7 +3213,7 @@@ __alloc_pages_nodemask(gfp_t gfp_mask, 
  
        lockdep_trace_alloc(gfp_mask);
  
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
  
        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;
@@@ -3231,6 -3234,10 +3234,10 @@@ retry_cpuset
  
        /* We set it here, as __alloc_pages_slowpath might have changed it */
        ac.zonelist = zonelist;
+       /* Dirty zone balancing only done in the fast path */
+       ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
        /* The preferred zone is used for statistics later */
        preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                                ac.nodemask ? : &cpuset_current_mems_allowed,
                 * complete.
                 */
                alloc_mask = memalloc_noio_flags(gfp_mask);
+               ac.spread_dirty_pages = false;
  
                page = __alloc_pages_slowpath(alloc_mask, order, &ac);
        }
@@@ -3428,24 -3436,24 +3436,24 @@@ EXPORT_SYMBOL(__free_page_frag)
  struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
  {
        struct page *page;
-       struct mem_cgroup *memcg = NULL;
  
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
        page = alloc_pages(gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
        return page;
  }
  
  struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
  {
        struct page *page;
-       struct mem_cgroup *memcg = NULL;
  
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
        page = alloc_pages_node(nid, gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
        return page;
  }
  
   */
  void __free_kmem_pages(struct page *page, unsigned int order)
  {
-       memcg_kmem_uncharge_pages(page, order);
+       memcg_kmem_uncharge(page, order);
        __free_pages(page, order);
  }
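
For illustration, a hypothetical consumer pairing the charged kmem page helpers touched above; example_kmem_buffer() and example_kmem_buffer_free() are made up for this sketch.

#include <linux/gfp.h>
#include <linux/mm.h>

static void *example_kmem_buffer(unsigned int order)
{
        struct page *page = alloc_kmem_pages(GFP_KERNEL, order);

        return page ? page_address(page) : NULL;
}

static void example_kmem_buffer_free(void *buf, unsigned int order)
{
        if (buf)
                free_kmem_pages((unsigned long)buf, order);
}
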
  
@@@ -3467,7 -3475,8 +3475,8 @@@ void free_kmem_pages(unsigned long addr
        }
  }
  
- static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+ static void *make_alloc_exact(unsigned long addr, unsigned int order,
+               size_t size)
  {
        if (addr) {
                unsigned long alloc_end = addr + (PAGE_SIZE << order);
@@@ -3517,7 -3526,7 +3526,7 @@@ EXPORT_SYMBOL(alloc_pages_exact)
   */
  void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
  {
-       unsigned order = get_order(size);
+       unsigned int order = get_order(size);
        struct page *p = alloc_pages_node(nid, gfp_mask, order);
        if (!p)
                return NULL;
@@@ -3666,7 -3675,6 +3675,6 @@@ static void show_migration_types(unsign
                [MIGRATE_UNMOVABLE]     = 'U',
                [MIGRATE_RECLAIMABLE]   = 'E',
                [MIGRATE_MOVABLE]       = 'M',
-               [MIGRATE_RESERVE]       = 'R',
  #ifdef CONFIG_CMA
                [MIGRATE_CMA]           = 'C',
  #endif
@@@ -3819,7 -3827,8 +3827,8 @@@ void show_free_areas(unsigned int filte
        }
  
        for_each_populated_zone(zone) {
-               unsigned long nr[MAX_ORDER], flags, order, total = 0;
+               unsigned int order;
+               unsigned long nr[MAX_ORDER], flags, total = 0;
                unsigned char types[MAX_ORDER];
  
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@@ -4168,7 -4177,7 +4177,7 @@@ static void build_zonelists(pg_data_t *
        nodemask_t used_mask;
        int local_node, prev_node;
        struct zonelist *zonelist;
-       int order = current_zonelist_order;
+       unsigned int order = current_zonelist_order;
  
        /* initialize zonelists */
        for (i = 0; i < MAX_ZONELISTS; i++) {
        build_thisnode_zonelists(pgdat);
  }
  
- /* Construct the zonelist performance cache - see further mmzone.h */
- static void build_zonelist_cache(pg_data_t *pgdat)
- {
-       struct zonelist *zonelist;
-       struct zonelist_cache *zlc;
-       struct zoneref *z;
-       zonelist = &pgdat->node_zonelists[0];
-       zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
-       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-       for (z = zonelist->_zonerefs; z->zone; z++)
-               zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
- }
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /*
   * Return node id of node used for "local" allocations.
@@@ -4286,12 -4281,6 +4281,6 @@@ static void build_zonelists(pg_data_t *
        zonelist->_zonerefs[j].zone_idx = 0;
  }
  
- /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
- static void build_zonelist_cache(pg_data_t *pgdat)
- {
-       pgdat->node_zonelists[0].zlcache_ptr = NULL;
- }
  #endif        /* CONFIG_NUMA */
  
  /*
@@@ -4332,14 -4321,12 +4321,12 @@@ static int __build_all_zonelists(void *
  
        if (self && !node_online(self->node_id)) {
                build_zonelists(self);
-               build_zonelist_cache(self);
        }
  
        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
  
                build_zonelists(pgdat);
-               build_zonelist_cache(pgdat);
        }
  
        /*
                /*
                 * We now know the "local memory node" for each node--
                 * i.e., the node of the first zone in the generic zonelist.
-                * Set up numa_mem percpu variable for on-line cpus.  During
-                * boot, only the boot cpu should be on-line;  we'll init the
-                * secondary cpus' numa_mem as they come on-line.  During
-                * node/memory hotplug, we'll fixup all on-line cpus.
+                * Set up numa_mem percpu variable for all possible cpus
+                * if associated node has been onlined.
                 */
-               if (cpu_online(cpu))
+               if (node_online(cpu_to_node(cpu)))
                        set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+               else
+                       set_cpu_numa_mem(cpu, NUMA_NO_NODE);
  #endif
        }
  
@@@ -4498,120 -4485,6 +4485,6 @@@ static inline unsigned long wait_table_
        return ffz(~size);
  }
  
- /*
-  * Check if a pageblock contains reserved pages
-  */
- static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
- {
-       unsigned long pfn;
-       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
-                       return 1;
-       }
-       return 0;
- }
- /*
-  * Mark a number of pageblocks as MIGRATE_RESERVE. The number
-  * of blocks reserved is based on min_wmark_pages(zone). The memory within
-  * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
-  * higher will lead to a bigger reserve which will get freed as contiguous
-  * blocks as reclaim kicks in
-  */
- static void setup_zone_migrate_reserve(struct zone *zone)
- {
-       unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
-       struct page *page;
-       unsigned long block_migratetype;
-       int reserve;
-       int old_reserve;
-       /*
-        * Get the start pfn, end pfn and the number of blocks to reserve
-        * We have to be careful to be aligned to pageblock_nr_pages to
-        * make sure that we always check pfn_valid for the first page in
-        * the block.
-        */
-       start_pfn = zone->zone_start_pfn;
-       end_pfn = zone_end_pfn(zone);
-       start_pfn = roundup(start_pfn, pageblock_nr_pages);
-       reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
-                                                       pageblock_order;
-       /*
-        * Reserve blocks are generally in place to help high-order atomic
-        * allocations that are short-lived. A min_free_kbytes value that
-        * would result in more than 2 reserve blocks for atomic allocations
-        * is assumed to be in place to help anti-fragmentation for the
-        * future allocation of hugepages at runtime.
-        */
-       reserve = min(2, reserve);
-       old_reserve = zone->nr_migrate_reserve_block;
-       /* When memory hot-add, we almost always need to do nothing */
-       if (reserve == old_reserve)
-               return;
-       zone->nr_migrate_reserve_block = reserve;
-       for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
-               if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
-                       return;
-               if (!pfn_valid(pfn))
-                       continue;
-               page = pfn_to_page(pfn);
-               /* Watch out for overlapping nodes */
-               if (page_to_nid(page) != zone_to_nid(zone))
-                       continue;
-               block_migratetype = get_pageblock_migratetype(page);
-               /* Only test what is necessary when the reserves are not met */
-               if (reserve > 0) {
-                       /*
-                        * Blocks with reserved pages will never free, skip
-                        * them.
-                        */
-                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-                       if (pageblock_is_reserved(pfn, block_end_pfn))
-                               continue;
-                       /* If this block is reserved, account for it */
-                       if (block_migratetype == MIGRATE_RESERVE) {
-                               reserve--;
-                               continue;
-                       }
-                       /* Suitable for reserving if this block is movable */
-                       if (block_migratetype == MIGRATE_MOVABLE) {
-                               set_pageblock_migratetype(page,
-                                                       MIGRATE_RESERVE);
-                               move_freepages_block(zone, page,
-                                                       MIGRATE_RESERVE);
-                               reserve--;
-                               continue;
-                       }
-               } else if (!old_reserve) {
-                       /*
-                        * At boot time we don't need to scan the whole zone
-                        * for turning off MIGRATE_RESERVE.
-                        */
-                       break;
-               }
-               /*
-                * If the reserve is met and this is a previous reserved block,
-                * take it back
-                */
-               if (block_migratetype == MIGRATE_RESERVE) {
-                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-                       move_freepages_block(zone, page, MIGRATE_MOVABLE);
-               }
-       }
- }
  /*
   * Initially all pages are reserved - free ones are freed
   * up by free_all_bootmem() once the early boot process is
@@@ -4651,9 -4524,7 +4524,7 @@@ void __meminit memmap_init_zone(unsigne
                 * movable at startup. This will force kernel allocations
                 * to reserve their blocks rather than leaking throughout
                 * the address space during boot when many long-lived
-                * kernel allocations are made. Later some blocks near
-                * the start are marked MIGRATE_RESERVE by
-                * setup_zone_migrate_reserve()
+                * kernel allocations are made.
                 *
                 * bitmap is created for zone's valid pfn range. but memmap
                 * can be created for invalid pages (for alignment)
@@@ -4900,8 -4771,7 +4771,7 @@@ static __meminit void zone_pcp_init(str
  
  int __meminit init_currently_empty_zone(struct zone *zone,
                                        unsigned long zone_start_pfn,
-                                       unsigned long size,
-                                       enum memmap_context context)
+                                       unsigned long size)
  {
        struct pglist_data *pgdat = zone->zone_pgdat;
        int ret;
@@@ -5413,8 -5283,7 +5283,7 @@@ static void __paginginit free_area_init
  
                set_pageblock_order();
                setup_usemap(pgdat, zone, zone_start_pfn, size);
-               ret = init_currently_empty_zone(zone, zone_start_pfn,
-                                               size, MEMMAP_EARLY);
+               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
                BUG_ON(ret);
                memmap_init(size, nid, j, zone_start_pfn);
                zone_start_pfn += size;
  
  static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
  {
+       unsigned long __maybe_unused offset = 0;
        /* Skip empty nodes */
        if (!pgdat->node_spanned_pages)
                return;
                 * for the buddy allocator to function correctly.
                 */
                start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+               offset = pgdat->node_start_pfn - start;
                end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
                if (!map)
                        map = memblock_virt_alloc_node_nopanic(size,
                                                               pgdat->node_id);
-               pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
+               pgdat->node_mem_map = map + offset;
        }
  #ifndef CONFIG_NEED_MULTIPLE_NODES
        /*
         */
        if (pgdat == NODE_DATA(0)) {
                mem_map = NODE_DATA(0)->node_mem_map;
- #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
-                       mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+                       mem_map -= offset;
  #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
        }
  #endif
@@@ -5668,13 -5540,17 +5540,17 @@@ static void __init find_zone_movable_pf
                 */
                required_movablecore =
                        roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+               required_movablecore = min(totalpages, required_movablecore);
                corepages = totalpages - required_movablecore;
  
                required_kernelcore = max(required_kernelcore, corepages);
        }
  
-       /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-       if (!required_kernelcore)
+       /*
+        * If kernelcore was not specified or kernelcore size is larger
+        * than totalpages, there is no ZONE_MOVABLE.
+        */
+       if (!required_kernelcore || required_kernelcore >= totalpages)
                goto out;
  
        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
@@@ -6209,7 -6085,6 +6085,6 @@@ static void __setup_per_zone_wmarks(voi
                        high_wmark_pages(zone) - low_wmark_pages(zone) -
                        atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
  
-               setup_zone_migrate_reserve(zone);
                spin_unlock_irqrestore(&zone->lock, flags);
        }
  
@@@ -6831,7 -6706,8 +6706,8 @@@ int alloc_contig_range(unsigned long st
                       unsigned migratetype)
  {
        unsigned long outer_start, outer_end;
-       int ret = 0, order;
+       unsigned int order;
+       int ret = 0;
  
        struct compact_control cc = {
                .nr_migratepages = 0,
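
The find_zone_movable_pfns_for_nodes() hunk above clamps movablecore= to the number of
available pages before computing corepages, so the unsigned subtraction cannot wrap when a
user requests more movable memory than exists. A minimal userspace sketch of just that
arithmetic, with hypothetical page counts (this is not the kernel function, only its
clamping logic lifted out for illustration):

#include <stdio.h>

/* Hypothetical values for illustration only. */
#define MAX_ORDER_NR_PAGES 1024UL

static unsigned long roundup_pages(unsigned long x, unsigned long align)
{
        return ((x + align - 1) / align) * align;
}

int main(void)
{
        unsigned long totalpages = 1UL << 20;            /* ~4GB of 4K pages */
        unsigned long required_movablecore = 3UL << 20;  /* more than exists */
        unsigned long required_kernelcore = 0;
        unsigned long corepages;

        required_movablecore = roundup_pages(required_movablecore,
                                             MAX_ORDER_NR_PAGES);
        /* The new clamp: without it, corepages below wraps around. */
        if (required_movablecore > totalpages)
                required_movablecore = totalpages;
        corepages = totalpages - required_movablecore;
        if (corepages > required_kernelcore)
                required_kernelcore = corepages;

        /* Mirrors the new "kernelcore >= totalpages" bail-out above. */
        if (!required_kernelcore || required_kernelcore >= totalpages)
                printf("no ZONE_MOVABLE\n");
        else
                printf("kernelcore=%lu pages, movable=%lu pages\n",
                       required_kernelcore,
                       totalpages - required_kernelcore);
        return 0;
}
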
diff --combined mm/pgtable-generic.c
index 7d3db0247983b22b121290c2203ba2c2fb544ec0,89b150f8c920f200310641ed6a70e56150d06a6c..69261d4c774dd3d9894447bdbf6342aac746ad34
@@@ -57,59 -57,35 +57,59 @@@ int ptep_set_access_flags(struct vm_are
  }
  #endif
  
 +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
 +int ptep_clear_flush_young(struct vm_area_struct *vma,
 +                         unsigned long address, pte_t *ptep)
 +{
 +      int young;
 +      young = ptep_test_and_clear_young(vma, address, ptep);
 +      if (young)
 +              flush_tlb_page(vma, address);
 +      return young;
 +}
 +#endif
 +
 +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
 +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 +                     pte_t *ptep)
 +{
 +      struct mm_struct *mm = vma->vm_mm;
 +      pte_t pte;
 +      pte = ptep_get_and_clear(mm, address, ptep);
 +      if (pte_accessible(mm, pte))
 +              flush_tlb_page(vma, address);
 +      return pte;
 +}
 +#endif
 +
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +
 +#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
 +
 +/*
 + * ARCHes with special requirements for evicting THP backing TLB entries can
 + * implement this. Otherwise, it can also help optimize the normal TLB flush
 + * in the THP regime: the stock flush_tlb_range() typically has an
 + * optimization to nuke the entire TLB if the flush span is greater than a
 + * threshold, which will likely be true for a single huge page. Thus a single
 + * THP flush will invalidate the entire TLB, which is not desirable.
 + * e.g. see arch/arc: flush_pmd_tlb_range
 + */
 +#define flush_pmd_tlb_range(vma, addr, end)   flush_tlb_range(vma, addr, end)
 +#endif
 +
  #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
  int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
  {
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        int changed = !pmd_same(*pmdp, entry);
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (changed) {
                set_pmd_at(vma->vm_mm, address, pmdp, entry);
 -              flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +              flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }
        return changed;
 -#else /* CONFIG_TRANSPARENT_HUGEPAGE */
 -      BUG();
 -      return 0;
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 -}
 -#endif
 -
 -#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
 -int ptep_clear_flush_young(struct vm_area_struct *vma,
 -                         unsigned long address, pte_t *ptep)
 -{
 -      int young;
 -      young = ptep_test_and_clear_young(vma, address, ptep);
 -      if (young)
 -              flush_tlb_page(vma, address);
 -      return young;
  }
  #endif
  
@@@ -118,15 -94,33 +118,15 @@@ int pmdp_clear_flush_young(struct vm_ar
                           unsigned long address, pmd_t *pmdp)
  {
        int young;
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 -#else
 -      BUG();
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
 -              flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +              flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return young;
  }
  #endif
  
 -#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
 -pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 -                     pte_t *ptep)
 -{
 -      struct mm_struct *mm = (vma)->vm_mm;
 -      pte_t pte;
 -      pte = ptep_get_and_clear(mm, address, ptep);
 -      if (pte_accessible(mm, pte))
 -              flush_tlb_page(vma, address);
 -      return pte;
 -}
 -#endif
 -
  #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp)
  {
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(!pmd_trans_huge(*pmdp));
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 -      flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +      flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
- #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
- void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-                         pmd_t *pmdp)
- {
-       pmd_t pmd = pmd_mksplitting(*pmdp);
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-       /* tlb flush only to serialize against gup-fast */
-       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- }
- #endif
  #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
  {
                list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
        pmd_huge_pte(mm, pmdp) = pgtable;
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /* no "address" argument so destroys page coloring of some arch */
  pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
  {
        }
        return pgtable;
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef __HAVE_ARCH_PMDP_INVALIDATE
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
  {
        pmd_t entry = *pmdp;
        set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
 -      flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +      flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
  
  #ifndef pmdp_collapse_flush
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
  {
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));
        pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 -      flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 +      flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return pmd;
  }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  #endif
 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
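
The flush_pmd_tlb_range() fallback introduced above lets an architecture with special THP
TLB-eviction requirements supply its own implementation instead of the generic
flush_tlb_range(), as the comment's arch/arc reference suggests. A hedged sketch of what
such an override might look like — my_arch_flush_huge_tlb_entry() and the file placement
are placeholders, not the actual ARC code:

/*
 * In the architecture's pgtable header, before the generic code is pulled
 * in: announce the override so mm/pgtable-generic.c skips its #define.
 */
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
void flush_pmd_tlb_range(struct vm_area_struct *vma,
                         unsigned long start, unsigned long end);

/*
 * In the architecture's TLB code: flush one huge-page entry per PMD
 * instead of letting a wide flush_tlb_range() wipe the whole TLB.
 */
#include <linux/mm.h>
#include <linux/huge_mm.h>

void flush_pmd_tlb_range(struct vm_area_struct *vma,
                         unsigned long start, unsigned long end)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += HPAGE_PMD_SIZE)
                my_arch_flush_huge_tlb_entry(vma->vm_mm, addr); /* placeholder */
}
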
diff --combined mm/vmscan.c
index e7057af54b6e267558a99749fac80dc77dd7855f,f9cbe39d020bdbdf0c87055ea09236fb9a45cdc0..9b52ecf9119420bef8ce4ea2d503eb3ef4754c7f
@@@ -175,7 -175,7 +175,7 @@@ static bool sane_reclaim(struct scan_co
        if (!memcg)
                return true;
  #ifdef CONFIG_CGROUP_WRITEBACK
 -      if (cgroup_on_dfl(memcg->css.cgroup))
 +      if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return true;
  #endif
        return false;
@@@ -194,7 -194,7 +194,7 @@@ static bool sane_reclaim(struct scan_co
  
  static unsigned long zone_reclaimable_pages(struct zone *zone)
  {
-       int nr;
+       unsigned long nr;
  
        nr = zone_page_state(zone, NR_ACTIVE_FILE) +
             zone_page_state(zone, NR_INACTIVE_FILE);
@@@ -796,6 -796,8 +796,8 @@@ static enum page_references page_check_
        int referenced_ptes, referenced_page;
        unsigned long vm_flags;
  
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
        referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
                                          &vm_flags);
        referenced_page = TestClearPageReferenced(page);
@@@ -906,6 -908,7 +908,7 @@@ static unsigned long shrink_page_list(s
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
+               bool freeable = false;
  
                cond_resched();
  
                                goto keep_locked;
                        if (!add_to_swap(page, page_list))
                                goto activate_locked;
+                       freeable = true;
                        may_enter_fs = 1;
                        /* Adding to swap updated mapping */
                        mapping = page_mapping(page);
                }
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page,
-                                       ttu_flags|TTU_BATCH_FLUSH)) {
+                       switch (try_to_unmap(page, freeable ?
+                                       ttu_flags | TTU_BATCH_FLUSH | TTU_FREE :
+                                       ttu_flags | TTU_BATCH_FLUSH)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
                 * we obviously don't have to worry about waking up a process
                 * waiting on the page lock, because there are no references.
                 */
-               __clear_page_locked(page);
+               __ClearPageLocked(page);
  free_it:
+               if (freeable && !PageDirty(page))
+                       count_vm_event(PGLAZYFREED);
                nr_reclaimed++;
  
                /*
@@@ -1476,7 -1483,7 +1483,7 @@@ static int too_many_isolated(struct zon
         * won't get blocked by normal direct-reclaimers, forming a circular
         * deadlock.
         */
-       if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+       if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                inactive >>= 3;
  
        return isolated > inactive;
@@@ -1859,17 -1866,14 +1866,14 @@@ static void shrink_active_list(unsigne
  }
  
  #ifdef CONFIG_SWAP
- static int inactive_anon_is_low_global(struct zone *zone)
+ static bool inactive_anon_is_low_global(struct zone *zone)
  {
        unsigned long active, inactive;
  
        active = zone_page_state(zone, NR_ACTIVE_ANON);
        inactive = zone_page_state(zone, NR_INACTIVE_ANON);
  
-       if (inactive * zone->inactive_ratio < active)
-               return 1;
-       return 0;
+       return inactive * zone->inactive_ratio < active;
  }
  
  /**
   * Returns true if the zone does not have enough inactive anon pages,
   * meaning some active anon pages need to be deactivated.
   */
- static int inactive_anon_is_low(struct lruvec *lruvec)
+ static bool inactive_anon_is_low(struct lruvec *lruvec)
  {
        /*
         * If we don't have swap space, anonymous page deactivation
         * is pointless.
         */
        if (!total_swap_pages)
-               return 0;
+               return false;
  
        if (!mem_cgroup_disabled())
                return mem_cgroup_inactive_anon_is_low(lruvec);
        return inactive_anon_is_low_global(lruvec_zone(lruvec));
  }
  #else
- static inline int inactive_anon_is_low(struct lruvec *lruvec)
+ static inline bool inactive_anon_is_low(struct lruvec *lruvec)
  {
-       return 0;
+       return false;
  }
  #endif
  
   * This uses a different ratio than the anonymous pages, because
   * the page cache uses a use-once replacement algorithm.
   */
- static int inactive_file_is_low(struct lruvec *lruvec)
+ static bool inactive_file_is_low(struct lruvec *lruvec)
  {
        unsigned long inactive;
        unsigned long active;
        return active > inactive;
  }
  
- static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
+ static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
  {
        if (is_file_lru(lru))
                return inactive_file_is_low(lruvec);
@@@ -2480,7 -2484,7 +2484,7 @@@ static inline bool compaction_ready(str
        balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
                        zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
        watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
  
        /*
         * If compaction is deferred, reclaim up to a point where
@@@ -2963,7 -2967,7 +2967,7 @@@ static bool zone_balanced(struct zone *
                          unsigned long balance_gap, int classzone_idx)
  {
        if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-                                   balance_gap, classzone_idx, 0))
+                                   balance_gap, classzone_idx))
                return false;
  
        if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
@@@ -3696,10 -3700,10 +3700,10 @@@ static inline unsigned long zone_unmapp
  }
  
  /* Work out how many page cache pages we can reclaim in this reclaim_mode */
- static long zone_pagecache_reclaimable(struct zone *zone)
+ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
  {
-       long nr_pagecache_reclaimable;
-       long delta = 0;
+       unsigned long nr_pagecache_reclaimable;
+       unsigned long delta = 0;
  
        /*
         * If RECLAIM_UNMAP is set, then all file pages are considered
@@@ -3794,7 -3798,7 +3798,7 @@@ int zone_reclaim(struct zone *zone, gfp
        /*
         * Do not scan if the allocation should not be delayed.
         */
-       if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+       if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
                return ZONE_RECLAIM_NOSCAN;
  
        /*
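
The zone_reclaim() change above is one instance of a conversion repeated throughout this
merge (see also net/core/sock.c, net/netlink, net/rds and net/rxrpc below): the old
__GFP_WAIT test becomes either gfpflags_allow_blocking() or an explicit
__GFP_DIRECT_RECLAIM check. A minimal sketch of the helper as it is commonly described —
a thin test of __GFP_DIRECT_RECLAIM; treat the exact body as an assumption rather than a
quote of include/linux/gfp.h:

/* "May this allocation sleep to perform direct reclaim?" */
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
        return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}

Callers that previously masked off __GFP_WAIT to stay non-blocking now clear
__GFP_DIRECT_RECLAIM instead, which still leaves the kswapd-wakeup bit set — the pattern
spelled out by the "Avoid direct reclaim but allow kswapd to wake" comment in
skb_page_frag_refill() below.
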
diff --combined net/core/sock.c
index 0ef30aa90132c7a1a04971c773d4de8ed4ac146b,8cab9d90b0185cfe216d75ecf932b75b31b0a222..a2040bb09916e8f54f7a30e1990944116ba255c8
@@@ -422,25 -422,13 +422,25 @@@ static void sock_warn_obsolete_bsdism(c
        }
  }
  
 +static bool sock_needs_netstamp(const struct sock *sk)
 +{
 +      switch (sk->sk_family) {
 +      case AF_UNSPEC:
 +      case AF_UNIX:
 +              return false;
 +      default:
 +              return true;
 +      }
 +}
 +
  #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
  
  static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
  {
        if (sk->sk_flags & flags) {
                sk->sk_flags &= ~flags;
 -              if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 +              if (sock_needs_netstamp(sk) &&
 +                  !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
                        net_disable_timestamp();
        }
  }
@@@ -1000,10 -988,6 +1000,10 @@@ set_rcvbuf
                                         sk->sk_max_pacing_rate);
                break;
  
 +      case SO_INCOMING_CPU:
 +              sk->sk_incoming_cpu = val;
 +              break;
 +
        default:
                ret = -ENOPROTOOPT;
                break;
@@@ -1594,8 -1578,7 +1594,8 @@@ struct sock *sk_clone_lock(const struc
                if (newsk->sk_prot->sockets_allocated)
                        sk_sockets_allocated_inc(newsk);
  
 -              if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
 +              if (sock_needs_netstamp(sk) &&
 +                  newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                        net_enable_timestamp();
        }
  out:
@@@ -1869,32 -1852,6 +1869,32 @@@ struct sk_buff *sock_alloc_send_skb(str
  }
  EXPORT_SYMBOL(sock_alloc_send_skb);
  
 +int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 +                 struct sockcm_cookie *sockc)
 +{
 +      struct cmsghdr *cmsg;
 +
 +      for_each_cmsghdr(cmsg, msg) {
 +              if (!CMSG_OK(msg, cmsg))
 +                      return -EINVAL;
 +              if (cmsg->cmsg_level != SOL_SOCKET)
 +                      continue;
 +              switch (cmsg->cmsg_type) {
 +              case SO_MARK:
 +                      if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 +                              return -EPERM;
 +                      if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
 +                              return -EINVAL;
 +                      sockc->mark = *(u32 *)CMSG_DATA(cmsg);
 +                      break;
 +              default:
 +                      return -EINVAL;
 +              }
 +      }
 +      return 0;
 +}
 +EXPORT_SYMBOL(sock_cmsg_send);
 +
  /* On 32bit arches, an skb frag is limited to 2^15 */
  #define SKB_FRAG_PAGE_ORDER   get_order(32768)
  
@@@ -1922,8 -1879,10 +1922,10 @@@ bool skb_page_frag_refill(unsigned int 
  
        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER) {
-               pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
-                                         __GFP_NOWARN | __GFP_NORETRY,
+               /* Avoid direct reclaim but allow kswapd to wake */
+               pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+                                         __GFP_COMP | __GFP_NOWARN |
+                                         __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
@@@ -2396,7 -2355,6 +2398,7 @@@ void sock_init_data(struct socket *sock
  
        sk->sk_max_pacing_rate = ~0U;
        sk->sk_pacing_rate = ~0U;
 +      sk->sk_incoming_cpu = -1;
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.txt for details)
@@@ -2523,8 -2481,7 +2525,8 @@@ void sock_enable_timestamp(struct sock 
                 * time stamping, but time stamping might have been on
                 * already because of the other one
                 */
 -              if (!(previous_flags & SK_FLAGS_TIMESTAMP))
 +              if (sock_needs_netstamp(sk) &&
 +                  !(previous_flags & SK_FLAGS_TIMESTAMP))
                        net_enable_timestamp();
        }
  }
@@@ -2803,7 -2760,7 +2805,7 @@@ static int req_prot_init(const struct p
  
        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
 -                                         0, NULL);
 +                                         prot->slab_flags, NULL);
  
        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
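
sock_cmsg_send(), added and exported above, walks the SOL_SOCKET control messages of a
sendmsg() call and currently understands only SO_MARK, filling a struct sockcm_cookie for
the caller. A hedged sketch of how a protocol sendmsg path could consume it —
example_sendmsg() is a made-up name, not code from this series:

static int example_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
        struct sockcm_cookie sockc = { .mark = sk->sk_mark };
        int err;

        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (err)
                        return err;   /* -EINVAL or -EPERM from the helper */
        }

        /*
         * Build and queue the skb as usual, then stamp it with the
         * per-call mark instead of the socket-wide sk->sk_mark:
         *
         *      skb->mark = sockc.mark;
         */
        return len;
}
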
diff --combined net/netlink/af_netlink.c
index fafe33bdb61989e680dc4b26dbe99dcc1d4064b5,ab061cca59d2265d018fc109c2becd2ca626306c..59651af8cc2705b39f3ad1ea71ab0b161668af02
@@@ -2116,7 -2116,7 +2116,7 @@@ int netlink_broadcast_filtered(struct s
        consume_skb(info.skb2);
  
        if (info.delivered) {
-               if (info.congested && (allocation & __GFP_WAIT))
+               if (info.congested && gfpflags_allow_blocking(allocation))
                        yield();
                return 0;
        }
@@@ -2371,7 -2371,7 +2371,7 @@@ static int netlink_getsockopt(struct so
                int pos, idx, shift;
  
                err = 0;
 -              netlink_table_grab();
 +              netlink_lock_table();
                for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
                        if (len - pos < sizeof(u32))
                                break;
                }
                if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen))
                        err = -EFAULT;
 -              netlink_table_ungrab();
 +              netlink_unlock_table();
                break;
        }
        case NETLINK_CAP_ACK:
diff --combined net/openvswitch/flow.c
index 0ea128eeeab2f835221b2068b1098a81fe1d731d,95cd5fd3a78046b232454431c3632403239b51ec..619f1d710eac0d9f9aab37d5d2340f91fa92b639
@@@ -71,7 -71,7 +71,7 @@@ void ovs_flow_stats_update(struct sw_fl
                           const struct sk_buff *skb)
  {
        struct flow_stats *stats;
-       int node = numa_node_id();
+       int node = numa_mem_id();
        int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
  
        stats = rcu_dereference(flow->stats[node]);
@@@ -698,7 -698,8 +698,7 @@@ int ovs_flow_key_extract(const struct i
  {
        /* Extract metadata from packet. */
        if (tun_info) {
 -              if (ip_tunnel_info_af(tun_info) != AF_INET)
 -                      return -EINVAL;
 +              key->tun_proto = ip_tunnel_info_af(tun_info);
                memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
  
                if (tun_info->options_len) {
                        key->tun_opts_len = 0;
                }
        } else  {
 +              key->tun_proto = 0;
                key->tun_opts_len = 0;
                memset(&key->tun_key, 0, sizeof(key->tun_key));
        }
diff --combined net/rds/ib_recv.c
index 96744b75db9387aa2ef3b28d8ea103d81997a9ab,dcfb59775acc2bccdce963a06998a1b2d1fa70a0..977fb86065b75dbef916bd0acb9b94876c0f5c04
@@@ -305,7 -305,7 +305,7 @@@ static int rds_ib_recv_refill_one(struc
        gfp_t slab_mask = GFP_NOWAIT;
        gfp_t page_mask = GFP_NOWAIT;
  
-       if (gfp & __GFP_WAIT) {
+       if (gfp & __GFP_DIRECT_RECLAIM) {
                slab_mask = GFP_KERNEL;
                page_mask = GFP_HIGHUSER;
        }
@@@ -379,7 -379,7 +379,7 @@@ void rds_ib_recv_refill(struct rds_conn
        struct ib_recv_wr *failed_wr;
        unsigned int posted = 0;
        int ret = 0;
-       bool can_wait = !!(gfp & __GFP_WAIT);
+       bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
        u32 pos;
  
        /* the goal here is to just make sure that someone, somewhere
@@@ -596,7 -596,8 +596,7 @@@ void rds_ib_recv_init_ack(struct rds_ib
   * wr_id and avoids working with the ring in that case.
   */
  #ifndef KERNEL_HAS_ATOMIC64
 -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
 -                              int ack_required)
 +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
  {
        unsigned long flags;
  
@@@ -621,7 -622,8 +621,7 @@@ static u64 rds_ib_get_ack(struct rds_ib
        return seq;
  }
  #else
 -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
 -                              int ack_required)
 +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
  {
        atomic64_set(&ic->i_ack_next, seq);
        if (ack_required) {
@@@ -828,6 -830,20 +828,6 @@@ static void rds_ib_cong_recv(struct rds
        rds_cong_map_updated(map, uncongested);
  }
  
 -/*
 - * Rings are posted with all the allocations they'll need to queue the
 - * incoming message to the receiving socket so this can't fail.
 - * All fragments start with a header, so we can make sure we're not receiving
 - * garbage, and we can tell a small 8 byte fragment from an ACK frame.
 - */
 -struct rds_ib_ack_state {
 -      u64             ack_next;
 -      u64             ack_recv;
 -      unsigned int    ack_required:1;
 -      unsigned int    ack_next_valid:1;
 -      unsigned int    ack_recv_valid:1;
 -};
 -
  static void rds_ib_process_recv(struct rds_connection *conn,
                                struct rds_ib_recv_work *recv, u32 data_len,
                                struct rds_ib_ack_state *state)
        }
  }
  
 -/*
 - * Plucking the oldest entry from the ring can be done concurrently with
 - * the thread refilling the ring.  Each ring operation is protected by
 - * spinlocks and the transient state of refilling doesn't change the
 - * recording of which entry is oldest.
 - *
 - * This relies on IB only calling one cq comp_handler for each cq so that
 - * there will only be one caller of rds_recv_incoming() per RDS connection.
 - */
 -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
 -{
 -      struct rds_connection *conn = context;
 -      struct rds_ib_connection *ic = conn->c_transport_data;
 -
 -      rdsdebug("conn %p cq %p\n", conn, cq);
 -
 -      rds_ib_stats_inc(s_ib_rx_cq_call);
 -
 -      tasklet_schedule(&ic->i_recv_tasklet);
 -}
 -
 -static inline void rds_poll_cq(struct rds_ib_connection *ic,
 -                             struct rds_ib_ack_state *state)
 +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
 +                           struct ib_wc *wc,
 +                           struct rds_ib_ack_state *state)
  {
        struct rds_connection *conn = ic->conn;
 -      struct ib_wc wc;
        struct rds_ib_recv_work *recv;
  
 -      while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
 -              rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
 -                       (unsigned long long)wc.wr_id, wc.status,
 -                       ib_wc_status_msg(wc.status), wc.byte_len,
 -                       be32_to_cpu(wc.ex.imm_data));
 -              rds_ib_stats_inc(s_ib_rx_cq_event);
 +      rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
 +               (unsigned long long)wc->wr_id, wc->status,
 +               ib_wc_status_msg(wc->status), wc->byte_len,
 +               be32_to_cpu(wc->ex.imm_data));
  
 -              recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 +      rds_ib_stats_inc(s_ib_rx_cq_event);
 +      recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 +      ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1,
 +                      DMA_FROM_DEVICE);
  
 -              ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 -
 -              /*
 -               * Also process recvs in connecting state because it is possible
 -               * to get a recv completion _before_ the rdmacm ESTABLISHED
 -               * event is processed.
 -               */
 -              if (wc.status == IB_WC_SUCCESS) {
 -                      rds_ib_process_recv(conn, recv, wc.byte_len, state);
 -              } else {
 -                      /* We expect errors as the qp is drained during shutdown */
 -                      if (rds_conn_up(conn) || rds_conn_connecting(conn))
 -                              rds_ib_conn_error(conn, "recv completion on %pI4 had "
 -                                                "status %u (%s), disconnecting and "
 -                                                "reconnecting\n", &conn->c_faddr,
 -                                                wc.status,
 -                                                ib_wc_status_msg(wc.status));
 -              }
 -
 -              /*
 -               * rds_ib_process_recv() doesn't always consume the frag, and
 -               * we might not have called it at all if the wc didn't indicate
 -               * success. We already unmapped the frag's pages, though, and
 -               * the following rds_ib_ring_free() call tells the refill path
 -               * that it will not find an allocated frag here. Make sure we
 -               * keep that promise by freeing a frag that's still on the ring.
 -               */
 -              if (recv->r_frag) {
 -                      rds_ib_frag_free(ic, recv->r_frag);
 -                      recv->r_frag = NULL;
 -              }
 -              rds_ib_ring_free(&ic->i_recv_ring, 1);
 +      /* Also process recvs in connecting state because it is possible
 +       * to get a recv completion _before_ the rdmacm ESTABLISHED
 +       * event is processed.
 +       */
 +      if (wc->status == IB_WC_SUCCESS) {
 +              rds_ib_process_recv(conn, recv, wc->byte_len, state);
 +      } else {
 +              /* We expect errors as the qp is drained during shutdown */
 +              if (rds_conn_up(conn) || rds_conn_connecting(conn))
 +                      rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
 +                                        &conn->c_faddr,
 +                                        wc->status,
 +                                        ib_wc_status_msg(wc->status));
        }
 -}
  
 -void rds_ib_recv_tasklet_fn(unsigned long data)
 -{
 -      struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
 -      struct rds_connection *conn = ic->conn;
 -      struct rds_ib_ack_state state = { 0, };
 -
 -      rds_poll_cq(ic, &state);
 -      ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
 -      rds_poll_cq(ic, &state);
 -
 -      if (state.ack_next_valid)
 -              rds_ib_set_ack(ic, state.ack_next, state.ack_required);
 -      if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
 -              rds_send_drop_acked(conn, state.ack_recv, NULL);
 -              ic->i_ack_recv = state.ack_recv;
 +      /* rds_ib_process_recv() doesn't always consume the frag, and
 +       * we might not have called it at all if the wc didn't indicate
 +       * success. We already unmapped the frag's pages, though, and
 +       * the following rds_ib_ring_free() call tells the refill path
 +       * that it will not find an allocated frag here. Make sure we
 +       * keep that promise by freeing a frag that's still on the ring.
 +       */
 +      if (recv->r_frag) {
 +              rds_ib_frag_free(ic, recv->r_frag);
 +              recv->r_frag = NULL;
        }
 -      if (rds_conn_up(conn))
 -              rds_ib_attempt_ack(ic);
 +      rds_ib_ring_free(&ic->i_recv_ring, 1);
  
        /* If we ever end up with a really empty receive ring, we're
         * in deep trouble, as the sender will definitely see RNR
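
The ib_recv.c refactor above folds the old tasklet/rds_poll_cq() loop into a
per-completion handler, rds_ib_recv_cqe_handler(), which a caller invokes once per polled
work completion together with an accumulated rds_ib_ack_state. A hedged sketch of the
shape of such a caller, reassembled from the removed tasklet body — the real dispatch
code added by this series lives outside this file, and example_drain_recv_cq() is a
placeholder name:

static void example_drain_recv_cq(struct rds_ib_connection *ic)
{
        struct rds_ib_ack_state state = { 0 };
        struct ib_wc wc;

        /* Feed each receive completion to the new per-CQE handler. */
        while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0)
                rds_ib_recv_cqe_handler(ic, &wc, &state);

        /* Apply the accumulated ack state once, as the tasklet used to. */
        if (state.ack_next_valid)
                rds_ib_set_ack(ic, state.ack_next, state.ack_required);
        if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
                rds_send_drop_acked(ic->conn, state.ack_recv, NULL);
                ic->i_ack_recv = state.ack_recv;
        }
        if (rds_conn_up(ic->conn))
                rds_ib_attempt_ack(ic);
}
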
diff --combined net/rxrpc/ar-connection.c
index 692b3e67fb54418ffb143491b9e2f1dc82d8d503,3b5de4b86058334b4d762ef965e86f2877503932..6c71ed1caf16727a587c90ff81dcd6a7abd3d10b
@@@ -500,7 -500,7 +500,7 @@@ int rxrpc_connect_call(struct rxrpc_soc
                if (bundle->num_conns >= 20) {
                        _debug("too many conns");
  
-                       if (!(gfp & __GFP_WAIT)) {
+                       if (!gfpflags_allow_blocking(gfp)) {
                                _leave(" = -EAGAIN");
                                return -EAGAIN;
                        }
@@@ -808,7 -808,7 +808,7 @@@ void rxrpc_put_connection(struct rxrpc_
  
        ASSERTCMP(atomic_read(&conn->usage), >, 0);
  
 -      conn->put_time = get_seconds();
 +      conn->put_time = ktime_get_seconds();
        if (atomic_dec_and_test(&conn->usage)) {
                _debug("zombie");
                rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0);
@@@ -852,7 -852,7 +852,7 @@@ static void rxrpc_connection_reaper(str
  
        _enter("");
  
 -      now = get_seconds();
 +      now = ktime_get_seconds();
        earliest = ULONG_MAX;
  
        write_lock_bh(&rxrpc_connection_lock);
diff --combined tools/testing/selftests/Makefile
index 2458288a8287861c87bc868296ea068016efb673,4b4957b8df4e879a0d19098fdacce49658fce24a..c8edff6803d1db0b9b36585746c3ecaf45b6681c
@@@ -6,6 -6,7 +6,7 @@@ TARGETS += firmwar
  TARGETS += ftrace
  TARGETS += futex
  TARGETS += kcmp
+ TARGETS += lib
  TARGETS += membarrier
  TARGETS += memfd
  TARGETS += memory-hotplug
@@@ -13,7 -14,6 +14,7 @@@ TARGETS += moun
  TARGETS += mqueue
  TARGETS += net
  TARGETS += powerpc
 +TARGETS += pstore
  TARGETS += ptrace
  TARGETS += seccomp
  TARGETS += size
@@@ -66,9 -66,6 +67,9 @@@ clean_hotplug
                make -C $$TARGET clean; \
        done;
  
 +run_pstore_crash:
 +      make -C pstore run_crash
 +
  INSTALL_PATH ?= install
  INSTALL_PATH := $(abspath $(INSTALL_PATH))
  ALL_SCRIPT := $(INSTALL_PATH)/run_kselftest.sh