Merge branch 'akpm'
author     Stephen Rothwell <sfr@canb.auug.org.au>
Tue, 20 Dec 2011 07:44:53 +0000 (18:44 +1100)
committer  Stephen Rothwell <sfr@canb.auug.org.au>
Tue, 20 Dec 2011 07:44:53 +0000 (18:44 +1100)
233 files changed:
Documentation/ABI/testing/debugfs-olpc [new file with mode: 0644]
Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration [new file with mode: 0644]
Documentation/ABI/testing/sysfs-devices-system-timekeeping [new file with mode: 0644]
Documentation/ABI/testing/sysfs-kernel-slab
Documentation/cgroups/memory.txt
Documentation/filesystems/proc.txt
Documentation/kernel-parameters.txt
Documentation/laptops/sony-laptop.txt
Documentation/sysctl/kernel.txt
Documentation/sysctl/vm.txt
Documentation/trace/events-kmem.txt
Documentation/trace/postprocess/trace-pagealloc-postprocess.pl
Documentation/trace/tracepoint-analysis.txt
Documentation/vm/slub.txt
MAINTAINERS
arch/Kconfig
arch/arm/Kconfig
arch/arm/include/asm/processor.h
arch/arm/mach-ux500/mbox-db5500.c
arch/avr32/include/asm/system.h
arch/avr32/kernel/traps.c
arch/ia64/include/asm/processor.h
arch/ia64/kernel/machine_kexec.c
arch/m68k/amiga/config.c
arch/mips/Kconfig
arch/mips/include/asm/ptrace.h
arch/mips/kernel/traps.c
arch/mn10300/include/asm/exceptions.h
arch/parisc/include/asm/processor.h
arch/parisc/kernel/process.c
arch/powerpc/kernel/machine_kexec_32.c
arch/powerpc/kernel/machine_kexec_64.c
arch/powerpc/mm/numa.c
arch/powerpc/platforms/pseries/nvram.c
arch/s390/include/asm/processor.h
arch/s390/kernel/nmi.c
arch/sh/kernel/process_32.c
arch/sh/kernel/process_64.c
arch/tile/kernel/machine_kexec.c
arch/x86/Kconfig
arch/x86/Kconfig.cpu
arch/x86/mm/numa.c
arch/x86/platform/iris/iris.c
arch/x86/platform/olpc/olpc-xo15-sci.c
arch/x86/platform/olpc/olpc.c
arch/x86/um/Kconfig
drivers/base/memory.c
drivers/base/sys.c
drivers/char/hpet.c
drivers/char/ipmi/ipmi_watchdog.c
drivers/char/ramoops.c
drivers/idle/intel_idle.c
drivers/leds/Kconfig
drivers/leds/Makefile
drivers/leds/leds-88pm860x.c
drivers/leds/leds-adp5520.c
drivers/leds/leds-ams-delta.c
drivers/leds/leds-asic3.c
drivers/leds/leds-atmel-pwm.c
drivers/leds/leds-bd2802.c
drivers/leds/leds-cobalt-qube.c
drivers/leds/leds-da903x.c
drivers/leds/leds-dac124s085.c
drivers/leds/leds-fsg.c
drivers/leds/leds-gpio.c
drivers/leds/leds-hp6xx.c
drivers/leds/leds-lm3530.c
drivers/leds/leds-lp3944.c
drivers/leds/leds-lp5521.c
drivers/leds/leds-lp5523.c
drivers/leds/leds-lt3593.c
drivers/leds/leds-mc13783.c
drivers/leds/leds-netxbig.c
drivers/leds/leds-ns2.c
drivers/leds/leds-pca9532.c
drivers/leds/leds-pca955x.c
drivers/leds/leds-pwm.c
drivers/leds/leds-rb532.c
drivers/leds/leds-regulator.c
drivers/leds/leds-renesas-tpu.c
drivers/leds/leds-s3c24xx.c
drivers/leds/leds-tca6507.c [new file with mode: 0644]
drivers/leds/leds-wm831x-status.c
drivers/leds/leds-wm8350.c
drivers/memstick/core/mspro_block.c
drivers/mtd/mtdoops.c
drivers/platform/x86/acerhdf.c
drivers/platform/x86/sony-laptop.c
drivers/rtc/rtc-ab8500.c
drivers/rtc/rtc-cmos.c
drivers/rtc/rtc-mxc.c
drivers/rtc/rtc-wm831x.c
drivers/scsi/aacraid/commctrl.c
drivers/scsi/megaraid.c
drivers/scsi/mpt2sas/mpt2sas_base.c
drivers/scsi/sg.c
drivers/video/backlight/88pm860x_bl.c
drivers/video/backlight/Kconfig
drivers/video/backlight/Makefile
drivers/video/backlight/adp5520_bl.c
drivers/video/backlight/adx_bl.c [deleted file]
drivers/video/backlight/da903x_bl.c
drivers/video/backlight/ep93xx_bl.c
drivers/video/backlight/generic_bl.c
drivers/video/backlight/jornada720_bl.c
drivers/video/backlight/jornada720_lcd.c
drivers/video/backlight/ld9040.c
drivers/video/backlight/max8925_bl.c
drivers/video/backlight/omap1_bl.c
drivers/video/backlight/pcf50633-backlight.c
drivers/video/backlight/platform_lcd.c
drivers/video/backlight/pwm_bl.c
drivers/video/backlight/wm831x_bl.c
fs/Kconfig.binfmt
fs/binfmt_elf.c
fs/block_dev.c
fs/btrfs/file.c
fs/cifs/cifsfs.c
fs/dcache.c
fs/direct-io.c
fs/eventpoll.c
fs/exec.c
fs/ext4/ext4.h
fs/ext4/ialloc.c
fs/file_table.c
fs/inode.c
fs/internal.h
fs/mpage.c
fs/namei.c
fs/namespace.c
fs/nilfs2/ioctl.c
fs/pipe.c
fs/pnode.c
fs/proc/array.c
fs/proc/base.c
fs/proc/inode.c
fs/proc/internal.h
fs/proc/root.c
fs/reiserfs/bitmap.c
fs/reiserfs/journal.c
fs/reiserfs/super.c
include/asm-generic/tlb.h
include/linux/compiler-gcc4.h
include/linux/compiler.h
include/linux/eventpoll.h
include/linux/fs.h
include/linux/gfp.h
include/linux/hpet.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/ipc_namespace.h
include/linux/kernel.h
include/linux/kmsg_dump.h
include/linux/leds-tca6507.h [new file with mode: 0644]
include/linux/lglock.h
include/linux/linkage.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/oom.h
include/linux/page-debug-flags.h
include/linux/page_cgroup.h
include/linux/pagevec.h
include/linux/pid_namespace.h
include/linux/prctl.h
include/linux/proc_fs.h
include/linux/rmap.h
include/linux/sched.h
include/linux/swap.h
include/linux/sysdev.h
include/linux/workqueue.h
include/linux/writeback.h
include/scsi/scsi_netlink.h
include/trace/events/kmem.h
include/trace/events/oom.h [new file with mode: 0644]
include/trace/events/task.h [new file with mode: 0644]
init/Kconfig
init/do_mounts.c
ipc/mq_sysctl.c
ipc/mqueue.c
ipc/sem.c
kernel/Makefile
kernel/audit.c
kernel/cpuset.c
kernel/exit.c
kernel/fork.c
kernel/hrtimer.c
kernel/kexec.c
kernel/lglock.c [new file with mode: 0644]
kernel/panic.c
kernel/pid.c
kernel/pid_namespace.c
kernel/power/snapshot.c
kernel/sched/core.c
kernel/signal.c
kernel/sys.c
kernel/sysctl.c
kernel/sysctl_binary.c
kernel/time/tick-sched.c
kernel/workqueue.c
lib/crc32.c
lib/debugobjects.c
lib/decompress_unlzo.c
mm/Kconfig.debug
mm/bootmem.c
mm/compaction.c
mm/fadvise.c
mm/filemap.c
mm/huge_memory.c
mm/hugetlb.c
mm/memcontrol.c
mm/memory.c
mm/migrate.c
mm/mmap.c
mm/mremap.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_cgroup.c
mm/rmap.c
mm/slub.c
mm/swap.c
mm/swapfile.c
mm/vmalloc.c
mm/vmscan.c
mm/vmstat.c
net/netfilter/nf_conntrack_netlink.c
scripts/checkpatch.pl
scripts/get_maintainer.pl
scripts/kconfig/merge_config.sh [new file with mode: 0644]
tools/perf/Documentation/examples.txt

diff --git a/Documentation/ABI/testing/debugfs-olpc b/Documentation/ABI/testing/debugfs-olpc
new file mode 100644 (file)
index 0000000..49b9a4e
--- /dev/null
@@ -0,0 +1,16 @@
+What:          /sys/kernel/debug/olpc-ec/generic
+Date:          Dec 2011
+KernelVersion: 3.3
+Contact:       devel@lists.laptop.org
+Description:
+
+A generic interface for executing OLPC Embedded Controller commands and
+reading their responses.
+
+To execute a command, write data with the format: CC:N A A A A
+CC is the (hex) command, N is the count of expected reply bytes, and A A A A
+are optional (hex) arguments.
+
+To read the response (if any), read from the generic node after executing
+a command. Hex reply bytes will be returned, *whether or not* they came from
+the immediately previous command.
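
For illustration only — the command byte, argument, and reply values below are invented, not real EC commands:

  # send (made-up) command 0x1f with one argument byte 0x00, expecting two
  # reply bytes, then read the reply back
  $ echo "1f:2 00" > /sys/kernel/debug/olpc-ec/generic
  $ cat /sys/kernel/debug/olpc-ec/generic
  ab, cd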
diff --git a/Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration b/Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration
new file mode 100644 (file)
index 0000000..4cf1e72
--- /dev/null
@@ -0,0 +1,12 @@
+What:           Attribute for calibrating ST-Ericsson AB8500 Real Time Clock
+Date:           Oct 2011
+KernelVersion:  3.0
+Contact:        Mark Godfrey <mark.godfrey@stericsson.com>
+Description:    The rtc_calibration attribute allows userspace to
+                calibrate the AB8500's 32KHz Real Time Clock.
+                Every 60 seconds the AB8500 will correct the RTC's value
+                by adding to it the value of this attribute.
+                The range of the attribute is -127 to +127 in units of
+                30.5 micro-seconds (half-parts-per-million of the 32KHz clock)
+Users:          The /vendor/st-ericsson/base_utilities/core/rtc_calibration
+                daemon uses this interface.
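
For illustration, assuming the attribute lives at the sysfs path implied by this file's name (/sys/class/rtc/rtc0/device/rtc_calibration):

  # apply a correction of +4 steps, i.e. roughly +2 ppm
  $ echo 4 > /sys/class/rtc/rtc0/device/rtc_calibration
  $ cat /sys/class/rtc/rtc0/device/rtc_calibration
  4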
diff --git a/Documentation/ABI/testing/sysfs-devices-system-timekeeping b/Documentation/ABI/testing/sysfs-devices-system-timekeeping
new file mode 100644 (file)
index 0000000..a904c6d
--- /dev/null
@@ -0,0 +1,16 @@
+What:          /sys/devices/system/timekeeping/
+Date:          November 2011
+Contact:       Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:   Timekeeping attributes
+
+
+What:          /sys/devices/system/timekeeping/timekeeping0/jiffies_cpu
+Date:          November 2011
+Contact:       Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:   Show and modify the kernel's tick_do_timer_cpu.  This
+               determines the cpu on which global time (jiffies) updates
+               occur.  This can only be modified on systems running with
+               the nohz mode turned off (nohz=off).
+
+               Possible values are:
+                       0 - <num online cpus>
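
For illustration (the value can only be modified when the system is booted with nohz=off; the cpu number here is arbitrary):

  $ cat /sys/devices/system/timekeeping/timekeeping0/jiffies_cpu
  0
  $ echo 2 > /sys/devices/system/timekeeping/timekeeping0/jiffies_cpu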
index 8b093f8222d318e411113735f7e841c4b52e7ae0..bfd1d9f96f7a50e6d35776780967e8c11b498b64 100644 (file)
@@ -346,6 +346,8 @@ Description:
                number of objects per slab.  If a slab cannot be allocated
                because of fragmentation, SLUB will retry with the minimum order
                possible depending on its characteristics.
+               When debug_guardpage_minorder is set to a value greater than 0,
+               the minimum possible order is used and cannot be changed.
 
 What:          /sys/kernel/slab/cache/order_fallback
 Date:          April 2008
index 6922b6cb58e3a032e4b214afed16da63772e2f32..8e19065c9a1938c4216e89b7155f15b4354083bf 100644 (file)
@@ -65,7 +65,7 @@ Brief summary of control files.
  memory.failcnt                         # show the number of memory usage hits limits
  memory.memsw.failcnt           # show the number of memory+Swap hits limits
  memory.max_usage_in_bytes      # show max memory usage recorded
- memory.memsw.usage_in_bytes    # show max memory+Swap usage recorded
+ memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded
  memory.soft_limit_in_bytes     # set/show soft limit of memory usage
  memory.stat                    # show various statistics
  memory.use_hierarchy           # set/show hierarchical account enabled
@@ -428,8 +428,11 @@ memory.stat file includes following statistics
 cache          - # of bytes of page cache memory.
 rss            - # of bytes of anonymous and swap cache memory.
 mapped_file    - # of bytes of mapped file (includes tmpfs/shmem)
-pgpgin         - # of pages paged in (equivalent to # of charging events).
-pgpgout                - # of pages paged out (equivalent to # of uncharging events).
+pgpgin         - # of charging events to the memory cgroup. The charging
+               event happens each time a page is accounted to the cgroup as
+               either a mapped anon page (RSS) or a cache page (Page Cache).
+pgpgout                - # of uncharging events to the memory cgroup. The uncharging
+               event happens each time a page is unaccounted from the cgroup.
 swap           - # of bytes of swap usage
 inactive_anon  - # of bytes of anonymous memory and swap cache memory on
                LRU list.
index 0ec91f03422e5befe8dd7f69d22dec4a22250f69..a76a26a1db8a6fe855b0eeea32dee251296c0c8a 100644 (file)
@@ -41,6 +41,8 @@ Table of Contents
   3.5  /proc/<pid>/mountinfo - Information about mounts
   3.6  /proc/<pid>/comm  & /proc/<pid>/task/<tid>/comm
 
+  4    Configuring procfs
+  4.1  Mount options
 
 ------------------------------------------------------------------------------
 Preface
@@ -305,6 +307,9 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
   blkio_ticks   time spent waiting for block IO
   gtime         guest time of the task in jiffies
   cgtime        guest time of the task children in jiffies
+  start_data    address above which program data+bss is placed
+  end_data      address below which program data+bss is placed
+  start_brk     address above which program heap can be expanded with brk()
 ..............................................................................
 
 The /proc/PID/maps file containing the currently mapped memory regions and
@@ -1542,3 +1547,40 @@ a task to set its own or one of its thread siblings comm value. The comm value
 is limited in size compared to the cmdline value, so writing anything longer
 then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated
 comm value.
+
+
+------------------------------------------------------------------------------
+Configuring procfs
+------------------------------------------------------------------------------
+
+4.1    Mount options
+---------------------
+
+The following mount options are supported:
+
+       hidepid=        Set /proc/<pid>/ access mode.
+       gid=            Set the group authorized to learn process information.
+
+hidepid=0 means classic mode - everybody may access all /proc/<pid>/ directories
+(default).
+
+hidepid=1 means users may not access any /proc/<pid>/ directories but their
+own.  Sensitive files like cmdline, sched*, status are now protected against
+other users.  This makes it impossible to learn whether any user runs a
+specific program (given the program doesn't reveal itself by its behaviour).
+As an additional bonus, as /proc/<pid>/cmdline is inaccessible to other users,
+poorly written programs passing sensitive information via program arguments are
+now protected against local eavesdroppers.
+
+hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be fully invisible to other
+users.  This does not hide the fact that a process with a specific pid value
+exists (it can be learned by other means, e.g. by "kill -0 $PID"), but it hides
+a process's uid and gid, which could otherwise be learned by stat()'ing
+/proc/<pid>/.  It greatly complicates an intruder's task of gathering
+information about running processes: whether some daemon runs with elevated
+privileges, whether another user runs some sensitive program, whether other
+users run any program at all, and so on.
+
+gid= defines a group authorized to learn process information otherwise
+prohibited by hidepid=.  If you use a daemon such as identd which needs to
+learn information about processes, just add identd to this group.
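
A typical usage sketch (gid 500 is an arbitrary placeholder for whatever group a daemon such as identd runs under):

  $ mount -o remount,hidepid=2,gid=500 /proc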
index 063dfeb8ab4ea28bc242b8315ce7014632740058..8d80b1b0fbf9c132229cad76923762e5029e3ef0 100644 (file)
@@ -628,6 +628,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        no_debug_objects
                        [KNL] Disable object debugging
 
+       debug_guardpage_minorder=
+                       [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
+                       parameter allows control of the order of pages that will
+                       be intentionally kept free (and hence protected) by the
+                       buddy allocator. A bigger value increases the
+                       probability of catching random memory corruption, but
+                       reduces the amount of memory available for normal
+                       system use. The maximum
+                       possible value is MAX_ORDER/2.  Setting this parameter
+                       to 1 or 2 should be enough to identify most random
+                       memory corruption problems caused by bugs in kernel or
+                       driver code when a CPU writes to (or reads from) a
+                       random memory location. Note that there exists a class
+                       of memory corruption problems caused by buggy H/W or
+                       F/W or by drivers badly programming DMA (basically when
+                       memory is written at bus level and the CPU MMU is
+                       bypassed) which are not detectable by
+                       CONFIG_DEBUG_PAGEALLOC; hence this option will not help
+                       in tracking down these problems.
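
For illustration, on a kernel built with CONFIG_DEBUG_PAGEALLOC=y the option is simply appended to the boot command line:

  debug_guardpage_minorder=1

which, per the description above, trades some usable memory for a higher chance of catching random corruption.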
+
        debugpat        [X86] Enable PAT debugging
 
        decnet.addr=    [HW,NET]
index 2bd4e82e5d9ff2085f78f44c6a9351a61a2512e5..0d5ac7f5287e611d4aa171937a1d83b31d0f6038 100644 (file)
@@ -17,6 +17,11 @@ subsystem. See the logs of acpid or /proc/acpi/event and
 devices are created by the driver. Additionally, loading the driver with the
 debug option will report all events in the kernel log.
 
+The "scancodes" passed to the input system (that can be remapped with udev)
+are indexes into the table "sony_laptop_input_keycode_map" in the sony-laptop.c
+module.  For example, the "FN/E" key combination (EJECTCD on some models)
+generates the scancode 20 (0x14).
+
 Backlight control:
 ------------------
 If your laptop model supports it, you will find sysfs files in the
index 6d8cd8b2c30d9d6104d05654e22fd85ed1e7032d..8c20fbd8b42dd922daa92f223bbefa9ffcc4f8e3 100644 (file)
@@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated.
 
 ==============================================================
 
+ns_last_pid:
+
+The last pid allocated in the current pid namespace (the one the task
+using this sysctl lives in). When selecting a pid for the next task on
+fork, the kernel tries to allocate a number starting from this one.
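
For illustration (the value is arbitrary and writing requires sufficient privileges in the pid namespace):

  $ echo 299 > /proc/sys/kernel/ns_last_pid

The next task forked in this namespace should then get pid 300, or the next free pid above it.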
+
+==============================================================
+
 powersave-nap: (PPC only)
 
 If set, Linux-PPC will use the 'nap' mode of powersaving,
index 96f0ee825bed3e71fe9758156ee8b575389d7bba..9c11d97e075ab0e923b76f40cb7d48899592ac20 100644 (file)
@@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm:
 - dirty_writeback_centisecs
 - drop_caches
 - extfrag_threshold
+- extra_free_kbytes
 - hugepages_treat_as_movable
 - hugetlb_shm_group
 - laptop_mode
@@ -168,6 +169,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
 
 ==============================================================
 
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low latency memory allocations
+and have a bounded burstiness in memory allocations.  For example, a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct reclaim
+related latencies.
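
For illustration, matching the 200MB burst example above (200MB = 204800 kB):

  $ echo 204800 > /proc/sys/vm/extra_free_kbytes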
+
+==============================================================
+
 hugepages_treat_as_movable
 
 This parameter is only useful when kernelcore= is specified at boot time to
index aa82ee4a5a8762ef9528ef382b07b906c3d6094b..194800410061b3b73be4a68b762be0b7995f4558 100644 (file)
@@ -40,8 +40,8 @@ but the call_site can usually be used to extrapolate that information.
 ==================
 mm_page_alloc            page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s
 mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d
-mm_page_free_direct      page=%p pfn=%lu order=%d
-mm_pagevec_free                  page=%p pfn=%lu order=%d cold=%d
+mm_page_free             page=%p pfn=%lu order=%d
+mm_page_free_batched     page=%p pfn=%lu order=%d cold=%d
 
 These four events deal with page allocation and freeing. mm_page_alloc is
 a simple indicator of page allocator activity. Pages may be allocated from
@@ -53,13 +53,13 @@ amounts of activity imply high activity on the zone->lock. Taking this lock
 impairs performance by disabling interrupts, dirtying cache lines between
 CPUs and serialising many CPUs.
 
-When a page is freed directly by the caller, the mm_page_free_direct event
+When a page is freed directly by the caller, the mm_page_free event
 is triggered. Significant amounts of activity here could indicate that the
 callers should be batching their activities.
 
-When pages are freed using a pagevec, the mm_pagevec_free is
-triggered. Broadly speaking, pages are taken off the LRU lock in bulk and
-freed in batch with a pagevec. Significant amounts of activity here could
+When pages are freed in batch, the mm_page_free_batched event is triggered.
+Broadly speaking, pages are taken off the LRU lock in bulk and
+freed in batch with a page list. Significant amounts of activity here could
 indicate that the system is under memory pressure and can also indicate
 contention on the zone->lru_lock.
 
index 7df50e8cf4d9510a7c0b8233ac52bdbcc3b8614f..0a120aae33ce5c9836dad948b18736ea183e6801 100644 (file)
@@ -17,8 +17,8 @@ use Getopt::Long;
 
 # Tracepoint events
 use constant MM_PAGE_ALLOC             => 1;
-use constant MM_PAGE_FREE_DIRECT       => 2;
-use constant MM_PAGEVEC_FREE           => 3;
+use constant MM_PAGE_FREE              => 2;
+use constant MM_PAGE_FREE_BATCHED      => 3;
 use constant MM_PAGE_PCPU_DRAIN                => 4;
 use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5;
 use constant MM_PAGE_ALLOC_EXTFRAG     => 6;
@@ -223,10 +223,10 @@ EVENT_PROCESS:
                # Perl Switch() sucks majorly
                if ($tracepoint eq "mm_page_alloc") {
                        $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++;
-               } elsif ($tracepoint eq "mm_page_free_direct") {
-                       $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++;
-               } elsif ($tracepoint eq "mm_pagevec_free") {
-                       $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++;
+               } elsif ($tracepoint eq "mm_page_free") {
+                       $perprocesspid{$process_pid}->{MM_PAGE_FREE}++;
+               } elsif ($tracepoint eq "mm_page_free_batched") {
+                       $perprocesspid{$process_pid}->{MM_PAGE_FREE_BATCHED}++;
                } elsif ($tracepoint eq "mm_page_pcpu_drain") {
                        $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++;
                        $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++;
@@ -336,8 +336,8 @@ sub dump_stats {
                        $process_pid,
                        $stats{$process_pid}->{MM_PAGE_ALLOC},
                        $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED},
-                       $stats{$process_pid}->{MM_PAGE_FREE_DIRECT},
-                       $stats{$process_pid}->{MM_PAGEVEC_FREE},
+                       $stats{$process_pid}->{MM_PAGE_FREE},
+                       $stats{$process_pid}->{MM_PAGE_FREE_BATCHED},
                        $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN},
                        $stats{$process_pid}->{HIGH_PCPU_DRAINS},
                        $stats{$process_pid}->{HIGH_PCPU_REFILLS},
@@ -364,8 +364,8 @@ sub aggregate_perprocesspid() {
 
                $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC};
                $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED};
-               $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT};
-               $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE};
+               $perprocess{$process}->{MM_PAGE_FREE} += $perprocesspid{$process_pid}->{MM_PAGE_FREE};
+               $perprocess{$process}->{MM_PAGE_FREE_BATCHED} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_BATCHED};
                $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN};
                $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS};
                $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS};
index 87bee3c129ba71f8c359b5e849bafd177f421136..058cc6c9dc56d442e4fad72c1957a82c7f630f4d 100644 (file)
@@ -93,14 +93,14 @@ By specifying the -a switch and analysing sleep, the system-wide events
 for a duration of time can be examined.
 
  $ perf stat -a \
-       -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
-       -e kmem:mm_pagevec_free \
+       -e kmem:mm_page_alloc -e kmem:mm_page_free \
+       -e kmem:mm_page_free_batched \
        sleep 10
  Performance counter stats for 'sleep 10':
 
            9630  kmem:mm_page_alloc
-           2143  kmem:mm_page_free_direct
-           7424  kmem:mm_pagevec_free
+           2143  kmem:mm_page_free
+           7424  kmem:mm_page_free_batched
 
    10.002577764  seconds time elapsed
 
@@ -119,15 +119,15 @@ basis using set_ftrace_pid.
 Events can be activated and tracked for the duration of a process on a local
 basis using PCL such as follows.
 
-  $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
-                -e kmem:mm_pagevec_free ./hackbench 10
+  $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free \
+                -e kmem:mm_page_free_batched ./hackbench 10
   Time: 0.909
 
     Performance counter stats for './hackbench 10':
 
           17803  kmem:mm_page_alloc
-          12398  kmem:mm_page_free_direct
-           4827  kmem:mm_pagevec_free
+          12398  kmem:mm_page_free
+           4827  kmem:mm_page_free_batched
 
     0.973913387  seconds time elapsed
 
@@ -146,8 +146,8 @@ to know what the standard deviation is. By and large, this is left to the
 performance analyst to do it by hand. In the event that the discrete event
 occurrences are useful to the performance analyst, then perf can be used.
 
-  $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct
-                       -e kmem:mm_pagevec_free ./hackbench 10
+  $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free
+                       -e kmem:mm_page_free_batched ./hackbench 10
   Time: 0.890
   Time: 0.895
   Time: 0.915
@@ -157,8 +157,8 @@ occurrences are useful to the performance analyst, then perf can be used.
    Performance counter stats for './hackbench 10' (5 runs):
 
           16630  kmem:mm_page_alloc         ( +-   3.542% )
-          11486  kmem:mm_page_free_direct   ( +-   4.771% )
-           4730  kmem:mm_pagevec_free       ( +-   2.325% )
+          11486  kmem:mm_page_free         ( +-   4.771% )
+           4730  kmem:mm_page_free_batched  ( +-   2.325% )
 
     0.982653002  seconds time elapsed   ( +-   1.448% )
 
@@ -168,15 +168,15 @@ aggregation of discrete events, then a script would need to be developed.
 Using --repeat, it is also possible to view how events are fluctuating over
 time on a system-wide basis using -a and sleep.
 
-  $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
-               -e kmem:mm_pagevec_free \
+  $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free \
+               -e kmem:mm_page_free_batched \
                -a --repeat 10 \
                sleep 1
   Performance counter stats for 'sleep 1' (10 runs):
 
            1066  kmem:mm_page_alloc         ( +-  26.148% )
-            182  kmem:mm_page_free_direct   ( +-   5.464% )
-            890  kmem:mm_pagevec_free       ( +-  30.079% )
+            182  kmem:mm_page_free          ( +-   5.464% )
+            890  kmem:mm_page_free_batched  ( +-  30.079% )
 
     1.002251757  seconds time elapsed   ( +-   0.005% )
 
@@ -220,8 +220,8 @@ were generating events within the kernel. To begin this sort of analysis, the
 data must be recorded. At the time of writing, this required root:
 
   $ perf record -c 1 \
-       -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
-       -e kmem:mm_pagevec_free \
+       -e kmem:mm_page_alloc -e kmem:mm_page_free \
+       -e kmem:mm_page_free_batched \
        ./hackbench 10
   Time: 0.894
   [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ]
@@ -260,8 +260,8 @@ noticed that X was generating an insane amount of page allocations so let's look
 at it:
 
   $ perf record -c 1 -f \
-               -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
-               -e kmem:mm_pagevec_free \
+               -e kmem:mm_page_alloc -e kmem:mm_page_free \
+               -e kmem:mm_page_free_batched \
                -p `pidof X`
 
 This was interrupted after a few seconds and
index 2acdda9601b097958ad4ea732ba784a1ed4fffce..8c72b26b8574d0b68f6316dbc2915f75fa962f21 100644 (file)
@@ -131,7 +131,9 @@ slub_min_objects.
 slub_max_order specified the order at which slub_min_objects should no
 longer be checked. This is useful to avoid SLUB trying to generate
 super large order pages to fit slub_min_objects of a slab cache with
-large object sizes into one high order page.
+large object sizes into one high order page. Setting the parameter
+debug_guardpage_minorder > 0 forces slub_max_order to 0, which causes
+slabs to be allocated with the minimum possible order.
 
 SLUB Debug output
 -----------------
index 3f2fa3f8a044e63c3e4cbf7d97a534ca73f23d92..09507d21f1167ecaaa830ff9529a9e26b71069d1 100644 (file)
@@ -342,7 +342,7 @@ S:  Supported
 F:     drivers/mfd/adp5520.c
 F:     drivers/video/backlight/adp5520_bl.c
 F:     drivers/leds/leds-adp5520.c
-F:     drivers/gpio/adp5520-gpio.c
+F:     drivers/gpio/gpio-adp5520.c
 F:     drivers/input/keyboard/adp5520-keys.c
 
 ADP5588 QWERTY KEYPAD AND IO EXPANDER DRIVER (ADP5588/ADP5587)
@@ -351,7 +351,7 @@ L:  device-drivers-devel@blackfin.uclinux.org
 W:     http://wiki.analog.com/ADP5588
 S:     Supported
 F:     drivers/input/keyboard/adp5588-keys.c
-F:     drivers/gpio/adp5588-gpio.c
+F:     drivers/gpio/gpio-adp5588.c
 
 ADP8860 BACKLIGHT DRIVER (ADP8860/ADP8861/ADP8863)
 M:     Michael Hennerich <michael.hennerich@analog.com>
@@ -916,7 +916,6 @@ M:  Lennert Buytenhek <kernel@wantstofly.org>
 M:     Nicolas Pitre <nico@fluxnic.net>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Odd Fixes
-F:     arch/arm/mach-loki/
 F:     arch/arm/mach-kirkwood/
 F:     arch/arm/mach-mv78xx0/
 F:     arch/arm/mach-orion5x/
@@ -1078,8 +1077,8 @@ L:        linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/mach-s5pv210/mach-aquila.c
 F:     arch/arm/mach-s5pv210/mach-goni.c
-F:     arch/arm/mach-exynos4/mach-universal_c210.c
-F:     arch/arm/mach-exynos4/mach-nuri.c
+F:     arch/arm/mach-exynos/mach-universal_c210.c
+F:     arch/arm/mach-exynos/mach-nuri.c
 
 ARM/SAMSUNG S5P SERIES FIMC SUPPORT
 M:     Kyungmin Park <kyungmin.park@samsung.com>
@@ -1107,7 +1106,6 @@ M:        Tomasz Stanislawski <t.stanislaws@samsung.com>
 L:     linux-arm-kernel@lists.infradead.org
 L:     linux-media@vger.kernel.org
 S:     Maintained
-F:     arch/arm/plat-s5p/dev-tv.c
 F:     drivers/media/video/s5p-tv/
 
 ARM/SHMOBILE ARM ARCHITECTURE
@@ -1149,14 +1147,13 @@ L:      linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 W:     http://www.mcuos.com
 S:     Maintained
 F:     arch/arm/mach-w90x900/
-F:     arch/arm/mach-nuc93x/
 F:     drivers/input/keyboard/w90p910_keypad.c
 F:     drivers/input/touchscreen/w90p910_ts.c
 F:     drivers/watchdog/nuc900_wdt.c
 F:     drivers/net/ethernet/nuvoton/w90p910_ether.c
 F:     drivers/mtd/nand/nuc900_nand.c
 F:     drivers/rtc/rtc-nuc900.c
-F:     drivers/spi/spi_nuc900.c
+F:     drivers/spi/spi-nuc900.c
 F:     drivers/usb/host/ehci-w90x900.c
 F:     drivers/video/nuc900fb.c
 
@@ -1181,7 +1178,6 @@ L:        linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/mach-ux500/
 F:     drivers/dma/ste_dma40*
-F:     drivers/mfd/ab3550*
 F:     drivers/mfd/abx500*
 F:     drivers/mfd/ab8500*
 F:     drivers/mfd/stmpe*
@@ -1361,7 +1357,7 @@ F:        drivers/net/ethernet/cadence/
 ATMEL SPI DRIVER
 M:     Nicolas Ferre <nicolas.ferre@atmel.com>
 S:     Supported
-F:     drivers/spi/atmel_spi.*
+F:     drivers/spi/spi-atmel.*
 
 ATMEL USBA UDC DRIVER
 M:     Nicolas Ferre <nicolas.ferre@atmel.com>
@@ -1500,7 +1496,7 @@ M:        Sonic Zhang <sonic.zhang@analog.com>
 L:     uclinux-dist-devel@blackfin.uclinux.org
 W:     http://blackfin.uclinux.org
 S:     Supported
-F:     drivers/tty/serial/bfin_5xx.c
+F:     drivers/tty/serial/bfin_uart.c
 
 BLACKFIN WATCHDOG DRIVER
 M:     Mike Frysinger <vapier.adi@gmail.com>
@@ -1630,7 +1626,7 @@ BT8XXGPIO DRIVER
 M:     Michael Buesch <m@bues.ch>
 W:     http://bu3sch.de/btgpio.php
 S:     Maintained
-F:     drivers/gpio/bt8xxgpio.c
+F:     drivers/gpio/gpio-bt8xx.c
 
 BTRFS FILE SYSTEM
 M:     Chris Mason <chris.mason@oracle.com>
@@ -1679,7 +1675,7 @@ L:        linux-media@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-2.6.git
 S:     Maintained
 F:     Documentation/video4linux/cafe_ccic
-F:     drivers/media/video/cafe_ccic*
+F:     drivers/media/video/marvell-ccic/
 
 CAIF NETWORK LAYER
 M:     Sjur Braendeland <sjur.brandeland@stericsson.com>
@@ -2118,7 +2114,7 @@ DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
 L:     netdev@vger.kernel.org
 S:     Orphan
 F:     Documentation/networking/dmfe.txt
-F:     drivers/net/ethernet/tulip/dmfe.c
+F:     drivers/net/ethernet/dec/tulip/dmfe.c
 
 DC390/AM53C974 SCSI driver
 M:     Kurt Garloff <garloff@suse.de>
@@ -2191,6 +2187,13 @@ T:       git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git
 S:     Maintained
 F:     drivers/usb/dwc3/
 
+DEVICE FREQUENCY (DEVFREQ)
+M:     MyungJoo Ham <myungjoo.ham@samsung.com>
+M:     Kyungmin Park <kyungmin.park@samsung.com>
+L:     linux-kernel@vger.kernel.org
+S:     Maintained
+F:     drivers/devfreq/
+
 DEVICE NUMBER REGISTRY
 M:     Torben Mathiasen <device@lanana.org>
 W:     http://lanana.org/docs/device-list/index.html
@@ -2936,7 +2939,7 @@ GRETH 10/100/1G Ethernet MAC device driver
 M:     Kristoffer Glembo <kristoffer@gaisler.com>
 L:     netdev@vger.kernel.org
 S:     Maintained
-F:     drivers/net/greth*
+F:     drivers/net/ethernet/aeroflex/
 
 GSPCA FINEPIX SUBDRIVER
 M:     Frank Zago <frank@zago.net>
@@ -3885,8 +3888,7 @@ L:        keyrings@linux-nfs.org
 S:     Supported
 F:     Documentation/security/keys-trusted-encrypted.txt
 F:     include/keys/encrypted-type.h
-F:     security/keys/encrypted.c
-F:     security/keys/encrypted.h
+F:     security/keys/encrypted-keys/
 
 KGDB / KDB /debug_core
 M:     Jason Wessel <jason.wessel@windriver.com>
@@ -5342,7 +5344,7 @@ L:        linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/mach-pxa/
 F:     drivers/pcmcia/pxa2xx*
-F:     drivers/spi/pxa2xx*
+F:     drivers/spi/spi-pxa2xx*
 F:     drivers/usb/gadget/pxa2*
 F:     include/sound/pxa2xx-lib.h
 F:     sound/arm/pxa*
@@ -5842,13 +5844,14 @@ L:      linux-mmc@vger.kernel.org
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/cjb/mmc.git
 S:     Maintained
 F:     drivers/mmc/host/sdhci.*
+F:     drivers/mmc/host/sdhci-pltfm.[ch]
 
 SECURE DIGITAL HOST CONTROLLER INTERFACE, OPEN FIRMWARE BINDINGS (SDHCI-OF)
 M:     Anton Vorontsov <avorontsov@ru.mvista.com>
 L:     linuxppc-dev@lists.ozlabs.org
 L:     linux-mmc@vger.kernel.org
 S:     Maintained
-F:     drivers/mmc/host/sdhci-of.*
+F:     drivers/mmc/host/sdhci-pltfm.[ch]
 
 SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) SAMSUNG DRIVER
 M:     Ben Dooks <ben-linux@fluff.org>
@@ -6227,9 +6230,7 @@ M:        Viresh Kumar <viresh.kumar@st.com>
 W:     http://www.st.com/spear
 S:     Maintained
 F:     arch/arm/mach-spear*/clock.c
-F:     arch/arm/mach-spear*/include/mach/clkdev.h
 F:     arch/arm/plat-spear/clock.c
-F:     arch/arm/plat-spear/include/plat/clkdev.h
 F:     arch/arm/plat-spear/include/plat/clock.h
 
 SPEAR PAD MULTIPLEXING SUPPORT
@@ -6314,6 +6315,11 @@ M:       Mori Hess <fmhess@users.sourceforge.net>
 S:     Odd Fixes
 F:     drivers/staging/comedi/
 
+STAGING - CONEXANT CX25821 PCIE BRIDGE
+L:     linux-media@vger.kernel.org
+S:     Odd Fixes
+F:     drivers/staging/cx25821/
+
 STAGING - CRYSTAL HD VIDEO DECODER
 M:     Naren Sankar <nsankar@broadcom.com>
 M:     Jarod Wilson <jarod@wilsonet.com>
@@ -6353,7 +6359,7 @@ STAGING - LIRC (LINUX INFRARED REMOTE CONTROL) DRIVERS
 M:     Jarod Wilson <jarod@wilsonet.com>
 W:     http://www.lirc.org/
 S:     Odd Fixes
-F:     drivers/staging/lirc/
+F:     drivers/staging/media/lirc/
 
 STAGING - NVIDIA COMPLIANT EMBEDDED CONTROLLER INTERFACE (nvec)
 M:     Julian Andres Klode <jak@jak-linux.org>
@@ -6389,7 +6395,7 @@ F:        drivers/staging/sm7xx/
 STAGING - SOFTLOGIC 6x10 MPEG CODEC
 M:     Ben Collins <bcollins@bluecherry.net>
 S:     Odd Fixes
-F:     drivers/staging/solo6x10/
+F:     drivers/staging/media/solo6x10/
 
 STAGING - SPEAKUP CONSOLE SPEECH DRIVER
 M:     William Hubbs <w.d.hubbs@gmail.com>
@@ -6692,7 +6698,7 @@ TULIP NETWORK DRIVERS
 M:     Grant Grundler <grundler@parisc-linux.org>
 L:     netdev@vger.kernel.org
 S:     Maintained
-F:     drivers/net/ethernet/tulip/
+F:     drivers/net/ethernet/dec/tulip/
 
 TUN/TAP driver
 M:     Maxim Krasnyansky <maxk@qualcomm.com>
index e24f8e47bb6f57ae48d000d6335d2c9e3b88477c..ebb86e56bb552a83ec627ddfe466180ffd9ef6f1 100644 (file)
@@ -188,4 +188,18 @@ config HAVE_RCU_TABLE_FREE
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
        bool
 
+config HAVE_ALIGNED_STRUCT_PAGE
+       bool
+       help
+         This makes sure that struct pages are double word aligned and that
+         e.g. the SLUB allocator can perform double word atomic operations
+         on a struct page for better performance. However selecting this
+         might increase the size of a struct page by a word.
+
+config HAVE_CMPXCHG_LOCAL
+       bool
+
+config HAVE_CMPXCHG_DOUBLE
+       bool
+
 source "kernel/gcov/Kconfig"
index 561e33aac6c06f41c1d7e2ea5050e5ddbee16279..1e17b6cd28053f202546656bfac59dc591d38c8f 100644 (file)
@@ -16,6 +16,7 @@ config ARM
        select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
        select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL)
        select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
+       select ARCH_BINFMT_ELF_RANDOMIZE_PIE
        select HAVE_GENERIC_DMA_COHERENT
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_LZO
index ce280b8d613cbc7821a2adedfa32186988caf1e0..d7038fa223436227f14276406128ff9b247061d4 100644 (file)
@@ -55,7 +55,6 @@ struct thread_struct {
 #define start_thread(regs,pc,sp)                                       \
 ({                                                                     \
        unsigned long *stack = (unsigned long *)sp;                     \
-       set_fs(USER_DS);                                                \
        memset(regs->uregs, 0, sizeof(regs->uregs));                    \
        if (current->personality & ADDR_LIMIT_32BIT)                    \
                regs->ARM_cpsr = USR_MODE;                              \
index 2b2d51caf9d8b9f4db2e62c1eaa7a596b3313546..0127490218cdfc4bc00b107be2816a84703d6a4f 100644 (file)
@@ -168,7 +168,7 @@ static ssize_t mbox_read_fifo(struct device *dev,
        return sprintf(buf, "0x%X\n", mbox_value);
 }
 
-static DEVICE_ATTR(fifo, S_IWUGO | S_IRUGO, mbox_read_fifo, mbox_write_fifo);
+static DEVICE_ATTR(fifo, S_IWUSR | S_IRUGO, mbox_read_fifo, mbox_write_fifo);
 
 static int mbox_show(struct seq_file *s, void *data)
 {
index 9702c2213e1e74d979fb914b286cfbfac8c1609d..62d9ded016357bc6e6a6534d56cd154694b9692a 100644 (file)
@@ -169,7 +169,7 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
 
 struct pt_regs;
-void NORET_TYPE die(const char *str, struct pt_regs *regs, long err);
+void die(const char *str, struct pt_regs *regs, long err);
 void _exception(long signr, struct pt_regs *regs, int code,
                unsigned long addr);
 
index 7aa25756412f16299b69cccfcd1d1c9851aaaec7..3d760c06f02481851e6ab27e57e819a376dcb6d5 100644 (file)
@@ -24,7 +24,7 @@
 
 static DEFINE_SPINLOCK(die_lock);
 
-void NORET_TYPE die(const char *str, struct pt_regs *regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
 {
        static int die_counter;
 
index d9f397fae03eb7e89712aa7cedef3b8d11082b4e..691be0b95c1e0fb280721cde45e9d1f88d08af77 100644 (file)
@@ -309,7 +309,6 @@ struct thread_struct {
 }
 
 #define start_thread(regs,new_ip,new_sp) do {                                                  \
-       set_fs(USER_DS);                                                                        \
        regs->cr_ipsr = ((regs->cr_ipsr | (IA64_PSR_BITS_TO_SET | IA64_PSR_CPL))                \
                         & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS));              \
        regs->cr_iip = new_ip;                                                                  \
index 3d3aeef469476a85ed4d149608ebd891e72c5f6c..4eed35814994ebfed9c6e5d5dcca527cc327daba 100644 (file)
 #include <asm/sal.h>
 #include <asm/mca.h>
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
                                        unsigned long indirection_page,
                                        unsigned long start_address,
                                        struct ia64_boot_param *boot_param,
-                                       unsigned long pal_addr) ATTRIB_NORET;
+                                       unsigned long pal_addr) __noreturn;
 
 struct kimage *ia64_kimage;
 
index 82a4bb51d5d85b4f6271b7f4a830f1150bea5b59..b95a451b1c3ae6292092efcf4ea24fc103e69eda 100644 (file)
@@ -511,8 +511,7 @@ static unsigned long amiga_gettimeoffset(void)
        return ticks + offset;
 }
 
-static NORET_TYPE void amiga_reset(void)
-    ATTRIB_NORET;
+static void amiga_reset(void)  __noreturn;
 
 static void amiga_reset(void)
 {
index 7639c7724300b8b047fc00b49e7365a16da9d28f..c4c1312473fbf6cd9104cc3d206846bcd38b1b4b 100644 (file)
@@ -16,6 +16,7 @@ config MIPS
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_KPROBES
        select HAVE_KRETPROBES
+       select ARCH_BINFMT_ELF_RANDOMIZE_PIE
        select RTC_LIB if !MACH_LOONGSON
        select GENERIC_ATOMIC64 if !64BIT
        select HAVE_DMA_ATTRS
index de39b1f343ea2537b459eed56f53ea9b35732554..7b99c670e478ed9ab79998762c839489cefb66a1 100644 (file)
@@ -144,7 +144,7 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
 extern asmlinkage void syscall_trace_enter(struct pt_regs *regs);
 extern asmlinkage void syscall_trace_leave(struct pt_regs *regs);
 
-extern NORET_TYPE void die(const char *, struct pt_regs *) ATTRIB_NORET;
+extern void die(const char *, struct pt_regs *) __noreturn;
 
 static inline void die_if_kernel(const char *str, struct pt_regs *regs)
 {
index 48240fd8c29716f1ad3581b46788b3ae762dd15d..cc4a3f120f54d6f036e66812cf47e5c35e7dbff9 100644 (file)
@@ -1349,7 +1349,7 @@ int register_nmi_notifier(struct notifier_block *nb)
        return raw_notifier_chain_register(&nmi_chain, nb);
 }
 
-NORET_TYPE void ATTRIB_NORET nmi_exception_handler(struct pt_regs *regs)
+void __noreturn nmi_exception_handler(struct pt_regs *regs)
 {
        raw_notifier_call_chain(&nmi_chain, 0, regs);
        bust_spinlocks(1);
index ca3e20508c77556a77492b85da8697d8eba4ced6..95a4d42c3a06e6f23a814d333ab6ce111d2b86a5 100644 (file)
@@ -110,7 +110,7 @@ extern asmlinkage void nmi_handler(void);
 extern asmlinkage void misalignment(struct pt_regs *, enum exception_code);
 
 extern void die(const char *, struct pt_regs *, enum exception_code)
-       ATTRIB_NORET;
+       __noreturn;
 
 extern int die_if_no_fixup(const char *, struct pt_regs *, enum exception_code);
 
index 9ce66e9d1c2b845275c41aaaf290a5fb5e123ba1..7213ec9e594cd4de25b912c17693dca742f46192 100644 (file)
@@ -196,7 +196,6 @@ typedef unsigned int elf_caddr_t;
        /* offset pc for priv. level */                 \
        pc |= 3;                                        \
                                                        \
-       set_fs(USER_DS);                                \
        regs->iasq[0] = spaceid;                        \
        regs->iasq[1] = spaceid;                        \
        regs->iaoq[0] = pc;                             \
@@ -299,7 +298,6 @@ on downward growing arches, it looks like this:
        elf_addr_t pc = (elf_addr_t)new_pc | 3;         \
        elf_caddr_t *argv = (elf_caddr_t *)bprm->exec + 1;      \
                                                        \
-       set_fs(USER_DS);                                \
        regs->iasq[0] = spaceid;                        \
        regs->iasq[1] = spaceid;                        \
        regs->iaoq[0] = pc;                             \
index 4b4b9181a1a0aef8df25667f4a0229bb275e8d7f..62c60b87d0395d66b02a929095fe37bbb33b849b 100644 (file)
@@ -192,7 +192,6 @@ void flush_thread(void)
        /* Only needs to handle fpu stuff or perf monitors.
        ** REVISIT: several arches implement a "lazy fpu state".
        */
-       set_fs(USER_DS);
 }
 
 void release_thread(struct task_struct *dead_task)
index e63f2e7d2efb029fca417c344ecee0d9e96daf88..affe5dcce7f465bcfcae706e3c3a1ebb24b44dbe 100644 (file)
 #include <asm/hw_irq.h>
 #include <asm/io.h>
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
+typedef void (*relocate_new_kernel_t)(
                                unsigned long indirection_page,
                                unsigned long reboot_code_buffer,
-                               unsigned long start_address) ATTRIB_NORET;
+                               unsigned long start_address) __noreturn;
 
 /*
  * This is a generic machine_kexec function suitable at least for
index 26ccbf77dd4124958f83808b002e3a0176b3f6d4..d7f609086a99925dc9b2a47b479079fbf5b466d4 100644 (file)
@@ -307,9 +307,9 @@ static union thread_union kexec_stack __init_task_data =
 struct paca_struct kexec_paca;
 
 /* Our assembly helper, in kexec_stub.S */
-extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
-                                       void *image, void *control,
-                                       void (*clear_all)(void)) ATTRIB_NORET;
+extern void kexec_sequence(void *newstack, unsigned long start,
+                          void *image, void *control,
+                          void (*clear_all)(void)) __noreturn;
 
 /* too late to fail here */
 void default_machine_kexec(struct kimage *image)
index d5762f08f2c505563293efc6f61b5a40d0cc7346..1d7ddcc8751080d41089d6ed44f9661e507b4772 100644 (file)
@@ -58,7 +58,7 @@ static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  */
 static void __init setup_node_to_cpumask_map(void)
 {
index 330a57b7c17ca48ee997fa2508c9ec29716efd5d..36f957f31842f60688a7485f297ed6181dc926c9 100644 (file)
@@ -638,7 +638,6 @@ static void oops_to_nvram(struct kmsg_dumper *dumper,
                /* These are almost always orderly shutdowns. */
                return;
        case KMSG_DUMP_OOPS:
-       case KMSG_DUMP_KEXEC:
                break;
        case KMSG_DUMP_PANIC:
                panicking = true;
index 27272f6a14c2fbecf4f9a6f1a29cba7986f3bb92..d25843a6a91512409aca2d56458d06cd9f90922f 100644 (file)
@@ -236,7 +236,7 @@ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc)
 /*
  * Function to drop a processor into disabled wait state
  */
-static inline void ATTRIB_NORET disabled_wait(unsigned long code)
+static inline void __noreturn disabled_wait(unsigned long code)
 {
         unsigned long ctl_buf;
         psw_t dw_psw;
index fab88431a06fcb70a92b4fdfe0f475d269be67a6..0fd2e863e114e161ced171368521a3512ea2571d 100644 (file)
@@ -30,7 +30,7 @@ struct mcck_struct {
 
 static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
 
-static NORET_TYPE void s390_handle_damage(char *msg)
+static void s390_handle_damage(char *msg)
 {
        smp_send_stop();
        disabled_wait((unsigned long) __builtin_return_address(0));
index aaf6d59c201227b52c431e6be891868d756a5fda..7ec6651781255e375f971c3645d73eb46f870063 100644 (file)
@@ -70,7 +70,7 @@ void show_regs(struct pt_regs * regs)
 /*
  * Create a kernel thread
  */
-ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *))
+__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *))
 {
        do_exit(fn(arg));
 }
index 210c1cabcb7fd3fe0903442c40c56d37a34b2ccc..cbd4e4bb9fc526796fbbd8b79a60847e0493bfb2 100644 (file)
@@ -285,7 +285,7 @@ void show_regs(struct pt_regs *regs)
 /*
  * Create a kernel thread
  */
-ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *))
+__noreturn void kernel_thread_helper(void *arg, int (*fn)(void *))
 {
        do_exit(fn(arg));
 }
index e00d7179989e24946a20185709224d9afd97f1be..6255f2eab112c9fcae9f5f4cab1d9462a9e12477 100644 (file)
@@ -248,11 +248,11 @@ static void setup_quasi_va_is_pa(void)
 }
 
 
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
        void *reboot_code_buffer;
-       NORET_TYPE void (*rnk)(unsigned long, void *, unsigned long)
-               ATTRIB_NORET;
+       void (*rnk)(unsigned long, void *, unsigned long)
+               __noreturn;
 
        /* Mask all interrupts before starting to reboot. */
        interrupt_mask_set_mask(~0ULL);
index ddab68fc13522c9ab4e43275a45794b492d11429..cb1c1bcd05343f855275cc113e455710c41f1c01 100644 (file)
@@ -60,8 +60,12 @@ config X86
        select PERF_EVENTS
        select HAVE_PERF_EVENTS_NMI
        select ANON_INODES
+       select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
+       select HAVE_CMPXCHG_LOCAL if !M386
+       select HAVE_CMPXCHG_DOUBLE
        select HAVE_ARCH_KMEMCHECK
        select HAVE_USER_RETURN_NOTIFIER
+       select ARCH_BINFMT_ELF_RANDOMIZE_PIE
        select HAVE_ARCH_JUMP_LABEL
        select HAVE_TEXT_POKE_SMP
        select HAVE_GENERIC_HARDIRQS
index e3ca7e0d858c9203fe787244b6022193c7bc7769..3c57033e22118f2ce7771d10fa8305193c94af20 100644 (file)
@@ -309,12 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
        def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_LOCAL
-       def_bool X86_64 || (X86_32 && !M386)
-
-config CMPXCHG_DOUBLE
-       def_bool y
-
 config X86_L1_CACHE_SHIFT
        int
        default "7" if MPENTIUM4 || MPSC
index 020cd2e808732408a31cad1294c5d8faa153fa5f..19d3fa08b1191493cda6415e56f8f26d86415bf7 100644 (file)
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  */
 void __init setup_node_to_cpumask_map(void)
index 1ba7f5ed8c9b9e0675b612d358659be9d57dd82a..f1900223aca7e003a48d474f9808a7add8def92a 100644 (file)
@@ -23,6 +23,7 @@
 
 #include <linux/moduleparam.h>
 #include <linux/module.h>
+#include <linux/platform_device.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/delay.h>
@@ -62,29 +63,75 @@ static void iris_power_off(void)
  * by reading its input port and seeing whether the read value is
  * meaningful.
  */
-static int iris_init(void)
+static int iris_probe(struct platform_device *pdev)
 {
-       unsigned char status;
-       if (force != 1) {
-               printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n");
-               return -ENODEV;
-       }
-       status = inb(IRIS_GIO_INPUT);
+       unsigned char status = inb(IRIS_GIO_INPUT);
        if (status == IRIS_GIO_NODEV) {
-               printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n");
+               printk(KERN_ERR "This machine does not seem to be an Iris. "
+                       "Power off handler not installed.\n");
                return -ENODEV;
        }
        old_pm_power_off = pm_power_off;
        pm_power_off = &iris_power_off;
        printk(KERN_INFO "Iris power_off handler installed.\n");
-
        return 0;
 }
 
-static void iris_exit(void)
+static int iris_remove(struct platform_device *pdev)
 {
        pm_power_off = old_pm_power_off;
        printk(KERN_INFO "Iris power_off handler uninstalled.\n");
+       return 0;
+}
+
+static struct platform_driver iris_driver = {
+       .driver         = {
+               .name   = "iris",
+               .owner  = THIS_MODULE,
+       },
+       .probe          = iris_probe,
+       .remove         = iris_remove,
+};
+
+static struct resource iris_resources[] = {
+       {
+               .start  = IRIS_GIO_BASE,
+               .end    = IRIS_GIO_OUTPUT,
+               .flags  = IORESOURCE_IO,
+               .name   = "address"
+       }
+};
+
+static struct platform_device *iris_device;
+
+static int iris_init(void)
+{
+       int ret;
+       if (force != 1) {
+               printk(KERN_ERR "The force parameter has not been set to 1."
+                       " The Iris poweroff handler will not be installed.\n");
+               return -ENODEV;
+       }
+       ret = platform_driver_register(&iris_driver);
+       if (ret < 0) {
+               printk(KERN_ERR "Failed to register iris platform driver: %d\n",
+                       ret);
+               return ret;
+       }
+       iris_device = platform_device_register_simple("iris", (-1),
+                               iris_resources, ARRAY_SIZE(iris_resources));
+       if (IS_ERR(iris_device)) {
+               printk(KERN_ERR "Failed to register iris platform device\n");
+               platform_driver_unregister(&iris_driver);
+               return PTR_ERR(iris_device);
+       }
+       return 0;
+}
+
+static void iris_exit(void)
+{
+       platform_device_unregister(iris_device);
+       platform_driver_unregister(&iris_driver);
 }
 
 module_init(iris_init);
index 2b235b77d9abb0b49d5b6262fb21786eb74b9cea..e1869b48e6e98932526ffac12ade2dfcadc6dc8b 100644 (file)
 #define XO15_SCI_DEVICE_NAME           "OLPC XO-1.5 SCI"
 
 static unsigned long xo15_sci_gpe;
+static bool lid_wake_on_close;
+
+/*
+ * The normal ACPI LID wakeup behavior is wake-on-open, but not
+ * wake-on-close. This is implemented as standard by the XO-1.5 DSDT.
+ *
+ * We provide here a sysfs attribute that will additionally enable
+ * wake-on-close behavior. This is useful (e.g.) when we opportunistically
+ * suspend with the display running; if the lid is then closed, we want to
+ * wake up to turn the display off.
+ *
+ * This is controlled through a custom method in the XO-1.5 DSDT.
+ */
+static int set_lid_wake_behavior(bool wake_on_close)
+{
+       struct acpi_object_list arg_list;
+       union acpi_object arg;
+       acpi_status status;
+
+       arg_list.count = 1;
+       arg_list.pointer = &arg;
+       arg.type = ACPI_TYPE_INTEGER;
+       arg.integer.value = wake_on_close;
+       status = acpi_evaluate_object(NULL, "\\_SB.PCI0.LID.LIDW", &arg_list,
+                                     NULL);
+       if (ACPI_FAILURE(status)) {
+               pr_warning(PFX "failed to set lid behavior\n");
+               return 1;
+       }
+
+       lid_wake_on_close = wake_on_close;
+       return 0;
+}
+
+static ssize_t lid_wake_on_close_show(struct kobject *s,
+                                     struct kobj_attribute *attr, char *buf)
+{
+       return sprintf(buf, "%u\n", lid_wake_on_close);
+}
+
+static ssize_t lid_wake_on_close_store(struct kobject *s,
+                                      struct kobj_attribute *attr,
+                                      const char *buf, size_t n)
+{
+       unsigned int val;
+
+       if (sscanf(buf, "%u", &val) != 1)
+               return -EINVAL;
+
+       set_lid_wake_behavior(!!val);
+       return n;
+}
+
+static struct kobj_attribute lid_wake_on_close_attr =
+       __ATTR(lid_wake_on_close, 0644,
+              lid_wake_on_close_show, lid_wake_on_close_store);
 
 static void battery_status_changed(void)
 {
@@ -91,6 +147,7 @@ static int xo15_sci_add(struct acpi_device *device)
 {
        unsigned long long tmp;
        acpi_status status;
+       int r;
 
        if (!device)
                return -EINVAL;
@@ -112,6 +169,10 @@ static int xo15_sci_add(struct acpi_device *device)
 
        dev_info(&device->dev, "Initialized, GPE = 0x%lx\n", xo15_sci_gpe);
 
+       r = sysfs_create_file(&device->dev.kobj, &lid_wake_on_close_attr.attr);
+       if (r)
+               goto err_sysfs;
+
        /* Flush queue, and enable all SCI events */
        process_sci_queue();
        olpc_ec_mask_write(EC_SCI_SRC_ALL);
@@ -123,6 +184,11 @@ static int xo15_sci_add(struct acpi_device *device)
                device_init_wakeup(&device->dev, true);
 
        return 0;
+
+err_sysfs:
+       acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
+       cancel_work_sync(&sci_work);
+       return r;
 }
 
 static int xo15_sci_remove(struct acpi_device *device, int type)
@@ -130,6 +196,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type)
        acpi_disable_gpe(NULL, xo15_sci_gpe);
        acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
        cancel_work_sync(&sci_work);
+       sysfs_remove_file(&device->dev.kobj, &lid_wake_on_close_attr.attr);
        return 0;
 }
 
index 7cce722667b83dd06b506d06b771a572c0843598..82607471abf965972443ac511d22fef9ffb4496d 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/platform_device.h>
 #include <linux/of.h>
 #include <linux/syscore_ops.h>
+#include <linux/debugfs.h>
 
 #include <asm/geode.h>
 #include <asm/setup.h>
@@ -31,6 +32,13 @@ EXPORT_SYMBOL_GPL(olpc_platform_info);
 
 static DEFINE_SPINLOCK(ec_lock);
 
+/* debugfs interface to EC commands */
+#define EC_MAX_CMD_ARGS (5 + 1)        /* cmd byte + 5 args */
+#define EC_MAX_CMD_REPLY (8)
+static struct dentry *ec_debugfs_dir;
+static unsigned char ec_debugfs_resp[EC_MAX_CMD_REPLY];
+static unsigned int ec_debugfs_resp_bytes;
+
 /* EC event mask to be applied during suspend (defining wakeup sources). */
 static u16 ec_wakeup_mask;
 
@@ -269,6 +277,83 @@ int olpc_ec_sci_query(u16 *sci_value)
 }
 EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
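+/*
+ * The 'generic' debugfs file below accepts commands of the form
+ * "cmd:response-count [arg1 [arg2 ...]]" (cmd and args in hex, the count in
+ * decimal); reading the file back returns the EC's reply bytes.
+ * Hypothetical usage sketch, assuming debugfs is mounted at /sys/kernel/debug
+ * (the command byte here is illustrative only, not a documented EC opcode):
+ *   echo "a1:2 05" > /sys/kernel/debug/olpc-ec/generic
+ *   cat /sys/kernel/debug/olpc-ec/generic
+ */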
 
+static ssize_t ec_gen_write(struct file *file, const char __user *buf,
+                           size_t size, loff_t *ppos)
+{
+       int i, m;
+       unsigned char ec_cmd[EC_MAX_CMD_ARGS];
+       unsigned int ec_cmd_int[EC_MAX_CMD_ARGS];
+       char cmdbuf[64];
+       int ec_cmd_bytes;
+
+       size = simple_write_to_buffer(cmdbuf, sizeof(cmdbuf), ppos, buf, size);
+
+       m = sscanf(cmdbuf, "%x:%u %x %x %x %x %x", &ec_cmd_int[0],
+                  &ec_debugfs_resp_bytes,
+                  &ec_cmd_int[1], &ec_cmd_int[2], &ec_cmd_int[3],
+                  &ec_cmd_int[4], &ec_cmd_int[5]);
+       if (m < 2 || ec_debugfs_resp_bytes > EC_MAX_CMD_REPLY) {
+               printk(KERN_DEBUG "olpc-ec: bad ec cmd:  "
+                      "cmd:response-count [arg1 [arg2 ...]]\n");
+               return -EINVAL;
+       }
+
+       /* convert scanf'd ints to char */
+       ec_cmd_bytes = m - 2;
+       for (i = 0; i <= ec_cmd_bytes; i++)
+               ec_cmd[i] = ec_cmd_int[i];
+
+       printk(KERN_DEBUG "olpc-ec: debugfs cmd 0x%02x with %d args "
+              "%02x %02x %02x %02x %02x, want %d returns\n",
+              ec_cmd[0], ec_cmd_bytes, ec_cmd[1], ec_cmd[2], ec_cmd[3],
+              ec_cmd[4], ec_cmd[5], ec_debugfs_resp_bytes);
+
+       olpc_ec_cmd((unsigned char) ec_cmd[0],
+               (ec_cmd_bytes == 0) ? NULL : &ec_cmd[1],
+               ec_cmd_bytes, ec_debugfs_resp, ec_debugfs_resp_bytes);
+
+       printk(KERN_DEBUG "olpc-ec: response "
+              "%02x %02x %02x %02x %02x %02x %02x %02x (%d bytes expected)\n",
+              ec_debugfs_resp[0], ec_debugfs_resp[1], ec_debugfs_resp[2],
+              ec_debugfs_resp[3], ec_debugfs_resp[4], ec_debugfs_resp[5],
+              ec_debugfs_resp[6], ec_debugfs_resp[7], ec_debugfs_resp_bytes);
+
+       return size;
+}
+
+static ssize_t ec_gen_read(struct file *file, char __user *buf,
+                          size_t size, loff_t *ppos)
+{
+       unsigned int i, r;
+       char *rp;
+       char respbuf[64];
+
+       rp = respbuf;
+       rp += sprintf(rp, "%02x", ec_debugfs_resp[0]);
+       for (i = 1; i < ec_debugfs_resp_bytes; i++)
+               rp += sprintf(rp, ", %02x", ec_debugfs_resp[i]);
+       rp += sprintf(rp, "\n");
+
+       r = rp - respbuf;
+
+       return simple_read_from_buffer(buf, size, ppos, respbuf, r);
+}
+
+static const struct file_operations ec_debugfs_genops = {
+       .write   = ec_gen_write,
+       .read    = ec_gen_read,
+};
+
+static void setup_debugfs(void)
+{
+       ec_debugfs_dir = debugfs_create_dir("olpc-ec", 0);
+       if (ec_debugfs_dir == ERR_PTR(-ENODEV))
+               return;
+
+       debugfs_create_file("generic", 0600, ec_debugfs_dir, NULL,
+                           &ec_debugfs_genops);
+}
+
 static int olpc_ec_suspend(void)
 {
        return olpc_ec_mask_write(ec_wakeup_mask);
@@ -372,6 +457,7 @@ static int __init olpc_init(void)
        }
 
        register_syscore_ops(&olpc_syscore_ops);
+       setup_debugfs();
 
        return 0;
 }
index 1d97bd84b6fbcfe684b559cf271f7fe72d75f92b..b2b54d2edf53979fb7daf770c0d4bf0215a73fe0 100644 (file)
@@ -6,14 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_LOCAL
-       bool
-       default n
-
-config CMPXCHG_DOUBLE
-       bool
-       default n
-
 source "arch/x86/Kconfig.cpu"
 
 endmenu
index 8272d92d22c0404412903f2948694a545d263bd9..9408829b46b41c035bc415d767cf94d7e6aec170 100644 (file)
@@ -313,11 +313,22 @@ static int memory_block_change_state(struct memory_block *mem,
 
        ret = memory_block_action(mem->start_section_nr, to_state);
 
-       if (ret)
+       if (ret) {
                mem->state = from_state_req;
-       else
-               mem->state = to_state;
+               goto out;
+       }
 
+       mem->state = to_state;
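+       /* let userspace know about the completed online/offline transition */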
+       switch (mem->state) {
+       case MEM_OFFLINE:
+               kobject_uevent(&mem->sysdev.kobj, KOBJ_OFFLINE);
+               break;
+       case MEM_ONLINE:
+               kobject_uevent(&mem->sysdev.kobj, KOBJ_ONLINE);
+               break;
+       default:
+               break;
+       }
 out:
        mutex_unlock(&mem->state_mutex);
        return ret;
index 409f5ce78829a7a7624a56f6f19422f783ae7e83..6b04be02ab8148a7300a4629570214e90cbcb04d 100644 (file)
@@ -331,13 +331,11 @@ void sysdev_unregister(struct sys_device *sysdev)
 EXPORT_SYMBOL_GPL(sysdev_register);
 EXPORT_SYMBOL_GPL(sysdev_unregister);
 
-#define to_ext_attr(x) container_of(x, struct sysdev_ext_attribute, attr)
-
 ssize_t sysdev_store_ulong(struct sys_device *sysdev,
                           struct sysdev_attribute *attr,
                           const char *buf, size_t size)
 {
-       struct sysdev_ext_attribute *ea = to_ext_attr(attr);
+       struct sysdev_ext_attribute *ea = SYSDEV_TO_EXT_ATTR(attr);
        char *end;
        unsigned long new = simple_strtoul(buf, &end, 0);
        if (end == buf)
@@ -352,7 +350,7 @@ ssize_t sysdev_show_ulong(struct sys_device *sysdev,
                          struct sysdev_attribute *attr,
                          char *buf)
 {
-       struct sysdev_ext_attribute *ea = to_ext_attr(attr);
+       struct sysdev_ext_attribute *ea = SYSDEV_TO_EXT_ATTR(attr);
        return snprintf(buf, PAGE_SIZE, "%lx\n", *(unsigned long *)(ea->var));
 }
 EXPORT_SYMBOL_GPL(sysdev_show_ulong);
@@ -361,7 +359,7 @@ ssize_t sysdev_store_int(struct sys_device *sysdev,
                           struct sysdev_attribute *attr,
                           const char *buf, size_t size)
 {
-       struct sysdev_ext_attribute *ea = to_ext_attr(attr);
+       struct sysdev_ext_attribute *ea = SYSDEV_TO_EXT_ATTR(attr);
        char *end;
        long new = simple_strtol(buf, &end, 0);
        if (end == buf || new > INT_MAX || new < INT_MIN)
@@ -376,7 +374,7 @@ ssize_t sysdev_show_int(struct sys_device *sysdev,
                          struct sysdev_attribute *attr,
                          char *buf)
 {
-       struct sysdev_ext_attribute *ea = to_ext_attr(attr);
+       struct sysdev_ext_attribute *ea = SYSDEV_TO_EXT_ATTR(attr);
        return snprintf(buf, PAGE_SIZE, "%d\n", *(int *)(ea->var));
 }
 EXPORT_SYMBOL_GPL(sysdev_show_int);
index 0833896cf6f2aeb98721441619a9fdeae0ae54bb..c9aeb7fce8782c96987a743e19f73c45289827b3 100644 (file)
@@ -263,16 +263,40 @@ static void hpet_timer_set_irq(struct hpet_dev *devp)
 
 static int hpet_open(struct inode *inode, struct file *file)
 {
-       struct hpet_dev *devp;
        struct hpets *hpetp;
-       int i;
 
        if (file->f_mode & FMODE_WRITE)
                return -EINVAL;
 
+       hpetp = hpets;
+       /* starting with timer-neutral instance */
+       file->private_data = &hpetp->hp_dev[hpetp->hp_ntimer];
+
+       return 0;
+}
+
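+/*
+ * Bind a real comparator to this file on first use: open() above only hands
+ * out the timer-neutral hpet_dev, so callers that actually need a timer
+ * (e.g. fasync below) claim one here.
+ */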
+static int hpet_alloc_timer(struct file *file)
+{
+       struct hpet_dev *devp;
+       struct hpets *hpetp;
+       int i;
+
+       /* once acquired, will remain */
+       devp = file->private_data;
+       if (devp->hd_timer)
+               return 0;
+
        mutex_lock(&hpet_mutex);
        spin_lock_irq(&hpet_lock);
 
+       /* check for race acquiring */
+       devp = file->private_data;
+       if (devp->hd_timer) {
+               spin_unlock_irq(&hpet_lock);
+               mutex_unlock(&hpet_mutex);
+               return 0;
+       }
+
        for (devp = NULL, hpetp = hpets; hpetp && !devp; hpetp = hpetp->hp_next)
                for (i = 0; i < hpetp->hp_ntimer; i++)
                        if (hpetp->hp_dev[i].hd_flags & HPET_OPEN)
@@ -402,6 +426,11 @@ static int hpet_mmap(struct file *file, struct vm_area_struct *vma)
 static int hpet_fasync(int fd, struct file *file, int on)
 {
        struct hpet_dev *devp;
+       int r;
+
+       r = hpet_alloc_timer(file);
+       if (r < 0)
+               return r;
 
        devp = file->private_data;
 
@@ -420,6 +449,9 @@ static int hpet_release(struct inode *inode, struct file *file)
        devp = file->private_data;
        timer = devp->hd_timer;
 
+       if (!timer)
+               goto out;
+
        spin_lock_irq(&hpet_lock);
 
        writeq((readq(&timer->hpet_config) & ~Tn_INT_ENB_CNF_MASK),
@@ -444,7 +476,7 @@ static int hpet_release(struct inode *inode, struct file *file)
 
        if (irq)
                free_irq(irq, devp);
-
+out:
        file->private_data = NULL;
        return 0;
 }
@@ -593,6 +625,9 @@ hpet_ioctl_common(struct hpet_dev *devp, int cmd, unsigned long arg,
                break;
        case HPET_IE_ON:
                return hpet_ioctl_ieon(devp);
+       case HPET_ALLOC_TIMER:
+               /* nothing to do */
+               return 0;
        default:
                return -EINVAL;
        }
@@ -859,7 +894,11 @@ int hpet_alloc(struct hpet_data *hdp)
                return 0;
        }
 
-       siz = sizeof(struct hpets) + ((hdp->hd_nirqs - 1) *
+       /*
+        * last hpet_dev will have null timer pointer, gives timer-neutral
+        * representation of block
+        */
+       siz = sizeof(struct hpets) + ((hdp->hd_nirqs) *
                                      sizeof(struct hpet_dev));
 
        hpetp = kzalloc(siz, GFP_KERNEL);
@@ -925,13 +964,16 @@ int hpet_alloc(struct hpet_data *hdp)
                writeq(mcfg, &hpet->hpet_config);
        }
 
-       for (i = 0, devp = hpetp->hp_dev; i < hpetp->hp_ntimer; i++, devp++) {
+       for (i = 0, devp = hpetp->hp_dev; i < hpetp->hp_ntimer + 1;
+            i++, devp++) {
                struct hpet_timer __iomem *timer;
 
-               timer = &hpet->hpet_timers[devp - hpetp->hp_dev];
-
                devp->hd_hpets = hpetp;
                devp->hd_hpet = hpet;
+               if (i == hpetp->hp_ntimer)
+                       continue;
+
+               timer = &hpet->hpet_timers[devp - hpetp->hp_dev];
                devp->hd_timer = timer;
 
                /*
index c2917ffad2c2a311beeeaed7f8171196601d78ff..34767a6d7f42a354edca655d901c298e3883609a 100644 (file)
 #define IPMI_WDOG_SET_TIMER            0x24
 #define IPMI_WDOG_GET_TIMER            0x25
 
+#define IPMI_WDOG_TIMER_NOT_INIT_RESP  0x80
+
 /* These are here until the real ones get into the watchdog.h interface. */
 #ifndef WDIOC_GETTIMEOUT
 #define        WDIOC_GETTIMEOUT        _IOW(WATCHDOG_IOCTL_BASE, 20, int)
@@ -596,6 +598,7 @@ static int ipmi_heartbeat(void)
        struct kernel_ipmi_msg            msg;
        int                               rv;
        struct ipmi_system_interface_addr addr;
+       int                               timeout_retries = 0;
 
        if (ipmi_ignore_heartbeat)
                return 0;
@@ -616,6 +619,7 @@ static int ipmi_heartbeat(void)
 
        mutex_lock(&heartbeat_lock);
 
+restart:
        atomic_set(&heartbeat_tofree, 2);
 
        /*
@@ -653,7 +657,33 @@ static int ipmi_heartbeat(void)
        /* Wait for the heartbeat to be sent. */
        wait_for_completion(&heartbeat_wait);
 
-       if (heartbeat_recv_msg.msg.data[0] != 0) {
+       if (heartbeat_recv_msg.msg.data[0] == IPMI_WDOG_TIMER_NOT_INIT_RESP)  {
+               timeout_retries++;
+               if (timeout_retries > 3) {
+                       printk(KERN_ERR PFX ": Unable to restore the IPMI"
+                              " watchdog's settings, giving up.\n");
+                       rv = -EIO;
+                       goto out_unlock;
+               }
+
+               /*
+                * The timer was not initialized, that means the BMC was
+                * probably reset and lost the watchdog information.  Attempt
+                * to restore the timer's info.  Note that we still hold
+                * the heartbeat lock, to keep a heartbeat from happening
+                * in this process, so must say no heartbeat to avoid a
+                * deadlock on this mutex.
+                */
+               rv = ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB);
+               if (rv) {
+                       printk(KERN_ERR PFX ": Unable to send the command to"
+                              " set the watchdog's settings, giving up.\n");
+                       goto out_unlock;
+               }
+
+               /* We might need a new heartbeat, so do it now */
+               goto restart;
+       } else if (heartbeat_recv_msg.msg.data[0] != 0) {
                /*
                 * Got an error in the heartbeat response.  It was already
                 * reported in ipmi_wdog_msg_handler, but we should return
@@ -662,6 +692,7 @@ static int ipmi_heartbeat(void)
                rv = -EINVAL;
        }
 
+out_unlock:
        mutex_unlock(&heartbeat_lock);
 
        return rv;
@@ -922,11 +953,15 @@ static struct miscdevice ipmi_wdog_miscdev = {
 static void ipmi_wdog_msg_handler(struct ipmi_recv_msg *msg,
                                  void                 *handler_data)
 {
-       if (msg->msg.data[0] != 0) {
+       if (msg->msg.cmd == IPMI_WDOG_RESET_TIMER &&
+                       msg->msg.data[0] == IPMI_WDOG_TIMER_NOT_INIT_RESP)
+               printk(KERN_INFO PFX "response: The IPMI controller appears"
+                      " to have been reset, will attempt to reinitialize"
+                      " the watchdog timer\n");
+       else if (msg->msg.data[0] != 0)
                printk(KERN_ERR PFX "response: Error %x on cmd %x\n",
                       msg->msg.data[0],
                       msg->msg.cmd);
-       }
 
        ipmi_free_recv_msg(msg);
 }
index 7c7f42a1f880cc81af6dafd89b73810c7eca34aa..9fec3232b7361b102ff5e05e55f2444eeab6ffb8 100644 (file)
@@ -83,8 +83,7 @@ static void ramoops_do_dump(struct kmsg_dumper *dumper,
        struct timeval timestamp;
 
        if (reason != KMSG_DUMP_OOPS &&
-           reason != KMSG_DUMP_PANIC &&
-           reason != KMSG_DUMP_KEXEC)
+           reason != KMSG_DUMP_PANIC)
                return;
 
        /* Only dump oopses if dump_oops is set */
@@ -126,8 +125,8 @@ static int __init ramoops_probe(struct platform_device *pdev)
                goto fail3;
        }
 
-       rounddown_pow_of_two(pdata->mem_size);
-       rounddown_pow_of_two(pdata->record_size);
+       pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
+       pdata->record_size = rounddown_pow_of_two(pdata->record_size);
 
        /* Check for the minimum memory size */
        if (pdata->mem_size < MIN_MEM_SIZE &&
@@ -148,14 +147,6 @@ static int __init ramoops_probe(struct platform_device *pdev)
        cxt->phys_addr = pdata->mem_address;
        cxt->record_size = pdata->record_size;
        cxt->dump_oops = pdata->dump_oops;
-       /*
-        * Update the module parameter variables as well so they are visible
-        * through /sys/module/ramoops/parameters/
-        */
-       mem_size = pdata->mem_size;
-       mem_address = pdata->mem_address;
-       record_size = pdata->record_size;
-       dump_oops = pdata->dump_oops;
 
        if (!request_mem_region(cxt->phys_addr, cxt->size, "ramoops")) {
                pr_err("request mem region failed\n");
@@ -176,6 +167,15 @@ static int __init ramoops_probe(struct platform_device *pdev)
                goto fail1;
        }
 
+       /*
+        * Update the module parameter variables as well so they are visible
+        * through /sys/module/ramoops/parameters/
+        */
+       mem_size = pdata->mem_size;
+       mem_address = pdata->mem_address;
+       record_size = pdata->record_size;
+       dump_oops = pdata->dump_oops;
+
        return 0;
 
 fail1:
index 5d2f8e13cf0e670e83b48b6dc243402d781b9758..38da9f8dadc86a5ba2c15146bf48ddd845f938be 100644 (file)
@@ -297,33 +297,42 @@ static void __setup_broadcast_timer(void *arg)
        clockevents_notify(reason, &cpu);
 }
 
-static int setup_broadcast_cpuhp_notify(struct notifier_block *n,
+static void auto_demotion_disable(void *dummy)
+{
+       unsigned long long msr_bits;
+
+       rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
+       msr_bits &= ~auto_demotion_disable_flags;
+       wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
+}
+
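+/*
+ * Re-apply the per-cpu setup (auto-demotion MSR bits and broadcast-timer
+ * notification) when a CPU comes online; run on that CPU via
+ * smp_call_function_single() from the hotplug notifier below.
+ */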
+static void __intel_idle_notify_handler(void *arg)
+{
+       if (auto_demotion_disable_flags)
+               auto_demotion_disable(NULL);
+
+       if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
+               __setup_broadcast_timer((void *)true);
+}
+
+static int setup_intelidle_cpuhp_notify(struct notifier_block *n,
                unsigned long action, void *hcpu)
 {
        int hotcpu = (unsigned long)hcpu;
 
        switch (action & 0xf) {
        case CPU_ONLINE:
-               smp_call_function_single(hotcpu, __setup_broadcast_timer,
-                       (void *)true, 1);
+               smp_call_function_single(hotcpu, __intel_idle_notify_handler,
+                       NULL, 1);
                break;
        }
        return NOTIFY_OK;
 }
 
-static struct notifier_block setup_broadcast_notifier = {
-       .notifier_call = setup_broadcast_cpuhp_notify,
+static struct notifier_block setup_intelidle_notifier = {
+       .notifier_call = setup_intelidle_cpuhp_notify,
 };
 
-static void auto_demotion_disable(void *dummy)
-{
-       unsigned long long msr_bits;
-
-       rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
-       msr_bits &= ~auto_demotion_disable_flags;
-       wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
-}
-
 /*
  * intel_idle_probe()
  */
@@ -393,10 +402,8 @@ static int intel_idle_probe(void)
 
        if (boot_cpu_has(X86_FEATURE_ARAT))     /* Always Reliable APIC Timer */
                lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
-       else {
-               smp_call_function(__setup_broadcast_timer, (void *)true, 1);
-               register_cpu_notifier(&setup_broadcast_notifier);
-       }
+       else
+               on_each_cpu(__setup_broadcast_timer, (void *)true, 1);
 
        pr_debug(PREFIX "v" INTEL_IDLE_VERSION
                " model 0x%X\n", boot_cpu_data.x86_model);
@@ -471,7 +478,7 @@ static int intel_idle_cpuidle_driver_init(void)
        }
 
        if (auto_demotion_disable_flags)
-               smp_call_function(auto_demotion_disable, NULL, 1);
+               on_each_cpu(auto_demotion_disable, NULL, 1);
 
        return 0;
 }
@@ -559,6 +566,10 @@ static int __init intel_idle_init(void)
                return retval;
        }
 
+       if (auto_demotion_disable_flags || lapic_timer_reliable_states !=
+           LAPIC_TIMER_ALWAYS_RELIABLE)
+               register_cpu_notifier(&setup_intelidle_notifier);
+
        return 0;
 }
 
@@ -567,10 +578,12 @@ static void __exit intel_idle_exit(void)
        intel_idle_cpuidle_devices_uninit();
        cpuidle_unregister_driver(&intel_idle_driver);
 
-       if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) {
-               smp_call_function(__setup_broadcast_timer, (void *)false, 1);
-               unregister_cpu_notifier(&setup_broadcast_notifier);
-       }
+       if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
+               on_each_cpu(__setup_broadcast_timer, (void *)false, 1);
+
+       if (auto_demotion_disable_flags || lapic_timer_reliable_states !=
+           LAPIC_TIMER_ALWAYS_RELIABLE)
+               unregister_cpu_notifier(&setup_intelidle_notifier);
 
        return;
 }
index 1b75a56ebd08016cb517e62086ec62949e9aed0a..897a77dfa9d7dd923ec7567c26c729e1901e1757 100644 (file)
@@ -388,6 +388,14 @@ config LEDS_RENESAS_TPU
          pin function. The latter to support brightness control.
          Brightness control is supported but hardware blinking is not.
 
+config LEDS_TCA6507
+       tristate "LED Support for TCA6507 I2C chip"
+       depends on LEDS_CLASS && I2C
+       help
+         This option enables support for LEDs connected to TCA6507
+         LED driver chips accessed via the I2C bus.
+         The driver supports brightness control and hardware-assisted blinking.
+
 config LEDS_TRIGGERS
        bool "LED Trigger support"
        depends on LEDS_CLASS
index e4f6bf568880d284cc4743e4cb6231b450d57296..a0525f371023dd7a7fc274ed8e8645655320be8a 100644 (file)
@@ -43,6 +43,7 @@ obj-$(CONFIG_LEDS_NS2)                        += leds-ns2.o
 obj-$(CONFIG_LEDS_NETXBIG)             += leds-netxbig.o
 obj-$(CONFIG_LEDS_ASIC3)               += leds-asic3.o
 obj-$(CONFIG_LEDS_RENESAS_TPU)         += leds-renesas-tpu.o
+obj-$(CONFIG_LEDS_TCA6507)             += leds-tca6507.o
 
 # LED SPI Drivers
 obj-$(CONFIG_LEDS_DAC124S085)          += leds-dac124s085.o
index 0810604dc701307973f2967b8fd578f5a9bee997..4ca00624bd1860ed3b1f29e27ea79b59b47dc46d 100644 (file)
@@ -238,17 +238,7 @@ static struct platform_driver pm860x_led_driver = {
        .remove = pm860x_led_remove,
 };
 
-static int __devinit pm860x_led_init(void)
-{
-       return platform_driver_register(&pm860x_led_driver);
-}
-module_init(pm860x_led_init);
-
-static void __devexit pm860x_led_exit(void)
-{
-       platform_driver_unregister(&pm860x_led_driver);
-}
-module_exit(pm860x_led_exit);
+module_platform_driver(pm860x_led_driver);
 
 MODULE_DESCRIPTION("LED driver for Marvell PM860x");
 MODULE_AUTHOR("Haojian Zhuang <haojian.zhuang@marvell.com>");
index 7ba4c7b5b97e07ce6ff11d26895a154b25b5a956..b1400db3f839a2730b6dc29db302306fb3f627eb 100644 (file)
@@ -213,17 +213,7 @@ static struct platform_driver adp5520_led_driver = {
        .remove         = __devexit_p(adp5520_led_remove),
 };
 
-static int __init adp5520_led_init(void)
-{
-       return platform_driver_register(&adp5520_led_driver);
-}
-module_init(adp5520_led_init);
-
-static void __exit adp5520_led_exit(void)
-{
-       platform_driver_unregister(&adp5520_led_driver);
-}
-module_exit(adp5520_led_exit);
+module_platform_driver(adp5520_led_driver);
 
 MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
 MODULE_DESCRIPTION("LEDS ADP5520(01) Driver");
index 8c00937bf7e74d02bc1759cbab01ca6eba1bc13e..07428357c83fd467ef2c524c65f4c4f4404f2fe1 100644 (file)
@@ -118,18 +118,7 @@ static struct platform_driver ams_delta_led_driver = {
        },
 };
 
-static int __init ams_delta_led_init(void)
-{
-       return platform_driver_register(&ams_delta_led_driver);
-}
-
-static void __exit ams_delta_led_exit(void)
-{
-       platform_driver_unregister(&ams_delta_led_driver);
-}
-
-module_init(ams_delta_led_init);
-module_exit(ams_delta_led_exit);
+module_platform_driver(ams_delta_led_driver);
 
 MODULE_AUTHOR("Jonathan McDowell <noodles@earth.li>");
 MODULE_DESCRIPTION("Amstrad Delta LED driver");
index 48d9fe61bdfcdc39e7bd6b1d942bb637f28c00ff..525a92492837bb892de5b4fb29050bf939a0a4fb 100644 (file)
@@ -179,21 +179,9 @@ static struct platform_driver asic3_led_driver = {
        },
 };
 
-MODULE_ALIAS("platform:leds-asic3");
-
-static int __init asic3_led_init(void)
-{
-       return platform_driver_register(&asic3_led_driver);
-}
-
-static void __exit asic3_led_exit(void)
-{
-       platform_driver_unregister(&asic3_led_driver);
-}
-
-module_init(asic3_led_init);
-module_exit(asic3_led_exit);
+module_platform_driver(asic3_led_driver);
 
 MODULE_AUTHOR("Paul Parsons <lost.distance@yahoo.com>");
 MODULE_DESCRIPTION("HTC ASIC3 LED driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:leds-asic3");
index 109c875ea233486fc191a33b938f06f719ad1eaa..800243b6037ed9edc5b945b74d226a17be05f712 100644 (file)
@@ -134,29 +134,18 @@ static int __exit pwmled_remove(struct platform_device *pdev)
        return 0;
 }
 
-/* work with hotplug and coldplug */
-MODULE_ALIAS("platform:leds-atmel-pwm");
-
 static struct platform_driver pwmled_driver = {
        .driver = {
                .name =         "leds-atmel-pwm",
                .owner =        THIS_MODULE,
        },
        /* REVISIT add suspend() and resume() methods */
+       .probe =        pwmled_probe,
        .remove =       __exit_p(pwmled_remove),
 };
 
-static int __init modinit(void)
-{
-       return platform_driver_probe(&pwmled_driver, pwmled_probe);
-}
-module_init(modinit);
-
-static void __exit modexit(void)
-{
-       platform_driver_unregister(&pwmled_driver);
-}
-module_exit(modexit);
+module_platform_driver(pwmled_driver);
 
 MODULE_DESCRIPTION("Driver for LEDs with PWM-controlled brightness");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:leds-atmel-pwm");
index ea2185531f826e064d53437f956fbfb04e5e35f5..591cbdf5a0463e99ab91892bfc50b0beeb1c579c 100644 (file)
@@ -688,8 +688,7 @@ static int __devinit bd2802_probe(struct i2c_client *client,
        i2c_set_clientdata(client, led);
 
        /* Configure RESET GPIO (L: RESET, H: RESET cancel) */
-       gpio_request(pdata->reset_gpio, "RGB_RESETB");
-       gpio_direction_output(pdata->reset_gpio, 1);
+       gpio_request_one(pdata->reset_gpio, GPIOF_OUT_INIT_HIGH, "RGB_RESETB");
 
        /* Tacss = min 0.1ms */
        udelay(100);
@@ -813,17 +812,7 @@ static struct i2c_driver bd2802_i2c_driver = {
        .id_table       = bd2802_id,
 };
 
-static int __init bd2802_init(void)
-{
-       return i2c_add_driver(&bd2802_i2c_driver);
-}
-module_init(bd2802_init);
-
-static void __exit bd2802_exit(void)
-{
-       i2c_del_driver(&bd2802_i2c_driver);
-}
-module_exit(bd2802_exit);
+module_i2c_driver(bd2802_i2c_driver);
 
 MODULE_AUTHOR("Kim Kyuwon <q1.kim@samsung.com>");
 MODULE_DESCRIPTION("BD2802 LED driver");
index da5fb016b1a550fabfee5114bb11727a22c01749..6a8725cc7b4dfe119a2083e37e1290198bdbdd24 100644 (file)
@@ -75,9 +75,6 @@ static int __devexit cobalt_qube_led_remove(struct platform_device *pdev)
        return 0;
 }
 
-/* work with hotplug and coldplug */
-MODULE_ALIAS("platform:cobalt-qube-leds");
-
 static struct platform_driver cobalt_qube_led_driver = {
        .probe  = cobalt_qube_led_probe,
        .remove = __devexit_p(cobalt_qube_led_remove),
@@ -87,19 +84,9 @@ static struct platform_driver cobalt_qube_led_driver = {
        },
 };
 
-static int __init cobalt_qube_led_init(void)
-{
-       return platform_driver_register(&cobalt_qube_led_driver);
-}
-
-static void __exit cobalt_qube_led_exit(void)
-{
-       platform_driver_unregister(&cobalt_qube_led_driver);
-}
-
-module_init(cobalt_qube_led_init);
-module_exit(cobalt_qube_led_exit);
+module_platform_driver(cobalt_qube_led_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Front LED support for Cobalt Server");
 MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>");
+MODULE_ALIAS("platform:cobalt-qube-leds");
index f28931cf6781049562018d0af5d59e014b2936d2..d9cd73ebd6c44c3e89ff032e2c2155eb036ad972 100644 (file)
@@ -158,17 +158,7 @@ static struct platform_driver da903x_led_driver = {
        .remove         = __devexit_p(da903x_led_remove),
 };
 
-static int __init da903x_led_init(void)
-{
-       return platform_driver_register(&da903x_led_driver);
-}
-module_init(da903x_led_init);
-
-static void __exit da903x_led_exit(void)
-{
-       platform_driver_unregister(&da903x_led_driver);
-}
-module_exit(da903x_led_exit);
+module_platform_driver(da903x_led_driver);
 
 MODULE_DESCRIPTION("LEDs driver for Dialog Semiconductor DA9030/DA9034");
 MODULE_AUTHOR("Eric Miao <eric.miao@marvell.com>"
index 31cf0d60a9a546052e782d5177b8461c9673d941..d56c14269ff0c7e9c5e40992f689919024cef7bd 100644 (file)
@@ -131,18 +131,7 @@ static struct spi_driver dac124s085_driver = {
        },
 };
 
-static int __init dac124s085_leds_init(void)
-{
-       return spi_register_driver(&dac124s085_driver);
-}
-
-static void __exit dac124s085_leds_exit(void)
-{
-       spi_unregister_driver(&dac124s085_driver);
-}
-
-module_init(dac124s085_leds_init);
-module_exit(dac124s085_leds_exit);
+module_spi_driver(dac124s085_driver);
 
 MODULE_AUTHOR("Guennadi Liakhovetski <lg@denx.de>");
 MODULE_DESCRIPTION("DAC124S085 LED driver");
index 49aceffaa5b6b76d0fb4b5fc8e14d6b90012f7fa..b9053fa6e2534312495217164c4a5f4e1eeb403d 100644 (file)
@@ -224,20 +224,7 @@ static struct platform_driver fsg_led_driver = {
        },
 };
 
-
-static int __init fsg_led_init(void)
-{
-       return platform_driver_register(&fsg_led_driver);
-}
-
-static void __exit fsg_led_exit(void)
-{
-       platform_driver_unregister(&fsg_led_driver);
-}
-
-
-module_init(fsg_led_init);
-module_exit(fsg_led_exit);
+module_platform_driver(fsg_led_driver);
 
 MODULE_AUTHOR("Rod Whitby <rod@whitby.id.au>");
 MODULE_DESCRIPTION("Freecom FSG-3 LED driver");
index 399a86f2013a145f1945f39077d23805bb40213d..7df74cb97e702e693935ab798598b9967b0d0ae7 100644 (file)
@@ -293,21 +293,9 @@ static struct platform_driver gpio_led_driver = {
        },
 };
 
-MODULE_ALIAS("platform:leds-gpio");
-
-static int __init gpio_led_init(void)
-{
-       return platform_driver_register(&gpio_led_driver);
-}
-
-static void __exit gpio_led_exit(void)
-{
-       platform_driver_unregister(&gpio_led_driver);
-}
-
-module_init(gpio_led_init);
-module_exit(gpio_led_exit);
+module_platform_driver(gpio_led_driver);
 
 MODULE_AUTHOR("Raphael Assenat <raph@8d.com>, Trent Piepho <tpiepho@freescale.com>");
 MODULE_DESCRIPTION("GPIO LED driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:leds-gpio");
index bcfbd3a60eab6b8ee4ae0ecb4cf14efce17a2ac3..366b6055e33063e5461d5fdc2bdac3ba23fff611 100644 (file)
@@ -79,9 +79,6 @@ static int hp6xxled_remove(struct platform_device *pdev)
        return 0;
 }
 
-/* work with hotplug and coldplug */
-MODULE_ALIAS("platform:hp6xx-led");
-
 static struct platform_driver hp6xxled_driver = {
        .probe          = hp6xxled_probe,
        .remove         = hp6xxled_remove,
@@ -91,19 +88,9 @@ static struct platform_driver hp6xxled_driver = {
        },
 };
 
-static int __init hp6xxled_init(void)
-{
-       return platform_driver_register(&hp6xxled_driver);
-}
-
-static void __exit hp6xxled_exit(void)
-{
-       platform_driver_unregister(&hp6xxled_driver);
-}
-
-module_init(hp6xxled_init);
-module_exit(hp6xxled_exit);
+module_platform_driver(hp6xxled_driver);
 
 MODULE_AUTHOR("Kristoffer Ericson <kristoffer.ericson@gmail.com>");
 MODULE_DESCRIPTION("HP Jornada 6xx LED driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:hp6xx-led");
index 0630e4f4b2866a8829a1731411dfa19afb7e0bae..45e6878d73741d8359db2069b68bbe1df2589f2e 100644 (file)
@@ -457,18 +457,7 @@ static struct i2c_driver lm3530_i2c_driver = {
        },
 };
 
-static int __init lm3530_init(void)
-{
-       return i2c_add_driver(&lm3530_i2c_driver);
-}
-
-static void __exit lm3530_exit(void)
-{
-       i2c_del_driver(&lm3530_i2c_driver);
-}
-
-module_init(lm3530_init);
-module_exit(lm3530_exit);
+module_i2c_driver(lm3530_i2c_driver);
 
 MODULE_DESCRIPTION("Back Light driver for LM3530");
 MODULE_LICENSE("GPL v2");
index 9010c054615e414fa5c236a8fe2cc4a700a00757..b8f9f0a5d4318d1291e377fe1fd4599d3bc15e13 100644 (file)
@@ -453,18 +453,7 @@ static struct i2c_driver lp3944_driver = {
        .id_table = lp3944_id,
 };
 
-static int __init lp3944_module_init(void)
-{
-       return i2c_add_driver(&lp3944_driver);
-}
-
-static void __exit lp3944_module_exit(void)
-{
-       i2c_del_driver(&lp3944_driver);
-}
-
-module_init(lp3944_module_init);
-module_exit(lp3944_module_exit);
+module_i2c_driver(lp3944_driver);
 
 MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
 MODULE_DESCRIPTION("LP3944 Fun Light Chip");
index cb641f1b33429ab2776cb300194b4e0fd00d3fdb..d62a7982a5e66ad25812e87c86773985c3ffc802 100644 (file)
@@ -797,25 +797,7 @@ static struct i2c_driver lp5521_driver = {
        .id_table       = lp5521_id,
 };
 
-static int __init lp5521_init(void)
-{
-       int ret;
-
-       ret = i2c_add_driver(&lp5521_driver);
-
-       if (ret < 0)
-               printk(KERN_ALERT "Adding lp5521 driver failed\n");
-
-       return ret;
-}
-
-static void __exit lp5521_exit(void)
-{
-       i2c_del_driver(&lp5521_driver);
-}
-
-module_init(lp5521_init);
-module_exit(lp5521_exit);
+module_i2c_driver(lp5521_driver);
 
 MODULE_AUTHOR("Mathias Nyman, Yuri Zaporozhets, Samu Onkalo");
 MODULE_DESCRIPTION("LP5521 LED engine");
index 5971e309b2342390a1988cd7f58d6e49d5a24301..73e791ae725993e1833f40cc8b0b059a0a55b53e 100644 (file)
@@ -870,8 +870,6 @@ static int __devinit lp5523_init_led(struct lp5523_led *led, struct device *dev,
        return 0;
 }
 
-static struct i2c_driver lp5523_driver;
-
 static int __devinit lp5523_probe(struct i2c_client *client,
                        const struct i2c_device_id *id)
 {
@@ -1021,25 +1019,7 @@ static struct i2c_driver lp5523_driver = {
        .id_table       = lp5523_id,
 };
 
-static int __init lp5523_init(void)
-{
-       int ret;
-
-       ret = i2c_add_driver(&lp5523_driver);
-
-       if (ret < 0)
-               printk(KERN_ALERT "Adding lp5523 driver failed\n");
-
-       return ret;
-}
-
-static void __exit lp5523_exit(void)
-{
-       i2c_del_driver(&lp5523_driver);
-}
-
-module_init(lp5523_init);
-module_exit(lp5523_exit);
+module_i2c_driver(lp5523_driver);
 
 MODULE_AUTHOR("Mathias Nyman <mathias.nyman@nokia.com>");
 MODULE_DESCRIPTION("LP5523 LED engine");
index 53f67b8ce55db505bd4ae5127423202e9141bca3..e311a96c4469759ae72eadabfc3f86c8c7795760 100644 (file)
@@ -199,21 +199,9 @@ static struct platform_driver lt3593_led_driver = {
        },
 };
 
-MODULE_ALIAS("platform:leds-lt3593");
-
-static int __init lt3593_led_init(void)
-{
-       return platform_driver_register(&lt3593_led_driver);
-}
-
-static void __exit lt3593_led_exit(void)
-{
-       platform_driver_unregister(&lt3593_led_driver);
-}
-
-module_init(lt3593_led_init);
-module_exit(lt3593_led_exit);
+module_platform_driver(lt3593_led_driver);
 
 MODULE_AUTHOR("Daniel Mack <daniel@caiaq.de>");
 MODULE_DESCRIPTION("LED driver for LT3593 controllers");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:leds-lt3593");
index b3393a9f21398c119ad0440b68cf58953bc002a5..8bc4915415509d6e2456a0d4ea46446bf9188eca 100644 (file)
@@ -275,7 +275,7 @@ static int __devinit mc13783_led_probe(struct platform_device *pdev)
                return -ENODEV;
        }
 
-       if (pdata->num_leds < 1 || pdata->num_leds > MC13783_LED_MAX) {
+       if (pdata->num_leds < 1 || pdata->num_leds > (MC13783_LED_MAX + 1)) {
                dev_err(&pdev->dev, "Invalid led count %d\n", pdata->num_leds);
                return -EINVAL;
        }
@@ -385,17 +385,7 @@ static struct platform_driver mc13783_led_driver = {
        .remove         = __devexit_p(mc13783_led_remove),
 };
 
-static int __init mc13783_led_init(void)
-{
-       return platform_driver_register(&mc13783_led_driver);
-}
-module_init(mc13783_led_init);
-
-static void __exit mc13783_led_exit(void)
-{
-       platform_driver_unregister(&mc13783_led_driver);
-}
-module_exit(mc13783_led_exit);
+module_platform_driver(mc13783_led_driver);
 
 MODULE_DESCRIPTION("LEDs driver for Freescale MC13783 PMIC");
 MODULE_AUTHOR("Philippe Retornaz <philippe.retornaz@epfl.ch>");
index f2e51c13439962391626b682337e912ca2a83bfb..d8433f2d53bc712995b899db7a19e6d16bb091be 100644 (file)
@@ -81,35 +81,23 @@ static int __devinit gpio_ext_init(struct netxbig_gpio_ext *gpio_ext)
 
        /* Configure address GPIOs. */
        for (i = 0; i < gpio_ext->num_addr; i++) {
-               err = gpio_request(gpio_ext->addr[i], "GPIO extension addr");
+               err = gpio_request_one(gpio_ext->addr[i], GPIOF_OUT_INIT_LOW,
+                                      "GPIO extension addr");
                if (err)
                        goto err_free_addr;
-               err = gpio_direction_output(gpio_ext->addr[i], 0);
-               if (err) {
-                       gpio_free(gpio_ext->addr[i]);
-                       goto err_free_addr;
-               }
        }
        /* Configure data GPIOs. */
        for (i = 0; i < gpio_ext->num_data; i++) {
-               err = gpio_request(gpio_ext->data[i], "GPIO extension data");
+               err = gpio_request_one(gpio_ext->data[i], GPIOF_OUT_INIT_LOW,
+                                  "GPIO extension data");
                if (err)
                        goto err_free_data;
-               err = gpio_direction_output(gpio_ext->data[i], 0);
-               if (err) {
-                       gpio_free(gpio_ext->data[i]);
-                       goto err_free_data;
-               }
        }
        /* Configure "enable select" GPIO. */
-       err = gpio_request(gpio_ext->enable, "GPIO extension enable");
+       err = gpio_request_one(gpio_ext->enable, GPIOF_OUT_INIT_LOW,
+                              "GPIO extension enable");
        if (err)
                goto err_free_data;
-       err = gpio_direction_output(gpio_ext->enable, 0);
-       if (err) {
-               gpio_free(gpio_ext->enable);
-               goto err_free_data;
-       }
 
        return 0;
 
@@ -429,21 +417,10 @@ static struct platform_driver netxbig_led_driver = {
                .owner  = THIS_MODULE,
        },
 };
-MODULE_ALIAS("platform:leds-netxbig");
 
-static int __init netxbig_led_init(void)
-{
-       return platform_driver_register(&netxbig_led_driver);
-}
-
-static void __exit netxbig_led_exit(void)
-{
-       platform_driver_unregister(&netxbig_led_driver);
-}
-
-module_init(netxbig_led_init);
-module_exit(netxbig_led_exit);
+module_platform_driver(netxbig_led_driver);
 
 MODULE_AUTHOR("Simon Guinot <sguinot@lacie.com>");
 MODULE_DESCRIPTION("LED driver for LaCie xBig Network boards");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:leds-netxbig");
index 37b7d0cfe5867e91ed0c1aa94039dc4985dc79f7..2f0a14421a7344fa0505f27bea94b23857fba340 100644 (file)
@@ -323,21 +323,10 @@ static struct platform_driver ns2_led_driver = {
                .owner  = THIS_MODULE,
        },
 };
-MODULE_ALIAS("platform:leds-ns2");
-
-static int __init ns2_led_init(void)
-{
-       return platform_driver_register(&ns2_led_driver);
-}
 
-static void __exit ns2_led_exit(void)
-{
-       platform_driver_unregister(&ns2_led_driver);
-}
-
-module_init(ns2_led_init);
-module_exit(ns2_led_exit);
+module_platform_driver(ns2_led_driver);
 
 MODULE_AUTHOR("Simon Guinot <sguinot@lacie.com>");
 MODULE_DESCRIPTION("Network Space v2 LED driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:leds-ns2");
index a2c874623e3521550ab6917be4b56eb3e42b4aaa..ceccab44b5b818404da63b76f74cda552e7eb367 100644 (file)
@@ -489,20 +489,8 @@ static int pca9532_remove(struct i2c_client *client)
        return 0;
 }
 
-static int __init pca9532_init(void)
-{
-       return i2c_add_driver(&pca9532_driver);
-}
-
-static void __exit pca9532_exit(void)
-{
-       i2c_del_driver(&pca9532_driver);
-}
+module_i2c_driver(pca9532_driver);
 
 MODULE_AUTHOR("Riku Voipio");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("PCA 9532 LED dimmer");
-
-module_init(pca9532_init);
-module_exit(pca9532_exit);
-
index 66aa3e8e786f545c4db0e6bd4318a173a792fc1e..dcc3bc3d38db8cac7384c65009965967bc026f24 100644 (file)
@@ -371,18 +371,7 @@ static struct i2c_driver pca955x_driver = {
        .id_table = pca955x_id,
 };
 
-static int __init pca955x_leds_init(void)
-{
-       return i2c_add_driver(&pca955x_driver);
-}
-
-static void __exit pca955x_leds_exit(void)
-{
-       i2c_del_driver(&pca955x_driver);
-}
-
-module_init(pca955x_leds_init);
-module_exit(pca955x_leds_exit);
+module_i2c_driver(pca955x_driver);
 
 MODULE_AUTHOR("Nate Case <ncase@xes-inc.com>");
 MODULE_DESCRIPTION("PCA955x LED driver");
index 666daf77872e56239bbf59a002c180593996ae7a..3ed92f34bd4477108549499e7bbf4959af1a56cd 100644 (file)
@@ -135,18 +135,7 @@ static struct platform_driver led_pwm_driver = {
        },
 };
 
-static int __init led_pwm_init(void)
-{
-       return platform_driver_register(&led_pwm_driver);
-}
-
-static void __exit led_pwm_exit(void)
-{
-       platform_driver_unregister(&led_pwm_driver);
-}
-
-module_init(led_pwm_init);
-module_exit(led_pwm_exit);
+module_platform_driver(led_pwm_driver);
 
 MODULE_AUTHOR("Luotao Fu <l.fu@pengutronix.de>");
 MODULE_DESCRIPTION("PWM LED driver for PXA");
index c3525f37f73d6895c5abfd619af06c7164cf119f..a7815b6cd8567d9dfd97ed9437f348d9cac4ebc8 100644 (file)
@@ -57,21 +57,9 @@ static struct platform_driver rb532_led_driver = {
        },
 };
 
-static int __init rb532_led_init(void)
-{
-       return platform_driver_register(&rb532_led_driver);
-}
-
-static void __exit rb532_led_exit(void)
-{
-       platform_driver_unregister(&rb532_led_driver);
-}
-
-module_init(rb532_led_init);
-module_exit(rb532_led_exit);
-
-MODULE_ALIAS("platform:rb532-led");
+module_platform_driver(rb532_led_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("User LED support for Routerboard532");
 MODULE_AUTHOR("Phil Sutter <n0-1@freewrt.org>");
+MODULE_ALIAS("platform:rb532-led");
index 8497f56f8e461cf70d8d1b8692e04b11e353323c..df7e963bddd304f37bf507fe61ca317066111253 100644 (file)
@@ -229,17 +229,7 @@ static struct platform_driver regulator_led_driver = {
        .remove = __devexit_p(regulator_led_remove),
 };
 
-static int __init regulator_led_init(void)
-{
-       return platform_driver_register(&regulator_led_driver);
-}
-module_init(regulator_led_init);
-
-static void __exit regulator_led_exit(void)
-{
-       platform_driver_unregister(&regulator_led_driver);
-}
-module_exit(regulator_led_exit);
+module_platform_driver(regulator_led_driver);
 
 MODULE_AUTHOR("Antonio Ospite <ospite@studenti.unina.it>");
 MODULE_DESCRIPTION("Regulator driven LED driver");
index 3ee540eb127e7ffadfd0d5c2e471dc14380fa307..32fe337d5c687fbb09c06bd778e70cdc3cc7a223 100644 (file)
@@ -339,18 +339,7 @@ static struct platform_driver r_tpu_device_driver = {
        }
 };
 
-static int __init r_tpu_init(void)
-{
-       return platform_driver_register(&r_tpu_device_driver);
-}
-
-static void __exit r_tpu_exit(void)
-{
-       platform_driver_unregister(&r_tpu_device_driver);
-}
-
-module_init(r_tpu_init);
-module_exit(r_tpu_exit);
+module_platform_driver(r_tpu_device_driver);
 
 MODULE_AUTHOR("Magnus Damm");
 MODULE_DESCRIPTION("Renesas TPU LED Driver");
index 29f8b0f0e2c6f880429aea67bcddee59f52f6b91..bd0a5ed49c42d9473bf1fa2e735500ee8c2f67ad 100644 (file)
@@ -121,18 +121,7 @@ static struct platform_driver s3c24xx_led_driver = {
        },
 };
 
-static int __init s3c24xx_led_init(void)
-{
-       return platform_driver_register(&s3c24xx_led_driver);
-}
-
-static void __exit s3c24xx_led_exit(void)
-{
-       platform_driver_unregister(&s3c24xx_led_driver);
-}
-
-module_init(s3c24xx_led_init);
-module_exit(s3c24xx_led_exit);
+module_platform_driver(s3c24xx_led_driver);
 
 MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
 MODULE_DESCRIPTION("S3C24XX LED driver");
diff --git a/drivers/leds/leds-tca6507.c b/drivers/leds/leds-tca6507.c
new file mode 100644 (file)
index 0000000..dc88c16
--- /dev/null
@@ -0,0 +1,687 @@
+/*
+ * leds-tca6507
+ *
+ * The TCA6507 is a programmable LED controller that can drive 7
+ * separate lines either by holding them low, or by pulsing them
+ * with modulated width.
+ * The modulation can be varied in a simple pattern to produce a blink or
+ * double-blink.
+ *
+ * This driver can configure each line either as a 'GPIO' which is out-only
+ * (no pull-up) or as an LED with variable brightness and hardware-assisted
+ * blinking.
+ *
+ * Apart from OFF and ON there are three programmable brightness levels which
+ * can be programmed from 0 to 15 and indicate how many 500usec intervals in
+ * each 8msec that the led is 'on'.  The levels are named MASTER, BANK0 and
+ * BANK1.
+ *
+ * There are two different blink rates that can be programmed, each with
+ * separate time for rise, on, fall, off and second-off.  Thus if 3 or more
+ * different non-trivial rates are required, software must be used for the extra
+ * rates. The two different blink rates must align with the two levels BANK0 and
+ * BANK1.
+ * This driver does not support double-blink so 'second-off' always matches
+ * 'off'.
+ *
+ * Only 16 different times can be programmed in a roughly logarithmic scale from
+ * 64ms to 16320ms.   Times that cannot be closely matched with these must be
+ * handled in software.  This driver allows 12.5% error in matching.
+ *
+ * This driver does not allow rise/fall rates to be set explicitly.  When trying
+ * to match a given 'on' or 'off' period, an appropriate pair of 'change' and
+ * 'hold' times are chosen to get a close match, with the 'change' being the
+ * smaller.
+ *
+ * Access to the 3 levels and 2 blinks is on a first-come, first-served basis.
+ * Access can be shared by multiple leds if they have the same level and
+ * either same blink rates, or some don't blink.
+ * When a led changes, it relinquishes access and tries again, so it might
+ * lose access to hardware blink.
+ * If a blink engine cannot be allocated, software blink is used.  If the
+ * desired brightness cannot be allocated, the closest available non-zero
+ * brightness is used.  As 'full' is always available, the worst case would be
+ * to have two different blink rates at '1', with Max at '2', then other leds
+ * will have to choose between '2' and '16'.  Hopefully this is not likely.
+ *
+ * Each bank (BANK0 and BANK1) has two usage counts - leds using the brightness
+ * and leds using the blink.  It can only be reprogrammed when the appropriate
+ * counter is zero.  The MASTER level has a single usage count.
+ *
+ * Each led has programmable 'on' and 'off' times in milliseconds.  With each
+ * there is a flag saying if it was explicitly requested or defaulted.
+ * Similarly the banks know if each time was explicit or a default.  Defaults
+ * are permitted to be changed freely - they are not recognised when matching.
+ *
+ *
+ * An led-tca6507 device must be provided with platform data.  This data
+ * lists for each output: the name, default trigger, and whether the signal
+ * is being used as a GPIO rather than an led.  'struct led_platform_data'
+ * is used for this.  If 'name' is NULL, the output isn't used.  If 'flags'
+ * is non-zero, the output is a GPO.  The 'flags' for the first GPIO should
+ * be the base gpio number, or -1.
+ *
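+ * A minimal, purely illustrative platform data sketch (names are
+ * hypothetical and not taken from any real board file):
+ *
+ *   static struct led_info board_tca6507_leds[] = {
+ *           { .name = "status:red", .default_trigger = "heartbeat" },
+ *           { },                                  (output 1 unused)
+ *           { .name = "gpo:reset", .flags = -1 }, (a GPO, no gpio base)
+ *   };
+ *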
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/leds.h>
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <linux/workqueue.h>
+#include <linux/leds-tca6507.h>
+
+/* LED select registers determine the source that drives LED outputs */
+#define TCA6507_LS_LED_OFF     0x0     /* Output HI-Z (off) */
+#define TCA6507_LS_LED_PWM0    0x2     /* Output LOW with Bank0 rate */
+#define TCA6507_LS_LED_PWM1    0x3     /* Output LOW with Bank1 rate */
+#define TCA6507_LS_LED_ON      0x4     /* Output LOW (on) */
+#define TCA6507_LS_LED_MIR     0x5     /* Output LOW with Master Intensity */
+#define TCA6507_LS_BLINK0      0x6     /* Blink at Bank0 rate */
+#define TCA6507_LS_BLINK1      0x7     /* Blink at Bank1 rate */
+
+enum {
+       BANK0,
+       BANK1,
+       MASTER,
+};
+static int bank_source[3] = {
+       TCA6507_LS_LED_PWM0,
+       TCA6507_LS_LED_PWM1,
+       TCA6507_LS_LED_MIR,
+};
+static int blink_source[3] = {
+       TCA6507_LS_BLINK0,
+       TCA6507_LS_BLINK1,
+};
+
+/* PWM registers */
+#define        TCA6507_REG_CNT                 11
+
+#define        TCA6507_FADE_ON                 0x03
+#define        TCA6507_FULL_ON                 0x04
+#define        TCA6507_FADE_OFF                0x05
+#define        TCA6507_FIRST_OFF               0x06
+#define        TCA6507_SECOND_OFF              0x07
+#define        TCA6507_MAX_INTENSITY           0x08
+#define        TCA6507_MASTER_INTENSITY        0x09
+#define        TCA6507_INITIALIZE              0x0A
+
+#define        INIT_CODE                       0x8
+
+#define TIMECODES 16
+static int time_codes[TIMECODES] = {
+       0, 64, 128, 192, 256, 384, 512, 768,
+       1024, 1536, 2048, 3072, 4096, 5760, 8128, 16320
+};
+
+/* Convert an led.brightness level (0..255) to a TCA6507 level (0..15) */
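+/* (e.g. brightness 255 -> level 15, 100 -> 6; TO_BRIGHT(6) below gives 0x6f) */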
+static inline int TO_LEVEL(int brightness)
+{
+       return brightness >> 4;
+}
+
+/* ...and convert back */
+static inline int TO_BRIGHT(int level)
+{
+       if (level)
+               return (level << 4) | 0xf;
+       return 0;
+}
+
+#define NUM_LEDS 7
+struct tca6507_chip {
+       int                     reg_set;        /* a '1' means the register
+                                                * should be written */
+       u8                      reg_file[TCA6507_REG_CNT];
+       /* Bank 0 is Master Intensity */
+       struct bank {
+               int level;
+               int ontime, offtime;
+               int on_dflt, off_dflt;
+               int time_use, level_use;
+       } bank[3];
+       struct i2c_client       *client;
+       struct work_struct      work;
+       spinlock_t              lock;
+
+       struct tca6507_led {
+               struct tca6507_chip     *chip;
+               struct led_classdev     led_cdev;
+               int                     num;
+               int                     ontime, offtime;
+               int                     on_dflt, off_dflt;
+               int                     bank;   /* Bank used, or -1 */
+               int                     blink;  /* 1 if we are hardware-blinking */
+       } leds[NUM_LEDS];
+       struct gpio_chip                gpio;
+       const char                      *gpio_name[NUM_LEDS];
+       int                             gpio_map[NUM_LEDS];
+};
+
+static const struct i2c_device_id tca6507_id[] = {
+       { "tca6507" },
+       { }
+};
+MODULE_DEVICE_TABLE(i2c, tca6507_id);
+
+static int choose_times(int msec, int *c1p, int *c2p)
+{
+       /*
+        * Choose two timecodes which add to 'msec' as near as possible.
+        * The first returned should be the larger.
+        * If cannot get within 1/8, fail.
+        * If two possibilities are equally good (e.g. 512+0, 256+256), choose
+        * the first pair so there is more change-time visible (i.e. it is
+        * softer).
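+        * For example, msec = 300 gives bounds [262, 337]; the first
+        * acceptable pair found is 192 + 128 = 320, which is kept in
+        * preference to the equally close 256 + 64.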
+        */
+       int c1, c2;
+       int tmax = msec * 9 / 8;
+       int tmin = msec * 7 / 8;
+       int diff = 65536;
+
+       for (c1 = 1; c1 < TIMECODES; c1++) {
+               int t = time_codes[c1];
+               if (t*2 < tmin)
+                       continue;
+               if (t > tmax)
+                       break;
+               for (c2 = 0; c2 <= c1; c2++) {
+                       int tt = t + time_codes[c2];
+                       int d;
+                       if (tt < tmin)
+                               continue;
+                       if (tt > tmax)
+                               break;
+                       /* This works! */
+                       d = abs(msec - tt);
+                       if (d >= diff)
+                               continue;
+                       /* Best yet */
+                       *c1p = c1;
+                       *c2p = c2;
+                       diff = d;
+                       if (d == 0)
+                               return 0;
+               }
+       }
+       if (diff < 65536)
+               return 0;
+       /* No close match */
+       return -EINVAL;
+}
+
+/*
+ * Update the register file with the appropriate 3-bit state for
+ * the given led.
+ */
+static void set_select(struct tca6507_chip *tca, int led, int val)
+{
+       int mask = (1 << led);
+       int bit;
+
+       for (bit = 0; bit < 3; bit++) {
+               int n = tca->reg_file[bit] & ~mask;
+               if (val & (1 << bit))
+                       n |= mask;
+               if (tca->reg_file[bit] != n) {
+                       tca->reg_file[bit] = n;
+                       tca->reg_set |= (1 << bit);
+               }
+       }
+}
+
+/* Update the register file with the appropriate 4-bit code for
+ * one bank or other.  This can be used for timers, for levels, or
+ * for initialisation.
+ */
+static void set_code(struct tca6507_chip *tca, int reg, int bank, int new)
+{
+       int mask = bank ? 0xF0 : 0xF;
+       int n = tca->reg_file[reg] & ~mask;
+       if (bank)
+               new <<= 4;
+       n |= new;
+       if (tca->reg_file[reg] != n) {
+               tca->reg_file[reg] = n;
+               tca->reg_set |= 1 << reg;
+       }
+}
+
+/* Update brightness level. */
+static void set_level(struct tca6507_chip *tca, int bank, int level)
+{
+       switch (bank) {
+       case BANK0:
+       case BANK1:
+               set_code(tca, TCA6507_MAX_INTENSITY, bank, level);
+               break;
+       case MASTER:
+               set_code(tca, TCA6507_MASTER_INTENSITY, 0, level);
+               break;
+       }
+       tca->bank[bank].level = level;
+}
+
+/* Record all relevant time code for a given bank */
+static void set_times(struct tca6507_chip *tca, int bank)
+{
+       int c1, c2;
+
+       choose_times(tca->bank[bank].ontime, &c1, &c2);
+       set_code(tca, TCA6507_FADE_ON, bank, c2);
+       set_code(tca, TCA6507_FULL_ON, bank, c1);
+       tca->bank[bank].ontime = time_codes[c1] + time_codes[c2];
+
+       choose_times(tca->bank[bank].offtime, &c1, &c2);
+       set_code(tca, TCA6507_FADE_OFF, bank, c2);
+       set_code(tca, TCA6507_FIRST_OFF, bank, c1);
+       set_code(tca, TCA6507_SECOND_OFF, bank, c1);
+       tca->bank[bank].offtime = time_codes[c1] + time_codes[c2];
+
+       set_code(tca, TCA6507_INITIALIZE, bank, INIT_CODE);
+}
+
+/* Write all needed register of tca6507 */
+
+static void tca6507_work(struct work_struct *work)
+{
+       struct tca6507_chip *tca = container_of(work, struct tca6507_chip,
+                                               work);
+       struct i2c_client *cl = tca->client;
+       int set;
+       u8 file[TCA6507_REG_CNT];
+       int r;
+
+       spin_lock(&tca->lock);
+       set = tca->reg_set;
+       memcpy(file, tca->reg_file, TCA6507_REG_CNT);
+       tca->reg_set = 0;
+       spin_unlock(&tca->lock);
+
+       for (r = 0; r < TCA6507_REG_CNT; r++)
+               if (set & (1<<r))
+                       i2c_smbus_write_byte_data(cl, r, file[r]);
+}
+
+static void led_release(struct tca6507_led *led)
+{
+       /* If led owns any resource, release it. */
+       struct tca6507_chip *tca = led->chip;
+       if (led->bank >= 0) {
+               struct bank *b = tca->bank + led->bank;
+               if (led->blink)
+                       b->time_use--;
+               b->level_use--;
+       }
+       led->blink = 0;
+       led->bank = -1;
+}
+
+static int led_prepare(struct tca6507_led *led)
+{
+       /* Assign this led to a bank. configuring that bank if necessary */
+       int level = TO_LEVEL(led->led_cdev.brightness);
+       struct tca6507_chip *tca = led->chip;
+       int c1, c2;
+       int i;
+       struct bank *b;
+       int need_init = 0;
+
+       led->led_cdev.brightness = TO_BRIGHT(level);
+       if (level == 0) {
+               set_select(tca, led->num, TCA6507_LS_LED_OFF);
+               return 0;
+       }
+
+       if (led->ontime == 0 || led->offtime == 0) {
+               /*
+                * Just set the brightness, choosing first usable bank.
+                * If none perfect, choose best.
+                * Count backwards so we check MASTER bank first
+                * to avoid wasting a timer.
+                */
+               int best = -1;/* full-on */
+               int diff = 15-level;
+
+               if (level == 15) {
+                       set_select(tca, led->num, TCA6507_LS_LED_ON);
+                       return 0;
+               }
+
+               for (i = MASTER; i >= BANK0; i--) {
+                       int d;
+                       if (tca->bank[i].level == level ||
+                           tca->bank[i].level_use == 0) {
+                               best = i;
+                               break;
+                       }
+                       d = abs(level - tca->bank[i].level);
+                       if (d < diff) {
+                               diff = d;
+                               best = i;
+                       }
+               }
+               if (best == -1) {
+                       /* Best brightness is full-on */
+                       set_select(tca, led->num, TCA6507_LS_LED_ON);
+                       led->led_cdev.brightness = LED_FULL;
+                       return 0;
+               }
+
+               if (!tca->bank[best].level_use)
+                       set_level(tca, best, level);
+
+               tca->bank[best].level_use++;
+               led->bank = best;
+               set_select(tca, led->num, bank_source[best]);
+               led->led_cdev.brightness = TO_BRIGHT(tca->bank[best].level);
+               return 0;
+       }
+
+       /* We have on/off time so we need to try to allocate a timing bank. */
+       if (choose_times(led->ontime, &c1, &c2) < 0)
+               return -EINVAL;
+       if (choose_times(led->offtime, &c1, &c2) < 0)
+               return -EINVAL;
+
+       for (i = BANK0; i <= BANK1; i++) {
+               if (tca->bank[i].level_use == 0)
+                       /* not in use - it is ours! */
+                       break;
+               if (tca->bank[i].level != level)
+                       /* Incompatible level - skip */
+                       /* FIX: if timer matches we maybe should consider
+                        * this anyway...
+                        */
+                       continue;
+
+               if (tca->bank[i].time_use == 0)
+                       /* Timer not in use, and level matches - use it */
+                       break;
+
+               if (!(tca->bank[i].on_dflt ||
+                     led->on_dflt ||
+                     tca->bank[i].ontime == led->ontime))
+                       /* on time is incompatible */
+                       continue;
+
+               if (!(tca->bank[i].off_dflt ||
+                     led->off_dflt ||
+                     tca->bank[i].offtime == led->offtime))
+                       /* off time is incompatible */
+                       continue;
+
+               /* looks like a suitable match */
+               break;
+       }
+
+       if (i > BANK1)
+               /* Nothing matches - how sad */
+               return -EINVAL;
+
+       b = &tca->bank[i];
+       if (b->level_use == 0)
+               set_level(tca, i, level);
+       b->level_use++;
+       led->bank = i;
+
+       if (b->on_dflt ||
+           !led->on_dflt ||
+           b->time_use == 0) {
+               b->ontime = led->ontime;
+               b->on_dflt = led->on_dflt;
+               need_init = 1;
+       }
+
+       if (b->off_dflt ||
+           !led->off_dflt ||
+           b->time_use == 0) {
+               b->offtime = led->offtime;
+               b->off_dflt = led->off_dflt;
+               need_init = 1;
+       }
+
+       if (need_init)
+               set_times(tca, i);
+
+       led->ontime = b->ontime;
+       led->offtime = b->offtime;
+
+       b->time_use++;
+       led->blink = 1;
+       led->led_cdev.brightness = TO_BRIGHT(b->level);
+       set_select(tca, led->num, blink_source[i]);
+       return 0;
+}
+
+static int led_assign(struct tca6507_led *led)
+{
+       struct tca6507_chip *tca = led->chip;
+       int err;
+
+       spin_lock(&tca->lock);
+       led_release(led);
+       err = led_prepare(led);
+       if (err) {
+               /*
+                * This can only fail on timer setup; in that case we fall
+                * back to a steady (non-blinking) level.
+                */
+               led->ontime = 0;
+               led->offtime = 0;
+               led_prepare(led);
+       }
+       spin_unlock(&tca->lock);
+
+       if (tca->reg_set)
+               schedule_work(&tca->work);
+       return err;
+}
+
+static void tca6507_brightness_set(struct led_classdev *led_cdev,
+                                  enum led_brightness brightness)
+{
+       struct tca6507_led *led = container_of(led_cdev, struct tca6507_led,
+                                              led_cdev);
+       led->led_cdev.brightness = brightness;
+       led->ontime = 0;
+       led->offtime = 0;
+       led_assign(led);
+}
+
+static int tca6507_blink_set(struct led_classdev *led_cdev,
+                            unsigned long *delay_on,
+                            unsigned long *delay_off)
+{
+       struct tca6507_led *led = container_of(led_cdev, struct tca6507_led,
+                                              led_cdev);
+
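+       /*
+        * A delay of 0 asks the driver to choose a default; flag it so
+        * this LED can share whatever timing its bank already uses.
+        */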
+       if (*delay_on == 0)
+               led->on_dflt = 1;
+       else if (delay_on != &led_cdev->blink_delay_on)
+               led->on_dflt = 0;
+       led->ontime = *delay_on;
+
+       if (*delay_off == 0)
+               led->off_dflt = 1;
+       else if (delay_off != &led_cdev->blink_delay_off)
+               led->off_dflt = 0;
+       led->offtime = *delay_off;
+
+       if (led->ontime == 0)
+               led->ontime = 512;
+       if (led->offtime == 0)
+               led->offtime = 512;
+
+       if (led->led_cdev.brightness == LED_OFF)
+               led->led_cdev.brightness = LED_FULL;
+       if (led_assign(led) < 0) {
+               led->ontime = 0;
+               led->offtime = 0;
+               led->led_cdev.brightness = LED_OFF;
+               return -EINVAL;
+       }
+       *delay_on = led->ontime;
+       *delay_off = led->offtime;
+       return 0;
+}
+
+static void tca6507_gpio_set_value(struct gpio_chip *gc,
+                                  unsigned offset, int val)
+{
+       struct tca6507_chip *tca = container_of(gc, struct tca6507_chip, gpio);
+
+       spin_lock(&tca->lock);
+       /*
+        * 'OFF' is floating high, and 'ON' is pulled down, so it has the
+        * inverse sense of 'val'.
+        */
+       set_select(tca, tca->gpio_map[offset],
+                  val ? TCA6507_LS_LED_OFF : TCA6507_LS_LED_ON);
+       spin_unlock(&tca->lock);
+       if (tca->reg_set)
+               schedule_work(&tca->work);
+}
+
+static int tca6507_gpio_direction_output(struct gpio_chip *gc,
+                                         unsigned offset, int val)
+{
+       tca6507_gpio_set_value(gc, offset, val);
+       return 0;
+}
+
+static int __devinit tca6507_probe(struct i2c_client *client,
+                                  const struct i2c_device_id *id)
+{
+       struct tca6507_chip *tca;
+       struct i2c_adapter *adapter;
+       struct tca6507_platform_data *pdata;
+       int err;
+       int i = 0;
+       int gpios = 0;
+
+       adapter = to_i2c_adapter(client->dev.parent);
+       pdata = client->dev.platform_data;
+
+       if (!i2c_check_functionality(adapter, I2C_FUNC_I2C))
+               return -EIO;
+
+       if (!pdata || pdata->leds.num_leds != NUM_LEDS) {
+               dev_err(&client->dev, "Need %d entries in platform-data list\n",
+                       NUM_LEDS);
+               return -ENODEV;
+       }
+       err = -ENOMEM;
+       tca = kzalloc(sizeof(*tca), GFP_KERNEL);
+       if (!tca)
+               goto exit;
+
+       tca->client = client;
+       INIT_WORK(&tca->work, tca6507_work);
+       spin_lock_init(&tca->lock);
+
+       for (i = 0; i < NUM_LEDS; i++) {
+               struct tca6507_led *l = tca->leds + i;
+
+               l->chip = tca;
+               l->num = i;
+               if (pdata->leds.leds[i].name && !pdata->leds.leds[i].flags) {
+                       l->led_cdev.name = pdata->leds.leds[i].name;
+                       l->led_cdev.default_trigger
+                               = pdata->leds.leds[i].default_trigger;
+                       l->led_cdev.brightness_set = tca6507_brightness_set;
+                       l->led_cdev.blink_set = tca6507_blink_set;
+                       l->bank = -1;
+                       err = led_classdev_register(&client->dev,
+                                                   &l->led_cdev);
+                       if (err < 0)
+                               goto exit;
+               }
+               if (pdata->leds.leds[i].name && pdata->leds.leds[i].flags) {
+                       /* Configure as a gpio */
+                       tca->gpio_name[gpios] = pdata->leds.leds[i].name;
+                       tca->gpio_map[gpios] = i;
+                       gpios++;
+               }
+       }
+       if (gpios) {
+               tca->gpio.label = "gpio-tca6507";
+               tca->gpio.names = tca->gpio_name;
+               tca->gpio.ngpio = gpios;
+               tca->gpio.base = pdata->gpio_base;
+               tca->gpio.owner = THIS_MODULE;
+               tca->gpio.direction_output = tca6507_gpio_direction_output;
+               tca->gpio.set = tca6507_gpio_set_value;
+               tca->gpio.dev = &client->dev;
+               err = gpiochip_add(&tca->gpio);
+               if (err) {
+                       tca->gpio.ngpio = 0;
+                       goto exit;
+               }
+               if (pdata->setup)
+                       pdata->setup(tca->gpio.base, tca->gpio.ngpio);
+       }
+       i2c_set_clientdata(client, tca);
+       /* set all registers to known state - zero */
+       tca->reg_set = 0x7f;
+       schedule_work(&tca->work);
+
+       return 0;
+exit:
+       while (i--)
+               if (tca->leds[i].led_cdev.name)
+                       led_classdev_unregister(&tca->leds[i].led_cdev);
+       kfree(tca);
+       return err;
+}
+
+static int __devexit tca6507_remove(struct i2c_client *client)
+{
+       int i;
+       struct tca6507_chip *tca = i2c_get_clientdata(client);
+       struct tca6507_led *tca_leds = tca->leds;
+
+       for (i = 0; i < NUM_LEDS; i++) {
+               if (tca_leds[i].led_cdev.name)
+                       led_classdev_unregister(&tca_leds[i].led_cdev);
+       }
+       if (tca->gpio.ngpio) {
+               int err = gpiochip_remove(&tca->gpio);
+               if (err)
+                       dev_err(&tca->client->dev, "%s failed, %d\n",
+                               "gpiochip_remove()", err);
+       }
+       cancel_work_sync(&tca->work);
+       kfree(tca);
+       i2c_set_clientdata(client, NULL);
+
+       return 0;
+}
+
+static struct i2c_driver tca6507_driver = {
+       .driver   = {
+               .name    = "leds-tca6507",
+               .owner   = THIS_MODULE,
+       },
+       .probe    = tca6507_probe,
+       .remove   = __devexit_p(tca6507_remove),
+       .id_table = tca6507_id,
+};
+
+static int __init tca6507_leds_init(void)
+{
+       return i2c_add_driver(&tca6507_driver);
+}
+
+static void __exit tca6507_leds_exit(void)
+{
+       i2c_del_driver(&tca6507_driver);
+}
+
+module_init(tca6507_leds_init);
+module_exit(tca6507_leds_exit);
+
+MODULE_AUTHOR("NeilBrown <neilb@suse.de>");
+MODULE_DESCRIPTION("TCA6507 LED/GPO driver");
+MODULE_LICENSE("GPL v2");
index b1eb34c3e81f41d5eafdd00ed112373a5f25c970..444a68d8e17e1f9b49da76b4dba18e03fb27b329 100644 (file)
@@ -325,17 +325,7 @@ static struct platform_driver wm831x_status_driver = {
        .remove = wm831x_status_remove,
 };
 
-static int __devinit wm831x_status_init(void)
-{
-       return platform_driver_register(&wm831x_status_driver);
-}
-module_init(wm831x_status_init);
-
-static void wm831x_status_exit(void)
-{
-       platform_driver_unregister(&wm831x_status_driver);
-}
-module_exit(wm831x_status_exit);
+module_platform_driver(wm831x_status_driver);
 
 MODULE_AUTHOR("Mark Brown <broonie@opensource.wolfsonmicro.com>");
 MODULE_DESCRIPTION("WM831x status LED driver");
index 4a1276578352283544ceb6fda2ff176d0772d887..390c0f679628f74780dc695db48424371b53aaf4 100644 (file)
@@ -295,17 +295,7 @@ static struct platform_driver wm8350_led_driver = {
        .shutdown = wm8350_led_shutdown,
 };
 
-static int __devinit wm8350_led_init(void)
-{
-       return platform_driver_register(&wm8350_led_driver);
-}
-module_init(wm8350_led_init);
-
-static void wm8350_led_exit(void)
-{
-       platform_driver_unregister(&wm8350_led_driver);
-}
-module_exit(wm8350_led_exit);
+module_platform_driver(wm8350_led_driver);
 
 MODULE_AUTHOR("Mark Brown");
 MODULE_DESCRIPTION("WM8350 LED driver");
index 9729b92fbfdd3150280328a2be3d5872beacd618..b4a273f5f1b1df6c27d638694d6ae951c5bf303e 100644 (file)
@@ -1034,12 +1034,11 @@ static int mspro_block_read_attributes(struct memstick_dev *card)
        }
        msb->attr_group.name = "media_attributes";
 
-       buffer = kmalloc(attr_len, GFP_KERNEL);
+       buffer = kmemdup((char *)attr, attr_len, GFP_KERNEL);
        if (!buffer) {
                rc = -ENOMEM;
                goto out_free_attr;
        }
-       memcpy(buffer, (char *)attr, attr_len);
 
        for (cnt = 0; cnt < attr_count; ++cnt) {
                s_attr = kzalloc(sizeof(struct mspro_sys_attr), GFP_KERNEL);
index f3cdce9a85a6602f2078ce1ffa67d095a26e4e7e..b7883e529d346bb659a4ec516bccda6424c88b29 100644 (file)
@@ -311,8 +311,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper,
        char *dst;
 
        if (reason != KMSG_DUMP_OOPS &&
-           reason != KMSG_DUMP_PANIC &&
-           reason != KMSG_DUMP_KEXEC)
+           reason != KMSG_DUMP_PANIC)
                return;
 
        /* Only dump oopses if dump_oops is set */
index 760c6d7624fe505f5da3f40060a1df9d0eaf12b0..833c0ec61656cfe48b401628fafdf235b20df049 100644 (file)
@@ -50,7 +50,7 @@
  */
 #undef START_IN_KERNEL_MODE
 
-#define DRV_VER "0.5.24"
+#define DRV_VER "0.5.26"
 
 /*
  * According to the Atom N270 datasheet,
@@ -83,8 +83,8 @@ static int kernelmode;
 #endif
 
 static unsigned int interval = 10;
-static unsigned int fanon = 63000;
-static unsigned int fanoff = 58000;
+static unsigned int fanon = 60000;
+static unsigned int fanoff = 53000;
 static unsigned int verbose;
 static unsigned int fanstate = ACERHDF_FAN_AUTO;
 static char force_bios[16];
@@ -150,6 +150,8 @@ static const struct bios_settings_t bios_tbl[] = {
        {"Acer", "AOA150", "v0.3308", 0x55, 0x58, {0x20, 0x00} },
        {"Acer", "AOA150", "v0.3309", 0x55, 0x58, {0x20, 0x00} },
        {"Acer", "AOA150", "v0.3310", 0x55, 0x58, {0x20, 0x00} },
+       /* LT1005u */
+       {"Acer", "LT-10Q", "v0.3310", 0x55, 0x58, {0x20, 0x00} },
        /* Acer 1410 */
        {"Acer", "Aspire 1410", "v0.3108", 0x55, 0x58, {0x9e, 0x00} },
        {"Acer", "Aspire 1410", "v0.3113", 0x55, 0x58, {0x9e, 0x00} },
@@ -161,6 +163,7 @@ static const struct bios_settings_t bios_tbl[] = {
        {"Acer", "Aspire 1410", "v1.3303", 0x55, 0x58, {0x9e, 0x00} },
        {"Acer", "Aspire 1410", "v1.3308", 0x55, 0x58, {0x9e, 0x00} },
        {"Acer", "Aspire 1410", "v1.3310", 0x55, 0x58, {0x9e, 0x00} },
+       {"Acer", "Aspire 1410", "v1.3314", 0x55, 0x58, {0x9e, 0x00} },
        /* Acer 1810xx */
        {"Acer", "Aspire 1810TZ", "v0.3108", 0x55, 0x58, {0x9e, 0x00} },
        {"Acer", "Aspire 1810T",  "v0.3108", 0x55, 0x58, {0x9e, 0x00} },
@@ -183,29 +186,44 @@ static const struct bios_settings_t bios_tbl[] = {
        {"Acer", "Aspire 1810TZ", "v1.3310", 0x55, 0x58, {0x9e, 0x00} },
        {"Acer", "Aspire 1810T",  "v1.3310", 0x55, 0x58, {0x9e, 0x00} },
        {"Acer", "Aspire 1810TZ", "v1.3314", 0x55, 0x58, {0x9e, 0x00} },
+       {"Acer", "Aspire 1810T",  "v1.3314", 0x55, 0x58, {0x9e, 0x00} },
        /* Acer 531 */
+       {"Acer", "AO531h", "v0.3104", 0x55, 0x58, {0x20, 0x00} },
        {"Acer", "AO531h", "v0.3201", 0x55, 0x58, {0x20, 0x00} },
+       {"Acer", "AO531h", "v0.3304", 0x55, 0x58, {0x20, 0x00} },
+       /* Acer 751 */
+       {"Acer", "AO751h", "V0.3212", 0x55, 0x58, {0x21, 0x00} },
+       /* Acer 1825 */
+       {"Acer", "Aspire 1825PTZ", "V1.3118", 0x55, 0x58, {0x9e, 0x00} },
+       {"Acer", "Aspire 1825PTZ", "V1.3127", 0x55, 0x58, {0x9e, 0x00} },
+       /* Acer TravelMate 7730 */
+       {"Acer", "TravelMate 7730G", "v0.3509", 0x55, 0x58, {0xaf, 0x00} },
        /* Gateway */
-       {"Gateway", "AOA110", "v0.3103", 0x55, 0x58, {0x21, 0x00} },
-       {"Gateway", "AOA150", "v0.3103", 0x55, 0x58, {0x20, 0x00} },
-       {"Gateway", "LT31",   "v1.3103", 0x55, 0x58, {0x9e, 0x00} },
-       {"Gateway", "LT31",   "v1.3201", 0x55, 0x58, {0x9e, 0x00} },
-       {"Gateway", "LT31",   "v1.3302", 0x55, 0x58, {0x9e, 0x00} },
+       {"Gateway", "AOA110", "v0.3103",  0x55, 0x58, {0x21, 0x00} },
+       {"Gateway", "AOA150", "v0.3103",  0x55, 0x58, {0x20, 0x00} },
+       {"Gateway", "LT31",   "v1.3103",  0x55, 0x58, {0x9e, 0x00} },
+       {"Gateway", "LT31",   "v1.3201",  0x55, 0x58, {0x9e, 0x00} },
+       {"Gateway", "LT31",   "v1.3302",  0x55, 0x58, {0x9e, 0x00} },
+       {"Gateway", "LT31",   "v1.3303t", 0x55, 0x58, {0x9e, 0x00} },
        /* Packard Bell */
-       {"Packard Bell", "DOA150", "v0.3104", 0x55, 0x58, {0x21, 0x00} },
-       {"Packard Bell", "DOA150", "v0.3105", 0x55, 0x58, {0x20, 0x00} },
-       {"Packard Bell", "AOA110", "v0.3105", 0x55, 0x58, {0x21, 0x00} },
-       {"Packard Bell", "AOA150", "v0.3105", 0x55, 0x58, {0x20, 0x00} },
-       {"Packard Bell", "DOTMU",  "v1.3303", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMU",  "v0.3120", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMU",  "v0.3108", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMU",  "v0.3113", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMU",  "v0.3115", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMU",  "v0.3117", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMU",  "v0.3119", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMU",  "v1.3204", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMA",  "v1.3201", 0x55, 0x58, {0x9e, 0x00} },
-       {"Packard Bell", "DOTMA",  "v1.3302", 0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOA150",  "v0.3104",  0x55, 0x58, {0x21, 0x00} },
+       {"Packard Bell", "DOA150",  "v0.3105",  0x55, 0x58, {0x20, 0x00} },
+       {"Packard Bell", "AOA110",  "v0.3105",  0x55, 0x58, {0x21, 0x00} },
+       {"Packard Bell", "AOA150",  "v0.3105",  0x55, 0x58, {0x20, 0x00} },
+       {"Packard Bell", "ENBFT",   "V1.3118",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "ENBFT",   "V1.3127",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMU",   "v1.3303",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMU",   "v0.3120",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMU",   "v0.3108",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMU",   "v0.3113",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMU",   "v0.3115",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMU",   "v0.3117",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMU",   "v0.3119",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMU",   "v1.3204",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMA",   "v1.3201",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMA",   "v1.3302",  0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTMA",   "v1.3303t", 0x55, 0x58, {0x9e, 0x00} },
+       {"Packard Bell", "DOTVR46", "v1.3308",  0x55, 0x58, {0x9e, 0x00} },
        /* pewpew-terminator */
        {"", "", "", 0, 0, {0, 0} }
 };
@@ -704,15 +722,20 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Peter Feuerer");
 MODULE_DESCRIPTION("Aspire One temperature and fan driver");
 MODULE_ALIAS("dmi:*:*Acer*:pnAOA*:");
+MODULE_ALIAS("dmi:*:*Acer*:pnAO751h*:");
 MODULE_ALIAS("dmi:*:*Acer*:pnAspire*1410*:");
 MODULE_ALIAS("dmi:*:*Acer*:pnAspire*1810*:");
+MODULE_ALIAS("dmi:*:*Acer*:pnAspire*1825PTZ:");
 MODULE_ALIAS("dmi:*:*Acer*:pnAO531*:");
+MODULE_ALIAS("dmi:*:*Acer*:TravelMate*7730G:");
 MODULE_ALIAS("dmi:*:*Gateway*:pnAOA*:");
 MODULE_ALIAS("dmi:*:*Gateway*:pnLT31*:");
 MODULE_ALIAS("dmi:*:*Packard*Bell*:pnAOA*:");
 MODULE_ALIAS("dmi:*:*Packard*Bell*:pnDOA*:");
 MODULE_ALIAS("dmi:*:*Packard*Bell*:pnDOTMU*:");
+MODULE_ALIAS("dmi:*:*Packard*Bell*:pnENBFT*:");
 MODULE_ALIAS("dmi:*:*Packard*Bell*:pnDOTMA*:");
+MODULE_ALIAS("dmi:*:*Packard*Bell*:pnDOTVR46*:");
 
 module_init(acerhdf_init);
 module_exit(acerhdf_exit);
index c006dee5ebfe84bb33a962dbdddf4ff9440c74be..f5d369d379e5382d69399d067ee2065a93c2efb6 100644 (file)
@@ -347,6 +347,7 @@ static void sony_laptop_report_input_event(u8 event)
        struct input_dev *jog_dev = sony_laptop_input.jog_dev;
        struct input_dev *key_dev = sony_laptop_input.key_dev;
        struct sony_laptop_keypress kp = { NULL };
+       int scancode = -1;
 
        if (event == SONYPI_EVENT_FNKEY_RELEASED ||
                        event == SONYPI_EVENT_ANYBUTTON_RELEASED) {
@@ -380,8 +381,9 @@ static void sony_laptop_report_input_event(u8 event)
                        dprintk("sony_laptop_report_input_event, event not known: %d\n", event);
                        break;
                }
-               if (sony_laptop_input_index[event] != -1) {
-                       kp.key = sony_laptop_input_keycode_map[sony_laptop_input_index[event]];
+               scancode = sony_laptop_input_index[event];
+               if (scancode != -1) {
+                       kp.key = sony_laptop_input_keycode_map[scancode];
                        if (kp.key != KEY_UNKNOWN)
                                kp.dev = key_dev;
                }
@@ -389,9 +391,13 @@ static void sony_laptop_report_input_event(u8 event)
        }
 
        if (kp.dev) {
+               /*
+                * If we have a scancode we emit it so we can always remap the
+                * key
+                */
+               if (scancode != -1)
+                       input_event(kp.dev, EV_MSC, MSC_SCAN, scancode);
                input_report_key(kp.dev, kp.key, 1);
-               /* we emit the scancode so we can always remap the key */
-               input_event(kp.dev, EV_MSC, MSC_SCAN, event);
                input_sync(kp.dev);
 
                /* schedule key release */
@@ -466,7 +472,7 @@ static int sony_laptop_setup_input(struct acpi_device *acpi_device)
        jog_dev->name = "Sony Vaio Jogdial";
        jog_dev->id.bustype = BUS_ISA;
        jog_dev->id.vendor = PCI_VENDOR_ID_SONY;
-       key_dev->dev.parent = &acpi_device->dev;
+       jog_dev->dev.parent = &acpi_device->dev;
 
        input_set_capability(jog_dev, EV_KEY, BTN_MIDDLE);
        input_set_capability(jog_dev, EV_REL, REL_WHEEL);
index e346705aae92f1ebae4570f7f13adc77adc89fef..df7bfc304c5e8bbf3191efdaace13bcf2cf79e3a 100644 (file)
@@ -90,7 +90,7 @@ static int ab8500_rtc_read_time(struct device *dev, struct rtc_time *tm)
 
        /* Early AB8500 chips will not clear the rtc read request bit */
        if (abx500_get_chip_id(dev) == 0) {
-               msleep(1);
+               usleep_range(1000, 1000);
        } else {
                /* Wait for some cycles after enabling the rtc read in ab8500 */
                while (time_before(jiffies, timeout)) {
@@ -102,7 +102,7 @@ static int ab8500_rtc_read_time(struct device *dev, struct rtc_time *tm)
                        if (!(value & RTC_READ_REQUEST))
                                break;
 
-                       msleep(1);
+                       usleep_range(1000, 5000);
                }
        }
 
@@ -258,6 +258,109 @@ static int ab8500_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm)
        return ab8500_rtc_irq_enable(dev, alarm->enabled);
 }
 
+
+static int ab8500_rtc_set_calibration(struct device *dev, int calibration)
+{
+       int retval;
+       u8  rtccal = 0;
+
+       /*
+        * Check that the calibration value (which is in units of 0.5
+        * parts-per-million) is in the AB8500's range for RtcCalibration
+        * register. -128 (0x80) is not permitted because the AB8500 uses
+        * a sign-bit rather than two's complement, so 0x80 is just another
+        * representation of zero.
+        */
+       if ((calibration < -127) || (calibration > 127)) {
+               dev_err(dev, "RtcCalibration value outside permitted range\n");
+               return -EINVAL;
+       }
+
+       /*
+        * The AB8500 stores the sign in bit7 and the magnitude in bits 0-6,
+        * so convert from two's complement to that representation before
+        * writing the RtcCalibration register (e.g. -3 becomes 0x83).
+        */
+       if (calibration >= 0)
+               rtccal = 0x7F & calibration;
+       else
+               rtccal = ~(calibration - 1) | 0x80;
+
+       retval = abx500_set_register_interruptible(dev, AB8500_RTC,
+                       AB8500_RTC_CALIB_REG, rtccal);
+
+       return retval;
+}
+
+static int ab8500_rtc_get_calibration(struct device *dev, int *calibration)
+{
+       int retval;
+       u8  rtccal = 0;
+
+       retval =  abx500_get_register_interruptible(dev, AB8500_RTC,
+                       AB8500_RTC_CALIB_REG, &rtccal);
+       if (retval >= 0) {
+               /*
+                * The AB8500 stores the sign in bit7 and the magnitude in
+                * bits 0-6, so convert the RtcCalibration register value
+                * back into a two's complement signed value.
+                */
+               if (rtccal & 0x80)
+                       *calibration = 0 - (rtccal & 0x7F);
+               else
+                       *calibration = 0x7F & rtccal;
+       }
+
+       return retval;
+}
+
+static ssize_t ab8500_sysfs_store_rtc_calibration(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t count)
+{
+       int retval;
+       int calibration = 0;
+
+       if (sscanf(buf, " %i ", &calibration) != 1) {
+               dev_err(dev, "Failed to store RTC calibration attribute\n");
+               return -EINVAL;
+       }
+
+       retval = ab8500_rtc_set_calibration(dev, calibration);
+
+       return retval ? retval : count;
+}
+
+static ssize_t ab8500_sysfs_show_rtc_calibration(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       int  retval = 0;
+       int  calibration = 0;
+
+       retval = ab8500_rtc_get_calibration(dev, &calibration);
+       if (retval < 0) {
+               dev_err(dev, "Failed to read RTC calibration attribute\n");
+               sprintf(buf, "0\n");
+               return retval;
+       }
+
+       return sprintf(buf, "%d\n", calibration);
+}
+
+static DEVICE_ATTR(rtc_calibration, S_IRUGO | S_IWUSR,
+                  ab8500_sysfs_show_rtc_calibration,
+                  ab8500_sysfs_store_rtc_calibration);
+
+static int ab8500_sysfs_rtc_register(struct device *dev)
+{
+       return device_create_file(dev, &dev_attr_rtc_calibration);
+}
+
+static void ab8500_sysfs_rtc_unregister(struct device *dev)
+{
+       device_remove_file(dev, &dev_attr_rtc_calibration);
+}
+
 static irqreturn_t rtc_alarm_handler(int irq, void *data)
 {
        struct rtc_device *rtc = data;
@@ -295,7 +398,7 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev)
                return err;
 
        /* Wait for reset by the PorRtc */
-       msleep(1);
+       usleep_range(1000, 5000);
 
        err = abx500_get_register_interruptible(&pdev->dev, AB8500_RTC,
                AB8500_RTC_STAT_REG, &rtc_ctrl);
@@ -308,6 +411,8 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev)
                return -ENODEV;
        }
 
+       device_init_wakeup(&pdev->dev, true);
+
        rtc = rtc_device_register("ab8500-rtc", &pdev->dev, &ab8500_rtc_ops,
                        THIS_MODULE);
        if (IS_ERR(rtc)) {
@@ -316,8 +421,8 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev)
                return err;
        }
 
-       err = request_threaded_irq(irq, NULL, rtc_alarm_handler, 0,
-                                  "ab8500-rtc", rtc);
+       err = request_threaded_irq(irq, NULL, rtc_alarm_handler,
+               IRQF_NO_SUSPEND, "ab8500-rtc", rtc);
        if (err < 0) {
                rtc_device_unregister(rtc);
                return err;
@@ -325,6 +430,13 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, rtc);
 
+
+       err = ab8500_sysfs_rtc_register(&pdev->dev);
+       if (err) {
+               dev_err(&pdev->dev, "sysfs RTC failed to register\n");
+               return err;
+       }
+
        return 0;
 }
 
@@ -333,6 +445,8 @@ static int __devexit ab8500_rtc_remove(struct platform_device *pdev)
        struct rtc_device *rtc = platform_get_drvdata(pdev);
        int irq = platform_get_irq_byname(pdev, "ALARM");
 
+       ab8500_sysfs_rtc_unregister(&pdev->dev);
+
        free_irq(irq, rtc);
        rtc_device_unregister(rtc);
        platform_set_drvdata(pdev, NULL);
index 05beb6c1ca79c7a0e22793818debbfb774da4187..d7782aa099439575a1a4f1887190b70a1b5e6370 100644 (file)
@@ -164,7 +164,7 @@ static inline unsigned char cmos_read_bank2(unsigned char addr)
 static inline void cmos_write_bank2(unsigned char val, unsigned char addr)
 {
        outb(addr, RTC_PORT(2));
-       outb(val, RTC_PORT(2));
+       outb(val, RTC_PORT(3));
 }
 
 #else
index 39e41fbdf08ba6de6259347feb0cc5dee76803d7..5e1d64ee52289b9e7a47990f8c7de0b41c7fccaf 100644 (file)
@@ -155,7 +155,6 @@ static int rtc_update_alarm(struct device *dev, struct rtc_time *alrm)
 {
        struct rtc_time alarm_tm, now_tm;
        unsigned long now, time;
-       int ret;
        struct platform_device *pdev = to_platform_device(dev);
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
        void __iomem *ioaddr = pdata->ioaddr;
@@ -168,21 +167,33 @@ static int rtc_update_alarm(struct device *dev, struct rtc_time *alrm)
        alarm_tm.tm_hour = alrm->tm_hour;
        alarm_tm.tm_min = alrm->tm_min;
        alarm_tm.tm_sec = alrm->tm_sec;
-       rtc_tm_to_time(&now_tm, &now);
        rtc_tm_to_time(&alarm_tm, &time);
 
-       if (time < now) {
-               time += 60 * 60 * 24;
-               rtc_time_to_tm(time, &alarm_tm);
-       }
-
-       ret = rtc_tm_to_time(&alarm_tm, &time);
-
        /* clear all the interrupt status bits */
        writew(readw(ioaddr + RTC_RTCISR), ioaddr + RTC_RTCISR);
        set_alarm_or_time(dev, MXC_RTC_ALARM, time);
 
-       return ret;
+       return 0;
+}
+
+static void mxc_rtc_irq_enable(struct device *dev, unsigned int bit,
+                               unsigned int enabled)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
+       void __iomem *ioaddr = pdata->ioaddr;
+       u32 reg;
+
+       spin_lock_irq(&pdata->rtc->irq_lock);
+       reg = readw(ioaddr + RTC_RTCIENR);
+
+       if (enabled)
+               reg |= bit;
+       else
+               reg &= ~bit;
+
+       writew(reg, ioaddr + RTC_RTCIENR);
+       spin_unlock_irq(&pdata->rtc->irq_lock);
 }
 
 /* This function is the RTC interrupt service routine. */
@@ -199,13 +210,12 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id)
        /* clear interrupt sources */
        writew(status, ioaddr + RTC_RTCISR);
 
-       /* clear alarm interrupt if it has occurred */
-       if (status & RTC_ALM_BIT)
-               status &= ~RTC_ALM_BIT;
-
        /* update irq data & counter */
-       if (status & RTC_ALM_BIT)
+       if (status & RTC_ALM_BIT) {
                events |= (RTC_AF | RTC_IRQF);
+               /* RTC alarm should be one-shot */
+               mxc_rtc_irq_enable(&pdev->dev, RTC_ALM_BIT, 0);
+       }
 
        if (status & RTC_1HZ_BIT)
                events |= (RTC_UF | RTC_IRQF);
@@ -213,9 +223,6 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id)
        if (status & PIT_ALL_ON)
                events |= (RTC_PF | RTC_IRQF);
 
-       if ((status & RTC_ALM_BIT) && rtc_valid_tm(&pdata->g_rtc_alarm))
-               rtc_update_alarm(&pdev->dev, &pdata->g_rtc_alarm);
-
        rtc_update_irq(pdata->rtc, 1, events);
        spin_unlock_irq(&pdata->rtc->irq_lock);
 
@@ -242,26 +249,6 @@ static void mxc_rtc_release(struct device *dev)
        spin_unlock_irq(&pdata->rtc->irq_lock);
 }
 
-static void mxc_rtc_irq_enable(struct device *dev, unsigned int bit,
-                               unsigned int enabled)
-{
-       struct platform_device *pdev = to_platform_device(dev);
-       struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
-       void __iomem *ioaddr = pdata->ioaddr;
-       u32 reg;
-
-       spin_lock_irq(&pdata->rtc->irq_lock);
-       reg = readw(ioaddr + RTC_RTCIENR);
-
-       if (enabled)
-               reg |= bit;
-       else
-               reg &= ~bit;
-
-       writew(reg, ioaddr + RTC_RTCIENR);
-       spin_unlock_irq(&pdata->rtc->irq_lock);
-}
-
 static int mxc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
        mxc_rtc_irq_enable(dev, RTC_ALM_BIT, enabled);
@@ -290,6 +277,17 @@ static int mxc_rtc_read_time(struct device *dev, struct rtc_time *tm)
  */
 static int mxc_rtc_set_mmss(struct device *dev, unsigned long time)
 {
+       /*
+        * TTC_DAYR register is 9-bit in MX1 SoC, save time and day of year only
+        */
+       if (cpu_is_mx1()) {
+               struct rtc_time tm;
+
+               rtc_time_to_tm(time, &tm);
+               tm.tm_year = 70;
+               rtc_tm_to_time(&tm, &time);
+       }
+
        /* Avoid roll-over from reading the different registers */
        do {
                set_alarm_or_time(dev, MXC_RTC_TIME, time);
@@ -324,21 +322,7 @@ static int mxc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
        int ret;
 
-       if (rtc_valid_tm(&alrm->time)) {
-               if (alrm->time.tm_sec > 59 ||
-                   alrm->time.tm_hour > 23 ||
-                   alrm->time.tm_min > 59)
-                       return -EINVAL;
-
-               ret = rtc_update_alarm(dev, &alrm->time);
-       } else {
-               ret = rtc_valid_tm(&alrm->time);
-               if (ret)
-                       return ret;
-
-               ret = rtc_update_alarm(dev, &alrm->time);
-       }
-
+       ret = rtc_update_alarm(dev, &alrm->time);
        if (ret)
                return ret;
 
@@ -424,6 +408,9 @@ static int __init mxc_rtc_probe(struct platform_device *pdev)
                pdata->irq = -1;
        }
 
+       if (pdata->irq >= 0)
+               device_init_wakeup(&pdev->dev, 1);
+
        rtc = rtc_device_register(pdev->name, &pdev->dev, &mxc_rtc_ops,
                                  THIS_MODULE);
        if (IS_ERR(rtc)) {
@@ -459,9 +446,39 @@ static int __exit mxc_rtc_remove(struct platform_device *pdev)
        return 0;
 }
 
+#ifdef CONFIG_PM
+static int mxc_rtc_suspend(struct device *dev)
+{
+       struct rtc_plat_data *pdata = dev_get_drvdata(dev);
+
+       if (device_may_wakeup(dev))
+               enable_irq_wake(pdata->irq);
+
+       return 0;
+}
+
+static int mxc_rtc_resume(struct device *dev)
+{
+       struct rtc_plat_data *pdata = dev_get_drvdata(dev);
+
+       if (device_may_wakeup(dev))
+               disable_irq_wake(pdata->irq);
+
+       return 0;
+}
+
+static struct dev_pm_ops mxc_rtc_pm_ops = {
+       .suspend        = mxc_rtc_suspend,
+       .resume         = mxc_rtc_resume,
+};
+#endif
+
 static struct platform_driver mxc_rtc_driver = {
        .driver = {
                   .name        = "mxc_rtc",
+#ifdef CONFIG_PM
+                  .pm          = &mxc_rtc_pm_ops,
+#endif
                   .owner       = THIS_MODULE,
        },
        .remove         = __exit_p(mxc_rtc_remove),
index bdc909bd56da0f4c7c615349df6f43023bd52517..657c6f67b2878469422185043741afd655f138b1 100644 (file)
@@ -324,15 +324,6 @@ static irqreturn_t wm831x_alm_irq(int irq, void *data)
        return IRQ_HANDLED;
 }
 
-static irqreturn_t wm831x_per_irq(int irq, void *data)
-{
-       struct wm831x_rtc *wm831x_rtc = data;
-
-       rtc_update_irq(wm831x_rtc->rtc, 1, RTC_IRQF | RTC_UF);
-
-       return IRQ_HANDLED;
-}
-
 static const struct rtc_class_ops wm831x_rtc_ops = {
        .read_time = wm831x_rtc_readtime,
        .set_mmss = wm831x_rtc_set_mmss,
@@ -405,11 +396,10 @@ static int wm831x_rtc_probe(struct platform_device *pdev)
 {
        struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
        struct wm831x_rtc *wm831x_rtc;
-       int per_irq = platform_get_irq_byname(pdev, "PER");
        int alm_irq = platform_get_irq_byname(pdev, "ALM");
        int ret = 0;
 
-       wm831x_rtc = kzalloc(sizeof(*wm831x_rtc), GFP_KERNEL);
+       wm831x_rtc = devm_kzalloc(&pdev->dev, sizeof(*wm831x_rtc), GFP_KERNEL);
        if (wm831x_rtc == NULL)
                return -ENOMEM;
 
@@ -433,14 +423,6 @@ static int wm831x_rtc_probe(struct platform_device *pdev)
                goto err;
        }
 
-       ret = request_threaded_irq(per_irq, NULL, wm831x_per_irq,
-                                  IRQF_TRIGGER_RISING, "RTC period",
-                                  wm831x_rtc);
-       if (ret != 0) {
-               dev_err(&pdev->dev, "Failed to request periodic IRQ %d: %d\n",
-                       per_irq, ret);
-       }
-
        ret = request_threaded_irq(alm_irq, NULL, wm831x_alm_irq,
                                   IRQF_TRIGGER_RISING, "RTC alarm",
                                   wm831x_rtc);
@@ -452,20 +434,16 @@ static int wm831x_rtc_probe(struct platform_device *pdev)
        return 0;
 
 err:
-       kfree(wm831x_rtc);
        return ret;
 }
 
 static int __devexit wm831x_rtc_remove(struct platform_device *pdev)
 {
        struct wm831x_rtc *wm831x_rtc = platform_get_drvdata(pdev);
-       int per_irq = platform_get_irq_byname(pdev, "PER");
        int alm_irq = platform_get_irq_byname(pdev, "ALM");
 
        free_irq(alm_irq, wm831x_rtc);
-       free_irq(per_irq, wm831x_rtc);
        rtc_device_unregister(wm831x_rtc->rtc);
-       kfree(wm831x_rtc);
 
        return 0;
 }
index 8a0b33033177174bca1766233ae596ae1f8e3ffa..0bd38da4ada0e7cfc9e112cefdd693b039efc54c 100644 (file)
@@ -650,6 +650,7 @@ static int aac_send_raw_srb(struct aac_dev* dev, void __user * arg)
                                     AAC_OPT_NEW_COMM) ?
                                      (dev->scsi_host_ptr->max_sectors << 9) :
                                      65536)) {
+                                       kfree(usg);
                                        rcode = -EINVAL;
                                        goto cleanup;
                                }
index d3b1ddd19097caa9713b2f55af79bbda3a3d2f89..63b117db938d7bd51d5e1bfaa4fe1a5431c0d8a5 100644 (file)
@@ -310,15 +310,15 @@ mega_query_adapter(adapter_t *adapter)
        if (adapter->product_info.subsysvid == HP_SUBSYS_VID) {
                sprintf (adapter->fw_version, "%c%d%d.%d%d",
                         adapter->product_info.fw_version[2],
-                        adapter->product_info.fw_version[1] >> 8,
+                        adapter->product_info.fw_version[1] >> 4,
                         adapter->product_info.fw_version[1] & 0x0f,
-                        adapter->product_info.fw_version[0] >> 8,
+                        adapter->product_info.fw_version[0] >> 4,
                         adapter->product_info.fw_version[0] & 0x0f);
                sprintf (adapter->bios_version, "%c%d%d.%d%d",
                         adapter->product_info.bios_version[2],
-                        adapter->product_info.bios_version[1] >> 8,
+                        adapter->product_info.bios_version[1] >> 4,
                         adapter->product_info.bios_version[1] & 0x0f,
-                        adapter->product_info.bios_version[0] >> 8,
+                        adapter->product_info.bios_version[0] >> 4,
                         adapter->product_info.bios_version[0] & 0x0f);
        } else {
                memcpy(adapter->fw_version,
index 0b2c95583660f67a652217d8b5ca3f25bee24984..a78036f5e1a60565256d2576f02cde3c2343149a 100644 (file)
@@ -4548,7 +4548,7 @@ mpt2sas_base_hard_reset_handler(struct MPT2SAS_ADAPTER *ioc, int sleep_flag,
                printk(MPT2SAS_ERR_FMT "%s: pci error recovery reset\n",
                    ioc->name, __func__);
                r = 0;
-               goto out;
+               goto out_unlocked;
        }
 
        if (mpt2sas_fwfault_debug)
@@ -4604,6 +4604,7 @@ mpt2sas_base_hard_reset_handler(struct MPT2SAS_ADAPTER *ioc, int sleep_flag,
        spin_unlock_irqrestore(&ioc->ioc_reset_in_progress_lock, flags);
        mutex_unlock(&ioc->reset_in_progress_mutex);
 
+ out_unlocked:
        dtmprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: exit\n", ioc->name,
            __func__));
        return r;
index 441a1c5b8974a2ffaf9d104924cdfa58daa54713..6b35b72504e252568f68e2760f7acdabdb85db65 100644 (file)
@@ -2369,16 +2369,15 @@ static ssize_t
 sg_proc_write_adio(struct file *filp, const char __user *buffer,
                   size_t count, loff_t *off)
 {
-       int num;
-       char buff[11];
+       int err;
+       unsigned long num;
 
        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
                return -EACCES;
-       num = (count < 10) ? count : 10;
-       if (copy_from_user(buff, buffer, num))
-               return -EFAULT;
-       buff[num] = '\0';
-       sg_allow_dio = simple_strtoul(buff, NULL, 10) ? 1 : 0;
+       err = kstrtoul_from_user(buffer, count, 0, &num);
+       if (err)
+               return err;
+       sg_allow_dio = num ? 1 : 0;
        return count;
 }
 
@@ -2391,17 +2390,15 @@ static ssize_t
 sg_proc_write_dressz(struct file *filp, const char __user *buffer,
                     size_t count, loff_t *off)
 {
-       int num;
+       int err;
        unsigned long k = ULONG_MAX;
-       char buff[11];
 
        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
                return -EACCES;
-       num = (count < 10) ? count : 10;
-       if (copy_from_user(buff, buffer, num))
-               return -EFAULT;
-       buff[num] = '\0';
-       k = simple_strtoul(buff, NULL, 10);
+
+       err = kstrtoul_from_user(buffer, count, 0, &k);
+       if (err)
+               return err;
        if (k <= 1048576) {     /* limit "big buff" to 1 MB */
                sg_big_buff = k;
                return count;
index 1105fa1ed7f4a47faf5e26e0796588d4dcaef585..a1376dc73d71cd79492c388afc340d7fba229545 100644 (file)
@@ -270,17 +270,7 @@ static struct platform_driver pm860x_backlight_driver = {
        .remove         = pm860x_backlight_remove,
 };
 
-static int __init pm860x_backlight_init(void)
-{
-       return platform_driver_register(&pm860x_backlight_driver);
-}
-module_init(pm860x_backlight_init);
-
-static void __exit pm860x_backlight_exit(void)
-{
-       platform_driver_unregister(&pm860x_backlight_driver);
-}
-module_exit(pm860x_backlight_exit);
+module_platform_driver(pm860x_backlight_driver);
 
 MODULE_DESCRIPTION("Backlight Driver for Marvell Semiconductor 88PM8606");
 MODULE_AUTHOR("Haojian Zhuang <haojian.zhuang@marvell.com>");
index 278aeaa925059673fdcefa7b1c243f5a92b41837..681b36929fe406642756ca6bcd8614422a16aa0c 100644 (file)
@@ -280,14 +280,6 @@ config BACKLIGHT_WM831X
          If you have a backlight driven by the ISINK and DCDC of a
          WM831x PMIC say y to enable the backlight driver for it.
 
-config BACKLIGHT_ADX
-       tristate "Avionic Design Xanthos Backlight Driver"
-       depends on ARCH_PXA_ADX
-       default y
-       help
-         Say Y to enable the backlight driver on Avionic Design Xanthos-based
-         boards.
-
 config BACKLIGHT_ADP5520
        tristate "Backlight Driver for ADP5520/ADP5501 using WLED"
        depends on PMIC_ADP5520
index fdd1fc4b277062333846f4f9cd3ba75855d394a7..af5cf654ec7c5c4dba3a632f0d4fc2867a681112 100644 (file)
@@ -32,7 +32,6 @@ obj-$(CONFIG_BACKLIGHT_APPLE) += apple_bl.o
 obj-$(CONFIG_BACKLIGHT_TOSA)   += tosa_bl.o
 obj-$(CONFIG_BACKLIGHT_SAHARA) += kb3886_bl.o
 obj-$(CONFIG_BACKLIGHT_WM831X) += wm831x_bl.o
-obj-$(CONFIG_BACKLIGHT_ADX)    += adx_bl.o
 obj-$(CONFIG_BACKLIGHT_ADP5520)        += adp5520_bl.o
 obj-$(CONFIG_BACKLIGHT_ADP8860)        += adp8860_bl.o
 obj-$(CONFIG_BACKLIGHT_ADP8870)        += adp8870_bl.o
index dfb763e9147ff923676e9ff2778116e3677f61ea..2e630bf1164cab84c771da9accb52109b6b69ff7 100644 (file)
@@ -384,17 +384,7 @@ static struct platform_driver adp5520_bl_driver = {
        .resume         = adp5520_bl_resume,
 };
 
-static int __init adp5520_bl_init(void)
-{
-       return platform_driver_register(&adp5520_bl_driver);
-}
-module_init(adp5520_bl_init);
-
-static void __exit adp5520_bl_exit(void)
-{
-       platform_driver_unregister(&adp5520_bl_driver);
-}
-module_exit(adp5520_bl_exit);
+module_platform_driver(adp5520_bl_driver);
 
 MODULE_AUTHOR("Michael Hennerich <hennerich@blackfin.uclinux.org>");
 MODULE_DESCRIPTION("ADP5520(01) Backlight Driver");
diff --git a/drivers/video/backlight/adx_bl.c b/drivers/video/backlight/adx_bl.c
deleted file mode 100644 (file)
index c861c41..0000000
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * linux/drivers/video/backlight/adx.c
- *
- * Copyright (C) 2009 Avionic Design GmbH
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Written by Thierry Reding <thierry.reding@avionic-design.de>
- */
-
-#include <linux/backlight.h>
-#include <linux/fb.h>
-#include <linux/gfp.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-
-/* register definitions */
-#define ADX_BACKLIGHT_CONTROL          0x00
-#define ADX_BACKLIGHT_CONTROL_ENABLE   (1 << 0)
-#define ADX_BACKLIGHT_BRIGHTNESS       0x08
-#define ADX_BACKLIGHT_STATUS           0x10
-#define ADX_BACKLIGHT_ERROR            0x18
-
-struct adxbl {
-       void __iomem *base;
-};
-
-static int adx_backlight_update_status(struct backlight_device *bldev)
-{
-       struct adxbl *bl = bl_get_data(bldev);
-       u32 value;
-
-       value = bldev->props.brightness;
-       writel(value, bl->base + ADX_BACKLIGHT_BRIGHTNESS);
-
-       value = readl(bl->base + ADX_BACKLIGHT_CONTROL);
-
-       if (bldev->props.state & BL_CORE_FBBLANK)
-               value &= ~ADX_BACKLIGHT_CONTROL_ENABLE;
-       else
-               value |= ADX_BACKLIGHT_CONTROL_ENABLE;
-
-       writel(value, bl->base + ADX_BACKLIGHT_CONTROL);
-
-       return 0;
-}
-
-static int adx_backlight_get_brightness(struct backlight_device *bldev)
-{
-       struct adxbl *bl = bl_get_data(bldev);
-       u32 brightness;
-
-       brightness = readl(bl->base + ADX_BACKLIGHT_BRIGHTNESS);
-       return brightness & 0xff;
-}
-
-static int adx_backlight_check_fb(struct backlight_device *bldev, struct fb_info *fb)
-{
-       return 1;
-}
-
-static const struct backlight_ops adx_backlight_ops = {
-       .options = 0,
-       .update_status = adx_backlight_update_status,
-       .get_brightness = adx_backlight_get_brightness,
-       .check_fb = adx_backlight_check_fb,
-};
-
-static int __devinit adx_backlight_probe(struct platform_device *pdev)
-{
-       struct backlight_properties props;
-       struct backlight_device *bldev;
-       struct resource *res;
-       struct adxbl *bl;
-       int ret = 0;
-
-       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       if (!res) {
-               ret = -ENXIO;
-               goto out;
-       }
-
-       res = devm_request_mem_region(&pdev->dev, res->start,
-                       resource_size(res), res->name);
-       if (!res) {
-               ret = -ENXIO;
-               goto out;
-       }
-
-       bl = devm_kzalloc(&pdev->dev, sizeof(*bl), GFP_KERNEL);
-       if (!bl) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       bl->base = devm_ioremap_nocache(&pdev->dev, res->start,
-                       resource_size(res));
-       if (!bl->base) {
-               ret = -ENXIO;
-               goto out;
-       }
-
-       memset(&props, 0, sizeof(struct backlight_properties));
-       props.type = BACKLIGHT_RAW;
-       props.max_brightness = 0xff;
-       bldev = backlight_device_register(dev_name(&pdev->dev), &pdev->dev,
-                                         bl, &adx_backlight_ops, &props);
-       if (IS_ERR(bldev)) {
-               ret = PTR_ERR(bldev);
-               goto out;
-       }
-
-       bldev->props.brightness = 0xff;
-       bldev->props.power = FB_BLANK_UNBLANK;
-
-       platform_set_drvdata(pdev, bldev);
-
-out:
-       return ret;
-}
-
-static int __devexit adx_backlight_remove(struct platform_device *pdev)
-{
-       struct backlight_device *bldev;
-       int ret = 0;
-
-       bldev = platform_get_drvdata(pdev);
-       bldev->props.power = FB_BLANK_UNBLANK;
-       bldev->props.brightness = 0xff;
-       backlight_update_status(bldev);
-       backlight_device_unregister(bldev);
-       platform_set_drvdata(pdev, NULL);
-
-       return ret;
-}
-
-#ifdef CONFIG_PM
-static int adx_backlight_suspend(struct platform_device *pdev,
-               pm_message_t state)
-{
-       return 0;
-}
-
-static int adx_backlight_resume(struct platform_device *pdev)
-{
-       return 0;
-}
-#else
-#define adx_backlight_suspend NULL
-#define adx_backlight_resume NULL
-#endif
-
-static struct platform_driver adx_backlight_driver = {
-       .probe = adx_backlight_probe,
-       .remove = __devexit_p(adx_backlight_remove),
-       .suspend = adx_backlight_suspend,
-       .resume = adx_backlight_resume,
-       .driver = {
-               .name = "adx-backlight",
-               .owner = THIS_MODULE,
-       },
-};
-
-static int __init adx_backlight_init(void)
-{
-       return platform_driver_register(&adx_backlight_driver);
-}
-
-static void __exit adx_backlight_exit(void)
-{
-       platform_driver_unregister(&adx_backlight_driver);
-}
-
-module_init(adx_backlight_init);
-module_exit(adx_backlight_exit);
-
-MODULE_AUTHOR("Thierry Reding <thierry.reding@avionic-design.de>");
-MODULE_DESCRIPTION("Avionic Design Xanthos Backlight Driver");
-MODULE_LICENSE("GPL v2");
index d68f14bbb687d8391b60f72f8949c3d2c396fbcf..abb4a06268f1831e57815475338447fb83822543 100644 (file)
@@ -199,17 +199,7 @@ static struct platform_driver da903x_backlight_driver = {
        .remove         = da903x_backlight_remove,
 };
 
-static int __init da903x_backlight_init(void)
-{
-       return platform_driver_register(&da903x_backlight_driver);
-}
-module_init(da903x_backlight_init);
-
-static void __exit da903x_backlight_exit(void)
-{
-       platform_driver_unregister(&da903x_backlight_driver);
-}
-module_exit(da903x_backlight_exit);
+module_platform_driver(da903x_backlight_driver);
 
 MODULE_DESCRIPTION("Backlight Driver for Dialog Semiconductor DA9030/DA9034");
 MODULE_AUTHOR("Eric Miao <eric.miao@marvell.com>"
index c74a6f4baa127a313ea8eea5bf16adde62856b2a..b62b8b9063b53beeacffa86c8259d971372e3ec9 100644 (file)
@@ -13,7 +13,6 @@
 
 #include <linux/module.h>
 #include <linux/platform_device.h>
-#include <linux/module.h>
 #include <linux/io.h>
 #include <linux/fb.h>
 #include <linux/backlight.h>
@@ -144,17 +143,7 @@ static struct platform_driver ep93xxbl_driver = {
        .resume         = ep93xxbl_resume,
 };
 
-static int __init ep93xxbl_init(void)
-{
-       return platform_driver_register(&ep93xxbl_driver);
-}
-module_init(ep93xxbl_init);
-
-static void __exit ep93xxbl_exit(void)
-{
-       platform_driver_unregister(&ep93xxbl_driver);
-}
-module_exit(ep93xxbl_exit);
+module_platform_driver(ep93xxbl_driver);
 
 MODULE_DESCRIPTION("EP93xx Backlight Driver");
 MODULE_AUTHOR("H Hartley Sweeten <hsweeten@visionengravers.com>");
index adb191466d646355fdfcb5193c80b864631c3919..9ce6170c186079414dcb62123100e7786b032721 100644 (file)
@@ -132,18 +132,7 @@ static struct platform_driver genericbl_driver = {
        },
 };
 
-static int __init genericbl_init(void)
-{
-       return platform_driver_register(&genericbl_driver);
-}
-
-static void __exit genericbl_exit(void)
-{
-       platform_driver_unregister(&genericbl_driver);
-}
-
-module_init(genericbl_init);
-module_exit(genericbl_exit);
+module_platform_driver(genericbl_driver);
 
 MODULE_AUTHOR("Richard Purdie <rpurdie@rpsys.net>");
 MODULE_DESCRIPTION("Generic Backlight Driver");
index de65d80159beed3b90b21313070538b42c3ee4aa..2f8af5d786abbb5971aed6cad8c153229a4b4b84 100644 (file)
@@ -147,19 +147,8 @@ static struct platform_driver jornada_bl_driver = {
        },
 };
 
-static int __init jornada_bl_init(void)
-{
-       return platform_driver_register(&jornada_bl_driver);
-}
-
-static void __exit jornada_bl_exit(void)
-{
-       platform_driver_unregister(&jornada_bl_driver);
-}
+module_platform_driver(jornada_bl_driver);
 
 MODULE_AUTHOR("Kristoffer Ericson <kristoffer.ericson>");
 MODULE_DESCRIPTION("HP Jornada 710/720/728 Backlight driver");
 MODULE_LICENSE("GPL");
-
-module_init(jornada_bl_init);
-module_exit(jornada_bl_exit);
index d2ff658b4144f91cf6d10a6e04f5178822db26f8..22d231a17e3c4649652ec1992567111e5b70cd6c 100644 (file)
@@ -135,19 +135,8 @@ static struct platform_driver jornada_lcd_driver = {
        },
 };
 
-static int __init jornada_lcd_init(void)
-{
-       return platform_driver_register(&jornada_lcd_driver);
-}
-
-static void __exit jornada_lcd_exit(void)
-{
-       platform_driver_unregister(&jornada_lcd_driver);
-}
+module_platform_driver(jornada_lcd_driver);
 
 MODULE_AUTHOR("Kristoffer Ericson <kristoffer.ericson@gmail.com>");
 MODULE_DESCRIPTION("HP Jornada 710/720/728 LCD driver");
 MODULE_LICENSE("GPL");
-
-module_init(jornada_lcd_init);
-module_exit(jornada_lcd_exit);
index da9a5ce0ccb8463dcb85b18c708738b483383352..78dafc0c8fc5a1ec4830d04660d5914441e4fe09 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/lcd.h>
 #include <linux/backlight.h>
 #include <linux/module.h>
+#include <linux/regulator/consumer.h>
 
 #include "ld9040_gamma.h"
 
@@ -53,8 +54,51 @@ struct ld9040 {
        struct lcd_device               *ld;
        struct backlight_device         *bd;
        struct lcd_platform_data        *lcd_pd;
+
+       struct mutex                    lock;
+       bool  enabled;
+};
+
+static struct regulator_bulk_data supplies[] = {
+       { .supply = "vdd3", },
+       { .supply = "vci", },
 };
 
+static void ld9040_regulator_enable(struct ld9040 *lcd)
+{
+       int ret = 0;
+       struct lcd_platform_data *pd = NULL;
+
+       pd = lcd->lcd_pd;
+       mutex_lock(&lcd->lock);
+       if (!lcd->enabled) {
+               ret = regulator_bulk_enable(ARRAY_SIZE(supplies), supplies);
+               if (ret)
+                       goto out;
+
+               lcd->enabled = true;
+       }
+       mdelay(pd->power_on_delay);
+out:
+       mutex_unlock(&lcd->lock);
+}
+
+static void ld9040_regulator_disable(struct ld9040 *lcd)
+{
+       int ret = 0;
+
+       mutex_lock(&lcd->lock);
+       if (lcd->enabled) {
+               ret = regulator_bulk_disable(ARRAY_SIZE(supplies), supplies);
+               if (ret)
+                       goto out;
+
+               lcd->enabled = false;
+       }
+out:
+       mutex_unlock(&lcd->lock);
+}
+
 static const unsigned short seq_swreset[] = {
        0x01, COMMAND_ONLY,
        ENDDEF, 0x00
@@ -532,13 +576,8 @@ static int ld9040_power_on(struct ld9040 *lcd)
                return -EFAULT;
        }
 
-       if (!pd->power_on) {
-               dev_err(lcd->dev, "power_on is NULL.\n");
-               return -EFAULT;
-       } else {
-               pd->power_on(lcd->ld, 1);
-               mdelay(pd->power_on_delay);
-       }
+       /* lcd power on */
+       ld9040_regulator_enable(lcd);
 
        if (!pd->reset) {
                dev_err(lcd->dev, "reset is NULL.\n");
@@ -582,11 +621,8 @@ static int ld9040_power_off(struct ld9040 *lcd)
 
        mdelay(pd->power_off_delay);
 
-       if (!pd->power_on) {
-               dev_err(lcd->dev, "power_on is NULL.\n");
-               return -EFAULT;
-       } else
-               pd->power_on(lcd->ld, 0);
+       /* lcd power off */
+       ld9040_regulator_disable(lcd);
 
        return 0;
 }
@@ -693,6 +729,14 @@ static int ld9040_probe(struct spi_device *spi)
                goto out_free_lcd;
        }
 
+       mutex_init(&lcd->lock);
+
+       ret = regulator_bulk_get(lcd->dev, ARRAY_SIZE(supplies), supplies);
+       if (ret) {
+               dev_err(lcd->dev, "Failed to get regulators: %d\n", ret);
+               goto out_free_lcd;
+       }
+
        ld = lcd_device_register("ld9040", &spi->dev, lcd, &ld9040_lcd_ops);
        if (IS_ERR(ld)) {
                ret = PTR_ERR(ld);
@@ -739,6 +783,8 @@ static int ld9040_probe(struct spi_device *spi)
 out_unregister_lcd:
        lcd_device_unregister(lcd->ld);
 out_free_lcd:
+       regulator_bulk_free(ARRAY_SIZE(supplies), supplies);
+
        kfree(lcd);
        return ret;
 }
@@ -750,6 +796,7 @@ static int __devexit ld9040_remove(struct spi_device *spi)
        ld9040_power(lcd, FB_BLANK_POWERDOWN);
        backlight_device_unregister(lcd->bd);
        lcd_device_unregister(lcd->ld);
+       regulator_bulk_free(ARRAY_SIZE(supplies), supplies);
        kfree(lcd);
 
        return 0;
index 7bbc802560ea0473229e74ce89b2da299e140fce..c915e3b5388698b2c8c8007167ed531ad5e38580 100644 (file)
@@ -188,17 +188,7 @@ static struct platform_driver max8925_backlight_driver = {
        .remove         = __devexit_p(max8925_backlight_remove),
 };
 
-static int __init max8925_backlight_init(void)
-{
-       return platform_driver_register(&max8925_backlight_driver);
-}
-module_init(max8925_backlight_init);
-
-static void __exit max8925_backlight_exit(void)
-{
-       platform_driver_unregister(&max8925_backlight_driver);
-};
-module_exit(max8925_backlight_exit);
+module_platform_driver(max8925_backlight_driver);
 
 MODULE_DESCRIPTION("Backlight Driver for Maxim MAX8925");
 MODULE_AUTHOR("Haojian Zhuang <haojian.zhuang@marvell.com>");
index 08d26a72394c8e9920cc23e115802c642902305d..d8cde277ec83a691c658b9e0afef6c3aef63ffb4 100644 (file)
@@ -195,18 +195,7 @@ static struct platform_driver omapbl_driver = {
        },
 };
 
-static int __init omapbl_init(void)
-{
-       return platform_driver_register(&omapbl_driver);
-}
-
-static void __exit omapbl_exit(void)
-{
-       platform_driver_unregister(&omapbl_driver);
-}
-
-module_init(omapbl_init);
-module_exit(omapbl_exit);
+module_platform_driver(omapbl_driver);
 
 MODULE_AUTHOR("Andrzej Zaborowski <balrog@zabor.org>");
 MODULE_DESCRIPTION("OMAP LCD Backlight driver");
index ef5628d60563841aceab7bca62191c0faf57f763..13e88b71daecc42b5e833c252fd1305a19cc5ed0 100644 (file)
@@ -173,17 +173,7 @@ static struct platform_driver pcf50633_bl_driver = {
        },
 };
 
-static int __init pcf50633_bl_init(void)
-{
-       return platform_driver_register(&pcf50633_bl_driver);
-}
-module_init(pcf50633_bl_init);
-
-static void __exit pcf50633_bl_exit(void)
-{
-       platform_driver_unregister(&pcf50633_bl_driver);
-}
-module_exit(pcf50633_bl_exit);
+module_platform_driver(pcf50633_bl_driver);
 
 MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
 MODULE_DESCRIPTION("PCF50633 backlight driver");
index 302330acf6284e0b22e7dc77a6ea055603aa5358..187da59e3a13f652bb44650b9c618c15593039f5 100644 (file)
@@ -157,18 +157,7 @@ static struct platform_driver platform_lcd_driver = {
        .resume         = platform_lcd_resume,
 };
 
-static int __init platform_lcd_init(void)
-{
-       return platform_driver_register(&platform_lcd_driver);
-}
-
-static void __exit platform_lcd_cleanup(void)
-{
-       platform_driver_unregister(&platform_lcd_driver);
-}
-
-module_init(platform_lcd_init);
-module_exit(platform_lcd_cleanup);
+module_platform_driver(platform_lcd_driver);
 
 MODULE_AUTHOR("Ben Dooks <ben-linux@fluff.org>");
 MODULE_LICENSE("GPL v2");
index 8b5b2a4124c7980129be8146f881772f5c33fa97..b811e8fb40627334c348cc2fcea9a72bc93622db 100644 (file)
@@ -207,17 +207,7 @@ static struct platform_driver pwm_backlight_driver = {
        .resume         = pwm_backlight_resume,
 };
 
-static int __init pwm_backlight_init(void)
-{
-       return platform_driver_register(&pwm_backlight_driver);
-}
-module_init(pwm_backlight_init);
-
-static void __exit pwm_backlight_exit(void)
-{
-       platform_driver_unregister(&pwm_backlight_driver);
-}
-module_exit(pwm_backlight_exit);
+module_platform_driver(pwm_backlight_driver);
 
 MODULE_DESCRIPTION("PWM based Backlight Driver");
 MODULE_LICENSE("GPL");
index fbe9e9316f3b51a09e5de299a9a544d878bf5b33..4e915f5eca99b43697e195207150a00bae631f22 100644 (file)
@@ -236,17 +236,7 @@ static struct platform_driver wm831x_backlight_driver = {
        .remove         = wm831x_backlight_remove,
 };
 
-static int __init wm831x_backlight_init(void)
-{
-       return platform_driver_register(&wm831x_backlight_driver);
-}
-module_init(wm831x_backlight_init);
-
-static void __exit wm831x_backlight_exit(void)
-{
-       platform_driver_unregister(&wm831x_backlight_driver);
-}
-module_exit(wm831x_backlight_exit);
+module_platform_driver(wm831x_backlight_driver);
 
 MODULE_DESCRIPTION("Backlight Driver for WM831x PMICs");
 MODULE_AUTHOR("Mark Brown <broonie@opensource.wolfsonmicro.com");
index 79e2ca7973b7a2b503a7f7fc168f716a5f6c0284..e95d1b64082cae708f3213a453ea3275f6afbb57 100644 (file)
@@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF
        bool
        depends on COMPAT && BINFMT_ELF
 
+config ARCH_BINFMT_ELF_RANDOMIZE_PIE
+       bool
+
 config BINFMT_ELF_FDPIC
        bool "Kernel support for FDPIC ELF binaries"
        default y
index 21ac5ee4b43f3e767b56aff2f54c6018175bc6af..bcb884e2d613e76d94570dd81b99ba27e3906a66 100644 (file)
@@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
                         * default mmap base, as well as whatever program they
                         * might try to exec.  This is because the brk will
                         * follow the loader, and is not movable.  */
-#if defined(CONFIG_X86) || defined(CONFIG_ARM)
+#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
                        /* Memory randomization might have been switched off
                         * in runtime via sysctl.
                         * If that is the case, retain the original non-zero
index b07f1da1de4e34470fd64af913c9366e0d6c8513..3154fbe9ba44199c55c48d1188b4030768ed5d39 100644 (file)
@@ -1117,6 +1117,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        mutex_lock_nested(&bdev->bd_mutex, for_part);
        if (!bdev->bd_openers) {
                bdev->bd_disk = disk;
+               bdev->bd_queue = disk->queue;
                bdev->bd_contains = bdev;
                if (!partno) {
                        struct backing_dev_info *bdi;
@@ -1137,6 +1138,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                        disk_put_part(bdev->bd_part);
                                        bdev->bd_part = NULL;
                                        bdev->bd_disk = NULL;
+                                       bdev->bd_queue = NULL;
                                        mutex_unlock(&bdev->bd_mutex);
                                        disk_unblock_events(disk);
                                        put_disk(disk);
@@ -1210,6 +1212,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        disk_put_part(bdev->bd_part);
        bdev->bd_disk = NULL;
        bdev->bd_part = NULL;
+       bdev->bd_queue = NULL;
        bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
        if (bdev != bdev->bd_contains)
                __blkdev_put(bdev->bd_contains, mode, 1);
index bfb620ead295dc808fdb27c962b1da962cc50b20..034d985032296cd4dfbc80e4d6717ac8b4ea77c4 100644 (file)
@@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = find_or_create_page(inode->i_mapping, index + i,
-                                              mask);
+                                              mask | __GFP_WRITE);
                if (!pages[i]) {
                        faili = i - 1;
                        err = -ENOMEM;
index 8f1fe324162b4203fc245c9032052f6862623501..f8351c21ff3b31e916c23d49780ab5f2b6e6511c 100644 (file)
@@ -703,7 +703,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
         * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
         * the cached file length
         */
-       if (origin != SEEK_SET || origin != SEEK_CUR) {
+       if (origin != SEEK_SET && origin != SEEK_CUR) {
                int rc;
                struct inode *inode = file->f_path.dentry->d_inode;
 
index 89509b5a090e27320e45b9c0c2f5480e082b1a37..e297655d8946a0b3a6bfb7ce1c94531cd4184165 100644 (file)
@@ -2454,7 +2454,7 @@ static int prepend_path(const struct path *path,
        bool slash = false;
        int error = 0;
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        while (dentry != root->dentry || vfsmnt != root->mnt) {
                struct dentry * parent;
 
@@ -2485,7 +2485,7 @@ static int prepend_path(const struct path *path,
                error = prepend(buffer, buflen, "/", 1);
 
 out:
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        return error;
 
 global_root:
@@ -2859,11 +2859,11 @@ int path_is_under(struct path *path1, struct path *path2)
        struct dentry *dentry = path1->dentry;
        int res;
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        if (mnt != path2->mnt) {
                for (;;) {
                        if (mnt->mnt_parent == mnt) {
-                               br_read_unlock(vfsmount_lock);
+                               br_read_unlock(&vfsmount_lock);
                                return 0;
                        }
                        if (mnt->mnt_parent == path2->mnt)
@@ -2873,7 +2873,7 @@ int path_is_under(struct path *path1, struct path *path2)
                dentry = mnt->mnt_mountpoint;
        }
        res = is_subdir(dentry, path2->dentry);
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        return res;
 }
 EXPORT_SYMBOL(path_is_under);
index d740ab67ff6e1bd581218fb149eb7a58c40fa3f3..4a588dbd11bfa57926b647e1e66bdd35a446cad4 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <linux/atomic.h>
+#include <linux/prefetch.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 {
        int ret;
        sector_t fs_startblk;   /* Into file, in filesystem-sized blocks */
+       sector_t fs_endblk;     /* Into file, in filesystem-sized blocks */
        unsigned long fs_count; /* Number of filesystem-sized blocks */
-       unsigned long dio_count;/* Number of dio_block-sized blocks */
-       unsigned long blkmask;
        int create;
 
        /*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
        if (ret == 0) {
                BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
                fs_startblk = sdio->block_in_file >> sdio->blkfactor;
-               dio_count = sdio->final_block_in_request - sdio->block_in_file;
-               fs_count = dio_count >> sdio->blkfactor;
-               blkmask = (1 << sdio->blkfactor) - 1;
-               if (dio_count & blkmask)        
-                       fs_count++;
+               fs_endblk = (sdio->final_block_in_request - 1) >>
+                                       sdio->blkfactor;
+               fs_count = fs_endblk - fs_startblk + 1;
 
                map_bh->b_state = 0;
                map_bh->b_size = fs_count << dio->inode->i_blkbits;
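
A quick worked example of the new fs_count computation above (values chosen for illustration): with sdio->blkfactor = 3, i.e. eight dio blocks per filesystem block, block_in_file = 5 and final_block_in_request = 17, the request covers dio blocks 5..16:

fs_startblk = 5 >> 3;		/* = 0 */
fs_endblk   = (17 - 1) >> 3;	/* = 2 */
fs_count    = 2 - 0 + 1;	/* = 3 filesystem blocks spanned by the request */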
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
  * individual fields and will generate much worse code. This is important
  * for the whole file.
  */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static inline ssize_t
+do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        struct block_device *bdev, const struct iovec *iov, loff_t offset, 
        unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
        dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        size_t size;
        unsigned long addr;
        unsigned blkbits = inode->i_blkbits;
-       unsigned bdev_blkbits = 0;
        unsigned blocksize_mask = (1 << blkbits) - 1;
        ssize_t retval = -EINVAL;
        loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        if (rw & WRITE)
                rw = WRITE_ODIRECT;
 
-       if (bdev)
-               bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+       /*
+        * Avoid references to bdev if not absolutely needed to give
+        * the early prefetch in the caller enough time.
+        */
 
        if (offset & blocksize_mask) {
                if (bdev)
-                        blkbits = bdev_blkbits;
+                       blkbits = blksize_bits(bdev_logical_block_size(bdev));
                blocksize_mask = (1 << blkbits) - 1;
                if (offset & blocksize_mask)
                        goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                addr = (unsigned long)iov[seg].iov_base;
                size = iov[seg].iov_len;
                end += size;
-               if ((addr & blocksize_mask) || (size & blocksize_mask))  {
+               if (unlikely((addr & blocksize_mask) ||
+                            (size & blocksize_mask))) {
                        if (bdev)
-                                blkbits = bdev_blkbits;
+                               blkbits = blksize_bits(
+                                        bdev_logical_block_size(bdev));
                        blocksize_mask = (1 << blkbits) - 1;
-                       if ((addr & blocksize_mask) || (size & blocksize_mask))  
+                       if ((addr & blocksize_mask) || (size & blocksize_mask))
                                goto out;
                }
        }
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 out:
        return retval;
 }
+
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+       struct block_device *bdev, const struct iovec *iov, loff_t offset,
+       unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+       dio_submit_t submit_io, int flags)
+{
+       /*
+        * The block device state is needed in the end to finally
+        * submit everything.  Since it's likely to be cache cold,
+        * prefetch it here as the first thing to hide some of the
+        * latency.
+        *
+        * Attempt to prefetch the pieces we likely need later.
+        */
+       prefetch(&bdev->bd_disk->part_tbl);
+       prefetch(bdev->bd_queue);
+       prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+
+       return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+                                    nr_segs, get_block, end_io,
+                                    submit_io, flags);
+}
+
 EXPORT_SYMBOL(__blockdev_direct_IO);
 
 static __init int dio_init(void)
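
For reference, a condensed sketch of the idiom introduced above: issue prefetches for the cache-cold block-device state in a thin wrapper so the loads overlap with the worker's setup. The helper name is illustrative:

#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/prefetch.h>

/* Illustrative only: warm up block-device state that is used much later. */
static void my_warm_bdev(struct block_device *bdev)
{
	prefetch(&bdev->bd_disk->part_tbl);
	prefetch(bdev->bd_queue);
	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
}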
index f816c65c45b30d2e29a11fbd6a91038e4fdbfd30..fd8b8b908601d7cf7181ca17ae43951ad12f587f 100644 (file)
@@ -197,6 +197,12 @@ struct eventpoll {
 
        /* The user that created the eventpoll descriptor */
        struct user_struct *user;
+
+       struct file *file;
+
+       /* used to optimize loop detection check */
+       int visited;
+       struct list_head visitedllink;
 };
 
 /* Wait structure used by the poll hooks */
@@ -255,6 +261,12 @@ static struct kmem_cache *epi_cache __read_mostly;
 /* Slab cache used to allocate "struct eppoll_entry" */
 static struct kmem_cache *pwq_cache __read_mostly;
 
+/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
+LIST_HEAD(visited_list);
+
+/* Files with newly added links, which need a limit on emanating paths */
+LIST_HEAD(tfile_check_list);
+
 #ifdef CONFIG_SYSCTL
 
 #include <linux/sysctl.h>
@@ -276,6 +288,12 @@ ctl_table epoll_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
+static const struct file_operations eventpoll_fops;
+
+static inline int is_file_epoll(struct file *f)
+{
+       return f->f_op == &eventpoll_fops;
+}
 
 /* Setup the structure that is used as key for the RB tree */
 static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -714,12 +732,6 @@ static const struct file_operations eventpoll_fops = {
        .llseek         = noop_llseek,
 };
 
-/* Fast test to see if the file is an eventpoll file */
-static inline int is_file_epoll(struct file *f)
-{
-       return f->f_op == &eventpoll_fops;
-}
-
 /*
  * This is called from eventpoll_release() to unlink files from the eventpoll
  * interface. We need to have this facility to cleanup correctly files that are
@@ -929,6 +941,96 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
        rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
+
+
+#define PATH_ARR_SIZE 5
+/* These are the number of paths of length 1 to 5 that we are allowing to emanate
+ * from a single file of interest. For example, we allow 1000 paths of length
+ * 1 to emanate from each file of interest. This essentially represents the
+ * potential wakeup paths, which need to be limited in order to avoid massive
+ * uncontrolled wakeup storms. The common use case should be a single ep which
+ * is connected to n file sources. In this case each file source has 1 path
+ * of length 1. Thus, the numbers below should be more than sufficient.
+ */
+int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
+int path_count[PATH_ARR_SIZE];
+
+static int path_count_inc(int nests)
+{
+       if (++path_count[nests] > path_limits[nests])
+               return -1;
+       return 0;
+}
+
+static void path_count_init(void)
+{
+       int i;
+
+       for (i = 0; i < PATH_ARR_SIZE; i++)
+               path_count[i] = 0;
+}
+
+static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
+{
+       int error = 0;
+       struct file *file = priv;
+       struct file *child_file;
+       struct epitem *epi;
+
+       list_for_each_entry(epi, &file->f_ep_links, fllink) {
+               child_file = epi->ep->file;
+               if (is_file_epoll(child_file)) {
+                       if (list_empty(&child_file->f_ep_links)) {
+                               if (path_count_inc(call_nests)) {
+                                       error = -1;
+                                       break;
+                               }
+                       } else {
+                               error = ep_call_nested(&poll_loop_ncalls,
+                                                       EP_MAX_NESTS,
+                                                       reverse_path_check_proc,
+                                                       child_file, child_file,
+                                                       current);
+                       }
+                       if (error != 0)
+                               break;
+               } else {
+                       printk(KERN_ERR "reverse_path_check_proc: "
+                               "file is not an ep!\n");
+               }
+       }
+       return error;
+}
+
+/**
+ * reverse_path_check - The tfile_check_list is a list of file *, which have
+ *                      links that are proposed to be newly added. We need to
+ *                      make sure that those added links don't add too many
+ *                      paths such that we will spend all our time waking up
+ *                      eventpoll objects.
+ *
+ * Returns: zero if the proposed links don't create too many paths,
+ *         -1 otherwise.
+ */
+static int reverse_path_check(void)
+{
+       int length = 0;
+       int error = 0;
+       struct file *current_file;
+
+       /* let's call this for all tfiles */
+       list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
+               length++;
+               path_count_init();
+               error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+                                       reverse_path_check_proc, current_file,
+                                       current_file, current);
+               if (error)
+                       break;
+       }
+       return error;
+}
+
 /*
  * Must be called with "mtx" held.
  */
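
To illustrate what produces the longer wakeup paths being budgeted above, a hypothetical userspace fragment (standard epoll syscalls assumed): nesting one epoll instance inside another gives the watched fd a wakeup path of length 2.

#include <sys/epoll.h>

/* Illustrative only: 'fd' ends up with a length-1 path (to inner) and a
 * length-2 path (to outer, via inner). Error handling omitted. */
static int nest_epoll(int fd)
{
	struct epoll_event ev = { .events = EPOLLIN };
	int inner = epoll_create1(0);
	int outer = epoll_create1(0);

	ev.data.fd = fd;
	epoll_ctl(inner, EPOLL_CTL_ADD, fd, &ev);

	ev.data.fd = inner;
	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

	return outer;
}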
@@ -991,6 +1093,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
         */
        ep_rbtree_insert(ep, epi);
 
+       /* now check if we've created too many backpaths */
+       error = -EINVAL;
+       if (reverse_path_check())
+               goto error_remove_epi;
+
        /* We have to drop the new item inside our item list to keep track of it */
        spin_lock_irqsave(&ep->lock, flags);
 
@@ -1015,6 +1122,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
        return 0;
 
+error_remove_epi:
+       spin_lock(&tfile->f_lock);
+       if (ep_is_linked(&epi->fllink))
+               list_del_init(&epi->fllink);
+       spin_unlock(&tfile->f_lock);
+
+       rb_erase(&epi->rbn, &ep->rbr);
+
 error_unregister:
        ep_unregister_pollwait(ep, epi);
 
@@ -1287,18 +1402,35 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
        int error = 0;
        struct file *file = priv;
        struct eventpoll *ep = file->private_data;
+       struct eventpoll *ep_tovisit;
        struct rb_node *rbp;
        struct epitem *epi;
 
        mutex_lock_nested(&ep->mtx, call_nests + 1);
+       ep->visited = 1;
+       list_add(&ep->visitedllink, &visited_list);
        for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
                epi = rb_entry(rbp, struct epitem, rbn);
                if (unlikely(is_file_epoll(epi->ffd.file))) {
+                       ep_tovisit = epi->ffd.file->private_data;
+                       if (ep_tovisit->visited)
+                               continue;
                        error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
-                                              ep_loop_check_proc, epi->ffd.file,
-                                              epi->ffd.file->private_data, current);
+                                       ep_loop_check_proc, epi->ffd.file,
+                                       ep_tovisit, current);
                        if (error != 0)
                                break;
+               } else {
+                       /* if we've reached a file that is not associated with
+                        * an ep, then we need to check if the newly added
+                        * links are going to add too many wakeup paths. We do
+                        * this by adding it to the tfile_check_list, if it's
+                        * not already there, and calling reverse_path_check()
+                        * during ep_insert()
+                        */
+                       if (list_empty(&epi->ffd.file->f_tfile_llink))
+                               list_add(&epi->ffd.file->f_tfile_llink,
+                                        &tfile_check_list);
                }
        }
        mutex_unlock(&ep->mtx);
@@ -1319,8 +1451,30 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
  */
 static int ep_loop_check(struct eventpoll *ep, struct file *file)
 {
-       return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+       int ret;
+       struct eventpoll *ep_cur, *ep_next;
+
+       ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
                              ep_loop_check_proc, file, ep, current);
+       /* clear visited list */
+       list_for_each_entry_safe(ep_cur, ep_next, &visited_list, visitedllink) {
+               ep_cur->visited = 0;
+               list_del(&ep_cur->visitedllink);
+       }
+       return ret;
+}
+
+static void clear_tfile_check_list(void)
+{
+       struct file *file;
+
+       /* first clear the tfile_check_list */
+       while (!list_empty(&tfile_check_list)) {
+               file = list_first_entry(&tfile_check_list, struct file,
+                                       f_tfile_llink);
+               list_del_init(&file->f_tfile_llink);
+       }
+       INIT_LIST_HEAD(&tfile_check_list);
 }
 
 /*
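
For completeness, the cycle that ep_loop_check() (now with the visited-list optimisation) rejects can be provoked from userspace like this; the second EPOLL_CTL_ADD is expected to fail with -ELOOP (illustrative fragment):

#include <sys/epoll.h>

static void make_epoll_loop(void)
{
	struct epoll_event ev = { .events = EPOLLIN };
	int a = epoll_create1(0);
	int b = epoll_create1(0);

	ev.data.fd = b;
	epoll_ctl(a, EPOLL_CTL_ADD, b, &ev);	/* a watches b: allowed */

	ev.data.fd = a;
	epoll_ctl(b, EPOLL_CTL_ADD, a, &ev);	/* would close a loop: -ELOOP */
}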
@@ -1328,8 +1482,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
  */
 SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
-       int error;
+       int error, fd;
        struct eventpoll *ep = NULL;
+       struct file *file;
 
        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1346,11 +1501,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
         * Creates all the items needed to setup an eventpoll file. That is,
         * a file structure and a free file descriptor.
         */
-       error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+       fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
+       if (fd < 0) {
+               error = fd;
+               goto out_free_ep;
+       }
+       file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                                 O_RDWR | (flags & O_CLOEXEC));
-       if (error < 0)
-               ep_free(ep);
-
+       if (IS_ERR(file)) {
+               error = PTR_ERR(file);
+               goto out_free_fd;
+       }
+       fd_install(fd, file);
+       ep->file = file;
+       return fd;
+
+out_free_fd:
+       put_unused_fd(fd);
+out_free_ep:
+       ep_free(ep);
        return error;
 }
 
@@ -1416,21 +1585,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        /*
         * When we insert an epoll file descriptor, inside another epoll file
         * descriptor, there is the change of creating closed loops, which are
-        * better be handled here, than in more critical paths.
+        * better be handled here, than in more critical paths. While we are
+        * checking for loops we also determine the list of files reachable
+        * and hang them on the tfile_check_list, so we can check that we
+        * haven't created too many possible wakeup paths.
         *
-        * We hold epmutex across the loop check and the insert in this case, in
-        * order to prevent two separate inserts from racing and each doing the
-        * insert "at the same time" such that ep_loop_check passes on both
-        * before either one does the insert, thereby creating a cycle.
+        * We need to hold the epmutex across both ep_insert and ep_remove
+        * because we want to make sure we are looking at a coherent view of
+        * the epoll network.
         */
-       if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+       if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
                mutex_lock(&epmutex);
                did_lock_epmutex = 1;
-               error = -ELOOP;
-               if (ep_loop_check(ep, tfile) != 0)
-                       goto error_tgt_fput;
        }
-
+       if (op == EPOLL_CTL_ADD) {
+               if (is_file_epoll(tfile)) {
+                       error = -ELOOP;
+                       if (ep_loop_check(ep, tfile) != 0)
+                               goto error_tgt_fput;
+               } else
+                       list_add(&tfile->f_tfile_llink, &tfile_check_list);
+       }
 
        mutex_lock_nested(&ep->mtx, 0);
 
@@ -1449,6 +1624,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                        error = ep_insert(ep, &epds, tfile, fd);
                } else
                        error = -EEXIST;
+               clear_tfile_check_list();
                break;
        case EPOLL_CTL_DEL:
                if (epi)
@@ -1467,7 +1643,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
-       if (unlikely(did_lock_epmutex))
+       if (did_lock_epmutex)
                mutex_unlock(&epmutex);
 
        fput(tfile);
index ca141db681fef2612d4994abb1114aa16fc6233f..fd0bfbddd4ea7d30b89bcc0c8a5d32a80f53ebfc 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,6 +59,8 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
+
+#include <trace/events/task.h>
 #include "internal.h"
 
 int core_uses_pid;
@@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 {
        task_lock(tsk);
 
+       trace_task_rename(tsk, buf);
+
        /*
         * Threads may access current->comm without holding
         * the task lock, so write the string carefully.
index 5b0e26a1272dcc6c6573463767f8563ec683e6e3..41c9ef17acf84b6b7fb2328b5b7f214c7034aad9 100644 (file)
@@ -957,12 +957,13 @@ struct ext4_inode_info {
 #define test_opt2(sb, opt)             (EXT4_SB(sb)->s_mount_opt2 & \
                                         EXT4_MOUNT2_##opt)
 
-#define ext4_set_bit                   __test_and_set_bit_le
+#define ext4_test_and_set_bit          __test_and_set_bit_le
+#define ext4_set_bit                   __set_bit_le
 #define ext4_set_bit_atomic            ext2_set_bit_atomic
-#define ext4_clear_bit                 __test_and_clear_bit_le
+#define ext4_test_and_clear_bit                __test_and_clear_bit_le
+#define ext4_clear_bit                 __clear_bit_le
 #define ext4_clear_bit_atomic          ext2_clear_bit_atomic
 #define ext4_test_bit                  test_bit_le
-#define ext4_find_first_zero_bit       find_first_zero_bit_le
 #define ext4_find_next_zero_bit                find_next_zero_bit_le
 #define ext4_find_next_bit             find_next_bit_le
 
index 00beb4f9cc4ff0501012b8bdf338541fcb5e5a69..e4f356f0780d343ed5b0bd1719da63e4d7efc9e5 100644 (file)
@@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                fatal = ext4_journal_get_write_access(handle, bh2);
        }
        ext4_lock_group(sb, block_group);
-       cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+       cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
        if (fatal || !cleared) {
                ext4_unlock_group(sb, block_group);
                goto out;
@@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb,
         */
        down_read(&grp->alloc_sem);
        ext4_lock_group(sb, group);
-       if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+       if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
                /* not a free inode */
                retval = 1;
                goto err_ret;
index c322794f7360c2065edc871a0fa91f2333ed6423..6f19cf549a6e27ea7782c2cd294af04a8e15a23f 100644 (file)
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
        .max_files = NR_FILE
 };
 
-DECLARE_LGLOCK(files_lglock);
 DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
@@ -422,9 +421,9 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
  */
 void file_sb_list_add(struct file *file, struct super_block *sb)
 {
-       lg_local_lock(files_lglock);
+       lg_local_lock(&files_lglock);
        __file_sb_list_add(file, sb);
-       lg_local_unlock(files_lglock);
+       lg_local_unlock(&files_lglock);
 }
 
 /**
@@ -437,9 +436,9 @@ void file_sb_list_add(struct file *file, struct super_block *sb)
 void file_sb_list_del(struct file *file)
 {
        if (!list_empty(&file->f_u.fu_list)) {
-               lg_local_lock_cpu(files_lglock, file_list_cpu(file));
+               lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
                list_del_init(&file->f_u.fu_list);
-               lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
+               lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
        }
 }
 
@@ -478,7 +477,7 @@ int fs_may_remount_ro(struct super_block *sb)
 {
        struct file *file;
        /* Check that no files are currently opened for writing. */
-       lg_global_lock(files_lglock);
+       lg_global_lock(&files_lglock);
        do_file_list_for_each_entry(sb, file) {
                struct inode *inode = file->f_path.dentry->d_inode;
 
@@ -490,10 +489,10 @@ int fs_may_remount_ro(struct super_block *sb)
                if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
                        goto too_bad;
        } while_file_list_for_each_entry;
-       lg_global_unlock(files_lglock);
+       lg_global_unlock(&files_lglock);
        return 1; /* Tis' cool bro. */
 too_bad:
-       lg_global_unlock(files_lglock);
+       lg_global_unlock(&files_lglock);
        return 0;
 }
 
@@ -509,7 +508,7 @@ void mark_files_ro(struct super_block *sb)
        struct file *f;
 
 retry:
-       lg_global_lock(files_lglock);
+       lg_global_lock(&files_lglock);
        do_file_list_for_each_entry(sb, f) {
                struct vfsmount *mnt;
                if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -526,12 +525,12 @@ retry:
                file_release_write(f);
                mnt = mntget(f->f_path.mnt);
                /* This can sleep, so we can't hold the spinlock. */
-               lg_global_unlock(files_lglock);
+               lg_global_unlock(&files_lglock);
                mnt_drop_write(mnt);
                mntput(mnt);
                goto retry;
        } while_file_list_for_each_entry;
-       lg_global_unlock(files_lglock);
+       lg_global_unlock(&files_lglock);
 }
 
 void __init files_init(unsigned long mempages)
@@ -549,6 +548,6 @@ void __init files_init(unsigned long mempages)
        n = (mempages * (PAGE_SIZE / 1024)) / 10;
        files_stat.max_files = max_t(unsigned long, n, NR_FILE);
        files_defer_init();
-       lg_lock_init(files_lglock);
+       lg_lock_init(&files_lglock, "files_lglock");
        percpu_counter_init(&nr_files, 0);
 } 
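
These hunks (and the vfsmount_lock churn elsewhere in this merge) track the new lglock calling convention, where the lock is an ordinary global passed by address rather than a token pasted into per-cpu symbol names. Roughly, the updated usage (sketch, assuming the reworked <linux/lglock.h> from this series):

#include <linux/lglock.h>

DEFINE_LGLOCK(my_lglock);

static void my_init(void)
{
	lg_lock_init(&my_lglock, "my_lglock");
}

static void my_fast_path(void)
{
	lg_local_lock(&my_lglock);	/* this CPU's lock only: cheap */
	/* ... touch per-cpu data ... */
	lg_local_unlock(&my_lglock);
}

static void my_slow_path(void)
{
	lg_global_lock(&my_lglock);	/* every CPU's lock: rare, expensive */
	/* ... walk all CPUs' data ... */
	lg_global_unlock(&my_lglock);
}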
index ee4e66b998f40d170b1822db68875df73d445db6..1f6c48d59a0dfe8402dc59f484e3ca0fca3cd08a 100644 (file)
@@ -692,6 +692,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
        else
                __count_vm_events(PGINODESTEAL, reap);
        spin_unlock(&sb->s_inode_lru_lock);
+       if (current->reclaim_state)
+               current->reclaim_state->reclaimed_slab += reap;
 
        dispose_list(&freeable);
 }
index fe327c20af8372b4f3ce5c7e42e369b81ab440ba..6e7fe58a76c0b5dd2819789e18607962b113df0b 100644 (file)
@@ -77,8 +77,7 @@ extern void mnt_make_shortterm(struct vfsmount *);
 
 extern void __init mnt_init(void);
 
-DECLARE_BRLOCK(vfsmount_lock);
-
+extern struct lglock vfsmount_lock;
 
 /*
  * fs_struct.c
index fdfae9fa98cda52f26b730ab6786db084124d279..643e9f55ef297860cbbd52ab48d548e578191846 100644 (file)
@@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
        sector_t last_block_in_bio = 0;
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-       struct blk_plug plug;
-
-       blk_start_plug(&plug);
 
        map_bh.b_state = 0;
        map_bh.b_size = 0;
@@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
        BUG_ON(!list_empty(pages));
        if (bio)
                mpage_bio_submit(READ, bio);
-       blk_finish_plug(&plug);
        return 0;
 }
 EXPORT_SYMBOL(mpage_readpages);
index c02b2c66213146ad9c7baee2313e0429a5fc70c2..a9a7166c5e5037b276171ce62498a8b0e876c998 100644 (file)
@@ -463,7 +463,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
        mntget(nd->path.mnt);
 
        rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        nd->flags &= ~LOOKUP_RCU;
        return 0;
 
@@ -521,14 +521,14 @@ static int complete_walk(struct nameidata *nd)
                if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
                        spin_unlock(&dentry->d_lock);
                        rcu_read_unlock();
-                       br_read_unlock(vfsmount_lock);
+                       br_read_unlock(&vfsmount_lock);
                        return -ECHILD;
                }
                BUG_ON(nd->inode != dentry->d_inode);
                spin_unlock(&dentry->d_lock);
                mntget(nd->path.mnt);
                rcu_read_unlock();
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
        }
 
        if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -693,15 +693,15 @@ int follow_up(struct path *path)
        struct vfsmount *parent;
        struct dentry *mountpoint;
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        parent = path->mnt->mnt_parent;
        if (parent == path->mnt) {
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                return 0;
        }
        mntget(parent);
        mountpoint = dget(path->mnt->mnt_mountpoint);
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
@@ -833,7 +833,7 @@ static int follow_managed(struct path *path, unsigned flags)
                        /* Something is mounted on this dentry in another
                         * namespace and/or whatever was mounted there in this
                         * namespace got unmounted before we managed to get the
-                        * vfsmount_lock */
+                        * &vfsmount_lock */
                }
 
                /* Handle an automount point */
@@ -959,7 +959,7 @@ failed:
        if (!(nd->flags & LOOKUP_ROOT))
                nd->root.mnt = NULL;
        rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        return -ECHILD;
 }
 
@@ -1253,7 +1253,7 @@ static void terminate_walk(struct nameidata *nd)
                if (!(nd->flags & LOOKUP_ROOT))
                        nd->root.mnt = NULL;
                rcu_read_unlock();
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
        }
 }
 
@@ -1487,7 +1487,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                nd->path = nd->root;
                nd->inode = inode;
                if (flags & LOOKUP_RCU) {
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                        rcu_read_lock();
                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
@@ -1500,7 +1500,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
        if (*name=='/') {
                if (flags & LOOKUP_RCU) {
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                        rcu_read_lock();
                        set_root_rcu(nd);
                } else {
@@ -1513,7 +1513,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                        struct fs_struct *fs = current->fs;
                        unsigned seq;
 
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                        rcu_read_lock();
 
                        do {
@@ -1549,7 +1549,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                        if (fput_needed)
                                *fp = file;
                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                        rcu_read_lock();
                } else {
                        path_get(&file->f_path);
index dd4d1f4021f13ff577bbcad55561f0dc49596bc4..5c07af8cd87cea98ad33cefb095184dfab8447ae 100644 (file)
@@ -419,7 +419,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
        int ret = 0;
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        mnt->mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -453,15 +453,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
         */
        smp_wmb();
        mnt->mnt_flags &= ~MNT_WRITE_HOLD;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        return ret;
 }
 
 static void __mnt_unmake_readonly(struct vfsmount *mnt)
 {
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        mnt->mnt_flags &= ~MNT_READONLY;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 }
 
 static void free_vfsmnt(struct vfsmount *mnt)
@@ -508,10 +508,10 @@ struct vfsmount *lookup_mnt(struct path *path)
 {
        struct vfsmount *child_mnt;
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
                mntget(child_mnt);
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        return child_mnt;
 }
 
@@ -776,34 +776,34 @@ static void mntput_no_expire(struct vfsmount *mnt)
 {
 put_again:
 #ifdef CONFIG_SMP
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        if (likely(atomic_read(&mnt->mnt_longterm))) {
                mnt_dec_count(mnt);
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                return;
        }
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        mnt_dec_count(mnt);
        if (mnt_get_count(mnt)) {
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                return;
        }
 #else
        mnt_dec_count(mnt);
        if (likely(mnt_get_count(mnt)))
                return;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
 #endif
        if (unlikely(mnt->mnt_pinned)) {
                mnt_add_count(mnt, mnt->mnt_pinned + 1);
                mnt->mnt_pinned = 0;
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                acct_auto_close_mnt(mnt);
                goto put_again;
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        mntfree(mnt);
 }
 
@@ -828,20 +828,20 @@ EXPORT_SYMBOL(mntget);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        mnt->mnt_pinned++;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *mnt)
 {
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        if (mnt->mnt_pinned) {
                mnt_inc_count(mnt);
                mnt->mnt_pinned--;
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
 
@@ -931,12 +931,12 @@ int mnt_had_events(struct proc_mounts *p)
        struct mnt_namespace *ns = p->ns;
        int res = 0;
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        if (p->m.poll_event != ns->event) {
                p->m.poll_event = ns->event;
                res = 1;
        }
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
 
        return res;
 }
@@ -1157,12 +1157,12 @@ int may_umount_tree(struct vfsmount *mnt)
        struct vfsmount *p;
 
        /* write lock needed for mnt_get_count */
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                actual_refs += mnt_get_count(p);
                minimum_refs += 2;
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
        if (actual_refs > minimum_refs)
                return 0;
@@ -1189,10 +1189,10 @@ int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
        down_read(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        if (propagate_mount_busy(mnt, 2))
                ret = 0;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_read(&namespace_sem);
        return ret;
 }
@@ -1209,13 +1209,13 @@ void release_mounts(struct list_head *head)
                        struct dentry *dentry;
                        struct vfsmount *m;
 
-                       br_write_lock(vfsmount_lock);
+                       br_write_lock(&vfsmount_lock);
                        dentry = mnt->mnt_mountpoint;
                        m = mnt->mnt_parent;
                        mnt->mnt_mountpoint = mnt->mnt_root;
                        mnt->mnt_parent = mnt;
                        m->mnt_ghosts--;
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                        dput(dentry);
                        mntput(m);
                }
@@ -1281,12 +1281,12 @@ static int do_umount(struct vfsmount *mnt, int flags)
                 * probably don't strictly need the lock here if we examined
                 * all race cases, but it's a slowpath.
                 */
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                if (mnt_get_count(mnt) != 2) {
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                        return -EBUSY;
                }
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
 
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
@@ -1328,7 +1328,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
        }
 
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        event++;
 
        if (!(flags & MNT_DETACH))
@@ -1340,7 +1340,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
                        umount_tree(mnt, 1, &umount_list);
                retval = 0;
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        return retval;
@@ -1452,19 +1452,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
                        q = clone_mnt(p, p->mnt_root, flag);
                        if (!q)
                                goto Enomem;
-                       br_write_lock(vfsmount_lock);
+                       br_write_lock(&vfsmount_lock);
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, &path);
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                }
        }
        return res;
 Enomem:
        if (res) {
                LIST_HEAD(umount_list);
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                umount_tree(res, 0, &umount_list);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                release_mounts(&umount_list);
        }
        return NULL;
@@ -1483,9 +1483,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
        LIST_HEAD(umount_list);
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        umount_tree(mnt, 0, &umount_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
 }
@@ -1613,7 +1613,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
        if (err)
                goto out_cleanup_ids;
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
 
        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1632,7 +1632,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
                list_del_init(&child->mnt_hash);
                commit_tree(child);
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
        return 0;
 
@@ -1729,10 +1729,10 @@ static int do_change_type(struct path *path, int flag)
                        goto out_unlock;
        }
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                change_mnt_propagation(m, type);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
  out_unlock:
        up_write(&namespace_sem);
@@ -1779,9 +1779,9 @@ static int do_loopback(struct path *path, char *old_name,
 
        err = graft_tree(mnt, path);
        if (err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                umount_tree(mnt, 0, &umount_list);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
        }
 out2:
        unlock_mount(path);
@@ -1838,16 +1838,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
        else
                err = do_remount_sb(sb, flags, data, 0);
        if (!err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
                path->mnt->mnt_flags = mnt_flags;
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
        }
        up_write(&sb->s_umount);
        if (!err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                touch_mnt_namespace(path->mnt->mnt_ns);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
        }
        return err;
 }
@@ -2052,9 +2052,9 @@ fail:
        /* remove m from any expiration list it may be on */
        if (!list_empty(&m->mnt_expire)) {
                down_write(&namespace_sem);
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                list_del_init(&m->mnt_expire);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                up_write(&namespace_sem);
        }
        mntput(m);
@@ -2070,11 +2070,11 @@ fail:
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
 
        list_add_tail(&mnt->mnt_expire, expiry_list);
 
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
 }
 EXPORT_SYMBOL(mnt_set_expiry);
@@ -2094,7 +2094,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                return;
 
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
 
        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
@@ -2113,7 +2113,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, 1, &umounts);
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
 
        release_mounts(&umounts);
@@ -2376,9 +2376,9 @@ void mnt_make_shortterm(struct vfsmount *mnt)
 #ifdef CONFIG_SMP
        if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
                return;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        atomic_dec(&mnt->mnt_longterm);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 #endif
 }
 
@@ -2406,9 +2406,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                kfree(new_ns);
                return ERR_PTR(-ENOMEM);
        }
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2647,7 +2647,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                        goto out4;
        } else if (!is_subdir(old.dentry, new.dentry))
                goto out4;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        detach_mnt(new.mnt, &parent_path);
        detach_mnt(root.mnt, &root_parent);
        /* mount old root on put_old */
@@ -2655,7 +2655,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        /* mount new_root on / */
        attach_mnt(new.mnt, &root_parent);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        chroot_fs_refs(&root, &new);
        error = 0;
 out4:
@@ -2718,7 +2718,7 @@ void __init mnt_init(void)
        for (u = 0; u < HASH_SIZE; u++)
                INIT_LIST_HEAD(&mount_hashtable[u]);
 
-       br_lock_init(vfsmount_lock);
+       br_lock_init(&vfsmount_lock);
 
        err = sysfs_init();
        if (err)
@@ -2738,9 +2738,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
        if (!atomic_dec_and_test(&ns->count))
                return;
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        umount_tree(ns->root, 0, &umount_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        kfree(ns);
index 41d6743d303c2a7923e5d09bc1df1fcf41514c29..ac258beeda3c4e3e9a3a36a375619838c8c7a350 100644 (file)
@@ -625,6 +625,9 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
                if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment)
                        goto out_free;
 
+               if (argv[n].v_nmembs >= UINT_MAX / argv[n].v_size)
+                       goto out_free;
+
                len = argv[n].v_size * argv[n].v_nmembs;
                base = (void __user *)(unsigned long)argv[n].v_base;
                if (len == 0) {
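
The new bound check above guards the multiplication on the following line against overflow; in general form (hedged sketch, with -EOVERFLOW chosen only for illustration):

static int checked_mul(unsigned int n, unsigned int size, unsigned int *res)
{
	if (size && n > UINT_MAX / size)
		return -EOVERFLOW;	/* n * size would not fit in 32 bits */
	*res = n * size;
	return 0;
}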
@@ -842,6 +845,19 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        case FS_IOC32_GETVERSION:
                cmd = FS_IOC_GETVERSION;
                break;
+       case NILFS_IOCTL_CHANGE_CPMODE:
+       case NILFS_IOCTL_DELETE_CHECKPOINT:
+       case NILFS_IOCTL_GET_CPINFO:
+       case NILFS_IOCTL_GET_CPSTAT:
+       case NILFS_IOCTL_GET_SUINFO:
+       case NILFS_IOCTL_GET_SUSTAT:
+       case NILFS_IOCTL_GET_VINFO:
+       case NILFS_IOCTL_GET_BDESCS:
+       case NILFS_IOCTL_CLEAN_SEGMENTS:
+       case NILFS_IOCTL_SYNC:
+       case NILFS_IOCTL_RESIZE:
+       case NILFS_IOCTL_SET_ALLOC_RANGE:
+               break;
        default:
                return -ENOIOCTLCMD;
        }
index 2c59dfa838ff9a772781a2c6df870d892f6c81f6..406e8be94cec511b8942f40d62e332b9f61e79df 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
        if (nr_pages < pipe->nrbufs)
                return -EBUSY;
 
-       bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+       bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL | __GFP_NOWARN);
        if (unlikely(!bufs))
                return -ENOMEM;
 
index d42514e32380b5edb38f7985069efe8d8ccc80fa..8b81ea883ad0366e86f9bad476740a272c2852ca 100644 (file)
@@ -273,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
                prev_src_mnt  = child;
        }
 out:
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        while (!list_empty(&tmp_list)) {
                child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
                umount_tree(child, 0, &umount_list);
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        release_mounts(&umount_list);
        return ret;
 }
index 8c344f037bd0195b8fad3a0630190cf32428883c..9252ee3b71e3815c03e3dc399b11a5635297214b 100644 (file)
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
        seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
                pid_nr_ns(pid, ns),
                tcomm,
                state,
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                task->policy,
                (unsigned long long)delayacct_blkio_ticks(task),
                cputime_to_clock_t(gtime),
-               cputime_to_clock_t(cgtime));
+               cputime_to_clock_t(cgtime),
+               (mm && permitted) ? mm->start_data : 0,
+               (mm && permitted) ? mm->end_data : 0,
+               (mm && permitted) ? mm->start_brk : 0);
        if (mm)
                mmput(mm);
        return 0;
index 851ba3dcdc290ab6b750793c2840bdffac755de7..22524d0adfe8a3d05a54780f7d80d7e9b4db6916 100644 (file)
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
+#include <trace/events/oom.h>
 #include "internal.h"
 
 /* NOTE:
@@ -133,6 +135,8 @@ struct pid_entry {
                NULL, &proc_single_file_operations,     \
                { .proc_show = show } )
 
+static int proc_fd_permission(struct inode *inode, int mask);
+
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
@@ -165,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
        return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
 {
-       struct task_struct *task = get_proc_task(inode);
+       struct task_struct *task = get_proc_task(dentry->d_inode);
        int result = -ENOENT;
 
        if (task) {
@@ -182,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
        return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
 {
-       struct task_struct *task = get_proc_task(inode);
+       struct task_struct *task = get_proc_task(dentry->d_inode);
        int result = -ENOENT;
 
        if (task) {
@@ -627,6 +631,50 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
        return 0;
 }
 
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+                                struct task_struct *task,
+                                int hide_pid_min)
+{
+       if (pid->hide_pid < hide_pid_min)
+               return true;
+       if (in_group_p(pid->pid_gid))
+               return true;
+       return ptrace_may_access(task, PTRACE_MODE_READ);
+}
+
+
+static int proc_pid_permission(struct inode *inode, int mask)
+{
+       struct pid_namespace *pid = inode->i_sb->s_fs_info;
+       struct task_struct *task;
+       bool has_perms;
+
+       task = get_proc_task(inode);
+       has_perms = has_pid_permissions(pid, task, 1);
+       put_task_struct(task);
+
+       if (!has_perms) {
+               if (pid->hide_pid == 2) {
+                       /*
+                        * Let's make getdents(), stat(), and open()
+                        * consistent with each other.  If a process
+                        * may not stat() a file, it shouldn't be seen
+                        * in procfs at all.
+                        */
+                       return -ENOENT;
+               }
+
+               return -EPERM;
+       }
+       return generic_permission(inode, mask);
+}
+
+
+
 static const struct inode_operations proc_def_inode_operations = {
        .setattr        = proc_setattr,
 };
@@ -1124,6 +1172,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
        else
                task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
                                                                -OOM_DISABLE;
+       trace_oom_score_adj_update(task);
 err_sighand:
        unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -1211,6 +1260,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
        task->signal->oom_score_adj = oom_score_adj;
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = oom_score_adj;
+       trace_oom_score_adj_update(task);
        /*
         * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
         * always attainable.
@@ -1567,13 +1617,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
        .release        = single_release,
 };
 
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
        struct task_struct *task;
        struct mm_struct *mm;
        struct file *exe_file;
 
-       task = get_proc_task(inode);
+       task = get_proc_task(dentry->d_inode);
        if (!task)
                return -ENOENT;
        mm = get_task_mm(task);
@@ -1603,7 +1653,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
        if (!proc_fd_access_allowed(inode))
                goto out;
 
-       error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+       error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
 out:
        return ERR_PTR(error);
 }
@@ -1642,7 +1692,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
        if (!proc_fd_access_allowed(inode))
                goto out;
 
-       error = PROC_I(inode)->op.proc_get_link(inode, &path);
+       error = PROC_I(inode)->op.proc_get_link(dentry, &path);
        if (error)
                goto out;
 
@@ -1723,6 +1773,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        struct inode *inode = dentry->d_inode;
        struct task_struct *task;
        const struct cred *cred;
+       struct pid_namespace *pid = dentry->d_sb->s_fs_info;
 
        generic_fillattr(inode, stat);
 
@@ -1731,6 +1782,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        stat->gid = 0;
        task = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (task) {
+               if (!has_pid_permissions(pid, task, 2)) {
+                       rcu_read_unlock();
+                       /*
+                        * This doesn't prevent learning whether PID exists,
+                        * it only makes getattr() consistent with readdir().
+                        */
+                       return -ENOENT;
+               }
                if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
                    task_dumpable(task)) {
                        cred = __task_cred(task);
@@ -1934,9 +1993,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
        return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-       return proc_fd_info(inode, path, NULL);
+       return proc_fd_info(dentry->d_inode, path, NULL);
 }
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -2157,6 +2216,351 @@ static const struct file_operations proc_fd_operations = {
        .llseek         = default_llseek,
 };
 
+/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+                            unsigned long *start, unsigned long *end)
+{
+       if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+               return -EINVAL;
+
+       return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+       unsigned long vm_start, vm_end;
+       bool exact_vma_exists = false;
+       struct mm_struct *mm = NULL;
+       struct task_struct *task;
+       const struct cred *cred;
+       struct inode *inode;
+       int status = 0;
+
+       if (nd && nd->flags & LOOKUP_RCU)
+               return -ECHILD;
+
+       if (!capable(CAP_SYS_ADMIN)) {
+               status = -EACCES;
+               goto out_notask;
+       }
+
+       inode = dentry->d_inode;
+       task = get_proc_task(inode);
+       if (!task)
+               goto out_notask;
+
+       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+               goto out;
+
+       mm = get_task_mm(task);
+       if (!mm)
+               goto out;
+
+       if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+               down_read(&mm->mmap_sem);
+               exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+               up_read(&mm->mmap_sem);
+       }
+
+       mmput(mm);
+
+       if (exact_vma_exists) {
+               if (task_dumpable(task)) {
+                       rcu_read_lock();
+                       cred = __task_cred(task);
+                       inode->i_uid = cred->euid;
+                       inode->i_gid = cred->egid;
+                       rcu_read_unlock();
+               } else {
+                       inode->i_uid = 0;
+                       inode->i_gid = 0;
+               }
+               security_task_to_inode(task, inode);
+               status = 1;
+       }
+
+out:
+       put_task_struct(task);
+
+out_notask:
+       if (status <= 0)
+               d_drop(dentry);
+
+       return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+       .d_revalidate   = map_files_d_revalidate,
+       .d_delete       = pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+       unsigned long vm_start, vm_end;
+       struct vm_area_struct *vma;
+       struct task_struct *task;
+       struct mm_struct *mm;
+       int rc;
+
+       rc = -ENOENT;
+       task = get_proc_task(dentry->d_inode);
+       if (!task)
+               goto out;
+
+       mm = get_task_mm(task);
+       put_task_struct(task);
+       if (!mm)
+               goto out;
+
+       rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+       if (rc)
+               goto out_mmput;
+
+       down_read(&mm->mmap_sem);
+       vma = find_exact_vma(mm, vm_start, vm_end);
+       if (vma && vma->vm_file) {
+               *path = vma->vm_file->f_path;
+               path_get(path);
+               rc = 0;
+       }
+       up_read(&mm->mmap_sem);
+
+out_mmput:
+       mmput(mm);
+out:
+       return rc;
+}
+
+struct map_files_info {
+       struct file     *file;
+       unsigned long   len;
+       unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+                          struct task_struct *task, const void *ptr)
+{
+       const struct file *file = ptr;
+       struct proc_inode *ei;
+       struct inode *inode;
+
+       if (!file)
+               return ERR_PTR(-ENOENT);
+
+       inode = proc_pid_make_inode(dir->i_sb, task);
+       if (!inode)
+               return ERR_PTR(-ENOENT);
+
+       ei = PROC_I(inode);
+       ei->op.proc_get_link = proc_map_files_get_link;
+
+       inode->i_op = &proc_pid_link_inode_operations;
+       inode->i_size = 64;
+       inode->i_mode = S_IFLNK;
+
+       if (file->f_mode & FMODE_READ)
+               inode->i_mode |= S_IRUSR;
+       if (file->f_mode & FMODE_WRITE)
+               inode->i_mode |= S_IWUSR;
+
+       d_set_d_op(dentry, &tid_map_files_dentry_operations);
+       d_add(dentry, inode);
+
+       return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+               struct dentry *dentry, struct nameidata *nd)
+{
+       unsigned long vm_start, vm_end;
+       struct vm_area_struct *vma;
+       struct task_struct *task;
+       struct dentry *result;
+       struct mm_struct *mm;
+
+       result = ERR_PTR(-EACCES);
+       if (!capable(CAP_SYS_ADMIN))
+               goto out;
+
+       result = ERR_PTR(-ENOENT);
+       task = get_proc_task(dir);
+       if (!task)
+               goto out;
+
+       result = ERR_PTR(-EACCES);
+       if (lock_trace(task))
+               goto out_put_task;
+
+       result = ERR_PTR(-ENOENT);
+       if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+               goto out_unlock;
+
+       mm = get_task_mm(task);
+       if (!mm)
+               goto out_unlock;
+
+       down_read(&mm->mmap_sem);
+       vma = find_exact_vma(mm, vm_start, vm_end);
+       if (!vma)
+               goto out_no_vma;
+
+       result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+out_unlock:
+       unlock_trace(task);
+out_put_task:
+       put_task_struct(task);
+out:
+       return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+       .lookup         = proc_map_files_lookup,
+       .permission     = proc_fd_permission,
+       .setattr        = proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+       struct dentry *dentry = filp->f_path.dentry;
+       struct inode *inode = dentry->d_inode;
+       struct vm_area_struct *vma;
+       struct task_struct *task;
+       struct mm_struct *mm;
+       ino_t ino;
+       int ret;
+
+       ret = -EACCES;
+       if (!capable(CAP_SYS_ADMIN))
+               goto out;
+
+       ret = -ENOENT;
+       task = get_proc_task(inode);
+       if (!task)
+               goto out;
+
+       ret = -EACCES;
+       if (lock_trace(task))
+               goto out_put_task;
+
+       ret = 0;
+       switch (filp->f_pos) {
+       case 0:
+               ino = inode->i_ino;
+               if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+                       goto out_unlock;
+               filp->f_pos++;
+       case 1:
+               ino = parent_ino(dentry);
+               if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+                       goto out_unlock;
+               filp->f_pos++;
+       default:
+       {
+               unsigned long nr_files, pos, i;
+               struct flex_array *fa = NULL;
+               struct map_files_info info;
+               struct map_files_info *p;
+
+               mm = get_task_mm(task);
+               if (!mm)
+                       goto out_unlock;
+               down_read(&mm->mmap_sem);
+
+               nr_files = 0;
+
+               /*
+                * We need two passes here:
+                *
+                *  1) Collect vmas of mapped files with mmap_sem taken
+                *  2) Release mmap_sem and instantiate entries
+                *
+                * otherwise lockdep complains, since the filldir()
+                * routine might take mmap_sem in might_fault().
+                */
+
+               for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+                       if (vma->vm_file && ++pos > filp->f_pos)
+                               nr_files++;
+               }
+
+               if (nr_files) {
+                       fa = flex_array_alloc(sizeof(info), nr_files,
+                                               GFP_KERNEL);
+                       if (!fa || flex_array_prealloc(fa, 0, nr_files,
+                                                       GFP_KERNEL)) {
+                               ret = -ENOMEM;
+                               if (fa)
+                                       flex_array_free(fa);
+                               up_read(&mm->mmap_sem);
+                               mmput(mm);
+                               goto out_unlock;
+                       }
+                       for (i = 0, vma = mm->mmap, pos = 2; vma;
+                                       vma = vma->vm_next) {
+                               if (!vma->vm_file)
+                                       continue;
+                               if (++pos <= filp->f_pos)
+                                       continue;
+
+                               get_file(vma->vm_file);
+                               info.file = vma->vm_file;
+                               info.len = snprintf(info.name,
+                                               sizeof(info.name), "%lx-%lx",
+                                               vma->vm_start, vma->vm_end);
+                               if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+                                       BUG();
+                       }
+               }
+               up_read(&mm->mmap_sem);
+
+               for (i = 0; i < nr_files; i++) {
+                       p = flex_array_get(fa, i);
+                       ret = proc_fill_cache(filp, dirent, filldir,
+                                             p->name, p->len,
+                                             proc_map_files_instantiate,
+                                             task, p->file);
+                       if (ret)
+                               break;
+                       filp->f_pos++;
+                       fput(p->file);
+               }
+               for (; i < nr_files; i++) {
+                       /*
+                        * In case of an error, don't forget
+                        * to put the remaining file refs.
+                        */
+                       p = flex_array_get(fa, i);
+                       fput(p->file);
+               }
+               if (fa)
+                       flex_array_free(fa);
+               mmput(mm);
+       }
+       }
+
+out_unlock:
+       unlock_trace(task);
+out_put_task:
+       put_task_struct(task);
+out:
+       return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+       .read           = generic_read_dir,
+       .readdir        = proc_map_files_readdir,
+       .llseek         = default_llseek,
+};
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
@@ -2772,6 +3176,7 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+       DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
@@ -2875,6 +3280,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
        .lookup         = proc_tgid_base_lookup,
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
+       .permission     = proc_pid_permission,
 };
 
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -3078,6 +3484,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
                                proc_pid_instantiate, iter.task, NULL);
 }
 
+static int fake_filldir(void *buf, const char *name, int namelen,
+                       loff_t offset, u64 ino, unsigned d_type)
+{
+       return 0;
+}
+
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
@@ -3085,6 +3497,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
        struct task_struct *reaper;
        struct tgid_iter iter;
        struct pid_namespace *ns;
+       filldir_t __filldir;
 
        if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
                goto out_no_task;
@@ -3106,8 +3519,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
        for (iter = next_tgid(ns, iter);
             iter.task;
             iter.tgid += 1, iter = next_tgid(ns, iter)) {
+               if (has_pid_permissions(ns, iter.task, 2))
+                       __filldir = filldir;
+               else
+                       __filldir = fake_filldir;
+
                filp->f_pos = iter.tgid + TGID_OFFSET;
-               if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+               if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
                        put_task_struct(iter.task);
                        goto out;
                }
@@ -3442,6 +3860,7 @@ static const struct inode_operations proc_task_inode_operations = {
        .lookup         = proc_task_lookup,
        .getattr        = proc_task_getattr,
        .setattr        = proc_setattr,
+       .permission     = proc_pid_permission,
 };
 
 static const struct file_operations proc_task_operations = {
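
The map_files code above relies on find_exact_vma(), which is introduced elsewhere in this series and not shown in this excerpt. A plausible sketch of its expected semantics (return the VMA only when its boundaries match the requested range exactly):

    #include <linux/mm.h>

    static inline struct vm_area_struct *
    find_exact_vma(struct mm_struct *mm, unsigned long vm_start,
                   unsigned long vm_end)
    {
            struct vm_area_struct *vma = find_vma(mm, vm_start);

            if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
                    vma = NULL;

            return vma;
    }
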
index 3f8c1433565716d9550388dc0870dd3b8c3efa8f..20dde8d267372b1b511cf08772200374cd3e5c45 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
+#include <linux/pid_namespace.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
@@ -17,7 +18,9 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
+#include <linux/mount.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -104,12 +107,27 @@ void __init proc_init_inodecache(void)
                                             init_once);
 }
 
+static int proc_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+       struct super_block *sb = vfs->mnt_sb;
+       struct pid_namespace *pid = sb->s_fs_info;
+
+       if (pid->pid_gid)
+               seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid);
+       if (pid->hide_pid != 0)
+               seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+
+       return 0;
+}
+
 static const struct super_operations proc_sops = {
        .alloc_inode    = proc_alloc_inode,
        .destroy_inode  = proc_destroy_inode,
        .drop_inode     = generic_delete_inode,
        .evict_inode    = proc_evict_inode,
        .statfs         = simple_statfs,
+       .remount_fs     = proc_remount,
+       .show_options   = proc_show_options,
 };
 
 static void __pde_users_dec(struct proc_dir_entry *pde)
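
With .remount_fs and .show_options wired into proc_sops, non-default gid= and hidepid= settings survive a remount and become visible to userspace. An illustrative /proc/mounts line (the values are examples only):

    proc /proc proc rw,relatime,gid=1001,hidepid=2 0 0
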
index 7838e5cfec145d4655d4af0e291a8d8bebad91cd..292577531ad13e665bfd94ff63dbfd5a206a5472 100644 (file)
@@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde);
 
 int proc_fill_super(struct super_block *);
 struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+int proc_remount(struct super_block *sb, int *flags, char *data);
 
 /*
  * These are generic /proc routines that use the internal
index 03102d978180eba68469ef01d13ad25465f5b796..46a15d8a29ca74d9ca0a68a57e416108e6b599f3 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
+#include <linux/parser.h>
 
 #include "internal.h"
 
@@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data)
        return err;
 }
 
+enum {
+       Opt_gid, Opt_hidepid, Opt_err,
+};
+
+static const match_table_t tokens = {
+       {Opt_hidepid, "hidepid=%u"},
+       {Opt_gid, "gid=%u"},
+       {Opt_err, NULL},
+};
+
+static int proc_parse_options(char *options, struct pid_namespace *pid)
+{
+       char *p;
+       substring_t args[MAX_OPT_ARGS];
+       int option;
+
+       if (!options)
+               return 1;
+
+       while ((p = strsep(&options, ",")) != NULL) {
+               int token;
+               if (!*p)
+                       continue;
+
+               args[0].to = args[0].from = 0;
+               token = match_token(p, tokens, args);
+               switch (token) {
+               case Opt_gid:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       pid->pid_gid = option;
+                       break;
+               case Opt_hidepid:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       if (option < 0 || option > 2) {
+                               pr_err("proc: hidepid value must be between 0 and 2.\n");
+                               return 0;
+                       }
+                       pid->hide_pid = option;
+                       break;
+               default:
+                       pr_err("proc: unrecognized mount option \"%s\" "
+                              "or missing value\n", p);
+                       return 0;
+               }
+       }
+
+       return 1;
+}
+
+int proc_remount(struct super_block *sb, int *flags, char *data)
+{
+       struct pid_namespace *pid = sb->s_fs_info;
+       return !proc_parse_options(data, pid);
+}
+
 static struct dentry *proc_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
 {
@@ -43,11 +101,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
        struct super_block *sb;
        struct pid_namespace *ns;
        struct proc_inode *ei;
+       char *options;
 
-       if (flags & MS_KERNMOUNT)
+       if (flags & MS_KERNMOUNT) {
                ns = (struct pid_namespace *)data;
-       else
+               options = NULL;
+       } else {
                ns = current->nsproxy->pid_ns;
+               options = data;
+       }
 
        sb = sget(fs_type, proc_test_super, proc_set_super, ns);
        if (IS_ERR(sb))
@@ -55,6 +117,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 
        if (!sb->s_root) {
                sb->s_flags = flags;
+               if (!proc_parse_options(options, ns)) {
+                       deactivate_locked_super(sb);
+                       return ERR_PTR(-EINVAL);
+               }
                err = proc_fill_super(sb);
                if (err) {
                        deactivate_locked_super(sb);
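
proc_parse_options() is called both from proc_mount() (for a fresh mount) and from proc_remount(), and an unknown option or out-of-range hidepid value rejects the operation. A hypothetical userspace sketch of applying the new options, equivalent to "mount -o remount,hidepid=2,gid=1001 /proc" (the gid is an arbitrary example and CAP_SYS_ADMIN is required):

    #include <sys/mount.h>
    #include <stdio.h>

    int main(void)
    {
            if (mount("proc", "/proc", "proc", MS_REMOUNT,
                      "hidepid=2,gid=1001") < 0) {
                    perror("remount /proc");
                    return 1;
            }
            return 0;
    }
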
index d1aca1df4f9208ec706d1b7228b68ae32c17ce96..5bab9216cfd00717b8814a29ae5ea71d98e2f760 100644 (file)
@@ -1273,10 +1273,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
        struct reiserfs_bitmap_info *bitmap;
        unsigned int bmap_nr = reiserfs_bmap_count(sb);
 
-       /* Avoid lock recursion in fault case */
-       reiserfs_write_unlock(sb);
        bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
-       reiserfs_write_lock(sb);
        if (bitmap == NULL)
                return -ENOMEM;
 
index eb711060a6f2b771d4b4e68613ea4890273633dd..c3cf54fd4de327c343c0964488a89e8056665062 100644 (file)
@@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        char b[BDEVNAME_SIZE];
        int ret;
 
-       /*
-        * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
-        * dependency inversion warnings.
-        */
-       reiserfs_write_unlock(sb);
        journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
        if (!journal) {
                reiserfs_warning(sb, "journal-1256",
                                 "unable to get memory for journal structure");
-               reiserfs_write_lock(sb);
                return 1;
        }
        INIT_LIST_HEAD(&journal->j_bitmap_nodes);
@@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        INIT_LIST_HEAD(&journal->j_working_list);
        INIT_LIST_HEAD(&journal->j_journal_list);
        journal->j_persistent_trans = 0;
-       ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
-                                          reiserfs_bmap_count(sb));
-       reiserfs_write_lock(sb);
-       if (ret)
+       if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
+                                          reiserfs_bmap_count(sb)))
                goto free_and_return;
 
        allocate_bitmap_nodes(sb);
@@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
                goto free_and_return;
        }
 
-       /*
-        * We need to unlock here to avoid creating the following
-        * dependency:
-        * reiserfs_lock -> sysfs_mutex
-        * Because the reiserfs mmap path creates the following dependency:
-        * mm->mmap -> reiserfs_lock, hence we have
-        * mm->mmap -> reiserfs_lock ->sysfs_mutex
-        * This would ends up in a circular dependency with sysfs readdir path
-        * which does sysfs_mutex -> mm->mmap_sem
-        * This is fine because the reiserfs lock is useless in mount path,
-        * at least until we call journal_begin. We keep it for paranoid
-        * reasons.
-        */
-       reiserfs_write_unlock(sb);
        if (journal_init_dev(sb, journal, j_dev_name) != 0) {
-               reiserfs_write_lock(sb);
                reiserfs_warning(sb, "sh-462",
                                 "unable to initialize jornal device");
                goto free_and_return;
        }
-       reiserfs_write_lock(sb);
 
        rs = SB_DISK_SUPER_BLOCK(sb);
 
@@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        journal->j_mount_id = 10;
        journal->j_state = 0;
        atomic_set(&(journal->j_jlock), 0);
-       reiserfs_write_unlock(sb);
        journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
-       reiserfs_write_lock(sb);
        journal->j_cnode_free_orig = journal->j_cnode_free_list;
        journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
        journal->j_cnode_used = 0;
@@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 
        init_journal_hash(sb);
        jl = journal->j_current_jl;
+
+       /*
+        * get_list_bitmap() may call flush_commit_list() which
+        * requires the lock. Calling flush_commit_list() shouldn't happen
+        * this early but I like to be paranoid.
+        */
+       reiserfs_write_lock(sb);
        jl->j_list_bitmap = get_list_bitmap(sb, jl);
+       reiserfs_write_unlock(sb);
        if (!jl->j_list_bitmap) {
                reiserfs_warning(sb, "journal-2005",
                                 "get_list_bitmap failed for journal list 0");
                goto free_and_return;
        }
-       if (journal_read(sb) < 0) {
+
+       /*
+        * Journal_read needs to be inspected in order to push down
+        * the lock further inside (or even remove it).
+        */
+       reiserfs_write_lock(sb);
+       ret = journal_read(sb);
+       reiserfs_write_unlock(sb);
+       if (ret < 0) {
                reiserfs_warning(sb, "reiserfs-2006",
                                 "Replay Failure, unable to mount");
                goto free_and_return;
        }
 
        reiserfs_mounted_fs_count++;
-       if (reiserfs_mounted_fs_count <= 1) {
-               reiserfs_write_unlock(sb);
+       if (reiserfs_mounted_fs_count <= 1)
                commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
-               reiserfs_write_lock(sb);
-       }
 
        INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
        journal->j_work_sb = sb;
@@ -2896,14 +2883,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
            journal->j_cnode_free < (journal->j_trans_max * 3)) {
                return 1;
        }
-       /* protected by the BKL here */
+
        journal->j_len_alloc += new_alloc;
        th->t_blocks_allocated += new_alloc ;
        return 0;
 }
 
-/* this must be called inside a transaction, and requires the
-** kernel_lock to be held
+/* this must be called inside a transaction
 */
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
 {
@@ -2914,8 +2900,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
        return;
 }
 
-/* this must be called without a transaction started, and does not
-** require BKL
+/* this must be called without a transaction started
 */
 void reiserfs_allow_writes(struct super_block *s)
 {
@@ -2924,8 +2909,7 @@ void reiserfs_allow_writes(struct super_block *s)
        wake_up(&journal->j_join_wait);
 }
 
-/* this must be called without a transaction started, and does not
-** require BKL
+/* this must be called without a transaction started
 */
 void reiserfs_wait_on_write_block(struct super_block *s)
 {
index 14363b96b6afafa74a80c2c66553c8d914064268..7cd93139e259ac15b69146a1d95bce7e2294f6ad 100644 (file)
@@ -1428,9 +1428,7 @@ static int read_super_block(struct super_block *s, int offset)
 static int reread_meta_blocks(struct super_block *s)
 {
        ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
-       reiserfs_write_unlock(s);
        wait_on_buffer(SB_BUFFER_WITH_SB(s));
-       reiserfs_write_lock(s);
        if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
                reiserfs_warning(s, "reiserfs-2504", "error reading the super");
                return 1;
@@ -1655,22 +1653,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        mutex_init(&REISERFS_SB(s)->lock);
        REISERFS_SB(s)->lock_depth = -1;
 
-       /*
-        * This function is called with the bkl, which also was the old
-        * locking used here.
-        * do_journal_begin() will soon check if we hold the lock (ie: was the
-        * bkl). This is likely because do_journal_begin() has several another
-        * callers because at this time, it doesn't seem to be necessary to
-        * protect against anything.
-        * Anyway, let's be conservative and lock for now.
-        */
-       reiserfs_write_lock(s);
-
        jdev_name = NULL;
        if (reiserfs_parse_options
            (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
             &commit_max_age, qf_names, &qfmt) == 0) {
-               goto error;
+               goto error_unlocked;
        }
 #ifdef CONFIG_QUOTA
        handle_quota_files(s, qf_names, &qfmt);
@@ -1678,7 +1665,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 
        if (blocks) {
                SWARN(silent, s, "jmacd-7", "resize option for remount only");
-               goto error;
+               goto error_unlocked;
        }
 
        /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
@@ -1688,7 +1675,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
                SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
                      reiserfs_bdevname(s));
-               goto error;
+               goto error_unlocked;
        }
 
        rs = SB_DISK_SUPER_BLOCK(s);
@@ -1704,7 +1691,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
                      "or increase size of your LVM partition");
                SWARN(silent, s, "", "Or may be you forgot to "
                      "reboot after fdisk when it told you to");
-               goto error;
+               goto error_unlocked;
        }
 
        sbi->s_mount_state = SB_REISERFS_STATE(s);
@@ -1712,8 +1699,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 
        if ((errval = reiserfs_init_bitmap_cache(s))) {
                SWARN(silent, s, "jmacd-8", "unable to read bitmap");
-               goto error;
+               goto error_unlocked;
        }
+
        errval = -EINVAL;
 #ifdef CONFIG_REISERFS_CHECK
        SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
@@ -1736,24 +1724,26 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        if (reiserfs_barrier_flush(s)) {
                printk("reiserfs: using flush barriers\n");
        }
+
        // set_device_ro(s->s_dev, 1) ;
        if (journal_init(s, jdev_name, old_format, commit_max_age)) {
                SWARN(silent, s, "sh-2022",
                      "unable to initialize journal space");
-               goto error;
+               goto error_unlocked;
        } else {
                jinit_done = 1; /* once this is set, journal_release must be called
                                 ** if we error out of the mount
                                 */
        }
+
        if (reread_meta_blocks(s)) {
                SWARN(silent, s, "jmacd-9",
                      "unable to reread meta blocks after journal init");
-               goto error;
+               goto error_unlocked;
        }
 
        if (replay_only(s))
-               goto error;
+               goto error_unlocked;
 
        if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
                SWARN(silent, s, "clm-7000",
@@ -1767,9 +1757,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
                         reiserfs_init_locked_inode, (void *)(&args));
        if (!root_inode) {
                SWARN(silent, s, "jmacd-10", "get root inode failed");
-               goto error;
+               goto error_unlocked;
        }
 
+       /*
+        * This path assumed to be called with the BKL in the old times.
+        * Now we have inherited the big reiserfs lock from it and many
+        * reiserfs helpers called in the mount path and elsewhere require
+        * this lock to be held even if it's not always necessary. Let's be
+        * conservative and hold it early. The window can be reduced after
+        * careful review of the code.
+        */
+       reiserfs_write_lock(s);
+
        if (root_inode->i_state & I_NEW) {
                reiserfs_read_locked_inode(root_inode, &args);
                unlock_new_inode(root_inode);
@@ -1896,12 +1896,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        return (0);
 
 error:
-       if (jinit_done) {       /* kill the commit thread, free journal ram */
+       reiserfs_write_unlock(s);
+
+error_unlocked:
+       /* kill the commit thread, free journal ram */
+       if (jinit_done) {
+               reiserfs_write_lock(s);
                journal_release_error(NULL, s);
+               reiserfs_write_unlock(s);
        }
 
-       reiserfs_write_unlock(s);
-
        reiserfs_free_bitmap_cache(s);
        if (SB_BUFFER_WITH_SB(s))
                brelse(SB_BUFFER_WITH_SB(s));
index e58fa777fa09abe91831026852bfe21bdfa4f8d4..f96a5b58a975c6ccc1ebadecf746f84dd47a38ea 100644 (file)
@@ -139,6 +139,20 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
                __tlb_remove_tlb_entry(tlb, ptep, address);     \
        } while (0)
 
+/**
+ * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
+ * This is a nop so far, because only x86 needs it.
+ */
+#ifndef __tlb_remove_pmd_tlb_entry
+#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
+#endif
+
+#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)           \
+       do {                                                    \
+               tlb->need_flush = 1;                            \
+               __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
+       } while (0)
+
 #define pte_free_tlb(tlb, ptep, address)                       \
        do {                                                    \
                tlb->need_flush = 1;                            \
index dfadc96e9d63851c334885f0ec7761c8d6cfe3ff..2f4079175afb81f80f641e28da9b1145b5415bae 100644 (file)
@@ -29,6 +29,7 @@
    the kernel context */
 #define __cold                 __attribute__((__cold__))
 
+#define __linktime_error(message) __attribute__((__error__(message)))
 
 #if __GNUC_MINOR__ >= 5
 /*
index 320d6c94ff848d5db94fb1fd76576501a88e9a3a..4a243546d142b3e0264b115332bf73e354aa0fdf 100644 (file)
@@ -293,7 +293,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 #ifndef __compiletime_error
 # define __compiletime_error(message)
 #endif
-
+#ifndef __linktime_error
+# define __linktime_error(message)
+#endif
 /*
  * Prevent the compiler from merging or refetching accesses.  The compiler
  * is also forbidden from reordering successive instances of ACCESS_ONCE(),
index f362733186a5a6f4a1657b0a85ee32e158095ed2..657ab55beda014c15b33833e84dc7e1496d06abf 100644 (file)
@@ -61,6 +61,7 @@ struct file;
 static inline void eventpoll_init_file(struct file *file)
 {
        INIT_LIST_HEAD(&file->f_ep_links);
+       INIT_LIST_HEAD(&file->f_tfile_llink);
 }
 
 
index e0bc4ffb8e7f0ec42a916219ab02f43a112609d1..057434af2b900e18745b375b109e52ef03a9c6c0 100644 (file)
@@ -656,6 +656,7 @@ struct address_space {
         * must be enforced here for CRIS, to let the least significant bit
         * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
         */
+struct request_queue;
 
 struct block_device {
        dev_t                   bd_dev;  /* not a kdev_t - it's a search key */
@@ -678,6 +679,7 @@ struct block_device {
        unsigned                bd_part_count;
        int                     bd_invalidated;
        struct gendisk *        bd_disk;
+       struct request_queue *  bd_queue;
        struct list_head        bd_list;
        /*
         * Private data.  You must have bd_claim'ed the block_device
@@ -1001,6 +1003,7 @@ struct file {
 #ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
        struct list_head        f_ep_links;
+       struct list_head        f_tfile_llink;
 #endif /* #ifdef CONFIG_EPOLL */
        struct address_space    *f_mapping;
 #ifdef CONFIG_DEBUG_WRITECOUNT
index 3a76faf6a3ee82cd20f82d36208c6f50c8f29685..ec065569350f45ae6bd09fa0e5eda2153abcdc44 100644 (file)
@@ -36,6 +36,7 @@ struct vm_area_struct;
 #endif
 #define ___GFP_NO_KSWAPD       0x400000u
 #define ___GFP_OTHER_NODE      0x800000u
+#define ___GFP_WRITE           0x1000000u
 
 /*
  * GFP bitmasks..
@@ -83,8 +84,20 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK  ((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
+/*
+ * __GFP_NO_KSWAPD indicates that the VM should favour failing the allocation
+ * over excessive disruption of the system. Currently this means
+ * 1. Do not wake kswapd (hence the flag name)
+ * 2. Do not use stall in synchronous compaction for high-order allocations
+ *    as this may cause the caller to stall writing out pages
+ *
+ * This flag is primarily intended for use with transparent hugepage support.
+ * If the flag is used outside the VM, linux-mm should be cc'd for review.
+ */
 #define __GFP_NO_KSWAPD        ((__force gfp_t)___GFP_NO_KSWAPD)
+
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
+#define __GFP_WRITE    ((__force gfp_t)___GFP_WRITE)   /* Allocator intends to dirty page */
 
 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
@@ -92,7 +105,7 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 24    /* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25    /* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -313,7 +326,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
                                                unsigned int order)
 {
-       VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+       VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
 
        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
 }
@@ -358,6 +371,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 extern void free_hot_cold_page(struct page *page, int cold);
+extern void free_hot_cold_page_list(struct list_head *list, int cold);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
@@ -367,9 +381,25 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(void);
 void drain_local_pages(void *dummy);
 
+/*
+ * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
+ * GFP flags are used before interrupts are enabled. Once interrupts are
+ * enabled, it is set to __GFP_BITS_MASK while the system is running. During
+ * hibernation, it is used by PM to avoid I/O during memory allocation while
+ * devices are suspended.
+ */
 extern gfp_t gfp_allowed_mask;
 
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+#ifdef CONFIG_PM_SLEEP
+extern bool pm_suspended_storage(void);
+#else
+static inline bool pm_suspended_storage(void)
+{
+       return false;
+}
+#endif /* CONFIG_PM_SLEEP */
+
 #endif /* __LINUX_GFP_H */
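
__GFP_WRITE tags an allocation whose page the caller intends to dirty right away; the call sites live in the page-cache write path elsewhere in this series and are not shown here. A hedged sketch of the intended usage (example_* name invented for illustration):

    #include <linux/gfp.h>

    static inline struct page *
    example_alloc_page_for_write(gfp_t base_gfp)
    {
            /* Let the allocator take dirty-page placement into account. */
            return alloc_pages(base_gfp | __GFP_WRITE, 0);
    }
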
index 219ca4f6bea66a0a755101ea2d9c615a67854283..d690c0fe4b8b24b425790789b66e317285af1e5e 100644 (file)
@@ -125,6 +125,7 @@ struct hpet_info {
 #define        HPET_EPI        _IO('h', 0x04)  /* enable periodic */
 #define        HPET_DPI        _IO('h', 0x05)  /* disable periodic */
 #define        HPET_IRQFREQ    _IOW('h', 0x6, unsigned long)   /* IRQFREQ usec */
+#define        HPET_ALLOC_TIMER _IO('h', 0x7)
 
 #define MAX_HPET_TBS   8               /* maximum hpet timer blocks */
 
index a9ace9c32507e31a851bcd719d0a56759b3a4428..7246cfa602d0517accd022bd0c3837c86699b3f0 100644 (file)
@@ -18,7 +18,7 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
                                          unsigned int flags);
 extern int zap_huge_pmd(struct mmu_gather *tlb,
                        struct vm_area_struct *vma,
-                       pmd_t *pmd);
+                       pmd_t *pmd, unsigned long addr);
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned char *vec);
@@ -145,10 +145,10 @@ static inline struct page *compound_trans_head(struct page *page)
        }
        return page;
 }
-#else /* CONFIG_TRANSPARENT_HUGEPAGE */
-#define HPAGE_PMD_SHIFT ({ BUG(); 0; })
-#define HPAGE_PMD_MASK ({ BUG(); 0; })
-#define HPAGE_PMD_SIZE ({ BUG(); 0; })
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
+#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
+#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
 
 #define hpage_nr_pages(x) 1
 
index d9d6c868b86bc01226031d63ce5ee72eb44ebed6..12899372ff558930eda798c8f95cdca26408436e 100644 (file)
@@ -85,11 +85,11 @@ static inline unsigned long hugetlb_total_pages(void)
        return 0;
 }
 
-#define follow_hugetlb_page(m,v,p,vs,a,b,i,w)  ({ BUG(); 0; })
+#define follow_hugetlb_page(m,v,p,vs,a,b,i,w)  ({ BUILD_BUG(); 0; })
 #define follow_huge_addr(mm, addr, write)      ERR_PTR(-EINVAL)
-#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
-#define hugetlb_prefault(mapping, vma)         ({ BUG(); 0; })
-#define unmap_hugepage_range(vma, start, end, page)    BUG()
+#define copy_hugetlb_page_range(src, dst, vma) ({ BUILD_BUG(); 0; })
+#define hugetlb_prefault(mapping, vma)         ({ BUILD_BUG(); 0; })
+#define unmap_hugepage_range(vma, start, end, page)    BUILD_BUG()
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
@@ -100,8 +100,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define pmd_huge(x)    0
 #define pud_huge(x)    0
 #define is_hugepage_only_range(mm, addr, len)  0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
-#define hugetlb_fault(mm, vma, addr, flags)    ({ BUG(); 0; })
+#define hugetlb_free_pgd_range(tlb, addr, end, f, c) ({BUILD_BUG(); 0; })
+#define hugetlb_fault(mm, vma, addr, flags)    ({ BUILD_BUG(); 0; })
 #define huge_pte_offset(mm, address)   0
 #define dequeue_hwpoisoned_huge_page(page)     0
 static inline void copy_huge_page(struct page *dst, struct page *src)
@@ -186,7 +186,7 @@ static inline void set_file_hugepages(struct file *file)
 #else /* !CONFIG_HUGETLBFS */
 
 #define is_file_hugepages(file)                        0
-#define set_file_hugepages(file)               BUG()
+#define set_file_hugepages(file)               BUILD_BUG()
 static inline struct file *hugetlb_file_setup(const char *name, size_t size,
                vm_flags_t acctflag, struct user_struct **user, int creat_flags)
 {
index 8a297a5e794cc8e51c22351098b80a35ce43ef09..e2bac00eb5a353e7603dd9c7caf69e1b0fe17828 100644 (file)
@@ -90,11 +90,41 @@ static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
 
 #ifdef CONFIG_POSIX_MQUEUE
 extern int mq_init_ns(struct ipc_namespace *ns);
-/* default values */
-#define DFLT_QUEUESMAX 256     /* max number of message queues */
-#define DFLT_MSGMAX    10      /* max number of messages in each queue */
-#define HARD_MSGMAX    (32768*sizeof(void *)/4)
-#define DFLT_MSGSIZEMAX 8192   /* max message size */
+/*
+ * POSIX Message Queue default values:
+ *
+ * MIN_*: Lowest value an admin can set the maximum unprivileged limit to
+ * DFLT_*MAX: Default values for the maximum unprivileged limits
+ * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply
+ *   an attribute to the open call and the queue must be created
+ * HARD_*: Highest value the maximums can be set to.  These are enforced
+ *   on CAP_SYS_RESOURCE apps as well making them inviolate (so make them
+ *   suitably high)
+ *
+ * POSIX Requirements:
+ *   Per app minimum openable message queues - 8.  This does not map well
+ *     to the fact that we limit the number of queues on a per namespace
+ *     basis instead of a per app basis.  So, make the default high enough
+ *     that no given app should have a hard time opening 8 queues.
+ *   Minimum maximum for HARD_MSGMAX - 32767.  I bumped this to 65536.
+ *   Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this.  However,
+ *     we have run into a situation where running applications in the wild
+ *     require this to be at least 5MB, and preferably 10MB, so I set the
+ *     value to 16MB in hopes that this user is the worst of the bunch and
+ *     the new maximum will handle anyone else.  I may have to revisit this
+ *     in the future.
+ */
+#define MIN_QUEUESMAX                  1
+#define DFLT_QUEUESMAX               256
+#define HARD_QUEUESMAX              1024
+#define MIN_MSGMAX                     1
+#define DFLT_MSG                      64U
+#define DFLT_MSGMAX                 1024
+#define HARD_MSGMAX                65536
+#define MIN_MSGSIZEMAX               128
+#define DFLT_MSGSIZE                8192U
+#define DFLT_MSGSIZEMAX                (1024*1024)
+#define HARD_MSGSIZEMAX             (16*1024*1024)
 #else
 static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
 #endif
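
The comment block above documents the new minimum/default/hard limits for POSIX message queues. The DFLT_MSG/DFLT_MSGSIZE values apply when a queue is created without an explicit attribute; a minimal userspace illustration (link with -lrt, the queue name is arbitrary):

    #include <fcntl.h>
    #include <mqueue.h>
    #include <stdio.h>

    int main(void)
    {
            /* attr == NULL: the kernel falls back to its default message
             * count and message size for the newly created queue. */
            mqd_t q = mq_open("/example-queue", O_CREAT | O_RDWR, 0600, NULL);

            if (q == (mqd_t)-1) {
                    perror("mq_open");
                    return 1;
            }
            mq_close(q);
            mq_unlink("/example-queue");
            return 0;
    }
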
index ff83683c0b9d446a43b01411e434ef4e462c28a3..e8343422240a8ca981dbd5b5971ccef6c30d1382 100644 (file)
@@ -185,16 +185,17 @@ static inline void might_fault(void)
 
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(int state);
-NORET_TYPE void panic(const char * fmt, ...)
-       __attribute__ ((NORET_AND format (printf, 1, 2))) __cold;
+__printf(1, 2)
+void panic(const char *fmt, ...)
+       __noreturn __cold;
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
 extern int oops_may_print(void);
-NORET_TYPE void do_exit(long error_code)
-       ATTRIB_NORET;
-NORET_TYPE void complete_and_exit(struct completion *, long)
-       ATTRIB_NORET;
+void do_exit(long error_code)
+       __noreturn;
+void complete_and_exit(struct completion *, long)
+       __noreturn;
 
 /* Internal, do not use. */
 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
@@ -666,6 +667,7 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #define BUILD_BUG_ON_ZERO(e) (0)
 #define BUILD_BUG_ON_NULL(e) ((void*)0)
 #define BUILD_BUG_ON(condition)
+#define BUILD_BUG() (0)
 #else /* __CHECKER__ */
 
 /* Force a compilation error if a constant expression is not a power of 2 */
@@ -704,6 +706,21 @@ extern int __build_bug_on_failed;
                if (condition) __build_bug_on_failed = 1;       \
        } while(0)
 #endif
+
+/**
+ * BUILD_BUG - break compile if used.
+ *
+ * If you have some code that you expect the compiler to eliminate at
+ * build time, you should use BUILD_BUG to detect if it is
+ * unexpectedly used.
+ */
+#define BUILD_BUG()                                            \
+       do {                                                    \
+               extern void __build_bug_failed(void)            \
+                       __linktime_error("BUILD_BUG failed");   \
+               __build_bug_failed();                           \
+       } while (0)
+
 #endif /* __CHECKER__ */
 
 /* Trap pasters of __FUNCTION__ at compile-time */
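
BUILD_BUG() complements BUILD_BUG_ON(): instead of testing a condition, it marks code that must be discarded by constant folding, turning any surviving reference into a build failure via __linktime_error() (and degrading to a no-op under __CHECKER__, as defined above). A hedged usage sketch mirroring the hugetlb.h stub conversions earlier in this diff (CONFIG_EXAMPLE_FEATURE is a made-up option):

    #ifdef CONFIG_EXAMPLE_FEATURE
    extern int example_feature_op(int arg);
    #else
    /* Calls must be optimised away when the feature is off; if one is not,
     * the build fails instead of hitting a runtime BUG(). */
    #define example_feature_op(arg)        ({ BUILD_BUG(); 0; })
    #endif
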
index ee0c952188de2c99281fd7567f701893e90a47dd..fee66317e071547a4a75e0b90b61399b97249ed2 100644 (file)
@@ -18,7 +18,6 @@
 enum kmsg_dump_reason {
        KMSG_DUMP_OOPS,
        KMSG_DUMP_PANIC,
-       KMSG_DUMP_KEXEC,
        KMSG_DUMP_RESTART,
        KMSG_DUMP_HALT,
        KMSG_DUMP_POWEROFF,
diff --git a/include/linux/leds-tca6507.h b/include/linux/leds-tca6507.h
new file mode 100644 (file)
index 0000000..3b8ac62
--- /dev/null
@@ -0,0 +1,33 @@
+/*
+ * TCA6507 LED chip driver.
+ *
+ * Copyright (C) 2011 Neil Brown <neil@brown.name>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#ifndef __LINUX_TCA6507_H
+#define __LINUX_TCA6507_H
+#include <linux/leds.h>
+
+struct tca6507_platform_data {
+       struct led_platform_data leds;
+       int gpio_base;
+       void (*setup)(unsigned gpio_base, unsigned ngpio);
+       void (*teardown)(unsigned gpio_base, unsigned ngpio);
+};
+
+#define        TCA6507_MAKE_GPIO 1
+#endif /* __LINUX_TCA6507_H*/
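
A hypothetical board-file sketch showing how platform code might hand this platform data to the driver; the LED name, I2C address and variable names are made up, and the optional GPIO export (TCA6507_MAKE_GPIO) is not used:

    #include <linux/kernel.h>
    #include <linux/i2c.h>
    #include <linux/leds.h>
    #include <linux/leds-tca6507.h>

    static struct led_info example_tca6507_leds[] = {
            { .name = "example:green:status" },
    };

    static struct tca6507_platform_data example_tca6507_pdata = {
            .leds = {
                    .num_leds = ARRAY_SIZE(example_tca6507_leds),
                    .leds     = example_tca6507_leds,
            },
    };

    static struct i2c_board_info example_i2c_devices[] = {
            {
                    I2C_BOARD_INFO("tca6507", 0x45),
                    .platform_data = &example_tca6507_pdata,
            },
    };
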
index f549056fb20bd5533555918cc1b1f9805c2cdcc3..f246d5a841ebcb35124a4b334f950a3f2673628a 100644 (file)
 #include <linux/percpu.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)     name##_lock_init()
-#define br_read_lock(name)     name##_local_lock()
-#define br_read_unlock(name)   name##_local_unlock()
-#define br_write_lock(name)    name##_global_lock_online()
-#define br_write_unlock(name)  name##_global_unlock_online()
+#define br_lock_init(name)     lg_lock_init(name, #name)
+#define br_read_lock(name)     lg_local_lock(name)
+#define br_read_unlock(name)   lg_local_unlock(name)
+#define br_write_lock(name)    lg_global_lock_online(name)
+#define br_write_unlock(name)  lg_global_unlock_online(name)
 
-#define DECLARE_BRLOCK(name)   DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)    DEFINE_LGLOCK(name)
 
-
-#define lg_lock_init(name)     name##_lock_init()
-#define lg_local_lock(name)    name##_local_lock()
-#define lg_local_unlock(name)  name##_local_unlock()
-#define lg_local_lock_cpu(name, cpu)   name##_local_lock_cpu(cpu)
-#define lg_local_unlock_cpu(name, cpu) name##_local_unlock_cpu(cpu)
-#define lg_global_lock(name)   name##_global_lock()
-#define lg_global_unlock(name) name##_global_unlock()
-#define lg_global_lock_online(name) name##_global_lock_online()
-#define lg_global_unlock_online(name) name##_global_unlock_online()
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
 
 #define DEFINE_LGLOCK_LOCKDEP(name)
 #endif
 
+struct lglock {
+       arch_spinlock_t __percpu *lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       struct lock_class_key lock_key;
+       struct lockdep_map    lock_dep_map;
+#endif
+};
+
+#define DEFINE_LGLOCK(name) \
+       DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) = __ARCH_SPIN_LOCK_UNLOCKED; \
+       struct lglock name = { .lock = &name ## _lock }
 
-#define DECLARE_LGLOCK(name)                                           \
- extern void name##_lock_init(void);                                   \
- extern void name##_local_lock(void);                                  \
- extern void name##_local_unlock(void);                                        \
- extern void name##_local_lock_cpu(int cpu);                           \
- extern void name##_local_unlock_cpu(int cpu);                         \
- extern void name##_global_lock(void);                                 \
- extern void name##_global_unlock(void);                               \
- extern void name##_global_lock_online(void);                          \
- extern void name##_global_unlock_online(void);                                \
+/* Only valid for statics */
+void lg_lock_init(struct lglock *lg, char *name);
+void lg_local_lock(struct lglock *lg);
+void lg_local_unlock(struct lglock *lg);
+void lg_local_lock_cpu(struct lglock *lg, int cpu);
+void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+void lg_global_lock_online(struct lglock *lg);
+void lg_global_unlock_online(struct lglock *lg);
+void lg_global_lock(struct lglock *lg);
+void lg_global_unlock(struct lglock *lg);
 
-#define DEFINE_LGLOCK(name)                                            \
-                                                                       \
- DEFINE_PER_CPU(arch_spinlock_t, name##_lock);                         \
- DEFINE_LGLOCK_LOCKDEP(name);                                          \
-                                                                       \
- void name##_lock_init(void) {                                         \
-       int i;                                                          \
-       LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;     \
-       }                                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_lock_init);                                      \
-                                                                       \
- void name##_local_lock(void) {                                                \
-       arch_spinlock_t *lock;                                          \
-       preempt_disable();                                              \
-       rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);     \
-       lock = &__get_cpu_var(name##_lock);                             \
-       arch_spin_lock(lock);                                           \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_lock);                                     \
-                                                                       \
- void name##_local_unlock(void) {                                      \
-       arch_spinlock_t *lock;                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);             \
-       lock = &__get_cpu_var(name##_lock);                             \
-       arch_spin_unlock(lock);                                         \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_unlock);                                   \
-                                                                       \
- void name##_local_lock_cpu(int cpu) {                                 \
-       arch_spinlock_t *lock;                                          \
-       preempt_disable();                                              \
-       rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);     \
-       lock = &per_cpu(name##_lock, cpu);                              \
-       arch_spin_lock(lock);                                           \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_lock_cpu);                                 \
-                                                                       \
- void name##_local_unlock_cpu(int cpu) {                               \
-       arch_spinlock_t *lock;                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);             \
-       lock = &per_cpu(name##_lock, cpu);                              \
-       arch_spin_unlock(lock);                                         \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_unlock_cpu);                               \
-                                                                       \
- void name##_global_lock_online(void) {                                        \
-       int i;                                                          \
-       preempt_disable();                                              \
-       rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);           \
-       for_each_online_cpu(i) {                                        \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_lock(lock);                                   \
-       }                                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_lock_online);                             \
-                                                                       \
- void name##_global_unlock_online(void) {                              \
-       int i;                                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);              \
-       for_each_online_cpu(i) {                                        \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_unlock(lock);                                 \
-       }                                                               \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_unlock_online);                           \
-                                                                       \
- void name##_global_lock(void) {                                       \
-       int i;                                                          \
-       preempt_disable();                                              \
-       rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);           \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_lock(lock);                                   \
-       }                                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_lock);                                    \
-                                                                       \
- void name##_global_unlock(void) {                                     \
-       int i;                                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);              \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_unlock(lock);                                 \
-       }                                                               \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_unlock);
 #endif
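A hedged sketch of the reworked lglock API declared above. Whether callers elsewhere in this series pass the lock by name or by address is not shown here; the pointer form matching the prototypes above is assumed, and the init/reader/writer functions are illustrative.

DEFINE_LGLOCK(example_lglock);

static void example_init_once(void)
{
        /* Presumably needed once for static locks (lockdep setup). */
        lg_lock_init(&example_lglock, "example_lglock");
}

static void example_read_side(void)
{
        lg_local_lock(&example_lglock);         /* cheap: this CPU's spinlock only */
        /* ... read-mostly critical section ... */
        lg_local_unlock(&example_lglock);
}

static void example_write_side(void)
{
        lg_global_lock(&example_lglock);        /* expensive: every CPU's spinlock */
        /* ... exclusive critical section ... */
        lg_global_unlock(&example_lglock);
}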
index 3f46aedea42fbb5024f8e9896e380d0051c74132..807f1e5332267e6bd776f9591f106b723e1c02ab 100644 (file)
@@ -88,8 +88,4 @@
 
 #endif
 
-#define NORET_TYPE    /**/
-#define ATTRIB_NORET  __attribute__((noreturn))
-#define NORET_AND     noreturn,
-
 #endif
index 9b296ea41bb85a5543c915dad41748c59fdd1734..b430015a4348dec0db18ae517bd96e4047fffae5 100644 (file)
@@ -32,13 +32,11 @@ enum mem_cgroup_page_stat_item {
        MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
 };
 
-extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
-                                       struct list_head *dst,
-                                       unsigned long *scanned, int order,
-                                       isolate_mode_t mode,
-                                       struct zone *z,
-                                       struct mem_cgroup *mem_cont,
-                                       int active, int file);
+struct mem_cgroup_reclaim_cookie {
+       struct zone *zone;
+       int priority;
+       unsigned int generation;
+};
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 /*
@@ -56,20 +54,21 @@ extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask);
 /* for swap handling */
 extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-               struct page *page, gfp_t mask, struct mem_cgroup **ptr);
+               struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
 extern void mem_cgroup_commit_charge_swapin(struct page *page,
-                                       struct mem_cgroup *ptr);
-extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);
+                                       struct mem_cgroup *memcg);
+extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
 
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask);
-extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
-extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
-extern void mem_cgroup_rotate_reclaimable_page(struct page *page);
-extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
-extern void mem_cgroup_del_lru(struct page *page);
-extern void mem_cgroup_move_lists(struct page *page,
-                                 enum lru_list from, enum lru_list to);
+
+struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
+struct lruvec *mem_cgroup_lru_add_list(struct zone *, struct page *,
+                                      enum lru_list);
+void mem_cgroup_lru_del_list(struct page *, enum lru_list);
+void mem_cgroup_lru_del(struct page *);
+struct lruvec *mem_cgroup_lru_move_lists(struct zone *, struct page *,
+                                        enum lru_list, enum lru_list);
 
 /* For coalescing uncharge for reducing memcg' overhead*/
 extern void mem_cgroup_uncharge_start(void);
@@ -102,10 +101,15 @@ extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
 
 extern int
 mem_cgroup_prepare_migration(struct page *page,
-       struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask);
+       struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask);
 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
        struct page *oldpage, struct page *newpage, bool migration_ok);
 
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+                                  struct mem_cgroup *,
+                                  struct mem_cgroup_reclaim_cookie *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+
 /*
  * For memory reclaim.
  */
@@ -122,6 +126,8 @@ struct zone_reclaim_stat*
 mem_cgroup_get_reclaim_stat_from_page(struct page *page);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
                                        struct task_struct *p);
+extern void mem_cgroup_replace_page_cache(struct page *oldpage,
+                                       struct page *newpage);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 extern int do_swap_account;
@@ -157,7 +163,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg);
 
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
+void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
 
 #ifdef CONFIG_DEBUG_VM
@@ -180,17 +186,17 @@ static inline int mem_cgroup_cache_charge(struct page *page,
 }
 
 static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-               struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
+               struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
 {
        return 0;
 }
 
 static inline void mem_cgroup_commit_charge_swapin(struct page *page,
-                                         struct mem_cgroup *ptr)
+                                         struct mem_cgroup *memcg)
 {
 }
 
-static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
+static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
 {
 }
 
@@ -210,33 +216,33 @@ static inline void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 }
 
-static inline void mem_cgroup_add_lru_list(struct page *page, int lru)
-{
-}
-
-static inline void mem_cgroup_del_lru_list(struct page *page, int lru)
+static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
+                                                   struct mem_cgroup *memcg)
 {
-       return ;
+       return &zone->lruvec;
 }
 
-static inline void mem_cgroup_rotate_reclaimable_page(struct page *page)
+static inline struct lruvec *mem_cgroup_lru_add_list(struct zone *zone,
+                                                    struct page *page,
+                                                    enum lru_list lru)
 {
-       return ;
+       return &zone->lruvec;
 }
 
-static inline void mem_cgroup_rotate_lru_list(struct page *page, int lru)
+static inline void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
 {
-       return ;
 }
 
-static inline void mem_cgroup_del_lru(struct page *page)
+static inline void mem_cgroup_lru_del(struct page *page)
 {
-       return ;
 }
 
-static inline void
-mem_cgroup_move_lists(struct page *page, enum lru_list from, enum lru_list to)
+static inline struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
+                                                      struct page *page,
+                                                      enum lru_list from,
+                                                      enum lru_list to)
 {
+       return &zone->lruvec;
 }
 
 static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -269,7 +275,7 @@ static inline struct cgroup_subsys_state
 
 static inline int
 mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
-       struct mem_cgroup **ptr, gfp_t gfp_mask)
+       struct mem_cgroup **memcgp, gfp_t gfp_mask)
 {
        return 0;
 }
@@ -279,6 +285,19 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 {
 }
 
+static inline struct mem_cgroup *
+mem_cgroup_iter(struct mem_cgroup *root,
+               struct mem_cgroup *prev,
+               struct mem_cgroup_reclaim_cookie *reclaim)
+{
+       return NULL;
+}
+
+static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
+                                        struct mem_cgroup *prev)
+{
+}
+
 static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg)
 {
        return 0;
@@ -360,8 +379,7 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
        return 0;
 }
 
-static inline void mem_cgroup_split_huge_fixup(struct page *head,
-                                               struct page *tail)
+static inline void mem_cgroup_split_huge_fixup(struct page *head)
 {
 }
 
@@ -369,6 +387,10 @@ static inline
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
+                               struct page *newpage)
+{
+}
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
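The removed mem_cgroup_isolate_pages() interface gives way to an iterator-plus-cookie scheme; a hedged sketch of the intended round-robin walk over a memcg hierarchy follows (the reclaim bookkeeping and the example_* names are illustrative, not taken from the patch).

static void example_hierarchy_reclaim(struct mem_cgroup *root,
                                      struct zone *zone, int priority)
{
        struct mem_cgroup_reclaim_cookie reclaim = {
                .zone = zone,
                .priority = priority,
        };
        struct mem_cgroup *memcg = NULL;
        unsigned long nr_reclaimed = 0, target = SWAP_CLUSTER_MAX;

        while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))) {
                /* ... shrink this memcg's LRU lists, adding to nr_reclaimed ... */
                if (nr_reclaimed >= target) {
                        /* Bail out early and let the iterator clean up. */
                        mem_cgroup_iter_break(root, memcg);
                        break;
                }
        }
}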
index 02e39b0042b16978f4d2868933ed17df23c334eb..17b27cd269c404e117416e468ca9853dd94d141b 100644 (file)
@@ -1482,6 +1482,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
        return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly matches the interval vm_start ... vm_end */
+static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
+                               unsigned long vm_start, unsigned long vm_end)
+{
+       struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+       if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+               vma = NULL;
+
+       return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else
@@ -1608,5 +1620,22 @@ extern void copy_user_huge_page(struct page *dst, struct page *src,
                                unsigned int pages_per_huge_page);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+extern unsigned int _debug_guardpage_minorder;
+
+static inline unsigned int debug_guardpage_minorder(void)
+{
+       return _debug_guardpage_minorder;
+}
+
+static inline bool page_is_guard(struct page *page)
+{
+       return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline unsigned int debug_guardpage_minorder(void) { return 0; }
+static inline bool page_is_guard(struct page *page) { return false; }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
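A minimal sketch of how find_exact_vma() is meant to be called, assuming the usual mmap_sem read-lock discipline; the caller below is illustrative.

static bool example_vma_exists(struct mm_struct *mm,
                               unsigned long start, unsigned long end)
{
        bool found;

        down_read(&mm->mmap_sem);
        /* Non-NULL only when [start, end) coincides exactly with one VMA. */
        found = find_exact_vma(mm, start, end) != NULL;
        up_read(&mm->mmap_sem);

        return found;
}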
index 8f7d24712dc115790269442be6f8c245d4cf6145..4e3478e719260fb6f90279334bfb6cde045830a8 100644 (file)
@@ -21,27 +21,22 @@ static inline int page_is_file_cache(struct page *page)
        return !PageSwapBacked(page);
 }
 
-static inline void
-__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l,
-                      struct list_head *head)
-{
-       list_add(&page->lru, head);
-       __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
-       mem_cgroup_add_lru_list(page, l);
-}
-
 static inline void
 add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
-       __add_page_to_lru_list(zone, page, l, &zone->lru[l].list);
+       struct lruvec *lruvec;
+
+       lruvec = mem_cgroup_lru_add_list(zone, page, l);
+       list_add(&page->lru, &lruvec->lists[l]);
+       __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page));
 }
 
 static inline void
 del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
+       mem_cgroup_lru_del_list(page, l);
        list_del(&page->lru);
        __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
-       mem_cgroup_del_lru_list(page, l);
 }
 
 /**
@@ -64,7 +59,6 @@ del_page_from_lru(struct zone *zone, struct page *page)
 {
        enum lru_list l;
 
-       list_del(&page->lru);
        if (PageUnevictable(page)) {
                __ClearPageUnevictable(page);
                l = LRU_UNEVICTABLE;
@@ -75,8 +69,9 @@ del_page_from_lru(struct zone *zone, struct page *page)
                        l += LRU_ACTIVE;
                }
        }
+       mem_cgroup_lru_del_list(page, l);
+       list_del(&page->lru);
        __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page));
-       mem_cgroup_del_lru_list(page, l);
 }
 
 /**
index 2595c9c220fdef4cbfa1f8e45dbf70ac98186a75..81a56dfd24f159acbfbf70a29fe49e3fdbc0fd7f 100644 (file)
@@ -152,12 +152,11 @@ struct page {
 #endif
 }
 /*
- * If another subsystem starts using the double word pairing for atomic
- * operations on struct page then it must change the #if to ensure
- * proper alignment of the page struct.
+ * The struct page can be forced to be double word aligned so that atomic ops
+ * on double words work. The SLUB allocator can make use of such a feature.
  */
-#if defined(CONFIG_SLUB) && defined(CONFIG_CMPXCHG_LOCAL)
-       __attribute__((__aligned__(2*sizeof(unsigned long))))
+#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
+       __aligned(2 * sizeof(unsigned long))
 #endif
 ;
 
@@ -356,6 +355,7 @@ struct mm_struct {
        unsigned int faultstamp;
        unsigned int token_priority;
        unsigned int last_interval;
+       atomic_t active_swap_token;
 
        unsigned long flags; /* Must use atomic bitops to access the bits */
 
index 3ac040f1936963e729c3688af5702ec9f97bb977..28cbda16743870650390b63be604bd639db7a47e 100644 (file)
@@ -159,6 +159,10 @@ static inline int is_unevictable_lru(enum lru_list l)
        return (l == LRU_UNEVICTABLE);
 }
 
+struct lruvec {
+       struct list_head lists[NR_LRU_LISTS];
+};
+
 /* Mask used at gathering information at once (see memcontrol.c) */
 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
@@ -317,6 +321,12 @@ struct zone {
         */
        unsigned long           lowmem_reserve[MAX_NR_ZONES];
 
+       /*
+        * This is a per-zone reserve of pages that should not be
+        * considered dirtyable memory.
+        */
+       unsigned long           dirty_balance_reserve;
+
 #ifdef CONFIG_NUMA
        int node;
        /*
@@ -358,10 +368,8 @@ struct zone {
        ZONE_PADDING(_pad1_)
 
        /* Fields commonly accessed by the page reclaim scanner */
-       spinlock_t              lru_lock;       
-       struct zone_lru {
-               struct list_head list;
-       } lru[NR_LRU_LISTS];
+       spinlock_t              lru_lock;
+       struct lruvec           lruvec;
 
        struct zone_reclaim_stat reclaim_stat;
 
@@ -785,7 +793,7 @@ static inline int is_dma(struct zone *zone)
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
+int free_kbytes_sysctl_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
index 6f9d04a8533606afa788c351909c5dba74e54438..552fba9c7d5a5a17386901ba00c6f403513e636f 100644 (file)
@@ -43,7 +43,7 @@ enum oom_constraint {
 extern void compare_swap_oom_score_adj(int old_val, int new_val);
 extern int test_set_oom_score_adj(int new_val);
 
-extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
                        const nodemask_t *nodemask, unsigned long totalpages);
 extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
index b0638fd91e92fdacc2f2675207ef723342072597..22691f614043df3d3cdf77c3b12dd97a551bf813 100644 (file)
@@ -13,6 +13,7 @@
 
 enum page_debug_flags {
        PAGE_DEBUG_FLAG_POISON,         /* Page is poisoned */
+       PAGE_DEBUG_FLAG_GUARD,
 };
 
 /*
@@ -21,7 +22,8 @@ enum page_debug_flags {
  */
 
 #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
-#if !defined(CONFIG_PAGE_POISONING) \
+#if !defined(CONFIG_PAGE_POISONING) && \
+    !defined(CONFIG_PAGE_GUARD) \
 /* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */
 #error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features!
 #endif
index 961ecc7d30bc2ec69b87699c39e5db63a5aff0f7..1153095ee457e496c51c0c893b5853fc7b683afb 100644 (file)
@@ -31,7 +31,6 @@ enum {
 struct page_cgroup {
        unsigned long flags;
        struct mem_cgroup *mem_cgroup;
-       struct list_head lru;           /* per cgroup LRU list */
 };
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
@@ -122,39 +121,6 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
        local_irq_restore(*flags);
 }
 
-#ifdef CONFIG_SPARSEMEM
-#define PCG_ARRAYID_WIDTH      SECTIONS_SHIFT
-#else
-#define PCG_ARRAYID_WIDTH      NODES_SHIFT
-#endif
-
-#if (PCG_ARRAYID_WIDTH > BITS_PER_LONG - NR_PCG_FLAGS)
-#error Not enough space left in pc->flags to store page_cgroup array IDs
-#endif
-
-/* pc->flags: ARRAY-ID | FLAGS */
-
-#define PCG_ARRAYID_MASK       ((1UL << PCG_ARRAYID_WIDTH) - 1)
-
-#define PCG_ARRAYID_OFFSET     (BITS_PER_LONG - PCG_ARRAYID_WIDTH)
-/*
- * Zero the shift count for non-existent fields, to prevent compiler
- * warnings and ensure references are optimized away.
- */
-#define PCG_ARRAYID_SHIFT      (PCG_ARRAYID_OFFSET * (PCG_ARRAYID_WIDTH != 0))
-
-static inline void set_page_cgroup_array_id(struct page_cgroup *pc,
-                                           unsigned long id)
-{
-       pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_SHIFT);
-       pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_SHIFT;
-}
-
-static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc)
-{
-       return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK;
-}
-
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
@@ -183,7 +149,7 @@ static inline void __init page_cgroup_init_flatmem(void)
 extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                                        unsigned short old, unsigned short new);
 extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
-extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
+extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
 extern int swap_cgroup_swapon(int type, unsigned long max_pages);
 extern void swap_cgroup_swapoff(int type);
 #else
@@ -195,7 +161,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 }
 
 static inline
-unsigned short lookup_swap_cgroup(swp_entry_t ent)
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
 {
        return 0;
 }
index bab82f4c571c611d91fb9f180dc2a69ed12c95e2..ed17024d2ebee598044ec693632fc4cc1cd1eb7f 100644 (file)
@@ -21,7 +21,6 @@ struct pagevec {
 };
 
 void __pagevec_release(struct pagevec *pvec);
-void __pagevec_free(struct pagevec *pvec);
 void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
 void pagevec_strip(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
@@ -67,12 +66,6 @@ static inline void pagevec_release(struct pagevec *pvec)
                __pagevec_release(pvec);
 }
 
-static inline void pagevec_free(struct pagevec *pvec)
-{
-       if (pagevec_count(pvec))
-               __pagevec_free(pvec);
-}
-
 static inline void __pagevec_lru_add_anon(struct pagevec *pvec)
 {
        ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON);
index 38d10326246afbbec371b5cddc2beaff37f1dd48..e7cf6669ac3482e606c04e90470af6a37d4adcf4 100644 (file)
@@ -30,6 +30,8 @@ struct pid_namespace {
 #ifdef CONFIG_BSD_PROCESS_ACCT
        struct bsd_acct_struct *bacct;
 #endif
+       gid_t pid_gid;
+       int hide_pid;
 };
 
 extern struct pid_namespace init_pid_ns;
index a3baeb2c216156831606a11c5bd8636d80b8b76c..7ddc7f1b480fd41318d94c0a39c8e2ff80f9c5f8 100644 (file)
 
 #define PR_MCE_KILL_GET 34
 
+/*
+ * Tune up process memory map specifics.
+ */
+#define PR_SET_MM              35
+# define PR_SET_MM_START_CODE          1
+# define PR_SET_MM_END_CODE            2
+# define PR_SET_MM_START_DATA          3
+# define PR_SET_MM_END_DATA            4
+# define PR_SET_MM_START_STACK         5
+# define PR_SET_MM_START_BRK           6
+# define PR_SET_MM_BRK                 7
+
 #endif /* _LINUX_PRCTL_H */
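From userspace the new codes are driven through the existing prctl() syscall; a hedged sketch follows (the address and error handling are illustrative, and CONFIG_CHECKPOINT_RESTORE plus CAP_SYS_RESOURCE are assumed to be required).

#include <stdio.h>
#include <sys/prctl.h>  /* PR_SET_MM* comes from the updated linux/prctl.h */

static int example_set_start_brk(unsigned long new_start_brk)
{
        if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, new_start_brk, 0, 0) != 0) {
                perror("prctl(PR_SET_MM, PR_SET_MM_START_BRK)");
                return -1;
        }
        return 0;
}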
index 643b96c7a94f3cc78d6750b878ad0a0b00c2ea58..c3d11ff368bce542622303710e08cbd2a1365f6f 100644 (file)
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations utsns_operations;
 extern const struct proc_ns_operations ipcns_operations;
 
 union proc_op {
-       int (*proc_get_link)(struct inode *, struct path *);
+       int (*proc_get_link)(struct dentry *, struct path *);
        int (*proc_read)(struct task_struct *task, char *page);
        int (*proc_show)(struct seq_file *m,
                struct pid_namespace *ns, struct pid *pid,
index 2148b122779b5a2fd8421c5e15cfc9ca91ea85ff..1cdd62a2788a99436f666ec9d9e194f3c341e769 100644 (file)
@@ -120,6 +120,7 @@ void anon_vma_init(void);   /* create anon_vma_cachep */
 int  anon_vma_prepare(struct vm_area_struct *);
 void unlink_anon_vmas(struct vm_area_struct *);
 int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
+void anon_vma_moveto_tail(struct vm_area_struct *);
 int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
 void __anon_vma_link(struct vm_area_struct *);
 
@@ -157,7 +158,7 @@ static inline void page_dup_rmap(struct page *page)
  * Called from mm/vmscan.c to handle paging out
  */
 int page_referenced(struct page *, int is_locked,
-                       struct mem_cgroup *cnt, unsigned long *vm_flags);
+                       struct mem_cgroup *memcg, unsigned long *vm_flags);
 int page_referenced_one(struct page *, struct vm_area_struct *,
        unsigned long address, unsigned int *mapcount, unsigned long *vm_flags);
 
@@ -235,7 +236,7 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
 #define anon_vma_link(vma)     do {} while (0)
 
 static inline int page_referenced(struct page *page, int is_locked,
-                                 struct mem_cgroup *cnt,
+                                 struct mem_cgroup *memcg,
                                  unsigned long *vm_flags)
 {
        *vm_flags = 0;
index 45de3899faf684a133720728a9dc5ccb38c84901..33342b5ca0c9b5ef902be400b2d64a95483e5c8b 100644 (file)
@@ -2279,7 +2279,7 @@ extern void __cleanup_sighand(struct sighand_struct *);
 extern void exit_itimers(struct signal_struct *);
 extern void flush_itimer_signals(void);
 
-extern NORET_TYPE void do_group_exit(int);
+extern void do_group_exit(int);
 
 extern void daemonize(const char *, ...);
 extern int allow_signal(int);
index c02de0f3a0cbd284731f80badc0f54ec8a86c413..1aded4911d7cb6a499cb769293b01fe839e408e9 100644 (file)
@@ -211,6 +211,9 @@ struct swap_list_t {
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
+extern unsigned long dirty_balance_reserve;
+extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern unsigned int nr_free_buffer_pages(void);
 extern unsigned int nr_free_pagecache_pages(void);
 
@@ -365,6 +368,26 @@ static inline void put_swap_token(struct mm_struct *mm)
                __put_swap_token(mm);
 }
 
+static inline bool has_active_swap_token(struct mm_struct *mm)
+{
+       return has_swap_token(mm) && atomic_read(&mm->active_swap_token);
+}
+
+static inline bool activate_swap_token(struct mm_struct *mm)
+{
+       if (has_swap_token(mm)) {
+               atomic_inc(&mm->active_swap_token);
+               return true;
+       }
+       return false;
+}
+
+static inline void deactivate_swap_token(struct mm_struct *mm, bool swap_token)
+{
+       if (swap_token)
+               atomic_dec(&mm->active_swap_token);
+}
+
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 extern void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
@@ -490,6 +513,20 @@ static inline int has_swap_token(struct mm_struct *mm)
        return 0;
 }
 
+static inline bool has_active_swap_token(struct mm_struct *mm)
+{
+       return false;
+}
+
+static inline bool activate_swap_token(struct mm_struct *mm)
+{
+       return false;
+}
+
+static inline void deactivate_swap_token(struct mm_struct *mm, bool swap_token)
+{
+}
+
 static inline void disable_swap_token(struct mem_cgroup *memcg)
 {
 }
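A hedged sketch of how the new activate/deactivate helpers pair up around a swap-in path; the surrounding function is illustrative and not a call site from this patch.

static void example_swapin_path(struct mm_struct *mm)
{
        /* Returns false (and pins nothing) when mm does not hold the token. */
        bool swap_token = activate_swap_token(mm);

        /* ... do the swap-in / readahead work for this mm ... */

        deactivate_swap_token(mm, swap_token);
}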
index 20f63d3e6144654f0124170827eab474a1b41faa..7ab0c81cab1d37bf39135d4bda4e2e2b34a6867f 100644 (file)
@@ -132,6 +132,8 @@ struct sysdev_ext_attribute {
        void *var;
 };
 
+#define SYSDEV_TO_EXT_ATTR(x) container_of(x, struct sysdev_ext_attribute, attr)
+
 /*
  * Support for simple variable sysdev attributes.
  * The pointer to the variable is stored in a sysdev_ext_attribute
index 0d556deb497b9988f14ac9322beb8cd815a85211..eb8b9f15f2e03b24ec6ca89b188ee9b955478268 100644 (file)
@@ -297,32 +297,50 @@ extern struct workqueue_struct *system_unbound_wq;
 extern struct workqueue_struct *system_freezable_wq;
 
 extern struct workqueue_struct *
-__alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
-                     struct lock_class_key *key, const char *lock_name);
+__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
+       struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6);
 
+/**
+ * alloc_workqueue - allocate a workqueue
+ * @fmt: printf format for the name of the workqueue
+ * @flags: WQ_* flags
+ * @max_active: max in-flight work items, 0 for default
+ * @args: args for @fmt
+ *
+ * Allocate a workqueue with the specified parameters.  For detailed
+ * information on WQ_* flags, please refer to Documentation/workqueue.txt.
+ *
+ * The __lock_name macro dance is to guarantee that a single lock_class_key
+ * doesn't end up with different names, which isn't allowed by lockdep.
+ *
+ * RETURNS:
+ * Pointer to the allocated workqueue on success, %NULL on failure.
+ */
 #ifdef CONFIG_LOCKDEP
-#define alloc_workqueue(name, flags, max_active)               \
+#define alloc_workqueue(fmt, flags, max_active, args...)       \
 ({                                                             \
        static struct lock_class_key __key;                     \
        const char *__lock_name;                                \
                                                                \
-       if (__builtin_constant_p(name))                         \
-               __lock_name = (name);                           \
+       if (__builtin_constant_p(fmt))                          \
+               __lock_name = (fmt);                            \
        else                                                    \
-               __lock_name = #name;                            \
+               __lock_name = #fmt;                             \
                                                                \
-       __alloc_workqueue_key((name), (flags), (max_active),    \
-                             &__key, __lock_name);             \
+       __alloc_workqueue_key((fmt), (flags), (max_active),     \
+                             &__key, __lock_name, ##args);     \
 })
 #else
-#define alloc_workqueue(name, flags, max_active)               \
-       __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL)
+#define alloc_workqueue(fmt, flags, max_active, args...)       \
+       __alloc_workqueue_key((fmt), (flags), (max_active),     \
+                             NULL, NULL, ##args)
 #endif
 
 /**
  * alloc_ordered_workqueue - allocate an ordered workqueue
- * @name: name of the workqueue
+ * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
+ * @args: args for @fmt
  *
  * Allocate an ordered workqueue.  An ordered workqueue executes at
  * most one work item at any given time in the queued order.  They are
@@ -331,11 +349,8 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
  * RETURNS:
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
-static inline struct workqueue_struct *
-alloc_ordered_workqueue(const char *name, unsigned int flags)
-{
-       return alloc_workqueue(name, WQ_UNBOUND | flags, 1);
-}
+#define alloc_ordered_workqueue(fmt, flags, args...)           \
+       alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args)
 
 #define create_workqueue(name)                                 \
        alloc_workqueue((name), WQ_MEM_RECLAIM, 1)
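Callers can now fold instance identifiers directly into the workqueue name; a hedged sketch (the driver name and index are made up):

static struct workqueue_struct *example_create_wq(int instance_id)
{
        /* The name is now a printf format; extra args follow max_active. */
        return alloc_workqueue("exampledrv_wq_%d", WQ_MEM_RECLAIM, 1, instance_id);
}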
index b30419cd425efbf10b346cbdc93b72ccaddfab2e..2322fd0cb8751997b8b2e87d475bb7ce936cc74e 100644 (file)
@@ -126,6 +126,7 @@ void laptop_mode_timer_fn(unsigned long data);
 static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
+bool zone_dirty_ok(struct zone *zone);
 
 extern unsigned long global_dirty_limit;
 
@@ -140,8 +141,6 @@ extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
 
-extern unsigned long determine_dirtyable_memory(void);
-
 extern int dirty_background_ratio_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos);
index 58ce8fe4478365484ac2d5ab2e7883efce83d114..5cb20ccb195606b9cc0fdac6869b2789c06e1649 100644 (file)
@@ -23,7 +23,7 @@
 #define SCSI_NETLINK_H
 
 #include <linux/netlink.h>
-
+#include <linux/types.h>
 
 /*
  * This file intended to be included by both kernel and user space
index a9c87ad8331c61de25b9d2a51489de398a03d306..5f889f16b0c891dbedafd65686c48688aa7110da 100644 (file)
@@ -147,7 +147,7 @@ DEFINE_EVENT(kmem_free, kmem_cache_free,
        TP_ARGS(call_site, ptr)
 );
 
-TRACE_EVENT(mm_page_free_direct,
+TRACE_EVENT(mm_page_free,
 
        TP_PROTO(struct page *page, unsigned int order),
 
@@ -169,7 +169,7 @@ TRACE_EVENT(mm_page_free_direct,
                        __entry->order)
 );
 
-TRACE_EVENT(mm_pagevec_free,
+TRACE_EVENT(mm_page_free_batched,
 
        TP_PROTO(struct page *page, int cold),
 
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
new file mode 100644 (file)
index 0000000..bb75e5c
--- /dev/null
@@ -0,0 +1,35 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM oom
+
+#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OOM_H
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(oom_score_adj_update,
+
+       TP_PROTO(struct task_struct *task),
+
+       TP_ARGS(task),
+
+       TP_STRUCT__entry(
+               __field(        pid_t,  pid)
+               __array(        char,   comm,   TASK_COMM_LEN )
+               __field(         int,   oom_score_adj)
+       ),
+
+       TP_fast_assign(
+               __entry->pid = task->pid;
+               memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+               __entry->oom_score_adj = task->signal->oom_score_adj;
+       ),
+
+       TP_printk("pid=%d comm=%s oom_score_adj=%d",
+               __entry->pid, __entry->comm, __entry->oom_score_adj)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
+
diff --git a/include/trace/events/task.h b/include/trace/events/task.h
new file mode 100644 (file)
index 0000000..2ac7484
--- /dev/null
@@ -0,0 +1,63 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM task
+
+#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TASK_H
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(task_newtask,
+
+       TP_PROTO(struct task_struct *task, unsigned long clone_flags),
+
+       TP_ARGS(task, clone_flags),
+
+       TP_STRUCT__entry(
+               __field(        pid_t,  pid)
+               __array(        char,   comm, TASK_COMM_LEN)
+               __field( unsigned long, clone_flags)
+               __field(        int,    oom_score_adj)
+       ),
+
+       TP_fast_assign(
+               __entry->pid = task->pid;
+               memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+               __entry->clone_flags = clone_flags;
+               __entry->oom_score_adj = task->signal->oom_score_adj;
+       ),
+
+       TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d",
+               __entry->pid, __entry->comm,
+               __entry->clone_flags, __entry->oom_score_adj)
+);
+
+TRACE_EVENT(task_rename,
+
+       TP_PROTO(struct task_struct *task, char *comm),
+
+       TP_ARGS(task, comm),
+
+       TP_STRUCT__entry(
+               __field(        pid_t,  pid)
+               __array(        char, oldcomm,  TASK_COMM_LEN)
+               __array(        char, newcomm,  TASK_COMM_LEN)
+               __field(        int, oom_score_adj)
+       ),
+
+       TP_fast_assign(
+               __entry->pid = task->pid;
+               memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN);
+               memcpy(entry->newcomm, comm, TASK_COMM_LEN);
+               __entry->oom_score_adj = task->signal->oom_score_adj;
+       ),
+
+       TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d",
+               __entry->pid, __entry->oldcomm,
+               __entry->newcomm, __entry->oom_score_adj)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
+
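A hedged sketch of a call site for the new task_rename tracepoint; where the hook actually lands (presumably wherever the task comm is updated) is not shown by this hunk, and the helper below is illustrative.

#include <trace/events/task.h>

static void example_update_comm(struct task_struct *tsk, char *new_comm)
{
        trace_task_rename(tsk, new_comm);
        /* ... then copy new_comm into tsk->comm under the task lock ... */
}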
index a075765d5fbe4b85ca033708f454de38c3ea3798..34f6d0c5dbaeb5beff35d2f301919a93d0e5ed97 100644 (file)
@@ -784,6 +784,17 @@ config DEBUG_BLK_CGROUP
 
 endif # CGROUPS
 
+config CHECKPOINT_RESTORE
+       bool "Checkpoint/restore support" if EXPERT
+       default n
+       help
+         Enables additional kernel features for the sake of checkpoint/restore.
+         In particular it adds auxiliary prctl codes to set up process text,
+         data and heap segment sizes, and a few additional /proc filesystem
+         entries.
+
+         If unsure, say N here.
+
 menuconfig NAMESPACES
        bool "Namespaces support" if EXPERT
        default !EXPERT
index 0f6e1d985a3b2851c662337c1671b57cc4270f0a..7ea68fb301e6e9ec9619abcd6e65f7ad6fd3106e 100644 (file)
@@ -350,6 +350,9 @@ void __init mount_block_root(char *name, int flags)
        const char *b = name;
 #endif
 
+       if (sys_access((const char __user *) "/root", 0) != 0)
+               sys_mkdir((const char __user *) "/root", 0700);
+
        get_fs_names(fs_names);
 retry:
        for (p = fs_names; *p; p += strlen(p)+1) {
index 0c09366b96f3a634365c945c1a2d987a5afbf55e..e22336a09b4a7932c0a5de4615fb4c7de9a98687 100644 (file)
 #include <linux/ipc_namespace.h>
 #include <linux/sysctl.h>
 
-/*
- * Define the ranges various user-specified maximum values can
- * be set to.
- */
-#define MIN_MSGMAX     1               /* min value for msg_max */
-#define MAX_MSGMAX     HARD_MSGMAX     /* max value for msg_max */
-#define MIN_MSGSIZEMAX 128             /* min value for msgsize_max */
-#define MAX_MSGSIZEMAX (8192*128)      /* max value for msgsize_max */
-
 #ifdef CONFIG_PROC_SYSCTL
 static void *get_mq(ctl_table *table)
 {
@@ -31,16 +22,6 @@ static void *get_mq(ctl_table *table)
        return which;
 }
 
-static int proc_mq_dointvec(ctl_table *table, int write,
-       void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-       struct ctl_table mq_table;
-       memcpy(&mq_table, table, sizeof(mq_table));
-       mq_table.data = get_mq(table);
-
-       return proc_dointvec(&mq_table, write, buffer, lenp, ppos);
-}
-
 static int proc_mq_dointvec_minmax(ctl_table *table, int write,
        void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -52,15 +33,17 @@ static int proc_mq_dointvec_minmax(ctl_table *table, int write,
                                        lenp, ppos);
 }
 #else
-#define proc_mq_dointvec NULL
 #define proc_mq_dointvec_minmax NULL
 #endif
 
+static int msg_queues_limit_min = MIN_QUEUESMAX;
+static int msg_queues_limit_max = HARD_QUEUESMAX;
+
 static int msg_max_limit_min = MIN_MSGMAX;
-static int msg_max_limit_max = MAX_MSGMAX;
+static int msg_max_limit_max = HARD_MSGMAX;
 
 static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
-static int msg_maxsize_limit_max = MAX_MSGSIZEMAX;
+static int msg_maxsize_limit_max = HARD_MSGSIZEMAX;
 
 static ctl_table mq_sysctls[] = {
        {
@@ -68,7 +51,9 @@ static ctl_table mq_sysctls[] = {
                .data           = &init_ipc_ns.mq_queues_max,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_mq_dointvec,
+               .proc_handler   = proc_mq_dointvec_minmax,
+               .extra1         = &msg_queues_limit_min,
+               .extra2         = &msg_queues_limit_max,
        },
        {
                .procname       = "msg_max",
index 5b4293d9819d87b636d09d63224df43535e44fc4..0edbf54217f3babfdca5c6e00f22e0308aaafd30 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/capability.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/vmalloc.h>
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
@@ -32,6 +33,7 @@
 #include <linux/nsproxy.h>
 #include <linux/pid.h>
 #include <linux/ipc_namespace.h>
+#include <linux/user_namespace.h>
 #include <linux/slab.h>
 
 #include <net/sock.h>
@@ -127,7 +129,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 
        if (S_ISREG(mode)) {
                struct mqueue_inode_info *info;
-               struct task_struct *p = current;
                unsigned long mq_bytes, mq_msg_tblsz;
 
                inode->i_fop = &mqueue_file_operations;
@@ -142,14 +143,18 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
                info->qsize = 0;
                info->user = NULL;      /* set when all is ok */
                memset(&info->attr, 0, sizeof(info->attr));
-               info->attr.mq_maxmsg = ipc_ns->mq_msg_max;
-               info->attr.mq_msgsize = ipc_ns->mq_msgsize_max;
+               info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max, DFLT_MSG);
+               info->attr.mq_msgsize =
+                       min(ipc_ns->mq_msgsize_max, DFLT_MSGSIZE);
                if (attr) {
                        info->attr.mq_maxmsg = attr->mq_maxmsg;
                        info->attr.mq_msgsize = attr->mq_msgsize;
                }
                mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *);
-               info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
+               if (mq_msg_tblsz > KMALLOC_MAX_SIZE)
+                       info->messages = vmalloc(mq_msg_tblsz);
+               else
+                       info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
                if (!info->messages)
                        goto out_inode;
 
@@ -158,7 +163,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 
                spin_lock(&mq_lock);
                if (u->mq_bytes + mq_bytes < u->mq_bytes ||
-                   u->mq_bytes + mq_bytes > task_rlimit(p, RLIMIT_MSGQUEUE)) {
+                   u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
                        spin_unlock(&mq_lock);
                        /* mqueue_evict_inode() releases info->messages */
                        ret = -EMFILE;
@@ -270,7 +275,10 @@ static void mqueue_evict_inode(struct inode *inode)
        spin_lock(&info->lock);
        for (i = 0; i < info->attr.mq_curmsgs; i++)
                free_msg(info->messages[i]);
-       kfree(info->messages);
+       if (info->attr.mq_maxmsg * sizeof(struct msg_msg *) > KMALLOC_MAX_SIZE)
+               vfree(info->messages);
+       else
+               kfree(info->messages);
        spin_unlock(&info->lock);
 
        /* Total amount of bytes accounted for the mqueue */
@@ -309,8 +317,9 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,
                error = -EACCES;
                goto out_unlock;
        }
-       if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
-                       !capable(CAP_SYS_RESOURCE)) {
+       if (ipc_ns->mq_queues_count >= HARD_QUEUESMAX ||
+           (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
+            !capable(CAP_SYS_RESOURCE))) {
                error = -ENOSPC;
                goto out_unlock;
        }
@@ -543,9 +552,13 @@ static void __do_notify(struct mqueue_inode_info *info)
                        sig_i.si_errno = 0;
                        sig_i.si_code = SI_MESGQ;
                        sig_i.si_value = info->notify.sigev_value;
+                       /* map current pid/uid into info->owner's namespaces */
+                       rcu_read_lock();
                        sig_i.si_pid = task_tgid_nr_ns(current,
                                                ns_of_pid(info->notify_owner));
-                       sig_i.si_uid = current_uid();
+                       sig_i.si_uid = user_ns_map_uid(info->user->user_ns,
+                                               current_cred(), current_uid());
+                       rcu_read_unlock();
 
                        kill_pid_info(info->notify.sigev_signo,
                                      &sig_i, info->notify_owner);
@@ -590,7 +603,8 @@ static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
        if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
                return 0;
        if (capable(CAP_SYS_RESOURCE)) {
-               if (attr->mq_maxmsg > HARD_MSGMAX)
+               if (attr->mq_maxmsg > HARD_MSGMAX ||
+                   attr->mq_msgsize > HARD_MSGSIZEMAX)
                        return 0;
        } else {
                if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||
index 5215a81420df9b1802dd9f6c40c465f1b63a2bbd..10386b83a8cadd376e34c552d08fe2961ecf85d6 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -61,8 +61,8 @@
  * - A woken up task may not even touch the semaphore array anymore, it may
  *   have been destroyed already by a semctl(RMID).
  * - The synchronizations between wake-ups due to a timeout/signal and a
- *   wake-up due to a completed semaphore operation is achieved by using an
- *   intermediate state (IN_WAKEUP).
+ *   wake-up due to a completed semaphore operation is achieved by using a
+ *   special wakeup scheme (queuewakeup_wait and support functions)
  * - UNDO values are stored in an array (one per process and per
  *   semaphore array, lazily allocated). For backwards compatibility, multiple
  *   modes for the UNDO variables are supported (per process, per thread)
 #include <asm/uaccess.h>
 #include "util.h"
 
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+       #define SYSVSEM_COMPLETION 1
+#else
+       #define SYSVSEM_CUSTOM 1
+#endif
+
+#ifdef SYSVSEM_COMPLETION
+       /* Using a completion causes some overhead, but avoids a busy loop
+        * that increases the worst case latency.
+        */
+       struct queue_done {
+               struct completion done;
+       };
+
+       static void queuewakeup_prepare(void)
+       {
+               /* no preparation necessary */
+       }
+
+       static void queuewakeup_completed(void)
+       {
+               /* empty */
+       }
+
+       static void queuewakeup_block(struct queue_done *qd)
+       {
+               /* empty */
+       }
+
+       static void queuewakeup_handsoff(struct queue_done *qd)
+       {
+               complete_all(&qd->done);
+       }
+
+       static void queuewakeup_init(struct queue_done *qd)
+       {
+               init_completion(&qd->done);
+       }
+
+       static void queuewakeup_wait(struct queue_done *qd)
+       {
+               wait_for_completion(&qd->done);
+       }
+
+#elif defined(SYSVSEM_SPINLOCK)
+       /* Note: Spinlocks do not work because:
+        * - lockdep complains [could be fixed]
+        * - only 255 concurrent spin_lock() calls are permitted, then the
+        *   preempt-counter overflows
+        */
+#error SYSVSEM_SPINLOCK is a proof of concept and does not work.
+       struct queue_done {
+               spinlock_t done;
+       };
+
+       static void queuewakeup_prepare(void)
+       {
+               /* empty */
+       }
+
+       static void queuewakeup_completed(void)
+       {
+               /* empty */
+       }
+
+       static void queuewakeup_block(struct queue_done *qd)
+       {
+               BUG_ON(spin_is_locked(&qd->done));
+               spin_lock(&qd->done);
+       }
+
+       static void queuewakeup_handsoff(struct queue_done *qd)
+       {
+               spin_unlock(&qd->done);
+       }
+
+       static void queuewakeup_init(struct queue_done *qd)
+       {
+               spin_lock_init(&qd->done);
+       }
+
+       static void queuewakeup_wait(struct queue_done *qd)
+       {
+               spin_unlock_wait(&qd->done);
+       }
+#else
+       struct queue_done {
+               atomic_t done;
+       };
+
+       static void queuewakeup_prepare(void)
+       {
+               preempt_disable();
+       }
+
+       static void queuewakeup_completed(void)
+       {
+               preempt_enable();
+       }
+
+       static void queuewakeup_block(struct queue_done *qd)
+       {
+               BUG_ON(atomic_read(&qd->done) != 1);
+               atomic_set(&qd->done, 2);
+       }
+
+       static void queuewakeup_handsoff(struct queue_done *qd)
+       {
+               BUG_ON(atomic_read(&qd->done) != 2);
+               smp_mb();
+               atomic_set(&qd->done, 1);
+       }
+
+       static void queuewakeup_init(struct queue_done *qd)
+       {
+               atomic_set(&qd->done, 1);
+       }
+
+       static void queuewakeup_wait(struct queue_done *qd)
+       {
+               while (atomic_read(&qd->done) != 1)
+                       cpu_relax();
+
+               smp_mb();
+       }
+#endif
+
+
 /* One semaphore structure for each semaphore in the system. */
 struct sem {
        int     semval;         /* current value */
@@ -108,6 +237,7 @@ struct sem_queue {
        struct sembuf           *sops;   /* array of pending operations */
        int                     nsops;   /* number of operations */
        int                     alter;   /* does *sops alter the array? */
+       struct queue_done       done;    /* completion synchronization */
 };
 
 /* Each task has a list of undo requests. They are executed automatically
@@ -245,23 +375,27 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
  * - queue.status is initialized to -EINTR before blocking.
  * - wakeup is performed by
  *     * unlinking the queue entry from sma->sem_pending
- *     * setting queue.status to IN_WAKEUP
- *       This is the notification for the blocked thread that a
- *       result value is imminent.
+ *     * setting queue.status to the actual result code
+ *       This is the notification for the blocked thread that someone
+ *       (usually: update_queue()) completed the semtimedop() operation.
  *     * call wake_up_process
- *     * set queue.status to the final value.
+ *     * queuewakeup_handsoff(&q->done);
  * - the previously blocked thread checks queue.status:
- *     * if it's IN_WAKEUP, then it must wait until the value changes
- *     * if it's not -EINTR, then the operation was completed by
- *       update_queue. semtimedop can return queue.status without
- *       performing any operation on the sem array.
- *     * otherwise it must acquire the spinlock and check what's up.
+ *     * if it's not -EINTR, then someone completed the operation.
+ *       First, queuewakeup_wait() must be called. Afterwards,
+ *       semtimedop must return queue.status without performing any
+ *       operation on the sem array.
+ *       - otherwise it must acquire the spinlock and repeat the test
+ *       - If it is still -EINTR, then no update_queue() completed the
+ *         operation, thus semtimedop() can proceed normally.
  *
- * The two-stage algorithm is necessary to protect against the following
+ * queuewakeup_wait() is necessary to protect against the following
  * races:
  * - if queue.status is set after wake_up_process, then the woken up idle
  *   thread could race forward and try (and fail) to acquire sma->lock
- *   before update_queue had a chance to set queue.status
+ *   before update_queue had a chance to set queue.status.
+ *   More importantly, it would mean that wake_up_process must be done
+ *   while holding sma->lock, which would reduce scalability.
  * - if queue.status is written before wake_up_process and if the
  *   blocked process is woken up by a signal between writing
  *   queue.status and the wake_up_process, then the woken up
@@ -271,7 +405,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
  *   (yes, this happened on s390 with sysv msg).
  *
  */
-#define IN_WAKEUP      1
 
 /**
  * newary - Create a new semaphore set
@@ -461,15 +594,11 @@ undo:
 static void wake_up_sem_queue_prepare(struct list_head *pt,
                                struct sem_queue *q, int error)
 {
-       if (list_empty(pt)) {
-               /*
-                * Hold preempt off so that we don't get preempted and have the
-                * wakee busy-wait until we're scheduled back on.
-                */
-               preempt_disable();
-       }
-       q->status = IN_WAKEUP;
-       q->pid = error;
+       if (list_empty(pt))
+               queuewakeup_prepare();
+
+       queuewakeup_block(&q->done);
+       q->status = error;
 
        list_add_tail(&q->simple_list, pt);
 }
@@ -480,8 +609,8 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
  *
  * Do the actual wake-up.
  * The function is called without any locks held, thus the semaphore array
- * could be destroyed already and the tasks can disappear as soon as the
- * status is set to the actual return code.
+ * could be destroyed already and the tasks can disappear as soon as
+ * queuewakeup_handsoff() is called.
  */
 static void wake_up_sem_queue_do(struct list_head *pt)
 {
@@ -491,12 +620,11 @@ static void wake_up_sem_queue_do(struct list_head *pt)
        did_something = !list_empty(pt);
        list_for_each_entry_safe(q, t, pt, simple_list) {
                wake_up_process(q->sleeper);
-               /* q can disappear immediately after writing q->status. */
-               smp_wmb();
-               q->status = q->pid;
+               /* q can disappear immediately after completing q->done */
+               queuewakeup_handsoff(&q->done);
        }
        if (did_something)
-               preempt_enable();
+               queuewakeup_completed();
 }
 
 static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
@@ -1300,33 +1428,6 @@ out:
        return un;
 }
 
-
-/**
- * get_queue_result - Retrieve the result code from sem_queue
- * @q: Pointer to queue structure
- *
- * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
- * q->status, then we must loop until the value is replaced with the final
- * value: This may happen if a task is woken up by an unrelated event (e.g.
- * signal) and in parallel the task is woken up by another task because it got
- * the requested semaphores.
- *
- * The function can be called with or without holding the semaphore spinlock.
- */
-static int get_queue_result(struct sem_queue *q)
-{
-       int error;
-
-       error = q->status;
-       while (unlikely(error == IN_WAKEUP)) {
-               cpu_relax();
-               error = q->status;
-       }
-
-       return error;
-}
-
-
 SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
                unsigned, nsops, const struct timespec __user *, timeout)
 {
@@ -1472,6 +1573,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
        queue.status = -EINTR;
        queue.sleeper = current;
+       queuewakeup_init(&queue.done);
 
 sleep_again:
        current->state = TASK_INTERRUPTIBLE;
@@ -1482,17 +1584,14 @@ sleep_again:
        else
                schedule();
 
-       error = get_queue_result(&queue);
+       error = queue.status;
 
        if (error != -EINTR) {
                /* fast path: update_queue already obtained all requested
-                * resources.
-                * Perform a smp_mb(): User space could assume that semop()
-                * is a memory barrier: Without the mb(), the cpu could
-                * speculatively read in user space stale data that was
-                * overwritten by the previous owner of the semaphore.
+                * resources. Just ensure that update_queue completed
+                * its access to &queue.
                 */
-               smp_mb();
+               queuewakeup_wait(&queue.done);
 
                goto out_free;
        }
@@ -1502,23 +1601,16 @@ sleep_again:
        /*
         * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing.
         */
-       error = get_queue_result(&queue);
-
-       /*
-        * Array removed? If yes, leave without sem_unlock().
-        */
-       if (IS_ERR(sma)) {
-               goto out_free;
-       }
-
-
-       /*
-        * If queue.status != -EINTR we are woken up by another process.
-        * Leave without unlink_queue(), but with sem_unlock().
-        */
-
+       error = queue.status;
        if (error != -EINTR) {
-               goto out_unlock_free;
+               /* If there is a return code, then we can leave immediately. */
+               if (!IS_ERR(sma)) {
+                       /* sem_lock() succeeded - then unlock */
+                       sem_unlock(sma);
+               }
+               /* Except that we must wait for the hands-off */
+               queuewakeup_wait(&queue.done);
+               goto out_free;
        }
 
        /*
index 4363a41e1ff4e0a4f66767a442e20b5c0045b033..174edccdc884042835aa3369ec0c1da75614d976 100644 (file)
@@ -6,7 +6,7 @@ obj-y     = fork.o exec_domain.o panic.o printk.o \
            cpu.o exit.o itimer.o time.o softirq.o resource.o \
            sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
            signal.o sys.o kmod.o workqueue.o pid.o \
-           rcupdate.o extable.o params.o posix-timers.o \
+           rcupdate.o extable.o params.o posix-timers.o lglock.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o cred.o \
index 09fae2677a45e11ab650b91b1dee7877a2ee8957..2c1d6ab7106ee9921bc8456d0e9b5e11637a11f3 100644 (file)
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
                avail = audit_expand(ab,
                        max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
                if (!avail)
-                       goto out;
+                       goto out_va_end;
                len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
        }
-       va_end(args2);
        if (len > 0)
                skb_put(skb, len);
+out_va_end:
+       va_end(args2);
 out:
        return;
 }
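The hunk above fixes a missing va_end() on the error path. The general rule it follows: every va_copy() must be matched by a va_end() on every exit path. A small user-space sketch (hypothetical helper, not part of this patch):

#include <stdarg.h>
#include <stdio.h>

static int format_checked(char *buf, size_t len, const char *fmt, ...)
{
	va_list args, args2;
	int n;

	va_start(args, fmt);
	va_copy(args2, args);

	n = vsnprintf(NULL, 0, fmt, args);	/* measure first */
	if (n < 0 || (size_t)n >= len)
		goto out;			/* error path must still clean up */
	n = vsnprintf(buf, len, fmt, args2);
out:
	va_end(args2);
	va_end(args);
	return n;
}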
index 42e568306382e0213d9149af6a57d137647b8fd0..ca85d353e0fef674c93588073b24b5760c095139 100644 (file)
@@ -59,6 +59,7 @@
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
+#include <linux/kthread.h>
 
 /*
  * Workqueue for cpuset related tasks.
@@ -123,6 +124,19 @@ static inline struct cpuset *task_cs(struct task_struct *task)
                            struct cpuset, css);
 }
 
+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+       return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+       return false;
+}
+#endif
+
+
 /* bits in struct cpuset flags field */
 typedef enum {
        CS_CPU_EXCLUSIVE,
@@ -949,7 +963,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 static void cpuset_change_task_nodemask(struct task_struct *tsk,
                                        nodemask_t *newmems)
 {
-       bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
+       bool need_loop;
 
 repeat:
        /*
@@ -962,6 +976,14 @@ repeat:
                return;
 
        task_lock(tsk);
+       /*
+        * Determine if a loop is necessary if another thread is doing
+        * get_mems_allowed().  If at least one node remains unchanged and
+        * tsk does not have a mempolicy, then an empty nodemask will not be
+        * possible when mems_allowed is larger than a word.
+        */
+       need_loop = task_has_mempolicy(tsk) ||
+                       !nodes_intersects(*newmems, tsk->mems_allowed);
        nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
        mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
@@ -981,11 +1003,9 @@ repeat:
 
        /*
         * Allocation of memory is very fast, we needn't sleep when waiting
-        * for the read-side.  No wait is necessary, however, if at least one
-        * node remains unchanged.
+        * for the read-side.
         */
-       while (masks_disjoint &&
-                       ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+       while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
                task_unlock(tsk);
                if (!task_curr(tsk))
                        yield();
@@ -1398,9 +1418,11 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                 * unnecessary.  Thus, cpusets are not applicable for such
                 * threads.  This prevents checking for success of
                 * set_cpus_allowed_ptr() on all attached tasks before
-                * cpus_allowed may be changed.
+                * cpus_allowed may be changed.  We also disallow attaching
+                * kthreadd, to prevent its child from becoming trapped should
+                * it then acquire PF_THREAD_BOUND.
                 */
-               if (task->flags & PF_THREAD_BOUND)
+               if (task->flags & PF_THREAD_BOUND || task == kthreadd_task)
                        return -EINVAL;
                if ((ret = security_task_setscheduler(task)))
                        return ret;
index 766b805648861c9b9d917dd7f90d02d49d2eb46a..56b813a72b0e2eb718585bd3fe78666f5d0cb154 100644 (file)
@@ -887,7 +887,7 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
 
-NORET_TYPE void do_exit(long code)
+void do_exit(long code)
 {
        struct task_struct *tsk = current;
        int group_dead;
@@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code)
 
 EXPORT_SYMBOL_GPL(do_exit);
 
-NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+void complete_and_exit(struct completion *comp, long code)
 {
        if (comp)
                complete(comp);
@@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
  * Take down every thread in the group.  This is called by fatal signals
  * as well as by sys_exit_group (below).
  */
-NORET_TYPE void
+void
 do_group_exit(int exit_code)
 {
        struct signal_struct *sig = current->signal;
index 2a8a46561fc024e7732fa78a12a113a2f2117de1..e5ac415a268f5ca027e6475c95e43a8341e32fbb 100644 (file)
@@ -77,6 +77,9 @@
 
 #include <trace/events/sched.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -736,6 +739,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
        /* Initializing for Swap token stuff */
        mm->token_priority = 0;
        mm->last_interval = 0;
+       atomic_set(&mm->active_swap_token, 0);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        mm->pmd_huge_pte = NULL;
@@ -1388,6 +1392,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (clone_flags & CLONE_THREAD)
                threadgroup_change_end(current);
        perf_event_fork(p);
+
+       trace_task_newtask(p, clone_flags);
+
        return p;
 
 bad_fork_free_pid:
index ae34bf51682b4a204de93f62943055350cd5c4d0..a53fff92fda4417fe2718bc7144f144c32f5ae5d 100644 (file)
@@ -1568,6 +1568,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
        if (rt_task(current))
                slack = 0;
 
+       /*
+        * Applications will often sleep(0) to indicate that they wish to
+        * be scheduled. Special case that to avoid actually putting them
+        * to sleep for the duration of the slack.
+        */
+       if (rqtp->tv_sec == 0 && rqtp->tv_nsec == 0)
+               slack = 0;
+
        hrtimer_init_on_stack(&t.timer, clockid, mode);
        hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
        if (do_nanosleep(&t, mode))
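The comment added above refers to the common user-space idiom of sleeping for zero time to ask the scheduler for a reschedule; with this change, such calls are no longer stretched by the timer slack. A minimal illustration of that idiom:

#include <time.h>

static void yield_via_nanosleep(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 0 };

	/* effectively a yield; the zero-length special case keeps it cheap */
	nanosleep(&ts, NULL);
}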
index 090ee10d960485c6b9ed1923ef66c467c6fa19ea..7b0886786701b845cfbd8823252392e40b11a1c8 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/console.h>
 #include <linux/vmalloc.h>
 #include <linux/swap.h>
-#include <linux/kmsg_dump.h>
 #include <linux/syscore_ops.h>
 
 #include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
                if (kexec_crash_image) {
                        struct pt_regs fixed_regs;
 
-                       kmsg_dump(KMSG_DUMP_KEXEC);
-
                        crash_setup_regs(&fixed_regs, regs);
                        crash_save_vmcoreinfo();
                        machine_crash_shutdown(&fixed_regs);
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
 {
        int ret = 0;
        unsigned long start, end;
+       unsigned long old_size;
+       struct resource *ram_res;
 
        mutex_lock(&kexec_mutex);
 
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)
        }
        start = crashk_res.start;
        end = crashk_res.end;
+       old_size = (end == 0) ? 0 : end - start + 1;
+       if (new_size >= old_size) {
+               ret = (new_size == old_size) ? 0 : -EINVAL;
+               goto unlock;
+       }
 
-       if (new_size >= end - start + 1) {
-               ret = -EINVAL;
-               if (new_size == end - start + 1)
-                       ret = 0;
+       ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+       if (!ram_res) {
+               ret = -ENOMEM;
                goto unlock;
        }
 
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)
 
        if ((start == end) && (crashk_res.parent != NULL))
                release_resource(&crashk_res);
+
+       ram_res->start = end;
+       ram_res->end = crashk_res.end;
+       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       ram_res->name = "System RAM";
+
        crashk_res.end = end - 1;
+
+       insert_resource(&iomem_resource, ram_res);
        crash_unmap_reserved_pages();
 
 unlock:
diff --git a/kernel/lglock.c b/kernel/lglock.c
new file mode 100644 (file)
index 0000000..26c5bc2
--- /dev/null
@@ -0,0 +1,101 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+       LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+       arch_spinlock_t *lock;
+       preempt_disable();
+       rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       lock = this_cpu_ptr(lg->lock);
+       arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+       arch_spinlock_t *lock;
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       lock = this_cpu_ptr(lg->lock);
+       arch_spin_unlock(lock);
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+       arch_spinlock_t *lock;
+       preempt_disable();
+       rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       lock = per_cpu_ptr(lg->lock, cpu);
+       arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+       arch_spinlock_t *lock;
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       lock = per_cpu_ptr(lg->lock, cpu);
+       arch_spin_unlock(lock);
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock_online(struct lglock *lg)
+{
+       int i;
+       preempt_disable();
+       rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       for_each_online_cpu(i) {
+               arch_spinlock_t *lock;
+               lock = per_cpu_ptr(lg->lock, i);
+               arch_spin_lock(lock);
+       }
+}
+EXPORT_SYMBOL(lg_global_lock_online);
+
+void lg_global_unlock_online(struct lglock *lg)
+{
+       int i;
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       for_each_online_cpu(i) {
+               arch_spinlock_t *lock;
+               lock = per_cpu_ptr(lg->lock, i);
+               arch_spin_unlock(lock);
+       }
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock_online);
+
+void lg_global_lock(struct lglock *lg)
+{
+       int i;
+       preempt_disable();
+       rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       for_each_possible_cpu(i) {
+               arch_spinlock_t *lock;
+               lock = per_cpu_ptr(lg->lock, i);
+               arch_spin_lock(lock);
+       }
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+       int i;
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       for_each_possible_cpu(i) {
+               arch_spinlock_t *lock;
+               lock = per_cpu_ptr(lg->lock, i);
+               arch_spin_unlock(lock);
+       }
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
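A minimal usage sketch of the lg_*() API above, assuming DEFINE_LGLOCK() in include/linux/lglock.h (not part of this hunk) sets up the per-cpu arch_spinlock_t storage that lg->lock points at; the lock name is illustrative:

#include <linux/lglock.h>

DEFINE_LGLOCK(example_lglock);

static void touch_this_cpus_data(void)
{
	lg_local_lock(&example_lglock);		/* cheap: one CPU's spinlock */
	/* ... modify per-cpu state ... */
	lg_local_unlock(&example_lglock);
}

static void walk_all_cpus_data(void)
{
	lg_global_lock(&example_lglock);	/* expensive: every CPU's spinlock */
	/* ... read/modify state of all CPUs ... */
	lg_global_unlock(&example_lglock);
}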
index 3458469eb7c3489badf84de6916defa9cd6472a0..5dce5404eeef1113a3cb5e563cd7dccebd072ecb 100644 (file)
@@ -49,6 +49,15 @@ static long no_blink(int state)
 long (*panic_blink)(int state);
 EXPORT_SYMBOL(panic_blink);
 
+/*
+ * Stop ourself in panic -- architecture code may override this
+ */
+void __weak panic_smp_self_stop(void)
+{
+       while (1)
+               cpu_relax();
+}
+
 /**
  *     panic - halt the system
  *     @fmt: The text string to print
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
  *
  *     This function never returns.
  */
-NORET_TYPE void panic(const char * fmt, ...)
+void panic(const char *fmt, ...)
 {
+       static DEFINE_SPINLOCK(panic_lock);
        static char buf[1024];
        va_list args;
        long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
         * It's possible to come here directly from a panic-assertion and
         * not have preempt disabled. Some functions called from here want
         * preempt to be disabled. No point enabling it later though...
+        *
+        * Only one CPU is allowed to execute the panic code from here. For
+        * multiple parallel invocations of panic, all other CPUs either
+        * stop themselves or wait until they are stopped by the 1st CPU
+        * with smp_send_stop().
         */
-       preempt_disable();
+       if (!spin_trylock(&panic_lock))
+               panic_smp_self_stop();
 
        console_verbose();
        bust_spinlocks(1);
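panic_smp_self_stop() above is a __weak default, so an architecture can provide its own way of parking a CPU that lost the panic_lock race. A hypothetical override (not from this patch):

void panic_smp_self_stop(void)
{
	/* park with interrupts off instead of spinning with them enabled */
	local_irq_disable();
	while (1)
		cpu_relax();
}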
index fa5f72227e5f432127c4f3b1383e3c93abcd0eaa..ce8e00deaccb38452188436438dc942313bd6c44 100644 (file)
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
 }
 
 /*
- * We might be racing with someone else trying to set pid_ns->last_pid.
+ * We might be racing with someone else trying to set pid_ns->last_pid
+ * at the pid allocation time (there's also a sysctl for this, but racing
+ * with this one is OK, see comment in kernel/pid_namespace.c about it).
  * We want the winner to have the "later" value, because if the
  * "earlier" value prevails, then a pid may get reused immediately.
  *
index e9c9adc84ca6e50f5f457e081bb45c36b35bf4d1..a8968396046d3b2f9310c0ca6bd6bb0757c34cca 100644 (file)
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
        return;
 }
 
+static int pid_ns_ctl_handler(struct ctl_table *table, int write,
+               void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table tmp = *table;
+
+       if (write && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /*
+        * Writing directly to ns' last_pid field is OK, since this field
+        * is volatile in a living namespace anyway and any code writing to
+        * it should synchronize its usage with external means.
+        */
+
+       tmp.data = &current->nsproxy->pid_ns->last_pid;
+       return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table pid_ns_ctl_table[] = {
+       {
+               .procname = "ns_last_pid",
+               .maxlen = sizeof(int),
+               .mode = 0666, /* permissions are checked in the handler */
+               .proc_handler = pid_ns_ctl_handler,
+       },
+       { }
+};
+
+static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
+
 static __init int pid_namespaces_init(void)
 {
        pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+       register_sysctl_paths(kern_path, pid_ns_ctl_table);
        return 0;
 }
 
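A user-space sketch of the ns_last_pid interface registered above: writing last_pid lets a checkpoint/restore tool choose the pid of the next task it forks (requires CAP_SYS_ADMIN; assumes no other task forks in the namespace in between):

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static pid_t fork_with_pid(pid_t desired)
{
	FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");

	if (!f)
		return -1;
	/* the next pid handed out in this namespace will be last_pid + 1 */
	fprintf(f, "%d", (int)desired - 1);
	fclose(f);
	return fork();
}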
index 8018450517a9b1f3a33763be9341fadfae25c259..383c637aa25ac03aee64e6cb8413918754aba055 100644 (file)
@@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
            PageReserved(page))
                return NULL;
 
+       if (page_is_guard(page))
+               return NULL;
+
        return page;
 }
 
@@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
            && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
                return NULL;
 
+       if (page_is_guard(page))
+               return NULL;
+
        return page;
 }
 
index 1730c24d8ff49b92756659c201529f0bd3efde6c..4bc4750172192f6b678a95418a27a0a1ca78e4e6 100644 (file)
@@ -71,6 +71,7 @@
 #include <linux/ftrace.h>
 #include <linux/slab.h>
 #include <linux/init_task.h>
+#include <linux/kthread.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -7535,6 +7536,14 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
        struct task_struct *task;
 
        cgroup_taskset_for_each(task, cgrp, tset) {
+               /*
+                * kthreadd can fork workers for an RT workqueue in a cgroup
+                * which may or may not have rt_runtime allocated.  Just say no,
+                * as attaching a global resource to a non-root group doesn't
+                * make any sense anyway.
+                */
+               if (task == kthreadd_task)
+                       return -EINVAL;
 #ifdef CONFIG_RT_GROUP_SCHED
                if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
                        return -EINVAL;
index 0b569656ca045ca8407be38df17a0f34afff5cd0..2daed3188118d43470500be91248b6414b3caf80 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
 
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
        return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
+/*
+ * map the uid in struct cred into user namespace *ns
+ */
+static inline uid_t map_cred_ns(const struct cred *cred,
+                               struct user_namespace *ns)
+{
+       return user_ns_map_uid(ns, cred, cred->uid);
+}
+
+#ifdef CONFIG_USER_NS
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+       if (current_user_ns() == task_cred_xxx(t, user_ns))
+               return;
+
+       if (SI_FROMKERNEL(info))
+               return;
+
+       info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
+                                       current_cred(), info->si_uid);
+}
+#else
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+       return;
+}
+#endif
+
 static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
                        int group, int from_ancestor_ns)
 {
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
                                q->info.si_pid = 0;
                        break;
                }
+
+               userns_fixup_signal_uid(&q->info, t);
+
        } else if (!is_si_special(info)) {
                if (sig >= SIGRTMIN && info->si_code != SI_USER) {
                        /*
@@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
-       info.si_uid = __task_cred(tsk)->uid;
+       info.si_uid = map_cred_ns(__task_cred(tsk),
+                       task_cred_xxx(tsk->parent, user_ns));
        rcu_read_unlock();
 
        info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
-       info.si_uid = __task_cred(tsk)->uid;
+       info.si_uid = map_cred_ns(__task_cred(tsk),
+                       task_cred_xxx(parent, user_ns));
        rcu_read_unlock();
 
        info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -2127,8 +2161,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
                info->si_signo = signr;
                info->si_errno = 0;
                info->si_code = SI_USER;
+               rcu_read_lock();
                info->si_pid = task_pid_vnr(current->parent);
-               info->si_uid = task_uid(current->parent);
+               info->si_uid = map_cred_ns(__task_cred(current->parent),
+                               current_user_ns());
+               rcu_read_unlock();
        }
 
        /* If the (new) signal is now blocked, requeue it.  */
index ddf8155bf3f8c09a3745a3cea82ec4e9aef49208..40701538fbd168db2de95315ac5c512a79686f52 100644 (file)
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
        return mask;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_set_mm(int opt, unsigned long addr,
+                       unsigned long arg4, unsigned long arg5)
+{
+       unsigned long rlim = rlimit(RLIMIT_DATA);
+       unsigned long vm_req_flags;
+       unsigned long vm_bad_flags;
+       struct vm_area_struct *vma;
+       int error = 0;
+       struct mm_struct *mm = current->mm;
+
+       if (arg4 | arg5)
+               return -EINVAL;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (addr >= TASK_SIZE)
+               return -EINVAL;
+
+       down_read(&mm->mmap_sem);
+       vma = find_vma(mm, addr);
+
+       if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
+               /* It must be existing VMA */
+               if (!vma || vma->vm_start > addr)
+                       goto out;
+       }
+
+       error = -EINVAL;
+       switch (opt) {
+       case PR_SET_MM_START_CODE:
+       case PR_SET_MM_END_CODE:
+               vm_req_flags = VM_READ | VM_EXEC;
+               vm_bad_flags = VM_WRITE | VM_MAYSHARE;
+
+               if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+                   (vma->vm_flags & vm_bad_flags))
+                       goto out;
+
+               if (opt == PR_SET_MM_START_CODE)
+                       mm->start_code = addr;
+               else
+                       mm->end_code = addr;
+               break;
+
+       case PR_SET_MM_START_DATA:
+       case PR_SET_MM_END_DATA:
+               vm_req_flags = VM_READ | VM_WRITE;
+               vm_bad_flags = VM_EXEC | VM_MAYSHARE;
+
+               if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+                   (vma->vm_flags & vm_bad_flags))
+                       goto out;
+
+               if (opt == PR_SET_MM_START_DATA)
+                       mm->start_data = addr;
+               else
+                       mm->end_data = addr;
+               break;
+
+       case PR_SET_MM_START_STACK:
+
+#ifdef CONFIG_STACK_GROWSUP
+               vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
+#else
+               vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
+#endif
+               if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
+                       goto out;
+
+               mm->start_stack = addr;
+               break;
+
+       case PR_SET_MM_START_BRK:
+               if (addr <= mm->end_data)
+                       goto out;
+
+               if (rlim < RLIM_INFINITY &&
+                   (mm->brk - addr) +
+                   (mm->end_data - mm->start_data) > rlim)
+                       goto out;
+
+               mm->start_brk = addr;
+               break;
+
+       case PR_SET_MM_BRK:
+               if (addr <= mm->end_data)
+                       goto out;
+
+               if (rlim < RLIM_INFINITY &&
+                   (addr - mm->start_brk) +
+                   (mm->end_data - mm->start_data) > rlim)
+                       goto out;
+
+               mm->brk = addr;
+               break;
+
+       default:
+               error = -EINVAL;
+               goto out;
+       }
+
+       error = 0;
+
+out:
+       up_read(&mm->mmap_sem);
+
+       return error;
+}
+#else /* CONFIG_CHECKPOINT_RESTORE */
+static int prctl_set_mm(int opt, unsigned long addr,
+                       unsigned long arg4, unsigned long arg5)
+{
+       return -EINVAL;
+}
+#endif
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                unsigned long, arg4, unsigned long, arg5)
 {
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                        else
                                error = PR_MCE_KILL_DEFAULT;
                        break;
+               case PR_SET_MM:
+                       error = prctl_set_mm(arg2, arg3, arg4, arg5);
+                       break;
                default:
                        error = -EINVAL;
                        break;
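A user-space sketch of the new prctl option wired up above; the PR_SET_MM* constants come from the uapi header added by this series, so older libc headers may not define them yet (the caller needs CAP_SYS_ADMIN):

#include <sys/prctl.h>

static int restore_brk(unsigned long start_brk, unsigned long brk)
{
	if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, start_brk, 0, 0))
		return -1;
	return prctl(PR_SET_MM, PR_SET_MM_BRK, brk, 0, 0);
}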
index f487f257e05e4f1dc8836ded96ca36556b048706..a0d30d00fa050a98ae46d348d656847930078cdf 100644 (file)
@@ -96,7 +96,6 @@ extern int suid_dumpable;
 extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 extern int pid_max;
-extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1205,7 +1204,15 @@ static struct ctl_table vm_table[] = {
                .data           = &min_free_kbytes,
                .maxlen         = sizeof(min_free_kbytes),
                .mode           = 0644,
-               .proc_handler   = min_free_kbytes_sysctl_handler,
+               .proc_handler   = free_kbytes_sysctl_handler,
+               .extra1         = &zero,
+       },
+       {
+               .procname       = "extra_free_kbytes",
+               .data           = &extra_free_kbytes,
+               .maxlen         = sizeof(extra_free_kbytes),
+               .mode           = 0644,
+               .proc_handler   = free_kbytes_sysctl_handler,
                .extra1         = &zero,
        },
        {
index 6318b511afa10b3044c7bbd1eb49b28f7c490fcc..a650694883a180e93c5ec1d6414e45ba904fcff3 100644 (file)
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 
        fput(file);
 out_putname:
-       putname(pathname);
+       __putname(pathname);
 out:
        return result;
 }
index 696c997f8784ae1a0a4f9a20cbc1a9d5133fe399..1305a49f20e3c89da506690fd91d313ace629758 100644 (file)
@@ -865,6 +865,73 @@ void tick_cancel_sched_timer(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SYSFS
+/*
+ * Allow modification of tick_do_timer_cpu when nohz mode is off.
+ */
+static ssize_t sysfs_store_do_timer_cpu(struct sys_device *dev,
+                                               struct sysdev_attribute *attr,
+                                               const char *buf, size_t size)
+{
+       struct sysdev_ext_attribute *ea = SYSDEV_TO_EXT_ATTR(attr);
+       unsigned int new;
+       int rv;
+
+#ifdef CONFIG_NO_HZ
+       /* nohz mode not supported */
+       if (tick_nohz_enabled)
+               return -EINVAL;
+#endif
+
+       rv = kstrtouint(buf, 0, &new);
+       if (rv)
+               return rv;
+
+       /* Protect against cpu-hotplug */
+       get_online_cpus();
+
+       if (new >= nr_cpu_ids || !cpu_online(new)) {
+               put_online_cpus();
+               return -ERANGE;
+       }
+
+       *(unsigned int *)(ea->var) = new;
+
+       put_online_cpus();
+
+       return size;
+}
+
+static struct sysdev_ext_attribute attr_jiffies_cpu = {
+                       _SYSDEV_ATTR(jiffies_cpu, 0644, sysdev_show_int,
+                                       sysfs_store_do_timer_cpu),
+                       &tick_do_timer_cpu };
+
+static struct sysdev_class timekeeping_sysclass = {
+       .name = "timekeeping",
+};
+
+static struct sys_device device_timekeeping = {
+       .id     = 0,
+       .cls    = &timekeeping_sysclass,
+};
+
+static int __init init_timekeeping_sysfs(void)
+{
+       int error = sysdev_class_register(&timekeeping_sysclass);
+
+       if (!error)
+               error = sysdev_register(&device_timekeeping);
+       if (!error)
+               error = sysdev_create_file(
+                               &device_timekeeping,
+                               &attr_jiffies_cpu.attr);
+       return error;
+}
+
+device_initcall(init_timekeeping_sysfs);
+#endif /* SYSFS */
+
 /**
  * Async notification about clocksource changes
  */
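A user-space sketch of the knob added above. Given the sysdev class "timekeeping" and device id 0, the attribute should appear as /sys/devices/system/timekeeping/timekeeping0/jiffies_cpu (path inferred from the names in this hunk, not stated in it); it is only writable while nohz is off:

#include <stdio.h>

static int pin_jiffies_update_to(unsigned int cpu)
{
	FILE *f = fopen("/sys/devices/system/timekeeping/timekeeping0/jiffies_cpu", "w");

	if (!f)
		return -1;
	fprintf(f, "%u", cpu);
	return fclose(f);
}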
index 42fa9ad0a810482494b5782f4ba9a23f4aa32b86..bec7b5b53e03db1d443a2f221255fd22344f4de4 100644 (file)
@@ -242,10 +242,10 @@ struct workqueue_struct {
 
        int                     nr_drainers;    /* W: drain in progress */
        int                     saved_max_active; /* W: saved cwq max_active */
-       const char              *name;          /* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
        struct lockdep_map      lockdep_map;
 #endif
+       char                    name[];         /* I: workqueue name */
 };
 
 struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
        return clamp_val(max_active, 1, lim);
 }
 
-struct workqueue_struct *__alloc_workqueue_key(const char *name,
+struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                                               unsigned int flags,
                                               int max_active,
                                               struct lock_class_key *key,
-                                              const char *lock_name)
+                                              const char *lock_name, ...)
 {
+       va_list args, args1;
        struct workqueue_struct *wq;
        unsigned int cpu;
+       size_t namelen;
+
+       /* determine namelen, allocate wq and format name */
+       va_start(args, lock_name);
+       va_copy(args1, args);
+       namelen = vsnprintf(NULL, 0, fmt, args) + 1;
+
+       wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
+       if (!wq)
+               goto err;
+
+       vsnprintf(wq->name, namelen, fmt, args1);
+       va_end(args);
+       va_end(args1);
 
        /*
         * Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
                flags |= WQ_HIGHPRI;
 
        max_active = max_active ?: WQ_DFL_ACTIVE;
-       max_active = wq_clamp_max_active(max_active, flags, name);
-
-       wq = kzalloc(sizeof(*wq), GFP_KERNEL);
-       if (!wq)
-               goto err;
+       max_active = wq_clamp_max_active(max_active, flags, wq->name);
 
+       /* init wq */
        wq->flags = flags;
        wq->saved_max_active = max_active;
        mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
 
-       wq->name = name;
        lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
        INIT_LIST_HEAD(&wq->list);
 
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
                if (!rescuer)
                        goto err;
 
-               rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+               rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+                                              wq->name);
                if (IS_ERR(rescuer->task))
                        goto err;
 
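With the name now formatted at allocation time, callers can embed ids directly in the workqueue name. A short sketch using the usual alloc_workqueue() wrapper around __alloc_workqueue_key():

#include <linux/workqueue.h>

static struct workqueue_struct *make_device_wq(int dev_id)
{
	/* the queue shows up as "mydev/<id>" in the rescuer thread name */
	return alloc_workqueue("mydev/%d", WQ_MEM_RECLAIM, 1, dev_id);
}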
index a6e633a48cea887fbba14b7c1cbab84bd485c67b..4b35d2b4437cc76b3b75b45ae2d21ac6076f7809 100644 (file)
@@ -51,20 +51,21 @@ static inline u32
 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
 {
 # ifdef __LITTLE_ENDIAN
-#  define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8)
-#  define DO_CRC4 crc = tab[3][(crc) & 255] ^ \
-               tab[2][(crc >> 8) & 255] ^ \
-               tab[1][(crc >> 16) & 255] ^ \
-               tab[0][(crc >> 24) & 255]
+#  define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)
+#  define DO_CRC4 crc = t3[(crc) & 255] ^ \
+               t2[(crc >> 8) & 255] ^ \
+               t1[(crc >> 16) & 255] ^ \
+               t0[(crc >> 24) & 255]
 # else
-#  define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
-#  define DO_CRC4 crc = tab[0][(crc) & 255] ^ \
-               tab[1][(crc >> 8) & 255] ^ \
-               tab[2][(crc >> 16) & 255] ^ \
-               tab[3][(crc >> 24) & 255]
+#  define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
+#  define DO_CRC4 crc = t0[(crc) & 255] ^ \
+               t1[(crc >> 8) & 255] ^  \
+               t2[(crc >> 16) & 255] ^ \
+               t3[(crc >> 24) & 255]
 # endif
        const u32 *b;
        size_t    rem_len;
+       const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
 
        /* Align it */
        if (unlikely((long)buf & 3 && len)) {
index 77cb245f8e7bdee634b98f6c275ae221c7c90715..0ab9ae8057f00e8545deba43ff31fab7873a619c 100644 (file)
@@ -818,17 +818,9 @@ static int __init fixup_activate(void *addr, enum debug_obj_state state)
                if (obj->static_init == 1) {
                        debug_object_init(obj, &descr_type_test);
                        debug_object_activate(obj, &descr_type_test);
-                       /*
-                        * Real code should return 0 here ! This is
-                        * not a fixup of some bad behaviour. We
-                        * merily call the debug_init function to keep
-                        * track of the object.
-                        */
-                       return 1;
-               } else {
-                       /* Real code needs to emit a warning here */
+                       return 0;
                }
-               return 0;
+               return 1;
 
        case ODEBUG_STATE_ACTIVE:
                debug_object_deactivate(obj, &descr_type_test);
@@ -967,7 +959,7 @@ static void __init debug_objects_selftest(void)
 
        obj.static_init = 1;
        debug_object_activate(&obj, &descr_type_test);
-       if (check_results(&obj, ODEBUG_STATE_ACTIVE, ++fixups, warnings))
+       if (check_results(&obj, ODEBUG_STATE_ACTIVE, fixups, warnings))
                goto out;
        debug_object_init(&obj, &descr_type_test);
        if (check_results(&obj, ODEBUG_STATE_INIT, ++fixups, ++warnings))
index 5a7a2adf4c4c2aa8d64d881e0ff1739acbe8782c..4531294fa62f2cb5de592178ddca2c0afc223558 100644 (file)
@@ -279,7 +279,7 @@ STATIC inline int INIT unlzo(u8 *input, int in_len,
        ret = 0;
 exit_2:
        if (!input)
-               free(in_buf);
+               free(in_buf_save);
 exit_1:
        if (!output)
                free(out_buf);
index 8b1a477162dc07242e7a461951bca291cff3dde7..4b2443254de260d2e05cdecb8d3eea903d91734f 100644 (file)
@@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC
        depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
        depends on !KMEMCHECK
        select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
+       select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
        ---help---
          Unmap pages from the kernel linear mapping after free_pages().
          This results in a large slowdown, but helps to find certain types
@@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS
 config PAGE_POISONING
        bool
        select WANT_PAGE_DEBUG_FLAGS
+
+config PAGE_GUARD
+       bool
+       select WANT_PAGE_DEBUG_FLAGS
index 1a77012ecdb3c56a92d53ef5b779c9648d54c6a3..3e6f152f117e878a97572d881eb8606f885e0673 100644 (file)
@@ -56,7 +56,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
 
 static unsigned long __init bootmap_bytes(unsigned long pages)
 {
-       unsigned long bytes = (pages + 7) / 8;
+       unsigned long bytes = DIV_ROUND_UP(pages, 8);
 
        return ALIGN(bytes, sizeof(long));
 }
@@ -197,7 +197,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
                idx = start - bdata->node_min_pfn;
                vec = ~map[idx / BITS_PER_LONG];
 
-               if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+               if (aligned && vec == ~0UL && start + BITS_PER_LONG <= end) {
                        int order = ilog2(BITS_PER_LONG);
 
                        __free_pages_bootmem(pfn_to_page(start), order);
index 899d95638586fcc65b44d631705845bef6245be3..8f8a5525dcde569fd2d4101fa9a0666328e58e4e 100644 (file)
@@ -365,8 +365,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                nr_isolated++;
 
                /* Avoid isolating too much */
-               if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+               if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+                       ++low_pfn;
                        break;
+               }
        }
 
        acct_isolated(zone, cc);
index 8d723c9e8b75b316041ea564f5e243b79a94e2b7..469491e0af79fed994d48fd4ada817f8e50c6d6a 100644 (file)
@@ -117,7 +117,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
                break;
        case POSIX_FADV_DONTNEED:
                if (!bdi_write_congested(mapping->backing_dev_info))
-                       filemap_flush(mapping);
+                       __filemap_fdatawrite_range(mapping, offset, endbyte,
+                                                  WB_SYNC_NONE);
 
                /* First and last FULL page! */
                start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
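The hunk above makes POSIX_FADV_DONTNEED start asynchronous writeback of only the given range before dropping it, instead of flushing the whole file. The user-space call it serves is unchanged:

#include <fcntl.h>

/* hint that a byte range is no longer needed; dirty pages get written back */
static int drop_range(int fd, off_t offset, off_t len)
{
	return posix_fadvise(fd, offset, len, POSIX_FADV_DONTNEED);
}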
index 0d2a0cbcfe3212fa732717a7988e75fa34584619..4642211c8493bfbebe983d5b35a94f4722d4ff67 100644 (file)
@@ -393,24 +393,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 {
        int error;
-       struct mem_cgroup *memcg = NULL;
 
        VM_BUG_ON(!PageLocked(old));
        VM_BUG_ON(!PageLocked(new));
        VM_BUG_ON(new->mapping);
 
-       /*
-        * This is not page migration, but prepare_migration and
-        * end_migration does enough work for charge replacement.
-        *
-        * In the longer term we probably want a specialized function
-        * for moving the charge from old to new in a more efficient
-        * manner.
-        */
-       error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
-       if (error)
-               return error;
-
        error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
        if (!error) {
                struct address_space *mapping = old->mapping;
@@ -432,13 +419,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
                spin_unlock_irq(&mapping->tree_lock);
+               /* mem_cgroup code must not be called under tree_lock */
+               mem_cgroup_replace_page_cache(old, new);
                radix_tree_preload_end();
                if (freepage)
                        freepage(old);
                page_cache_release(old);
-               mem_cgroup_end_migration(memcg, old, new, true);
-       } else {
-               mem_cgroup_end_migration(memcg, old, new, false);
        }
 
        return error;
@@ -2354,8 +2340,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index, unsigned flags)
 {
        int status;
+       gfp_t gfp_mask;
        struct page *page;
        gfp_t gfp_notmask = 0;
+
+       gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE;
        if (flags & AOP_FLAG_NOFS)
                gfp_notmask = __GFP_FS;
 repeat:
@@ -2363,7 +2352,7 @@ repeat:
        if (page)
                goto found;
 
-       page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
+       page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
        if (!page)
                return NULL;
        status = add_to_page_cache_lru(page, mapping, index,
index 36b3d988b4ef6ac8c263ee0732c1d08513afb04f..76cc3f7dd4f057cd44540edf3dfd5d479e8e31ca 100644 (file)
@@ -487,41 +487,68 @@ static struct attribute_group khugepaged_attr_group = {
        .attrs = khugepaged_attr,
        .name = "khugepaged",
 };
-#endif /* CONFIG_SYSFS */
 
-static int __init hugepage_init(void)
+static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 {
        int err;
-#ifdef CONFIG_SYSFS
-       static struct kobject *hugepage_kobj;
-#endif
 
-       err = -EINVAL;
-       if (!has_transparent_hugepage()) {
-               transparent_hugepage_flags = 0;
-               goto out;
-       }
-
-#ifdef CONFIG_SYSFS
-       err = -ENOMEM;
-       hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
-       if (unlikely(!hugepage_kobj)) {
+       *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+       if (unlikely(!*hugepage_kobj)) {
                printk(KERN_ERR "hugepage: failed kobject create\n");
-               goto out;
+               return -ENOMEM;
        }
 
-       err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+       err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
        if (err) {
                printk(KERN_ERR "hugepage: failed register hugeage group\n");
-               goto out;
+               goto delete_obj;
        }
 
-       err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+       err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
        if (err) {
                printk(KERN_ERR "hugepage: failed register hugeage group\n");
-               goto out;
+               goto remove_hp_group;
        }
-#endif
+
+       return 0;
+
+remove_hp_group:
+       sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
+delete_obj:
+       kobject_put(*hugepage_kobj);
+       return err;
+}
+
+static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+       sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
+       sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
+       kobject_put(hugepage_kobj);
+}
+#else
+static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
+{
+       return 0;
+}
+
+static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
+{
+}
+#endif /* CONFIG_SYSFS */
+
+static int __init hugepage_init(void)
+{
+       int err;
+       struct kobject *hugepage_kobj;
+
+       if (!has_transparent_hugepage()) {
+               transparent_hugepage_flags = 0;
+               return -EINVAL;
+       }
+
+       err = hugepage_init_sysfs(&hugepage_kobj);
+       if (err)
+               return err;
 
        err = khugepaged_slab_init();
        if (err)
@@ -545,7 +572,9 @@ static int __init hugepage_init(void)
 
        set_recommended_min_free_kbytes();
 
+       return 0;
 out:
+       hugepage_exit_sysfs(hugepage_kobj);
        return err;
 }
 module_init(hugepage_init)
@@ -997,7 +1026,7 @@ out:
 }
 
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
-                pmd_t *pmd)
+                pmd_t *pmd, unsigned long addr)
 {
        int ret = 0;
 
@@ -1013,6 +1042,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                        pgtable = get_pmd_huge_pte(tlb->mm);
                        page = pmd_page(*pmd);
                        pmd_clear(pmd);
+                       tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                        page_remove_rmap(page);
                        VM_BUG_ON(page_mapcount(page) < 0);
                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
@@ -1116,7 +1146,6 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        entry = pmd_modify(entry, newprot);
                        set_pmd_at(mm, addr, pmd, entry);
                        spin_unlock(&vma->vm_mm->page_table_lock);
-                       flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
                        ret = 1;
                }
        } else
@@ -1199,7 +1228,6 @@ static int __split_huge_page_splitting(struct page *page,
 static void __split_huge_page_refcount(struct page *page)
 {
        int i;
-       unsigned long head_index = page->index;
        struct zone *zone = page_zone(page);
        int zonestat;
        int tail_count = 0;
@@ -1207,8 +1235,10 @@ static void __split_huge_page_refcount(struct page *page)
        /* prevent PageLRU to go away from under us, and freeze lru stats */
        spin_lock_irq(&zone->lru_lock);
        compound_lock(page);
+       /* complete memcg works before add pages to LRU */
+       mem_cgroup_split_huge_fixup(page);
 
-       for (i = 1; i < HPAGE_PMD_NR; i++) {
+       for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
                struct page *page_tail = page + i;
 
                /* tail_page->_mapcount cannot change */
@@ -1271,14 +1301,13 @@ static void __split_huge_page_refcount(struct page *page)
                BUG_ON(page_tail->mapping);
                page_tail->mapping = page->mapping;
 
-               page_tail->index = ++head_index;
+               page_tail->index = page->index + i;
 
                BUG_ON(!PageAnon(page_tail));
                BUG_ON(!PageUptodate(page_tail));
                BUG_ON(!PageDirty(page_tail));
                BUG_ON(!PageSwapBacked(page_tail));
 
-               mem_cgroup_split_huge_fixup(page, page_tail);
 
                lru_add_page_tail(zone, page, page_tail);
        }
index 73f17c0293c0a0e57a62f65f11c969b9319532f5..6161f7cae0da9f30f8f2acdb685b2e05fa227d6f 100644 (file)
@@ -2349,6 +2349,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 
 /*
  * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * cannot race with other handlers or page migration.
+ * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, pte_t pte,
@@ -2408,7 +2411,14 @@ retry_avoidcopy:
                                BUG_ON(page_count(old_page) != 1);
                                BUG_ON(huge_pte_none(pte));
                                spin_lock(&mm->page_table_lock);
-                               goto retry_avoidcopy;
+                               ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+                               if (likely(pte_same(huge_ptep_get(ptep), pte)))
+                                       goto retry_avoidcopy;
+                               /*
+                                * race occurs while re-acquiring page_table_lock, and
+                                * our job is done.
+                                */
+                               return 0;
                        }
                        WARN_ON_ONCE(1);
                }
@@ -2630,6 +2640,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        static DEFINE_MUTEX(hugetlb_instantiation_mutex);
        struct hstate *h = hstate_vma(vma);
 
+       address &= huge_page_mask(h);
+
        ptep = huge_pte_offset(mm, address);
        if (ptep) {
                entry = huge_ptep_get(ptep);
index 0597a155b49829d63ce4c4c4b361166e86207fa9..6a417fe43d2a5e5e7fbd66f718af0f235b385d37 100644 (file)
@@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu {
        unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
+struct mem_cgroup_reclaim_iter {
+       /* css_id of the last scanned hierarchy member */
+       int position;
+       /* scan generation, increased every round-trip */
+       unsigned int generation;
+};
+
 /*
  * per-zone information in memory controller.
  */
 struct mem_cgroup_per_zone {
-       /*
-        * spin_lock to protect the per cgroup LRU
-        */
-       struct list_head        lists[NR_LRU_LISTS];
+       struct lruvec           lruvec;
        unsigned long           count[NR_LRU_LISTS];
 
+       struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
+
        struct zone_reclaim_stat reclaim_stat;
        struct rb_node          tree_node;      /* RB tree node */
        unsigned long long      usage_in_excess;/* Set to the value by which */
@@ -237,11 +243,6 @@ struct mem_cgroup {
         * per zone LRU lists.
         */
        struct mem_cgroup_lru_info info;
-       /*
-        * While reclaiming in a hierarchy, we cache the last child we
-        * reclaimed from.
-        */
-       int last_scanned_child;
        int last_scanned_node;
 #if MAX_NUMNODES > 1
        nodemask_t      scan_nodes;
@@ -380,8 +381,6 @@ enum mem_type {
 #define MEM_CGROUP_RECLAIM_NOSWAP      (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT  0x1
 #define MEM_CGROUP_RECLAIM_SHRINK      (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-#define MEM_CGROUP_RECLAIM_SOFT_BIT    0x2
-#define MEM_CGROUP_RECLAIM_SOFT                (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
@@ -661,16 +660,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
        this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
 }
 
-void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
-{
-       this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
-}
-
-void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
-{
-       this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
-}
-
 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
                                            enum mem_cgroup_events_index idx)
 {
@@ -754,37 +743,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
        return total;
 }
 
-static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
+static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+                                      enum mem_cgroup_events_target target)
 {
        unsigned long val, next;
 
        val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
        next = __this_cpu_read(memcg->stat->targets[target]);
        /* from time_after() in jiffies.h */
-       return ((long)next - (long)val < 0);
-}
-
-static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
-{
-       unsigned long val, next;
-
-       val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
-
-       switch (target) {
-       case MEM_CGROUP_TARGET_THRESH:
-               next = val + THRESHOLDS_EVENTS_TARGET;
-               break;
-       case MEM_CGROUP_TARGET_SOFTLIMIT:
-               next = val + SOFTLIMIT_EVENTS_TARGET;
-               break;
-       case MEM_CGROUP_TARGET_NUMAINFO:
-               next = val + NUMAINFO_EVENTS_TARGET;
-               break;
-       default:
-               return;
+       if ((long)next - (long)val < 0) {
+               switch (target) {
+               case MEM_CGROUP_TARGET_THRESH:
+                       next = val + THRESHOLDS_EVENTS_TARGET;
+                       break;
+               case MEM_CGROUP_TARGET_SOFTLIMIT:
+                       next = val + SOFTLIMIT_EVENTS_TARGET;
+                       break;
+               case MEM_CGROUP_TARGET_NUMAINFO:
+                       next = val + NUMAINFO_EVENTS_TARGET;
+                       break;
+               default:
+                       break;
+               }
+               __this_cpu_write(memcg->stat->targets[target], next);
+               return true;
        }
-
-       __this_cpu_write(memcg->stat->targets[target], next);
+       return false;
 }
 
 /*
@@ -795,25 +779,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
        preempt_disable();
        /* threshold event is triggered in finer grain than soft limit */
-       if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
+       if (unlikely(mem_cgroup_event_ratelimit(memcg,
+                                               MEM_CGROUP_TARGET_THRESH))) {
+               bool do_softlimit, do_numainfo;
+
+               do_softlimit = mem_cgroup_event_ratelimit(memcg,
+                                               MEM_CGROUP_TARGET_SOFTLIMIT);
+#if MAX_NUMNODES > 1
+               do_numainfo = mem_cgroup_event_ratelimit(memcg,
+                                               MEM_CGROUP_TARGET_NUMAINFO);
+#endif
+               preempt_enable();
+
                mem_cgroup_threshold(memcg);
-               __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH);
-               if (unlikely(__memcg_event_check(memcg,
-                            MEM_CGROUP_TARGET_SOFTLIMIT))) {
+               if (unlikely(do_softlimit))
                        mem_cgroup_update_tree(memcg, page);
-                       __mem_cgroup_target_update(memcg,
-                                                  MEM_CGROUP_TARGET_SOFTLIMIT);
-               }
 #if MAX_NUMNODES > 1
-               if (unlikely(__memcg_event_check(memcg,
-                       MEM_CGROUP_TARGET_NUMAINFO))) {
+               if (unlikely(do_numainfo))
                        atomic_inc(&memcg->numainfo_events);
-                       __mem_cgroup_target_update(memcg,
-                               MEM_CGROUP_TARGET_NUMAINFO);
-               }
 #endif
-       }
-       preempt_enable();
+       } else
+               preempt_enable();
 }
 
 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -858,83 +844,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
        return memcg;
 }
 
-/* The caller has to guarantee "mem" exists before calling this */
-static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)
+/**
+ * mem_cgroup_iter - iterate over memory cgroup hierarchy
+ * @root: hierarchy root
+ * @prev: previously returned memcg, NULL on first invocation
+ * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ *
+ * Returns references to children of the hierarchy below @root, or
+ * @root itself, or %NULL after a full round-trip.
+ *
+ * Caller must pass the return value in @prev on subsequent
+ * invocations for reference counting, or use mem_cgroup_iter_break()
+ * to cancel a hierarchy walk before the round-trip is complete.
+ *
+ * Reclaimers can specify a zone and a priority level in @reclaim to
+ * divide up the memcgs in the hierarchy among all concurrent
+ * reclaimers operating on the same zone and priority.
+ */
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+                                  struct mem_cgroup *prev,
+                                  struct mem_cgroup_reclaim_cookie *reclaim)
 {
-       struct cgroup_subsys_state *css;
-       int found;
+       struct mem_cgroup *memcg = NULL;
+       int id = 0;
 
-       if (!memcg) /* ROOT cgroup has the smallest ID */
-               return root_mem_cgroup; /*css_put/get against root is ignored*/
-       if (!memcg->use_hierarchy) {
-               if (css_tryget(&memcg->css))
-                       return memcg;
+       if (mem_cgroup_disabled())
                return NULL;
-       }
-       rcu_read_lock();
-       /*
-        * searching a memory cgroup which has the smallest ID under given
-        * ROOT cgroup. (ID >= 1)
-        */
-       css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
-       if (css && css_tryget(css))
-               memcg = container_of(css, struct mem_cgroup, css);
-       else
-               memcg = NULL;
-       rcu_read_unlock();
-       return memcg;
-}
 
-static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
-                                       struct mem_cgroup *root,
-                                       bool cond)
-{
-       int nextid = css_id(&iter->css) + 1;
-       int found;
-       int hierarchy_used;
-       struct cgroup_subsys_state *css;
+       if (!root)
+               root = root_mem_cgroup;
 
-       hierarchy_used = iter->use_hierarchy;
+       if (prev && !reclaim)
+               id = css_id(&prev->css);
 
-       css_put(&iter->css);
-       /* If no ROOT, walk all, ignore hierarchy */
-       if (!cond || (root && !hierarchy_used))
-               return NULL;
+       if (prev && prev != root)
+               css_put(&prev->css);
 
-       if (!root)
-               root = root_mem_cgroup;
+       if (!root->use_hierarchy && root != root_mem_cgroup) {
+               if (prev)
+                       return NULL;
+               return root;
+       }
 
-       do {
-               iter = NULL;
-               rcu_read_lock();
+       while (!memcg) {
+               struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+               struct cgroup_subsys_state *css;
+
+               if (reclaim) {
+                       int nid = zone_to_nid(reclaim->zone);
+                       int zid = zone_idx(reclaim->zone);
+                       struct mem_cgroup_per_zone *mz;
 
-               css = css_get_next(&mem_cgroup_subsys, nextid,
-                               &root->css, &found);
-               if (css && css_tryget(css))
-                       iter = container_of(css, struct mem_cgroup, css);
+                       mz = mem_cgroup_zoneinfo(root, nid, zid);
+                       iter = &mz->reclaim_iter[reclaim->priority];
+                       if (prev && reclaim->generation != iter->generation)
+                               return NULL;
+                       id = iter->position;
+               }
+
+               rcu_read_lock();
+               css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
+               if (css) {
+                       if (css == &root->css || css_tryget(css))
+                               memcg = container_of(css,
+                                                    struct mem_cgroup, css);
+               } else
+                       id = 0;
                rcu_read_unlock();
-               /* If css is NULL, no more cgroups will be found */
-               nextid = found + 1;
-       } while (css && !iter);
 
-       return iter;
+               if (reclaim) {
+                       iter->position = id;
+                       if (!css)
+                               iter->generation++;
+                       else if (!prev && memcg)
+                               reclaim->generation = iter->generation;
+               }
+
+               if (prev && !css)
+                       return NULL;
+       }
+       return memcg;
 }
-/*
- * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
- * be careful that "break" loop is not allowed. We have reference count.
- * Instead of that modify "cond" to be false and "continue" to exit the loop.
- */
-#define for_each_mem_cgroup_tree_cond(iter, root, cond)        \
-       for (iter = mem_cgroup_start_loop(root);\
-            iter != NULL;\
-            iter = mem_cgroup_get_next(iter, root, cond))
 
-#define for_each_mem_cgroup_tree(iter, root) \
-       for_each_mem_cgroup_tree_cond(iter, root, true)
+/**
+ * mem_cgroup_iter_break - abort a hierarchy walk prematurely
+ * @root: hierarchy root
+ * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
+ */
+void mem_cgroup_iter_break(struct mem_cgroup *root,
+                          struct mem_cgroup *prev)
+{
+       if (!root)
+               root = root_mem_cgroup;
+       if (prev && prev != root)
+               css_put(&prev->css);
+}
 
-#define for_each_mem_cgroup_all(iter) \
-       for_each_mem_cgroup_tree_cond(iter, NULL, true)
+/*
+ * Iteration constructs for visiting all cgroups (under a tree).  If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root)           \
+       for (iter = mem_cgroup_iter(root, NULL, NULL);  \
+            iter != NULL;                              \
+            iter = mem_cgroup_iter(root, iter, NULL))
 
+#define for_each_mem_cgroup(iter)                      \
+       for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
+            iter != NULL;                              \
+            iter = mem_cgroup_iter(NULL, iter, NULL))
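
As a usage sketch (illustrative, not part of the patch): a walk that bails out early must hand the last-returned memcg back to mem_cgroup_iter_break() so the css reference taken by mem_cgroup_iter() is dropped. some_condition() below is a hypothetical predicate standing in for the caller's exit test; the oom_lock hunks later in this diff follow exactly this shape.

	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (some_condition(iter)) {
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}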
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
@@ -955,10 +974,10 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 
        switch (idx) {
        case PGMAJFAULT:
-               mem_cgroup_pgmajfault(memcg, 1);
+               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
                break;
        case PGFAULT:
-               mem_cgroup_pgfault(memcg, 1);
+               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
                break;
        default:
                BUG();
@@ -968,6 +987,27 @@ out:
 }
 EXPORT_SYMBOL(mem_cgroup_count_vm_event);
 
+/**
+ * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
+ * @zone: zone of the wanted lruvec
+ * @mem: memcg of the wanted lruvec
+ *
+ * Returns the lru list vector holding pages for the given @zone and
+ * @mem.  This can be the global zone lruvec, if the memory controller
+ * is disabled.
+ */
+struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
+                                     struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_per_zone *mz;
+
+       if (mem_cgroup_disabled())
+               return &zone->lruvec;
+
+       mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
+       return &mz->lruvec;
+}
+
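A caller-side sketch of the function above (illustrative only; zone, memcg and lru are assumed to come from the surrounding reclaim context). The same code path works whether or not the controller is enabled, because the function falls back to the zone's global lruvec:

	struct lruvec *lruvec;

	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
	/* operate on lruvec->lists[lru] regardless of memcg being enabled */
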
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
@@ -982,112 +1022,123 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
  * When moving account, the page is not on LRU. It's isolated.
  */
 
-void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
+/**
+ * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
+ * @zone: zone of the page
+ * @page: the page
+ * @lru: current lru
+ *
+ * This function accounts for @page being added to @lru, and returns
+ * the lruvec for the given @zone and the memcg @page is charged to.
+ *
+ * The callsite is then responsible for physically linking the page to
+ * the returned lruvec->lists[@lru].
+ */
+struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
+                                      enum lru_list lru)
 {
-       struct page_cgroup *pc;
        struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup *memcg;
+       struct page_cgroup *pc;
 
        if (mem_cgroup_disabled())
-               return;
+               return &zone->lruvec;
+
        pc = lookup_page_cgroup(page);
-       /* can happen while we handle swapcache. */
-       if (!TestClearPageCgroupAcctLRU(pc))
-               return;
-       VM_BUG_ON(!pc->mem_cgroup);
+       VM_BUG_ON(PageCgroupAcctLRU(pc));
        /*
-        * We don't check PCG_USED bit. It's cleared when the "page" is finally
-        * removed from global LRU.
+        * putback:                             charge:
+        * SetPageLRU                           SetPageCgroupUsed
+        * smp_mb                               smp_mb
+        * PageCgroupUsed && add to memcg LRU   PageLRU && add to memcg LRU
+        *
+        * Ensure that one of the two sides adds the page to the memcg
+        * LRU during a race.
         */
-       mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
-       /* huge page split is done under lru_lock. so, we have no races. */
-       MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
-       if (mem_cgroup_is_root(pc->mem_cgroup))
-               return;
-       VM_BUG_ON(list_empty(&pc->lru));
-       list_del_init(&pc->lru);
-}
-
-void mem_cgroup_del_lru(struct page *page)
-{
-       mem_cgroup_del_lru_list(page, page_lru(page));
+       smp_mb();
+       /*
+        * If the page is uncharged, it may be freed soon, but it
+        * could also be swap cache (readahead, swapoff) that needs to
+        * be reclaimable in the future.  root_mem_cgroup will babysit
+        * it for the time being.
+        */
+       if (PageCgroupUsed(pc)) {
+               /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+               smp_rmb();
+               memcg = pc->mem_cgroup;
+               SetPageCgroupAcctLRU(pc);
+       } else
+               memcg = root_mem_cgroup;
+       mz = page_cgroup_zoneinfo(memcg, page);
+       /* compound_order() is stabilized through lru_lock */
+       MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
+       return &mz->lruvec;
 }
 
-/*
- * Writeback is about to end against a page which has been marked for immediate
- * reclaim.  If it still appears to be reclaimable, move it to the tail of the
- * inactive list.
+/**
+ * mem_cgroup_lru_del_list - account for removing an lru page
+ * @page: the page
+ * @lru: target lru
+ *
+ * This function accounts for @page being removed from @lru.
+ *
+ * The callsite is then responsible for physically unlinking
+ * @page->lru.
  */
-void mem_cgroup_rotate_reclaimable_page(struct page *page)
+void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
 {
        struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup *memcg;
        struct page_cgroup *pc;
-       enum lru_list lru = page_lru(page);
 
        if (mem_cgroup_disabled())
                return;
 
        pc = lookup_page_cgroup(page);
-       /* unused or root page is not rotated. */
-       if (!PageCgroupUsed(pc))
-               return;
-       /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
-       smp_rmb();
-       if (mem_cgroup_is_root(pc->mem_cgroup))
-               return;
-       mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
-       list_move_tail(&pc->lru, &mz->lists[lru]);
+       /*
+        * root_mem_cgroup babysits uncharged LRU pages, but
+        * PageCgroupUsed is cleared when the page is about to get
+        * freed.  PageCgroupAcctLRU remembers whether the
+        * LRU-accounting happened against pc->mem_cgroup or
+        * root_mem_cgroup.
+        */
+       if (TestClearPageCgroupAcctLRU(pc)) {
+               VM_BUG_ON(!pc->mem_cgroup);
+               memcg = pc->mem_cgroup;
+       } else
+               memcg = root_mem_cgroup;
+       mz = page_cgroup_zoneinfo(memcg, page);
+       /* huge page split is done under lru_lock. so, we have no races. */
+       MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
 }
 
-void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
+void mem_cgroup_lru_del(struct page *page)
 {
-       struct mem_cgroup_per_zone *mz;
-       struct page_cgroup *pc;
-
-       if (mem_cgroup_disabled())
-               return;
-
-       pc = lookup_page_cgroup(page);
-       /* unused or root page is not rotated. */
-       if (!PageCgroupUsed(pc))
-               return;
-       /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
-       smp_rmb();
-       if (mem_cgroup_is_root(pc->mem_cgroup))
-               return;
-       mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
-       list_move(&pc->lru, &mz->lists[lru]);
+       mem_cgroup_lru_del_list(page, page_lru(page));
 }
 
-void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
+/**
+ * mem_cgroup_lru_move_lists - account for moving a page between lrus
+ * @zone: zone of the page
+ * @page: the page
+ * @from: current lru
+ * @to: target lru
+ *
+ * This function accounts for @page being moved between the lrus @from
+ * and @to, and returns the lruvec for the given @zone and the memcg
+ * @page is charged to.
+ *
+ * The callsite is then responsible for physically relinking
+ * @page->lru to the returned lruvec->lists[@to].
+ */
+struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
+                                        struct page *page,
+                                        enum lru_list from,
+                                        enum lru_list to)
 {
-       struct page_cgroup *pc;
-       struct mem_cgroup_per_zone *mz;
-
-       if (mem_cgroup_disabled())
-               return;
-       pc = lookup_page_cgroup(page);
-       VM_BUG_ON(PageCgroupAcctLRU(pc));
-       /*
-        * putback:                             charge:
-        * SetPageLRU                           SetPageCgroupUsed
-        * smp_mb                               smp_mb
-        * PageCgroupUsed && add to memcg LRU   PageLRU && add to memcg LRU
-        *
-        * Ensure that one of the two sides adds the page to the memcg
-        * LRU during a race.
-        */
-       smp_mb();
-       if (!PageCgroupUsed(pc))
-               return;
-       /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
-       smp_rmb();
-       mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
-       /* huge page split is done under lru_lock. so, we have no races. */
-       MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
-       SetPageCgroupAcctLRU(pc);
-       if (mem_cgroup_is_root(pc->mem_cgroup))
-               return;
-       list_add(&pc->lru, &mz->lists[lru]);
+       /* XXX: Optimize this, especially for @from == @to */
+       mem_cgroup_lru_del_list(page, from);
+       return mem_cgroup_lru_add_list(zone, page, to);
 }
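
Putting the three hooks together, the calling convention the kernel-doc describes looks roughly like this (a sketch, not part of the patch; zone, page and lru are assumed from the call site). The memcg side does the accounting and hands back the lruvec, and the caller does the physical list manipulation:

	struct lruvec *lruvec;

	/* add */
	lruvec = mem_cgroup_lru_add_list(zone, page, lru);
	list_add(&page->lru, &lruvec->lists[lru]);

	/* remove */
	mem_cgroup_lru_del_list(page, lru);
	list_del(&page->lru);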
 
 /*
@@ -1098,6 +1149,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
  */
 static void mem_cgroup_lru_del_before_commit(struct page *page)
 {
+       enum lru_list lru;
        unsigned long flags;
        struct zone *zone = page_zone(page);
        struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -1114,17 +1166,28 @@ static void mem_cgroup_lru_del_before_commit(struct page *page)
                return;
 
        spin_lock_irqsave(&zone->lru_lock, flags);
+       lru = page_lru(page);
        /*
-        * Forget old LRU when this page_cgroup is *not* used. This Used bit
-        * is guarded by lock_page() because the page is SwapCache.
+        * The uncharged page could still be registered to the LRU of
+        * the stale pc->mem_cgroup.
+        *
+        * As pc->mem_cgroup is about to get overwritten, the old LRU
+        * accounting needs to be taken care of.  Let root_mem_cgroup
+        * babysit the page until the new memcg is responsible for it.
+        *
+        * The PCG_USED bit is guarded by lock_page() as the page is
+        * swapcache/pagecache.
         */
-       if (!PageCgroupUsed(pc))
-               mem_cgroup_del_lru_list(page, page_lru(page));
+       if (PageLRU(page) && PageCgroupAcctLRU(pc) && !PageCgroupUsed(pc)) {
+               del_page_from_lru_list(zone, page, lru);
+               add_page_to_lru_list(zone, page, lru);
+       }
        spin_unlock_irqrestore(&zone->lru_lock, flags);
 }
 
 static void mem_cgroup_lru_add_after_commit(struct page *page)
 {
+       enum lru_list lru;
        unsigned long flags;
        struct zone *zone = page_zone(page);
        struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -1142,22 +1205,22 @@ static void mem_cgroup_lru_add_after_commit(struct page *page)
        if (likely(!PageLRU(page)))
                return;
        spin_lock_irqsave(&zone->lru_lock, flags);
-       /* link when the page is linked to LRU but page_cgroup isn't */
-       if (PageLRU(page) && !PageCgroupAcctLRU(pc))
-               mem_cgroup_add_lru_list(page, page_lru(page));
+       lru = page_lru(page);
+       /*
+        * If the page is not on the LRU, someone will soon put it
+        * there.  If it is, and also already accounted for on the
+        * memcg-side, it must be on the right lruvec as setting
+        * pc->mem_cgroup and PageCgroupUsed is properly ordered.
+        * Otherwise, root_mem_cgroup has been babysitting the page
+        * during the charge.  Move it to the new memcg now.
+        */
+       if (PageLRU(page) && !PageCgroupAcctLRU(pc)) {
+               del_page_from_lru_list(zone, page, lru);
+               add_page_to_lru_list(zone, page, lru);
+       }
        spin_unlock_irqrestore(&zone->lru_lock, flags);
 }
 
-
-void mem_cgroup_move_lists(struct page *page,
-                          enum lru_list from, enum lru_list to)
-{
-       if (mem_cgroup_disabled())
-               return;
-       mem_cgroup_del_lru_list(page, from);
-       mem_cgroup_add_lru_list(page, to);
-}
-
 /*
  * Checks whether given mem is same or in the root_mem_cgroup's
  * hierarchy subtree
@@ -1180,10 +1243,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
        struct task_struct *p;
 
        p = find_lock_task_mm(task);
-       if (!p)
-               return 0;
-       curr = try_get_mem_cgroup_from_mm(p->mm);
-       task_unlock(p);
+       if (p) {
+               curr = try_get_mem_cgroup_from_mm(p->mm);
+               task_unlock(p);
+       } else {
+               /*
+                * All threads may have already detached their mm's, but the oom
+                * killer still needs to detect if they have already been oom
+                * killed to prevent needlessly killing additional tasks.
+                */
+               task_lock(task);
+               curr = mem_cgroup_from_task(task);
+               if (curr)
+                       css_get(&curr->css);
+               task_unlock(task);
+       }
        if (!curr)
                return 0;
        /*
@@ -1263,68 +1337,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
        return &mz->reclaim_stat;
 }
 
-unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
-                                       struct list_head *dst,
-                                       unsigned long *scanned, int order,
-                                       isolate_mode_t mode,
-                                       struct zone *z,
-                                       struct mem_cgroup *mem_cont,
-                                       int active, int file)
-{
-       unsigned long nr_taken = 0;
-       struct page *page;
-       unsigned long scan;
-       LIST_HEAD(pc_list);
-       struct list_head *src;
-       struct page_cgroup *pc, *tmp;
-       int nid = zone_to_nid(z);
-       int zid = zone_idx(z);
-       struct mem_cgroup_per_zone *mz;
-       int lru = LRU_FILE * file + active;
-       int ret;
-
-       BUG_ON(!mem_cont);
-       mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
-       src = &mz->lists[lru];
-
-       scan = 0;
-       list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
-               if (scan >= nr_to_scan)
-                       break;
-
-               if (unlikely(!PageCgroupUsed(pc)))
-                       continue;
-
-               page = lookup_cgroup_page(pc);
-
-               if (unlikely(!PageLRU(page)))
-                       continue;
-
-               scan++;
-               ret = __isolate_lru_page(page, mode, file);
-               switch (ret) {
-               case 0:
-                       list_move(&page->lru, dst);
-                       mem_cgroup_del_lru(page);
-                       nr_taken += hpage_nr_pages(page);
-                       break;
-               case -EBUSY:
-                       /* we don't affect global LRU but rotate in our LRU */
-                       mem_cgroup_rotate_lru_list(page, page_lru(page));
-                       break;
-               default:
-                       break;
-               }
-       }
-
-       *scanned = scan;
-
-       trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
-                                     0, 0, 0, mode);
-
-       return nr_taken;
-}
-
 #define mem_cgroup_from_res_counter(counter, member)   \
        container_of(counter, struct mem_cgroup, member)
 
@@ -1541,41 +1553,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
        return min(limit, memsw);
 }
 
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
+static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
+                                       gfp_t gfp_mask,
+                                       unsigned long flags)
 {
-       struct mem_cgroup *ret = NULL;
-       struct cgroup_subsys_state *css;
-       int nextid, found;
-
-       if (!root_memcg->use_hierarchy) {
-               css_get(&root_memcg->css);
-               ret = root_memcg;
-       }
+       unsigned long total = 0;
+       bool noswap = false;
+       int loop;
 
-       while (!ret) {
-               rcu_read_lock();
-               nextid = root_memcg->last_scanned_child + 1;
-               css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
-                                  &found);
-               if (css && css_tryget(css))
-                       ret = container_of(css, struct mem_cgroup, css);
+       if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
+               noswap = true;
+       if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
+               noswap = true;
 
-               rcu_read_unlock();
-               /* Updates scanning parameter */
-               if (!css) {
-                       /* this means start scan from ID:1 */
-                       root_memcg->last_scanned_child = 0;
-               } else
-                       root_memcg->last_scanned_child = found;
+       for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
+               if (loop)
+                       drain_all_stock_async(memcg);
+               total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
+               /*
+                * Allow limit shrinkers, which are triggered directly
+                * by userspace, to catch signals and stop reclaim
+                * after minimal progress, regardless of the margin.
+                */
+               if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
+                       break;
+               if (mem_cgroup_margin(memcg))
+                       break;
+               /*
+                * If nothing was reclaimed after two attempts, there
+                * may be no reclaimable pages in this hierarchy.
+                */
+               if (loop && !total)
+                       break;
        }
-
-       return ret;
+       return total;
 }
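
For orientation, the two reclaim flags select the remaining behaviours of the old hierarchical reclaim. The calls below are sketches taken from later hunks in this same diff (the limit-resize paths), not additional code:

	/* mem_cgroup_resize_limit(): shrink usage below the new limit */
	mem_cgroup_reclaim(memcg, GFP_KERNEL, MEM_CGROUP_RECLAIM_SHRINK);

	/* mem_cgroup_resize_memsw_limit(): same, but without swapping */
	mem_cgroup_reclaim(memcg, GFP_KERNEL,
			   MEM_CGROUP_RECLAIM_NOSWAP | MEM_CGROUP_RECLAIM_SHRINK);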
 
 /**
@@ -1715,61 +1726,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 }
 #endif
 
-/*
- * Scan the hierarchy if needed to reclaim memory. We remember the last child
- * we reclaimed from, so that we don't end up penalizing one child extensively
- * based on its position in the children list.
- *
- * root_memcg is the original ancestor that we've been reclaim from.
- *
- * We give up and return to the caller when we visit root_memcg twice.
- * (other groups can be removed while we're walking....)
- *
- * If shrink==true, for avoiding to free too much, this returns immedieately.
- */
-static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
-                                               struct zone *zone,
-                                               gfp_t gfp_mask,
-                                               unsigned long reclaim_options,
-                                               unsigned long *total_scanned)
-{
-       struct mem_cgroup *victim;
-       int ret, total = 0;
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+                                  struct zone *zone,
+                                  gfp_t gfp_mask,
+                                  unsigned long *total_scanned)
+{
+       struct mem_cgroup *victim = NULL;
+       int total = 0;
        int loop = 0;
-       bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
-       bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
-       bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
        unsigned long excess;
        unsigned long nr_scanned;
+       struct mem_cgroup_reclaim_cookie reclaim = {
+               .zone = zone,
+               .priority = 0,
+       };
 
        excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
 
-       /* If memsw_is_minimum==1, swap-out is of-no-use. */
-       if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
-               noswap = true;
-
        while (1) {
-               victim = mem_cgroup_select_victim(root_memcg);
-               if (victim == root_memcg) {
+               victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+               if (!victim) {
                        loop++;
-                       /*
-                        * We are not draining per cpu cached charges during
-                        * soft limit reclaim  because global reclaim doesn't
-                        * care about charges. It tries to free some memory and
-                        * charges will not give any.
-                        */
-                       if (!check_soft && loop >= 1)
-                               drain_all_stock_async(root_memcg);
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
                                 * anything, it might because there are
                                 * no reclaimable pages under this hierarchy
                                 */
-                               if (!check_soft || !total) {
-                                       css_put(&victim->css);
+                               if (!total)
                                        break;
-                               }
                                /*
                                 * We want to do more targeted reclaim.
                                 * excess >> 2 is not to excessive so as to
@@ -1777,40 +1762,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
                                 * coming back to reclaim from this cgroup
                                 */
                                if (total >= (excess >> 2) ||
-                                       (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
-                                       css_put(&victim->css);
+                                       (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
                                        break;
-                               }
                        }
-               }
-               if (!mem_cgroup_reclaimable(victim, noswap)) {
-                       /* this cgroup's local usage == 0 */
-                       css_put(&victim->css);
                        continue;
                }
-               /* we use swappiness of local cgroup */
-               if (check_soft) {
-                       ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-                               noswap, zone, &nr_scanned);
-                       *total_scanned += nr_scanned;
-               } else
-                       ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
-                                               noswap);
-               css_put(&victim->css);
-               /*
-                * At shrinking usage, we can't check we should stop here or
-                * reclaim more. It's depends on callers. last_scanned_child
-                * will work enough for keeping fairness under tree.
-                */
-               if (shrink)
-                       return ret;
-               total += ret;
-               if (check_soft) {
-                       if (!res_counter_soft_limit_excess(&root_memcg->res))
-                               return total;
-               } else if (mem_cgroup_margin(root_memcg))
-                       return total;
+               if (!mem_cgroup_reclaimable(victim, false))
+                       continue;
+               total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+                                                    zone, &nr_scanned);
+               *total_scanned += nr_scanned;
+               if (!res_counter_soft_limit_excess(&root_memcg->res))
+                       break;
        }
+       mem_cgroup_iter_break(root_memcg, victim);
        return total;
 }
 
@@ -1822,16 +1787,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *iter, *failed = NULL;
-       bool cond = true;
 
-       for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
+       for_each_mem_cgroup_tree(iter, memcg) {
                if (iter->oom_lock) {
                        /*
                         * this subtree of our hierarchy is already locked
                         * so we cannot give a lock.
                         */
                        failed = iter;
-                       cond = false;
+                       mem_cgroup_iter_break(memcg, iter);
+                       break;
                } else
                        iter->oom_lock = true;
        }
@@ -1843,11 +1808,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
         * OK, we failed to lock the whole subtree so we have to clean up
         * what we set up to the failing subtree
         */
-       cond = true;
-       for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
+       for_each_mem_cgroup_tree(iter, memcg) {
                if (iter == failed) {
-                       cond = false;
-                       continue;
+                       mem_cgroup_iter_break(memcg, iter);
+                       break;
                }
                iter->oom_lock = false;
        }
@@ -2012,9 +1976,6 @@ void mem_cgroup_update_page_stat(struct page *page,
        bool need_unlock = false;
        unsigned long uninitialized_var(flags);
 
-       if (unlikely(!pc))
-               return;
-
        rcu_read_lock();
        memcg = pc->mem_cgroup;
        if (unlikely(!memcg || !PageCgroupUsed(pc)))
@@ -2243,7 +2204,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
        struct mem_cgroup *iter;
 
        if ((action == CPU_ONLINE)) {
-               for_each_mem_cgroup_all(iter)
+               for_each_mem_cgroup(iter)
                        synchronize_mem_cgroup_on_move(iter, cpu);
                return NOTIFY_OK;
        }
@@ -2251,7 +2212,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
        if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
 
-       for_each_mem_cgroup_all(iter)
+       for_each_mem_cgroup(iter)
                mem_cgroup_drain_pcp_counter(iter, cpu);
 
        stock = &per_cpu(memcg_stock, cpu);
@@ -2305,8 +2266,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        if (!(gfp_mask & __GFP_WAIT))
                return CHARGE_WOULDBLOCK;
 
-       ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-                                             gfp_mask, flags, NULL);
+       ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                return CHARGE_RETRY;
        /*
@@ -2527,7 +2487,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
                        memcg = NULL;
        } else if (PageSwapCache(page)) {
                ent.val = page_private(page);
-               id = lookup_swap_cgroup(ent);
+               id = lookup_swap_cgroup_id(ent);
                rcu_read_lock();
                memcg = mem_cgroup_lookup(id);
                if (memcg && !css_tryget(&memcg->css))
@@ -2593,39 +2553,39 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
                        (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
 /*
  * Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ * zone->lru_lock, 'splitting on pmd' and compound_lock.
+ * charge/uncharge will never happen here, and move_account() is done under
+ * compound_lock(), so we don't have to worry about races.
  */
-void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+void mem_cgroup_split_huge_fixup(struct page *head)
 {
        struct page_cgroup *head_pc = lookup_page_cgroup(head);
-       struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
-       unsigned long flags;
+       struct page_cgroup *pc;
+       int i;
 
        if (mem_cgroup_disabled())
                return;
-       /*
-        * We have no races with charge/uncharge but will have races with
-        * page state accounting.
-        */
-       move_lock_page_cgroup(head_pc, &flags);
+       for (i = 1; i < HPAGE_PMD_NR; i++) {
+               pc = head_pc + i;
+               pc->mem_cgroup = head_pc->mem_cgroup;
+               smp_wmb(); /* see __commit_charge() */
+               /*
+                * LRU flags cannot be copied because the tail pages are added
+                * to the LRU by the generic code, where our hooks will be called.
+                */
+               pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+       }
 
-       tail_pc->mem_cgroup = head_pc->mem_cgroup;
-       smp_wmb(); /* see __commit_charge() */
        if (PageCgroupAcctLRU(head_pc)) {
                enum lru_list lru;
                struct mem_cgroup_per_zone *mz;
-
                /*
-                * LRU flags cannot be copied because we need to add tail
-                *.page to LRU by generic call and our hook will be called.
                 * We hold lru_lock, then, reduce counter directly.
                 */
                lru = page_lru(head);
                mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
-               MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+               MEM_CGROUP_ZSTAT(mz, lru) -= HPAGE_PMD_NR - 1;
        }
-       tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
-       move_unlock_page_cgroup(head_pc, &flags);
 }
 #endif
 
@@ -2788,8 +2748,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
        }
 
        pc = lookup_page_cgroup(page);
-       BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
-
        ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
        if (ret || !memcg)
                return ret;
@@ -2803,19 +2761,11 @@ int mem_cgroup_newpage_charge(struct page *page,
 {
        if (mem_cgroup_disabled())
                return 0;
-       /*
-        * If already mapped, we don't have to account.
-        * If page cache, page->mapping has address_space.
-        * But page->mapping may have out-of-use anon_vma pointer,
-        * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
-        * is NULL.
-        */
-       if (page_mapped(page) || (page->mapping && !PageAnon(page)))
-               return 0;
-       if (unlikely(!mm))
-               mm = &init_mm;
+       VM_BUG_ON(page_mapped(page));
+       VM_BUG_ON(page->mapping && !PageAnon(page));
+       VM_BUG_ON(!mm);
        return mem_cgroup_charge_common(page, mm, gfp_mask,
-                               MEM_CGROUP_CHARGE_TYPE_MAPPED);
+                                       MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 static void
@@ -2887,12 +2837,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
  */
 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
                                 struct page *page,
-                                gfp_t mask, struct mem_cgroup **ptr)
+                                gfp_t mask, struct mem_cgroup **memcgp)
 {
        struct mem_cgroup *memcg;
        int ret;
 
-       *ptr = NULL;
+       *memcgp = NULL;
 
        if (mem_cgroup_disabled())
                return 0;
@@ -2910,27 +2860,27 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
        memcg = try_get_mem_cgroup_from_page(page);
        if (!memcg)
                goto charge_cur_mm;
-       *ptr = memcg;
-       ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
+       *memcgp = memcg;
+       ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
        css_put(&memcg->css);
        return ret;
 charge_cur_mm:
        if (unlikely(!mm))
                mm = &init_mm;
-       return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
+       return __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
 }
 
 static void
-__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
+__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
                                        enum charge_type ctype)
 {
        if (mem_cgroup_disabled())
                return;
-       if (!ptr)
+       if (!memcg)
                return;
-       cgroup_exclude_rmdir(&ptr->css);
+       cgroup_exclude_rmdir(&memcg->css);
 
-       __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
+       __mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
        /*
         * Now swap is on-memory. This means this page may be
         * counted both as mem and swap....double count.
@@ -2940,21 +2890,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
         */
        if (do_swap_account && PageSwapCache(page)) {
                swp_entry_t ent = {.val = page_private(page)};
+               struct mem_cgroup *swap_memcg;
                unsigned short id;
-               struct mem_cgroup *memcg;
 
                id = swap_cgroup_record(ent, 0);
                rcu_read_lock();
-               memcg = mem_cgroup_lookup(id);
-               if (memcg) {
+               swap_memcg = mem_cgroup_lookup(id);
+               if (swap_memcg) {
                        /*
                         * This recorded memcg can be obsolete one. So, avoid
                         * calling css_tryget
                         */
-                       if (!mem_cgroup_is_root(memcg))
-                               res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
-                       mem_cgroup_swap_statistics(memcg, false);
-                       mem_cgroup_put(memcg);
+                       if (!mem_cgroup_is_root(swap_memcg))
+                               res_counter_uncharge(&swap_memcg->memsw,
+                                                    PAGE_SIZE);
+                       mem_cgroup_swap_statistics(swap_memcg, false);
+                       mem_cgroup_put(swap_memcg);
                }
                rcu_read_unlock();
        }
@@ -2963,13 +2914,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
         * So, rmdir()->pre_destroy() can be called while we do this charge.
         * In that case, we need to call pre_destroy() again. check it here.
         */
-       cgroup_release_and_wakeup_rmdir(&ptr->css);
+       cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 
-void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
+void mem_cgroup_commit_charge_swapin(struct page *page,
+                                    struct mem_cgroup *memcg)
 {
-       __mem_cgroup_commit_charge_swapin(page, ptr,
-                                       MEM_CGROUP_CHARGE_TYPE_MAPPED);
+       __mem_cgroup_commit_charge_swapin(page, memcg,
+                                         MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
@@ -3059,7 +3011,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
         * Check if our page_cgroup is valid
         */
        pc = lookup_page_cgroup(page);
-       if (unlikely(!pc || !PageCgroupUsed(pc)))
+       if (unlikely(!PageCgroupUsed(pc)))
                return NULL;
 
        lock_page_cgroup(pc);
@@ -3122,8 +3074,7 @@ void mem_cgroup_uncharge_page(struct page *page)
        /* early check. */
        if (page_mapped(page))
                return;
-       if (page->mapping && !PageAnon(page))
-               return;
+       VM_BUG_ON(page->mapping && !PageAnon(page));
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
@@ -3298,14 +3249,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
  * page belongs to.
  */
 int mem_cgroup_prepare_migration(struct page *page,
-       struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
+       struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
 {
        struct mem_cgroup *memcg = NULL;
        struct page_cgroup *pc;
        enum charge_type ctype;
        int ret = 0;
 
-       *ptr = NULL;
+       *memcgp = NULL;
 
        VM_BUG_ON(PageTransHuge(page));
        if (mem_cgroup_disabled())
@@ -3356,10 +3307,10 @@ int mem_cgroup_prepare_migration(struct page *page,
        if (!memcg)
                return 0;
 
-       *ptr = memcg;
-       ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
+       *memcgp = memcg;
+       ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
        css_put(&memcg->css);/* drop extra refcnt */
-       if (ret || *ptr == NULL) {
+       if (ret || *memcgp == NULL) {
                if (PageAnon(page)) {
                        lock_page_cgroup(pc);
                        ClearPageCgroupMigration(pc);
@@ -3437,12 +3388,60 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
        cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 
+/*
+ * When replacing a page cache page, newpage is not under any memcg but it
+ * may already be on the LRU. So this function doesn't touch the res_counter
+ * but does handle the LRU accounting correctly. Both pages are locked, so we
+ * cannot race with uncharge.
+ */
+void mem_cgroup_replace_page_cache(struct page *oldpage,
+                                 struct page *newpage)
+{
+       struct mem_cgroup *memcg;
+       struct page_cgroup *pc;
+       struct zone *zone;
+       enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+       unsigned long flags;
+
+       if (mem_cgroup_disabled())
+               return;
+
+       pc = lookup_page_cgroup(oldpage);
+       /* fix accounting on old pages */
+       lock_page_cgroup(pc);
+       memcg = pc->mem_cgroup;
+       mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
+       ClearPageCgroupUsed(pc);
+       unlock_page_cgroup(pc);
+
+       if (PageSwapBacked(oldpage))
+               type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+
+       zone = page_zone(newpage);
+       pc = lookup_page_cgroup(newpage);
+       /*
+        * Even if newpage->mapping was NULL before the replacement started,
+        * the newpage may already be on the LRU (or on a pagevec headed for
+        * the LRU).  We hold the LRU lock while overwriting pc->mem_cgroup.
+        */
+       spin_lock_irqsave(&zone->lru_lock, flags);
+       if (PageLRU(newpage))
+               del_page_from_lru_list(zone, newpage, page_lru(newpage));
+       __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
+       if (PageLRU(newpage))
+               add_page_to_lru_list(zone, newpage, page_lru(newpage));
+       spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+
 #ifdef CONFIG_DEBUG_VM
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
 {
        struct page_cgroup *pc;
 
        pc = lookup_page_cgroup(page);
+       /*
+        * Can be NULL while feeding pages into the page allocator for
+        * the first time, i.e. during boot or memory hotplug.
+        */
        if (likely(pc) && PageCgroupUsed(pc))
                return pc;
        return NULL;
@@ -3539,9 +3538,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
 
-               mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
-                                               MEM_CGROUP_RECLAIM_SHRINK,
-                                               NULL);
+               mem_cgroup_reclaim(memcg, GFP_KERNEL,
+                                  MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
                /* Usage is reduced ? */
                if (curusage >= oldusage)
@@ -3599,10 +3597,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
 
-               mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
-                                               MEM_CGROUP_RECLAIM_NOSWAP |
-                                               MEM_CGROUP_RECLAIM_SHRINK,
-                                               NULL);
+               mem_cgroup_reclaim(memcg, GFP_KERNEL,
+                                  MEM_CGROUP_RECLAIM_NOSWAP |
+                                  MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
                /* Usage is reduced ? */
                if (curusage >= oldusage)
@@ -3645,10 +3642,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                        break;
 
                nr_scanned = 0;
-               reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
-                                               gfp_mask,
-                                               MEM_CGROUP_RECLAIM_SOFT,
-                                               &nr_scanned);
+               reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
+                                                   gfp_mask, &nr_scanned);
                nr_reclaimed += reclaimed;
                *total_scanned += nr_scanned;
                spin_lock(&mctz->lock);
@@ -3716,22 +3711,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
                                int node, int zid, enum lru_list lru)
 {
-       struct zone *zone;
        struct mem_cgroup_per_zone *mz;
-       struct page_cgroup *pc, *busy;
        unsigned long flags, loop;
        struct list_head *list;
+       struct page *busy;
+       struct zone *zone;
        int ret = 0;
 
        zone = &NODE_DATA(node)->node_zones[zid];
        mz = mem_cgroup_zoneinfo(memcg, node, zid);
-       list = &mz->lists[lru];
+       list = &mz->lruvec.lists[lru];
 
        loop = MEM_CGROUP_ZSTAT(mz, lru);
        /* give some margin against EBUSY etc...*/
        loop += 256;
        busy = NULL;
        while (loop--) {
+               struct page_cgroup *pc;
                struct page *page;
 
                ret = 0;
@@ -3740,16 +3736,16 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
                        spin_unlock_irqrestore(&zone->lru_lock, flags);
                        break;
                }
-               pc = list_entry(list->prev, struct page_cgroup, lru);
-               if (busy == pc) {
-                       list_move(&pc->lru, list);
+               page = list_entry(list->prev, struct page, lru);
+               if (busy == page) {
+                       list_move(&page->lru, list);
                        busy = NULL;
                        spin_unlock_irqrestore(&zone->lru_lock, flags);
                        continue;
                }
                spin_unlock_irqrestore(&zone->lru_lock, flags);
 
-               page = lookup_cgroup_page(pc);
+               pc = lookup_page_cgroup(page);
 
                ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
                if (ret == -ENOMEM)
@@ -3757,7 +3753,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 
                if (ret == -EBUSY || ret == -EINVAL) {
                        /* found lock contention or "pc" is obsolete. */
-                       busy = pc;
+                       busy = page;
                        cond_resched();
                } else
                        busy = NULL;
@@ -4916,7 +4912,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
                for_each_lru(l)
-                       INIT_LIST_HEAD(&mz->lists[l]);
+                       INIT_LIST_HEAD(&mz->lruvec.lists[l]);
                mz->usage_in_excess = 0;
                mz->on_tree = false;
                mz->mem = memcg;
@@ -5041,7 +5037,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
                        tmp = -1;
                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
                if (!rtpn)
-                       return 1;
+                       goto err_cleanup;
 
                soft_limit_tree.rb_tree_per_node[node] = rtpn;
 
@@ -5052,6 +5048,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
                }
        }
        return 0;
+
+err_cleanup:
+       for_each_node_state(node, N_POSSIBLE) {
+               if (!soft_limit_tree.rb_tree_per_node[node])
+                       break;
+               kfree(soft_limit_tree.rb_tree_per_node[node]);
+               soft_limit_tree.rb_tree_per_node[node] = NULL;
+       }
+       return 1;
+
 }
 
 static struct cgroup_subsys_state * __ref
@@ -5074,9 +5080,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                int cpu;
                enable_swap_cgroup();
                parent = NULL;
-               root_mem_cgroup = memcg;
                if (mem_cgroup_soft_limit_tree_init())
                        goto free_out;
+               root_mem_cgroup = memcg;
                for_each_possible_cpu(cpu) {
                        struct memcg_stock_pcp *stock =
                                                &per_cpu(memcg_stock, cpu);
@@ -5105,7 +5111,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                res_counter_init(&memcg->memsw, NULL);
                res_counter_init(&memcg->kmem, NULL);
        }
-       memcg->last_scanned_child = 0;
        memcg->last_scanned_node = MAX_NUMNODES;
        INIT_LIST_HEAD(&memcg->oom_notify);
 
@@ -5117,7 +5122,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        return &memcg->css;
 free_out:
        __mem_cgroup_free(memcg);
-       root_mem_cgroup = NULL;
        return ERR_PTR(error);
 }
 
@@ -5349,7 +5353,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
        }
        /* There is a swap entry and a page doesn't exist or isn't charged */
        if (ent.val && !ret &&
-                       css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
+                       css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
                ret = MC_TARGET_SWAP;
                if (target)
                        target->ent = ent;
index 07ace8725c837c1b9233dc2f5bc26d8a5c9699ae..30f35e9ea34bdf2422f7d80a73f81fe8bf7eeafa 100644 (file)
@@ -293,7 +293,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
        struct mmu_gather_batch *batch;
 
-       tlb->need_flush = 1;
+       VM_BUG_ON(!tlb->need_flush);
 
        if (tlb_fast_mode(tlb)) {
                free_page_and_swap_cache(page);
@@ -1231,7 +1231,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                        if (next-addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
                                split_huge_page_pmd(vma->vm_mm, pmd);
-                       } else if (zap_huge_pmd(tlb, vma, pmd))
+                       } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                continue;
                        /* fall through */
                }
@@ -2861,6 +2861,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        struct mem_cgroup *ptr;
        int exclusive = 0;
        int ret = 0;
+       bool swap_token;
 
        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                goto out;
@@ -2909,7 +2910,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_release;
        }
 
+       swap_token = activate_swap_token(mm);
+
        locked = lock_page_or_retry(page, mm, flags);
+
+       deactivate_swap_token(mm, swap_token);
+
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
        if (!locked) {
                ret |= VM_FAULT_RETRY;
@@ -3156,6 +3162,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        struct vm_fault vmf;
        int ret;
        int page_mkwrite = 0;
+       bool swap_token;
 
        /*
         * If we do COW later, allocate page befor taking lock_page()
@@ -3177,6 +3184,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        } else
                cow_page = NULL;
 
+       swap_token = activate_swap_token(mm);
+
        vmf.virtual_address = (void __user *)(address & PAGE_MASK);
        vmf.pgoff = pgoff;
        vmf.flags = flags;
@@ -3245,6 +3254,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        }
 
+       deactivate_swap_token(mm, swap_token);
+
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
        /*
@@ -3316,9 +3327,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return ret;
 
 unwritable_page:
+       deactivate_swap_token(mm, swap_token);
        page_cache_release(page);
        return ret;
 uncharge_out:
+       deactivate_swap_token(mm, swap_token);
        /* fs's fault handler get error */
        if (cow_page) {
                mem_cgroup_uncharge_page(cow_page);
index 177aca424a069ac1ae1b44d48a8e6d992cd42a4d..594dc375d0f96923ca9654dea0cd20bafb0f37f2 100644 (file)
@@ -269,12 +269,12 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
        radix_tree_replace_slot(pslot, newpage);
 
-       page_unfreeze_refs(page, expected_count);
        /*
-        * Drop cache reference from old page.
+        * Drop cache reference from old page by unfreezing
+        * to one less reference.
         * We know this isn't the last reference.
         */
-       __put_page(page);
+       page_unfreeze_refs(page, expected_count - 1);
 
        /*
         * If moved to a different zone then also account
@@ -334,9 +334,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 
        radix_tree_replace_slot(pslot, newpage);
 
-       page_unfreeze_refs(page, expected_count);
-
-       __put_page(page);
+       page_unfreeze_refs(page, expected_count - 1);
 
        spin_unlock_irq(&mapping->tree_lock);
        return 0;
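Both mm/migrate.c hunks above replace the page_unfreeze_refs() plus __put_page() pair with a single page_unfreeze_refs(page, expected_count - 1). A small userspace model of the refcount arithmetic, using a C11 atomic as a stand-in for the page count (illustrative only):

#include <stdatomic.h>
#include <assert.h>
#include <stdio.h>

/* Old sequence: restore the frozen count, then drop the cache reference. */
static void old_way(atomic_int *refs, int expected_count)
{
	atomic_store(refs, expected_count);	/* page_unfreeze_refs()        */
	atomic_fetch_sub(refs, 1);		/* __put_page()                */
}

/* New sequence: unfreeze straight to one less reference. */
static void new_way(atomic_int *refs, int expected_count)
{
	atomic_store(refs, expected_count - 1);	/* page_unfreeze_refs(.., n-1) */
}

int main(void)
{
	atomic_int a = 0, b = 0;	/* both "frozen" at zero references */
	int expected_count = 3;		/* e.g. radix tree + old mapping refs */

	old_way(&a, expected_count);
	new_way(&b, expected_count);
	assert(atomic_load(&a) == atomic_load(&b));
	printf("both end at %d references\n", atomic_load(&a));
	return 0;
}

Either way the page ends up with expected_count - 1 references; the combined call simply never re-exposes the full count in between, matching the new comment in the hunk.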
index 83813fa99114dc7d88fbfc26c6dfc87ce24a80a2..e7f5e3ca2c0717838973ad767416bca7d6e9f6f2 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1626,39 +1626,19 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
+/*
+ * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+ * Note: pprev is set to NULL when the return value is NULL.
+ */
 struct vm_area_struct *
 find_vma_prev(struct mm_struct *mm, unsigned long addr,
                        struct vm_area_struct **pprev)
 {
-       struct vm_area_struct *vma = NULL, *prev = NULL;
-       struct rb_node *rb_node;
-       if (!mm)
-               goto out;
-
-       /* Guard against addr being lower than the first VMA */
-       vma = mm->mmap;
-
-       /* Go through the RB tree quickly. */
-       rb_node = mm->mm_rb.rb_node;
-
-       while (rb_node) {
-               struct vm_area_struct *vma_tmp;
-               vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
-
-               if (addr < vma_tmp->vm_end) {
-                       rb_node = rb_node->rb_left;
-               } else {
-                       prev = vma_tmp;
-                       if (!prev->vm_next || (addr < prev->vm_next->vm_end))
-                               break;
-                       rb_node = rb_node->rb_right;
-               }
-       }
+       struct vm_area_struct *vma;
 
-out:
-       *pprev = prev;
-       return prev ? prev->vm_next : vma;
+       vma = find_vma(mm, addr);
+       *pprev = vma ? vma->vm_prev : NULL;
+       return vma;
 }
 
 /*
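The rewritten find_vma_prev() above simply delegates to find_vma() and reads vma->vm_prev. A self-contained userspace model of that contract, using a toy sorted VMA list in place of the mm's rbtree (behaviour only, not the kernel code):

#include <stddef.h>
#include <stdio.h>

struct toy_vma {
	unsigned long vm_start, vm_end;		/* [vm_start, vm_end) */
	struct toy_vma *vm_prev, *vm_next;
};

/* find_vma() semantics: first vma with addr < vm_end, or NULL. */
static struct toy_vma *toy_find_vma(struct toy_vma *first, unsigned long addr)
{
	for (struct toy_vma *v = first; v; v = v->vm_next)
		if (addr < v->vm_end)
			return v;
	return NULL;
}

/* New-style find_vma_prev(): *pprev is NULL whenever the result is NULL. */
static struct toy_vma *toy_find_vma_prev(struct toy_vma *first,
					 unsigned long addr,
					 struct toy_vma **pprev)
{
	struct toy_vma *vma = toy_find_vma(first, addr);

	*pprev = vma ? vma->vm_prev : NULL;
	return vma;
}

int main(void)
{
	struct toy_vma a = { 0x1000, 0x2000, NULL, NULL };
	struct toy_vma b = { 0x4000, 0x5000, &a, NULL };
	struct toy_vma *prev;

	a.vm_next = &b;

	toy_find_vma_prev(&a, 0x4800, &prev);		/* hits b, prev == &a */
	printf("prev start: %#lx\n", prev->vm_start);
	if (!toy_find_vma_prev(&a, 0x9000, &prev))	/* past the last vma  */
		printf("prev is %s\n", prev ? "set" : "NULL");
	return 0;
}

The new comment above spells out the behavioural change: when the lookup fails, *pprev is now NULL rather than the last VMA.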
@@ -2349,13 +2329,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        struct vm_area_struct *new_vma, *prev;
        struct rb_node **rb_link, *rb_parent;
        struct mempolicy *pol;
+       bool faulted_in_anon_vma = true;
 
        /*
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
-       if (!vma->vm_file && !vma->anon_vma)
+       if (unlikely(!vma->vm_file && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
+               faulted_in_anon_vma = false;
+       }
 
        find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
@@ -2364,9 +2347,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                /*
                 * Source vma may have been merged into new_vma
                 */
-               if (vma_start >= new_vma->vm_start &&
-                   vma_start < new_vma->vm_end)
+               if (unlikely(vma_start >= new_vma->vm_start &&
+                            vma_start < new_vma->vm_end)) {
+                       /*
+                        * The only way we can get a vma_merge with
+                        * self during an mremap is if the vma hasn't
+                        * been faulted in yet and we were allowed to
+                        * reset the dst vma->vm_pgoff to the
+                        * destination address of the mremap to allow
+                        * the merge to happen. mremap must change the
+                        * vm_pgoff linearity between src and dst vmas
+                        * (in turn preventing a vma_merge) to be
+                        * safe. It is only safe to keep the vm_pgoff
+                        * linear if there are no pages mapped yet.
+                        */
+                       VM_BUG_ON(faulted_in_anon_vma);
                        *vmap = new_vma;
+               } else
+                       anon_vma_moveto_tail(new_vma);
        } else {
                new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                if (new_vma) {
index d6959cb4df58f1d694c179898553bf7e3150cc49..87bb8393e7d238115a450139d24c090347c215a7 100644 (file)
@@ -220,6 +220,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 
        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
        if (moved_len < old_len) {
+               /*
+                * Before moving the page tables from the new vma to
+                * the old vma, we need to be sure the old vma is
+                * queued after the new vma in the same_anon_vma list to
+                * prevent SMP races with rmap_walk (which could otherwise
+                * miss some page tables).
+                */
+               anon_vma_moveto_tail(vma);
+
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
index 3134ee2fb2e8a27e4bda50319d9b11e0da9467d5..2958fd8e7c9abcfcf6cc7b38e2dac34133905b22 100644 (file)
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/freezer.h>
+#include <linux/ftrace.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/oom.h>
 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
@@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val)
        spin_lock_irq(&sighand->siglock);
        if (current->signal->oom_score_adj == old_val)
                current->signal->oom_score_adj = new_val;
+       trace_oom_score_adj_update(current);
        spin_unlock_irq(&sighand->siglock);
 }
 
@@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val)
        spin_lock_irq(&sighand->siglock);
        old_val = current->signal->oom_score_adj;
        current->signal->oom_score_adj = new_val;
+       trace_oom_score_adj_update(current);
        spin_unlock_irq(&sighand->siglock);
 
        return old_val;
@@ -146,7 +152,7 @@ struct task_struct *find_lock_task_mm(struct task_struct *p)
 
 /* return true if the task is not adequate as candidate victim task. */
 static bool oom_unkillable_task(struct task_struct *p,
-               const struct mem_cgroup *mem, const nodemask_t *nodemask)
+               const struct mem_cgroup *memcg, const nodemask_t *nodemask)
 {
        if (is_global_init(p))
                return true;
@@ -154,7 +160,7 @@ static bool oom_unkillable_task(struct task_struct *p,
                return true;
 
        /* When mem_cgroup_out_of_memory() and p is not member of the group */
-       if (mem && !task_in_mem_cgroup(p, mem))
+       if (memcg && !task_in_mem_cgroup(p, memcg))
                return true;
 
        /* p may not have freeable memory in nodemask */
@@ -173,12 +179,12 @@ static bool oom_unkillable_task(struct task_struct *p,
  * predictable as possible.  The goal is to return the highest value for the
  * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
                      const nodemask_t *nodemask, unsigned long totalpages)
 {
-       int points;
+       long points;
 
-       if (oom_unkillable_task(p, mem, nodemask))
+       if (oom_unkillable_task(p, memcg, nodemask))
                return 0;
 
        p = find_lock_task_mm(p);
@@ -302,7 +308,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned int *ppoints,
-               unsigned long totalpages, struct mem_cgroup *mem,
+               unsigned long totalpages, struct mem_cgroup *memcg,
                const nodemask_t *nodemask)
 {
        struct task_struct *g, *p;
@@ -314,7 +320,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
 
                if (p->exit_state)
                        continue;
-               if (oom_unkillable_task(p, mem, nodemask))
+               if (oom_unkillable_task(p, memcg, nodemask))
                        continue;
 
                /*
@@ -358,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                        }
                }
 
-               points = oom_badness(p, mem, nodemask, totalpages);
+               points = oom_badness(p, memcg, nodemask, totalpages);
                if (points > *ppoints) {
                        chosen = p;
                        *ppoints = points;
@@ -381,14 +387,14 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
  *
  * Call with tasklist_lock read-locked.
  */
-static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
+static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
 {
        struct task_struct *p;
        struct task_struct *task;
 
        pr_info("[ pid ]   uid  tgid total_vm      rss cpu oom_adj oom_score_adj name\n");
        for_each_process(p) {
-               if (oom_unkillable_task(p, mem, nodemask))
+               if (oom_unkillable_task(p, memcg, nodemask))
                        continue;
 
                task = find_lock_task_mm(p);
@@ -411,7 +417,7 @@ static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
 }
 
 static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
-                       struct mem_cgroup *mem, const nodemask_t *nodemask)
+                       struct mem_cgroup *memcg, const nodemask_t *nodemask)
 {
        task_lock(current);
        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -421,14 +427,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
        cpuset_print_task_mems_allowed(current);
        task_unlock(current);
        dump_stack();
-       mem_cgroup_print_oom_info(mem, p);
+       mem_cgroup_print_oom_info(memcg, p);
        show_mem(SHOW_MEM_FILTER_NODES);
        if (sysctl_oom_dump_tasks)
-               dump_tasks(mem, nodemask);
+               dump_tasks(memcg, nodemask);
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
-static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
+static int oom_kill_task(struct task_struct *p)
 {
        struct task_struct *q;
        struct mm_struct *mm;
@@ -478,7 +484,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 
 static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                            unsigned int points, unsigned long totalpages,
-                           struct mem_cgroup *mem, nodemask_t *nodemask,
+                           struct mem_cgroup *memcg, nodemask_t *nodemask,
                            const char *message)
 {
        struct task_struct *victim = p;
@@ -487,7 +493,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
        unsigned int victim_points = 0;
 
        if (printk_ratelimit())
-               dump_header(p, gfp_mask, order, mem, nodemask);
+               dump_header(p, gfp_mask, order, memcg, nodemask);
 
        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
@@ -518,7 +524,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                        /*
                         * oom_badness() returns 0 if the thread is unkillable
                         */
-                       child_points = oom_badness(child, mem, nodemask,
+                       child_points = oom_badness(child, memcg, nodemask,
                                                                totalpages);
                        if (child_points > victim_points) {
                                victim = child;
@@ -527,7 +533,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                }
        } while_each_thread(p, t);
 
-       return oom_kill_task(victim, mem);
+       return oom_kill_task(victim);
 }
 
 /*
@@ -555,7 +561,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 }
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
-void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
+void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
 {
        unsigned long limit;
        unsigned int points = 0;
@@ -572,14 +578,14 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
        }
 
        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
-       limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
+       limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
        read_lock(&tasklist_lock);
 retry:
-       p = select_bad_process(&points, limit, mem, NULL);
+       p = select_bad_process(&points, limit, memcg, NULL);
        if (!p || PTR_ERR(p) == -1UL)
                goto out;
 
-       if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
+       if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
                                "Memory cgroup out of memory"))
                goto retry;
 out:
index 0ae2008eb54043970a1632a4d4d9f244a7a3ee7b..e57a831d8701296d23005d0b63eb252fd2f2f7af 100644 (file)
@@ -135,6 +135,191 @@ unsigned long global_dirty_limit;
  */
 static struct prop_descriptor vm_completions;
 
+/*
+ * Work out the current dirty-memory clamping and background writeout
+ * thresholds.
+ *
+ * The main aim here is to lower them aggressively if there is a lot of mapped
+ * memory around.  To avoid stressing page reclaim with lots of unreclaimable
+ * pages.  It is better to clamp down on writers than to start swapping, and
+ * performing lots of scanning.
+ *
+ * We only allow 1/2 of the currently-unmapped memory to be dirtied.
+ *
+ * We don't permit the clamping level to fall below 5% - that is getting rather
+ * excessive.
+ *
+ * We make sure that the background writeout level is below the adjusted
+ * clamping level.
+ */
+
+/*
+ * In a memory zone, there is a certain amount of pages we consider
+ * available for the page cache, which is essentially the number of
+ * free and reclaimable pages, minus some zone reserves to protect
+ * lowmem and the ability to uphold the zone's watermarks without
+ * requiring writeback.
+ *
+ * This number of dirtyable pages is the base value from which the
+ * user-configurable dirty ratio determines the effective number of
+ * pages that may actually be dirtied.  Per individual zone, or
+ * globally by using the sum of dirtyable pages over all zones.
+ *
+ * Because the user is allowed to specify the dirty limit globally as
+ * absolute number of bytes, calculating the per-zone dirty limit can
+ * require translating the configured limit into a percentage of
+ * global dirtyable memory first.
+ */
+
+static unsigned long highmem_dirtyable_memory(unsigned long total)
+{
+#ifdef CONFIG_HIGHMEM
+       int node;
+       unsigned long x = 0;
+
+       for_each_node_state(node, N_HIGH_MEMORY) {
+               struct zone *z =
+                       &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+
+               x += zone_page_state(z, NR_FREE_PAGES) +
+                    zone_reclaimable_pages(z) - z->dirty_balance_reserve;
+       }
+       /*
+        * Make sure that the number of highmem pages is never larger
+        * than the number of the total dirtyable memory. This can only
+        * occur in very strange VM situations but we want to make sure
+        * that this does not occur.
+        */
+       return min(x, total);
+#else
+       return 0;
+#endif
+}
+
+/**
+ * global_dirtyable_memory - number of globally dirtyable pages
+ *
+ * Returns the global number of pages potentially available for dirty
+ * page cache.  This is the base value for the global dirty limits.
+ */
+unsigned long global_dirtyable_memory(void)
+{
+       unsigned long x;
+
+       x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
+           dirty_balance_reserve;
+
+       if (!vm_highmem_is_dirtyable)
+               x -= highmem_dirtyable_memory(x);
+
+       return x + 1;   /* Ensure that we never return 0 */
+}
+
+/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
+ * - vm.dirty_ratio             or  vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+       unsigned long background;
+       unsigned long dirty;
+       unsigned long uninitialized_var(available_memory);
+       struct task_struct *tsk;
+
+       if (!vm_dirty_bytes || !dirty_background_bytes)
+               available_memory = global_dirtyable_memory();
+
+       if (vm_dirty_bytes)
+               dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+       else
+               dirty = (vm_dirty_ratio * available_memory) / 100;
+
+       if (dirty_background_bytes)
+               background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+       else
+               background = (dirty_background_ratio * available_memory) / 100;
+
+       if (background >= dirty)
+               background = dirty / 2;
+       tsk = current;
+       if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+               background += background / 4;
+               dirty += dirty / 4;
+       }
+       *pbackground = background;
+       *pdirty = dirty;
+       trace_global_dirty_state(background, dirty);
+}
+
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache.  This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+       /*
+        * The effective global number of dirtyable pages may exclude
+        * highmem as a big-picture measure to keep the ratio between
+        * dirty memory and lowmem reasonable.
+        *
+        * But this function is purely about the individual zone and a
+        * highmem zone can hold its share of dirty pages, so we don't
+        * care about vm_highmem_is_dirtyable here.
+        */
+       return zone_page_state(zone, NR_FREE_PAGES) +
+              zone_reclaimable_pages(zone) -
+              zone->dirty_balance_reserve;
+}
+
+/**
+ * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+ * @zone: the zone
+ *
+ * Returns the maximum number of dirty pages allowed in a zone, based
+ * on the zone's dirtyable memory.
+ */
+static unsigned long zone_dirty_limit(struct zone *zone)
+{
+       unsigned long zone_memory = zone_dirtyable_memory(zone);
+       struct task_struct *tsk = current;
+       unsigned long dirty;
+
+       if (vm_dirty_bytes)
+               dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
+                       zone_memory / global_dirtyable_memory();
+       else
+               dirty = vm_dirty_ratio * zone_memory / 100;
+
+       if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+               dirty += dirty / 4;
+
+       return dirty;
+}
+
+/**
+ * zone_dirty_ok - tells whether a zone is within its dirty limits
+ * @zone: the zone to check
+ *
+ * Returns %true when the dirty pages in @zone are within the zone's
+ * dirty limit, %false if the limit is exceeded.
+ */
+bool zone_dirty_ok(struct zone *zone)
+{
+       unsigned long limit = zone_dirty_limit(zone);
+
+       return zone_page_state(zone, NR_FILE_DIRTY) +
+              zone_page_state(zone, NR_UNSTABLE_NFS) +
+              zone_page_state(zone, NR_WRITEBACK) <= limit;
+}
+
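Pulling the arithmetic out of global_dirty_limits(), zone_dirty_limit() and zone_dirty_ok() above, here is a minimal userspace model of how the thresholds scale; the page counts are invented and only the ratio/bytes math from the hunk is reproduced:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Tunables mirroring vm.dirty_ratio / vm.dirty_background_ratio, in percent.
 * Setting the *_bytes values non-zero switches to absolute limits. */
static unsigned long vm_dirty_ratio = 20, dirty_background_ratio = 10;
static unsigned long vm_dirty_bytes = 0, dirty_background_bytes = 0;

/* global_dirty_limits() arithmetic for a given amount of dirtyable memory. */
static void model_global_dirty_limits(unsigned long available_memory,
				      bool less_throttle,
				      unsigned long *pbackground,
				      unsigned long *pdirty)
{
	unsigned long dirty, background;

	dirty = vm_dirty_bytes ? DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE)
			       : vm_dirty_ratio * available_memory / 100;
	background = dirty_background_bytes
			? DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE)
			: dirty_background_ratio * available_memory / 100;

	if (background >= dirty)	/* keep background below dirty */
		background = dirty / 2;
	if (less_throttle) {		/* PF_LESS_THROTTLE / rt_task boost */
		background += background / 4;
		dirty += dirty / 4;
	}
	*pbackground = background;
	*pdirty = dirty;
}

/* zone_dirty_limit()/zone_dirty_ok(): the zone gets its proportional share. */
static bool model_zone_dirty_ok(unsigned long zone_dirtyable,
				unsigned long global_dirtyable,
				unsigned long zone_dirty_pages)
{
	unsigned long limit;

	limit = vm_dirty_bytes
		? DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * zone_dirtyable /
		  global_dirtyable
		: vm_dirty_ratio * zone_dirtyable / 100;
	return zone_dirty_pages <= limit;
}

int main(void)
{
	unsigned long background, dirty;
	unsigned long global_pages = 1UL << 20;	/* pretend 4GB dirtyable */
	unsigned long zone_pages = 1UL << 18;	/* pretend 1GB zone      */

	model_global_dirty_limits(global_pages, false, &background, &dirty);
	printf("background=%lu dirty=%lu pages\n", background, dirty);
	printf("zone ok with 40000 dirty pages: %d\n",
	       model_zone_dirty_ok(zone_pages, global_pages, 40000));
	return 0;
}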
 /*
  * couple the period to the dirty_ratio:
  *
@@ -147,7 +332,7 @@ static int calc_period_shift(void)
        if (vm_dirty_bytes)
                dirty_total = vm_dirty_bytes / PAGE_SIZE;
        else
-               dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+               dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
                                100;
        return 2 + ilog2(dirty_total - 1);
 }
@@ -202,7 +387,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
        return ret;
 }
 
-
 int dirty_bytes_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
@@ -297,67 +481,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 }
 EXPORT_SYMBOL(bdi_set_max_ratio);
 
-/*
- * Work out the current dirty-memory clamping and background writeout
- * thresholds.
- *
- * The main aim here is to lower them aggressively if there is a lot of mapped
- * memory around.  To avoid stressing page reclaim with lots of unreclaimable
- * pages.  It is better to clamp down on writers than to start swapping, and
- * performing lots of scanning.
- *
- * We only allow 1/2 of the currently-unmapped memory to be dirtied.
- *
- * We don't permit the clamping level to fall below 5% - that is getting rather
- * excessive.
- *
- * We make sure that the background writeout level is below the adjusted
- * clamping level.
- */
-
-static unsigned long highmem_dirtyable_memory(unsigned long total)
-{
-#ifdef CONFIG_HIGHMEM
-       int node;
-       unsigned long x = 0;
-
-       for_each_node_state(node, N_HIGH_MEMORY) {
-               struct zone *z =
-                       &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
-
-               x += zone_page_state(z, NR_FREE_PAGES) +
-                    zone_reclaimable_pages(z);
-       }
-       /*
-        * Make sure that the number of highmem pages is never larger
-        * than the number of the total dirtyable memory. This can only
-        * occur in very strange VM situations but we want to make sure
-        * that this does not occur.
-        */
-       return min(x, total);
-#else
-       return 0;
-#endif
-}
-
-/**
- * determine_dirtyable_memory - amount of memory that may be used
- *
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
- */
-unsigned long determine_dirtyable_memory(void)
-{
-       unsigned long x;
-
-       x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
-
-       if (!vm_highmem_is_dirtyable)
-               x -= highmem_dirtyable_memory(x);
-
-       return x + 1;   /* Ensure that we never return 0 */
-}
-
 static unsigned long dirty_freerun_ceiling(unsigned long thresh,
                                           unsigned long bg_thresh)
 {
@@ -369,47 +492,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
        return max(thresh, global_dirty_limit);
 }
 
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
- *
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
- * - vm.dirty_ratio             or  vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
- */
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
-{
-       unsigned long background;
-       unsigned long dirty;
-       unsigned long uninitialized_var(available_memory);
-       struct task_struct *tsk;
-
-       if (!vm_dirty_bytes || !dirty_background_bytes)
-               available_memory = determine_dirtyable_memory();
-
-       if (vm_dirty_bytes)
-               dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
-       else
-               dirty = (vm_dirty_ratio * available_memory) / 100;
-
-       if (dirty_background_bytes)
-               background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
-       else
-               background = (dirty_background_ratio * available_memory) / 100;
-
-       if (background >= dirty)
-               background = dirty / 2;
-       tsk = current;
-       if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
-               background += background / 4;
-               dirty += dirty / 4;
-       }
-       *pbackground = background;
-       *pdirty = dirty;
-       trace_global_dirty_state(background, dirty);
-}
-
 /**
  * bdi_dirty_limit - @bdi's share of dirty throttling threshold
  * @bdi: the backing_dev_info to query
index 9d895e5e53f21bfb67b741ca087138cf5372ef83..7f28eb8ff607c6dd206784c9196cb1cafd78a076 100644 (file)
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory.  This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
        saved_gfp_mask = gfp_allowed_mask;
        gfp_allowed_mask &= ~GFP_IOFS;
 }
+
+bool pm_suspended_storage(void)
+{
+       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+               return false;
+       return true;
+}
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -175,8 +191,21 @@ static char * const zone_names[MAX_NR_ZONES] = {
         "Movable",
 };
 
+/*
+ * Try to keep at least this much lowmem free.  Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
 int min_free_kbytes = 1024;
 
+/*
+ * Extra memory for the system to try freeing between the min and
+ * low watermarks.  Useful for workloads that require low latency
+ * memory allocations in bursts larger than the normal gap between
+ * low and min.
+ */
+int extra_free_kbytes;
+
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
@@ -381,6 +410,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
                clear_highpage(page + i);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+       unsigned long res;
+
+       if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
+               printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+               return 0;
+       }
+       _debug_guardpage_minorder = res;
+       printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+       return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+       __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+       __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
 static inline void set_page_order(struct page *page, int order)
 {
        set_page_private(page, order);
@@ -438,6 +498,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
        if (page_zone_id(page) != page_zone_id(buddy))
                return 0;
 
+       if (page_is_guard(buddy) && page_order(buddy) == order) {
+               VM_BUG_ON(page_count(buddy) != 0);
+               return 1;
+       }
+
        if (PageBuddy(buddy) && page_order(buddy) == order) {
                VM_BUG_ON(page_count(buddy) != 0);
                return 1;
@@ -494,11 +559,19 @@ static inline void __free_one_page(struct page *page,
                buddy = page + (buddy_idx - page_idx);
                if (!page_is_buddy(page, buddy, order))
                        break;
-
-               /* Our buddy is free, merge with it and move up one order. */
-               list_del(&buddy->lru);
-               zone->free_area[order].nr_free--;
-               rmv_page_order(buddy);
+               /*
+                * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard page;
+                * merge with it and move up one order.
+                */
+               if (page_is_guard(buddy)) {
+                       clear_page_guard_flag(buddy);
+                       set_page_private(page, 0);
+                       __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+               } else {
+                       list_del(&buddy->lru);
+                       zone->free_area[order].nr_free--;
+                       rmv_page_order(buddy);
+               }
                combined_idx = buddy_idx & page_idx;
                page = page + (combined_idx - page_idx);
                page_idx = combined_idx;
@@ -632,7 +705,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
        int i;
        int bad = 0;
 
-       trace_mm_page_free_direct(page, order);
+       trace_mm_page_free(page, order);
        kmemcheck_free_shadow(page, order);
 
        if (PageAnon(page))
@@ -724,6 +797,23 @@ static inline void expand(struct zone *zone, struct page *page,
                high--;
                size >>= 1;
                VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+               if (high < debug_guardpage_minorder()) {
+                       /*
+                        * Mark as guard pages (or a guard page), so they can
+                        * be merged back into the allocator when the buddy is
+                        * freed.  The corresponding page table entries are not
+                        * touched; the pages stay not-present in the virtual
+                        * address space.
+                        */
+                       INIT_LIST_HEAD(&page[size].lru);
+                       set_page_guard_flag(&page[size]);
+                       set_page_private(&page[size], high);
+                       /* Guard pages are not available for any usage */
+                       __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+                       continue;
+               }
+#endif
                list_add(&page[size].lru, &area->free_list[migratetype]);
                area->nr_free++;
                set_page_order(&page[size], high);
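The expand() hunk above turns the tail halves of a split below debug_guardpage_minorder into guard pages instead of putting them on the free lists, and subtracts them from NR_FREE_PAGES. A toy userspace walk of one split, assuming an order-3 block split down to order-0 with debug_guardpage_minorder=2 (the flag and free-list bookkeeping is only printed, not reimplemented):

#include <stdio.h>

/* Model of expand(): split an order-'high' block down to order-'low'.
 * Each loop iteration peels off the upper half at the next order down. */
static void model_expand(int low, int high, int minorder)
{
	unsigned long size = 1UL << high;
	long guard_delta = 0;

	while (high > low) {
		high--;
		size >>= 1;
		if (high < minorder) {
			/* Guard half: flagged, kept off the free lists, and
			 * the zone's free page counter loses its pages. */
			guard_delta -= (long)size;
			printf("order %d: guard page block (%lu pages)\n",
			       high, size);
		} else {
			/* Normal half: goes onto free_list[order]. */
			printf("order %d: onto the free list (%lu pages)\n",
			       high, size);
		}
	}
	printf("NR_FREE_PAGES adjustment from guard pages: %ld\n", guard_delta);
}

int main(void)
{
	/* Assumed setup: debug_guardpage_minorder=2, an order-3 block is
	 * being split because the caller wants an order-0 page. */
	model_expand(0, 3, 2);
	return 0;
}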
@@ -1188,6 +1278,19 @@ out:
        local_irq_restore(flags);
 }
 
+/*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+       struct page *page, *next;
+
+       list_for_each_entry_safe(page, next, list, lru) {
+               trace_mm_page_free_batched(page, cold);
+               free_hot_cold_page(page, cold);
+       }
+}
+
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
@@ -1435,7 +1538,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
        long min = mark;
        int o;
 
-       free_pages -= (1 << order) + 1;
+       free_pages -= (1 << order) - 1;
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
@@ -1645,6 +1748,35 @@ zonelist_scan:
                if ((alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed_softwall(zone, gfp_mask))
                                continue;
+               /*
+                * When allocating a page cache page for writing, we
+                * want to get it from a zone that is within its dirty
+                * limit, such that no single zone holds more than its
+                * proportional share of globally allowed dirty pages.
+                * The dirty limits take into account the zone's
+                * lowmem reserves and high watermark so that kswapd
+                * should be able to balance it without having to
+                * write pages from its LRU list.
+                *
+                * This may look like it could increase pressure on
+                * lower zones by failing allocations in higher zones
+                * before they are full.  But the pages that do spill
+                * over are limited as the lower zones are protected
+                * by this very same mechanism.  It should not become
+                * a practical burden to them.
+                *
+                * XXX: For now, allow allocations to potentially
+                * exceed the per-zone dirty limit in the slowpath
+                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * which is important when on a NUMA setup the allowed
+                * zones are together not big enough to reach the
+                * global limit.  The proper fix for these situations
+                * will require awareness of zones in the
+                * dirty-throttling and the flusher threads.
+                */
+               if ((alloc_flags & ALLOC_WMARK_LOW) &&
+                   (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+                       goto this_zone_full;
 
                BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
                if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1734,7 +1866,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
 
-       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+           debug_guardpage_minorder() > 0)
                return;
 
        /*
@@ -1773,12 +1906,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+                               unsigned long did_some_progress,
                                unsigned long pages_reclaimed)
 {
        /* Do not loop if specifically requested */
        if (gfp_mask & __GFP_NORETRY)
                return 0;
 
+       /* Always retry if specifically requested */
+       if (gfp_mask & __GFP_NOFAIL)
+               return 1;
+
+       /*
+        * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+        * making forward progress without invoking OOM. Suspend also disables
+        * storage devices so kswapd will not help. Bail if we are suspending.
+        */
+       if (!did_some_progress && pm_suspended_storage())
+               return 0;
+
        /*
         * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
         * means __GFP_NOFAIL, but that may not be true in other
@@ -1797,13 +1943,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
        if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
                return 1;
 
-       /*
-        * Don't let big-order allocations loop unless the caller
-        * explicitly requests that.
-        */
-       if (gfp_mask & __GFP_NOFAIL)
-               return 1;
-
        return 0;
 }
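After the hunk above, should_alloc_retry() honours __GFP_NOFAIL before the costly-order heuristics and bails out when reclaim made no progress while storage is suspended. A compact userspace sketch of the resulting decision order; the flag bits are toy values, and the order <= PAGE_ALLOC_COSTLY_ORDER check is assumed from the surrounding context rather than shown in the hunk:

#include <stdbool.h>
#include <stdio.h>

/* Toy gfp flag bits, only for this sketch. */
#define T_GFP_NORETRY	0x1
#define T_GFP_NOFAIL	0x2
#define T_GFP_REPEAT	0x4

#define PAGE_ALLOC_COSTLY_ORDER 3

/* Mirrors the post-patch ordering of the checks in should_alloc_retry(). */
static bool model_should_retry(unsigned flags, unsigned order,
			       unsigned long did_some_progress,
			       unsigned long pages_reclaimed,
			       bool storage_suspended)
{
	if (flags & T_GFP_NORETRY)
		return false;			/* caller opted out             */
	if (flags & T_GFP_NOFAIL)
		return true;			/* must not fail: always retry  */
	if (!did_some_progress && storage_suspended)
		return false;			/* suspend: reclaim cannot help */
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return true;			/* cheap orders retry implicitly */
	if ((flags & T_GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return true;			/* keep going until enough reclaimed */
	return false;
}

int main(void)
{
	printf("NOFAIL while suspended:          %d\n",
	       model_should_retry(T_GFP_NOFAIL, 5, 0, 0, true));
	printf("order-0, no progress, suspended: %d\n",
	       model_should_retry(0, 0, 0, 0, true));
	printf("order-0, normal:                 %d\n",
	       model_should_retry(0, 0, 1, 0, false));
	return 0;
}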
 
@@ -2146,7 +2285,14 @@ rebalance:
                                        sync_migration);
        if (page)
                goto got_pg;
-       sync_migration = true;
+
+       /*
+        * Do not use sync migration if __GFP_NO_KSWAPD is used to indicate
+        * the system should not be heavily disrupted. In practice, this is
+        * to avoid THP callers being stalled in writeback during migration
+        * as it's preferable for the allocations to fail than to stall.
+        */
+       sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);
 
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2196,7 +2342,8 @@ rebalance:
 
        /* Check if we should retry the allocation */
        pages_reclaimed += did_some_progress;
-       if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+       if (should_alloc_retry(gfp_mask, order, did_some_progress,
+                                               pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
                wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto rebalance;
@@ -2306,16 +2453,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-       int i = pagevec_count(pvec);
-
-       while (--i >= 0) {
-               trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
-               free_hot_cold_page(pvec->pages[i], pvec->cold);
-       }
-}
-
 void __free_pages(struct page *page, unsigned int order)
 {
        if (put_page_testzero(page)) {
@@ -3385,25 +3522,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                if (page_to_nid(page) != zone_to_nid(zone))
                        continue;
 
-               /* Blocks with reserved pages will never free, skip them. */
-               block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-               if (pageblock_is_reserved(pfn, block_end_pfn))
-                       continue;
-
                block_migratetype = get_pageblock_migratetype(page);
 
-               /* If this block is reserved, account for it */
-               if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-                       reserve--;
-                       continue;
-               }
+               /* Only test what is necessary when the reserves are not met */
+               if (reserve > 0) {
+                       /*
+                        * Blocks with reserved pages will never free, skip
+                        * them.
+                        */
+                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+                       if (pageblock_is_reserved(pfn, block_end_pfn))
+                               continue;
 
-               /* Suitable for reserving if this block is movable */
-               if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-                       set_pageblock_migratetype(page, MIGRATE_RESERVE);
-                       move_freepages_block(zone, page, MIGRATE_RESERVE);
-                       reserve--;
-                       continue;
+                       /* If this block is reserved, account for it */
+                       if (block_migratetype == MIGRATE_RESERVE) {
+                               reserve--;
+                               continue;
+                       }
+
+                       /* Suitable for reserving if this block is movable */
+                       if (block_migratetype == MIGRATE_MOVABLE) {
+                               set_pageblock_migratetype(page,
+                                                       MIGRATE_RESERVE);
+                               move_freepages_block(zone, page,
+                                                       MIGRATE_RESERVE);
+                               reserve--;
+                               continue;
+                       }
                }
 
                /*
@@ -4172,7 +4317,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
                zone_pcp_init(zone);
                for_each_lru(l)
-                       INIT_LIST_HEAD(&zone->lru[l].list);
+                       INIT_LIST_HEAD(&zone->lruvec.lists[l]);
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
                zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4734,8 +4879,19 @@ static void calculate_totalreserve_pages(void)
                        if (max > zone->present_pages)
                                max = zone->present_pages;
                        reserve_pages += max;
+                       /*
+                        * Lowmem reserves are not available to
+                        * GFP_HIGHUSER page cache allocations and
+                        * kswapd tries to balance zones to their high
+                        * watermark.  As a result, neither should be
+                        * regarded as dirtyable memory, to prevent a
+                        * situation where reclaim has to clean pages
+                        * in order to balance the zones.
+                        */
+                       zone->dirty_balance_reserve = max;
                }
        }
+       dirty_balance_reserve = reserve_pages;
        totalreserve_pages = reserve_pages;
 }
 
@@ -4788,6 +4944,7 @@ static void setup_per_zone_lowmem_reserve(void)
 void setup_per_zone_wmarks(void)
 {
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+       unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long lowmem_pages = 0;
        struct zone *zone;
        unsigned long flags;
@@ -4799,11 +4956,14 @@ void setup_per_zone_wmarks(void)
        }
 
        for_each_zone(zone) {
-               u64 tmp;
+               u64 min, low;
 
                spin_lock_irqsave(&zone->lock, flags);
-               tmp = (u64)pages_min * zone->present_pages;
-               do_div(tmp, lowmem_pages);
+               min = (u64)pages_min * zone->present_pages;
+               do_div(min, lowmem_pages);
+               low = (u64)pages_low * zone->present_pages;
+               do_div(low, vm_total_pages);
+
                if (is_highmem(zone)) {
                        /*
                         * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -4827,11 +4987,13 @@ void setup_per_zone_wmarks(void)
                         * If it's a lowmem zone, reserve a number of pages
                         * proportionate to the zone's size.
                         */
-                       zone->watermark[WMARK_MIN] = tmp;
+                       zone->watermark[WMARK_MIN] = min;
                }
 
-               zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-               zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+               zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) +
+                                       low + (min >> 2);
+               zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+                                       low + (min >> 1);
                setup_zone_migrate_reserve(zone);
                spin_unlock_irqrestore(&zone->lock, flags);
        }
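The setup_per_zone_wmarks() hunk above splits the old tmp into a min share (scaled by the zone's portion of lowmem) and a low share from extra_free_kbytes (scaled by the zone's portion of total memory), then folds the latter into the low and high watermarks. A worked userspace example of the arithmetic for one lowmem zone, with made-up sizes and tunables and the highmem special case skipped:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* Made-up tunables and zone geometry. */
	unsigned long min_free_kbytes   = 1024;
	unsigned long extra_free_kbytes = 4096;
	unsigned long zone_present      = 250000;	/* pages in this zone */
	unsigned long lowmem_pages      = 1000000;	/* all lowmem pages   */
	unsigned long vm_total_pages    = 1048576;	/* all managed pages  */

	unsigned long pages_min = min_free_kbytes   >> (PAGE_SHIFT - 10);
	unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);

	/* Per-zone shares, as in the hunk: min scales with lowmem, the extra
	 * "low" gap scales with total memory. */
	unsigned long min = pages_min * zone_present / lowmem_pages;
	unsigned long low = pages_low * zone_present / vm_total_pages;

	unsigned long wmark_min  = min;
	unsigned long wmark_low  = min + low + (min >> 2);
	unsigned long wmark_high = min + low + (min >> 1);

	printf("min=%lu low=%lu high=%lu (pages)\n",
	       wmark_min, wmark_low, wmark_high);
	return 0;
}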
@@ -4927,11 +5089,11 @@ int __meminit init_per_zone_wmark_min(void)
 module_init(init_per_zone_wmark_min)
 
 /*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ * free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
  *     that we can call two helper functions whenever min_free_kbytes
- *     changes.
+ *     or extra_free_kbytes changes.
  */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+int free_kbytes_sysctl_handler(ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
        proc_dointvec(table, write, buffer, length, ppos);
index 2d123f94a8df49addd2ee3167d31861073da9f84..b99d19edf89b3f09a1975f604f754175026f5a7b 100644 (file)
 #include <linux/swapops.h>
 #include <linux/kmemleak.h>
 
-static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
-{
-       pc->flags = 0;
-       set_page_cgroup_array_id(pc, id);
-       pc->mem_cgroup = NULL;
-       INIT_LIST_HEAD(&pc->lru);
-}
 static unsigned long total_usage;
 
 #if !defined(CONFIG_SPARSEMEM)
@@ -35,35 +28,27 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
        struct page_cgroup *base;
 
        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
+#ifdef CONFIG_DEBUG_VM
+       /*
+        * The sanity checks the page allocator does upon freeing a
+        * page can reach here before the page_cgroup arrays are
+        * allocated when feeding a range of pages to the allocator
+        * for the first time during bootup or memory hotplug.
+        */
        if (unlikely(!base))
                return NULL;
-
+#endif
        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
        return base + offset;
 }
 
-struct page *lookup_cgroup_page(struct page_cgroup *pc)
-{
-       unsigned long pfn;
-       struct page *page;
-       pg_data_t *pgdat;
-
-       pgdat = NODE_DATA(page_cgroup_array_id(pc));
-       pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
-       page = pfn_to_page(pfn);
-       VM_BUG_ON(pc != lookup_page_cgroup(page));
-       return page;
-}
-
 static int __init alloc_node_page_cgroup(int nid)
 {
-       struct page_cgroup *base, *pc;
+       struct page_cgroup *base;
        unsigned long table_size;
-       unsigned long start_pfn, nr_pages, index;
+       unsigned long nr_pages;
 
-       start_pfn = NODE_DATA(nid)->node_start_pfn;
        nr_pages = NODE_DATA(nid)->node_spanned_pages;
-
        if (!nr_pages)
                return 0;
 
@@ -73,10 +58,6 @@ static int __init alloc_node_page_cgroup(int nid)
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!base)
                return -ENOMEM;
-       for (index = 0; index < nr_pages; index++) {
-               pc = base + index;
-               init_page_cgroup(pc, nid);
-       }
        NODE_DATA(nid)->node_page_cgroup = base;
        total_usage += table_size;
        return 0;
@@ -111,29 +92,23 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
 {
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);
-
+#ifdef CONFIG_DEBUG_VM
+       /*
+        * The sanity checks the page allocator does upon freeing a
+        * page can reach here before the page_cgroup arrays are
+        * allocated when feeding a range of pages to the allocator
+        * for the first time during bootup or memory hotplug.
+        */
        if (!section->page_cgroup)
                return NULL;
+#endif
        return section->page_cgroup + pfn;
 }
 
-struct page *lookup_cgroup_page(struct page_cgroup *pc)
-{
-       struct mem_section *section;
-       struct page *page;
-       unsigned long nr;
-
-       nr = page_cgroup_array_id(pc);
-       section = __nr_to_section(nr);
-       page = pfn_to_page(pc - section->page_cgroup);
-       VM_BUG_ON(pc != lookup_page_cgroup(page));
-       return page;
-}
-
 static void *__meminit alloc_page_cgroup(size_t size, int nid)
 {
+       gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *addr = NULL;
-       gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
 
        addr = alloc_pages_exact_nid(nid, size, flags);
        if (addr) {
@@ -142,9 +117,9 @@ static void *__meminit alloc_page_cgroup(size_t size, int nid)
        }
 
        if (node_state(nid, N_HIGH_MEMORY))
-               addr = vmalloc_node(size, nid);
+               addr = vzalloc_node(size, nid);
        else
-               addr = vmalloc(size);
+               addr = vzalloc(size);
 
        return addr;
 }
@@ -167,14 +142,11 @@ static void free_page_cgroup(void *addr)
 
 static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 {
-       struct page_cgroup *base, *pc;
        struct mem_section *section;
+       struct page_cgroup *base;
        unsigned long table_size;
-       unsigned long nr;
-       int index;
 
-       nr = pfn_to_section_nr(pfn);
-       section = __nr_to_section(nr);
+       section = __pfn_to_section(pfn);
 
        if (section->page_cgroup)
                return 0;
@@ -194,10 +166,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
                return -ENOMEM;
        }
 
-       for (index = 0; index < PAGES_PER_SECTION; index++) {
-               pc = base + index;
-               init_page_cgroup(pc, nr);
-       }
        /*
         * The passed "pfn" may not be aligned to SECTION.  For the calculation
         * we need to apply a mask.
@@ -366,7 +334,6 @@ struct swap_cgroup {
        unsigned short          id;
 };
 #define SC_PER_PAGE    (PAGE_SIZE/sizeof(struct swap_cgroup))
-#define SC_POS_MASK    (SC_PER_PAGE - 1)
 
 /*
  * SwapCgroup implements "lookup" and "exchange" operations.
@@ -408,6 +375,21 @@ not_enough_page:
        return -ENOMEM;
 }
 
+static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
+                                       struct swap_cgroup_ctrl **ctrlp)
+{
+       pgoff_t offset = swp_offset(ent);
+       struct swap_cgroup_ctrl *ctrl;
+       struct page *mappage;
+
+       ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+       if (ctrlp)
+               *ctrlp = ctrl;
+
+       mappage = ctrl->map[offset / SC_PER_PAGE];
+       return page_address(mappage) + offset % SC_PER_PAGE;
+}
+
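The new lookup_swap_cgroup() helper above centralises the map[offset / SC_PER_PAGE] page pick and offset % SC_PER_PAGE slot pick that the call sites below used to open-code. A self-contained userspace model of that two-level lookup, with heap-allocated arrays standing in for the ctrl->map pages:

#include <stdio.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE	4096
#define SC_PER_PAGE	(TOY_PAGE_SIZE / sizeof(unsigned short))

struct toy_ctrl {
	unsigned short **map;	/* one entry array per backing "page" */
	size_t npages;
};

/* Same shape as lookup_swap_cgroup(): pick the page, then the slot. */
static unsigned short *toy_lookup(struct toy_ctrl *ctrl, unsigned long offset)
{
	return &ctrl->map[offset / SC_PER_PAGE][offset % SC_PER_PAGE];
}

int main(void)
{
	struct toy_ctrl ctrl = { .npages = 4 };
	unsigned long offset = SC_PER_PAGE + 7;	/* lands on page 1, slot 7 */

	ctrl.map = calloc(ctrl.npages, sizeof(*ctrl.map));
	for (size_t i = 0; i < ctrl.npages; i++)
		ctrl.map[i] = calloc(SC_PER_PAGE, sizeof(unsigned short));

	*toy_lookup(&ctrl, offset) = 42;	/* record a cgroup id */
	printf("id at offset %lu: %u\n", offset,
	       (unsigned)*toy_lookup(&ctrl, offset));

	for (size_t i = 0; i < ctrl.npages; i++)
		free(ctrl.map[i]);
	free(ctrl.map);
	return 0;
}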
 /**
  * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
  * @end: swap entry to be cmpxchged
@@ -420,21 +402,13 @@ not_enough_page:
 unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                                        unsigned short old, unsigned short new)
 {
-       int type = swp_type(ent);
-       unsigned long offset = swp_offset(ent);
-       unsigned long idx = offset / SC_PER_PAGE;
-       unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
-       struct page *mappage;
        struct swap_cgroup *sc;
        unsigned long flags;
        unsigned short retval;
 
-       ctrl = &swap_cgroup_ctrl[type];
+       sc = lookup_swap_cgroup(ent, &ctrl);
 
-       mappage = ctrl->map[idx];
-       sc = page_address(mappage);
-       sc += pos;
        spin_lock_irqsave(&ctrl->lock, flags);
        retval = sc->id;
        if (retval == old)
@@ -455,21 +429,13 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
  */
 unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 {
-       int type = swp_type(ent);
-       unsigned long offset = swp_offset(ent);
-       unsigned long idx = offset / SC_PER_PAGE;
-       unsigned long pos = offset & SC_POS_MASK;
        struct swap_cgroup_ctrl *ctrl;
-       struct page *mappage;
        struct swap_cgroup *sc;
        unsigned short old;
        unsigned long flags;
 
-       ctrl = &swap_cgroup_ctrl[type];
+       sc = lookup_swap_cgroup(ent, &ctrl);
 
-       mappage = ctrl->map[idx];
-       sc = page_address(mappage);
-       sc += pos;
        spin_lock_irqsave(&ctrl->lock, flags);
        old = sc->id;
        sc->id = id;
@@ -479,28 +445,14 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
 }
 
 /**
- * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
+ * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
  * @ent: swap entry to be looked up.
  *
  * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
  */
-unsigned short lookup_swap_cgroup(swp_entry_t ent)
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
 {
-       int type = swp_type(ent);
-       unsigned long offset = swp_offset(ent);
-       unsigned long idx = offset / SC_PER_PAGE;
-       unsigned long pos = offset & SC_POS_MASK;
-       struct swap_cgroup_ctrl *ctrl;
-       struct page *mappage;
-       struct swap_cgroup *sc;
-       unsigned short ret;
-
-       ctrl = &swap_cgroup_ctrl[type];
-       mappage = ctrl->map[idx];
-       sc = page_address(mappage);
-       sc += pos;
-       ret = sc->id;
-       return ret;
+       return lookup_swap_cgroup(ent, NULL)->id;
 }
 
 int swap_cgroup_swapon(int type, unsigned long max_pages)
index a4fd3680038be499a47cf747d1ba657646e5d36c..aa547d488cd21caf3838268f3867a8841027de95 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -271,6 +271,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
        return -ENOMEM;
 }
 
+/*
+ * Some rmap walks need to find all ptes/hugepmds without false
+ * negatives (like migrate and split_huge_page) while running
+ * concurrently with operations that copy or move pagetables (like
+ * mremap() and fork()). To be safe, they depend on the "same_anon_vma"
+ * list to be in a certain order: the dst_vma must be placed after the
+ * src_vma in the list. This is always guaranteed by fork() but
+ * mremap() needs to call this function to enforce it in case the
+ * dst_vma isn't newly allocated and chained with the anon_vma_clone()
+ * function but just an extension of a pre-existing vma through
+ * vma_merge.
+ *
+ * NOTE: the same_anon_vma list can still be changed by other
+ * processes while mremap runs because mremap doesn't hold the
+ * anon_vma mutex to prevent modifications to the list while it
+ * runs. All we need to enforce is that the relative order of this
+ * process vmas isn't changing (we don't care about other vmas
+ * order). Each vma corresponds to an anon_vma_chain structure so
+ * there's no risk that other processes calling anon_vma_moveto_tail()
+ * and changing the same_anon_vma list under mremap() will screw with
+ * the relative order of this process vmas in the list, because we
+ * they can't alter the order of any vma that belongs to this
+ * process. And there can't be another anon_vma_moveto_tail() running
+ * concurrently with mremap() coming from this process because we hold
+ * the mmap_sem for the whole mremap(). fork() ordering dependency
+ * also shouldn't be affected because fork() only cares that the
+ * parent vmas are placed in the list before the child vmas and
+ * anon_vma_moveto_tail() won't reorder vmas from either the fork()
+ * parent or child.
+ */
+void anon_vma_moveto_tail(struct vm_area_struct *dst)
+{
+       struct anon_vma_chain *pavc;
+       struct anon_vma *root = NULL;
+
+       list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
+               struct anon_vma *anon_vma = pavc->anon_vma;
+               VM_BUG_ON(pavc->vma != dst);
+               root = lock_anon_vma_root(root, anon_vma);
+               list_del(&pavc->same_anon_vma);
+               list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
+       }
+       unlock_anon_vma_root(root);
+}
+
 /*
  * Attach vma to its own anon_vma, as well as to the anon_vmas that
  * the corresponding VMA in the parent process is attached to.
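anon_vma_moveto_tail() above moves every chain link of the given vma to the tail of its anon_vma list, so that an rmap walk visits the source vma's page tables before the destination's during mremap. A toy userspace list showing the effect on a single anon_vma; plain labels stand in for anon_vma_chain structures and all locking is omitted:

#include <stdio.h>
#include <string.h>

/* Toy stand-in for one anon_vma's list of anon_vma_chain links. */
struct toy_avc {
	const char *vma;	/* which vma this link belongs to */
	struct toy_avc *next;
};

/* Move the given vma's link to the tail, like anon_vma_moveto_tail(). */
static void toy_moveto_tail(struct toy_avc **head, const char *vma)
{
	struct toy_avc **pp = head, *victim = NULL;

	while (*pp) {
		if (!strcmp((*pp)->vma, vma)) {
			victim = *pp;
			*pp = victim->next;	/* unlink */
			continue;
		}
		pp = &(*pp)->next;
	}
	if (victim) {
		victim->next = NULL;
		*pp = victim;			/* append at the tail */
	}
}

static void dump(struct toy_avc *head)
{
	for (; head; head = head->next)
		printf("%s ", head->vma);
	printf("\n");
}

int main(void)
{
	/* After vma_merge() during mremap, "dst" may sit before "src". */
	struct toy_avc src = { "src", NULL };
	struct toy_avc dst = { "dst", &src };
	struct toy_avc *head = &dst;

	printf("before: "); dump(head);
	toy_moveto_tail(&head, "dst");		/* enforce src-before-dst */
	printf("after:  "); dump(head);
	return 0;
}

A forward walk of the list after the move reaches "src" first, which is exactly the ordering the move_vma() comment earlier relies on before moving page tables back on error.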
@@ -715,8 +760,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 
        /* Pretend the page is referenced if the task has the
           swap token and is in the middle of a page fault. */
-       if (mm != current->mm && has_swap_token(mm) &&
-                       rwsem_is_locked(&mm->mmap_sem))
+       if (mm != current->mm && has_active_swap_token(mm))
                referenced++;
 
        (*mapcount)--;
@@ -728,7 +772,7 @@ out:
 }
 
 static int page_referenced_anon(struct page *page,
-                               struct mem_cgroup *mem_cont,
+                               struct mem_cgroup *memcg,
                                unsigned long *vm_flags)
 {
        unsigned int mapcount;
@@ -751,7 +795,7 @@ static int page_referenced_anon(struct page *page,
                 * counting on behalf of references from different
                 * cgroups
                 */
-               if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
+               if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
                        continue;
                referenced += page_referenced_one(page, vma, address,
                                                  &mapcount, vm_flags);
@@ -766,7 +810,7 @@ static int page_referenced_anon(struct page *page,
 /**
  * page_referenced_file - referenced check for object-based rmap
  * @page: the page we're checking references on.
- * @mem_cont: target memory controller
+ * @memcg: target memory control group
  * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
  *
  * For an object-based mapped page, find all the places it is mapped and
@@ -777,7 +821,7 @@ static int page_referenced_anon(struct page *page,
  * This function is only called from page_referenced for object-based pages.
  */
 static int page_referenced_file(struct page *page,
-                               struct mem_cgroup *mem_cont,
+                               struct mem_cgroup *memcg,
                                unsigned long *vm_flags)
 {
        unsigned int mapcount;
@@ -819,7 +863,7 @@ static int page_referenced_file(struct page *page,
                 * counting on behalf of references from different
                 * cgroups
                 */
-               if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
+               if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
                        continue;
                referenced += page_referenced_one(page, vma, address,
                                                  &mapcount, vm_flags);
@@ -835,7 +879,7 @@ static int page_referenced_file(struct page *page,
  * page_referenced - test if the page was referenced
  * @page: the page to test
  * @is_locked: caller holds lock on the page
- * @mem_cont: target memory controller
+ * @memcg: target memory cgroup
  * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
  *
  * Quick test_and_clear_referenced for all mappings to a page,
@@ -843,7 +887,7 @@ static int page_referenced_file(struct page *page,
  */
 int page_referenced(struct page *page,
                    int is_locked,
-                   struct mem_cgroup *mem_cont,
+                   struct mem_cgroup *memcg,
                    unsigned long *vm_flags)
 {
        int referenced = 0;
@@ -859,13 +903,13 @@ int page_referenced(struct page *page,
                        }
                }
                if (unlikely(PageKsm(page)))
-                       referenced += page_referenced_ksm(page, mem_cont,
+                       referenced += page_referenced_ksm(page, memcg,
                                                                vm_flags);
                else if (PageAnon(page))
-                       referenced += page_referenced_anon(page, mem_cont,
+                       referenced += page_referenced_anon(page, memcg,
                                                                vm_flags);
                else if (page->mapping)
-                       referenced += page_referenced_file(page, mem_cont,
+                       referenced += page_referenced_file(page, memcg,
                                                                vm_flags);
                if (we_locked)
                        unlock_page(page);
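The page_referenced_one() hunk above replaces the open-coded swap-token test (has_swap_token() plus rwsem_is_locked() on mmap_sem) with a single has_active_swap_token() call whose definition is not part of this hunk. A minimal sketch of such a helper, assuming it simply wraps the two checks removed here, could look like:

static inline bool has_active_swap_token(struct mm_struct *mm)
{
        /* Same condition page_referenced_one() used to open-code: the mm
         * holds the swap token and is in the middle of a page fault, so
         * its mmap_sem is held. */
        return has_swap_token(mm) && rwsem_is_locked(&mm->mmap_sem);
}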
index 48a07287386f4187abe789a1b6ccaf3840611e18..7ce64de1ec69f8034e320bf3d190e82a667db7ab 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -366,7 +366,8 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
                const char *n)
 {
        VM_BUG_ON(!irqs_disabled());
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
        if (s->flags & __CMPXCHG_DOUBLE) {
                if (cmpxchg_double(&page->freelist,
                        freelist_old, counters_old,
@@ -400,7 +401,8 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
                void *freelist_new, unsigned long counters_new,
                const char *n)
 {
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
        if (s->flags & __CMPXCHG_DOUBLE) {
                if (cmpxchg_double(&page->freelist,
                        freelist_old, counters_old,
@@ -3014,7 +3016,8 @@ static int kmem_cache_open(struct kmem_cache *s,
                }
        }
 
-#ifdef CONFIG_CMPXCHG_DOUBLE
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
        if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
                /* Enable fast mode */
                s->flags |= __CMPXCHG_DOUBLE;
@@ -3671,6 +3674,9 @@ void __init kmem_cache_init(void)
        struct kmem_cache *temp_kmem_cache_node;
        unsigned long kmalloc_size;
 
+       if (debug_guardpage_minorder())
+               slub_max_order = 0;
+
        kmem_size = offsetof(struct kmem_cache, node) +
                                nr_node_ids * sizeof(struct kmem_cache_node *);
 
index a91caf754d9badb5f0b1f3c54b8e86be4723ff84..ddccf8e0b4ae597084742c661dd6ef60a8167eea 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -232,12 +232,14 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
 static void pagevec_move_tail_fn(struct page *page, void *arg)
 {
        int *pgmoved = arg;
-       struct zone *zone = page_zone(page);
 
        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                enum lru_list lru = page_lru_base_type(page);
-               list_move_tail(&page->lru, &zone->lru[lru].list);
-               mem_cgroup_rotate_reclaimable_page(page);
+               struct lruvec *lruvec;
+
+               lruvec = mem_cgroup_lru_move_lists(page_zone(page),
+                                                  page, lru, lru);
+               list_move_tail(&page->lru, &lruvec->lists[lru]);
                (*pgmoved)++;
        }
 }
@@ -476,12 +478,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
                 */
                SetPageReclaim(page);
        } else {
+               struct lruvec *lruvec;
                /*
                 * The page's writeback ended while it was on the pagevec,
                 * so move the page to the tail of the inactive list.
                 */
-               list_move_tail(&page->lru, &zone->lru[lru].list);
-               mem_cgroup_rotate_reclaimable_page(page);
+               lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
+               list_move_tail(&page->lru, &lruvec->lists[lru]);
                __count_vm_event(PGROTATED);
        }
 
@@ -585,11 +588,10 @@ int lru_add_drain_all(void)
 void release_pages(struct page **pages, int nr, int cold)
 {
        int i;
-       struct pagevec pages_to_free;
+       LIST_HEAD(pages_to_free);
        struct zone *zone = NULL;
        unsigned long uninitialized_var(flags);
 
-       pagevec_init(&pages_to_free, cold);
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];
 
@@ -620,19 +622,12 @@ void release_pages(struct page **pages, int nr, int cold)
                        del_page_from_lru(zone, page);
                }
 
-               if (!pagevec_add(&pages_to_free, page)) {
-                       if (zone) {
-                               spin_unlock_irqrestore(&zone->lru_lock, flags);
-                               zone = NULL;
-                       }
-                       __pagevec_free(&pages_to_free);
-                       pagevec_reinit(&pages_to_free);
-               }
+               list_add(&page->lru, &pages_to_free);
        }
        if (zone)
                spin_unlock_irqrestore(&zone->lru_lock, flags);
 
-       pagevec_free(&pages_to_free);
+       free_hot_cold_page_list(&pages_to_free, cold);
 }
 EXPORT_SYMBOL(release_pages);
 
@@ -662,7 +657,6 @@ void lru_add_page_tail(struct zone* zone,
        int active;
        enum lru_list lru;
        const int file = 0;
-       struct list_head *head;
 
        VM_BUG_ON(!PageHead(page));
        VM_BUG_ON(PageCompound(page_tail));
@@ -672,6 +666,8 @@ void lru_add_page_tail(struct zone* zone,
        SetPageLRU(page_tail);
 
        if (page_evictable(page_tail, NULL)) {
+               struct lruvec *lruvec;
+
                if (PageActive(page)) {
                        SetPageActive(page_tail);
                        active = 1;
@@ -681,11 +677,13 @@ void lru_add_page_tail(struct zone* zone,
                        lru = LRU_INACTIVE_ANON;
                }
                update_page_reclaim_stat(zone, page_tail, file, active);
+               lruvec = mem_cgroup_lru_add_list(zone, page_tail, lru);
                if (likely(PageLRU(page)))
-                       head = page->lru.prev;
+                       list_add(&page_tail->lru, page->lru.prev);
                else
-                       head = &zone->lru[lru].list;
-               __add_page_to_lru_list(zone, page_tail, lru, head);
+                       list_add(&page_tail->lru, lruvec->lists[lru].prev);
+               __mod_zone_page_state(zone, NR_LRU_BASE + lru,
+                                     hpage_nr_pages(page_tail));
        } else {
                SetPageUnevictable(page_tail);
                add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
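The swap.c hunks above all follow the same conversion: instead of manipulating zone->lru[lru].list directly and then calling a separate mem_cgroup_* notification, the code asks mem_cgroup_lru_move_lists() (or mem_cgroup_lru_add_list()) for the page's lruvec and operates on lruvec->lists[lru]. Condensed into a stand-alone helper with a hypothetical name, the rotation pattern used by pagevec_move_tail_fn() and lru_deactivate_fn() looks like:

/* Hypothetical helper showing the lruvec-based rotation used above. */
static void rotate_to_inactive_tail(struct zone *zone, struct page *page)
{
        enum lru_list lru = page_lru_base_type(page);
        struct lruvec *lruvec;

        /* Resolve the per-memcg (or per-zone) lruvec this page lives on. */
        lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
        /* Rotate the page within that lruvec's list. */
        list_move_tail(&page->lru, &lruvec->lists[lru]);
}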
index 230eac32fdd440a40993c0fe280a42ab399e3d50..5758d89d5f5eee71d459e7d6eb458d1e5af325e5 100644 (file)
@@ -670,10 +670,10 @@ int try_to_free_swap(struct page *page)
         * original page might be freed under memory pressure, then
         * later read back in from swap, now with the wrong data.
         *
-        * Hibernation clears bits from gfp_allowed_mask to prevent
-        * memory reclaim from writing to disk, so check that here.
+        * Hibernation suspends storage while it is writing the image
+        * to disk, so check that here.
         */
-       if (!(gfp_allowed_mask & __GFP_IO))
+       if (pm_suspended_storage())
                return 0;
 
        delete_from_swap_cache(page);
@@ -850,12 +850,13 @@ unsigned int count_swap_pages(int type, int free)
 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
-       struct mem_cgroup *ptr;
+       struct mem_cgroup *memcg;
        spinlock_t *ptl;
        pte_t *pte;
        int ret = 1;
 
-       if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
+       if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
+                                        GFP_KERNEL, &memcg)) {
                ret = -ENOMEM;
                goto out_nolock;
        }
@@ -863,7 +864,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
                if (ret > 0)
-                       mem_cgroup_cancel_charge_swapin(ptr);
+                       mem_cgroup_cancel_charge_swapin(memcg);
                ret = 0;
                goto out;
        }
@@ -874,7 +875,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
        page_add_anon_rmap(page, vma, addr);
-       mem_cgroup_commit_charge_swapin(page, ptr);
+       mem_cgroup_commit_charge_swapin(page, memcg);
        swap_free(entry);
        /*
         * Move the page to the active list so it is not
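The unuse_pte() hunks above only rename the local mem_cgroup pointer, but they make the three-step charging protocol easy to see: try to charge before taking the PTE lock, then either cancel the charge if the PTE no longer matches the swap entry or commit it once the page has been mapped. A compressed sketch of that flow, with a hypothetical function name and a boolean standing in for the pte_same() check, is:

/* Hypothetical condensed form of the charge flow used by unuse_pte(). */
static int charge_swapped_in_page(struct vm_area_struct *vma,
                                  struct page *page, bool pte_matches)
{
        struct mem_cgroup *memcg;

        /* Step 1: reserve the charge before touching the page tables. */
        if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &memcg))
                return -ENOMEM;

        if (!pte_matches) {
                /* Step 2a: the PTE changed under us, give the charge back. */
                mem_cgroup_cancel_charge_swapin(memcg);
                return 0;
        }

        /* Step 2b: the page is being mapped, make the charge permanent. */
        mem_cgroup_commit_charge_swapin(page, memcg);
        return 1;
}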
index 51c3554e1c353660a9a790729acd57a66bf3cf68..5738454c8b562a5be62e415835fd00488b6ae6c7 100644 (file)
@@ -1315,7 +1315,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
                unsigned long align, unsigned long flags, unsigned long start,
                unsigned long end, int node, gfp_t gfp_mask, void *caller)
 {
-       static struct vmap_area *va;
+       struct vmap_area *va;
        struct vm_struct *area;
 
        BUG_ON(in_interrupt());
@@ -2378,7 +2378,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
        vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
        vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
        if (!vas || !vms)
-               goto err_free;
+               goto err_free2;
 
        for (area = 0; area < nr_vms; area++) {
                vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
@@ -2476,11 +2476,10 @@ found:
 
 err_free:
        for (area = 0; area < nr_vms; area++) {
-               if (vas)
-                       kfree(vas[area]);
-               if (vms)
-                       kfree(vms[area]);
+               kfree(vas[area]);
+               kfree(vms[area]);
        }
+err_free2:
        kfree(vas);
        kfree(vms);
        return NULL;
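The pcpu_get_vm_areas() hunk above also tightens the error unwinding: failures of the two initial kzalloc() calls now jump to the new err_free2 label, which frees only the (possibly NULL) arrays, while err_free still frees the individual elements first; the redundant NULL checks around kfree() are dropped because kfree(NULL) is a no-op. A self-contained sketch of that two-label pattern, with hypothetical names, is:

#include <linux/slab.h>

/* Hypothetical example of the err_free/err_free2 unwind structure. */
static void **alloc_n_buffers(int nr, size_t size)
{
        void **bufs;
        int i;

        bufs = kzalloc(sizeof(bufs[0]) * nr, GFP_KERNEL);
        if (!bufs)
                goto err_free2;

        for (i = 0; i < nr; i++) {
                bufs[i] = kzalloc(size, GFP_KERNEL);
                if (!bufs[i])
                        goto err_free;
        }
        return bufs;

err_free:
        for (i = 0; i < nr; i++)
                kfree(bufs[i]);         /* kfree(NULL) is a no-op */
err_free2:
        kfree(bufs);
        return NULL;
}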
index f54a05b7a61d9eb658562b191996beb9d4cea397..1715e4e88d22a230b9f181e3e9bbb1ce6d8b59d2 100644 (file)
@@ -103,8 +103,11 @@ struct scan_control {
         */
        reclaim_mode_t reclaim_mode;
 
-       /* Which cgroup do we reclaim from */
-       struct mem_cgroup *mem_cgroup;
+       /*
+        * The memory cgroup that hit its limit and as a result is the
+        * primary target of this reclaim invocation.
+        */
+       struct mem_cgroup *target_mem_cgroup;
 
        /*
         * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -113,6 +116,11 @@ struct scan_control {
        nodemask_t      *nodemask;
 };
 
+struct mem_cgroup_zone {
+       struct mem_cgroup *mem_cgroup;
+       struct zone *zone;
+};
+
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 #ifdef ARCH_HAS_PREFETCH
@@ -153,28 +161,45 @@ static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
-#define scanning_global_lru(sc)        (!(sc)->mem_cgroup)
+static bool global_reclaim(struct scan_control *sc)
+{
+       return !sc->target_mem_cgroup;
+}
+
+static bool scanning_global_lru(struct mem_cgroup_zone *mz)
+{
+       return !mz->mem_cgroup;
+}
 #else
-#define scanning_global_lru(sc)        (1)
+static bool global_reclaim(struct scan_control *sc)
+{
+       return true;
+}
+
+static bool scanning_global_lru(struct mem_cgroup_zone *mz)
+{
+       return true;
+}
 #endif
 
-static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
-                                                 struct scan_control *sc)
+static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
 {
-       if (!scanning_global_lru(sc))
-               return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
 
-       return &zone->reclaim_stat;
+       return &mz->zone->reclaim_stat;
 }
 
-static unsigned long zone_nr_lru_pages(struct zone *zone,
-                               struct scan_control *sc, enum lru_list lru)
+static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
+                                      enum lru_list lru)
 {
-       if (!scanning_global_lru(sc))
-               return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
-                               zone_to_nid(zone), zone_idx(zone), BIT(lru));
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
+                                                   zone_to_nid(mz->zone),
+                                                   zone_idx(mz->zone),
+                                                   BIT(lru));
 
-       return zone_page_state(zone, NR_LRU_BASE + lru);
+       return zone_page_state(mz->zone, NR_LRU_BASE + lru);
 }
 
 
@@ -677,12 +702,13 @@ enum page_references {
 };
 
 static enum page_references page_check_references(struct page *page,
+                                                 struct mem_cgroup_zone *mz,
                                                  struct scan_control *sc)
 {
        int referenced_ptes, referenced_page;
        unsigned long vm_flags;
 
-       referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
+       referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags);
        referenced_page = TestClearPageReferenced(page);
 
        /* Lumpy reclaim - ignore references */
@@ -715,7 +741,13 @@ static enum page_references page_check_references(struct page *page,
                 */
                SetPageReferenced(page);
 
-               if (referenced_page)
+               if (referenced_page || referenced_ptes > 1)
+                       return PAGEREF_ACTIVATE;
+
+               /*
+                * Activate file-backed executable pages after first usage.
+                */
+               if (vm_flags & VM_EXEC)
                        return PAGEREF_ACTIVATE;
 
                return PAGEREF_KEEP;
@@ -728,29 +760,11 @@ static enum page_references page_check_references(struct page *page,
        return PAGEREF_RECLAIM;
 }
 
-static noinline_for_stack void free_page_list(struct list_head *free_pages)
-{
-       struct pagevec freed_pvec;
-       struct page *page, *tmp;
-
-       pagevec_init(&freed_pvec, 1);
-
-       list_for_each_entry_safe(page, tmp, free_pages, lru) {
-               list_del(&page->lru);
-               if (!pagevec_add(&freed_pvec, page)) {
-                       __pagevec_free(&freed_pvec);
-                       pagevec_reinit(&freed_pvec);
-               }
-       }
-
-       pagevec_free(&freed_pvec);
-}
-
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-                                     struct zone *zone,
+                                     struct mem_cgroup_zone *mz,
                                      struct scan_control *sc,
                                      int priority,
                                      unsigned long *ret_nr_dirty,
@@ -781,7 +795,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        goto keep;
 
                VM_BUG_ON(PageActive(page));
-               VM_BUG_ON(page_zone(page) != zone);
+               VM_BUG_ON(page_zone(page) != mz->zone);
 
                sc->nr_scanned++;
 
@@ -815,7 +829,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        }
                }
 
-               references = page_check_references(page, sc);
+               references = page_check_references(page, mz, sc);
                switch (references) {
                case PAGEREF_ACTIVATE:
                        goto activate_locked;
@@ -1006,10 +1020,10 @@ keep_lumpy:
         * back off and wait for congestion to clear because further reclaim
         * will encounter the same problem
         */
-       if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
-               zone_set_flag(zone, ZONE_CONGESTED);
+       if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
+               zone_set_flag(mz->zone, ZONE_CONGESTED);
 
-       free_page_list(&free_pages);
+       free_hot_cold_page_list(&free_pages, 1);
 
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
@@ -1125,15 +1139,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
                switch (__isolate_lru_page(page, mode, file)) {
                case 0:
+                       mem_cgroup_lru_del(page);
                        list_move(&page->lru, dst);
-                       mem_cgroup_del_lru(page);
                        nr_taken += hpage_nr_pages(page);
                        break;
 
                case -EBUSY:
                        /* else it is being freed elsewhere */
                        list_move(&page->lru, src);
-                       mem_cgroup_rotate_lru_list(page, page_lru(page));
                        continue;
 
                default:
@@ -1183,8 +1196,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                                break;
 
                        if (__isolate_lru_page(cursor_page, mode, file) == 0) {
+                               mem_cgroup_lru_del(cursor_page);
                                list_move(&cursor_page->lru, dst);
-                               mem_cgroup_del_lru(cursor_page);
                                nr_taken += hpage_nr_pages(page);
                                nr_lumpy_taken++;
                                if (PageDirty(cursor_page))
@@ -1225,19 +1238,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        return nr_taken;
 }
 
-static unsigned long isolate_pages_global(unsigned long nr,
-                                       struct list_head *dst,
-                                       unsigned long *scanned, int order,
-                                       isolate_mode_t mode,
-                                       struct zone *z, int active, int file)
+static unsigned long isolate_pages(unsigned long nr, struct mem_cgroup_zone *mz,
+                                  struct list_head *dst,
+                                  unsigned long *scanned, int order,
+                                  isolate_mode_t mode, int active, int file)
 {
+       struct lruvec *lruvec;
        int lru = LRU_BASE;
+
+       lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
        if (active)
                lru += LRU_ACTIVE;
        if (file)
                lru += LRU_FILE;
-       return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
-                                                               mode, file);
+       return isolate_lru_pages(nr, &lruvec->lists[lru], dst,
+                                scanned, order, mode, file);
 }
 
 /*
@@ -1325,7 +1340,7 @@ static int too_many_isolated(struct zone *zone, int file,
        if (current_is_kswapd())
                return 0;
 
-       if (!scanning_global_lru(sc))
+       if (!global_reclaim(sc))
                return 0;
 
        if (file) {
@@ -1343,13 +1358,14 @@ static int too_many_isolated(struct zone *zone, int file,
  * TODO: Try merging with migrations version of putback_lru_pages
  */
 static noinline_for_stack void
-putback_lru_pages(struct zone *zone, struct scan_control *sc,
-                               unsigned long nr_anon, unsigned long nr_file,
-                               struct list_head *page_list)
+putback_lru_pages(struct mem_cgroup_zone *mz, struct scan_control *sc,
+                 unsigned long nr_anon, unsigned long nr_file,
+                 struct list_head *page_list)
 {
        struct page *page;
        struct pagevec pvec;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       struct zone *zone = mz->zone;
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
 
        pagevec_init(&pvec, 1);
 
@@ -1389,15 +1405,17 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
        pagevec_release(&pvec);
 }
 
-static noinline_for_stack void update_isolated_counts(struct zone *zone,
-                                       struct scan_control *sc,
-                                       unsigned long *nr_anon,
-                                       unsigned long *nr_file,
-                                       struct list_head *isolated_list)
+static noinline_for_stack void
+update_isolated_counts(struct mem_cgroup_zone *mz,
+                      struct scan_control *sc,
+                      unsigned long *nr_anon,
+                      unsigned long *nr_file,
+                      struct list_head *isolated_list)
 {
        unsigned long nr_active;
+       struct zone *zone = mz->zone;
        unsigned int count[NR_LRU_LISTS] = { 0, };
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
 
        nr_active = clear_active_flags(isolated_list, count);
        __count_vm_events(PGDEACTIVATE, nr_active);
@@ -1466,8 +1484,8 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
  * of reclaimed pages
  */
 static noinline_for_stack unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
-                       struct scan_control *sc, int priority, int file)
+shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
+                    struct scan_control *sc, int priority, int file)
 {
        LIST_HEAD(page_list);
        unsigned long nr_scanned;
@@ -1478,6 +1496,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        unsigned long nr_dirty = 0;
        unsigned long nr_writeback = 0;
        isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
+       struct zone *zone = mz->zone;
 
        while (unlikely(too_many_isolated(zone, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1500,9 +1519,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
        spin_lock_irq(&zone->lru_lock);
 
-       if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_to_scan, &page_list,
-                       &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
+       nr_taken = isolate_pages(nr_to_scan, mz, &page_list,
+                                &nr_scanned, sc->order,
+                                reclaim_mode, 0, file);
+       if (global_reclaim(sc)) {
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1510,14 +1530,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                else
                        __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                               nr_scanned);
-       } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
-                       &nr_scanned, sc->order, reclaim_mode, zone,
-                       sc->mem_cgroup, 0, file);
-               /*
-                * mem_cgroup_isolate_pages() keeps track of
-                * scanned pages on its own.
-                */
        }
 
        if (nr_taken == 0) {
@@ -1525,17 +1537,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                return 0;
        }
 
-       update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
+       update_isolated_counts(mz, sc, &nr_anon, &nr_file, &page_list);
 
        spin_unlock_irq(&zone->lru_lock);
 
-       nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+       nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
                                                &nr_dirty, &nr_writeback);
 
        /* Check if we should synchronously wait for writeback */
        if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                set_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+               nr_reclaimed += shrink_page_list(&page_list, mz, sc,
                                        priority, &nr_dirty, &nr_writeback);
        }
 
@@ -1544,7 +1556,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
        __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
 
-       putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+       putback_lru_pages(mz, sc, nr_anon, nr_file, &page_list);
 
        /*
         * If reclaim is isolating dirty pages under writeback, it implies
@@ -1609,13 +1621,15 @@ static void move_active_pages_to_lru(struct zone *zone,
        pagevec_init(&pvec, 1);
 
        while (!list_empty(list)) {
+               struct lruvec *lruvec;
+
                page = lru_to_page(list);
 
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
 
-               list_move(&page->lru, &zone->lru[lru].list);
-               mem_cgroup_add_lru_list(page, lru);
+               lruvec = mem_cgroup_lru_add_list(zone, page, lru);
+               list_move(&page->lru, &lruvec->lists[lru]);
                pgmoved += hpage_nr_pages(page);
 
                if (!pagevec_add(&pvec, page) || list_empty(list)) {
@@ -1631,8 +1645,10 @@ static void move_active_pages_to_lru(struct zone *zone,
                __count_vm_events(PGDEACTIVATE, pgmoved);
 }
 
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-                       struct scan_control *sc, int priority, int file)
+static void shrink_active_list(unsigned long nr_pages,
+                              struct mem_cgroup_zone *mz,
+                              struct scan_control *sc,
+                              int priority, int file)
 {
        unsigned long nr_taken;
        unsigned long pgscanned;
@@ -1641,9 +1657,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        LIST_HEAD(l_active);
        LIST_HEAD(l_inactive);
        struct page *page;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
        unsigned long nr_rotated = 0;
        isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
+       struct zone *zone = mz->zone;
 
        lru_add_drain();
 
@@ -1653,22 +1670,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                reclaim_mode |= ISOLATE_CLEAN;
 
        spin_lock_irq(&zone->lru_lock);
-       if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_pages, &l_hold,
-                                               &pgscanned, sc->order,
-                                               reclaim_mode, zone,
-                                               1, file);
+
+       nr_taken = isolate_pages(nr_pages, mz, &l_hold,
+                                &pgscanned, sc->order,
+                                reclaim_mode, 1, file);
+
+       if (global_reclaim(sc))
                zone->pages_scanned += pgscanned;
-       } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
-                                               &pgscanned, sc->order,
-                                               reclaim_mode, zone,
-                                               sc->mem_cgroup, 1, file);
-               /*
-                * mem_cgroup_isolate_pages() keeps track of
-                * scanned pages on its own.
-                */
-       }
 
        reclaim_stat->recent_scanned[file] += nr_taken;
 
@@ -1690,7 +1698,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                        continue;
                }
 
-               if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
+               if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
                        nr_rotated += hpage_nr_pages(page);
                        /*
                         * Identify referenced, file-backed active pages and
@@ -1753,10 +1761,8 @@ static int inactive_anon_is_low_global(struct zone *zone)
  * Returns true if the zone does not have enough inactive anon pages,
  * meaning some active anon pages need to be deactivated.
  */
-static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
+static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
 {
-       int low;
-
        /*
         * If we don't have swap space, anonymous page deactivation
         * is pointless.
@@ -1764,15 +1770,14 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
        if (!total_swap_pages)
                return 0;
 
-       if (scanning_global_lru(sc))
-               low = inactive_anon_is_low_global(zone);
-       else
-               low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
-       return low;
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
+                                                      mz->zone);
+
+       return inactive_anon_is_low_global(mz->zone);
 }
 #else
-static inline int inactive_anon_is_low(struct zone *zone,
-                                       struct scan_control *sc)
+static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz)
 {
        return 0;
 }
@@ -1790,8 +1795,7 @@ static int inactive_file_is_low_global(struct zone *zone)
 
 /**
  * inactive_file_is_low - check if file pages need to be deactivated
- * @zone: zone to check
- * @sc:   scan control of this context
+ * @mz: memory cgroup and zone to check
  *
  * When the system is doing streaming IO, memory pressure here
  * ensures that active file pages get deactivated, until more
@@ -1803,45 +1807,44 @@ static int inactive_file_is_low_global(struct zone *zone)
  * This uses a different ratio than the anonymous pages, because
  * the page cache uses a use-once replacement algorithm.
  */
-static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
+static int inactive_file_is_low(struct mem_cgroup_zone *mz)
 {
-       int low;
+       if (!scanning_global_lru(mz))
+               return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
+                                                      mz->zone);
 
-       if (scanning_global_lru(sc))
-               low = inactive_file_is_low_global(zone);
-       else
-               low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
-       return low;
+       return inactive_file_is_low_global(mz->zone);
 }
 
-static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
-                               int file)
+static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file)
 {
        if (file)
-               return inactive_file_is_low(zone, sc);
+               return inactive_file_is_low(mz);
        else
-               return inactive_anon_is_low(zone, sc);
+               return inactive_anon_is_low(mz);
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-       struct zone *zone, struct scan_control *sc, int priority)
+                                struct mem_cgroup_zone *mz,
+                                struct scan_control *sc, int priority)
 {
        int file = is_file_lru(lru);
 
        if (is_active_lru(lru)) {
-               if (inactive_list_is_low(zone, sc, file))
-                   shrink_active_list(nr_to_scan, zone, sc, priority, file);
+               if (inactive_list_is_low(mz, file))
+                       shrink_active_list(nr_to_scan, mz, sc, priority, file);
                return 0;
        }
 
-       return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
+       return shrink_inactive_list(nr_to_scan, mz, sc, priority, file);
 }
 
-static int vmscan_swappiness(struct scan_control *sc)
+static int vmscan_swappiness(struct mem_cgroup_zone *mz,
+                            struct scan_control *sc)
 {
-       if (scanning_global_lru(sc))
+       if (global_reclaim(sc))
                return vm_swappiness;
-       return mem_cgroup_swappiness(sc->mem_cgroup);
+       return mem_cgroup_swappiness(mz->mem_cgroup);
 }
 
 /*
@@ -1852,13 +1855,13 @@ static int vmscan_swappiness(struct scan_control *sc)
  *
  * nr[0] = anon pages to scan; nr[1] = file pages to scan
  */
-static void get_scan_count(struct zone *zone, struct scan_control *sc,
-                                       unsigned long *nr, int priority)
+static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
+                          unsigned long *nr, int priority)
 {
        unsigned long anon, file, free;
        unsigned long anon_prio, file_prio;
        unsigned long ap, fp;
-       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+       struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
        u64 fraction[2], denominator;
        enum lru_list l;
        int noswap = 0;
@@ -1874,9 +1877,9 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         * latencies, so it's better to scan a minimum amount there as
         * well.
         */
-       if (scanning_global_lru(sc) && current_is_kswapd())
+       if (current_is_kswapd() && mz->zone->all_unreclaimable)
                force_scan = true;
-       if (!scanning_global_lru(sc))
+       if (!global_reclaim(sc))
                force_scan = true;
 
        /* If we have no swap space, do not bother scanning anon pages. */
@@ -1888,16 +1891,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                goto out;
        }
 
-       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       anon  = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) +
+               zone_nr_lru_pages(mz, LRU_INACTIVE_ANON);
+       file  = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) +
+               zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
 
-       if (scanning_global_lru(sc)) {
-               free  = zone_page_state(zone, NR_FREE_PAGES);
+       if (global_reclaim(sc)) {
+               free  = zone_page_state(mz->zone, NR_FREE_PAGES);
                /* If we have very few page cache pages,
                   force-scan anon pages. */
-               if (unlikely(file + free <= high_wmark_pages(zone))) {
+               if (unlikely(file + free <= high_wmark_pages(mz->zone))) {
                        fraction[0] = 1;
                        fraction[1] = 0;
                        denominator = 1;
@@ -1909,8 +1912,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         * With swappiness at 100, anonymous and file have the same priority.
         * This scanning priority is essentially the inverse of IO cost.
         */
-       anon_prio = vmscan_swappiness(sc);
-       file_prio = 200 - vmscan_swappiness(sc);
+       anon_prio = vmscan_swappiness(mz, sc);
+       file_prio = 200 - vmscan_swappiness(mz, sc);
 
        /*
         * OK, so we have swap space and a fair amount of page cache
@@ -1923,7 +1926,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
         *
         * anon in [0], file in [1]
         */
-       spin_lock_irq(&zone->lru_lock);
+       spin_lock_irq(&mz->zone->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
                reclaim_stat->recent_scanned[0] /= 2;
                reclaim_stat->recent_rotated[0] /= 2;
@@ -1944,7 +1947,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 
        fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
        fp /= reclaim_stat->recent_rotated[1] + 1;
-       spin_unlock_irq(&zone->lru_lock);
+       spin_unlock_irq(&mz->zone->lru_lock);
 
        fraction[0] = ap;
        fraction[1] = fp;
@@ -1954,7 +1957,7 @@ out:
                int file = is_file_lru(l);
                unsigned long scan;
 
-               scan = zone_nr_lru_pages(zone, sc, l);
+               scan = zone_nr_lru_pages(mz, l);
                if (priority || noswap) {
                        scan >>= priority;
                        if (!scan && force_scan)
@@ -1972,7 +1975,7 @@ out:
  * back to the allocator and call try_to_compact_zone(), we ensure that
  * there are enough free pages for it to be likely successful
  */
-static inline bool should_continue_reclaim(struct zone *zone,
+static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
                                        unsigned long nr_reclaimed,
                                        unsigned long nr_scanned,
                                        struct scan_control *sc)
@@ -2012,14 +2015,14 @@ static inline bool should_continue_reclaim(struct zone *zone,
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
-       inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
-                               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+       inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_ANON) +
+                               zone_nr_lru_pages(mz, LRU_INACTIVE_FILE);
        if (sc->nr_reclaimed < pages_for_compaction &&
                        inactive_lru_pages > pages_for_compaction)
                return true;
 
        /* If compaction would go ahead or the allocation would succeed, stop */
-       switch (compaction_suitable(zone, sc->order)) {
+       switch (compaction_suitable(mz->zone, sc->order)) {
        case COMPACT_PARTIAL:
        case COMPACT_CONTINUE:
                return false;
@@ -2031,8 +2034,8 @@ static inline bool should_continue_reclaim(struct zone *zone,
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_zone(int priority, struct zone *zone,
-                               struct scan_control *sc)
+static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
+                                  struct scan_control *sc)
 {
        unsigned long nr[NR_LRU_LISTS];
        unsigned long nr_to_scan;
@@ -2044,7 +2047,7 @@ static void shrink_zone(int priority, struct zone *zone,
 restart:
        nr_reclaimed = 0;
        nr_scanned = sc->nr_scanned;
-       get_scan_count(zone, sc, nr, priority);
+       get_scan_count(mz, sc, nr, priority);
 
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2056,7 +2059,7 @@ restart:
                                nr[l] -= nr_to_scan;
 
                                nr_reclaimed += shrink_list(l, nr_to_scan,
-                                                           zone, sc, priority);
+                                                           mz, sc, priority);
                        }
                }
                /*
@@ -2077,17 +2080,53 @@ restart:
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_anon_is_low(zone, sc))
-               shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+       if (inactive_anon_is_low(mz))
+               shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0);
 
        /* reclaim/compaction might need reclaim to continue */
-       if (should_continue_reclaim(zone, nr_reclaimed,
+       if (should_continue_reclaim(mz, nr_reclaimed,
                                        sc->nr_scanned - nr_scanned, sc))
                goto restart;
 
        throttle_vm_writeout(sc->gfp_mask);
 }
 
+static void shrink_zone(int priority, struct zone *zone,
+                       struct scan_control *sc)
+{
+       struct mem_cgroup *root = sc->target_mem_cgroup;
+       struct mem_cgroup_reclaim_cookie reclaim = {
+               .zone = zone,
+               .priority = priority,
+       };
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_iter(root, NULL, &reclaim);
+       do {
+               struct mem_cgroup_zone mz = {
+                       .mem_cgroup = memcg,
+                       .zone = zone,
+               };
+
+               shrink_mem_cgroup_zone(priority, &mz, sc);
+               /*
+                * Limit reclaim has historically picked one memcg and
+                * scanned it with decreasing priority levels until
+                * nr_to_reclaim had been reclaimed.  This priority
+                * cycle is thus over after a single memcg.
+                *
+                * Direct reclaim and kswapd, on the other hand, have
+                * to scan all memory cgroups to fulfill the overall
+                * scan target for the zone.
+                */
+               if (!global_reclaim(sc)) {
+                       mem_cgroup_iter_break(root, memcg);
+                       break;
+               }
+               memcg = mem_cgroup_iter(root, memcg, &reclaim);
+       } while (memcg);
+}
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2125,7 +2164,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
                 * Take care memory controller reclaiming has small influence
                 * to global LRU.
                 */
-               if (scanning_global_lru(sc)) {
+               if (global_reclaim(sc)) {
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -2223,13 +2262,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        get_mems_allowed();
        delayacct_freepages_start();
 
-       if (scanning_global_lru(sc))
+       if (global_reclaim(sc))
                count_vm_event(ALLOCSTALL);
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                sc->nr_scanned = 0;
                if (!priority)
-                       disable_swap_token(sc->mem_cgroup);
+                       disable_swap_token(sc->target_mem_cgroup);
                if (shrink_zones(priority, zonelist, sc))
                        break;
 
@@ -2237,7 +2276,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                 * Don't shrink slabs when reclaiming memory from
                 * over limit cgroups
                 */
-               if (scanning_global_lru(sc)) {
+               if (global_reclaim(sc)) {
                        unsigned long lru_pages = 0;
                        for_each_zone_zonelist(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask)) {
@@ -2299,7 +2338,7 @@ out:
                return 0;
 
        /* top priority shrink_zones still had more to do? don't OOM, then */
-       if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
+       if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
                return 1;
 
        return 0;
@@ -2316,7 +2355,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                .may_unmap = 1,
                .may_swap = 1,
                .order = order,
-               .mem_cgroup = NULL,
+               .target_mem_cgroup = NULL,
                .nodemask = nodemask,
        };
        struct shrink_control shrink = {
@@ -2336,7 +2375,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                                                gfp_t gfp_mask, bool noswap,
                                                struct zone *zone,
                                                unsigned long *nr_scanned)
@@ -2348,7 +2387,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                .may_unmap = 1,
                .may_swap = !noswap,
                .order = 0,
-               .mem_cgroup = mem,
+               .target_mem_cgroup = memcg,
+       };
+       struct mem_cgroup_zone mz = {
+               .mem_cgroup = memcg,
+               .zone = zone,
        };
 
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2365,7 +2408,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
         * will pick up pages from other mem cgroup's as well. We hack
         * the priority and make it zero.
         */
-       shrink_zone(0, zone, &sc);
+       shrink_mem_cgroup_zone(0, &mz, &sc);
 
        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2373,7 +2416,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
        return sc.nr_reclaimed;
 }
 
-unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           gfp_t gfp_mask,
                                           bool noswap)
 {
@@ -2386,7 +2429,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                .may_swap = !noswap,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .order = 0,
-               .mem_cgroup = mem_cont,
+               .target_mem_cgroup = memcg,
                .nodemask = NULL, /* we don't care the placement */
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2400,7 +2443,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
         * take care of from where we get pages. So the node where we start the
         * scan does not need to be the current node.
         */
-       nid = mem_cgroup_select_victim_node(mem_cont);
+       nid = mem_cgroup_select_victim_node(memcg);
 
        zonelist = NODE_DATA(nid)->node_zonelists;
 
@@ -2416,6 +2459,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
 
+static void age_active_anon(struct zone *zone, struct scan_control *sc,
+                           int priority)
+{
+       struct mem_cgroup *memcg;
+
+       if (!total_swap_pages)
+               return;
+
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
+       do {
+               struct mem_cgroup_zone mz = {
+                       .mem_cgroup = memcg,
+                       .zone = zone,
+               };
+
+               if (inactive_anon_is_low(&mz))
+                       shrink_active_list(SWAP_CLUSTER_MAX, &mz,
+                                          sc, priority, 0);
+
+               memcg = mem_cgroup_iter(NULL, memcg, NULL);
+       } while (memcg);
+}
+
 /*
  * pgdat_balanced is used when checking if a node is balanced for high-order
  * allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2536,7 +2602,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                 */
                .nr_to_reclaim = ULONG_MAX,
                .order = order,
-               .mem_cgroup = NULL,
+               .target_mem_cgroup = NULL,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
@@ -2575,9 +2641,7 @@ loop_again:
                         * Do some background aging of the anon list, to give
                         * pages a chance to be referenced before reclaiming.
                         */
-                       if (inactive_anon_is_low(zone, &sc))
-                               shrink_active_list(SWAP_CLUSTER_MAX, zone,
-                                                       &sc, priority, 0);
+                       age_active_anon(zone, &sc, priority);
 
                        if (!zone_watermark_ok_safe(zone, order,
                                        high_wmark_pages(zone), 0, 0)) {
@@ -3366,16 +3430,18 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
  */
 static void check_move_unevictable_page(struct page *page, struct zone *zone)
 {
-       VM_BUG_ON(PageActive(page));
+       struct lruvec *lruvec;
 
+       VM_BUG_ON(PageActive(page));
 retry:
        ClearPageUnevictable(page);
        if (page_evictable(page, NULL)) {
                enum lru_list l = page_lru_base_type(page);
 
                __dec_zone_state(zone, NR_UNEVICTABLE);
-               list_move(&page->lru, &zone->lru[l].list);
-               mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
+               lruvec = mem_cgroup_lru_move_lists(zone, page,
+                                                  LRU_UNEVICTABLE, l);
+               list_move(&page->lru, &lruvec->lists[l]);
                __inc_zone_state(zone, NR_INACTIVE_ANON + l);
                __count_vm_event(UNEVICTABLE_PGRESCUED);
        } else {
@@ -3383,8 +3449,9 @@ retry:
                 * rotate unevictable list
                 */
                SetPageUnevictable(page);
-               list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
-               mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
+               lruvec = mem_cgroup_lru_move_lists(zone, page, LRU_UNEVICTABLE,
+                                                  LRU_UNEVICTABLE);
+               list_move(&page->lru, &lruvec->lists[LRU_UNEVICTABLE]);
                if (page_evictable(page, NULL))
                        goto retry;
        }
@@ -3448,9 +3515,10 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
 static void warn_scan_unevictable_pages(void)
 {
        printk_once(KERN_WARNING
-                   "The scan_unevictable_pages sysctl/node-interface has been "
+                   "%s: The scan_unevictable_pages sysctl/node-interface has been "
                    "disabled for lack of a legitimate use case.  If you have "
-                   "one, please send an email to linux-mm@kvack.org.\n");
+                   "one, please send an email to linux-mm@kvack.org.\n",
+                   current->comm);
 }
 
 /*
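The rewritten shrink_zone() above turns per-zone reclaim into a walk over every memory cgroup below sc->target_mem_cgroup, pairing each one with the zone in a mem_cgroup_zone and stopping after the first memcg for limit (non-global) reclaim, exactly as its comment describes; age_active_anon() uses the same walk for anon aging. Reduced to its control flow, with a hypothetical callback standing in for shrink_mem_cgroup_zone(), the iteration skeleton is:

/* Hypothetical illustration of the hierarchy walk used by shrink_zone(). */
static void for_each_memcg_in_zone(struct mem_cgroup *root, struct zone *zone,
                                   bool global,
                                   void (*visit)(struct mem_cgroup_zone *mz))
{
        struct mem_cgroup *memcg;

        memcg = mem_cgroup_iter(root, NULL, NULL);
        do {
                struct mem_cgroup_zone mz = {
                        .mem_cgroup = memcg,
                        .zone = zone,
                };

                visit(&mz);
                if (!global) {
                        /* Limit reclaim: one memcg per priority cycle. */
                        mem_cgroup_iter_break(root, memcg);
                        break;
                }
                /* Global reclaim: keep walking the whole hierarchy. */
                memcg = mem_cgroup_iter(root, memcg, NULL);
        } while (memcg);
}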
index 8fd603b1665e5be65bc10c93f146b860f706925d..f600557a76596231ef659fdff0c9f2ea8aed71ae 100644 (file)
@@ -295,7 +295,7 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
-#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
 /*
  * If we have cmpxchg_local support then we do not need to incur the overhead
  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
index ef21b221f0363a8700fbbc3aadd316423c4f9d66..cee82c85d99522947172f63f4e21d80d68f62d54 100644 (file)
@@ -570,6 +570,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
                return 0;
 
        net = nf_ct_net(ct);
+
+       /* container deinit, netlink may have died before death_by_timeout */
+       if (!net->nfnl)
+               return 0;
+
        if (!item->report && !nfnetlink_has_listeners(net, group))
                return 0;
 
index 8fda3b3f7be87c01958c80823aea23cc22c790bf..e3bfcbe8a520b63639b04115bf148fb4ddd144eb 100755 (executable)
@@ -227,7 +227,7 @@ our $Inline = qr{inline|__always_inline|noinline};
 our $Member    = qr{->$Ident|\.$Ident|\[[^]]*\]};
 our $Lval      = qr{$Ident(?:$Member)*};
 
-our $Constant  = qr{(?:[0-9]+|0x[0-9a-fA-F]+)[UL]*};
+our $Constant  = qr{(?i:(?:[0-9]+|0x[0-9a-f]+)[ul]*)};
 our $Assignment        = qr{(?:\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=)};
 our $Compare    = qr{<=|>=|==|!=|<|>};
 our $Operators = qr{
@@ -315,7 +315,7 @@ sub build_types {
        $NonptrType     = qr{
                        (?:$Modifier\s+|const\s+)*
                        (?:
-                               (?:typeof|__typeof__)\s*\(\s*\**\s*$Ident\s*\)|
+                               (?:typeof|__typeof__)\s*\([^\)]*\)|
                                (?:$typeTypedefs\b)|
                                (?:${all}\b)
                        )
@@ -334,6 +334,7 @@ our $match_balanced_parentheses = qr/(\((?:[^\(\)]+|(-1))*\))/;
 
 our $Typecast  = qr{\s*(\(\s*$NonptrType\s*\)){0,1}\s*};
 our $LvalOrFunc        = qr{($Lval)\s*($match_balanced_parentheses{0,1})\s*};
+our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)};
 
 sub deparenthesize {
        my ($string) = @_;
@@ -676,6 +677,10 @@ sub ctx_statement_block {
                        if ($off >= $len) {
                                last;
                        }
+                       if ($level == 0 && substr($blk, $off) =~ /^.\s*#\s*define/) {
+                               $level++;
+                               $type = '#';
+                       }
                }
                $p = $c;
                $c = substr($blk, $off, 1);
@@ -738,6 +743,13 @@ sub ctx_statement_block {
                                last;
                        }
                }
+               # Preprocessor commands end at the newline unless escaped.
+               if ($type eq '#' && $c eq "\n" && $p ne "\\") {
+                       $level--;
+                       $type = '';
+                       $off++;
+                       last;
+               }
                $off++;
        }
        # We are truly at the end, so shuffle to the next line.
@@ -1020,7 +1032,7 @@ sub annotate_values {
                } elsif ($cur =~ /^(\(\s*$Type\s*)\)/ && $av_pending eq '_') {
                        print "CAST($1)\n" if ($dbg_values > 1);
                        push(@av_paren_type, $type);
-                       $type = 'C';
+                       $type = 'c';
 
                } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) {
                        print "DECLARE($1)\n" if ($dbg_values > 1);
@@ -1212,7 +1224,9 @@ sub possible {
                        case|
                        else|
                        asm|__asm__|
-                       do
+                       do|
+                       \#|
+                       \#\#|
                )(?:\s|$)|
                ^(?:typedef|struct|enum)\b
            )}x;
@@ -1359,6 +1373,7 @@ sub process {
        my %suppress_ifbraces;
        my %suppress_whiletrailers;
        my %suppress_export;
+       my $suppress_statement = 0;
 
        # Pre-scan the patch sanitizing the lines.
        # Pre-scan the patch looking for any __setup documentation.
@@ -1468,6 +1483,7 @@ sub process {
                        %suppress_ifbraces = ();
                        %suppress_whiletrailers = ();
                        %suppress_export = ();
+                       $suppress_statement = 0;
                        next;
 
 # track the line number as we move through the hunk, note that
@@ -1504,9 +1520,11 @@ sub process {
                if ($line =~ /^diff --git.*?(\S+)$/) {
                        $realfile = $1;
                        $realfile =~ s@^([^/]*)/@@;
+                       $in_commit_log = 0;
                } elsif ($line =~ /^\+\+\+\s+(\S+)/) {
                        $realfile = $1;
                        $realfile =~ s@^([^/]*)/@@;
+                       $in_commit_log = 0;
 
                        $p1_prefix = $1;
                        if (!$file && $tree && $p1_prefix ne '' &&
@@ -1546,7 +1564,8 @@ sub process {
                }
 
 # Check signature styles
-               if ($line =~ /^(\s*)($signature_tags)(\s*)(.*)/) {
+               if (!$in_header_lines &&
+                   $line =~ /^(\s*)($signature_tags)(\s*)(.*)/) {
                        my $space_before = $1;
                        my $sign_off = $2;
                        my $space_after = $3;
@@ -1623,7 +1642,7 @@ sub process {
 # Check if it's the start of a commit log
 # (not a header line and we haven't seen the patch filename)
                if ($in_header_lines && $realfile =~ /^$/ &&
-                   $rawline !~ /^(commit\b|from\b|\w+:).+$/i) {
+                   $rawline !~ /^(commit\b|from\b|[\w-]+:).+$/i) {
                        $in_header_lines = 0;
                        $in_commit_log = 1;
                }
@@ -1655,19 +1674,26 @@ sub process {
 # Only applies when adding the entry originally, after that we do not have
 # sufficient context to determine whether it is indeed long enough.
                if ($realfile =~ /Kconfig/ &&
-                   $line =~ /\+\s*(?:---)?help(?:---)?$/) {
+                   $line =~ /.\s*config\s+/) {
                        my $length = 0;
                        my $cnt = $realcnt;
                        my $ln = $linenr + 1;
                        my $f;
+                       my $is_start = 0;
                        my $is_end = 0;
-                       while ($cnt > 0 && defined $lines[$ln - 1]) {
+                       for (; $cnt > 0 && defined $lines[$ln - 1]; $ln++) {
                                $f = $lines[$ln - 1];
                                $cnt-- if ($lines[$ln - 1] !~ /^-/);
                                $is_end = $lines[$ln - 1] =~ /^\+/;
-                               $ln++;
 
                                next if ($f =~ /^-/);
+
+                               if ($lines[$ln - 1] =~ /.\s*(?:bool|tristate)\s*\"/) {
+                                       $is_start = 1;
+                               } elsif ($lines[$ln - 1] =~ /.\s*(?:---)?help(?:---)?$/) {
+                                       $length = -1;
+                               }
+
                                $f =~ s/^.//;
                                $f =~ s/#.*//;
                                $f =~ s/^\s+//;
@@ -1679,8 +1705,8 @@ sub process {
                                $length++;
                        }
                        WARN("CONFIG_DESCRIPTION",
-                            "please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($is_end && $length < 4);
-                       #print "is_end<$is_end> length<$length>\n";
+                            "please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($is_start && $is_end && $length < 4);
+                       #print "is_start<$is_start> is_end<$is_end> length<$length>\n";
                }
 
                if (($realfile =~ /Makefile.*/ || $realfile =~ /Kbuild.*/) &&
@@ -1792,12 +1818,24 @@ sub process {
 # Check for potential 'bare' types
                my ($stat, $cond, $line_nr_next, $remain_next, $off_next,
                    $realline_next);
-               if ($realcnt && $line =~ /.\s*\S/) {
+#print "LINE<$line>\n";
+               if ($linenr >= $suppress_statement &&
+                   $realcnt && $line =~ /.\s*\S/) {
                        ($stat, $cond, $line_nr_next, $remain_next, $off_next) =
                                ctx_statement_block($linenr, $realcnt, 0);
                        $stat =~ s/\n./\n /g;
                        $cond =~ s/\n./\n /g;
 
+#print "linenr<$linenr> <$stat>\n";
+                       # If this statement has no statement boundaries within
+                       # it, there is no point in retrying a statement scan
+                       # until we hit the end of it.
+                       my $frag = $stat; $frag =~ s/;+\s*$//;
+                       if ($frag !~ /(?:{|;)/) {
+#print "skip<$line_nr_next>\n";
+                               $suppress_statement = $line_nr_next;
+                       }
+
                        # Find the real next line.
                        $realline_next = $line_nr_next;
                        if (defined $realline_next &&
@@ -1923,6 +1961,9 @@ sub process {
 
 # Check relative indent for conditionals and blocks.
                if ($line =~ /\b(?:(?:if|while|for)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) {
+                       ($stat, $cond, $line_nr_next, $remain_next, $off_next) =
+                               ctx_statement_block($linenr, $realcnt, 0)
+                                       if (!defined $stat);
                        my ($s, $c) = ($stat, $cond);
 
                        substr($s, 0, length($c), '');
@@ -2090,7 +2131,7 @@ sub process {
                        #   XXX(foo);
                        #   EXPORT_SYMBOL(something_foo);
                        my $name = $1;
-                       if ($stat =~ /^.([A-Z_]+)\s*\(\s*($Ident)/ &&
+                       if ($stat =~ /^(?:.\s*}\s*\n)?.([A-Z_]+)\s*\(\s*($Ident)/ &&
                            $name =~ /^${Ident}_$2/) {
 #print "FOO C name<$name>\n";
                                $suppress_export{$realline_next} = 1;
@@ -2168,8 +2209,9 @@ sub process {
 
 # * goes on variable not on type
                # (char*[ const])
-               if ($line =~ m{\($NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)\)}) {
-                       my ($from, $to) = ($1, $1);
+               while ($line =~ m{(\($NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)\))}g) {
+                       #print "AA<$1>\n";
+                       my ($from, $to) = ($2, $2);
 
                        # Should start with a space.
                        $to =~ s/^(\S)/ $1/;
@@ -2184,8 +2226,10 @@ sub process {
                                ERROR("POINTER_LOCATION",
                                      "\"(foo$from)\" should be \"(foo$to)\"\n" .  $herecurr);
                        }
-               } elsif ($line =~ m{\b$NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)($Ident)}) {
-                       my ($from, $to, $ident) = ($1, $1, $2);
+               }
+               while ($line =~ m{(\b$NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)($Ident))}g) {
+                       #print "BB<$1>\n";
+                       my ($from, $to, $ident) = ($2, $2, $3);
 
                        # Should start with a space.
                        $to =~ s/^(\S)/ $1/;
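
With the two pointer-location checks above turned into loops over every match on the line, a hypothetical call such as the following now has both casts reported rather than only the first:

	/* Hypothetical example: both casts are flagged, i.e. "(void*)" should
	 * be "(void *)" and "(const void*)" should be "(const void *)". */
	memcpy((void*)dst, (const void*)src, len);
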
@@ -2568,7 +2612,7 @@ sub process {
                        # Flatten any parentheses
                        $value =~ s/\(/ \(/g;
                        $value =~ s/\)/\) /g;
-                       while ($value =~ s/\[[^\{\}]*\]/1/ ||
+                       while ($value =~ s/\[[^\[\]]*\]/1/ ||
                               $value !~ /(?:$Ident|-?$Constant)\s*
                                             $Compare\s*
                                             (?:$Ident|-?$Constant)/x &&
@@ -2593,28 +2637,6 @@ sub process {
                        }
                }
 
-# typecasts on min/max could be min_t/max_t
-               if ($line =~ /^\+(?:.*?)\b(min|max)\s*\($Typecast{0,1}($LvalOrFunc)\s*,\s*$Typecast{0,1}($LvalOrFunc)\s*\)/) {
-                       if (defined $2 || defined $8) {
-                               my $call = $1;
-                               my $cast1 = deparenthesize($2);
-                               my $arg1 = $3;
-                               my $cast2 = deparenthesize($8);
-                               my $arg2 = $9;
-                               my $cast;
-
-                               if ($cast1 ne "" && $cast2 ne "") {
-                                       $cast = "$cast1 or $cast2";
-                               } elsif ($cast1 ne "") {
-                                       $cast = $cast1;
-                               } else {
-                                       $cast = $cast2;
-                               }
-                               WARN("MINMAX",
-                                    "$call() should probably be ${call}_t($cast, $arg1, $arg2)\n" . $herecurr);
-                       }
-               }
-
 # Need a space before open parenthesis after if, while etc
                if ($line=~/\b(if|while|for|switch)\(/) {
                        ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr);
@@ -2623,6 +2645,9 @@ sub process {
 # Check for illegal assignment in if conditional -- and check for trailing
 # statements after the conditional.
                if ($line =~ /do\s*(?!{)/) {
+                       ($stat, $cond, $line_nr_next, $remain_next, $off_next) =
+                               ctx_statement_block($linenr, $realcnt, 0)
+                                       if (!defined $stat);
                        my ($stat_next) = ctx_statement_block($line_nr_next,
                                                $remain_next, $off_next);
                        $stat_next =~ s/\n./\n /g;
@@ -2778,47 +2803,13 @@ sub process {
                        my $cnt = $realcnt;
                        my ($off, $dstat, $dcond, $rest);
                        my $ctx = '';
-
-                       my $args = defined($1);
-
-                       # Find the end of the macro and limit our statement
-                       # search to that.
-                       while ($cnt > 0 && defined $lines[$ln - 1] &&
-                               $lines[$ln - 1] =~ /^(?:-|..*\\$)/)
-                       {
-                               $ctx .= $rawlines[$ln - 1] . "\n";
-                               $cnt-- if ($lines[$ln - 1] !~ /^-/);
-                               $ln++;
-                       }
-                       $ctx .= $rawlines[$ln - 1];
-
                        ($dstat, $dcond, $ln, $cnt, $off) =
-                               ctx_statement_block($linenr, $ln - $linenr + 1, 0);
+                               ctx_statement_block($linenr, $realcnt, 0);
+                       $ctx = $dstat;
                        #print "dstat<$dstat> dcond<$dcond> cnt<$cnt> off<$off>\n";
                        #print "LINE<$lines[$ln-1]> len<" . length($lines[$ln-1]) . "\n";
 
-                       # Extract the remainder of the define (if any) and
-                       # rip off surrounding spaces, and trailing \'s.
-                       $rest = '';
-                       while ($off != 0 || ($cnt > 0 && $rest =~ /\\\s*$/)) {
-                               #print "ADDING cnt<$cnt> $off <" . substr($lines[$ln - 1], $off) . "> rest<$rest>\n";
-                               if ($off != 0 || $lines[$ln - 1] !~ /^-/) {
-                                       $rest .= substr($lines[$ln - 1], $off) . "\n";
-                                       $cnt--;
-                               }
-                               $ln++;
-                               $off = 0;
-                       }
-                       $rest =~ s/\\\n.//g;
-                       $rest =~ s/^\s*//s;
-                       $rest =~ s/\s*$//s;
-
-                       # Clean up the original statement.
-                       if ($args) {
-                               substr($dstat, 0, length($dcond), '');
-                       } else {
-                               $dstat =~ s/^.\s*\#\s*define\s+$Ident\s*//;
-                       }
+                       $dstat =~ s/^.\s*\#\s*define\s+$Ident(?:\([^\)]*\))?\s*//;
                        $dstat =~ s/$;//g;
                        $dstat =~ s/\\\n.//g;
                        $dstat =~ s/^\s*//s;
@@ -2827,7 +2818,7 @@ sub process {
                        # Flatten any parentheses and braces
                        while ($dstat =~ s/\([^\(\)]*\)/1/ ||
                               $dstat =~ s/\{[^\{\}]*\}/1/ ||
-                              $dstat =~ s/\[[^\{\}]*\]/1/)
+                              $dstat =~ s/\[[^\[\]]*\]/1/)
                        {
                        }
 
@@ -2844,23 +2835,32 @@ sub process {
                                ^\"|\"$
                        }x;
                        #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n";
-                       if ($rest ne '' && $rest ne ',') {
-                               if ($rest !~ /while\s*\(/ &&
-                                   $dstat !~ /$exceptions/)
-                               {
-                                       ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE",
-                                             "Macros with multiple statements should be enclosed in a do - while loop\n" . "$here\n$ctx\n");
+                       if ($dstat ne '' &&
+                           $dstat !~ /^(?:$Ident|-?$Constant),$/ &&                    # 10, // foo(),
+                           $dstat !~ /^(?:$Ident|-?$Constant);$/ &&                    # foo();
+                           $dstat !~ /^(?:$Ident|-?$Constant)$/ &&                     # 10 // foo()
+                           $dstat !~ /$exceptions/ &&
+                           $dstat !~ /^\.$Ident\s*=/ &&                                # .foo =
+                           $dstat !~ /^do\s*$Constant\s*while\s*$Constant;?$/ &&       # do {...} while (...); // do {...} while (...)
+                           $dstat !~ /^for\s*$Constant$/ &&                            # for (...)
+                           $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ &&   # for (...) bar()
+                           $dstat !~ /^do\s*{/ &&                                      # do {...
+                           $dstat !~ /^\({/)                                           # ({...
+                       {
+                               $ctx =~ s/\n*$//;
+                               my $herectx = $here . "\n";
+                               my $cnt = statement_rawlines($ctx);
+
+                               for (my $n = 0; $n < $cnt; $n++) {
+                                       $herectx .= raw_line($linenr, $n) . "\n";
                                }
 
-                       } elsif ($ctx !~ /;/) {
-                               if ($dstat ne '' &&
-                                   $dstat !~ /^(?:$Ident|-?$Constant)$/ &&
-                                   $dstat !~ /$exceptions/ &&
-                                   $dstat !~ /^\.$Ident\s*=/ &&
-                                   $dstat =~ /$Operators/)
-                               {
+                               if ($dstat =~ /;/) {
+                                       ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE",
+                                             "Macros with multiple statements should be enclosed in a do - while loop\n" . "$herectx");
+                               } else {
                                        ERROR("COMPLEX_MACRO",
-                                             "Macros with complex values should be enclosed in parenthesis\n" . "$here\n$ctx\n");
+                                             "Macros with complex values should be enclosed in parenthesis\n" . "$herectx");
                                }
                        }
                }
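
As an illustration of what the reworked macro checks still catch, two hypothetical defines: the first expands to two statements and draws the MULTISTATEMENT_MACRO_USE_DO_WHILE error, the second has a complex unparenthesised value and draws the COMPLEX_MACRO error.

	/* Hypothetical examples only. */
	#define stop_and_log(dev)	disable_irq((dev)->irq); pr_warn("stopped\n")

	#define STATUS_MASK		1 << 4 | 1 << 7
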
@@ -3111,6 +3111,12 @@ sub process {
                             "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr);
                }
 
+# Check for __attribute__ format(printf, prefer __printf
+               if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) {
+                       WARN("PREFER_PRINTF",
+                            "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr);
+               }
+
 # check for sizeof(&)
                if ($line =~ /\bsizeof\s*\(\s*\&/) {
                        WARN("SIZEOF_ADDRESS",
@@ -3123,6 +3129,46 @@ sub process {
                             "Avoid line continuations in quoted strings\n" . $herecurr);
                }
 
+# Check for misused memsets
+               if (defined $stat &&
+                   $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*$FuncArg\s*\)/s) {
+
+                       my $ms_addr = $2;
+                       my $ms_val = $8;
+                       my $ms_size = $14;
+
+                       if ($ms_size =~ /^(0x|)0$/i) {
+                               ERROR("MEMSET",
+                                     "memset to 0's uses 0 as the 2nd argument, not the 3rd\n" . "$here\n$stat\n");
+                       } elsif ($ms_size =~ /^(0x|)1$/i) {
+                               WARN("MEMSET",
+                                    "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . "$here\n$stat\n");
+                       }
+               }
+
+# typecasts on min/max could be min_t/max_t
+               if (defined $stat &&
+                   $stat =~ /^\+(?:.*?)\b(min|max)\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\)/) {
+                       if (defined $2 || defined $8) {
+                               my $call = $1;
+                               my $cast1 = deparenthesize($2);
+                               my $arg1 = $3;
+                               my $cast2 = deparenthesize($8);
+                               my $arg2 = $9;
+                               my $cast;
+
+                               if ($cast1 ne "" && $cast2 ne "") {
+                                       $cast = "$cast1 or $cast2";
+                               } elsif ($cast1 ne "") {
+                                       $cast = $cast1;
+                               } else {
+                                       $cast = $cast2;
+                               }
+                               WARN("MINMAX",
+                                    "$call() should probably be ${call}_t($cast, $arg1, $arg2)\n" . "$here\n$stat\n");
+                       }
+               }
+
 # check for new externs in .c files.
                if ($realfile =~ /\.c$/ && defined $stat &&
                    $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s)
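
Hypothetical statements of the kind the two new statement-based checks above (memset and min/max) report:

	/* Hypothetical examples only. */
	char buf[64], dst[64];
	size_t n, len = 16, avail = 32;

	memset(buf, sizeof(buf), 0);	/* size and 0 swapped: 0 belongs in the 2nd argument */
	memset(dst, len, 1);		/* single-byte memset, probably swapped 2nd/3rd arguments */
	n = min((size_t)len, avail);	/* cast on an argument: min_t(size_t, len, avail) suggested */
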
@@ -3294,12 +3340,6 @@ sub process {
                        WARN("EXPORTED_WORLD_WRITABLE",
                             "Exporting world writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr);
                }
-
-               # Check for memset with swapped arguments
-               if ($line =~ /memset.*\,(\ |)(0x|)0(\ |0|)\);/) {
-                       ERROR("MEMSET",
-                             "memset size is 3rd argument, not the second.\n" . $herecurr);
-               }
        }
 
        # If we have no input at all, then there is nothing to report on
index 4594f334105110fefba64883186a2cd546641a82..f32a04c4c5bc1c56a88e0f7fba3cb521aa997e52 100755 (executable)
@@ -95,7 +95,7 @@ my %VCS_cmds_git = (
     "execute_cmd" => \&git_execute_cmd,
     "available" => '(which("git") ne "") && (-d ".git")',
     "find_signers_cmd" =>
-       "git log --no-color --since=\$email_git_since " .
+       "git log --no-color --follow --since=\$email_git_since " .
            '--format="GitCommit: %H%n' .
                      'GitAuthor: %an <%ae>%n' .
                      'GitDate: %aD%n' .
diff --git a/scripts/kconfig/merge_config.sh b/scripts/kconfig/merge_config.sh
new file mode 100644 (file)
index 0000000..ceadf0e
--- /dev/null
@@ -0,0 +1,117 @@
+#!/bin/sh
+#  merge_config.sh - Takes a list of config fragment values, and merges
+#  them one by one. Provides warnings on overridden values, and specified
+#  values that did not make it to the resulting .config file (due to missed
+#  dependencies or config symbol removal).
+#
+#  Portions reused from kconf_check and generate_cfg:
+#  http://git.yoctoproject.org/cgit/cgit.cgi/yocto-kernel-tools/tree/tools/kconf_check
+#  http://git.yoctoproject.org/cgit/cgit.cgi/yocto-kernel-tools/tree/tools/generate_cfg
+#
+#  Copyright (c) 2009-2010 Wind River Systems, Inc.
+#  Copyright 2011 Linaro
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License version 2 as
+#  published by the Free Software Foundation.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#  See the GNU General Public License for more details.
+
+clean_up() {
+       rm -f $TMP_FILE
+       exit
+}
+trap clean_up HUP INT TERM
+
+usage() {
+       echo "Usage: $0 [OPTIONS] [CONFIG [...]]"
+       echo "  -h    display this help text"
+       echo "  -m    only merge the fragments, do not execute the make command"
+       echo "  -n    use allnoconfig instead of alldefconfig"
+}
+
+MAKE=true
+ALLTARGET=alldefconfig
+
+while true; do
+       case $1 in
+       "-n")
+               ALLTARGET=allnoconfig
+               shift
+               continue
+               ;;
+       "-m")
+               MAKE=false
+               shift
+               continue
+               ;;
+       "-h")
+               usage
+               exit
+               ;;
+       *)
+               break
+               ;;
+       esac
+done
+
+
+
+MERGE_LIST=$*
+SED_CONFIG_EXP="s/^\(# \)\{0,1\}\(CONFIG_[a-zA-Z0-9_]*\)[= ].*/\2/p"
+TMP_FILE=$(mktemp ./.tmp.config.XXXXXXXXXX)
+
+# Merge files, printing warnings on overridden values
+for MERGE_FILE in $MERGE_LIST ; do
+       echo "Merging $MERGE_FILE"
+       CFG_LIST=$(sed -n "$SED_CONFIG_EXP" $MERGE_FILE)
+
+       for CFG in $CFG_LIST ; do
+               grep -q -w $CFG $TMP_FILE
+               if [ $? -eq 0 ] ; then
+                       PREV_VAL=$(grep -w $CFG $TMP_FILE)
+                       NEW_VAL=$(grep -w $CFG $MERGE_FILE)
+                       if [ "x$PREV_VAL" != "x$NEW_VAL" ] ; then
+                       echo Value of $CFG is redefined by fragment $MERGE_FILE:
+                       echo Previous  value: $PREV_VAL
+                       echo New value:       $NEW_VAL
+                       echo
+                       fi
+                       sed -i "/$CFG[ =]/d" $TMP_FILE
+               fi
+       done
+       cat $MERGE_FILE >> $TMP_FILE
+done
+
+if [ "$MAKE" = "false" ]; then
+       cp $TMP_FILE .config
+       echo "#"
+       echo "# merged configuration written to .config (needs make)"
+       echo "#"
+       clean_up
+       exit
+fi
+
+# Use the merged file as the starting point for:
+# alldefconfig: Fills in any missing symbols with Kconfig default
+# allnoconfig: Fills in any missing symbols with # CONFIG_* is not set
+make KCONFIG_ALLCONFIG=$TMP_FILE $ALLTARGET
+
+
+# Check all specified config values took (might have missed-dependency issues)
+for CFG in $(sed -n "$SED_CONFIG_EXP" $TMP_FILE); do
+
+       REQUESTED_VAL=$(grep -w -e "$CFG" $TMP_FILE)
+       ACTUAL_VAL=$(grep -w -e "$CFG" .config)
+       if [ "x$REQUESTED_VAL" != "x$ACTUAL_VAL" ] ; then
+               echo "Value requested for $CFG not in final .config"
+               echo "Requested value:  $REQUESTED_VAL"
+               echo "Actual value:     $ACTUAL_VAL"
+               echo ""
+       fi
+done
+
+clean_up
index 8eb6c489fb152c006d1cd781800cfe7c73059cd8..77f952762426661615d3d0180ab2728010e6069f 100644 (file)
@@ -17,8 +17,8 @@ titan:~> perf list
   kmem:kmem_cache_alloc_node               [Tracepoint event]
   kmem:kfree                               [Tracepoint event]
   kmem:kmem_cache_free                     [Tracepoint event]
-  kmem:mm_page_free_direct                 [Tracepoint event]
-  kmem:mm_pagevec_free                     [Tracepoint event]
+  kmem:mm_page_free                        [Tracepoint event]
+  kmem:mm_page_free_batched                [Tracepoint event]
   kmem:mm_page_alloc                       [Tracepoint event]
   kmem:mm_page_alloc_zone_locked           [Tracepoint event]
   kmem:mm_page_pcpu_drain                  [Tracepoint event]
@@ -29,15 +29,15 @@ measured. For example the page alloc/free properties of a 'hackbench
 run' are:
 
  titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc
- -e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10
+ -e kmem:mm_page_free_batched -e kmem:mm_page_free ./hackbench 10
  Time: 0.575
 
  Performance counter stats for './hackbench 10':
 
           13857  kmem:mm_page_pcpu_drain
           27576  kmem:mm_page_alloc
-           6025  kmem:mm_pagevec_free
-          20934  kmem:mm_page_free_direct
+           6025  kmem:mm_page_free_batched
+          20934  kmem:mm_page_free
 
     0.613972165  seconds time elapsed
 
@@ -45,8 +45,8 @@ You can observe the statistical properties as well, by using the
 'repeat the workload N times' feature of perf stat:
 
  titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e
-   kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
-   kmem:mm_page_free_direct ./hackbench 10
+   kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+   kmem:mm_page_free ./hackbench 10
  Time: 0.627
  Time: 0.644
  Time: 0.564
@@ -57,8 +57,8 @@ You can observe the statistical properties as well, by using the
 
           12920  kmem:mm_page_pcpu_drain    ( +-   3.359% )
           25035  kmem:mm_page_alloc         ( +-   3.783% )
-           6104  kmem:mm_pagevec_free       ( +-   0.934% )
-          18376  kmem:mm_page_free_direct   ( +-   4.941% )
+           6104  kmem:mm_page_free_batched  ( +-   0.934% )
+          18376  kmem:mm_page_free         ( +-   4.941% )
 
     0.643954516  seconds time elapsed   ( +-   2.363% )
 
@@ -158,15 +158,15 @@ Or you can observe the whole system's page allocations for 10
 seconds:
 
 titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e
-kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
-kmem:mm_page_free_direct sleep 10
+kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+kmem:mm_page_free sleep 10
 
  Performance counter stats for 'sleep 10':
 
          171585  kmem:mm_page_pcpu_drain
          322114  kmem:mm_page_alloc
-          73623  kmem:mm_pagevec_free
-         254115  kmem:mm_page_free_direct
+          73623  kmem:mm_page_free_batched
+         254115  kmem:mm_page_free
 
    10.000591410  seconds time elapsed
 
@@ -174,15 +174,15 @@ Or observe how fluctuating the page allocations are, via statistical
 analysis done over ten 1-second intervals:
 
  titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e
-   kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
-   kmem:mm_page_free_direct sleep 1
+   kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+   kmem:mm_page_free sleep 1
 
  Performance counter stats for 'sleep 1' (10 runs):
 
           17254  kmem:mm_page_pcpu_drain    ( +-   3.709% )
           34394  kmem:mm_page_alloc         ( +-   4.617% )
-           7509  kmem:mm_pagevec_free       ( +-   4.820% )
-          25653  kmem:mm_page_free_direct   ( +-   3.672% )
+           7509  kmem:mm_page_free_batched  ( +-   4.820% )
+          25653  kmem:mm_page_free         ( +-   3.672% )
 
     1.058135029  seconds time elapsed   ( +-   3.089% )