Merge master.kernel.org:/pub/scm/linux/kernel/git/mchehab/v4l-dvb
author    Linus Torvalds <torvalds@g5.osdl.org>
Wed, 28 Jun 2006 01:22:13 +0000 (18:22 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Wed, 28 Jun 2006 01:22:13 +0000 (18:22 -0700)
* master.kernel.org:/pub/scm/linux/kernel/git/mchehab/v4l-dvb: (26 commits)
  V4L/DVB (4263): Fix warning when compiling on 64 bit machines
  V4L/DVB (4261): Included required header for in-kernel compilation
  V4L/DVB (4260): Stradis.c: make 2 functions static
  V4L/DVB (4259): Pass an explicit log prefix to cx2341x_log_status
  V4L/DVB (4257): Fix 64-bit compile warnings.
  V4L/DVB (4255): Tda9887 default TOP value is 0x10
  V4L/DVB (4254): Remove obsoleted tuner_debug option.
  V4L/DVB (4253): IVTV VBI format description too long.
  V4L/DVB (4252): Remove duplicate 'tda9887' in info messages.
  V4L/DVB (4245): Reduce the amount of pvrusb2-sourced noise going into the system log
  V4L/DVB (4244): Implement use of cx2341x module in pvrusb2 driver
  V4L/DVB (4243): Exploit new V4L control features in pvrusb2
  V4L/DVB (4242): Don't suspend encoder when changing its attributes (in pvrusb2)
  V4L/DVB (4241): Fix faulty encoder error recovery in pvrusb2
  V4L/DVB (4240): Various V4L control enhancements in pvrusb2
  V4L/DVB (4239): Handle boolean controls in pvrusb2
  V4L/DVB (4238): Make sure flags field is initialized when querying a control in pvrusb2
  V4L/DVB (4237): Move LOG_STATUS bracketing to a different part of the pvrusb2 driver
  V4L/DVB (4236): Rearrange things in pvrusb2 driver in preparation for using cx2341x module
  V4L/DVB (4235): Increase the maximum number of controls that pvrusb2-sysfs.c can handle.
  ...

258 files changed:
Documentation/RCU/torture.txt
Documentation/kernel-parameters.txt
Documentation/pi-futex.txt [new file with mode: 0644]
Documentation/robust-futexes.txt
Documentation/rt-mutex-design.txt [new file with mode: 0644]
Documentation/rt-mutex.txt [new file with mode: 0644]
arch/alpha/kernel/setup.c
arch/arm/kernel/setup.c
arch/i386/Kconfig
arch/i386/kernel/asm-offsets.c
arch/i386/kernel/cpu/amd.c
arch/i386/kernel/cpu/common.c
arch/i386/kernel/cpu/intel_cacheinfo.c
arch/i386/kernel/cpu/proc.c
arch/i386/kernel/cpuid.c
arch/i386/kernel/entry.S
arch/i386/kernel/irq.c
arch/i386/kernel/msr.c
arch/i386/kernel/scx200.c
arch/i386/kernel/signal.c
arch/i386/kernel/smpboot.c
arch/i386/kernel/sysenter.c
arch/i386/kernel/topology.c
arch/i386/kernel/vsyscall-sysenter.S
arch/i386/kernel/vsyscall.lds.S
arch/i386/mach-voyager/setup.c
arch/i386/mm/init.c
arch/i386/mm/pageattr.c
arch/ia64/Kconfig
arch/ia64/kernel/palinfo.c
arch/ia64/kernel/salinfo.c
arch/ia64/kernel/topology.c
arch/ia64/mm/discontig.c
arch/ia64/mm/init.c
arch/ia64/sn/kernel/irq.c
arch/m32r/kernel/setup.c
arch/mips/kernel/smp.c
arch/mips/kernel/smtc.c
arch/parisc/kernel/topology.c
arch/powerpc/kernel/setup_32.c
arch/powerpc/kernel/sysfs.c
arch/powerpc/mm/init_64.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/numa.c
arch/powerpc/platforms/cell/spufs/switch.c
arch/powerpc/platforms/powermac/pfunc_core.c
arch/powerpc/platforms/pseries/eeh_event.c
arch/powerpc/sysdev/mmio_nvram.c
arch/ppc/kernel/setup.c
arch/s390/appldata/appldata_base.c
arch/s390/kernel/smp.c
arch/sh/kernel/setup.c
arch/sh64/kernel/setup.c
arch/sparc64/kernel/setup.c
arch/sparc64/mm/init.c
arch/x86_64/kernel/entry.S
arch/x86_64/kernel/irq.c
arch/x86_64/kernel/mce.c
arch/x86_64/kernel/smp.c
arch/x86_64/kernel/smpboot.c
arch/x86_64/mm/init.c
arch/xtensa/kernel/time.c
arch/xtensa/kernel/traps.c
block/ll_rw_blk.c
drivers/acpi/Kconfig
drivers/acpi/acpi_memhotplug.c
drivers/acpi/numa.c
drivers/atm/firestream.c
drivers/base/cpu.c
drivers/base/dmapool.c
drivers/base/memory.c
drivers/base/node.c
drivers/base/topology.c
drivers/char/Kconfig
drivers/char/Makefile
drivers/char/agp/sgi-agp.c
drivers/char/drm/drm_memory_debug.h
drivers/char/drm/via_dmablit.c
drivers/char/epca.c
drivers/char/hvcs.c
drivers/char/ipmi/ipmi_msghandler.c
drivers/char/ipmi/ipmi_si_intf.c
drivers/char/moxa.c
drivers/char/nsc_gpio.c [new file with mode: 0644]
drivers/char/pc8736x_gpio.c [new file with mode: 0644]
drivers/char/scx200_gpio.c
drivers/char/specialix.c
drivers/char/stallion.c
drivers/char/sx.c
drivers/char/tty_io.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/cpufreq_stats.c
drivers/input/input.c
drivers/isdn/gigaset/common.c
drivers/isdn/i4l/isdn_tty.c
drivers/leds/led-core.c
drivers/leds/led-triggers.c
drivers/message/fusion/mptfc.c
drivers/message/fusion/mptsas.c
drivers/message/i2o/iop.c
drivers/misc/ibmasm/module.c
drivers/net/fs_enet/fs_enet-mii.c
drivers/net/wireless/ipw2200.c
drivers/pcmcia/m8xx_pcmcia.c
drivers/rapidio/rio-access.c
drivers/rtc/class.c
drivers/rtc/rtc-ds1553.c
drivers/rtc/rtc-sa1100.c
drivers/rtc/rtc-vr41xx.c
drivers/s390/block/dasd_eer.c
drivers/scsi/libata-core.c
drivers/scsi/libata-scsi.c
drivers/sn/ioc3.c
drivers/video/au1100fb.c
drivers/video/backlight/hp680_bl.c
fs/buffer.c
fs/jbd/journal.c
fs/nfs/direct.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/pagelist.c
fs/nfs/read.c
fs/nfs/write.c
fs/nfsd/nfs4state.c
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/cluster/tcp.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmlock.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlmglue.c
fs/ocfs2/journal.c
fs/ocfs2/vote.c
fs/proc/task_mmu.c
fs/ufs/inode.c
fs/xfs/xfs_mount.c
include/asm-alpha/core_t2.h
include/asm-generic/bug.h
include/asm-i386/cpu.h
include/asm-i386/elf.h
include/asm-i386/fixmap.h
include/asm-i386/mmu.h
include/asm-i386/node.h [deleted file]
include/asm-i386/page.h
include/asm-i386/processor.h
include/asm-i386/thread_info.h
include/asm-i386/topology.h
include/asm-i386/unwind.h
include/asm-ia64/nodedata.h
include/asm-ia64/topology.h
include/asm-powerpc/topology.h
include/asm-sparc64/topology.h
include/asm-x86_64/hw_irq.h
include/asm-x86_64/topology.h
include/linux/acpi.h
include/linux/buffer_head.h
include/linux/cpu.h
include/linux/dmaengine.h
include/linux/futex.h
include/linux/init_task.h
include/linux/ioport.h
include/linux/ipmi.h
include/linux/list.h
include/linux/memory_hotplug.h
include/linux/mm.h
include/linux/node.h
include/linux/nsc_gpio.h [new file with mode: 0644]
include/linux/plist.h [new file with mode: 0644]
include/linux/poison.h [new file with mode: 0644]
include/linux/rcupdate.h
include/linux/rtmutex.h [new file with mode: 0644]
include/linux/sched.h
include/linux/scx200.h
include/linux/scx200_gpio.h
include/linux/swap.h
include/linux/syscalls.h
include/linux/sysctl.h
include/linux/topology.h
init/Kconfig
init/main.c
kernel/Makefile
kernel/acct.c
kernel/audit.c
kernel/auditsc.c
kernel/cpu.c
kernel/exit.c
kernel/fork.c
kernel/futex.c
kernel/futex_compat.c
kernel/hrtimer.c
kernel/mutex-debug.c
kernel/power/Kconfig
kernel/profile.c
kernel/rcupdate.c
kernel/rcutorture.c
kernel/resource.c
kernel/rtmutex-debug.c [new file with mode: 0644]
kernel/rtmutex-debug.h [new file with mode: 0644]
kernel/rtmutex-tester.c [new file with mode: 0644]
kernel/rtmutex.c [new file with mode: 0644]
kernel/rtmutex.h [new file with mode: 0644]
kernel/rtmutex_common.h [new file with mode: 0644]
kernel/sched.c
kernel/softirq.c
kernel/softlockup.c
kernel/sysctl.c
kernel/timer.c
kernel/workqueue.c
lib/Kconfig
lib/Kconfig.debug
lib/Makefile
lib/plist.c [new file with mode: 0644]
lib/zlib_inflate/inffast.c
lib/zlib_inflate/inftrees.c
mm/Kconfig
mm/filemap.c
mm/memory_hotplug.c
mm/page-writeback.c
mm/page_alloc.c
mm/slab.c
mm/sparse.c
mm/vmscan.c
net/ipv6/route.c
net/sunrpc/auth_gss/gss_krb5_seal.c
net/tipc/bcast.c
net/tipc/bearer.c
net/tipc/config.c
net/tipc/dbg.c
net/tipc/handler.c
net/tipc/name_table.c
net/tipc/net.c
net/tipc/node.c
net/tipc/port.c
net/tipc/ref.c
net/tipc/subscr.c
net/tipc/user_reg.c
scripts/Kbuild.include
scripts/Makefile.build
scripts/Makefile.host
scripts/Makefile.lib
scripts/Makefile.modpost
scripts/rt-tester/check-all.sh [new file with mode: 0644]
scripts/rt-tester/rt-tester.py [new file with mode: 0644]
scripts/rt-tester/t2-l1-2rt-sameprio.tst [new file with mode: 0644]
scripts/rt-tester/t2-l1-pi.tst [new file with mode: 0644]
scripts/rt-tester/t2-l1-signal.tst [new file with mode: 0644]
scripts/rt-tester/t2-l2-2rt-deadlock.tst [new file with mode: 0644]
scripts/rt-tester/t3-l1-pi-1rt.tst [new file with mode: 0644]
scripts/rt-tester/t3-l1-pi-2rt.tst [new file with mode: 0644]
scripts/rt-tester/t3-l1-pi-3rt.tst [new file with mode: 0644]
scripts/rt-tester/t3-l1-pi-signal.tst [new file with mode: 0644]
scripts/rt-tester/t3-l1-pi-steal.tst [new file with mode: 0644]
scripts/rt-tester/t3-l2-pi.tst [new file with mode: 0644]
scripts/rt-tester/t4-l2-pi-deboost.tst [new file with mode: 0644]
scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst [new file with mode: 0644]
scripts/rt-tester/t5-l4-pi-boost-deboost.tst [new file with mode: 0644]
security/keys/key.c
security/selinux/hooks.c
sound/oss/via82cxxx_audio.c

index e4c38152f7f799b94a70493f90dbf36b830e9cde..a4948591607d0e1d7b653ce1f66c08e4831cbad1 100644 (file)
@@ -7,7 +7,7 @@ The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
 implementations.  It creates an rcutorture kernel module that can
 be loaded to run a torture test.  The test periodically outputs
 status messages via printk(), which can be examined via the dmesg
-command (perhaps grepping for "rcutorture").  The test is started
+command (perhaps grepping for "torture").  The test is started
 when the module is loaded, and stops when the module is unloaded.
 
 However, actually setting this config option to "y" results in the system
@@ -35,6 +35,19 @@ stat_interval        The number of seconds between output of torture
                be printed -only- when the module is unloaded, and this
                is the default.
 
+shuffle_interval
+               The number of seconds to keep the test threads affinitied
+               to a particular subset of the CPUs.  Used in conjunction
+               with test_no_idle_hz.
+
+test_no_idle_hz        Whether or not to test the ability of RCU to operate in
+               a kernel that disables the scheduling-clock interrupt to
+               idle CPUs.  Boolean parameter, "1" to test, "0" otherwise.
+
+torture_type   The type of RCU to test: "rcu" for the rcu_read_lock()
+               API, "rcu_bh" for the rcu_read_lock_bh() API, and "srcu"
+               for the "srcu_read_lock()" API.
+
 verbose                Enable debug printk()s.  Default is disabled.
 
 
@@ -42,14 +55,14 @@ OUTPUT
 
 The statistics output is as follows:
 
-       rcutorture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
-       rcutorture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915
-       rcutorture: Reader Pipe:  1466408 9747 0 0 0 0 0 0 0 0 0
-       rcutorture: Reader Batch:  1464477 11678 0 0 0 0 0 0 0 0
-       rcutorture: Free-Block Circulation:  1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
-       rcutorture: --- End of test
+       rcu-torture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
+       rcu-torture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915
+       rcu-torture: Reader Pipe:  1466408 9747 0 0 0 0 0 0 0 0 0
+       rcu-torture: Reader Batch:  1464477 11678 0 0 0 0 0 0 0 0
+       rcu-torture: Free-Block Circulation:  1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
+       rcu-torture: --- End of test
 
-The command "dmesg | grep rcutorture:" will extract this information on
+The command "dmesg | grep torture:" will extract this information on
 most systems.  On more esoteric configurations, it may be necessary to
 use other commands to access the output of the printk()s used by
 the RCU torture test.  The printk()s use KERN_ALERT, so they should
@@ -115,8 +128,9 @@ The following script may be used to torture RCU:
        modprobe rcutorture
        sleep 100
        rmmod rcutorture
-       dmesg | grep rcutorture:
+       dmesg | grep torture:
 
 The output can be manually inspected for the error flag of "!!!".
 One could of course create a more elaborate script that automatically
-checked for such errors.
+checked for such errors.  The "rmmod" command forces a "SUCCESS" or
+"FAILURE" indication to be printk()ed.
index 2e352a605fcfef3a6dcb83d4c875635f9f89a71a..0d189c93eeaf94b283d22fd39d4dea8274d2d3cc 100644 (file)
@@ -1669,6 +1669,10 @@ running once the system is up.
        usbhid.mousepoll=
                        [USBHID] The interval which mice are to be polled at.
 
+       vdso=           [IA-32]
+                       vdso=1: enable VDSO (default)
+                       vdso=0: disable VDSO mapping
+
        video=          [FB] Frame buffer configuration
                        See Documentation/fb/modedb.txt.
 
diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt
new file mode 100644 (file)
index 0000000..5d61dac
--- /dev/null
@@ -0,0 +1,121 @@
+Lightweight PI-futexes
+----------------------
+
+We are calling them lightweight for 3 reasons:
+
+ - in the user-space fastpath a PI-enabled futex involves no kernel work
+   (or any other PI complexity) at all. No registration, no extra kernel
+   calls - just pure fast atomic ops in userspace.
+
+ - even in the slowpath, the system call and scheduling pattern is very
+   similar to normal futexes.
+
+ - the in-kernel PI implementation is streamlined around the mutex
+   abstraction, with strict rules that keep the implementation
+   relatively simple: only a single owner may own a lock (i.e. no
+   read-write lock support), only the owner may unlock a lock, no
+   recursive locking, etc.
+
+Priority Inheritance - why?
+---------------------------
+
+The short reply: user-space PI helps achieve/improve determinism for
+user-space applications. In the best case, it can help achieve
+determinism and well-bound latencies. Even in the worst-case, PI will
+improve the statistical distribution of locking related application
+delays.
+
+The longer reply:
+-----------------
+
+Firstly, sharing locks between multiple tasks is a common programming
+technique that often cannot be replaced with lockless algorithms. As we
+can see in the kernel [which is a quite complex program in itself],
+lockless structures are rather the exception than the norm - the current
+ratio of lockless vs. lock-based code for shared data structures is somewhere
+between 1:10 and 1:100. Lockless is hard, and the complexity of lockless
+algorithms often endangers the ability to do robust reviews of said code.
+I.e. critical RT apps often choose lock structures to protect critical
+data structures, instead of lockless algorithms. Furthermore, there are
+cases (like shared hardware, or other resource limits) where lockless
+access is mathematically impossible.
+
+Media players (such as Jack) are an example of reasonable application
+design with multiple tasks (with multiple priority levels) sharing
+short-held locks: for example, a highprio audio playback thread is
+combined with medium-prio construct-audio-data threads and low-prio
+display-colory-stuff threads. Add video and decoding to the mix and
+we've got even more priority levels.
+
+So once we accept that synchronization objects (locks) are an
+unavoidable fact of life, and once we accept that multi-task userspace
+apps have a very fair expectation of being able to use locks, we've got
+to think about how to offer the option of a deterministic locking
+implementation to user-space.
+
+Most of the technical counter-arguments against doing priority
+inheritance only apply to kernel-space locks. But user-space locks are
+different: there we cannot disable interrupts or make the task
+non-preemptible in a critical section, so the 'use spinlocks' argument
+does not apply (user-space spinlocks have the same priority inversion
+problems as other user-space locking constructs). Fact is, pretty much
+the only technique that currently enables good determinism for userspace
+locks (such as futex-based pthread mutexes) is priority inheritance:
+
+Currently (without PI), if a high-prio and a low-prio task share a lock
+[this is a quite common scenario for most non-trivial RT applications],
+even if all critical sections are coded carefully to be deterministic
+(i.e. all critical sections are short in duration and only execute a
+limited number of instructions), the kernel cannot guarantee any
+deterministic execution of the high-prio task: any medium-priority task
+could preempt the low-prio task while it holds the shared lock and
+executes the critical section, and could delay it indefinitely.
+
+Implementation:
+---------------
+
+As mentioned before, the userspace fastpath of PI-enabled pthread
+mutexes involves no kernel work at all - they behave quite similarly to
+normal futex-based locks: a 0 value means unlocked, and a value==TID
+means locked. (This is the same method as used by list-based robust
+futexes.) Userspace uses atomic ops to lock/unlock these mutexes without
+entering the kernel.
+
+To handle the slowpath, we have added two new futex ops:
+
+  FUTEX_LOCK_PI
+  FUTEX_UNLOCK_PI
+
+If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to
+TID fails], then FUTEX_LOCK_PI is called. The kernel does all the
+remaining work: if there is no futex-queue attached to the futex address
+yet then the code looks up the task that owns the futex [it has put its
+own TID into the futex value], and attaches a 'PI state' structure to
+the futex-queue. The pi_state includes an rt-mutex, which is a PI-aware,
+kernel-based synchronization object. The 'other' task is made the owner
+of the rt-mutex, and the FUTEX_WAITERS bit is atomically set in the
+futex value. Then this task tries to lock the rt-mutex, on which it
+blocks. Once it returns, it has the mutex acquired, and it sets the
+futex value to its own TID and returns. Userspace has no other work to
+perform - it now owns the lock, and the futex value contains
+FUTEX_WAITERS|TID.
+
+If the unlock side fastpath succeeds, [i.e. userspace manages to do a
+TID -> 0 atomic transition of the futex value], then no kernel work is
+triggered.
+
+If the unlock fastpath fails (because the FUTEX_WAITERS bit is set),
+then FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on
+behalf of userspace - and it also unlocks the attached
+pi_state->rt_mutex and thus wakes up any potential waiters.
+
+Note that under this approach, contrary to previous PI-futex approaches,
+there is no prior 'registration' of a PI-futex. [which is not quite
+possible anyway, due to existing ABI properties of pthread mutexes.]
+
+Also, under this scheme, 'robustness' and 'PI' are two orthogonal
+properties of futexes, and all four combinations are possible: futex,
+robust-futex, PI-futex, robust+PI-futex.
+
+More details about priority inheritance can be found in
+Documentation/rt-mutex.txt.
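
To make the lock/unlock protocol described in this file concrete, here is a
minimal user-space sketch.  It assumes a futex(2) syscall and headers that
provide FUTEX_LOCK_PI/FUTEX_UNLOCK_PI; the helper names and the use of GCC
atomic builtins are illustrative only (a real implementation lives in glibc's
PTHREAD_PRIO_INHERIT mutexes, not in application code), and all error handling
and retry logic is omitted:

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long futex(uint32_t *uaddr, int op)
{
	return syscall(SYS_futex, uaddr, op, 0, NULL, NULL, 0);
}

static void pi_lock(uint32_t *val)
{
	uint32_t expected = 0;
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Fastpath: a 0 -> TID transition, no kernel involvement at all. */
	if (__atomic_compare_exchange_n(val, &expected, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;
	/* Slowpath: the kernel attaches the pi_state, sets FUTEX_WAITERS
	 * and blocks us on the rt-mutex until we own the lock. */
	futex(val, FUTEX_LOCK_PI);
}

static void pi_unlock(uint32_t *val)
{
	uint32_t expected = (uint32_t)syscall(SYS_gettid);

	/* Fastpath: a TID -> 0 transition; fails if FUTEX_WAITERS is set. */
	if (__atomic_compare_exchange_n(val, &expected, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;
	/* Slowpath: the kernel hands the lock to the top waiter. */
	futex(val, FUTEX_UNLOCK_PI);
}
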
index df82d75245a01b5055c6793fa822efe949982a2e..76e8064b8c3a5ccb60e6cbb2f55ad5d455d097b0 100644 (file)
@@ -95,7 +95,7 @@ comparison. If the thread has registered a list, then normally the list
 is empty. If the thread/process crashed or terminated in some incorrect
 way then the list might be non-empty: in this case the kernel carefully
 walks the list [not trusting it], and marks all locks that are owned by
-this thread with the FUTEX_OWNER_DEAD bit, and wakes up one waiter (if
+this thread with the FUTEX_OWNER_DIED bit, and wakes up one waiter (if
 any).
 
 The list is guaranteed to be private and per-thread at do_exit() time,
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt
new file mode 100644 (file)
index 0000000..c472ffa
--- /dev/null
@@ -0,0 +1,781 @@
+#
+# Copyright (c) 2006 Steven Rostedt
+# Licensed under the GNU Free Documentation License, Version 1.2
+#
+
+RT-mutex implementation design
+------------------------------
+
+This document tries to describe the design of the rtmutex.c implementation.
+It doesn't describe the reasons why rtmutex.c exists. For that please see
+Documentation/rt-mutex.txt.  Although this document does explain problems
+that happen without this code, it does so only to help in understanding
+what the code actually does.
+
+The goal of this document is to help others understand the priority
+inheritance (PI) algorithm that is used, as well as reasons for the
+decisions that were made to implement PI in the manner that was done.
+
+
+Unbounded Priority Inversion
+----------------------------
+
+Priority inversion is when a lower priority process executes while a higher
+priority process wants to run.  This happens for several reasons, and
+most of the time it can't be helped.  Anytime a high priority process wants
+to use a resource that a lower priority process has (a mutex for example),
+the high priority process must wait until the lower priority process is done
+with the resource.  This is a priority inversion.  What we want to prevent
+is something called unbounded priority inversion.  That is when the high
+priority process is prevented from running by a lower priority process for
+an undetermined amount of time.
+
+The classic example of unbounded priority inversion is where you have three
+processes, let's call them processes A, B, and C, where A is the highest
+priority process, C is the lowest, and B is in between. A tries to grab a lock
+that C owns and must wait, letting C run so it can release the lock. But in the
+meantime, B executes, and since B is of a higher priority than C, it preempts C,
+but by doing so it is in fact also preempting A, which is a higher priority process.
+Now there's no way of knowing how long A will be sleeping waiting for C
+to release the lock, because for all we know, B is a CPU hog and will
+never give C a chance to release the lock.  This is called unbounded priority
+inversion.
+
+Here's a little ASCII art to show the problem.
+
+   grab lock L1 (owned by C)
+     |
+A ---+
+        C preempted by B
+          |
+C    +----+
+
+B         +-------->
+                B now keeps A from running.
+
+
+Priority Inheritance (PI)
+-------------------------
+
+There are several ways to solve this issue, but other ways are out of scope
+for this document.  Here we only discuss PI.
+
+PI is where a process inherits the priority of another process if the other
+process blocks on a lock owned by the current process.  To make this easier
+to understand, let's use the previous example, with processes A, B, and C again.
+
+This time, when A blocks on the lock owned by C, C would inherit the priority
+of A.  So now if B becomes runnable, it would not preempt C, since C now has
+the high priority of A.  As soon as C releases the lock, it loses its
+inherited priority, and A then can continue with the resource that C had.
+
+Terminology
+-----------
+
+Here I explain some terminology that is used in this document to help describe
+the design that is used to implement PI.
+
+PI chain - The PI chain is an ordered series of locks and processes that cause
+           processes to inherit priorities from a previous process that is
+           blocked on one of its locks.  This is described in more detail
+           later in this document.
+
+mutex    - In this document, to differentiate the locks that implement
+           PI from the spin locks that are used in the PI code, from now on
+           the PI locks will be called mutexes.
+
+lock     - In this document from now on, I will use the term lock when
+           referring to spin locks that are used to protect parts of the PI
+           algorithm.  These locks disable preemption for UP (when
+           CONFIG_PREEMPT is enabled) and on SMP prevent multiple CPUs from
+           entering critical sections simultaneously.
+
+spin lock - Same as lock above.
+
+waiter   - A waiter is a struct that is stored on the stack of a blocked
+           process.  Since the scope of the waiter is within the code for
+           a process being blocked on the mutex, it is fine to allocate
+           the waiter on the process's stack (local variable).  This
+           structure holds a pointer to the task, as well as the mutex that
+           the task is blocked on.  It also has the plist node structures to
+           place the task in the wait_list of a mutex as well as the
+           pi_list of a mutex owner task (described below).
+
+           waiter is sometimes used in reference to the task that is waiting
+           on a mutex. This is the same as waiter->task.
+
+waiters  - A list of processes that are blocked on a mutex.
+
+top waiter - The highest priority process waiting on a specific mutex.
+
+top pi waiter - The highest priority process waiting on one of the mutexes
+                that a specific process owns.
+
+Note:  task and process are used interchangeably in this document, mostly to
+       differentiate between two processes that are being described together.
+
+
+PI chain
+--------
+
+The PI chain is a list of processes and mutexes that may cause priority
+inheritance to take place.  Multiple chains may converge, but a chain
+would never diverge, since a process can't be blocked on more than one
+mutex at a time.
+
+Example:
+
+   Processes: A, B, C, D, E
+   Mutexes:  L1, L2, L3, L4
+
+   A owns: L1
+           B blocked on L1
+           B owns L2
+                  C blocked on L2
+                  C owns L3
+                         D blocked on L3
+                         D owns L4
+                                E blocked on L4
+
+The chain would be:
+
+   E->L4->D->L3->C->L2->B->L1->A
+
+To show where two chains merge, we could add another process F and
+another mutex L5 where B owns L5 and F is blocked on mutex L5.
+
+The chain for F would be:
+
+   F->L5->B->L1->A
+
+Since a process may own more than one mutex, but never be blocked on more than
+one, the chains merge.
+
+Here we show both chains:
+
+   E->L4->D->L3->C->L2-+
+                       |
+                       +->B->L1->A
+                       |
+                 F->L5-+
+
+For PI to work, the processes at the right end of these chains (or we may
+also call it the Top of the chain) must be equal to or higher in priority
+than the processes to the left or below in the chain.
+
+Also since a mutex may have more than one process blocked on it, we can
+have multiple chains merge at mutexes.  If we add another process G that is
+blocked on mutex L2:
+
+  G->L2->B->L1->A
+
+And once again, to show how this can grow I will show the merging chains
+again.
+
+   E->L4->D->L3->C-+
+                   +->L2-+
+                   |     |
+                 G-+     +->B->L1->A
+                         |
+                   F->L5-+
+
+
+Plist
+-----
+
+Before I go further and talk about how the PI chain is stored through lists
+on both mutexes and processes, I'll explain the plist.  This is similar to
+the struct list_head functionality that is already in the kernel.
+The implementation of plist is out of scope for this document, but it is
+very important to understand what it does.
+
+There are a few differences between plist and list, the most important one
+being that plist is a priority sorted linked list.  This means that the
+priorities of the plist are sorted, such that it takes O(1) to retrieve the
+highest priority item in the list.  Obviously this is useful to store processes
+based on their priorities.
+
+Another difference, which is important for implementation, is that, unlike
+list, the head of the list is a different element than the nodes of a list.
+So the head of the list is declared as struct plist_head and nodes that will
+be added to the list are declared as struct plist_node.
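
As a rough illustration of how such a priority-sorted list is used (a sketch
only -- see include/linux/plist.h for the real API; the initializer signatures
and locking requirements, which this fragment glosses over, have varied between
kernel versions):

#include <linux/kernel.h>
#include <linux/plist.h>

/* A toy "waiter" carrying a plist node, sorted by priority. */
struct toy_waiter {
	struct plist_node node;
	struct task_struct *task;
};

static struct plist_head toy_waiters;		/* init with plist_head_init() */

static void toy_add_waiter(struct toy_waiter *w, int prio)
{
	plist_node_init(&w->node, prio);	/* lower value == higher priority */
	plist_add(&w->node, &toy_waiters);	/* inserted in priority order */
}

static struct toy_waiter *toy_top_waiter(void)
{
	if (plist_head_empty(&toy_waiters))
		return NULL;
	/* O(1): the first node is always the highest-priority one. */
	return container_of(plist_first(&toy_waiters), struct toy_waiter, node);
}
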
+
+
+Mutex Waiter List
+-----------------
+
+Every mutex keeps track of all the waiters that are blocked on itself. The mutex
+has a plist to store these waiters by priority.  This list is protected by
+a spin lock that is located in the struct of the mutex. This lock is called
+wait_lock.  Since the modification of the waiter list is never done in
+interrupt context, the wait_lock can be taken without disabling interrupts.
+
+
+Task PI List
+------------
+
+To keep track of the PI chains, each process has its own PI list.  This is
+a list of all top waiters of the mutexes that are owned by the process.
+Note that this list only holds the top waiters and not all waiters that are
+blocked on mutexes owned by the process.
+
+The top of the task's PI list is always the highest priority task that
+is waiting on a mutex that is owned by the task.  So if the task has
+inherited a priority, it will always be the priority of the task that is
+at the top of this list.
+
+This list is stored in the task structure of a process as a plist called
+pi_list.  This list is protected by a spin lock also in the task structure,
+called pi_lock.  This lock may also be taken in interrupt context, so when
+locking the pi_lock, interrupts must be disabled.
+
+
+Depth of the PI Chain
+---------------------
+
+The maximum depth of the PI chain is not dynamic, and could actually be
+defined.  But it is very complex to figure out, since it depends on all
+the nesting of mutexes.  Let's look at the example where we have 3 mutexes,
+L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
+The following shows a locking order of L1->L2->L3, but may not actually
+be directly nested that way.
+
+void func1(void)
+{
+       mutex_lock(L1);
+
+       /* do anything */
+
+       mutex_unlock(L1);
+}
+
+void func2(void)
+{
+       mutex_lock(L1);
+       mutex_lock(L2);
+
+       /* do something */
+
+       mutex_unlock(L2);
+       mutex_unlock(L1);
+}
+
+void func3(void)
+{
+       mutex_lock(L2);
+       mutex_lock(L3);
+
+       /* do something else */
+
+       mutex_unlock(L3);
+       mutex_unlock(L2);
+}
+
+void func4(void)
+{
+       mutex_lock(L3);
+
+       /* do something again */
+
+       mutex_unlock(L3);
+}
+
+Now we add 4 processes that run each of these functions separately.
+Processes A, B, C, and D run functions func1, func2, func3 and func4
+respectively, with D running first and A last.  With D being preempted
+in func4 in the "do something again" area, we have a locking order as follows:
+
+D owns L3
+       C blocked on L3
+       C owns L2
+              B blocked on L2
+              B owns L1
+                     A blocked on L1
+
+And thus we have the chain A->L1->B->L2->C->L3->D.
+
+This gives us a PI depth of 4 (four processes), but looking at any of the
+functions individually, it seems as though they only have at most a locking
+depth of two.  So, although the locking depth is defined at compile time,
+it is still very difficult to determine all the possibilities of that depth.
+
+Now since mutexes can be defined by user-land applications, we don't want a DOS
+type of application that nests large amounts of mutexes to create a large
+PI chain, and have the code holding spin locks while looking at a large
+amount of data.  So to prevent this, the implementation not only implements
+a maximum lock depth, but also only holds at most two different locks at a
+time, as it walks the PI chain.  More about this below.
+
+
+Mutex owner and flags
+---------------------
+
+The mutex structure contains a pointer to the owner of the mutex.  If the
+mutex is not owned, this owner is set to NULL.  Since all architectures
+have the task structure on at least a four byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the two
+least significant bits to be used as flags.  This part is also described
+in Documentation/rt-mutex.txt, but will also be briefly described here.
+
+Bit 0 is used as the "Pending Owner" flag.  This is described later.
+Bit 1 is used as the "Has Waiters" flag.  This is also described later
+  in more detail, but is set whenever there are waiters on a mutex.
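
A small sketch of how the two low bits of an aligned owner pointer can double
as these flags (the macro and helper names here are made up for illustration;
the real definitions live in kernel/rtmutex_common.h and may differ):

#define OWNER_PENDING	1UL	/* bit 0: "Pending Owner" */
#define OWNER_WAITERS	2UL	/* bit 1: "Has Waiters"  */
#define OWNER_MASKALL	3UL

/* task_struct is at least 4-byte aligned, so bits 0-1 of the pointer are free. */
static inline struct task_struct *owner_task(unsigned long owner_field)
{
	return (struct task_struct *)(owner_field & ~OWNER_MASKALL);
}

static inline int owner_is_pending(unsigned long owner_field)
{
	return owner_field & OWNER_PENDING;
}

static inline int owner_has_waiters(unsigned long owner_field)
{
	return owner_field & OWNER_WAITERS;
}
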
+
+
+cmpxchg Tricks
+--------------
+
+Some architectures implement an atomic cmpxchg (Compare and Exchange).  This
+is used (when applicable) to keep the fast path of grabbing and releasing
+mutexes short.
+
+cmpxchg is basically the following function performed atomically:
+
+unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
+{
+        unsigned long T = *A;
+        if (*A == *B) {
+                *A = *C;
+        }
+        return T;
+}
+#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
+
+This is really nice to have, since it allows you to only update a variable
+if the variable is what you expect it to be.  You know if it succeeded if
+the return value (the old value of A) is equal to B.
+
+The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
+the architecture does not support CMPXCHG, then this macro is simply set
+to fail every time.  But if CMPXCHG is supported, then this helps
+enormously in keeping the fast path short.
+
+The use of rt_mutex_cmpxchg with the flags in the owner field helps optimize
+the system for architectures that support it.  This will also be explained
+later in this document.
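
To illustrate how cmpxchg keeps the fast path short, here is a hedged sketch of
the lock and unlock fast paths over the owner field (pseudo-kernel code only;
the real rt_mutex_cmpxchg macro and fast-path helpers in kernel/rtmutex.c
differ in detail):

/* Acquire succeeds only if the owner field is still NULL (no owner, no flag
 * bits set); it then becomes 'current' in one atomic step and we never touch
 * the wait_lock or enter the slow path. */
static inline int fastpath_trylock(struct rt_mutex *lock)
{
	return cmpxchg(&lock->owner, NULL, current) == NULL;
}

/* Release fails - forcing the slow path - whenever "Has Waiters" is set,
 * because then the owner field is not exactly 'current'. */
static inline int fastpath_unlock(struct rt_mutex *lock)
{
	return cmpxchg(&lock->owner, current, NULL) == current;
}
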
+
+
+Priority adjustments
+--------------------
+
+The implementation of the PI code in rtmutex.c has several places where a
+process must adjust its priority.  With the help of the pi_list of a
+process, it is rather easy to know what needs to be adjusted.
+
+The functions implementing the task adjustments are rt_mutex_adjust_prio,
+__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock
+to already be taken), rt_mutex_getprio, and rt_mutex_setprio.
+
+rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio.
+
+rt_mutex_getprio returns the priority that the task should have.  Either the
+task's own normal priority, or if a process of a higher priority is waiting on
+a mutex owned by the task, then that higher priority should be returned.
+Since the pi_list of a task holds a priority-ordered list of all the top
+waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs
+to compare the top pi waiter to its own normal priority, and return the higher
+priority back.
+
+(Note:  if looking at the code, you will notice that the lower number of
+        prio is returned.  This is because the prio field in the task structure
+        is an inverse order of the actual priority.  So a "prio" of 5 is
+        of higher priority than a "prio" of 10.)
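
In other words, the calculation described above boils down to something like
the following sketch (illustrative only -- the field names follow this
document's terminology, and the real rt_mutex_getprio in kernel/rtmutex.c is
the authoritative version):

/* Sketch: a lower "prio" value means a higher priority, so take the minimum
 * of the task's own normal priority and that of its top pi waiter. */
static int getprio_sketch(struct task_struct *task)
{
	if (plist_head_empty(&task->pi_list))
		return task->normal_prio;

	return min(task->normal_prio,
		   plist_first(&task->pi_list)->prio);
}
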
+
+__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the
+result does not equal the task's current priority, then rt_mutex_setprio
+is called to adjust the priority of the task to the new priority.
+Note that rt_mutex_setprio is defined in kernel/sched.c to implement the
+actual change in priority.
+
+It is interesting to note that __rt_mutex_adjust_prio can either increase
+or decrease the priority of the task.  In the case that a higher priority
+process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio
+would increase/boost the task's priority.  But if a higher priority task
+were for some reason to leave the mutex (timeout or signal), this same function
+would decrease/unboost the priority of the task.  That is because the pi_list
+always contains the highest priority task that is waiting on a mutex owned
+by the task, so we only need to compare the priority of that top pi waiter
+to the normal priority of the given task.
+
+
+High level overview of the PI chain walk
+----------------------------------------
+
+The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
+
+The implementation has gone through several iterations, and has ended up
+with what we believe is the best.  It walks the PI chain by only grabbing
+at most two locks at a time, and is very efficient.
+
+The rt_mutex_adjust_prio_chain can be used either to boost or lower process
+priorities.
+
+rt_mutex_adjust_prio_chain is called with a task to be checked for PI
+(de)boosting (the owner of a mutex that a process is blocking on), a flag to
+check for deadlocking, the mutex that the task owns, and a pointer to a waiter
+that is the process's waiter struct that is blocked on the mutex (although this
+parameter may be NULL for deboosting).
+
+For this explanation, I will not mention deadlock detection. This explanation
+will try to stay at a high level.
+
+When this function is called, there are no locks held.  That also means
+that the state of the owner and lock can change after we have entered this function.
+
+Before this function is called, the task has already had rt_mutex_adjust_prio
+performed on it.  This means that the task is set to the priority that it
+should be at, but the plist nodes of the task's waiter have not been updated
+with the new priorities, so this task may not be in the proper locations
+in the pi_lists and wait_lists that the task is blocked on.  This function
+solves all that.
+
+A loop is entered, where "task" is, on the first iteration, the owner that was
+passed in as a parameter to be checked for PI changes.  The pi_lock of this task
+is taken to prevent any more changes to the pi_list of the task.  This also
+prevents new tasks from completing their blocking on a mutex that is owned by
+this task.
+
+If the task is not blocked on a mutex then the loop is exited.  We are at
+the top of the PI chain.
+
+A check is now done to see if the original waiter (the process that is blocked
+on the current mutex) is the top pi waiter of the task.  That is, is this
+waiter at the top of the task's pi_list?  If it is not, it either means that
+there is another process higher in priority that is blocked on one of the
+mutexes that the task owns, or that the waiter has just woken up via a signal
+or timeout and has left the PI chain.  In either case, the loop is exited, since
+we don't need to do any more changes to the priority of the current task, or any
+task that owns a mutex that this current task is waiting on.  A priority chain
+walk is only needed when a new top pi waiter is made to a task.
+
+The next check sees if the task's waiter plist node has the priority equal to
+the priority the task is set at.  If they are equal, then we are done with
+the loop.  Remember that the function started with the priority of the
+task adjusted, but the plist nodes that hold the task in other processes'
+pi_lists have not been adjusted.
+
+Next, we look at the mutex that the task is blocked on. The mutex's wait_lock
+is taken.  This is done by a spin_trylock, because the locking order of the
+pi_lock and wait_lock goes in the opposite direction. If we fail to grab the
+lock, the pi_lock is released, and we restart the loop.
+
+Now that we have both the pi_lock of the task as well as the wait_lock of
+the mutex the task is blocked on, we update the task's waiter's plist node
+that is located on the mutex's wait_list.
+
+Now we release the pi_lock of the task.
+
+Next the owner of the mutex has its pi_lock taken, so we can update the
+task's entry in the owner's pi_list.  If the task is the highest priority
+process on the mutex's wait_list, then we remove the previous top waiter
+from the owner's pi_list, and replace it with the task.
+
+Note: It is possible that the task was the current top waiter on the mutex,
+      in which case the task is not yet on the pi_list of the waiter.  This
+      is OK, since plist_del does nothing if the plist node is not on any
+      list.
+
+If the task was not the top waiter of the mutex, but it was before we
+did the priority updates, that means we are deboosting/lowering the
+task.  In this case, the task is removed from the pi_list of the owner,
+and the new top waiter is added.
+
+Lastly, we unlock both the pi_lock of the task, as well as the mutex's
+wait_lock, and continue the loop again.  On the next iteration of the
+loop, the previous owner of the mutex will be the task that will be
+processed.
+
+Note: One might think that the owner of this mutex might have changed
+      since we only just grabbed the mutex's wait_lock. And one could be right.
+      The important thing to remember is that the owner could not have
+      become the task that is being processed in the PI chain, since
+      we have taken that task's pi_lock at the beginning of the loop.
+      So as long as there is an owner of this mutex that is not the same
+      process as the task being worked on, we are OK.
+
+      Looking closely at the code, one might be confused.  The check for the
+      end of the PI chain is when the task isn't blocked on anything or the
+      task's waiter structure "task" element is NULL.  This check is
+      protected only by the task's pi_lock.  But the code to unlock the mutex
+      sets the task's waiter structure "task" element to NULL with only
+      the protection of the mutex's wait_lock, which was not taken yet.
+      Isn't this a race condition if the task becomes the new owner?
+
+      The answer is No!  The trick is the spin_trylock of the mutex's
+      wait_lock.  If we fail that lock, we release the pi_lock of the
+      task and continue the loop, doing the end of PI chain check again.
+
+      In the code to release the lock, the wait_lock of the mutex is held
+      the entire time, and it is not let go when we grab the pi_lock of the
+      new owner of the mutex.  So if the switch of a new owner were to happen
+      after the check for end of the PI chain and the grabbing of the
+      wait_lock, the unlocking code would spin on the new owner's pi_lock
+      but never give up the wait_lock.  So the PI chain loop is guaranteed to
+      fail the spin_trylock on the wait_lock, release the pi_lock, and
+      try again.
+
+      If you don't quite understand the above, that's OK. You don't have to,
+      unless you really want to make a proof out of it ;)
+
+
+Pending Owners and Lock stealing
+--------------------------------
+
+One of the flags in the owner field of the mutex structure is "Pending Owner".
+What this means is that an owner was chosen by the process releasing the
+mutex, but that owner has yet to wake up and actually take the mutex.
+
+Why is this important?  Why can't we just give the mutex to another process
+and be done with it?
+
+The PI code is to help with real-time processes, and to let the highest
+priority process run as long as possible with minimal latencies and delays.
+If a high priority process owns a mutex that a lower priority process is
+blocked on, when the mutex is released it would be given to the lower priority
+process.  What if the higher priority process wants to take that mutex again?
+The high priority process would fail to take the mutex that it just gave up,
+and it would need to boost the lower priority process and wait out the full
+latency of that critical section (since the low priority process has just
+entered it).
+
+There's no reason a high priority process that gives up a mutex should be
+penalized if it tries to take that mutex again.  If the new owner of the
+mutex has not woken up yet, there's no reason that the higher priority process
+could not take that mutex away.
+
+To solve this, we introduced Pending Ownership and Lock Stealing.  When a
+new process is given a mutex that it was blocked on, it is only given
+pending ownership.  This means that it's the new owner, unless a higher
+priority process comes in and tries to grab that mutex.  If a higher priority
+process does come along and wants that mutex, we let the higher priority
+process "steal" the mutex from the pending owner (only if it is still pending)
+and continue with the mutex.
+
+
+Taking of a mutex (The walk through)
+------------------------------------
+
+OK, now let's take a look at the detailed walk through of what happens when
+taking a mutex.
+
+The first thing that is tried is the fast taking of the mutex.  This is
+done when we have CMPXCHG enabled (otherwise the fast taking automatically
+fails).  Only when the owner field of the mutex is NULL can the lock be
+taken with the CMPXCHG and nothing else needs to be done.
+
+If there is contention on the lock, whether it has an owner or a pending owner,
+we go down the slow path (rt_mutex_slowlock).
+
+The slow path function is where the task's waiter structure is created on
+the stack.  This is because the waiter structure is only needed for the
+scope of this function.  The waiter structure holds the nodes to store
+the task on the wait_list of the mutex, and if need be, the pi_list of
+the owner.
+
+The wait_lock of the mutex is taken since the slow path of unlocking the
+mutex also takes this lock.
+
+We then call try_to_take_rt_mutex.  This is where the architecture that
+does not implement CMPXCHG would always grab the lock (if there's no
+contention).
+
+try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
+slow path.  The first thing that is done here is an atomic setting of
+the "Has Waiters" flag of the mutex's owner field.  Yes, this could really
+be false, because if the mutex has no owner, there are no waiters and
+the current task also won't have any waiters.  But we don't have the lock
+yet, so we assume we are going to be a waiter.  The reason for this is to
+play nice for those architectures that do have CMPXCHG.  By setting this flag
+now, the owner of the mutex can't release the mutex without going into the
+slow unlock path, and it would then need to grab the wait_lock, which this
+code currently holds.  So setting the "Has Waiters" flag forces the owner
+to synchronize with this code.
+
+Now that we know that we can't have any races with the owner releasing the
+mutex, we check to see if we can take the ownership.  This is done if the
+mutex doesn't have an owner, or if we can steal the mutex from a pending
+owner.  Let's look at the situations we have here.
+
+  1) Has owner that is pending
+  ----------------------------
+
+  The mutex has an owner, but it hasn't woken up and the mutex flag
+  "Pending Owner" is set.  The first check is to see if the owner isn't the
+  current task.  This is because this function is also used for the pending
+  owner to grab the mutex.  When a pending owner wakes up, it checks to see
+  if it can take the mutex, and this is done if the owner is already set to
+  itself.  If so, we succeed and leave the function, clearing the "Pending
+  Owner" bit.
+
+  If the pending owner is not current, we check to see if the current priority is
+  higher than the pending owner's.  If not, we fail the function and return.
+
+  There's also something special about a pending owner: a pending owner is
+  never blocked on a mutex.  So there is no PI chain to worry about.  It also
+  means that if the mutex doesn't have any waiters, there's no accounting needed
+  to update the pending owner's pi_list, since we only worry about processes
+  blocked on the current mutex.
+
+  If there are waiters on this mutex, and we just stole the ownership, we need
+  to take the top waiter, remove it from the pi_list of the pending owner, and
+  add it to the current pi_list.  Note that at this moment, the pending owner
+  is no longer on the list of waiters.  This is fine, since the pending owner
+  would add itself back when it realizes that it had the ownership stolen
+  from itself.  When the pending owner tries to grab the mutex, it will fail
+  in try_to_take_rt_mutex if the owner field points to another process.
+
+  2) No owner
+  -----------
+
+  If there is no owner (or we successfully stole the lock), we set the owner
+  of the mutex to current, and set the flag of "Has Waiters" if the current
+  mutex actually has waiters, or we clear the flag if it doesn't.  See, it was
+  OK that we set that flag early, since now it is cleared.
+
+  3) Failed to grab ownership
+  ---------------------------
+
+  The most interesting case is when we fail to take ownership. This means that
+  there exists an owner, or there's a pending owner with equal or higher
+  priority than the current task.
+
+We'll continue on the failed case.
+
+If the lock request has a timeout, we set up a timer to go off to break us out
+of this wait if we fail to get the mutex within the specified amount of time.
+
+Now we enter a loop that will continue to try to take ownership of the mutex, or
+fail from a timeout or signal.
+
+Once again we try to take the mutex.  This will usually fail the first time
+in the loop, since it had just failed to get the mutex.  But the second time
+in the loop, this would likely succeed, since the task would likely be
+the pending owner.
+
+If the task's state is TASK_INTERRUPTIBLE, a check for signals and timeout is
+done here.
+
+The waiter structure has a "task" field that points to the task that is blocked
+on the mutex.  This field can be NULL the first time it goes through the loop
+or if the task is a pending owner and had its mutex stolen.  If the "task"
+field is NULL then we need to set up the accounting for it.
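
Putting the failed-acquire path together, the loop described in this section
has roughly the following shape (a heavily simplified, hypothetical sketch of
the flow only, not the actual rt_mutex_slowlock code: timeout setup, deadlock
detection, task-state handling and the cleanup of the waiter on failure are all
omitted, and the real helpers take more arguments):

static int slowlock_sketch(struct rt_mutex *lock, int state)
{
	struct rt_mutex_waiter waiter = { .task = NULL };
	int ret = 0;

	spin_lock(&lock->wait_lock);

	for (;;) {
		if (try_to_take_rt_mutex(lock))		/* got it (or stole it) */
			break;

		if (state == TASK_INTERRUPTIBLE) {
			ret = check_signal_or_timeout();  /* hypothetical helper */
			if (ret)
				break;
		}

		if (!waiter.task)			/* enqueue us, adjust PI */
			task_blocks_on_rt_mutex(lock, &waiter);

		spin_unlock(&lock->wait_lock);
		schedule();				/* sleep until woken */
		spin_lock(&lock->wait_lock);
	}

	spin_unlock(&lock->wait_lock);
	return ret;
}
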
+
+Task blocks on mutex
+--------------------
+
+The accounting of a mutex and process is done with the waiter structure of
+the process.  The "task" field is set to the process, and the "lock" field
+to the mutex.  The plist nodes are initialized to the process's current
+priority.
+
+Since the wait_lock was taken at the entry of the slow lock, we can safely
+add the waiter to the wait_list.  If the current process is the highest
+priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_list of the owner,
+and add the current process to that list.  Since the pi_list of the owner
+has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
+should adjust its priority accordingly.
+
+If the owner is also blocked on a lock, and had its pi_list changed
+(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
+and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+
+Now all locks are released, and if the current process is still blocked on a
+mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
+
+Waking up in the loop
+---------------------
+
+The task can then wake up from schedule() for a few reasons:
+  1) we were given pending ownership of the mutex.
+  2) we received a signal and were TASK_INTERRUPTIBLE
+  3) we had a timeout and were TASK_INTERRUPTIBLE
+
+In any of these cases, we continue the loop and once again try to grab the
+ownership of the mutex.  If we succeed, we exit the loop.  On a signal or
+timeout we also exit the loop; if we simply had the mutex stolen from us, we
+add ourselves back on the lists and go back to sleep.
+
+Note: For various reasons, because of timeout and signals, the steal mutex
+      algorithm needs to be careful. This is because the current process is
+      still on the wait_list. And because of dynamic changing of priorities,
+      especially on SCHED_OTHER tasks, the current process can be the
+      highest priority task on the wait_list.
+
+Failed to get mutex on Timeout or Signal
+----------------------------------------
+
+If a timeout or signal occurred, the waiter's "task" field would not be
+NULL and the task needs to be taken off the wait_list of the mutex and perhaps
+the pi_list of the owner.  If this process was a high priority process, then
+the rt_mutex_adjust_prio_chain needs to be executed again on the owner,
+but this time it will be lowering the priorities.
+
+
+Unlocking the Mutex
+-------------------
+
+The unlocking of a mutex also has a fast path for those architectures with
+CMPXCHG.  Since the taking of a mutex on contention always sets the
+"Has Waiters" flag of the mutex's owner, we use this to know if we need to
+take the slow path when unlocking the mutex.  If the mutex doesn't have any
+waiters, the owner field of the mutex would equal the current process and
+the mutex can be unlocked by just replacing the owner field with NULL.
+
+If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
+the slow unlock path is taken.
+
+The first thing done in the slow unlock path is to take the wait_lock of the
+mutex.  This synchronizes the locking and unlocking of the mutex.
+
+A check is made to see if the mutex has waiters or not.  On architectures that
+do not have CMPXCHG, this is the location that the owner of the mutex will
+determine if a waiter needs to be awoken or not.  On architectures that
+do have CMPXCHG, that check is done in the fast path, but it is still needed
+in the slow path too.  If a waiter of a mutex woke up because of a signal
+or timeout between the time the owner failed the fast path CMPXCHG check and
+the grabbing of the wait_lock, the mutex may not have any waiters, thus the
+owner still needs to make this check. If there are no waiters then the mutex
+owner field is set to NULL, the wait_lock is released and nothing more is
+needed.
+
+If there are waiters, then we need to wake one up and give that waiter
+pending ownership.
+
+In the wake up code, the pi_lock of the current owner is taken.  The top
+waiter of the lock is found and removed from the wait_list of the mutex
+as well as the pi_list of the current owner.  The task field of the new
+pending owner's waiter structure is set to NULL, and the owner field of the
+mutex is set to the new owner with the "Pending Owner" bit set, as well
+as the "Has Waiters" bit if there still are other processes blocked on the
+mutex.
+
+The pi_lock of the previous owner is released, and the new pending owner's
+pi_lock is taken.  Remember that this is the trick to prevent the race
+condition in rt_mutex_adjust_prio_chain from adding itself as a waiter
+on the mutex.
+
+We now clear the "pi_blocked_on" field of the new pending owner, and if
+the mutex still has waiters pending, we add the new top waiter to the pi_list
+of the pending owner.
+
+Finally we unlock the pi_lock of the pending owner and wake it up.
+
+
+Contact
+-------
+
+For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
+
+
+Credits
+-------
+
+Author:  Steven Rostedt <rostedt@goodmis.org>
+
+Reviewers:  Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap
+
+Updates
+-------
+
+This document was originally written for 2.6.17-rc3-mm1
diff --git a/Documentation/rt-mutex.txt b/Documentation/rt-mutex.txt
new file mode 100644 (file)
index 0000000..243393d
--- /dev/null
@@ -0,0 +1,79 @@
+RT-mutex subsystem with PI support
+----------------------------------
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of an rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on an rt-mutex itself, it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the block of high-prio tasks on
+mutexes which protect shared resources. Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+a high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter list is done in
+priority order. For same priorities FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters list. This list too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. [The
+priority enqueueing is handled by "plists", see include/linux/plist.h
+for more details.]
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1
+are used to keep track of the "owner is pending" and "rtmutex has
+waiters" state.
+
+ owner         bit1    bit0
+ NULL          0       0       mutex is free (fast acquire possible)
+ NULL          0       1       invalid state
+ NULL          1       0       Transitional state*
+ NULL          1       1       invalid state
+ taskpointer   0       0       mutex is held (fast release possible)
+ taskpointer   0       1       task is pending owner
+ taskpointer   1       0       mutex is held and has waiters
+ taskpointer   1       1       task is pending owner and mutex has waiters
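
Purely as an illustration of how such a table can be packed into one word (the
macro and helper names below are invented, not the kernel's; task_struct
pointers are at least 4-byte aligned, so the two low bits are free):

  #include <stdint.h>
  #include <stdio.h>

  #define OWNER_PENDING   ((uintptr_t)1)  /* bit 0: task is pending owner */
  #define HAS_WAITERS     ((uintptr_t)2)  /* bit 1: rtmutex has waiters   */

  struct task;                            /* stand-in for task_struct     */

  static struct task *owner_task(uintptr_t owner)
  {
          return (struct task *)(owner & ~(uintptr_t)3);
  }

  int main(void)
  {
          struct task *t = (struct task *)0x1000; /* fake, aligned address */
          uintptr_t owner = (uintptr_t)t | HAS_WAITERS;

          printf("owner=%p pending=%d waiters=%d\n",
                 (void *)owner_task(owner),
                 (int)(owner & OWNER_PENDING),
                 (int)!!(owner & HAS_WAITERS));
          return 0;
  }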
+
+Pending-ownership handling is a performance optimization:
+pending-ownership is assigned to the first (highest priority) waiter of
+the mutex when the mutex is released. The thread is woken up and, once
+it starts executing, it can acquire the mutex. Until the mutex is taken
+by it (bit 0 is cleared), a competing higher priority thread can "steal"
+the mutex, which puts the woken-up thread back on the waiters list.
+
+The pending-ownership optimization is especially important for the
+uninterrupted workflow of high-prio tasks which repeatedly
+take/release locks that have lower-prio waiters. Without this
+optimization, the higher-prio thread would ping-pong to the lower-prio
+task [because at unlock time we always assign a new owner].
+
+(*) The "mutex has waiters" bit gets set while taking the lock. If the
+lock doesn't already have an owner, this bit is quickly cleared again if
+there are no waiters. So this is a transitional state used to synchronize
+with looking at the owner field of the mutex and with the mutex owner
+releasing the lock.
index 558b83368559396f4d50f793d37d8e7dd83c26d1..254c507a608c076f9e09aa8acb83edfea90d2975 100644 (file)
@@ -481,7 +481,7 @@ register_cpus(void)
                struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
                if (!p)
                        return -ENOMEM;
-               register_cpu(p, i, NULL);
+               register_cpu(p, i);
        }
        return 0;
 }
index 9fc9af88c60c72e54cad5e835c579b95027624e6..093ccba0503c9a1885bbddc3fa6f3c2e8a67e5eb 100644 (file)
@@ -808,7 +808,7 @@ static int __init topology_init(void)
        int cpu;
 
        for_each_possible_cpu(cpu)
-               register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu, NULL);
+               register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu);
 
        return 0;
 }
index 47c08bcd9b24f313e2216d88b97e7e2e58958078..3bb221db164a65c0425fdb5b01e299e331ccd99a 100644 (file)
@@ -233,7 +233,7 @@ config NR_CPUS
 
 config SCHED_SMT
        bool "SMT (Hyperthreading) scheduler support"
-       depends on SMP
+       depends on X86_HT
        help
          SMT scheduler support improves the CPU scheduler's decision making
          when dealing with Intel Pentium 4 chips with HyperThreading at a
@@ -242,7 +242,7 @@ config SCHED_SMT
 
 config SCHED_MC
        bool "Multi-core scheduler support"
-       depends on SMP
+       depends on X86_HT
        default y
        help
          Multi-core scheduler support improves the CPU scheduler's decision
@@ -780,6 +780,17 @@ config HOTPLUG_CPU
          enable suspend on SMP systems. CPUs can be controlled through
          /sys/devices/system/cpu.
 
+config COMPAT_VDSO
+       bool "Compat VDSO support"
+       default y
+       help
+         Map the VDSO to the predictable old-style address too.
+
+         Say N here if you are running a sufficiently recent glibc
+         version (2.3.3 or later), to remove the high-mapped
+         VDSO mapping and to exclusively use the randomized VDSO.
+
+         If unsure, say Y.
 
 endmenu
 
index 1c3a809e64217292de3d4ab009b32d9953c0ee19..c80271f8f084c04014d926185cbe896a86fe8d9e 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/fixmap.h>
 #include <asm/processor.h>
 #include <asm/thread_info.h>
+#include <asm/elf.h>
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -54,6 +55,7 @@ void foo(void)
        OFFSET(TI_preempt_count, thread_info, preempt_count);
        OFFSET(TI_addr_limit, thread_info, addr_limit);
        OFFSET(TI_restart_block, thread_info, restart_block);
+       OFFSET(TI_sysenter_return, thread_info, sysenter_return);
        BLANK();
 
        OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
@@ -69,7 +71,7 @@ void foo(void)
                 sizeof(struct tss_struct));
 
        DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-       DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
+       DEFINE(VDSO_PRELINK, VDSO_PRELINK);
 
        OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
 }
index fd0457c9c827f406e9e7f35d490bf7c0c648fe01..e6a2d6b80cdae8a84e72566da1642bff59a7dc7a 100644 (file)
@@ -235,10 +235,10 @@ static void __init init_amd(struct cpuinfo_x86 *c)
                        while ((1 << bits) < c->x86_max_cores)
                                bits++;
                }
-               cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1);
-               phys_proc_id[cpu] >>= bits;
+               c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
+               c->phys_proc_id >>= bits;
                printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
-                      cpu, c->x86_max_cores, cpu_core_id[cpu]);
+                      cpu, c->x86_max_cores, c->cpu_core_id);
        }
 #endif
 
index 44f2c5f2dda16a0b8adcb6d0170fd92951a7307e..70c87de582c7a793ab346b60c4416bcdaa3a9f2c 100644 (file)
@@ -294,7 +294,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
                        if (c->x86 >= 0x6)
                                c->x86_model += ((tfms >> 16) & 0xF) << 4;
                        c->x86_mask = tfms & 15;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
                        c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
 #else
                        c->apicid = (ebx >> 24) & 0xFF;
@@ -319,7 +319,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
        early_intel_workaround(c);
 
 #ifdef CONFIG_X86_HT
-       phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+       c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
 #endif
 }
 
@@ -477,11 +477,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
        u32     eax, ebx, ecx, edx;
        int     index_msb, core_bits;
-       int     cpu = smp_processor_id();
 
        cpuid(1, &eax, &ebx, &ecx, &edx);
 
-
        if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
                return;
 
@@ -492,16 +490,17 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
        } else if (smp_num_siblings > 1 ) {
 
                if (smp_num_siblings > NR_CPUS) {
-                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
+                       printk(KERN_WARNING "CPU: Unsupported number of the "
+                                       "siblings %d", smp_num_siblings);
                        smp_num_siblings = 1;
                        return;
                }
 
                index_msb = get_count_order(smp_num_siblings);
-               phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+               c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
 
                printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-                      phys_proc_id[cpu]);
+                      c->phys_proc_id);
 
                smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
@@ -509,12 +508,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
                core_bits = get_count_order(c->x86_max_cores);
 
-               cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
+               c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
                                               ((1 << core_bits) - 1);
 
                if (c->x86_max_cores > 1)
                        printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-                              cpu_core_id[cpu]);
+                              c->cpu_core_id);
        }
 }
 #endif
@@ -613,6 +612,12 @@ void __cpuinit cpu_init(void)
                set_in_cr4(X86_CR4_TSD);
        }
 
+       /* The CPU hotplug case */
+       if (cpu_gdt_descr->address) {
+               gdt = (struct desc_struct *)cpu_gdt_descr->address;
+               memset(gdt, 0, PAGE_SIZE);
+               goto old_gdt;
+       }
        /*
         * This is a horrible hack to allocate the GDT.  The problem
         * is that cpu_init() is called really early for the boot CPU
@@ -631,7 +636,7 @@ void __cpuinit cpu_init(void)
                                local_irq_enable();
                }
        }
-
+old_gdt:
        /*
         * Initialize the per-CPU GDT with the boot GDT,
         * and set up the GDT descriptor:
index 6c37b4fd8ce285c293306788656b016e83cf3da3..e9f0b928b0a9925e534ddeadb3c868493240df58 100644 (file)
@@ -159,13 +159,13 @@ union l2_cache {
        unsigned val;
 };
 
-static unsigned short assocs[] = {
+static const unsigned short assocs[] = {
        [1] = 1, [2] = 2, [4] = 4, [6] = 8,
        [8] = 16,
        [0xf] = 0xffff // ??
        };
-static unsigned char levels[] = { 1, 1, 2 };
-static unsigned char types[] = { 1, 2, 3 };
+static const unsigned char levels[] = { 1, 1, 2 };
+static const unsigned char types[] = { 1, 2, 3 };
 
 static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
                       union _cpuid4_leaf_ebx *ebx,
@@ -261,7 +261,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
        unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
        unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
        unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
        unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
 #endif
 
@@ -383,14 +383,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
        if (new_l2) {
                l2 = new_l2;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
                cpu_llc_id[cpu] = l2_id;
 #endif
        }
 
        if (new_l3) {
                l3 = new_l3;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
                cpu_llc_id[cpu] = l3_id;
 #endif
        }
@@ -729,7 +729,7 @@ static void __cpuexit cache_remove_dev(struct sys_device * sys_dev)
        return;
 }
 
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
                                        unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
@@ -747,7 +747,7 @@ static int cacheinfo_cpu_callback(struct notifier_block *nfb,
        return NOTIFY_OK;
 }
 
-static struct notifier_block cacheinfo_cpu_notifier =
+static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier =
 {
     .notifier_call = cacheinfo_cpu_callback,
 };
index a19fcb262dbb64d30120f2e974d6ed7c9a31177b..f54a15268ed730d7a24aba1668d140636fb6d88e 100644 (file)
@@ -18,7 +18,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
         * applications want to get the raw CPUID data, they should access
         * /dev/cpu/<cpu_nr>/cpuid instead.
         */
-       static char *x86_cap_flags[] = {
+       static const char * const x86_cap_flags[] = {
                /* Intel-defined */
                "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
                "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
@@ -62,7 +62,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
        };
-       static char *x86_power_flags[] = {
+       static const char * const x86_power_flags[] = {
                "ts",   /* temperature sensor */
                "fid",  /* frequency id control */
                "vid",  /* voltage id control */
@@ -109,9 +109,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
                seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
 #ifdef CONFIG_X86_HT
        if (c->x86_max_cores * smp_num_siblings > 1) {
-               seq_printf(m, "physical id\t: %d\n", phys_proc_id[n]);
+               seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
                seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n]));
-               seq_printf(m, "core id\t\t: %d\n", cpu_core_id[n]);
+               seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
                seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
        }
 #endif
index 1d9a4abcdfc71f034c1c822853049280b32de0a2..f6dfa9fb675c1bfc5eba75ba74317ba2815b6da1 100644 (file)
@@ -183,7 +183,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac
        return NOTIFY_OK;
 }
 
-static struct notifier_block cpuid_class_cpu_notifier =
+static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
 {
        .notifier_call = cpuid_class_cpu_callback,
 };
index e6e4506e749acbe079f9196fab1a05a6e333a753..fbdb933251b643b8e726feefe8e350205dbd7396 100644 (file)
@@ -83,6 +83,12 @@ VM_MASK              = 0x00020000
 #define resume_kernel          restore_nocheck
 #endif
 
+#ifdef CONFIG_VM86
+#define resume_userspace_sig   check_userspace
+#else
+#define resume_userspace_sig   resume_userspace
+#endif
+
 #define SAVE_ALL \
        cld; \
        pushl %es; \
@@ -211,6 +217,7 @@ ret_from_exception:
        preempt_stop
 ret_from_intr:
        GET_THREAD_INFO(%ebp)
+check_userspace:
        movl EFLAGS(%esp), %eax         # mix EFLAGS and CS
        movb CS(%esp), %al
        testl $(VM_MASK | 3), %eax
@@ -263,7 +270,12 @@ sysenter_past_esp:
        pushl $(__USER_CS)
        CFI_ADJUST_CFA_OFFSET 4
        /*CFI_REL_OFFSET cs, 0*/
-       pushl $SYSENTER_RETURN
+       /*
+        * Push current_thread_info()->sysenter_return to the stack.
+        * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
+        * pushed above; +8 corresponds to copy_thread's esp0 setting.
+        */
+       pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
        CFI_ADJUST_CFA_OFFSET 4
        CFI_REL_OFFSET eip, 0
 
@@ -415,7 +427,7 @@ work_notifysig:                             # deal with pending signals and
                                        # vm86-space
        xorl %edx, %edx
        call do_notify_resume
-       jmp resume_userspace
+       jmp resume_userspace_sig
 
        ALIGN
 work_notifysig_v86:
@@ -428,7 +440,7 @@ work_notifysig_v86:
        movl %eax, %esp
        xorl %edx, %edx
        call do_notify_resume
-       jmp resume_userspace
+       jmp resume_userspace_sig
 #endif
 
        # perform syscall exit tracing
@@ -515,7 +527,7 @@ ENTRY(irq_entries_start)
  .if vector
        CFI_ADJUST_CFA_OFFSET -4
  .endif
-1:     pushl $vector-256
+1:     pushl $~(vector)
        CFI_ADJUST_CFA_OFFSET 4
        jmp common_interrupt
 .data
@@ -535,7 +547,7 @@ common_interrupt:
 #define BUILD_INTERRUPT(name, nr)      \
 ENTRY(name)                            \
        RING0_INT_FRAME;                \
-       pushl $nr-256;                  \
+       pushl $~(nr);                   \
        CFI_ADJUST_CFA_OFFSET 4;        \
        SAVE_ALL;                       \
        movl %esp,%eax;                 \
index 061533e0cb5e8efa284258c3619665a287a786e5..c703bc7b08800cc4f82298deb0817aa05178f4c7 100644 (file)
@@ -53,8 +53,8 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
  */
 fastcall unsigned int do_IRQ(struct pt_regs *regs)
 {      
-       /* high bits used in ret_from_ code */
-       int irq = regs->orig_eax & 0xff;
+       /* high bit used in ret_from_ code */
+       int irq = ~regs->orig_eax;
 #ifdef CONFIG_4KSTACKS
        union irq_ctx *curctx, *irqctx;
        u32 *isp;
@@ -100,8 +100,8 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
                 * softirq checks work in the hardirq context.
                 */
                irqctx->tinfo.preempt_count =
-                       irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK |
-                       curctx->tinfo.preempt_count & SOFTIRQ_MASK;
+                       (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+                       (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
 
                asm volatile(
                        "       xchgl   %%ebx,%%esp      \n"
index 7a328230e540f8b073e5d0306f457a1a9bffbc28..d022cb8fd7251ccc79e3eeb89a8210e5f92b85d1 100644 (file)
@@ -266,7 +266,7 @@ static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long acti
        return NOTIFY_OK;
 }
 
-static struct notifier_block msr_class_cpu_notifier =
+static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
 {
        .notifier_call = msr_class_cpu_callback,
 };
index 321f5fd26e75062ef2119710e9ce1b7047ac662c..9bf590cefc7d4d55d89fde0561ec6a5fb70d5192 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/mutex.h>
 #include <linux/pci.h>
 
 #include <linux/scx200.h>
@@ -45,11 +46,19 @@ static struct pci_driver scx200_pci_driver = {
        .probe = scx200_probe,
 };
 
-static DEFINE_SPINLOCK(scx200_gpio_config_lock);
+static DEFINE_MUTEX(scx200_gpio_config_lock);
 
-static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+static void __devinit scx200_init_shadow(void)
 {
        int bank;
+
+       /* read the current values driven on the GPIO signals */
+       for (bank = 0; bank < 2; ++bank)
+               scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+}
+
+static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
        unsigned base;
 
        if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
@@ -63,10 +72,7 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
                }
 
                scx200_gpio_base = base;
-
-               /* read the current values driven on the GPIO signals */
-               for (bank = 0; bank < 2; ++bank)
-                       scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+               scx200_init_shadow();
 
        } else {
                /* find the base of the Configuration Block */
@@ -87,12 +93,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
        return 0;
 }
 
-u32 scx200_gpio_configure(int index, u32 mask, u32 bits)
+u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
 {
        u32 config, new_config;
-       unsigned long flags;
 
-       spin_lock_irqsave(&scx200_gpio_config_lock, flags);
+       mutex_lock(&scx200_gpio_config_lock);
 
        outl(index, scx200_gpio_base + 0x20);
        config = inl(scx200_gpio_base + 0x24);
@@ -100,45 +105,11 @@ u32 scx200_gpio_configure(int index, u32 mask, u32 bits)
        new_config = (config & mask) | bits;
        outl(new_config, scx200_gpio_base + 0x24);
 
-       spin_unlock_irqrestore(&scx200_gpio_config_lock, flags);
+       mutex_unlock(&scx200_gpio_config_lock);
 
        return config;
 }
 
-#if 0
-void scx200_gpio_dump(unsigned index)
-{
-       u32 config = scx200_gpio_configure(index, ~0, 0);
-       printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config);
-       
-       if (config & 1) 
-               printk(" OE"); /* output enabled */
-       else
-               printk(" TS"); /* tristate */
-       if (config & 2) 
-               printk(" PP"); /* push pull */
-       else
-               printk(" OD"); /* open drain */
-       if (config & 4) 
-               printk(" PUE"); /* pull up enabled */
-       else
-               printk(" PUD"); /* pull up disabled */
-       if (config & 8) 
-               printk(" LOCKED"); /* locked */
-       if (config & 16) 
-               printk(" LEVEL"); /* level input */
-       else
-               printk(" EDGE"); /* edge input */
-       if (config & 32) 
-               printk(" HI"); /* trigger on rising edge */
-       else
-               printk(" LO"); /* trigger on falling edge */
-       if (config & 64) 
-               printk(" DEBOUNCE"); /* debounce */
-       printk("\n");
-}
-#endif  /*  0  */
-
 static int __init scx200_init(void)
 {
        printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
@@ -159,10 +130,3 @@ EXPORT_SYMBOL(scx200_gpio_base);
 EXPORT_SYMBOL(scx200_gpio_shadow);
 EXPORT_SYMBOL(scx200_gpio_configure);
 EXPORT_SYMBOL(scx200_cb_base);
-
-/*
-    Local variables:
-        compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules"
-        c-basic-offset: 8
-    End:
-*/
index 5c352c3a9e7fa00492da1ff1c33a28ad0b5217c7..43002cfb40c4e2811bf5006c9126fa2ff1cfdd46 100644 (file)
@@ -351,7 +351,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
                        goto give_sigsegv;
        }
 
-       restorer = &__kernel_sigreturn;
+       restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
        if (ka->sa.sa_flags & SA_RESTORER)
                restorer = ka->sa.sa_restorer;
 
@@ -447,7 +447,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
                goto give_sigsegv;
 
        /* Set up to return from userspace.  */
-       restorer = &__kernel_rt_sigreturn;
+       restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
        if (ka->sa.sa_flags & SA_RESTORER)
                restorer = ka->sa.sa_restorer;
        err |= __put_user(restorer, &frame->pretcode);
index bce5470ecb42e6e91ac47510366676fe13637a57..89e7315e539c49a5f8f19a3479ae21774075a374 100644 (file)
@@ -67,12 +67,6 @@ int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
 #endif
 
-/* Package ID of each logical CPU */
-int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-
-/* Core ID of each logical CPU */
-int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-
 /* Last level cache ID of each logical CPU */
 int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
 
@@ -454,10 +448,12 @@ cpumask_t cpu_coregroup_map(int cpu)
        struct cpuinfo_x86 *c = cpu_data + cpu;
        /*
         * For perf, we return last level cache shared map.
-        * TBD: when power saving sched policy is added, we will return
-        *      cpu_core_map when power saving policy is enabled
+        * And for power savings, we return cpu_core_map
         */
-       return c->llc_shared_map;
+       if (sched_mc_power_savings || sched_smt_power_savings)
+               return cpu_core_map[cpu];
+       else
+               return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
@@ -473,8 +469,8 @@ set_cpu_sibling_map(int cpu)
 
        if (smp_num_siblings > 1) {
                for_each_cpu_mask(i, cpu_sibling_setup_map) {
-                       if (phys_proc_id[cpu] == phys_proc_id[i] &&
-                           cpu_core_id[cpu] == cpu_core_id[i]) {
+                       if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+                           c[cpu].cpu_core_id == c[i].cpu_core_id) {
                                cpu_set(i, cpu_sibling_map[cpu]);
                                cpu_set(cpu, cpu_sibling_map[i]);
                                cpu_set(i, cpu_core_map[cpu]);
@@ -501,7 +497,7 @@ set_cpu_sibling_map(int cpu)
                        cpu_set(i, c[cpu].llc_shared_map);
                        cpu_set(cpu, c[i].llc_shared_map);
                }
-               if (phys_proc_id[cpu] == phys_proc_id[i]) {
+               if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
                        cpu_set(i, cpu_core_map[cpu]);
                        cpu_set(cpu, cpu_core_map[i]);
                        /*
@@ -1056,6 +1052,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
        struct warm_boot_cpu_info info;
        struct work_struct task;
        int     apicid, ret;
+       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
        apicid = x86_cpu_to_apicid[cpu];
        if (apicid == BAD_APICID) {
@@ -1063,6 +1060,19 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
                goto exit;
        }
 
+       /*
+        * the CPU isn't initialized at boot time, allocate gdt table here.
+        * cpu_init will initialize it
+        */
+       if (!cpu_gdt_descr->address) {
+               cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+               if (!cpu_gdt_descr->address) {
+                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+                       ret = -ENOMEM;
+                       goto exit;
+               }
+       }
+
        info.complete = &done;
        info.apicid = apicid;
        info.cpu = cpu;
@@ -1340,8 +1349,8 @@ remove_siblinginfo(int cpu)
                cpu_clear(cpu, cpu_sibling_map[sibling]);
        cpus_clear(cpu_sibling_map[cpu]);
        cpus_clear(cpu_core_map[cpu]);
-       phys_proc_id[cpu] = BAD_APICID;
-       cpu_core_id[cpu] = BAD_APICID;
+       c[cpu].phys_proc_id = 0;
+       c[cpu].cpu_core_id = 0;
        cpu_clear(cpu, cpu_sibling_setup_map);
 }
 
index 0bada1870bdf5691631e10558bf43cbb350691a1..c60419dee0180b7b19f94b2e40452c73ab54c493 100644 (file)
@@ -2,6 +2,8 @@
  * linux/arch/i386/kernel/sysenter.c
  *
  * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
  *
  * This file contains the needed initializations to support sysenter.
  */
 #include <linux/gfp.h>
 #include <linux/string.h>
 #include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/module.h>
 
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
 #include <asm/pgtable.h>
 #include <asm/unistd.h>
 
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = 1;
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+
+static int __init vdso_setup(char *s)
+{
+       vdso_enabled = simple_strtoul(s, NULL, 0);
+
+       return 1;
+}
+
+__setup("vdso=", vdso_setup);
+
 extern asmlinkage void sysenter_entry(void);
 
 void enable_sep_cpu(void)
@@ -45,23 +66,122 @@ void enable_sep_cpu(void)
  */
 extern const char vsyscall_int80_start, vsyscall_int80_end;
 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+static void *syscall_page;
 
 int __init sysenter_setup(void)
 {
-       void *page = (void *)get_zeroed_page(GFP_ATOMIC);
+       syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
 
-       __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
+#ifdef CONFIG_COMPAT_VDSO
+       __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY);
+       printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+#else
+       /*
+        * In the non-compat case the ELF coredumping code needs the fixmap:
+        */
+       __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO);
+#endif
 
        if (!boot_cpu_has(X86_FEATURE_SEP)) {
-               memcpy(page,
+               memcpy(syscall_page,
                       &vsyscall_int80_start,
                       &vsyscall_int80_end - &vsyscall_int80_start);
                return 0;
        }
 
-       memcpy(page,
+       memcpy(syscall_page,
               &vsyscall_sysenter_start,
               &vsyscall_sysenter_end - &vsyscall_sysenter_start);
 
        return 0;
 }
+
+static struct page *syscall_nopage(struct vm_area_struct *vma,
+                               unsigned long adr, int *type)
+{
+       struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
+       get_page(p);
+       return p;
+}
+
+/* Prevent VMA merging */
+static void syscall_vma_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct syscall_vm_ops = {
+       .close = syscall_vma_close,
+       .nopage = syscall_nopage,
+};
+
+/* Defined in vsyscall-sysenter.S */
+extern void SYSENTER_RETURN;
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+       struct vm_area_struct *vma;
+       struct mm_struct *mm = current->mm;
+       unsigned long addr;
+       int ret;
+
+       down_write(&mm->mmap_sem);
+       addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+       if (IS_ERR_VALUE(addr)) {
+               ret = addr;
+               goto up_fail;
+       }
+
+       vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
+       if (!vma) {
+               ret = -ENOMEM;
+               goto up_fail;
+       }
+
+       vma->vm_start = addr;
+       vma->vm_end = addr + PAGE_SIZE;
+       /* MAYWRITE to allow gdb to COW and set breakpoints */
+       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
+       vma->vm_flags |= mm->def_flags;
+       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+       vma->vm_ops = &syscall_vm_ops;
+       vma->vm_mm = mm;
+
+       ret = insert_vm_struct(mm, vma);
+       if (ret)
+               goto free_vma;
+
+       current->mm->context.vdso = (void *)addr;
+       current_thread_info()->sysenter_return =
+                                   (void *)VDSO_SYM(&SYSENTER_RETURN);
+       mm->total_vm++;
+up_fail:
+       up_write(&mm->mmap_sem);
+       return ret;
+
+free_vma:
+       kmem_cache_free(vm_area_cachep, vma);
+       goto up_fail;
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+       if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+               return "[vdso]";
+       return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+       return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+       return 0;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+       return 0;
+}
index 296355292c7c56433df1c849e93851985a7798d3..e2e281d4bcc80ee7bc3b090e75914606231e6224 100644 (file)
 
 static struct i386_cpu cpu_devices[NR_CPUS];
 
-int arch_register_cpu(int num){
-       struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-       int node = cpu_to_node(num);
-       if (node_online(node))
-               parent = &node_devices[node].node;
-#endif /* CONFIG_NUMA */
-
+int arch_register_cpu(int num)
+{
        /*
         * CPU0 cannot be offlined due to several
         * restrictions and assumptions in kernel. This basically
@@ -50,21 +43,13 @@ int arch_register_cpu(int num){
        if (!num)
                cpu_devices[num].cpu.no_control = 1;
 
-       return register_cpu(&cpu_devices[num].cpu, num, parent);
+       return register_cpu(&cpu_devices[num].cpu, num);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
 void arch_unregister_cpu(int num) {
-       struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-       int node = cpu_to_node(num);
-       if (node_online(node))
-               parent = &node_devices[node].node;
-#endif /* CONFIG_NUMA */
-
-       return unregister_cpu(&cpu_devices[num].cpu, parent);
+       return unregister_cpu(&cpu_devices[num].cpu);
 }
 EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
@@ -74,16 +59,13 @@ EXPORT_SYMBOL(arch_unregister_cpu);
 
 #ifdef CONFIG_NUMA
 #include <linux/mmzone.h>
-#include <asm/node.h>
-
-struct i386_node node_devices[MAX_NUMNODES];
 
 static int __init topology_init(void)
 {
        int i;
 
        for_each_online_node(i)
-               arch_register_node(i);
+               register_one_node(i);
 
        for_each_present_cpu(i)
                arch_register_cpu(i);
index 3b62baa6a371def2c0ae3e426fbf2410f3fc83a2..1a36d26e15eb0c6d370832ee166dd0fba3700f59 100644 (file)
@@ -42,10 +42,10 @@ __kernel_vsyscall:
        /* 7: align return point with nop's to make disassembly easier */
        .space 7,0x90
 
-       /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */
+       /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
        jmp .Lenter_kernel
        /* 16: System call normal return point is here! */
-       .globl SYSENTER_RETURN  /* Symbol used by entry.S.  */
+       .globl SYSENTER_RETURN  /* Symbol used by sysenter.c  */
 SYSENTER_RETURN:
        pop %ebp
 .Lpop_ebp:
index 98699ca6e52d7b2febba30a086741a4f663da426..e26975fc68b650105cbff7b0568ba7b856770085 100644 (file)
@@ -7,7 +7,7 @@
 
 SECTIONS
 {
-  . = VSYSCALL_BASE + SIZEOF_HEADERS;
+  . = VDSO_PRELINK + SIZEOF_HEADERS;
 
   .hash           : { *(.hash) }               :text
   .dynsym         : { *(.dynsym) }
@@ -20,7 +20,7 @@ SECTIONS
      For the layouts to match, we need to skip more than enough
      space for the dynamic symbol table et al.  If this amount
      is insufficient, ld -shared will barf.  Just increase it here.  */
-  . = VSYSCALL_BASE + 0x400;
+  . = VDSO_PRELINK + 0x400;
 
   .text           : { *(.text) }               :text =0x90909090
   .note                  : { *(.note.*) }              :text :note
index 0e225054e2229cf9543f70ede0b5794e45ddb73d..defc6ebbd56517ac054deeeafbf17239de36cc5d 100644 (file)
@@ -5,10 +5,10 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
-#include <asm/acpi.h>
 #include <asm/arch_hooks.h>
 #include <asm/voyager.h>
 #include <asm/e820.h>
+#include <asm/io.h>
 #include <asm/setup.h>
 
 void __init pre_intr_init_hook(void)
@@ -27,8 +27,7 @@ void __init intr_init_hook(void)
        smp_intr_init();
 #endif
 
-       if (!acpi_ioapic)
-               setup_irq(2, &irq2);
+       setup_irq(2, &irq2);
 }
 
 void __init pre_setup_arch_hook(void)
index bf19513f0cea227126407b0e22cbb0bf614fa7b3..f84b16e007ff86fb73185482efcbca7be8eb4ebd 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/poison.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
@@ -654,7 +655,7 @@ void __init mem_init(void)
  */
 #ifdef CONFIG_MEMORY_HOTPLUG
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
        struct pglist_data *pgdata = &contig_page_data;
        struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
@@ -753,7 +754,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
-               memset((void *)addr, 0xcc, PAGE_SIZE);
+               memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
index 0887b34bc59b987543c10b92ae53d73b83c361ef..353a836ed63c03f363acbcba49e4c54d9258f855 100644 (file)
@@ -229,8 +229,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
        if (PageHighMem(page))
                return;
        if (!enable)
-               mutex_debug_check_no_locks_freed(page_address(page),
-                                                numpages * PAGE_SIZE);
+               debug_check_no_locks_freed(page_address(page),
+                                          numpages * PAGE_SIZE);
 
        /* the return value is ignored - the calls cannot fail,
         * large pages are disabled at boot time.
index 18318749884b012f1f2e3d879a633a7ed21802bd..a56df7bf022da153c2efe0035526e296df419d0a 100644 (file)
@@ -374,6 +374,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
        def_bool y
        depends on NEED_MULTIPLE_NODES
 
+config HAVE_ARCH_NODEDATA_EXTENSION
+       def_bool y
+       depends on NUMA
+
 config IA32_SUPPORT
        bool "Support for Linux/x86 binaries"
        help
index 859fb37ff49b682799b65b1384b9d05cbc576b88..303a9afcf2a1ce7b5845335c36591f23c3c3b360 100644 (file)
@@ -959,7 +959,7 @@ remove_palinfo_proc_entries(unsigned int hcpu)
        }
 }
 
-static int palinfo_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit palinfo_cpu_callback(struct notifier_block *nfb,
                                                                unsigned long action,
                                                                void *hcpu)
 {
@@ -978,7 +978,7 @@ static int palinfo_cpu_callback(struct notifier_block *nfb,
        return NOTIFY_OK;
 }
 
-static struct notifier_block palinfo_cpu_notifier =
+static struct notifier_block __cpuinitdata palinfo_cpu_notifier =
 {
        .notifier_call = palinfo_cpu_callback,
        .priority = 0,
index 663a186ad194a1abfaa7b00b72a7b93afbcbe107..9065f0f01ba3e7f70b48abf36af96cf46b205514 100644 (file)
@@ -572,7 +572,7 @@ static struct file_operations salinfo_data_fops = {
 };
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int
+static int __devinit
 salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
        unsigned int i, cpu = (unsigned long)hcpu;
@@ -673,9 +673,7 @@ salinfo_init(void)
        salinfo_timer.function = &salinfo_timeout;
        add_timer(&salinfo_timer);
 
-#ifdef CONFIG_HOTPLUG_CPU
-       register_cpu_notifier(&salinfo_cpu_notifier);
-#endif
+       register_hotcpu_notifier(&salinfo_cpu_notifier);
 
        return 0;
 }
index 879edb51d1e0e1b2c29698815127dc8b215f7067..5511d9c6c70152fe67b1025c1fe8ea8044926760 100644 (file)
 #include <asm/numa.h>
 #include <asm/cpu.h>
 
-#ifdef CONFIG_NUMA
-static struct node *sysfs_nodes;
-#endif
 static struct ia64_cpu *sysfs_cpus;
 
 int arch_register_cpu(int num)
 {
-       struct node *parent = NULL;
-       
-#ifdef CONFIG_NUMA
-       parent = &sysfs_nodes[cpu_to_node(num)];
-#endif /* CONFIG_NUMA */
-
 #if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
        /*
         * If CPEI cannot be re-targetted, and this is
@@ -48,21 +39,14 @@ int arch_register_cpu(int num)
                sysfs_cpus[num].cpu.no_control = 1;
 #endif
 
-       return register_cpu(&sysfs_cpus[num].cpu, num, parent);
+       return register_cpu(&sysfs_cpus[num].cpu, num);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
 void arch_unregister_cpu(int num)
 {
-       struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
-       int node = cpu_to_node(num);
-       parent = &sysfs_nodes[node];
-#endif /* CONFIG_NUMA */
-
-       return unregister_cpu(&sysfs_cpus[num].cpu, parent);
+       return unregister_cpu(&sysfs_cpus[num].cpu);
 }
 EXPORT_SYMBOL(arch_register_cpu);
 EXPORT_SYMBOL(arch_unregister_cpu);
@@ -74,17 +58,11 @@ static int __init topology_init(void)
        int i, err = 0;
 
 #ifdef CONFIG_NUMA
-       sysfs_nodes = kzalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL);
-       if (!sysfs_nodes) {
-               err = -ENOMEM;
-               goto out;
-       }
-
        /*
         * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes?
         */
        for_each_online_node(i) {
-               if ((err = register_node(&sysfs_nodes[i], i, 0)))
+               if ((err = register_one_node(i)))
                        goto out;
        }
 #endif
@@ -426,7 +404,7 @@ static int __cpuinit cache_remove_dev(struct sys_device * sys_dev)
  * When a cpu is hot-plugged, do a check and initiate
  * cache kobject if necessary
  */
-static int cache_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cache_cpu_callback(struct notifier_block *nfb,
                unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
@@ -444,7 +422,7 @@ static int cache_cpu_callback(struct notifier_block *nfb,
        return NOTIFY_OK;
 }
 
-static struct notifier_block cache_cpu_notifier =
+static struct notifier_block __cpuinitdata cache_cpu_notifier =
 {
        .notifier_call = cache_cpu_callback
 };
index b6bcc9fa36030690b073440781e48398847ae547..525b082eb6619821fed5224fccadabaf79002cf4 100644 (file)
@@ -33,7 +33,6 @@
  */
 struct early_node_data {
        struct ia64_node_data *node_data;
-       pg_data_t *pgdat;
        unsigned long pernode_addr;
        unsigned long pernode_size;
        struct bootmem_data bootmem_data;
@@ -46,6 +45,8 @@ struct early_node_data {
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
 static nodemask_t memory_less_mask __initdata;
 
+static pg_data_t *pgdat_list[MAX_NUMNODES];
+
 /*
  * To prevent cache aliasing effects, align per-node structures so that they
  * start at addresses that are strided by node number.
@@ -99,7 +100,7 @@ static int __init build_node_maps(unsigned long start, unsigned long len,
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
  * called yet.  Note that node 0 will also count all non-existent cpus.
  */
-static int __init early_nr_cpus_node(int node)
+static int __meminit early_nr_cpus_node(int node)
 {
        int cpu, n = 0;
 
@@ -114,7 +115,7 @@ static int __init early_nr_cpus_node(int node)
  * compute_pernodesize - compute size of pernode data
  * @node: the node id.
  */
-static unsigned long __init compute_pernodesize(int node)
+static unsigned long __meminit compute_pernodesize(int node)
 {
        unsigned long pernodesize = 0, cpus;
 
@@ -175,13 +176,13 @@ static void __init fill_pernode(int node, unsigned long pernode,
        pernode += PERCPU_PAGE_SIZE * cpus;
        pernode += node * L1_CACHE_BYTES;
 
-       mem_data[node].pgdat = __va(pernode);
+       pgdat_list[node] = __va(pernode);
        pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
 
        mem_data[node].node_data = __va(pernode);
        pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
 
-       mem_data[node].pgdat->bdata = bdp;
+       pgdat_list[node]->bdata = bdp;
        pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
 
        cpu_data = per_cpu_node_setup(cpu_data, node);
@@ -268,7 +269,7 @@ static int __init find_pernode_space(unsigned long start, unsigned long len,
 static int __init free_node_bootmem(unsigned long start, unsigned long len,
                                    int node)
 {
-       free_bootmem_node(mem_data[node].pgdat, start, len);
+       free_bootmem_node(pgdat_list[node], start, len);
 
        return 0;
 }
@@ -287,7 +288,7 @@ static void __init reserve_pernode_space(void)
        int node;
 
        for_each_online_node(node) {
-               pg_data_t *pdp = mem_data[node].pgdat;
+               pg_data_t *pdp = pgdat_list[node];
 
                if (node_isset(node, memory_less_mask))
                        continue;
@@ -307,6 +308,17 @@ static void __init reserve_pernode_space(void)
        }
 }
 
+static void __meminit scatter_node_data(void)
+{
+       pg_data_t **dst;
+       int node;
+
+       for_each_online_node(node) {
+               dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs;
+               memcpy(dst, pgdat_list, sizeof(pgdat_list));
+       }
+}
+
 /**
  * initialize_pernode_data - fixup per-cpu & per-node pointers
  *
@@ -317,17 +329,10 @@ static void __init reserve_pernode_space(void)
  */
 static void __init initialize_pernode_data(void)
 {
-       pg_data_t *pgdat_list[MAX_NUMNODES];
        int cpu, node;
 
-       for_each_online_node(node)
-               pgdat_list[node] = mem_data[node].pgdat;
+       scatter_node_data();
 
-       /* Copy the pg_data_t list to each node and init the node field */
-       for_each_online_node(node) {
-               memcpy(mem_data[node].node_data->pg_data_ptrs, pgdat_list,
-                      sizeof(pgdat_list));
-       }
 #ifdef CONFIG_SMP
        /* Set the node_data pointer for each per-cpu struct */
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
@@ -372,7 +377,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
        if (bestnode == -1)
                bestnode = anynode;
 
-       ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, pernodesize,
+       ptr = __alloc_bootmem_node(pgdat_list[bestnode], pernodesize,
                PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 
        return ptr;
@@ -476,7 +481,7 @@ void __init find_memory(void)
                pernodesize = mem_data[node].pernode_size;
                map = pernode + pernodesize;
 
-               init_bootmem_node(mem_data[node].pgdat,
+               init_bootmem_node(pgdat_list[node],
                                  map>>PAGE_SHIFT,
                                  bdp->node_boot_start>>PAGE_SHIFT,
                                  bdp->node_low_pfn);
@@ -786,3 +791,21 @@ void __init paging_init(void)
 
        zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
+
+pg_data_t *arch_alloc_nodedata(int nid)
+{
+       unsigned long size = compute_pernodesize(nid);
+
+       return kzalloc(size, GFP_KERNEL);
+}
+
+void arch_free_nodedata(pg_data_t *pgdat)
+{
+       kfree(pgdat);
+}
+
+void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
+{
+       pgdat_list[update_node] = update_pgdat;
+       scatter_node_data();
+}
index 11f08001f8c26e9aaa820b37d379d78eedac0bbd..38306e98f04b2877a076ae7ac4912dee3cebfbab 100644 (file)
@@ -652,7 +652,7 @@ void online_page(struct page *page)
        num_physpages++;
 }
 
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
        pg_data_t *pgdat;
        struct zone *zone;
@@ -660,7 +660,7 @@ int add_memory(u64 start, u64 size)
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;
 
-       pgdat = NODE_DATA(0);
+       pgdat = NODE_DATA(nid);
 
        zone = pgdat->node_zones + ZONE_NORMAL;
        ret = __add_pages(zone, start_pfn, nr_pages);
@@ -671,7 +671,6 @@ int add_memory(u64 start, u64 size)
 
        return ret;
 }
-EXPORT_SYMBOL_GPL(add_memory);
 
 int remove_memory(u64 start, u64 size)
 {
index dc8e2b6967135f7d92f7236b9601c09cc349cc14..677c6c0fd66188b69e8df0194c2f3654360f00f4 100644 (file)
@@ -27,7 +27,7 @@ static void unregister_intr_pda(struct sn_irq_info *sn_irq_info);
 int sn_force_interrupt_flag = 1;
 extern int sn_ioif_inited;
 struct list_head **sn_irq_lh;
-static spinlock_t sn_irq_info_lock = SPIN_LOCK_UNLOCKED; /* non-IRQ lock */
+static DEFINE_SPINLOCK(sn_irq_info_lock); /* non-IRQ lock */
 
 u64 sn_intr_alloc(nasid_t local_nasid, int local_widget,
                                     struct sn_irq_info *sn_irq_info,
index 3cd3c2988a4875f6da0c8e2fb8af921264beba42..1ff483c8a4c9749c99d8f70b052ee9d805cc3aa6 100644 (file)
@@ -275,7 +275,7 @@ static int __init topology_init(void)
        int i;
 
        for_each_present_cpu(i)
-               register_cpu(&cpu_devices[i], i, NULL);
+               register_cpu(&cpu_devices[i], i);
 
        return 0;
 }
index 298f82fe8440a43d1a8140ed8a256d44595ed6d8..9096a5ea42298d4f409ce249f7f5e4ec3d9195c8 100644 (file)
@@ -446,7 +446,7 @@ static int __init topology_init(void)
        int ret;
 
        for_each_present_cpu(cpu) {
-               ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu, NULL);
+               ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu);
                if (ret)
                        printk(KERN_WARNING "topology_init: register_cpu %d "
                               "failed (%d)\n", cpu, ret);
index 2e8e52c135e6edc17613f24b4e79f29563ef8519..70cf09afdf565c573d5750f70723deed249d98bd 100644 (file)
@@ -367,7 +367,7 @@ void mipsmt_prepare_cpus(void)
        dvpe();
        dmt();
 
-       freeIPIq.lock = SPIN_LOCK_UNLOCKED;
+       spin_lock_init(&freeIPIq.lock);
 
        /*
         * We probably don't have as many VPEs as we do SMP "CPUs",
@@ -375,7 +375,7 @@ void mipsmt_prepare_cpus(void)
         */
        for (i=0; i<NR_CPUS; i++) {
                IPIQ[i].head = IPIQ[i].tail = NULL;
-               IPIQ[i].lock = SPIN_LOCK_UNLOCKED;
+               spin_lock_init(&IPIQ[i].lock);
                IPIQ[i].depth = 0;
                ipi_timer_latch[i] = 0;
        }
index 3ba040050e4ca1d85a28448accea66e50a0ce0fb..068b20d822e7b9c672983135a0c038e11e491839 100644 (file)
@@ -26,11 +26,10 @@ static struct cpu cpu_devices[NR_CPUS] __read_mostly;
 
 static int __init topology_init(void)
 {
-       struct node *parent = NULL;
        int num;
 
        for_each_present_cpu(num) {
-               register_cpu(&cpu_devices[num], num, parent);
+               register_cpu(&cpu_devices[num], num);
        }
        return 0;
 }
index e5a44812441ac121b980e3d296f19e09651df045..0932a62a1c9637d6834c4872f7f5bca7e2b5d11d 100644 (file)
@@ -215,7 +215,7 @@ int __init ppc_init(void)
 
        /* register CPU devices */
        for_each_possible_cpu(i)
-               register_cpu(&cpu_devices[i], i, NULL);
+               register_cpu(&cpu_devices[i], i);
 
        /* call platform init */
        if (ppc_md.init != NULL) {
index 5bc2585c8036d97e08b8fff0c174cce01ad61c14..4662b580efa1648939eb9e03ec932dd6b63680e2 100644 (file)
@@ -279,7 +279,7 @@ static void unregister_cpu_online(unsigned int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int sysfs_cpu_notify(struct notifier_block *self,
+static int __devinit sysfs_cpu_notify(struct notifier_block *self,
                                      unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned int)(long)hcpu;
@@ -297,30 +297,19 @@ static int sysfs_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block sysfs_cpu_nb = {
+static struct notifier_block __devinitdata sysfs_cpu_nb = {
        .notifier_call  = sysfs_cpu_notify,
 };
 
 /* NUMA stuff */
 
 #ifdef CONFIG_NUMA
-static struct node node_devices[MAX_NUMNODES];
-
 static void register_nodes(void)
 {
        int i;
 
-       for (i = 0; i < MAX_NUMNODES; i++) {
-               if (node_online(i)) {
-                       int p_node = parent_node(i);
-                       struct node *parent = NULL;
-
-                       if (p_node != i)
-                               parent = &node_devices[p_node];
-
-                       register_node(&node_devices[i], i, parent);
-               }
-       }
+       for (i = 0; i < MAX_NUMNODES; i++)
+               register_one_node(i);
 }
 
 int sysfs_add_device_to_node(struct sys_device *dev, int nid)
@@ -359,23 +348,13 @@ static SYSDEV_ATTR(physical_id, 0444, show_physical_id, NULL);
 static int __init topology_init(void)
 {
        int cpu;
-       struct node *parent = NULL;
 
        register_nodes();
-
        register_cpu_notifier(&sysfs_cpu_nb);
 
        for_each_possible_cpu(cpu) {
                struct cpu *c = &per_cpu(cpu_devices, cpu);
 
-#ifdef CONFIG_NUMA
-               /* The node to which a cpu belongs can't be known
-                * until the cpu is made present.
-                */
-               parent = NULL;
-               if (cpu_present(cpu))
-                       parent = &node_devices[cpu_to_node(cpu)];
-#endif
                /*
                 * For now, we just see if the system supports making
                 * the RTAS calls for CPU hotplug.  But, there may be a
@@ -387,7 +366,7 @@ static int __init topology_init(void)
                        c->no_control = 1;
 
                if (cpu_online(cpu) || (c->no_control == 0)) {
-                       register_cpu(c, cpu, parent);
+                       register_cpu(c, cpu);
 
                        sysdev_create_file(&c->sysdev, &attr_physical_id);
                }
index 9e30f968c184af90dfad58f5ff357edfe81e0ed3..d454caada265d599addca7877ced6e5f7a318e04 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/idr.h>
 #include <linux/nodemask.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -90,7 +91,7 @@ void free_initmem(void)
 
        addr = (unsigned long)__init_begin;
        for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
-               memset((void *)addr, 0xcc, PAGE_SIZE);
+               memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                free_page(addr);
index 69f3b9a20beb768a093735b800628e18f3d1645f..089d939a0b3e9829b037cc95d38e27e2180746f9 100644 (file)
@@ -114,15 +114,20 @@ void online_page(struct page *page)
        num_physpages++;
 }
 
-int __devinit add_memory(u64 start, u64 size)
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
+{
+       return hot_add_scn_to_nid(start);
+}
+#endif
+
+int __devinit arch_add_memory(int nid, u64 start, u64 size)
 {
        struct pglist_data *pgdata;
        struct zone *zone;
-       int nid;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
 
-       nid = hot_add_scn_to_nid(start);
        pgdata = NODE_DATA(nid);
 
        start = (unsigned long)__va(start);
index aa98cb3b59d82a3c9ff49f35089ba0244eca60d6..fbe23933f73192662b1780ff7d99c728a4eddaf9 100644 (file)
@@ -334,7 +334,7 @@ out:
        return nid;
 }
 
-static int cpu_numa_callback(struct notifier_block *nfb,
+static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
                             unsigned long action,
                             void *hcpu)
 {
@@ -609,14 +609,15 @@ static void __init *careful_allocation(int nid, unsigned long size,
        return (void *)ret;
 }
 
+static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+       .notifier_call = cpu_numa_callback,
+       .priority = 1 /* Must run before sched domains notifier. */
+};
+
 void __init do_init_bootmem(void)
 {
        int nid;
        unsigned int i;
-       static struct notifier_block ppc64_numa_nb = {
-               .notifier_call = cpu_numa_callback,
-               .priority = 1 /* Must run before sched domains notifier. */
-       };
 
        min_low_pfn = 0;
        max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
index 3068b429b031149ce8e4a4741bd9a29a3f52c1ec..a656d810a44aa6b6f34690deefd4dc8c65c617ab 100644 (file)
@@ -2203,7 +2203,7 @@ void spu_init_csa(struct spu_state *csa)
 
        memset(lscsa, 0, sizeof(struct spu_lscsa));
        csa->lscsa = lscsa;
-       csa->register_lock = SPIN_LOCK_UNLOCKED;
+       spin_lock_init(&csa->register_lock);
 
        /* Set LS pages reserved to allow for user-space mapping. */
        for (p = lscsa->ls; p < lscsa->ls + LS_SIZE; p += PAGE_SIZE)
index 047f954a89eb933206396688cd45dcf3b28d2a12..93e7505debc59813fb30dc82afadf96b963ef0bc 100644 (file)
@@ -546,7 +546,7 @@ struct pmf_device {
 };
 
 static LIST_HEAD(pmf_devices);
-static spinlock_t pmf_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pmf_lock);
 static DEFINE_MUTEX(pmf_irq_mutex);
 
 static void pmf_release_device(struct kref *kref)
index 8f2d12935b99fe47de2a273ad4b12338fbcb5f4f..45ccc687e57cbedc3395f0c113b7117a5c6c1b53 100644 (file)
@@ -35,7 +35,7 @@
  */
 
 /* EEH event workqueue setup. */
-static spinlock_t eeh_eventlist_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
 LIST_HEAD(eeh_eventlist);
 static void eeh_thread_launcher(void *);
 DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
index 74e0d31a3559cf9ca3859a0f3b55fff962966f97..615350d46b526110f55bb0c80f63c7b22e01ffe7 100644 (file)
@@ -32,7 +32,7 @@
 
 static void __iomem *mmio_nvram_start;
 static long mmio_nvram_len;
-static spinlock_t mmio_nvram_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(mmio_nvram_lock);
 
 static ssize_t mmio_nvram_read(char *buf, size_t count, loff_t *index)
 {
index 1f79e84ab464dd8a416eaab3d104e8aff11ea6d5..4b4607d89bfa6e1c105a01d85c0a674748c5fa8f 100644 (file)
@@ -475,7 +475,7 @@ int __init ppc_init(void)
 
        /* register CPU devices */
        for_each_possible_cpu(i)
-               register_cpu(&cpu_devices[i], i, NULL);
+               register_cpu(&cpu_devices[i], i);
 
        /* call platform init */
        if (ppc_md.init != NULL) {
index 9a22434a580c52f1245bedd5d0a1a6a7d81d7866..54d35c13090798810a5dfcc931446e07ec0dae8a 100644 (file)
@@ -652,7 +652,7 @@ appldata_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block appldata_nb = {
+static struct notifier_block __devinitdata appldata_nb = {
        .notifier_call = appldata_cpu_notify,
 };
 
index 343120c9223d798d6cad38703153df30f83e683b..8e03219eea76051f23d5f97edf5f61cab622ba38 100644 (file)
@@ -869,7 +869,7 @@ static int __init topology_init(void)
        int ret;
 
        for_each_possible_cpu(cpu) {
-               ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu, NULL);
+               ret = register_cpu(&per_cpu(cpu_devices, cpu), cpu);
                if (ret)
                        printk(KERN_WARNING "topology_init: register_cpu %d "
                               "failed (%d)\n", cpu, ret);
index bb229ef030f3cbf70c5afa777cd50e194fed6025..9af22116c9a2b3a78e02c8533e59660769d95723 100644 (file)
@@ -402,7 +402,7 @@ static int __init topology_init(void)
        int cpu_id;
 
        for_each_possible_cpu(cpu_id)
-               register_cpu(&cpu[cpu_id], cpu_id, NULL);
+               register_cpu(&cpu[cpu_id], cpu_id);
 
        return 0;
 }
index d2711c9c9d13954dbadaecdf0665e7b3d06cdbe1..da98d8dbcf95324284de3d601e2364cc1923c896 100644 (file)
@@ -309,7 +309,7 @@ static struct cpu cpu[1];
 
 static int __init topology_init(void)
 {
-       return register_cpu(cpu, 0, NULL);
+       return register_cpu(cpu, 0);
 }
 
 subsys_initcall(topology_init);
index a6a7d8168346c6b50b9910727ccd344f9566b5f1..116d9632002defd035dfdae91f0d80767d75d3b6 100644 (file)
@@ -537,7 +537,7 @@ static int __init topology_init(void)
        for_each_possible_cpu(i) {
                struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
                if (p) {
-                       register_cpu(p, i, NULL);
+                       register_cpu(p, i);
                        err = 0;
                }
        }
index 5c2bcf354ce64aa3d74ad75dbc7a7eb0ba0091bd..cb75a27adb517bba8b2d43289b0cb8e1a64ef81d 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/initrd.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/poison.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/kprobes.h>
@@ -1520,7 +1521,7 @@ void free_initmem(void)
                page = (addr +
                        ((unsigned long) __va(kern_base)) -
                        ((unsigned long) KERNBASE));
-               memset((void *)addr, 0xcc, PAGE_SIZE);
+               memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
                p = virt_to_page(page);
 
                ClearPageReserved(p);
index 7290e72b9a34d2a56be1a90e3c96989d9df1d844..22cac4487b57fd1e9aef98769071a8e4f02f0d81 100644 (file)
@@ -588,7 +588,7 @@ END(common_interrupt)
  */            
        .macro apicinterrupt num,func
        INTR_FRAME
-       pushq $\num-256
+       pushq $~(\num)
        CFI_ADJUST_CFA_OFFSET 8
        interrupt \func
        jmp ret_from_intr
index 59518d4d43589a9b99724ecefbd6b6162b43ac28..3be0a7e4bf08d1b0109c946d1470e0c0292bc214 100644 (file)
@@ -115,8 +115,8 @@ skip:
  */
 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 {      
-       /* high bits used in ret_from_ code  */
-       unsigned irq = regs->orig_rax & 0xff;
+       /* high bit used in ret_from_ code  */
+       unsigned irq = ~regs->orig_rax;
 
        exit_idle();
        irq_enter();
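The entry.S and do_IRQ() hunks above change how the vector is stashed in orig_rax: instead of "vector - 256" (recovered by masking with 0xff), the stub now pushes the bitwise complement of the vector and the handler complements it again; the stored value keeps its top bits set, which the ret_from_ path's high-bit test relies on. A stand-alone sketch of the arithmetic, using 0x31 as an illustrative vector:

    #include <stdio.h>

    int main(void)
    {
            unsigned long vector = 0x31;    /* illustrative vector number */

            /* what "pushq $~(\num)" leaves in orig_rax: top bits set, so
             * the return path can still tell it apart by its high bit */
            unsigned long orig_rax = ~vector;

            /* what do_IRQ() now does: unsigned irq = ~regs->orig_rax */
            unsigned int irq = ~orig_rax;

            printf("orig_rax = %#lx, recovered irq = %#x\n", orig_rax, irq);
            return 0;
    }
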
index acd5816b1a6f214d2dfc5253d674ed9d81492fe5..88845674c661a39feefc5b35bb5c5dd9b2255be8 100644 (file)
@@ -629,7 +629,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 #endif
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
+static __cpuinit int
 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
@@ -647,7 +647,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
        return NOTIFY_OK;
 }
 
-static struct notifier_block mce_cpu_notifier = {
+static struct notifier_block __cpuinitdata mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
 };
 
index acee4bc3f6fa945799d194b17cfbf4cac20285e6..5a1c0a3bf87262c027c327773628e93e58194188 100644 (file)
@@ -135,10 +135,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
 
        cpu = smp_processor_id();
        /*
-        * orig_rax contains the interrupt vector - 256.
+        * orig_rax contains the negated interrupt vector.
         * Use that to determine where the sender put the data.
         */
-       sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
+       sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
        f = &per_cpu(flush_state, sender);
 
        if (!cpu_isset(cpu, f->flush_cpumask))
index 4e9755179ecf57022bcb8e4e6adaa76d62612010..540c0ccbcccc8953c95ee6b6c2baf5ad11e44aaf 100644 (file)
@@ -455,10 +455,12 @@ cpumask_t cpu_coregroup_map(int cpu)
        struct cpuinfo_x86 *c = cpu_data + cpu;
        /*
         * For perf, we return last level cache shared map.
-        * TBD: when power saving sched policy is added, we will return
-        *      cpu_core_map when power saving policy is enabled
+        * And for power savings, we return cpu_core_map
         */
-       return c->llc_shared_map;
+       if (sched_mc_power_savings || sched_smt_power_savings)
+               return cpu_core_map[cpu];
+       else
+               return c->llc_shared_map;
 }
 
 /* representing cpus for which sibling maps can be computed */
index 02add1d1dfa88aa0843b0fffc245399fd5e4adbf..95bd232ff0cf3d37fc3b5838e0b477666115aebb 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
 #include <linux/memory_hotplug.h>
@@ -506,8 +507,6 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 /*
  * Memory hotplug specific functions
  */
-#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
-
 void online_page(struct page *page)
 {
        ClearPageReserved(page);
@@ -517,31 +516,17 @@ void online_page(struct page *page)
        num_physpages++;
 }
 
-#ifndef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_MEMORY_HOTPLUG
 /*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
+ * XXX: memory_add_physaddr_to_nid() finds the node id for a physical address
+ *     coming in through the sysfs probe interface. When ACPI signals a
+ *     hot-add event the node id can be found by searching the DSDT, but the
+ *     probe interface carries no node id, so return node 0 for now.
  */
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
 {
-       int err = -EIO;
-       unsigned long pfn;
-       unsigned long total = 0, mem = 0;
-       for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-               if (pfn_valid(pfn)) {
-                       online_page(pfn_to_page(pfn));
-                       err = 0;
-                       mem++;
-               }
-               total++;
-       }
-       if (!err) {
-               z->spanned_pages += total;
-               z->present_pages += mem;
-               z->zone_pgdat->node_spanned_pages += total;
-               z->zone_pgdat->node_present_pages += mem;
-       }
-       return err;
+       return 0;
 }
 #endif
 
@@ -549,9 +534,9 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
 {
-       struct pglist_data *pgdat = NODE_DATA(0);
+       struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -568,7 +553,7 @@ error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
 }
-EXPORT_SYMBOL_GPL(add_memory);
+EXPORT_SYMBOL_GPL(arch_add_memory);
 
 int remove_memory(u64 start, u64 size)
 {
@@ -576,7 +561,33 @@ int remove_memory(u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(remove_memory);
 
-#endif
+#else /* CONFIG_MEMORY_HOTPLUG */
+/*
+ * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
+ * just online the pages.
+ */
+int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+{
+       int err = -EIO;
+       unsigned long pfn;
+       unsigned long total = 0, mem = 0;
+       for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+               if (pfn_valid(pfn)) {
+                       online_page(pfn_to_page(pfn));
+                       err = 0;
+                       mem++;
+               }
+               total++;
+       }
+       if (!err) {
+               z->spanned_pages += total;
+               z->present_pages += mem;
+               z->zone_pgdat->node_spanned_pages += total;
+               z->zone_pgdat->node_present_pages += mem;
+       }
+       return err;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;
@@ -650,7 +661,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
-               memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
+               memset((void *)(addr & ~(PAGE_SIZE-1)),
+                       POISON_FREE_INITMEM, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
@@ -658,7 +670,8 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 void free_initmem(void)
 {
-       memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
+       memset(__initdata_begin, POISON_FREE_INITDATA,
+               __initdata_end - __initdata_begin);
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
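Taken together, the x86_64 hunks above re-plumb memory hot-add to be node-aware: the generic add_memory() now takes a node id, the architecture provides arch_add_memory(), and memory_add_physaddr_to_nid() supplies a node for addresses arriving through the sysfs probe path (0 for now on this arch, per the comment above). A hedged sketch of the resulting call chain; the caller and headers shown are indicative only and error handling is omitted:

    #include <linux/types.h>
    #include <linux/memory_hotplug.h>

    /* illustrative hot-add path after this series (not a verbatim copy) */
    static int example_hot_add(u64 start, u64 size)
    {
            /* probe path: no node id from firmware, so ask the arch helper;
             * the x86_64 stub above simply returns node 0 for now */
            int nid = memory_add_physaddr_to_nid(start);

            /* the generic layer picks the node, then calls down into
             * arch_add_memory(nid, start, size), which ends in __add_pages() */
            return add_memory(nid, start, size);
    }
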
index 937d81f62f43e73e90e75e1280a0ef899cbbadbe..fe14909f45e057983c1100c5dab33359d718d7c6 100644 (file)
@@ -29,7 +29,7 @@
 
 extern volatile unsigned long wall_jiffies;
 
-spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
 
index 225d64d73f04c8700f9dc939bcbb26b0e56bf3e5..27e409089a7b9837ce33927236f21ea89323fdf1 100644 (file)
@@ -461,7 +461,7 @@ void show_code(unsigned int *pc)
        }
 }
 
-spinlock_t die_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(die_lock);
 
 void die(const char * str, struct pt_regs * regs, long err)
 {
index c04422a502da8ebd0cb03cc76d5962433fde6b0b..eee03a3876a3f2f875260b64c1c0bc51267219a3 100644 (file)
@@ -3403,7 +3403,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
 }
 
 
-static struct notifier_block blk_cpu_notifier = {
+static struct notifier_block __devinitdata blk_cpu_notifier = {
        .notifier_call  = blk_cpu_notify,
 };
 
@@ -3541,9 +3541,7 @@ int __init blk_dev_init(void)
                INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
        open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
-#ifdef CONFIG_HOTPLUG_CPU
-       register_cpu_notifier(&blk_cpu_notifier);
-#endif
+       register_hotcpu_notifier(&blk_cpu_notifier);
 
        blk_max_low_pfn = max_low_pfn;
        blk_max_pfn = max_pfn;
index 94b8d820c51229bcb3f669f9f176720c20e5af42..610d2cc02cf8f500e8eed0158df3f3967cb520cf 100644 (file)
@@ -328,7 +328,7 @@ config ACPI_CONTAINER
 config ACPI_HOTPLUG_MEMORY
        tristate "Memory Hotplug"
        depends on ACPI
-       depends on MEMORY_HOTPLUG || X86_64
+       depends on MEMORY_HOTPLUG
        default n
        help
          This driver adds supports for ACPI Memory Hotplug.  This driver
index e0a95ba72371562fc1c250ebcadd0b170eb50881..1012284ff4f7eb4fc55d0ed1a3e200e7ef84b3a0 100644 (file)
@@ -57,6 +57,7 @@ MODULE_LICENSE("GPL");
 
 static int acpi_memory_device_add(struct acpi_device *device);
 static int acpi_memory_device_remove(struct acpi_device *device, int type);
+static int acpi_memory_device_start(struct acpi_device *device);
 
 static struct acpi_driver acpi_memory_device_driver = {
        .name = ACPI_MEMORY_DEVICE_DRIVER_NAME,
@@ -65,48 +66,79 @@ static struct acpi_driver acpi_memory_device_driver = {
        .ops = {
                .add = acpi_memory_device_add,
                .remove = acpi_memory_device_remove,
+               .start = acpi_memory_device_start,
                },
 };
 
+struct acpi_memory_info {
+       struct list_head list;
+       u64 start_addr;         /* Memory Range start physical addr */
+       u64 length;             /* Memory Range length */
+       unsigned short caching; /* memory cache attribute */
+       unsigned short write_protect;   /* memory read/write attribute */
+       unsigned int enabled:1;
+};
+
 struct acpi_memory_device {
        acpi_handle handle;
        unsigned int state;     /* State of the memory device */
-       unsigned short caching; /* memory cache attribute */
-       unsigned short write_protect;   /* memory read/write attribute */
-       u64 start_addr;         /* Memory Range start physical addr */
-       u64 length;             /* Memory Range length */
+       struct list_head res_list;
 };
 
+static acpi_status
+acpi_memory_get_resource(struct acpi_resource *resource, void *context)
+{
+       struct acpi_memory_device *mem_device = context;
+       struct acpi_resource_address64 address64;
+       struct acpi_memory_info *info, *new;
+       acpi_status status;
+
+       status = acpi_resource_to_address64(resource, &address64);
+       if (ACPI_FAILURE(status) ||
+           (address64.resource_type != ACPI_MEMORY_RANGE))
+               return AE_OK;
+
+       list_for_each_entry(info, &mem_device->res_list, list) {
+               /* Can we combine the resource range information? */
+               if ((info->caching == address64.info.mem.caching) &&
+                   (info->write_protect == address64.info.mem.write_protect) &&
+                   (info->start_addr + info->length == address64.minimum)) {
+                       info->length += address64.address_length;
+                       return AE_OK;
+               }
+       }
+
+       new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL);
+       if (!new)
+               return AE_ERROR;
+
+       INIT_LIST_HEAD(&new->list);
+       new->caching = address64.info.mem.caching;
+       new->write_protect = address64.info.mem.write_protect;
+       new->start_addr = address64.minimum;
+       new->length = address64.address_length;
+       list_add_tail(&new->list, &mem_device->res_list);
+
+       return AE_OK;
+}
+
 static int
 acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
 {
        acpi_status status;
-       struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-       struct acpi_resource *resource = NULL;
-       struct acpi_resource_address64 address64;
+       struct acpi_memory_info *info, *n;
 
        ACPI_FUNCTION_TRACE("acpi_memory_get_device_resources");
 
-       /* Get the range from the _CRS */
-       status = acpi_get_current_resources(mem_device->handle, &buffer);
-       if (ACPI_FAILURE(status))
-               return_VALUE(-EINVAL);
-
-       resource = (struct acpi_resource *)buffer.pointer;
-       status = acpi_resource_to_address64(resource, &address64);
-       if (ACPI_SUCCESS(status)) {
-               if (address64.resource_type == ACPI_MEMORY_RANGE) {
-                       /* Populate the structure */
-                       mem_device->caching = address64.info.mem.caching;
-                       mem_device->write_protect =
-                           address64.info.mem.write_protect;
-                       mem_device->start_addr = address64.minimum;
-                       mem_device->length = address64.address_length;
-               }
+       status = acpi_walk_resources(mem_device->handle, METHOD_NAME__CRS,
+                                    acpi_memory_get_resource, mem_device);
+       if (ACPI_FAILURE(status)) {
+               list_for_each_entry_safe(info, n, &mem_device->res_list, list)
+                       kfree(info);
+               return -EINVAL;
        }
 
-       acpi_os_free(buffer.pointer);
-       return_VALUE(0);
+       return 0;
 }
 
 static int
@@ -181,7 +213,9 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
 
 static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 {
-       int result;
+       int result, num_enabled = 0;
+       struct acpi_memory_info *info;
+       int node;
 
        ACPI_FUNCTION_TRACE("acpi_memory_enable_device");
 
@@ -194,15 +228,35 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
                return result;
        }
 
+       node = acpi_get_node(mem_device->handle);
        /*
         * Tell the VM there is more memory here...
         * Note: Assume that this function returns zero on success
+        * There is no rollback (i.e. memory-hot-remove) function for a
+        * failed memory hot-add yet.
         */
-       result = add_memory(mem_device->start_addr, mem_device->length);
-       if (result) {
+       list_for_each_entry(info, &mem_device->res_list, list) {
+               u64 start_pfn, end_pfn;
+
+               start_pfn = info->start_addr >> PAGE_SHIFT;
+               end_pfn = (info->start_addr + info->length - 1) >> PAGE_SHIFT;
+
+               if (pfn_valid(start_pfn) || pfn_valid(end_pfn)) {
+                       /* already enabled. try next area */
+                       num_enabled++;
+                       continue;
+               }
+
+               result = add_memory(node, info->start_addr, info->length);
+               if (result)
+                       continue;
+               info->enabled = 1;
+               num_enabled++;
+       }
+       if (!num_enabled) {
                ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n"));
                mem_device->state = MEMORY_INVALID_STATE;
-               return result;
+               return -EINVAL;
        }
 
        return result;
@@ -246,8 +300,7 @@ static int acpi_memory_powerdown_device(struct acpi_memory_device *mem_device)
 static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
 {
        int result;
-       u64 start = mem_device->start_addr;
-       u64 len = mem_device->length;
+       struct acpi_memory_info *info, *n;
 
        ACPI_FUNCTION_TRACE("acpi_memory_disable_device");
 
@@ -255,10 +308,13 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
         * Ask the VM to offline this memory range.
         * Note: Assume that this function returns zero on success
         */
-       result = remove_memory(start, len);
-       if (result) {
-               ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n"));
-               return_VALUE(result);
+       list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
+               if (info->enabled) {
+                       result = remove_memory(info->start_addr, info->length);
+                       if (result)
+                               return result;
+               }
+               kfree(info);
        }
 
        /* Power-off and eject the device */
@@ -356,6 +412,7 @@ static int acpi_memory_device_add(struct acpi_device *device)
                return_VALUE(-ENOMEM);
        memset(mem_device, 0, sizeof(struct acpi_memory_device));
 
+       INIT_LIST_HEAD(&mem_device->res_list);
        mem_device->handle = device->handle;
        sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
        sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
@@ -391,6 +448,25 @@ static int acpi_memory_device_remove(struct acpi_device *device, int type)
        return_VALUE(0);
 }
 
+static int acpi_memory_device_start (struct acpi_device *device)
+{
+       struct acpi_memory_device *mem_device;
+       int result = 0;
+
+       ACPI_FUNCTION_TRACE("acpi_memory_device_start");
+
+       mem_device = acpi_driver_data(device);
+
+       if (!acpi_memory_check_device(mem_device)) {
+               /* call add_memory func */
+               result = acpi_memory_enable_device(mem_device);
+               if (result)
+                       ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+                               "Error in acpi_memory_enable_device\n"));
+       }
+       return_VALUE(result);
+}
+
 /*
  * Helper function to check for memory device
  */
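The new acpi_memory_get_resource() callback above walks every _CRS entry and coalesces address ranges that are contiguous and share the same caching and write-protect attributes. A small worked example of the merge condition, with made-up addresses:

    #include <stdio.h>

    struct range { unsigned long long start, len; };

    int main(void)
    {
            /* hypothetical _CRS entries with identical caching/write-protect */
            struct range a = { 0x100000000ULL, 0x10000000ULL };
            struct range b = { 0x110000000ULL, 0x10000000ULL };

            /* merge condition used by acpi_memory_get_resource() above */
            if (a.start + a.len == b.start) {
                    a.len += b.len;
                    printf("merged: start=%#llx len=%#llx\n", a.start, a.len);
            }
            return 0;
    }
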
index e2c1a16078c990113d4813024e0abf170e04928b..13d6d5bdea264f16dbe6239d2cde0a5b5d5daa20 100644 (file)
@@ -254,5 +254,18 @@ int acpi_get_pxm(acpi_handle h)
        } while (ACPI_SUCCESS(status));
        return -1;
 }
-
 EXPORT_SYMBOL(acpi_get_pxm);
+
+int acpi_get_node(acpi_handle *handle)
+{
+       int pxm, node = -1;
+
+       ACPI_FUNCTION_TRACE("acpi_get_node");
+
+       pxm = acpi_get_pxm(handle);
+       if (pxm >= 0)
+               node = acpi_map_pxm_to_node(pxm);
+
+       return_VALUE(node);
+}
+EXPORT_SYMBOL(acpi_get_node);
index f2eeaf9dc56a4fffe9dff197a5899588d02f5098..1bca86edf5708b174220ee41fa5fb05bfaeb064a 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/errno.h>
 #include <linux/atm.h>
 #include <linux/atmdev.h>
@@ -754,7 +755,7 @@ static void process_txdone_queue (struct fs_dev *dev, struct queue *q)
                        fs_kfree_skb (skb);
 
                        fs_dprintk (FS_DEBUG_ALLOC, "Free trans-d: %p\n", td); 
-                       memset (td, 0x12, sizeof (struct FS_BPENTRY));
+                       memset (td, ATM_POISON_FREE, sizeof(struct FS_BPENTRY));
                        kfree (td);
                        break;
                default:
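The sparc64 and x86_64 free_initmem() hunks above and this firestream change all switch from open-coded fill bytes to named constants from the new <linux/poison.h> (the dmapool hunk further down drops its local POOL_POISON_* defines for the same reason). A small sketch of the pattern; the "was" values are simply the literals replaced in these hunks:

    #include <linux/types.h>
    #include <linux/string.h>
    #include <linux/poison.h>

    /* before this series each caller hard-coded its own fill byte; now the
     * named constants come from one header so debuggers see one pattern */
    static void example_poison(void *initmem, size_t initmem_len,
                               void *td, size_t td_len)
    {
            memset(initmem, POISON_FREE_INITMEM, initmem_len);  /* was 0xcc */
            memset(td, ATM_POISON_FREE, td_len);                /* was 0x12 */
    }
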
index dd712b24ec91900f3b75506af6ecf6467f7ba160..4bef76a2f3f2a7611ff0dcbca79c8d774d7a7255 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/cpu.h>
 #include <linux/topology.h>
 #include <linux/device.h>
+#include <linux/node.h>
 
 #include "base.h"
 
@@ -57,13 +58,12 @@ static void __devinit register_cpu_control(struct cpu *cpu)
 {
        sysdev_create_file(&cpu->sysdev, &attr_online);
 }
-void unregister_cpu(struct cpu *cpu, struct node *root)
+void unregister_cpu(struct cpu *cpu)
 {
        int logical_cpu = cpu->sysdev.id;
 
-       if (root)
-               sysfs_remove_link(&root->sysdev.kobj,
-                                 kobject_name(&cpu->sysdev.kobj));
+       unregister_cpu_under_node(logical_cpu, cpu_to_node(logical_cpu));
+
        sysdev_remove_file(&cpu->sysdev, &attr_online);
 
        sysdev_unregister(&cpu->sysdev);
@@ -109,23 +109,21 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
  *
  * Initialize and register the CPU device.
  */
-int __devinit register_cpu(struct cpu *cpu, int num, struct node *root)
+int __devinit register_cpu(struct cpu *cpu, int num)
 {
        int error;
-
        cpu->node_id = cpu_to_node(num);
        cpu->sysdev.id = num;
        cpu->sysdev.cls = &cpu_sysdev_class;
 
        error = sysdev_register(&cpu->sysdev);
-       if (!error && root)
-               error = sysfs_create_link(&root->sysdev.kobj,
-                                         &cpu->sysdev.kobj,
-                                         kobject_name(&cpu->sysdev.kobj));
+
        if (!error && !cpu->no_control)
                register_cpu_control(cpu);
        if (!error)
                cpu_sys_devices[num] = &cpu->sysdev;
+       if (!error)
+               register_cpu_under_node(num, cpu_to_node(num));
 
 #ifdef CONFIG_KEXEC
        if (!error)
@@ -145,5 +143,13 @@ EXPORT_SYMBOL_GPL(get_cpu_sysdev);
 
 int __init cpu_dev_init(void)
 {
-       return sysdev_class_register(&cpu_sysdev_class);
+       int err;
+
+       err = sysdev_class_register(&cpu_sysdev_class);
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       if (!err)
+               err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
+#endif
+
+       return err;
 }
index e2f64f91ed0558fcc6b9a70feb9620170685c485..33c5cce1560b261b767fa56faa410e19ed47d5e4 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/dmapool.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 
 /*
  * Pool allocator ... wraps the dma_alloc_coherent page allocator, so
@@ -35,8 +36,6 @@ struct dma_page {     /* cacheable header for 'allocation' bytes */
 };
 
 #define        POOL_TIMEOUT_JIFFIES    ((100 /* msec */ * HZ) / 1000)
-#define        POOL_POISON_FREED       0xa7    /* !inuse */
-#define        POOL_POISON_ALLOCATED   0xa9    /* !initted */
 
 static DECLARE_MUTEX (pools_lock);
 
index dd547af4681a50c87dc976cea8c22ee497f9a133..c6b7d9c4b65115054f3f9cd3591c7dbf2c75142d 100644 (file)
@@ -306,11 +306,13 @@ static ssize_t
 memory_probe_store(struct class *class, const char *buf, size_t count)
 {
        u64 phys_addr;
+       int nid;
        int ret;
 
        phys_addr = simple_strtoull(buf, NULL, 0);
 
-       ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+       nid = memory_add_physaddr_to_nid(phys_addr);
+       ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
 
        if (ret)
                count = ret;
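With the hunk above, a physical address written to the memory "probe" attribute is first mapped to a node before add_memory() is called. A hedged user-space illustration of driving that interface; the sysfs path is the conventional location of the probe attribute and should be treated as an assumption here:

    #include <stdio.h>

    int main(void)
    {
            /* assumed path of the class attribute used for memory probing */
            const char *probe = "/sys/devices/system/memory/probe";
            const char *phys_addr = "0x100000000";  /* illustrative address */

            FILE *f = fopen(probe, "w");
            if (!f) {
                    perror("fopen");
                    return 1;
            }
            /* memory_probe_store() parses this with simple_strtoull(), maps
             * it to a node and hot-adds one memory section at that address */
            if (fputs(phys_addr, f) == EOF)
                    perror("fputs");
            fclose(f);
            return 0;
    }
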
index c80c3aeed004a558de9cf9db0632191cf1c16c99..eae2bdc183bb7741a85c23f086d156c95b1d0317 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/cpumask.h>
 #include <linux/topology.h>
 #include <linux/nodemask.h>
+#include <linux/cpu.h>
 
 static struct sysdev_class node_class = {
        set_kset_name("node"),
@@ -190,6 +191,66 @@ void unregister_node(struct node *node)
        sysdev_unregister(&node->sysdev);
 }
 
+struct node node_devices[MAX_NUMNODES];
+
+/*
+ * register cpu under node
+ */
+int register_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+       if (node_online(nid)) {
+               struct sys_device *obj = get_cpu_sysdev(cpu);
+               if (!obj)
+                       return 0;
+               return sysfs_create_link(&node_devices[nid].sysdev.kobj,
+                                        &obj->kobj,
+                                        kobject_name(&obj->kobj));
+        }
+
+       return 0;
+}
+
+int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+       if (node_online(nid)) {
+               struct sys_device *obj = get_cpu_sysdev(cpu);
+               if (obj)
+                       sysfs_remove_link(&node_devices[nid].sysdev.kobj,
+                                        kobject_name(&obj->kobj));
+       }
+       return 0;
+}
+
+int register_one_node(int nid)
+{
+       int error = 0;
+       int cpu;
+
+       if (node_online(nid)) {
+               int p_node = parent_node(nid);
+               struct node *parent = NULL;
+
+               if (p_node != nid)
+                       parent = &node_devices[p_node];
+
+               error = register_node(&node_devices[nid], nid, parent);
+
+               /* link cpu under this node */
+               for_each_present_cpu(cpu) {
+                       if (cpu_to_node(cpu) == nid)
+                               register_cpu_under_node(cpu, nid);
+               }
+       }
+
+       return error;
+
+}
+
+void unregister_one_node(int nid)
+{
+       unregister_node(&node_devices[nid]);
+}
+
 static int __init register_node_type(void)
 {
        return sysdev_class_register(&node_class);
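The driver-core hunks above (cpu.c and node.c) move the cpu-under-node sysfs linking out of the callers: register_cpu() loses its struct node argument, and the link is now created by register_cpu_under_node() using cpu_to_node(). A minimal sketch of what an architecture's topology_init() looks like after this series, matching the arch hunks earlier in the diff; the per-cpu array name is illustrative:

    #include <linux/cpu.h>
    #include <linux/cpumask.h>
    #include <linux/init.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(struct cpu, example_cpu_devices);

    static int __init example_topology_init(void)
    {
            int cpu;

            for_each_possible_cpu(cpu) {
                    /* two-argument form: the driver core now calls
                     * register_cpu_under_node(cpu, cpu_to_node(cpu)) itself */
                    register_cpu(&per_cpu(example_cpu_devices, cpu), cpu);
            }
            return 0;
    }
    subsys_initcall(example_topology_init);
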
index 8c52421cbc545b54a6ce1c84c0cf1bf3f734c751..c2d621632383306a493fd5f60d5d3b9b55721f70 100644 (file)
@@ -107,7 +107,7 @@ static int __cpuinit topology_remove_dev(struct sys_device * sys_dev)
        return 0;
 }
 
-static int topology_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit topology_cpu_callback(struct notifier_block *nfb,
                unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
@@ -125,7 +125,7 @@ static int topology_cpu_callback(struct notifier_block *nfb,
        return NOTIFY_OK;
 }
 
-static struct notifier_block topology_cpu_notifier =
+static struct notifier_block __cpuinitdata topology_cpu_notifier =
 {
        .notifier_call = topology_cpu_callback,
 };
index 3610c57295533c81807bbddfd174ab1ede354551..410d70cb76fbee1a6b97f329ea78d9e16788c4fc 100644 (file)
@@ -939,12 +939,35 @@ config MWAVE
 config SCx200_GPIO
        tristate "NatSemi SCx200 GPIO Support"
        depends on SCx200
+       select NSC_GPIO
        help
          Give userspace access to the GPIO pins on the National
          Semiconductor SCx200 processors.
 
          If compiled as a module, it will be called scx200_gpio.
 
+config PC8736x_GPIO
+       tristate "NatSemi PC8736x GPIO Support"
+       depends on X86
+       default SCx200_GPIO     # mostly N
+       select NSC_GPIO         # needed for support routines
+       help
+         Give userspace access to the GPIO pins on the National
+         Semiconductor PC-8736x (x=[03456]) SuperIO chip.  The chip
+         has multiple functional units, including several managed by
+         the hwmon/pc87360 driver.  Tested with a PC-87366.
+
+         If compiled as a module, it will be called pc8736x_gpio.
+
+config NSC_GPIO
+       tristate "NatSemi Base GPIO Support"
+       # selected by SCx200_GPIO and PC8736x_GPIO
+       # what about 2 selectors differing: m != y
+       help
+         Common support used (and needed) by the scx200_gpio and
+         pc8736x_gpio drivers.  If those drivers are built as
+         modules, this one will be too, named nsc_gpio.
+
 config CS5535_GPIO
        tristate "AMD CS5535/CS5536 GPIO (Geode Companion Device)"
        depends on X86_32
index 524105597ea7d7ff82339902afc0c15cda691447..6e0f4469d8bbdbf37197a7e722131acf48378bf7 100644 (file)
@@ -82,6 +82,8 @@ obj-$(CONFIG_PPDEV)           += ppdev.o
 obj-$(CONFIG_NWBUTTON)         += nwbutton.o
 obj-$(CONFIG_NWFLASH)          += nwflash.o
 obj-$(CONFIG_SCx200_GPIO)      += scx200_gpio.o
+obj-$(CONFIG_PC8736x_GPIO)     += pc8736x_gpio.o
+obj-$(CONFIG_NSC_GPIO)         += nsc_gpio.o
 obj-$(CONFIG_CS5535_GPIO)      += cs5535_gpio.o
 obj-$(CONFIG_GPIO_VR41XX)      += vr41xx_giu.o
 obj-$(CONFIG_TANBAC_TB0219)    += tb0219.o
index cfa7922cb431f500788b6d0ba6360312137eee03..d73be4c2db8a9c5c6fffe250d36e2e3593ac53f5 100644 (file)
@@ -329,9 +329,8 @@ static int __devinit agp_sgi_init(void)
 
 static void __devexit agp_sgi_cleanup(void)
 {
-       if (sgi_tioca_agp_bridges)
-               kfree(sgi_tioca_agp_bridges);
-       sgi_tioca_agp_bridges=NULL;
+       kfree(sgi_tioca_agp_bridges);
+       sgi_tioca_agp_bridges = NULL;
 }
 
 module_init(agp_sgi_init);
index 6543b9a14c42e677038bd001309e35143c5a10a1..d117cc9971922f5ec8a6c713da435221c4eb2af7 100644 (file)
@@ -43,7 +43,7 @@ typedef struct drm_mem_stats {
        unsigned long bytes_freed;
 } drm_mem_stats_t;
 
-static spinlock_t drm_mem_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(drm_mem_lock);
 static unsigned long drm_ram_available = 0;    /* In pages */
 static unsigned long drm_ram_used = 0;
 static drm_mem_stats_t drm_mem_stats[] =
index b7f17457b4243bb1c18e8dea11b6b7149a22e7fe..78a81a4a99c5c60fda46b05497351e211d94df1f 100644 (file)
@@ -557,7 +557,7 @@ via_init_dmablit(drm_device_t *dev)
                blitq->num_outstanding = 0;
                blitq->is_active = 0;
                blitq->aborting = 0;
-               blitq->blit_lock = SPIN_LOCK_UNLOCKED;
+               spin_lock_init(&blitq->blit_lock);
                for (j=0; j<VIA_NUM_BLIT_SLOTS; ++j) {
                        DRM_INIT_WAITQUEUE(blitq->blit_queue + j);
                }
index 9cad8501d62c9fca6a93656c204bb1dea1e09a15..dc0602ae8503041046e794790957c33be39d9ba8 100644 (file)
@@ -80,7 +80,7 @@ static int invalid_lilo_config;
 /* The ISA boards do window flipping into the same spaces so its only sane
    with a single lock. It's still pretty efficient */
 
-static spinlock_t epca_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(epca_lock);
 
 /* -----------------------------------------------------------------------
        MAXBOARDS is typically 12, but ISA and EISA cards are restricted to 
index 8d97b3911293bf46436bd743ae9c28aaad714316..afa26b65dac3b7fb4313df3e1b6e003951bf2d9d 100644 (file)
@@ -1320,11 +1320,12 @@ static struct tty_operations hvcs_ops = {
 static int hvcs_alloc_index_list(int n)
 {
        int i;
+
        hvcs_index_list = kmalloc(n * sizeof(hvcs_index_count),GFP_KERNEL);
        if (!hvcs_index_list)
                return -ENOMEM;
        hvcs_index_count = n;
-       for(i = 0; i < hvcs_index_count; i++)
+       for (i = 0; i < hvcs_index_count; i++)
                hvcs_index_list[i] = -1;
        return 0;
 }
@@ -1332,11 +1333,9 @@ static int hvcs_alloc_index_list(int n)
 static void hvcs_free_index_list(void)
 {
        /* Paranoia check to be thorough. */
-       if (hvcs_index_list) {
-               kfree(hvcs_index_list);
-               hvcs_index_list = NULL;
-               hvcs_index_count = 0;
-       }
+       kfree(hvcs_index_list);
+       hvcs_index_list = NULL;
+       hvcs_index_count = 0;
 }
 
 static int __init hvcs_module_init(void)
index b03ddab1bef57bae9a81acd79d31c42297d364cd..83ed6ae466a56cd5549b07dba914ba426fe86c3e 100644 (file)
@@ -57,8 +57,7 @@ static int ipmi_init_msghandler(void);
 static int initialized = 0;
 
 #ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_ipmi_root = NULL;
-EXPORT_SYMBOL(proc_ipmi_root);
+static struct proc_dir_entry *proc_ipmi_root = NULL;
 #endif /* CONFIG_PROC_FS */
 
 #define MAX_EVENTS_IN_QUEUE    25
index 02a7dd7a8a55571cbf261e63a9dcdf3cf7fbb969..101c14b9b26de4b22751d4426d5474b909266d01 100644 (file)
@@ -809,7 +809,7 @@ static int ipmi_thread(void *data)
                        /* do nothing */
                }
                else if (smi_result == SI_SM_CALL_WITH_DELAY)
-                       udelay(1);
+                       schedule();
                else
                        schedule_timeout_interruptible(1);
        }
index f43c2e04eadd3878d57126b154afa2b4cbdc74aa..01247cccb89f26cbb0d52b7dee3757c36c43db64 100644 (file)
@@ -301,7 +301,7 @@ static struct tty_operations moxa_ops = {
        .tiocmset = moxa_tiocmset,
 };
 
-static spinlock_t moxa_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(moxa_lock);
 
 #ifdef CONFIG_PCI
 static int moxa_get_PCI_conf(struct pci_dev *p, int board_type, moxa_board_conf * board)
diff --git a/drivers/char/nsc_gpio.c b/drivers/char/nsc_gpio.c
new file mode 100644 (file)
index 0000000..5b91e4e
--- /dev/null
@@ -0,0 +1,142 @@
+/* linux/drivers/char/nsc_gpio.c
+
+   National Semiconductor common GPIO device-file/VFS methods.
+   Allows a user space process to control the GPIO pins.
+
+   Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
+   Copyright (c) 2005      Jim Cromie <jim.cromie@gmail.com>
+*/
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/nsc_gpio.h>
+#include <linux/platform_device.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#define NAME "nsc_gpio"
+
+void nsc_gpio_dump(struct nsc_gpio_ops *amp, unsigned index)
+{
+       /* retrieve current config w/o changing it */
+       u32 config = amp->gpio_config(index, ~0, 0);
+
+       /* user requested via 'v' command, so it's INFO */
+       dev_info(amp->dev, "io%02u: 0x%04x %s %s %s %s %s %s %s\tio:%d/%d\n",
+                index, config,
+                (config & 1) ? "OE" : "TS",      /* output-enabled/tristate */
+                (config & 2) ? "PP" : "OD",      /* push pull / open drain */
+                (config & 4) ? "PUE" : "PUD",    /* pull up enabled/disabled */
+                (config & 8) ? "LOCKED" : "",    /* locked / unlocked */
+                (config & 16) ? "LEVEL" : "EDGE",/* level/edge input */
+                (config & 32) ? "HI" : "LO",     /* trigger on rise/fall edge */
+                (config & 64) ? "DEBOUNCE" : "", /* debounce */
+
+                amp->gpio_get(index), amp->gpio_current(index));
+}
+
+ssize_t nsc_gpio_write(struct file *file, const char __user *data,
+                      size_t len, loff_t *ppos)
+{
+       unsigned m = iminor(file->f_dentry->d_inode);
+       struct nsc_gpio_ops *amp = file->private_data;
+       struct device *dev = amp->dev;
+       size_t i;
+       int err = 0;
+
+       for (i = 0; i < len; ++i) {
+               char c;
+               if (get_user(c, data + i))
+                       return -EFAULT;
+               switch (c) {
+               case '0':
+                       amp->gpio_set(m, 0);
+                       break;
+               case '1':
+                       amp->gpio_set(m, 1);
+                       break;
+               case 'O':
+                       dev_dbg(dev, "GPIO%d output enabled\n", m);
+                       amp->gpio_config(m, ~1, 1);
+                       break;
+               case 'o':
+                       dev_dbg(dev, "GPIO%d output disabled\n", m);
+                       amp->gpio_config(m, ~1, 0);
+                       break;
+               case 'T':
+                       dev_dbg(dev, "GPIO%d output is push pull\n",
+                              m);
+                       amp->gpio_config(m, ~2, 2);
+                       break;
+               case 't':
+                       dev_dbg(dev, "GPIO%d output is open drain\n",
+                              m);
+                       amp->gpio_config(m, ~2, 0);
+                       break;
+               case 'P':
+                       dev_dbg(dev, "GPIO%d pull up enabled\n", m);
+                       amp->gpio_config(m, ~4, 4);
+                       break;
+               case 'p':
+                       dev_dbg(dev, "GPIO%d pull up disabled\n", m);
+                       amp->gpio_config(m, ~4, 0);
+                       break;
+               case 'v':
+                       /* View Current pin settings */
+                       amp->gpio_dump(amp, m);
+                       break;
+               case '\n':
+                       /* end of settings string, do nothing */
+                       break;
+               default:
+                       dev_err(dev, "io%2d bad setting: chr<0x%2x>\n",
+                               m, (int)c);
+                       err++;
+               }
+       }
+       if (err)
+               return -EINVAL; /* full string handled, report error */
+
+       return len;
+}
+
+ssize_t nsc_gpio_read(struct file *file, char __user * buf,
+                     size_t len, loff_t * ppos)
+{
+       unsigned m = iminor(file->f_dentry->d_inode);
+       int value;
+       struct nsc_gpio_ops *amp = file->private_data;
+
+       value = amp->gpio_get(m);
+       if (put_user(value ? '1' : '0', buf))
+               return -EFAULT;
+
+       return 1;
+}
+
+/* common file-ops routines for both scx200_gpio and pc87360_gpio */
+EXPORT_SYMBOL(nsc_gpio_write);
+EXPORT_SYMBOL(nsc_gpio_read);
+EXPORT_SYMBOL(nsc_gpio_dump);
+
+static int __init nsc_gpio_init(void)
+{
+       printk(KERN_DEBUG NAME " initializing\n");
+       return 0;
+}
+
+static void __exit nsc_gpio_cleanup(void)
+{
+       printk(KERN_DEBUG NAME " cleanup\n");
+}
+
+module_init(nsc_gpio_init);
+module_exit(nsc_gpio_cleanup);
+
+MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>");
+MODULE_DESCRIPTION("NatSemi GPIO Common Methods");
+MODULE_LICENSE("GPL");
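nsc_gpio_write() above implements a tiny per-pin command language: '0'/'1' set the level, 'O'/'o' enable or disable the output driver, 'T'/'t' select push-pull or open-drain, 'P'/'p' control the pull-up, and 'v' dumps the pin configuration. A hedged user-space example that drives one pin through such a device node; the /dev path is illustrative and the minor number selects the pin:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            /* illustrative device node whose minor number selects GPIO pin 7 */
            int fd = open("/dev/gpio7", O_RDWR);
            char level;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            /* enable the output driver, make it push-pull, drive the pin high */
            if (write(fd, "OT1", 3) != 3)
                    perror("write");

            /* read back: nsc_gpio_read() returns a single '0' or '1' */
            if (read(fd, &level, 1) == 1)
                    printf("pin level: %c\n", level);

            close(fd);
            return 0;
    }
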
diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c
new file mode 100644 (file)
index 0000000..1c706cc
--- /dev/null
@@ -0,0 +1,340 @@
+/* linux/drivers/char/pc8736x_gpio.c
+
+   National Semiconductor PC8736x GPIO driver.  Allows a user space
+   process to play with the GPIO pins.
+
+   Copyright (c) 2005 Jim Cromie <jim.cromie@gmail.com>
+
+   adapted from linux/drivers/char/scx200_gpio.c
+   Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>,
+*/
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/mutex.h>
+#include <linux/nsc_gpio.h>
+#include <linux/platform_device.h>
+#include <asm/uaccess.h>
+
+#define DEVNAME "pc8736x_gpio"
+
+MODULE_AUTHOR("Jim Cromie <jim.cromie@gmail.com>");
+MODULE_DESCRIPTION("NatSemi PC-8736x GPIO Pin Driver");
+MODULE_LICENSE("GPL");
+
+static int major;              /* default to dynamic major */
+module_param(major, int, 0);
+MODULE_PARM_DESC(major, "Major device number");
+
+static DEFINE_MUTEX(pc8736x_gpio_config_lock);
+static unsigned pc8736x_gpio_base;
+static u8 pc8736x_gpio_shadow[4];
+
+#define SIO_BASE1       0x2E   /* 1st command-reg to check */
+#define SIO_BASE2       0x4E   /* alt command-reg to check */
+#define SIO_BASE_OFFSET 0x20
+
+#define SIO_SID                0x20    /* SuperI/O ID Register */
+#define SIO_SID_VALUE  0xe9    /* Expected value in SuperI/O ID Register */
+
+#define SIO_CF1                0x21    /* chip config, bit0 is chip enable */
+
+#define PC8736X_GPIO_SIZE      16
+
+#define SIO_UNIT_SEL   0x7     /* unit select reg */
+#define SIO_UNIT_ACT   0x30    /* unit enable */
+#define SIO_GPIO_UNIT  0x7     /* unit number of GPIO */
+#define SIO_VLM_UNIT   0x0D
+#define SIO_TMS_UNIT   0x0E
+
+/* config-space addrs to read/write each unit's runtime addr */
+#define SIO_BASE_HADDR         0x60
+#define SIO_BASE_LADDR         0x61
+
+/* GPIO config-space pin-control addresses */
+#define SIO_GPIO_PIN_SELECT    0xF0
+#define SIO_GPIO_PIN_CONFIG     0xF1
+#define SIO_GPIO_PIN_EVENT      0xF2
+
+static unsigned char superio_cmd = 0;
+static unsigned char selected_device = 0xFF;   /* bogus start val */
+
+/* GPIO port runtime access, functionality */
+static int port_offset[] = { 0, 4, 8, 10 };    /* non-uniform offsets ! */
+/* static int event_capable[] = { 1, 1, 0, 0 };   ports 2,3 are hobbled */
+
+#define PORT_OUT       0
+#define PORT_IN                1
+#define PORT_EVT_EN    2
+#define PORT_EVT_STST  3
+
+static struct platform_device *pdev;  /* use in dev_*() */
+
+static inline void superio_outb(int addr, int val)
+{
+       outb_p(addr, superio_cmd);
+       outb_p(val, superio_cmd + 1);
+}
+
+static inline int superio_inb(int addr)
+{
+       outb_p(addr, superio_cmd);
+       return inb_p(superio_cmd + 1);
+}
+
+static int pc8736x_superio_present(void)
+{
+       /* try the 2 possible values, read a hardware reg to verify */
+       superio_cmd = SIO_BASE1;
+       if (superio_inb(SIO_SID) == SIO_SID_VALUE)
+               return superio_cmd;
+
+       superio_cmd = SIO_BASE2;
+       if (superio_inb(SIO_SID) == SIO_SID_VALUE)
+               return superio_cmd;
+
+       return 0;
+}
+
+static void device_select(unsigned devldn)
+{
+       superio_outb(SIO_UNIT_SEL, devldn);
+       selected_device = devldn;
+}
+
+static void select_pin(unsigned iminor)
+{
+       /* select GPIO port/pin from device minor number */
+       device_select(SIO_GPIO_UNIT);
+       superio_outb(SIO_GPIO_PIN_SELECT,
+                    ((iminor << 1) & 0xF0) | (iminor & 0x7));
+}
+
+static inline u32 pc8736x_gpio_configure_fn(unsigned index, u32 mask, u32 bits,
+                                           u32 func_slct)
+{
+       u32 config, new_config;
+
+       mutex_lock(&pc8736x_gpio_config_lock);
+
+       device_select(SIO_GPIO_UNIT);
+       select_pin(index);
+
+       /* read current config value */
+       config = superio_inb(func_slct);
+
+       /* set new config */
+       new_config = (config & mask) | bits;
+       superio_outb(func_slct, new_config);
+
+       mutex_unlock(&pc8736x_gpio_config_lock);
+
+       return config;
+}
+
+static u32 pc8736x_gpio_configure(unsigned index, u32 mask, u32 bits)
+{
+       return pc8736x_gpio_configure_fn(index, mask, bits,
+                                        SIO_GPIO_PIN_CONFIG);
+}
+
+static int pc8736x_gpio_get(unsigned minor)
+{
+       int port, bit, val;
+
+       port = minor >> 3;
+       bit = minor & 7;
+       val = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_IN);
+       val >>= bit;
+       val &= 1;
+
+       dev_dbg(&pdev->dev, "_gpio_get(%d from %x bit %d) == val %d\n",
+               minor, pc8736x_gpio_base + port_offset[port] + PORT_IN, bit,
+               val);
+
+       return val;
+}
+
+static void pc8736x_gpio_set(unsigned minor, int val)
+{
+       int port, bit, curval;
+
+       minor &= 0x1f;
+       port = minor >> 3;
+       bit = minor & 7;
+       curval = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_OUT);
+
+       dev_dbg(&pdev->dev, "addr:%x cur:%x bit-pos:%d cur-bit:%x + new:%d -> bit-new:%d\n",
+               pc8736x_gpio_base + port_offset[port] + PORT_OUT,
+               curval, bit, (curval & ~(1 << bit)), val, (val << bit));
+
+       val = (curval & ~(1 << bit)) | (val << bit);
+
+       dev_dbg(&pdev->dev, "gpio_set(minor:%d port:%d bit:%d)"
+               " %2x -> %2x\n", minor, port, bit, curval, val);
+
+       outb_p(val, pc8736x_gpio_base + port_offset[port] + PORT_OUT);
+
+       curval = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_OUT);
+       val = inb_p(pc8736x_gpio_base + port_offset[port] + PORT_IN);
+
+       dev_dbg(&pdev->dev, "wrote %x, read: %x\n", curval, val);
+       pc8736x_gpio_shadow[port] = val;
+}
+
+static void pc8736x_gpio_set_high(unsigned index)
+{
+       pc8736x_gpio_set(index, 1);
+}
+
+static void pc8736x_gpio_set_low(unsigned index)
+{
+       pc8736x_gpio_set(index, 0);
+}
+
+static int pc8736x_gpio_current(unsigned minor)
+{
+       int port, bit;
+       minor &= 0x1f;
+       port = minor >> 3;
+       bit = minor & 7;
+       return ((pc8736x_gpio_shadow[port] >> bit) & 0x01);
+}
+
+static void pc8736x_gpio_change(unsigned index)
+{
+       pc8736x_gpio_set(index, !pc8736x_gpio_current(index));
+}
+
+static struct nsc_gpio_ops pc8736x_access = {
+       .owner          = THIS_MODULE,
+       .gpio_config    = pc8736x_gpio_configure,
+       .gpio_dump      = nsc_gpio_dump,
+       .gpio_get       = pc8736x_gpio_get,
+       .gpio_set       = pc8736x_gpio_set,
+       .gpio_set_high  = pc8736x_gpio_set_high,
+       .gpio_set_low   = pc8736x_gpio_set_low,
+       .gpio_change    = pc8736x_gpio_change,
+       .gpio_current   = pc8736x_gpio_current
+};
+
+static int pc8736x_gpio_open(struct inode *inode, struct file *file)
+{
+       unsigned m = iminor(inode);
+       file->private_data = &pc8736x_access;
+
+       dev_dbg(&pdev->dev, "open %d\n", m);
+
+       if (m > 63)
+               return -EINVAL;
+       return nonseekable_open(inode, file);
+}
+
+static struct file_operations pc8736x_gpio_fops = {
+       .owner  = THIS_MODULE,
+       .open   = pc8736x_gpio_open,
+       .write  = nsc_gpio_write,
+       .read   = nsc_gpio_read,
+};
+
+static void __init pc8736x_init_shadow(void)
+{
+       int port;
+
+       /* read the current values driven on the GPIO signals */
+       for (port = 0; port < 4; ++port)
+               pc8736x_gpio_shadow[port]
+                   = inb_p(pc8736x_gpio_base + port_offset[port]
+                           + PORT_OUT);
+
+}
+
+static int __init pc8736x_gpio_init(void)
+{
+       int rc = 0;
+
+       pdev = platform_device_alloc(DEVNAME, 0);
+       if (!pdev)
+               return -ENOMEM;
+
+       rc = platform_device_add(pdev);
+       if (rc) {
+               rc = -ENODEV;
+               goto undo_platform_dev_alloc;
+       }
+       dev_info(&pdev->dev, "NatSemi pc8736x GPIO Driver Initializing\n");
+
+       if (!pc8736x_superio_present()) {
+               rc = -ENODEV;
+               dev_err(&pdev->dev, "no device found\n");
+               goto undo_platform_dev_add;
+       }
+       pc8736x_access.dev = &pdev->dev;
+
+       /* Verify that the chip and its GPIO unit are both enabled.
+          My BIOS does this, so I take minimal action here
+        */
+       rc = superio_inb(SIO_CF1);
+       if (!(rc & 0x01)) {
+               rc = -ENODEV;
+               dev_err(&pdev->dev, "device not enabled\n");
+               goto undo_platform_dev_add;
+       }
+       device_select(SIO_GPIO_UNIT);
+       if (!superio_inb(SIO_UNIT_ACT)) {
+               rc = -ENODEV;
+               dev_err(&pdev->dev, "GPIO unit not enabled\n");
+               goto undo_platform_dev_add;
+       }
+
+       /* read the GPIO unit base addr that chip responds to */
+       pc8736x_gpio_base = (superio_inb(SIO_BASE_HADDR) << 8
+                            | superio_inb(SIO_BASE_LADDR));
+
+       if (!request_region(pc8736x_gpio_base, 16, DEVNAME)) {
+               rc = -ENODEV;
+               dev_err(&pdev->dev, "GPIO ioport %x busy\n",
+                       pc8736x_gpio_base);
+               goto undo_platform_dev_add;
+       }
+       dev_info(&pdev->dev, "GPIO ioport %x reserved\n", pc8736x_gpio_base);
+
+       rc = register_chrdev(major, DEVNAME, &pc8736x_gpio_fops);
+       if (rc < 0) {
+               dev_err(&pdev->dev, "register-chrdev failed: %d\n", rc);
+               goto undo_platform_dev_add;
+       }
+       if (!major) {
+               major = rc;
+               dev_dbg(&pdev->dev, "got dynamic major %d\n", major);
+       }
+
+       pc8736x_init_shadow();
+       return 0;
+
+undo_platform_dev_add:
+       platform_device_put(pdev);
+undo_platform_dev_alloc:
+       kfree(pdev);
+       return rc;
+}
+
+static void __exit pc8736x_gpio_cleanup(void)
+{
+       dev_dbg(&pdev->dev, " cleanup\n");
+
+       release_region(pc8736x_gpio_base, 16);
+
+       unregister_chrdev(major, DEVNAME);
+}
+
+EXPORT_SYMBOL(pc8736x_access);
+
+module_init(pc8736x_gpio_init);
+module_exit(pc8736x_gpio_cleanup);
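In pc8736x_gpio_get()/_set() above, the device minor encodes the pin: minor >> 3 selects one of the four GPIO ports (addressed through the non-uniform port_offset table) and minor & 7 selects the bit within that port. A small stand-alone sketch of that decoding with an illustrative minor and a hypothetical runtime base address:

    #include <stdio.h>

    static const int port_offset[] = { 0, 4, 8, 10 };  /* from the driver above */

    int main(void)
    {
            unsigned minor = 21;                    /* illustrative: pin 21  */
            unsigned port = (minor & 0x1f) >> 3;    /* -> port 2             */
            unsigned bit = minor & 7;               /* -> bit 5              */
            unsigned base = 0x6600;                 /* hypothetical GPIO base */

            printf("minor %u -> port %u, bit %u, PORT_IN addr %#x\n",
                   minor, port, bit, base + port_offset[port] + 1 /* PORT_IN */);
            return 0;
    }
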
index 664a6e97eb1a8012475625af6383825a3d885e20..5a280a330401f76924b57d4031457439670f504e 100644 (file)
@@ -1,4 +1,4 @@
-/* linux/drivers/char/scx200_gpio.c 
+/* linux/drivers/char/scx200_gpio.c
 
    National Semiconductor SCx200 GPIO driver.  Allows a user space
    process to play with the GPIO pins.
@@ -6,17 +6,26 @@
    Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com> */
 
 #include <linux/config.h>
+#include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/platform_device.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
+#include <linux/types.h>
+#include <linux/cdev.h>
+
 #include <linux/scx200_gpio.h>
+#include <linux/nsc_gpio.h>
 
 #define NAME "scx200_gpio"
+#define DEVNAME NAME
+
+static struct platform_device *pdev;
 
 MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
 MODULE_DESCRIPTION("NatSemi SCx200 GPIO Pin Driver");
@@ -26,70 +35,23 @@ static int major = 0;               /* default to dynamic major */
 module_param(major, int, 0);
 MODULE_PARM_DESC(major, "Major device number");
 
-static ssize_t scx200_gpio_write(struct file *file, const char __user *data, 
-                                size_t len, loff_t *ppos)
-{
-       unsigned m = iminor(file->f_dentry->d_inode);
-       size_t i;
-
-       for (i = 0; i < len; ++i) {
-               char c;
-               if (get_user(c, data+i))
-                       return -EFAULT;
-               switch (c)
-               {
-               case '0': 
-                       scx200_gpio_set(m, 0); 
-                       break;
-               case '1': 
-                       scx200_gpio_set(m, 1); 
-                       break;
-               case 'O':
-                       printk(KERN_INFO NAME ": GPIO%d output enabled\n", m);
-                       scx200_gpio_configure(m, ~1, 1);
-                       break;
-               case 'o':
-                       printk(KERN_INFO NAME ": GPIO%d output disabled\n", m);
-                       scx200_gpio_configure(m, ~1, 0);
-                       break;
-               case 'T':
-                       printk(KERN_INFO NAME ": GPIO%d output is push pull\n", m);
-                       scx200_gpio_configure(m, ~2, 2);
-                       break;
-               case 't':
-                       printk(KERN_INFO NAME ": GPIO%d output is open drain\n", m);
-                       scx200_gpio_configure(m, ~2, 0);
-                       break;
-               case 'P':
-                       printk(KERN_INFO NAME ": GPIO%d pull up enabled\n", m);
-                       scx200_gpio_configure(m, ~4, 4);
-                       break;
-               case 'p':
-                       printk(KERN_INFO NAME ": GPIO%d pull up disabled\n", m);
-                       scx200_gpio_configure(m, ~4, 0);
-                       break;
-               }
-       }
-
-       return len;
-}
-
-static ssize_t scx200_gpio_read(struct file *file, char __user *buf,
-                               size_t len, loff_t *ppos)
-{
-       unsigned m = iminor(file->f_dentry->d_inode);
-       int value;
-
-       value = scx200_gpio_get(m);
-       if (put_user(value ? '1' : '0', buf))
-               return -EFAULT;
-       
-       return 1;
-}
+struct nsc_gpio_ops scx200_access = {
+       .owner          = THIS_MODULE,
+       .gpio_config    = scx200_gpio_configure,
+       .gpio_dump      = nsc_gpio_dump,
+       .gpio_get       = scx200_gpio_get,
+       .gpio_set       = scx200_gpio_set,
+       .gpio_set_high  = scx200_gpio_set_high,
+       .gpio_set_low   = scx200_gpio_set_low,
+       .gpio_change    = scx200_gpio_change,
+       .gpio_current   = scx200_gpio_current
+};
 
 static int scx200_gpio_open(struct inode *inode, struct file *file)
 {
        unsigned m = iminor(inode);
+       file->private_data = &scx200_access;
+
        if (m > 63)
                return -EINVAL;
        return nonseekable_open(inode, file);
@@ -103,47 +65,81 @@ static int scx200_gpio_release(struct inode *inode, struct file *file)
 
 static struct file_operations scx200_gpio_fops = {
        .owner   = THIS_MODULE,
-       .write   = scx200_gpio_write,
-       .read    = scx200_gpio_read,
+       .write   = nsc_gpio_write,
+       .read    = nsc_gpio_read,
        .open    = scx200_gpio_open,
        .release = scx200_gpio_release,
 };
 
+struct cdev *scx200_devices;
+static int num_pins = 32;
+
 static int __init scx200_gpio_init(void)
 {
-       int r;
-
-       printk(KERN_DEBUG NAME ": NatSemi SCx200 GPIO Driver\n");
+       int rc, i;
+       dev_t dev = MKDEV(major, 0);
 
        if (!scx200_gpio_present()) {
-               printk(KERN_ERR NAME ": no SCx200 gpio pins available\n");
+               printk(KERN_ERR NAME ": no SCx200 gpio present\n");
                return -ENODEV;
        }
 
-       r = register_chrdev(major, NAME, &scx200_gpio_fops);
-       if (r < 0) {
-               printk(KERN_ERR NAME ": unable to register character device\n");
-               return r;
+       /* support dev_dbg() with pdev->dev */
+       pdev = platform_device_alloc(DEVNAME, 0);
+       if (!pdev)
+               return -ENOMEM;
+
+       rc = platform_device_add(pdev);
+       if (rc)
+               goto undo_malloc;
+
+       /* nsc_gpio uses dev_dbg(), so needs this */
+       scx200_access.dev = &pdev->dev;
+
+       if (major)
+               rc = register_chrdev_region(dev, num_pins, "scx200_gpio");
+       else {
+               rc = alloc_chrdev_region(&dev, 0, num_pins, "scx200_gpio");
+               major = MAJOR(dev);
        }
-       if (!major) {
-               major = r;
-               printk(KERN_DEBUG NAME ": got dynamic major %d\n", major);
+       if (rc < 0) {
+               dev_err(&pdev->dev, "SCx200 chrdev_region err: %d\n", rc);
+               goto undo_platform_device_add;
+       }
+       scx200_devices = kzalloc(num_pins * sizeof(struct cdev), GFP_KERNEL);
+       if (!scx200_devices) {
+               rc = -ENOMEM;
+               goto undo_chrdev_region;
+       }
+       for (i = 0; i < num_pins; i++) {
+               struct cdev *cdev = &scx200_devices[i];
+               cdev_init(cdev, &scx200_gpio_fops);
+               cdev->owner = THIS_MODULE;
+               rc = cdev_add(cdev, MKDEV(major, i), 1);
+               /* tolerate 'minor' errors */
+               if (rc)
+                       dev_err(&pdev->dev, "Error %d on minor %d", rc, i);
        }
 
-       return 0;
+       return 0; /* succeed */
+
+undo_chrdev_region:
+       unregister_chrdev_region(dev, num_pins);
+undo_platform_device_add:
+       platform_device_put(pdev);
+undo_malloc:
+       kfree(pdev);
+       return rc;
 }
 
 static void __exit scx200_gpio_cleanup(void)
 {
-       unregister_chrdev(major, NAME);
+       kfree(scx200_devices);
+       unregister_chrdev_region(MKDEV(major, 0), num_pins);
+       platform_device_put(pdev);
+       platform_device_unregister(pdev);
+       /* kfree(pdev); */
 }
 
 module_init(scx200_gpio_init);
 module_exit(scx200_gpio_cleanup);
-
-/*
-    Local variables:
-        compile-command: "make -k -C ../.. SUBDIRS=drivers/char modules"
-        c-basic-offset: 8
-    End:
-*/
index 1b5330299e30fa965086eae13d601a1e80e29be3..d2d6b01dcd05a1168dbd7795ebd13d5c5be32dc9 100644 (file)
@@ -2477,7 +2477,7 @@ static int __init specialix_init(void)
 #endif
 
        for (i = 0; i < SX_NBOARD; i++)
-               sx_board[i].lock = SPIN_LOCK_UNLOCKED;
+               spin_lock_init(&sx_board[i].lock);
 
        if (sx_init_drivers()) {
                func_exit();
index a9c5a7230f8958b5d8ee1338fa88c40cddf79a58..bf361a5ba70d66c24a2f9d8d97f99f39e9aaffc3 100644 (file)
@@ -140,15 +140,6 @@ static char        *stl_drvversion = "5.6.0";
 
 static struct tty_driver       *stl_serial;
 
-/*
- *     We will need to allocate a temporary write buffer for chars that
- *     come direct from user space. The problem is that a copy from user
- *     space might cause a page fault (typically on a system that is
- *     swapping!). All ports will share one buffer - since if the system
- *     is already swapping a shared buffer won't make things any worse.
- */
-static char                    *stl_tmpwritebuf;
-
 /*
  *     Define a local default termios struct. All ports will be created
  *     with this termios initially. Basically all it defines is a raw port
@@ -362,6 +353,14 @@ static unsigned char       stl_vecmap[] = {
        0xff, 0xff, 0x00, 0x02, 0x01, 0xff, 0xff, 0x03
 };
 
+/*
+ *     Lock ordering: stallion_lock must not be taken while brd_lock
+ *     is held (brd_lock nests inside stallion_lock).
+ */
+
+static spinlock_t brd_lock;            /* Guard the board mapping */
+static spinlock_t stallion_lock;       /* Guard the tty driver */
+
 /*
  *     Set up enable and disable macros for the ECH boards. They require
  *     the secondary io address space to be activated and deactivated.
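
With that ordering, code that needs both locks takes stallion_lock first and nests brd_lock inside it; a schematic sketch rather than an actual driver function:

	static void example_both_locks(void)
	{
		unsigned long flags;

		spin_lock_irqsave(&stallion_lock, flags);	/* outer: tty driver state */
		spin_lock(&brd_lock);				/* inner: board register window */
		/* ... touch shared driver state and board registers ... */
		spin_unlock(&brd_lock);
		spin_unlock_irqrestore(&stallion_lock, flags);
	}
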
@@ -725,17 +724,7 @@ static struct class *stallion_class;
 
 static int __init stallion_module_init(void)
 {
-       unsigned long   flags;
-
-#ifdef DEBUG
-       printk("init_module()\n");
-#endif
-
-       save_flags(flags);
-       cli();
        stl_init();
-       restore_flags(flags);
-
        return 0;
 }
 
@@ -746,7 +735,6 @@ static void __exit stallion_module_exit(void)
        stlbrd_t        *brdp;
        stlpanel_t      *panelp;
        stlport_t       *portp;
-       unsigned long   flags;
        int             i, j, k;
 
 #ifdef DEBUG
@@ -756,9 +744,6 @@ static void __exit stallion_module_exit(void)
        printk(KERN_INFO "Unloading %s: version %s\n", stl_drvtitle,
                stl_drvversion);
 
-       save_flags(flags);
-       cli();
-
 /*
  *     Free up all allocated resources used by the ports. This includes
  *     memory and interrupts. As part of this process we will also do
@@ -770,7 +755,6 @@ static void __exit stallion_module_exit(void)
        if (i) {
                printk("STALLION: failed to un-register tty driver, "
                        "errno=%d\n", -i);
-               restore_flags(flags);
                return;
        }
        for (i = 0; i < 4; i++) {
@@ -783,8 +767,6 @@ static void __exit stallion_module_exit(void)
                        "errno=%d\n", -i);
        class_destroy(stallion_class);
 
-       kfree(stl_tmpwritebuf);
-
        for (i = 0; (i < stl_nrbrds); i++) {
                if ((brdp = stl_brds[i]) == (stlbrd_t *) NULL)
                        continue;
@@ -814,8 +796,6 @@ static void __exit stallion_module_exit(void)
                kfree(brdp);
                stl_brds[i] = (stlbrd_t *) NULL;
        }
-
-       restore_flags(flags);
 }
 
 module_init(stallion_module_init);
@@ -948,7 +928,7 @@ static stlbrd_t *stl_allocbrd(void)
 
        brdp = kzalloc(sizeof(stlbrd_t), GFP_KERNEL);
        if (!brdp) {
-               printk("STALLION: failed to allocate memory (size=%d)\n",
+               printk("STALLION: failed to allocate memory (size=%Zd)\n",
                        sizeof(stlbrd_t));
                return NULL;
        }
@@ -1066,16 +1046,17 @@ static int stl_waitcarrier(stlport_t *portp, struct file *filp)
        rc = 0;
        doclocal = 0;
 
+       spin_lock_irqsave(&stallion_lock, flags);
+
        if (portp->tty->termios->c_cflag & CLOCAL)
                doclocal++;
 
-       save_flags(flags);
-       cli();
        portp->openwaitcnt++;
        if (! tty_hung_up_p(filp))
                portp->refcount--;
 
        for (;;) {
+               /* Takes brd_lock internally */
                stl_setsignals(portp, 1, 1);
                if (tty_hung_up_p(filp) ||
                    ((portp->flags & ASYNC_INITIALIZED) == 0)) {
@@ -1093,13 +1074,14 @@ static int stl_waitcarrier(stlport_t *portp, struct file *filp)
                        rc = -ERESTARTSYS;
                        break;
                }
+               /* FIXME */
                interruptible_sleep_on(&portp->open_wait);
        }
 
        if (! tty_hung_up_p(filp))
                portp->refcount++;
        portp->openwaitcnt--;
-       restore_flags(flags);
+       spin_unlock_irqrestore(&stallion_lock, flags);
 
        return rc;
 }
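
The FIXME above flags interruptible_sleep_on(), which is racy: a wake-up that arrives between the caller's condition check and the sleep is lost. The conventional replacement queues the task on the waitqueue before re-checking, roughly as in this sketch; ready() is a made-up stand-in for the real carrier/hangup test, not a driver function.

	#include <linux/wait.h>
	#include <linux/sched.h>

	static int ready(void);			/* placeholder predicate */

	static int example_wait(wait_queue_head_t *wq)
	{
		DEFINE_WAIT(wait);
		int rc = 0;

		for (;;) {
			prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
			if (ready())		/* re-check only after queueing */
				break;
			if (signal_pending(current)) {
				rc = -ERESTARTSYS;
				break;
			}
			schedule();
		}
		finish_wait(wq, &wait);
		return rc;
	}
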
@@ -1119,16 +1101,15 @@ static void stl_close(struct tty_struct *tty, struct file *filp)
        if (portp == (stlport_t *) NULL)
                return;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&stallion_lock, flags);
        if (tty_hung_up_p(filp)) {
-               restore_flags(flags);
+               spin_unlock_irqrestore(&stallion_lock, flags);
                return;
        }
        if ((tty->count == 1) && (portp->refcount != 1))
                portp->refcount = 1;
        if (portp->refcount-- > 1) {
-               restore_flags(flags);
+               spin_unlock_irqrestore(&stallion_lock, flags);
                return;
        }
 
@@ -1142,11 +1123,18 @@ static void stl_close(struct tty_struct *tty, struct file *filp)
  *     (The sc26198 has no "end-of-data" interrupt only empty FIFO)
  */
        tty->closing = 1;
+
+       spin_unlock_irqrestore(&stallion_lock, flags);
+
        if (portp->closing_wait != ASYNC_CLOSING_WAIT_NONE)
                tty_wait_until_sent(tty, portp->closing_wait);
        stl_waituntilsent(tty, (HZ / 2));
 
+
+       spin_lock_irqsave(&stallion_lock, flags);
        portp->flags &= ~ASYNC_INITIALIZED;
+       spin_unlock_irqrestore(&stallion_lock, flags);
+
        stl_disableintrs(portp);
        if (tty->termios->c_cflag & HUPCL)
                stl_setsignals(portp, 0, 0);
@@ -1173,7 +1161,6 @@ static void stl_close(struct tty_struct *tty, struct file *filp)
 
        portp->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
        wake_up_interruptible(&portp->close_wait);
-       restore_flags(flags);
 }
 
 /*****************************************************************************/
@@ -1195,9 +1182,6 @@ static int stl_write(struct tty_struct *tty, const unsigned char *buf, int count
                (int) tty, (int) buf, count);
 #endif
 
-       if ((tty == (struct tty_struct *) NULL) ||
-           (stl_tmpwritebuf == (char *) NULL))
-               return 0;
        portp = tty->driver_data;
        if (portp == (stlport_t *) NULL)
                return 0;
@@ -1302,11 +1286,6 @@ static void stl_flushchars(struct tty_struct *tty)
        if (portp->tx.buf == (char *) NULL)
                return;
 
-#if 0
-       if (tty->stopped || tty->hw_stopped ||
-           (portp->tx.head == portp->tx.tail))
-               return;
-#endif
        stl_startrxtx(portp, -1, 1);
 }
 
@@ -1977,12 +1956,14 @@ static int stl_eiointr(stlbrd_t *brdp)
        unsigned int    iobase;
        int             handled = 0;
 
+       spin_lock(&brd_lock);
        panelp = brdp->panels[0];
        iobase = panelp->iobase;
        while (inb(brdp->iostatus) & EIO_INTRPEND) {
                handled = 1;
                (* panelp->isr)(panelp, iobase);
        }
+       spin_unlock(&brd_lock);
        return handled;
 }
 
@@ -2168,7 +2149,7 @@ static int __init stl_initports(stlbrd_t *brdp, stlpanel_t *panelp)
                portp = kzalloc(sizeof(stlport_t), GFP_KERNEL);
                if (!portp) {
                        printk("STALLION: failed to allocate memory "
-                               "(size=%d)\n", sizeof(stlport_t));
+                               "(size=%Zd)\n", sizeof(stlport_t));
                        break;
                }
 
@@ -2304,7 +2285,7 @@ static inline int stl_initeio(stlbrd_t *brdp)
        panelp = kzalloc(sizeof(stlpanel_t), GFP_KERNEL);
        if (!panelp) {
                printk(KERN_WARNING "STALLION: failed to allocate memory "
-                       "(size=%d)\n", sizeof(stlpanel_t));
+                       "(size=%Zd)\n", sizeof(stlpanel_t));
                return -ENOMEM;
        }
 
@@ -2478,7 +2459,7 @@ static inline int stl_initech(stlbrd_t *brdp)
                panelp = kzalloc(sizeof(stlpanel_t), GFP_KERNEL);
                if (!panelp) {
                        printk("STALLION: failed to allocate memory "
-                               "(size=%d)\n", sizeof(stlpanel_t));
+                               "(size=%Zd)\n", sizeof(stlpanel_t));
                        break;
                }
                panelp->magic = STL_PANELMAGIC;
@@ -2879,8 +2860,7 @@ static int stl_getportstats(stlport_t *portp, comstats_t __user *cp)
        portp->stats.lflags = 0;
        portp->stats.rxbuffered = 0;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&stallion_lock, flags);
        if (portp->tty != (struct tty_struct *) NULL) {
                if (portp->tty->driver_data == portp) {
                        portp->stats.ttystate = portp->tty->flags;
@@ -2894,7 +2874,7 @@ static int stl_getportstats(stlport_t *portp, comstats_t __user *cp)
                        }
                }
        }
-       restore_flags(flags);
+       spin_unlock_irqrestore(&stallion_lock, flags);
 
        head = portp->tx.head;
        tail = portp->tx.tail;
@@ -3055,14 +3035,6 @@ static int __init stl_init(void)
        if (!stl_serial)
                return -1;
 
-/*
- *     Allocate a temporary write buffer.
- */
-       stl_tmpwritebuf = kmalloc(STL_TXBUFSIZE, GFP_KERNEL);
-       if (!stl_tmpwritebuf)
-               printk("STALLION: failed to allocate memory (size=%d)\n",
-                       STL_TXBUFSIZE);
-
 /*
  *     Set up a character driver for per board stuff. This is mainly used
  *     to do stats ioctls on the ports.
@@ -3147,11 +3119,13 @@ static int stl_cd1400panelinit(stlbrd_t *brdp, stlpanel_t *panelp)
        unsigned int    gfrcr;
        int             chipmask, i, j;
        int             nrchips, uartaddr, ioaddr;
+       unsigned long   flags;
 
 #ifdef DEBUG
        printk("stl_panelinit(brdp=%x,panelp=%x)\n", (int) brdp, (int) panelp);
 #endif
 
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(panelp->brdnr, panelp->pagenr);
 
 /*
@@ -3189,6 +3163,7 @@ static int stl_cd1400panelinit(stlbrd_t *brdp, stlpanel_t *panelp)
        }
 
        BRDDISABLE(panelp->brdnr);
+       spin_unlock_irqrestore(&brd_lock, flags);
        return chipmask;
 }
 
@@ -3200,6 +3175,7 @@ static int stl_cd1400panelinit(stlbrd_t *brdp, stlpanel_t *panelp)
 
 static void stl_cd1400portinit(stlbrd_t *brdp, stlpanel_t *panelp, stlport_t *portp)
 {
+       unsigned long flags;
 #ifdef DEBUG
        printk("stl_cd1400portinit(brdp=%x,panelp=%x,portp=%x)\n",
                (int) brdp, (int) panelp, (int) portp);
@@ -3209,6 +3185,7 @@ static void stl_cd1400portinit(stlbrd_t *brdp, stlpanel_t *panelp, stlport_t *po
            (portp == (stlport_t *) NULL))
                return;
 
+       spin_lock_irqsave(&brd_lock, flags);
        portp->ioaddr = panelp->iobase + (((brdp->brdtype == BRD_ECHPCI) ||
                (portp->portnr < 8)) ? 0 : EREG_BANKSIZE);
        portp->uartaddr = (portp->portnr & 0x04) << 5;
@@ -3219,6 +3196,7 @@ static void stl_cd1400portinit(stlbrd_t *brdp, stlpanel_t *panelp, stlport_t *po
        stl_cd1400setreg(portp, LIVR, (portp->portnr << 3));
        portp->hwid = stl_cd1400getreg(portp, GFRCR);
        BRDDISABLE(portp->brdnr);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3428,8 +3406,7 @@ static void stl_cd1400setport(stlport_t *portp, struct termios *tiosp)
                tiosp->c_cc[VSTART], tiosp->c_cc[VSTOP]);
 #endif
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x3));
        srer = stl_cd1400getreg(portp, SRER);
@@ -3466,7 +3443,7 @@ static void stl_cd1400setport(stlport_t *portp, struct termios *tiosp)
                portp->sigs &= ~TIOCM_CD;
        stl_cd1400setreg(portp, SRER, ((srer & ~sreroff) | sreron));
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3492,8 +3469,7 @@ static void stl_cd1400setsignals(stlport_t *portp, int dtr, int rts)
        if (rts > 0)
                msvr2 = MSVR2_RTS;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
        if (rts >= 0)
@@ -3501,7 +3477,7 @@ static void stl_cd1400setsignals(stlport_t *portp, int dtr, int rts)
        if (dtr >= 0)
                stl_cd1400setreg(portp, MSVR1, msvr1);
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3520,14 +3496,13 @@ static int stl_cd1400getsignals(stlport_t *portp)
        printk("stl_cd1400getsignals(portp=%x)\n", (int) portp);
 #endif
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
        msvr1 = stl_cd1400getreg(portp, MSVR1);
        msvr2 = stl_cd1400getreg(portp, MSVR2);
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 
        sigs = 0;
        sigs |= (msvr1 & MSVR1_DCD) ? TIOCM_CD : 0;
@@ -3569,15 +3544,14 @@ static void stl_cd1400enablerxtx(stlport_t *portp, int rx, int tx)
        else if (rx > 0)
                ccr |= CCR_RXENABLE;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
        stl_cd1400ccrwait(portp);
        stl_cd1400setreg(portp, CCR, ccr);
        stl_cd1400ccrwait(portp);
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3609,8 +3583,7 @@ static void stl_cd1400startrxtx(stlport_t *portp, int rx, int tx)
        else if (rx > 0)
                sreron |= SRER_RXDATA;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
        stl_cd1400setreg(portp, SRER,
@@ -3618,7 +3591,7 @@ static void stl_cd1400startrxtx(stlport_t *portp, int rx, int tx)
        BRDDISABLE(portp->brdnr);
        if (tx > 0)
                set_bit(ASYI_TXBUSY, &portp->istate);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3634,13 +3607,12 @@ static void stl_cd1400disableintrs(stlport_t *portp)
 #ifdef DEBUG
        printk("stl_cd1400disableintrs(portp=%x)\n", (int) portp);
 #endif
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
        stl_cd1400setreg(portp, SRER, 0);
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3653,8 +3625,7 @@ static void stl_cd1400sendbreak(stlport_t *portp, int len)
        printk("stl_cd1400sendbreak(portp=%x,len=%d)\n", (int) portp, len);
 #endif
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
        stl_cd1400setreg(portp, SRER,
@@ -3664,7 +3635,7 @@ static void stl_cd1400sendbreak(stlport_t *portp, int len)
        portp->brklen = len;
        if (len == 1)
                portp->stats.txbreaks++;
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3688,8 +3659,7 @@ static void stl_cd1400flowctrl(stlport_t *portp, int state)
        if (tty == (struct tty_struct *) NULL)
                return;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
 
@@ -3729,7 +3699,7 @@ static void stl_cd1400flowctrl(stlport_t *portp, int state)
        }
 
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3753,8 +3723,7 @@ static void stl_cd1400sendflow(stlport_t *portp, int state)
        if (tty == (struct tty_struct *) NULL)
                return;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
        if (state) {
@@ -3769,7 +3738,7 @@ static void stl_cd1400sendflow(stlport_t *portp, int state)
                stl_cd1400ccrwait(portp);
        }
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3785,8 +3754,7 @@ static void stl_cd1400flush(stlport_t *portp)
        if (portp == (stlport_t *) NULL)
                return;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_cd1400setreg(portp, CAR, (portp->portnr & 0x03));
        stl_cd1400ccrwait(portp);
@@ -3794,7 +3762,7 @@ static void stl_cd1400flush(stlport_t *portp)
        stl_cd1400ccrwait(portp);
        portp->tx.tail = portp->tx.head;
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -3833,6 +3801,7 @@ static void stl_cd1400eiointr(stlpanel_t *panelp, unsigned int iobase)
                (int) panelp, iobase);
 #endif
 
+       spin_lock(&brd_lock);
        outb(SVRR, iobase);
        svrtype = inb(iobase + EREG_DATA);
        if (panelp->nrports > 4) {
@@ -3846,6 +3815,8 @@ static void stl_cd1400eiointr(stlpanel_t *panelp, unsigned int iobase)
                stl_cd1400txisr(panelp, iobase);
        else if (svrtype & SVRR_MDM)
                stl_cd1400mdmisr(panelp, iobase);
+
+       spin_unlock(&brd_lock);
 }
 
 /*****************************************************************************/
@@ -4433,8 +4404,7 @@ static void stl_sc26198setport(stlport_t *portp, struct termios *tiosp)
                tiosp->c_cc[VSTART], tiosp->c_cc[VSTOP]);
 #endif
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_sc26198setreg(portp, IMR, 0);
        stl_sc26198updatereg(portp, MR0, mr0);
@@ -4461,7 +4431,7 @@ static void stl_sc26198setport(stlport_t *portp, struct termios *tiosp)
        portp->imr = (portp->imr & ~imroff) | imron;
        stl_sc26198setreg(portp, IMR, portp->imr);
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4491,13 +4461,12 @@ static void stl_sc26198setsignals(stlport_t *portp, int dtr, int rts)
        else if (rts > 0)
                iopioron |= IPR_RTS;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_sc26198setreg(portp, IOPIOR,
                ((stl_sc26198getreg(portp, IOPIOR) & ~iopioroff) | iopioron));
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4516,12 +4485,11 @@ static int stl_sc26198getsignals(stlport_t *portp)
        printk("stl_sc26198getsignals(portp=%x)\n", (int) portp);
 #endif
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        ipr = stl_sc26198getreg(portp, IPR);
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 
        sigs = 0;
        sigs |= (ipr & IPR_DCD) ? 0 : TIOCM_CD;
@@ -4558,13 +4526,12 @@ static void stl_sc26198enablerxtx(stlport_t *portp, int rx, int tx)
        else if (rx > 0)
                ccr |= CR_RXENABLE;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_sc26198setreg(portp, SCCR, ccr);
        BRDDISABLE(portp->brdnr);
        portp->crenable = ccr;
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4593,15 +4560,14 @@ static void stl_sc26198startrxtx(stlport_t *portp, int rx, int tx)
        else if (rx > 0)
                imr |= IR_RXRDY | IR_RXBREAK | IR_RXWATCHDOG;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_sc26198setreg(portp, IMR, imr);
        BRDDISABLE(portp->brdnr);
        portp->imr = imr;
        if (tx > 0)
                set_bit(ASYI_TXBUSY, &portp->istate);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4618,13 +4584,12 @@ static void stl_sc26198disableintrs(stlport_t *portp)
        printk("stl_sc26198disableintrs(portp=%x)\n", (int) portp);
 #endif
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        portp->imr = 0;
        stl_sc26198setreg(portp, IMR, 0);
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4637,8 +4602,7 @@ static void stl_sc26198sendbreak(stlport_t *portp, int len)
        printk("stl_sc26198sendbreak(portp=%x,len=%d)\n", (int) portp, len);
 #endif
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        if (len == 1) {
                stl_sc26198setreg(portp, SCCR, CR_TXSTARTBREAK);
@@ -4647,7 +4611,7 @@ static void stl_sc26198sendbreak(stlport_t *portp, int len)
                stl_sc26198setreg(portp, SCCR, CR_TXSTOPBREAK);
        }
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4672,8 +4636,7 @@ static void stl_sc26198flowctrl(stlport_t *portp, int state)
        if (tty == (struct tty_struct *) NULL)
                return;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
 
        if (state) {
@@ -4719,7 +4682,7 @@ static void stl_sc26198flowctrl(stlport_t *portp, int state)
        }
 
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4744,8 +4707,7 @@ static void stl_sc26198sendflow(stlport_t *portp, int state)
        if (tty == (struct tty_struct *) NULL)
                return;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        if (state) {
                mr0 = stl_sc26198getreg(portp, MR0);
@@ -4765,7 +4727,7 @@ static void stl_sc26198sendflow(stlport_t *portp, int state)
                stl_sc26198setreg(portp, MR0, mr0);
        }
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4781,14 +4743,13 @@ static void stl_sc26198flush(stlport_t *portp)
        if (portp == (stlport_t *) NULL)
                return;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        stl_sc26198setreg(portp, SCCR, CR_TXRESET);
        stl_sc26198setreg(portp, SCCR, portp->crenable);
        BRDDISABLE(portp->brdnr);
        portp->tx.tail = portp->tx.head;
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 }
 
 /*****************************************************************************/
@@ -4815,12 +4776,11 @@ static int stl_sc26198datastate(stlport_t *portp)
        if (test_bit(ASYI_TXBUSY, &portp->istate))
                return 1;
 
-       save_flags(flags);
-       cli();
+       spin_lock_irqsave(&brd_lock, flags);
        BRDENABLE(portp->brdnr, portp->pagenr);
        sr = stl_sc26198getreg(portp, SR);
        BRDDISABLE(portp->brdnr);
-       restore_flags(flags);
+       spin_unlock_irqrestore(&brd_lock, flags);
 
        return (sr & SR_TXEMPTY) ? 0 : 1;
 }
@@ -4878,6 +4838,8 @@ static void stl_sc26198intr(stlpanel_t *panelp, unsigned int iobase)
        stlport_t       *portp;
        unsigned int    iack;
 
+       spin_lock(&brd_lock);
+
 /* 
  *     Work around bug in sc26198 chip... Cannot have A6 address
  *     line of UART high, else iack will be returned as 0.
@@ -4893,6 +4855,8 @@ static void stl_sc26198intr(stlpanel_t *panelp, unsigned int iobase)
                stl_sc26198txisr(portp);
        else
                stl_sc26198otherisr(portp, iack);
+
+       spin_unlock(&brd_lock);
 }
 
 /*****************************************************************************/
index 3b4747230270a3dc38fbdcc12fab013ee5a23439..76b9107f7f814b53bda55814e3d5e8d890b02b44 100644 (file)
@@ -2320,7 +2320,7 @@ static int sx_init_portstructs (int nboards, int nports)
 #ifdef NEW_WRITE_LOCKING
                        port->gs.port_write_mutex = MUTEX;
 #endif
-                       port->gs.driver_lock = SPIN_LOCK_UNLOCKED;
+                       spin_lock_init(&port->gs.driver_lock);
                        /*
                         * Initializing wait queue
                         */
index 8b2a5996986819b62eccb4ffdb436d1d3903521a..bd74e82d8a72a443b8ca4579898e3dcf11e876f3 100644 (file)
@@ -2621,10 +2621,9 @@ int tty_ioctl(struct inode * inode, struct file * file,
                        tty->driver->break_ctl(tty, 0);
                        return 0;
                case TCSBRK:   /* SVID version: non-zero arg --> no break */
-                       /*
-                        * XXX is the above comment correct, or the
-                        * code below correct?  Is this ioctl used at
-                        * all by anyone?
+                       /* non-zero arg means wait for all output data
+                        * to be sent (performed above) but don't send break.
+                        * This is used by the tcdrain() termios function.
                         */
                        if (!arg)
                                return send_break(tty, 250);
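
The rewritten comment pins down what TCSBRK actually does: on Linux, glibc implements tcdrain() as ioctl(fd, TCSBRK, 1), which reaches this case with a non-zero argument, while tcsendbreak(fd, 0) arrives with a zero argument and triggers the ~250 ms break. A small userspace sketch of the two entry points:

	#include <termios.h>
	#include <unistd.h>

	int drain_then_break(int fd)
	{
		if (tcdrain(fd) < 0)		/* ioctl(fd, TCSBRK, 1): wait for output, no break */
			return -1;
		return tcsendbreak(fd, 0);	/* ioctl(fd, TCSBRK, 0): send a ~0.25 s break */
	}
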
index 44d1eca83a7250748bf27c637998c3c2973f9657..35e0b9ceecf7faa90f9ab34784112533e7d48ec7 100644 (file)
@@ -1497,6 +1497,7 @@ int cpufreq_update_policy(unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
+#ifdef CONFIG_HOTPLUG_CPU
 static int cpufreq_cpu_callback(struct notifier_block *nfb,
                                        unsigned long action, void *hcpu)
 {
@@ -1532,10 +1533,11 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb,
        return NOTIFY_OK;
 }
 
-static struct notifier_block cpufreq_cpu_notifier =
+static struct notifier_block __cpuinitdata cpufreq_cpu_notifier =
 {
     .notifier_call = cpufreq_cpu_callback,
 };
+#endif /* CONFIG_HOTPLUG_CPU */
 
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
@@ -1596,7 +1598,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
        }
 
        if (!ret) {
-               register_cpu_notifier(&cpufreq_cpu_notifier);
+               register_hotcpu_notifier(&cpufreq_cpu_notifier);
                dprintk("driver %s up and running\n", driver_data->name);
                cpufreq_debug_enable_ratelimit();
        }
@@ -1628,7 +1630,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
        dprintk("unregistering driver %s\n", driver->name);
 
        sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
-       unregister_cpu_notifier(&cpufreq_cpu_notifier);
+       unregister_hotcpu_notifier(&cpufreq_cpu_notifier);
 
        spin_lock_irqsave(&cpufreq_driver_lock, flags);
        cpufreq_driver = NULL;
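
The same conversion appears in several drivers in this merge: the callback and its notifier_block move under #ifdef CONFIG_HOTPLUG_CPU, and registration switches to register_hotcpu_notifier()/unregister_hotcpu_notifier(), which compile to no-ops when CPU hotplug is disabled, so the callers need no #ifdef of their own. A minimal sketch with placeholder names:

	#include <linux/init.h>
	#include <linux/cpu.h>
	#include <linux/notifier.h>

	#ifdef CONFIG_HOTPLUG_CPU
	static int example_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
	{
		switch (action) {
		case CPU_ONLINE:
			/* set up per-cpu state for (unsigned long)hcpu */
			break;
		case CPU_DEAD:
			/* tear down per-cpu state for (unsigned long)hcpu */
			break;
		}
		return NOTIFY_OK;
	}

	static struct notifier_block __cpuinitdata example_cpu_notifier = {
		.notifier_call = example_cpu_callback,
	};
	#endif /* CONFIG_HOTPLUG_CPU */

	/* register_hotcpu_notifier(&example_cpu_notifier) at init time,
	 * unregister_hotcpu_notifier(&example_cpu_notifier) at exit time. */
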
index c576c0b3f452629560fd2567a1bc5735011daf92..145061b8472a814b96e1075e51876ecbb5f3f246 100644 (file)
@@ -350,7 +350,7 @@ __init cpufreq_stats_init(void)
                return ret;
        }
 
-       register_cpu_notifier(&cpufreq_stat_cpu_notifier);
+       register_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
        lock_cpu_hotplug();
        for_each_online_cpu(cpu) {
                cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_ONLINE,
@@ -368,7 +368,7 @@ __exit cpufreq_stats_exit(void)
                        CPUFREQ_POLICY_NOTIFIER);
        cpufreq_unregister_notifier(&notifier_trans_block,
                        CPUFREQ_TRANSITION_NOTIFIER);
-       unregister_cpu_notifier(&cpufreq_stat_cpu_notifier);
+       unregister_hotcpu_notifier(&cpufreq_stat_cpu_notifier);
        lock_cpu_hotplug();
        for_each_online_cpu(cpu) {
                cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_DEAD,
index de2e7546b491a8878da0677fc40694ec4ba99b3e..a90486f5e49127bf3bbae494d8502d86835f412c 100644 (file)
@@ -998,12 +998,13 @@ void input_unregister_device(struct input_dev *dev)
        sysfs_remove_group(&dev->cdev.kobj, &input_dev_caps_attr_group);
        sysfs_remove_group(&dev->cdev.kobj, &input_dev_id_attr_group);
        sysfs_remove_group(&dev->cdev.kobj, &input_dev_attr_group);
-       class_device_unregister(&dev->cdev);
 
        mutex_lock(&dev->mutex);
        dev->name = dev->phys = dev->uniq = NULL;
        mutex_unlock(&dev->mutex);
 
+       class_device_unregister(&dev->cdev);
+
        input_wakeup_procfs_readers();
 }
 EXPORT_SYMBOL(input_unregister_device);
index acb7e2656780274cd32fc8f4e170f234c3357bc4..2a56bf33a6738ac47a31ecad684f2b9945aa06d7 100644 (file)
@@ -981,7 +981,7 @@ exit:
 EXPORT_SYMBOL_GPL(gigaset_stop);
 
 static LIST_HEAD(drivers);
-static spinlock_t driver_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(driver_lock);
 
 struct cardstate *gigaset_get_cs_by_id(int id)
 {
index 2ac90242d263714b7d6108a9005982ef660c02d7..433389daedb2ab9208b2f45b2bd3320ff18cf5ff 100644 (file)
@@ -82,7 +82,7 @@ isdn_tty_try_read(modem_info * info, struct sk_buff *skb)
                                                int l = skb->len;
                                                unsigned char *dp = skb->data;
                                                while (--l) {
-                                                       if (*skb->data == DLE)
+                                                       if (*dp == DLE)
                                                                tty_insert_flip_char(tty, DLE, 0);
                                                        tty_insert_flip_char(tty, *dp++, 0);
                                                }
index fe6541326c717fb996cb591aebb5a1bbc2107569..9b015f9af351e49526eb52ec0f0c09b4acb21e56 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/leds.h>
 #include "leds.h"
 
-rwlock_t leds_list_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(leds_list_lock);
 LIST_HEAD(leds_list);
 
 EXPORT_SYMBOL_GPL(leds_list);
index 5e2cd8be1191b99c44f3cb573d6bce3487b0155c..1b1ce6523960cb1676f342a718adf2e0962c42a6 100644 (file)
@@ -26,7 +26,7 @@
 /*
  * Nests outside led_cdev->trigger_lock
  */
-static rwlock_t triggers_list_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(triggers_list_lock);
 static LIST_HEAD(trigger_list);
 
 ssize_t led_trigger_store(struct class_device *dev, const char *buf,
index 74714e5bcf03299cc960529ab25c66ef3a061d0b..3ff8378ea660ad9aad1330122efd408f5faade01 100644 (file)
@@ -305,10 +305,8 @@ mptfc_GetFcDevPage0(MPT_ADAPTER *ioc, int ioc_port,
        }
 
  out:
-       if (pp0_array)
-               kfree(pp0_array);
-       if (p0_array)
-               kfree(p0_array);
+       kfree(pp0_array);
+       kfree(p0_array);
        return rc;
 }
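
This and the similar hunks below rely on kfree(NULL) being a defined no-op, so the surrounding NULL checks were just noise; the shortened error path is equivalent. A small illustration (sizes and names are placeholders):

	#include <linux/slab.h>
	#include <linux/errno.h>

	static int example_alloc_pair(size_t sz_a, size_t sz_b)
	{
		void *a, *b = NULL;
		int rc = -ENOMEM;

		a = kmalloc(sz_a, GFP_KERNEL);
		if (!a)
			goto out;
		b = kmalloc(sz_b, GFP_KERNEL);
		if (!b)
			goto out;
		rc = 0;			/* ... use a and b ... */
	out:
		kfree(b);		/* kfree(NULL) is a no-op, no check needed */
		kfree(a);		/* a may also be NULL on the first failure */
		return rc;
	}
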
 
index af6ec553ff7ca54c3cf4ee14c7e43e32cd9e73a2..85689ab46cbc26fcc30db0e471cc5081abd5dc66 100644 (file)
@@ -1378,8 +1378,7 @@ mptsas_probe_hba_phys(MPT_ADAPTER *ioc)
        return 0;
 
  out_free_port_info:
-       if (hba)
-               kfree(hba);
+       kfree(hba);
  out:
        return error;
 }
index febbdd4e0605ddeed250f534ebeda84ac77682ac..c74e5460f83406f786b34ecc2ff67acdfadb7c22 100644 (file)
@@ -1239,7 +1239,6 @@ EXPORT_SYMBOL(i2o_cntxt_list_remove);
 EXPORT_SYMBOL(i2o_cntxt_list_get_ptr);
 #endif
 EXPORT_SYMBOL(i2o_msg_get_wait);
-EXPORT_SYMBOL(i2o_msg_nop);
 EXPORT_SYMBOL(i2o_find_iop);
 EXPORT_SYMBOL(i2o_iop_find_device);
 EXPORT_SYMBOL(i2o_event_register);
index 1fdf03fd2da751181c1226d6441de7dab786b6e2..9706cc19134a1bc7b96e812e2717846ded951717 100644 (file)
@@ -85,7 +85,7 @@ static int __devinit ibmasm_init_one(struct pci_dev *pdev, const struct pci_devi
        }
        memset(sp, 0, sizeof(struct service_processor));
 
-       sp->lock = SPIN_LOCK_UNLOCKED;
+       spin_lock_init(&sp->lock);
        INIT_LIST_HEAD(&sp->command_queue);
 
        pci_set_drvdata(pdev, (void *)sp);
index c6770377ef8746c384d815d5067a469d872a3b59..0cd07150bf4aeee6e3add86d7e6e5e23544c3288 100644 (file)
@@ -431,8 +431,7 @@ static struct fs_enet_mii_bus *create_bus(const struct fs_mii_bus_info *bi)
        return bus;
 
 err:
-       if (bus)
-               kfree(bus);
+       kfree(bus);
        return ERR_PTR(ret);
 }
 
index 081a8999666e7e168404141073c687d0af2dfe0b..a8a8f975432fe1e32a3fb04aa0a4bc2d59d94e24 100644 (file)
@@ -1229,12 +1229,6 @@ static struct ipw_fw_error *ipw_alloc_error_log(struct ipw_priv *priv)
        return error;
 }
 
-static void ipw_free_error_log(struct ipw_fw_error *error)
-{
-       if (error)
-               kfree(error);
-}
-
 static ssize_t show_event_log(struct device *d,
                              struct device_attribute *attr, char *buf)
 {
@@ -1296,10 +1290,9 @@ static ssize_t clear_error(struct device *d,
                           const char *buf, size_t count)
 {
        struct ipw_priv *priv = dev_get_drvdata(d);
-       if (priv->error) {
-               ipw_free_error_log(priv->error);
-               priv->error = NULL;
-       }
+
+       kfree(priv->error);
+       priv->error = NULL;
        return count;
 }
 
@@ -1970,8 +1963,7 @@ static void ipw_irq_tasklet(struct ipw_priv *priv)
                                struct ipw_fw_error *error =
                                    ipw_alloc_error_log(priv);
                                ipw_dump_error_log(priv, error);
-                               if (error)
-                                       ipw_free_error_log(error);
+                               kfree(error);
                        }
 #endif
                } else {
@@ -11693,10 +11685,8 @@ static void ipw_pci_remove(struct pci_dev *pdev)
                }
        }
 
-       if (priv->error) {
-               ipw_free_error_log(priv->error);
-               priv->error = NULL;
-       }
+       kfree(priv->error);
+       priv->error = NULL;
 
 #ifdef CONFIG_IPW2200_PROMISCUOUS
        ipw_prom_free(priv);
index 0e07d9535116db884fcbe1762d0971ae3192669b..d0f68ab8f04119c12661a27e1785d76b6a372be4 100644 (file)
@@ -157,7 +157,7 @@ MODULE_LICENSE("Dual MPL/GPL");
 
 static int pcmcia_schlvl = PCMCIA_SCHLVL;
 
-static spinlock_t events_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(events_lock);
 
 
 #define PCMCIA_SOCKET_KEY_5V 1
@@ -644,7 +644,7 @@ static struct platform_device m8xx_device = {
 };
 
 static u32 pending_events[PCMCIA_SOCKETS_NO];
-static spinlock_t pending_event_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(pending_event_lock);
 
 static irqreturn_t m8xx_interrupt(int irq, void *dev, struct pt_regs *regs)
 {
index b9fab2ae3a36108ed2bbc48499682382c8e3b88f..8b56bbdd011ec0d5bf9870ca36abeda25caff49e 100644 (file)
@@ -17,8 +17,8 @@
  * These interrupt-safe spinlocks protect all accesses to RIO
  * configuration space and doorbell access.
  */
-static spinlock_t rio_config_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t rio_doorbell_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(rio_config_lock);
+static DEFINE_SPINLOCK(rio_doorbell_lock);
 
 /*
  *  Wrappers for all RIO configuration access functions.  They just check
index 5396beec30d0ca62b3d97a0c73c0ac66cb8ceccc..1cb61a761cb284cc6dd590e6ebd8062f6271d246 100644 (file)
@@ -94,7 +94,9 @@ exit_kfree:
        kfree(rtc);
 
 exit_idr:
+       mutex_lock(&idr_lock);
        idr_remove(&rtc_idr, id);
+       mutex_unlock(&idr_lock);
 
 exit:
        dev_err(dev, "rtc core: unable to register %s, err = %d\n",
index ecafbad41a242ce894d9247c88c6e1a5d0e67734..762521a1419cf296f080e7e4672e430f3029decc 100644 (file)
@@ -226,7 +226,7 @@ static int ds1553_rtc_ioctl(struct device *dev, unsigned int cmd,
        struct rtc_plat_data *pdata = platform_get_drvdata(pdev);
 
        if (pdata->irq < 0)
-               return -ENOIOCTLCMD;
+               return -ENOIOCTLCMD; /* fall back into rtc-dev's emulation */
        switch (cmd) {
        case RTC_AIE_OFF:
                pdata->irqen &= ~RTC_AF;
index ab486fbc828dab567357ab1d1ca04e2c5fc7d625..9cd1cb304bb2a7b66a4a5aab5442e472419961df 100644 (file)
@@ -45,7 +45,7 @@
 
 static unsigned long rtc_freq = 1024;
 static struct rtc_time rtc_alarm;
-static spinlock_t sa1100_rtc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(sa1100_rtc_lock);
 
 static int rtc_update_alarm(struct rtc_time *alrm)
 {
index 33e029207e261cd2ce20f0bcb75375fa8380a768..4b9291dd444354ab1f2f374c26d2d0069266b800 100644 (file)
@@ -93,7 +93,7 @@ static void __iomem *rtc2_base;
 
 static unsigned long epoch = 1970;     /* Jan 1 1970 00:00:00 */
 
-static spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(rtc_lock);
 static char rtc_name[] = "RTC";
 static unsigned long periodic_frequency;
 static unsigned long periodic_count;
index 2d946b6ca074cd0eea567e680ac00fabfd614d59..2d8af709947fb09fefdb32c2bb8b195845f11d69 100644 (file)
@@ -89,7 +89,7 @@ struct eerbuffer {
 };
 
 static LIST_HEAD(bufferlist);
-static spinlock_t bufferlock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bufferlock);
 static DECLARE_WAIT_QUEUE_HEAD(dasd_eer_read_wait_queue);
 
 /*
index 6c66877be2bffe069b7663fc756fb176ed092003..855ce9a9d94892551b09702912efaa8facb105f3 100644 (file)
@@ -5733,7 +5733,7 @@ module_init(ata_init);
 module_exit(ata_exit);
 
 static unsigned long ratelimit_time;
-static spinlock_t ata_ratelimit_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ata_ratelimit_lock);
 
 int ata_ratelimit(void)
 {
index 93d18a74c401c37fe194422377736718c8bb02b1..2915bca691e8e941bd3813ab63b7de41532ef564 100644 (file)
@@ -222,9 +222,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
         && copy_to_user(arg + sizeof(args), argbuf, argsize))
                rc = -EFAULT;
 error:
-       if (argbuf)
-               kfree(argbuf);
-
+       kfree(argbuf);
        return rc;
 }
 
index 501316b198e51123bcaf30618f002c7de589afc0..ed946311d3a47e3120185e909eeb9524185cf4bc 100644 (file)
@@ -26,7 +26,7 @@ static DECLARE_RWSEM(ioc3_devices_rwsem);
 
 static struct ioc3_submodule *ioc3_submodules[IOC3_MAX_SUBMODULES];
 static struct ioc3_submodule *ioc3_ethernet;
-static rwlock_t ioc3_submodules_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(ioc3_submodules_lock);
 
 /* NIC probing code */
 
index d63c3f485853e52eb9b351bd7991295c05df1ebe..9ef68cd83bb4d0cc527524c073779fae21579d20 100644 (file)
@@ -743,8 +743,7 @@ void __exit au1100fb_cleanup(void)
 {
        driver_unregister(&au1100fb_driver);
 
-       if (drv_info.opt_mode)
-               kfree(drv_info.opt_mode);
+       kfree(drv_info.opt_mode);
 }
 
 module_init(au1100fb_init);
index a71e984c93d47aa8c8678ce1abb98a29b0a39336..ffc72ae3ada80a01f98c959099b004e57bc5aed4 100644 (file)
@@ -27,7 +27,7 @@
 
 static int hp680bl_suspended;
 static int current_intensity = 0;
-static spinlock_t bl_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bl_lock);
 static struct backlight_device *hp680_backlight_device;
 
 static void hp680bl_send_intensity(struct backlight_device *bd)
index 373bb6292bdc137a606296c7864093470d41b00f..f23bb647db470adbe513307d6d8b70f68ac9bbbf 100644 (file)
@@ -564,7 +564,7 @@ still_busy:
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
        char b[BDEVNAME_SIZE];
        unsigned long flags;
@@ -3166,7 +3166,6 @@ EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
 EXPORT_SYMBOL(cont_prepare_write);
-EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
index 7f96b5cb67816109e38a51b79a6ee80364e48c5d..8c9b28dff1197d85a3a720f2988b25834f7c495b 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/suspend.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
+#include <linux/poison.h>
 #include <linux/proc_fs.h>
 
 #include <asm/uaccess.h>
@@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh)
 {
 #ifdef CONFIG_JBD_DEBUG
        atomic_dec(&nr_journal_heads);
-       memset(jh, 0x5b, sizeof(*jh));
+       memset(jh, JBD_POISON_FREE, sizeof(*jh));
 #endif
        kmem_cache_free(journal_head_cache, jh);
 }
index 402005c35ab3fecd963329ff42be0b111f278381..8ca9707be6c9d06f28a843fbe6282afd1d405e77 100644 (file)
@@ -909,7 +909,7 @@ int __init nfs_init_directcache(void)
  * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
  *
  */
-void __exit nfs_destroy_directcache(void)
+void nfs_destroy_directcache(void)
 {
        if (kmem_cache_destroy(nfs_direct_cachep))
                printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
index 51bc88b662feabeb54f0873c1876296adabc0b25..c5b916605fb012b856bdee1c5cc8c8b4d9a8c81a 100644 (file)
@@ -1132,7 +1132,7 @@ static int __init nfs_init_inodecache(void)
        return 0;
 }
 
-static void __exit nfs_destroy_inodecache(void)
+static void nfs_destroy_inodecache(void)
 {
        if (kmem_cache_destroy(nfs_inode_cachep))
                printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n");
index bd2815e2dec1b924bf52bb803f490066c76b2e94..4fe51c1292bb763918cd885c0df1ba093fccd80e 100644 (file)
@@ -31,15 +31,15 @@ extern struct svc_version nfs4_callback_version1;
 
 /* pagelist.c */
 extern int __init nfs_init_nfspagecache(void);
-extern void __exit nfs_destroy_nfspagecache(void);
+extern void nfs_destroy_nfspagecache(void);
 extern int __init nfs_init_readpagecache(void);
-extern void __exit nfs_destroy_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
 extern int __init nfs_init_writepagecache(void);
-extern void __exit nfs_destroy_writepagecache(void);
+extern void nfs_destroy_writepagecache(void);
 
 #ifdef CONFIG_NFS_DIRECTIO
 extern int __init nfs_init_directcache(void);
-extern void __exit nfs_destroy_directcache(void);
+extern void nfs_destroy_directcache(void);
 #else
 #define nfs_init_directcache() (0)
 #define nfs_destroy_directcache() do {} while(0)
index ef9429643ebcc8e231268e58551cbf899df0336c..d89f6fb3b3a3517fea9679caec85982584ad7d81 100644 (file)
@@ -390,7 +390,7 @@ int __init nfs_init_nfspagecache(void)
        return 0;
 }
 
-void __exit nfs_destroy_nfspagecache(void)
+void nfs_destroy_nfspagecache(void)
 {
        if (kmem_cache_destroy(nfs_page_cachep))
                printk(KERN_INFO "nfs_page: not all structures were freed\n");
index 41c2ffee24f566dabf38f271e6ecdfa3fa3ea0d1..32cf3773af0cdc0a21b41363e6ddc34c7a40bff2 100644 (file)
@@ -711,7 +711,7 @@ int __init nfs_init_readpagecache(void)
        return 0;
 }
 
-void __exit nfs_destroy_readpagecache(void)
+void nfs_destroy_readpagecache(void)
 {
        mempool_destroy(nfs_rdata_mempool);
        if (kmem_cache_destroy(nfs_rdata_cachep))
index b383fdd3a15c1a5b6b621415b46ffe390a3294d2..8fccb9cb173ba9588c6f8946ba69cc67485659c2 100644 (file)
@@ -1551,7 +1551,7 @@ int __init nfs_init_writepagecache(void)
        return 0;
 }
 
-void __exit nfs_destroy_writepagecache(void)
+void nfs_destroy_writepagecache(void)
 {
        mempool_destroy(nfs_commit_mempool);
        mempool_destroy(nfs_wdata_mempool);
index 1630b5670dc2652ff36b9bee862f68a386904933..7c7d01672d35a4dc47bbd5ca5d059adcdc1c6598 100644 (file)
@@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags);
  */
 
 /* recall_lock protects the del_recall_lru */
-static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
 static void
index 21f38accd0399dd26f6fa1da70af16209c01469e..1d26cfcd9f8406a531ab8acd61a5c813633658a8 100644 (file)
@@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem);
  * multiple hb threads are watching multiple regions.  A node is live
  * whenever any of the threads sees activity from the node in its region.
  */
-static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(o2hb_live_lock);
 static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
 static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
index 0f60cc0d3985d9e80596e3f63b7059cb594d7d97..1591eb37a72366dc2e4ecbac76128e2d5b7e0d26 100644 (file)
            ##args);                                                    \
 } while (0)
 
-static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(o2net_handler_lock);
 static struct rb_root o2net_handler_tree = RB_ROOT;
 
 static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
index ba27c5c5e95939192f0bacfe5bfeb7c2776d67a3..b8c23f7ba67e1caf791af2e8f4f28e8f0c6e6413 100644 (file)
@@ -88,7 +88,7 @@ out_free:
  *
  */
 
-spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(dlm_domain_lock);
 LIST_HEAD(dlm_domains);
 static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
 
index d6f89577e25f1b0ac55f5ac8e931013ff2b460ee..5ca57ec650c77657c76e82e66f83dea2180da01a 100644 (file)
@@ -53,7 +53,7 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
 
-static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_cookie_lock);
 static u64 dlm_next_cookie = 1;
 
 static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
index da399013516ffe5947ac111f09b963e8c5a9c99b..29b2845f370d85b1a4b642af386553cd6178f708 100644 (file)
@@ -98,8 +98,8 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
 
 static u64 dlm_get_next_mig_cookie(void);
 
-static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
 static u64 dlm_mig_cookie = 1;
 
 static u64 dlm_get_next_mig_cookie(void)
index 64cd52860c876943d3a51a373f8a74df9f7067f5..4acd37286bdd7cb4b0807da5b338a9f6753ffb99 100644 (file)
@@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
        mlog_exit_void();
 }
 
-static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
 
 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
                                       struct ocfs2_dlm_debug *dlm_debug)
index 3fe8781c22cb68fedf6c5d8189c9aab3f88fa0d1..910a601b2e9820cf4986e3e620faa534277cf072 100644 (file)
@@ -49,7 +49,7 @@
 
 #include "buffer_head_io.h"
 
-spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(trans_inc_lock);
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
index ee42765a8553e9c5ed51e47805f9bd1440679399..cf70fe2075b8f58e1a4109e7872da1cc7c246c21 100644 (file)
@@ -988,9 +988,7 @@ int ocfs2_request_mount_vote(struct ocfs2_super *osb)
        }
 
 bail:
-       if (request)
-               kfree(request);
-
+       kfree(request);
        return status;
 }
 
@@ -1021,9 +1019,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
        }
 
 bail:
-       if (request)
-               kfree(request);
-
+       kfree(request);
        return status;
 }
 
index 0137ec4c1368888d906d6eb543a238e5dca18236..0a163a4f7764059f7401c8b43cae42ca9c2bef43 100644 (file)
@@ -122,6 +122,11 @@ struct mem_size_stats
        unsigned long private_dirty;
 };
 
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+       return NULL;
+}
+
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
        struct proc_maps_private *priv = m->private;
@@ -158,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
                pad_len_spaces(m, len);
                seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
        } else {
-               if (mm) {
-                       if (vma->vm_start <= mm->start_brk &&
+               const char *name = arch_vma_name(vma);
+               if (!name) {
+                       if (mm) {
+                               if (vma->vm_start <= mm->start_brk &&
                                                vma->vm_end >= mm->brk) {
-                               pad_len_spaces(m, len);
-                               seq_puts(m, "[heap]");
-                       } else {
-                               if (vma->vm_start <= mm->start_stack &&
-                                       vma->vm_end >= mm->start_stack) {
-
-                                       pad_len_spaces(m, len);
-                                       seq_puts(m, "[stack]");
+                                       name = "[heap]";
+                               } else if (vma->vm_start <= mm->start_stack &&
+                                          vma->vm_end >= mm->start_stack) {
+                                       name = "[stack]";
                                }
+                       } else {
+                               name = "[vdso]";
                        }
-               } else {
+               }
+               if (name) {
                        pad_len_spaces(m, len);
-                       seq_puts(m, "[vdso]");
+                       seq_puts(m, name);
                }
        }
        seq_putc(m, '\n');
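
arch_vma_name() is declared weak above, so generic code gets a NULL return (and the [heap]/[stack]/[vdso] fallbacks) unless an architecture supplies a strong definition. A hypothetical override, with is_my_arch_vdso() standing in for whatever test the architecture really uses:

	#include <linux/mm.h>

	static int is_my_arch_vdso(struct vm_area_struct *vma);	/* placeholder */

	/* A non-weak definition in arch code takes precedence over the
	 * weak default in fs/proc/task_mmu.c. */
	const char *arch_vma_name(struct vm_area_struct *vma)
	{
		if (is_my_arch_vdso(vma))
			return "[vdso]";
		return NULL;
	}
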
index f2dbdf5a8769765d4ca175250e19e0659bb5f7c8..259bd196099d5c4128b41be2d144c16011ea1981 100644 (file)
@@ -605,39 +605,12 @@ static void ufs_set_inode_ops(struct inode *inode)
                                   ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
 }
 
-void ufs_read_inode (struct inode * inode)
+static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 {
        struct ufs_inode_info *ufsi = UFS_I(inode);
-       struct super_block * sb;
-       struct ufs_sb_private_info * uspi;
-       struct ufs_inode * ufs_inode;   
-       struct ufs2_inode *ufs2_inode;
-       struct buffer_head * bh;
+       struct super_block *sb = inode->i_sb;
        mode_t mode;
        unsigned i;
-       unsigned flags;
-       
-       UFSD("ENTER, ino %lu\n", inode->i_ino);
-       
-       sb = inode->i_sb;
-       uspi = UFS_SB(sb)->s_uspi;
-       flags = UFS_SB(sb)->s_flags;
-
-       if (inode->i_ino < UFS_ROOTINO || 
-           inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
-               ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino);
-               goto bad_inode;
-       }
-       
-       bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
-       if (!bh) {
-               ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino);
-               goto bad_inode;
-       }
-       if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
-               goto ufs2_inode;
-
-       ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino));
 
        /*
         * Copy data to the in-core inode.
@@ -661,14 +634,11 @@ void ufs_read_inode (struct inode * inode)
        inode->i_atime.tv_nsec = 0;
        inode->i_ctime.tv_nsec = 0;
        inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
-       inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size (for stat) */
-       inode->i_version++;
        ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
        ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen);
        ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
        ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
-       ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
-       ufsi->i_dir_start_lookup = 0;
+
        
        if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
@@ -677,24 +647,16 @@ void ufs_read_inode (struct inode * inode)
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
                        ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
        }
-       ufsi->i_osync = 0;
-
-       ufs_set_inode_ops(inode);
-
-       brelse (bh);
-
-       UFSD("EXIT\n");
-       return;
+}
 
-bad_inode:
-       make_bad_inode(inode);
-       return;
+static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
+{
+       struct ufs_inode_info *ufsi = UFS_I(inode);
+       struct super_block *sb = inode->i_sb;
+       mode_t mode;
+       unsigned i;
 
-ufs2_inode :
        UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
-
-       ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino));
-
        /*
         * Copy data to the in-core inode.
         */
@@ -717,26 +679,64 @@ ufs2_inode :
        inode->i_atime.tv_nsec = 0;
        inode->i_ctime.tv_nsec = 0;
        inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
-       inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/
-
-       inode->i_version++;
        ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
        ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen);
        /*
        ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
        ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
        */
-       ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift;
 
        if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
                        ufsi->i_u1.u2_i_data[i] =
                                ufs2_inode->ui_u2.ui_addr.ui_db[i];
-       }
-       else {
+       } else {
                for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
                        ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
        }
+}
+
+void ufs_read_inode(struct inode * inode)
+{
+       struct ufs_inode_info *ufsi = UFS_I(inode);
+       struct super_block * sb;
+       struct ufs_sb_private_info * uspi;
+       struct buffer_head * bh;
+
+       UFSD("ENTER, ino %lu\n", inode->i_ino);
+
+       sb = inode->i_sb;
+       uspi = UFS_SB(sb)->s_uspi;
+
+       if (inode->i_ino < UFS_ROOTINO ||
+           inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
+               ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n",
+                           inode->i_ino);
+               goto bad_inode;
+       }
+
+       bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
+       if (!bh) {
+               ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n",
+                           inode->i_ino);
+               goto bad_inode;
+       }
+       if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+               struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
+
+               ufs2_read_inode(inode,
+                               ufs2_inode + ufs_inotofsbo(inode->i_ino));
+       } else {
+               struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
+
+               ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
+       }
+
+       inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat) */
+       inode->i_version++;
+       ufsi->i_lastfrag =
+               (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
+       ufsi->i_dir_start_lookup = 0;
        ufsi->i_osync = 0;
 
        ufs_set_inode_ops(inode);
@@ -745,6 +745,9 @@ ufs2_inode :
 
        UFSD("EXIT\n");
        return;
+
+bad_inode:
+       make_bad_inode(inode);
 }
 
 static int ufs_update_inode(struct inode * inode, int do_sync)
index 10dbf203c62f6f929ee709d6624121f744de231a..ed7579beb6b06949541af834978eba7dc998f79e 100644 (file)
@@ -1721,15 +1721,14 @@ xfs_mount_log_sbunit(
  * is present to prevent thrashing).
  */
 
+#ifdef CONFIG_HOTPLUG_CPU
 /*
  * hot-plug CPU notifier support.
  *
- * We cannot use the hotcpu_register() function because it does
- * not allow notifier instances. We need a notifier per filesystem
- * as we need to be able to identify the filesystem to balance
- * the counters out. This is achieved by having a notifier block
- * embedded in the xfs_mount_t and doing pointer magic to get the
- * mount pointer from the notifier block address.
+ * We need a notifier per filesystem as we need to be able to identify
+ * the filesystem to balance the counters out. This is achieved by
+ * having a notifier block embedded in the xfs_mount_t and doing pointer
+ * magic to get the mount pointer from the notifier block address.
  */
 STATIC int
 xfs_icsb_cpu_notify(
@@ -1779,6 +1778,7 @@ xfs_icsb_cpu_notify(
 
        return NOTIFY_OK;
 }
+#endif /* CONFIG_HOTPLUG_CPU */
 
 int
 xfs_icsb_init_counters(
@@ -1791,9 +1791,11 @@ xfs_icsb_init_counters(
        if (mp->m_sb_cnts == NULL)
                return -ENOMEM;
 
+#ifdef CONFIG_HOTPLUG_CPU
        mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
        mp->m_icsb_notifier.priority = 0;
-       register_cpu_notifier(&mp->m_icsb_notifier);
+       register_hotcpu_notifier(&mp->m_icsb_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
 
        for_each_online_cpu(i) {
                cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
@@ -1812,7 +1814,7 @@ xfs_icsb_destroy_counters(
        xfs_mount_t     *mp)
 {
        if (mp->m_sb_cnts) {
-               unregister_cpu_notifier(&mp->m_icsb_notifier);
+               unregister_hotcpu_notifier(&mp->m_icsb_notifier);
                free_percpu(mp->m_sb_cnts);
        }
 }
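
The "pointer magic" in the comment above is the usual container_of() idiom: the notifier block is embedded in the xfs_mount_t, so the callback can step from the block back to the owning mount. A minimal sketch, not part of the patch (the function name is hypothetical; the field names follow the hunk above, and the usual xfs headers plus <linux/notifier.h> are assumed):

static int example_icsb_cpu_notify(struct notifier_block *nfb,
                                   unsigned long action, void *hcpu)
{
        /* the notifier block lives inside xfs_mount_t, so step back to it */
        xfs_mount_t *mp = container_of(nfb, xfs_mount_t, m_icsb_notifier);

        /* ... rebalance mp->m_sb_cnts for the CPU identified by hcpu ... */
        return NOTIFY_OK;
}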
index dba70c62a16c0d3a56df6ffebd917983d0367f3b..457c34b6eb09337bc87421aeb0edfe0b35edd2e7 100644 (file)
@@ -435,7 +435,7 @@ static inline void t2_outl(u32 b, unsigned long addr)
        set_hae(msb); \
 }
 
-static spinlock_t t2_hae_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(t2_hae_lock);
 
 __EXTERN_INLINE u8 t2_readb(const volatile void __iomem *xaddr)
 {
index 845cb67ad8ea5d40bcb707462d30879255247b81..8ceab7bcd8b4f229f45536f8208e5394c99e9695 100644 (file)
        __ret;                                          \
 })
 
+#ifdef CONFIG_SMP
+# define WARN_ON_SMP(x)                        WARN_ON(x)
+#else
+# define WARN_ON_SMP(x)                        do { } while (0)
+#endif
+
 #endif
index e7252c216ca81cc8d6543f51b51fa1f8a216d194..b1bc7b1b64b0e304d13f907e4d5cd607417bdfee 100644 (file)
@@ -7,8 +7,6 @@
 #include <linux/nodemask.h>
 #include <linux/percpu.h>
 
-#include <asm/node.h>
-
 struct i386_cpu {
        struct cpu cpu;
 };
index 4153d80e4d2b86535a6e55a6e72a30ba7b5b5699..1eac92cb5b161744c463eec3ccd7d84395cc02f0 100644 (file)
@@ -10,6 +10,7 @@
 #include <asm/processor.h>
 #include <asm/system.h>                /* for savesegment */
 #include <asm/auxvec.h>
+#include <asm/desc.h>
 
 #include <linux/utsname.h>
 
@@ -129,15 +130,41 @@ extern int dump_task_extended_fpu (struct task_struct *, struct user_fxsr_struct
 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
 
-#define VSYSCALL_BASE  (__fix_to_virt(FIX_VSYSCALL))
-#define VSYSCALL_EHDR  ((const struct elfhdr *) VSYSCALL_BASE)
-#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
+#define VDSO_HIGH_BASE         (__fix_to_virt(FIX_VDSO))
+#define VDSO_BASE              ((unsigned long)current->mm->context.vdso)
+
+#ifdef CONFIG_COMPAT_VDSO
+# define VDSO_COMPAT_BASE      VDSO_HIGH_BASE
+# define VDSO_PRELINK          VDSO_HIGH_BASE
+#else
+# define VDSO_COMPAT_BASE      VDSO_BASE
+# define VDSO_PRELINK          0
+#endif
+
+#define VDSO_COMPAT_SYM(x) \
+               (VDSO_COMPAT_BASE + (unsigned long)(x) - VDSO_PRELINK)
+
+#define VDSO_SYM(x) \
+               (VDSO_BASE + (unsigned long)(x) - VDSO_PRELINK)
+
+#define VDSO_HIGH_EHDR         ((const struct elfhdr *) VDSO_HIGH_BASE)
+#define VDSO_EHDR              ((const struct elfhdr *) VDSO_COMPAT_BASE)
+
 extern void __kernel_vsyscall;
 
+#define VDSO_ENTRY             VDSO_SYM(&__kernel_vsyscall)
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+                                       int executable_stack);
+
+extern unsigned int vdso_enabled;
+
 #define ARCH_DLINFO                                            \
-do {                                                           \
-               NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY);        \
-               NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE);    \
+do if (vdso_enabled) {                                         \
+               NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY);            \
+               NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_COMPAT_BASE); \
 } while (0)
 
 /*
@@ -148,15 +175,15 @@ do {                                                              \
  * Dumping its extra ELF program headers includes all the other information
  * a debugger needs to easily find how the vsyscall DSO was being used.
  */
-#define ELF_CORE_EXTRA_PHDRS           (VSYSCALL_EHDR->e_phnum)
+#define ELF_CORE_EXTRA_PHDRS           (VDSO_HIGH_EHDR->e_phnum)
 #define ELF_CORE_WRITE_EXTRA_PHDRS                                           \
 do {                                                                         \
        const struct elf_phdr *const vsyscall_phdrs =                         \
-               (const struct elf_phdr *) (VSYSCALL_BASE                      \
-                                          + VSYSCALL_EHDR->e_phoff);         \
+               (const struct elf_phdr *) (VDSO_HIGH_BASE                     \
+                                          + VDSO_HIGH_EHDR->e_phoff);    \
        int i;                                                                \
        Elf32_Off ofs = 0;                                                    \
-       for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {                        \
+       for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) {               \
                struct elf_phdr phdr = vsyscall_phdrs[i];                     \
                if (phdr.p_type == PT_LOAD) {                                 \
                        BUG_ON(ofs != 0);                                     \
@@ -174,10 +201,10 @@ do {                                                                            \
 #define ELF_CORE_WRITE_EXTRA_DATA                                            \
 do {                                                                         \
        const struct elf_phdr *const vsyscall_phdrs =                         \
-               (const struct elf_phdr *) (VSYSCALL_BASE                      \
-                                          + VSYSCALL_EHDR->e_phoff);         \
+               (const struct elf_phdr *) (VDSO_HIGH_BASE                     \
+                                          + VDSO_HIGH_EHDR->e_phoff);    \
        int i;                                                                \
-       for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {                        \
+       for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) {               \
                if (vsyscall_phdrs[i].p_type == PT_LOAD)                      \
                        DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr,        \
                                   PAGE_ALIGN(vsyscall_phdrs[i].p_memsz));    \
index f7e068f4d2f98d7cdb8c5198cd26c1796bb3d9dc..a48cc3f7ccc688a725595e46bb21c6e2fe7fb0ad 100644 (file)
@@ -51,7 +51,7 @@
  */
 enum fixed_addresses {
        FIX_HOLE,
-       FIX_VSYSCALL,
+       FIX_VDSO,
 #ifdef CONFIG_X86_LOCAL_APIC
        FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
 #endif
@@ -115,14 +115,6 @@ extern void __set_fixmap (enum fixed_addresses idx,
 #define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
 #define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
 
-/*
- * This is the range that is readable by user mode, and things
- * acting like user mode such as get_user_pages.
- */
-#define FIXADDR_USER_START     (__fix_to_virt(FIX_VSYSCALL))
-#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
-
-
 extern void __this_fixmap_does_not_exist(void);
 
 /*
index f431a0b86d4c46669e49b0fe4811a64017a27385..8358dd3df7aa22c1eb5bd4637f68b47b5509cd76 100644 (file)
@@ -12,6 +12,7 @@ typedef struct {
        int size;
        struct semaphore sem;
        void *ldt;
+       void *vdso;
 } mm_context_t;
 
 #endif
diff --git a/include/asm-i386/node.h b/include/asm-i386/node.h
deleted file mode 100644 (file)
index e13c6ff..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _ASM_I386_NODE_H_
-#define _ASM_I386_NODE_H_
-
-#include <linux/device.h>
-#include <linux/mmzone.h>
-#include <linux/node.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-struct i386_node {
-       struct node node;
-};
-extern struct i386_node node_devices[MAX_NUMNODES];
-
-static inline int arch_register_node(int num){
-       int p_node;
-       struct node *parent = NULL;
-
-       if (!node_online(num))
-               return 0;
-       p_node = parent_node(num);
-
-       if (p_node != num)
-               parent = &node_devices[p_node].node;
-
-       return register_node(&node_devices[num].node, num, parent);
-}
-
-#endif /* _ASM_I386_NODE_H_ */
index e3a552fa5538744bd578d6351c8c23756774c782..f5bf544c729a373aa4ec1832aa9fa3cd55a13800 100644 (file)
@@ -96,6 +96,8 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 
 #ifndef __ASSEMBLY__
 
+struct vm_area_struct;
+
 /*
  * This much address space is reserved for vmalloc() and iomap()
  * as well as fixmap mappings.
@@ -139,6 +141,7 @@ extern int page_is_ram(unsigned long pagenr);
 #include <asm-generic/memory_model.h>
 #include <asm-generic/page.h>
 
+#define __HAVE_ARCH_GATE_AREA 1
 #endif /* __KERNEL__ */
 
 #endif /* _I386_PAGE_H */
index 55ea992da32954c2bdd1aca5c5c902f7b4c3b02a..b32346d62e1039d956c919b190836f2ab0217ef1 100644 (file)
@@ -71,8 +71,12 @@ struct cpuinfo_x86 {
        cpumask_t llc_shared_map;       /* cpus sharing the last level cache */
 #endif
        unsigned char x86_max_cores;    /* cpuid returned max cores value */
-       unsigned char booted_cores;     /* number of cores as seen by OS */
        unsigned char apicid;
+#ifdef CONFIG_SMP
+       unsigned char booted_cores;     /* number of cores as seen by OS */
+       __u8 phys_proc_id;              /* Physical processor id. */
+       __u8 cpu_core_id;               /* Core id */
+#endif
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL 0
@@ -104,8 +108,6 @@ extern struct cpuinfo_x86 cpu_data[];
 #define current_cpu_data boot_cpu_data
 #endif
 
-extern int phys_proc_id[NR_CPUS];
-extern int cpu_core_id[NR_CPUS];
 extern int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;
 
index fdbc7f422ea5501ec586eabed02b349072e0184b..2833fa2c0dd0e6d5a54b5b6539123533f9c04855 100644 (file)
@@ -37,6 +37,7 @@ struct thread_info {
                                                   0-0xBFFFFFFF for user-thread
                                                   0-0xFFFFFFFF for kernel-thread
                                                */
+       void                    *sysenter_return;
        struct restart_block    restart_block;
 
        unsigned long           previous_esp;   /* ESP of the previous stack in case
@@ -83,17 +84,15 @@ struct thread_info {
 #define init_stack             (init_thread_union.stack)
 
 
+/* how to get the current stack pointer from C */
+register unsigned long current_stack_pointer asm("esp") __attribute_used__;
+
 /* how to get the thread information struct from C */
 static inline struct thread_info *current_thread_info(void)
 {
-       struct thread_info *ti;
-       __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1)));
-       return ti;
+       return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1));
 }
 
-/* how to get the current stack pointer from C */
-register unsigned long current_stack_pointer asm("esp") __attribute_used__;
-
 /* thread information allocation */
 #ifdef CONFIG_DEBUG_STACK_USAGE
 #define alloc_thread_info(tsk)                                 \
index b94e5eeef917ea92d9f7339593e1bda8dcc3bafd..6adbd9b1ae881121e9f377a121ec0c4e9355361c 100644 (file)
 #define _ASM_I386_TOPOLOGY_H
 
 #ifdef CONFIG_X86_HT
-#define topology_physical_package_id(cpu)                              \
-       (phys_proc_id[cpu] == BAD_APICID ? -1 : phys_proc_id[cpu])
-#define topology_core_id(cpu)                                          \
-       (cpu_core_id[cpu] == BAD_APICID ? 0 : cpu_core_id[cpu])
+#define topology_physical_package_id(cpu)      (cpu_data[cpu].phys_proc_id)
+#define topology_core_id(cpu)                  (cpu_data[cpu].cpu_core_id)
 #define topology_core_siblings(cpu)            (cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)          (cpu_sibling_map[cpu])
 #endif
@@ -114,4 +112,9 @@ extern unsigned long node_remap_size[];
 
 extern cpumask_t cpu_coregroup_map(int cpu);
 
+#ifdef CONFIG_SMP
+#define mc_capable()   (boot_cpu_data.x86_max_cores > 1)
+#define smt_capable()  (smp_num_siblings > 1)
+#endif
+
 #endif /* _ASM_I386_TOPOLOGY_H */
index d480f2e38215d69df898f89e11519f3c00b9dc85..69f0f1df67220edd0bc7bfe0b2148c6038b86e05 100644 (file)
@@ -78,8 +78,8 @@ static inline int arch_unw_user_mode(const struct unwind_frame_info *info)
        return user_mode_vm(&info->regs);
 #else
        return info->regs.eip < PAGE_OFFSET
-              || (info->regs.eip >= __fix_to_virt(FIX_VSYSCALL)
-                   && info->regs.eip < __fix_to_virt(FIX_VSYSCALL) + PAGE_SIZE)
+              || (info->regs.eip >= __fix_to_virt(FIX_VDSO)
+                   && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
               || info->regs.esp < PAGE_OFFSET;
 #endif
 }
index a140310bf84dcb4120980b883965a9ea9260a847..2fb337b0e9b786486508a6611c3e6e098ea58781 100644 (file)
@@ -46,6 +46,18 @@ struct ia64_node_data {
  */
 #define NODE_DATA(nid)         (local_node_data->pg_data_ptrs[nid])
 
+/*
+ * LOCAL_DATA_ADDR - calculates the address of another node's
+ *                  "local_node_data" during hot-plug. local_node_data is
+ *                  reached via per_cpu_page and is normally used only for
+ *                  the executing CPU. When a new node is hot-added, however,
+ *                  the local-data addresses of the other nodes are needed
+ *                  so that all of them can be updated.
+ */
+#define LOCAL_DATA_ADDR(pgdat)                         \
+       ((struct ia64_node_data *)((u64)(pgdat) +       \
+                                  L1_CACHE_ALIGN(sizeof(struct pglist_data))))
+
 #endif /* CONFIG_NUMA */
 
 #endif /* _ASM_IA64_NODEDATA_H */
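
A minimal sketch of how LOCAL_DATA_ADDR() might be used when a node is hot-added, assuming the pg_data_ptrs array implied by NODE_DATA() above; the function and parameter names are hypothetical:

#include <linux/mmzone.h>
#include <asm/nodedata.h>

static void example_refresh_node_data(int nid, int new_nid, pg_data_t *new_pgdat)
{
        /* the node-local data sits right after the cache-aligned pglist_data */
        struct ia64_node_data *ndata = LOCAL_DATA_ADDR(NODE_DATA(nid));

        ndata->pg_data_ptrs[new_nid] = new_pgdat;
}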
index 616b5ed2aa7277e12885e6e35805d4fcacf748d9..937c212575239f3ec9ee9526e2c989289669f967 100644 (file)
@@ -112,6 +112,7 @@ void build_cpu_to_node_map(void);
 #define topology_core_id(cpu)                  (cpu_data(cpu)->core_id)
 #define topology_core_siblings(cpu)            (cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)          (cpu_sibling_map[cpu])
+#define smt_capable()                          (smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
index 92f3e5507d224cfc1cbf80e10d7d6bc09647a804..bbc3844b086fcadd9f529c69f5b12b6556530667 100644 (file)
@@ -93,5 +93,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev,
 
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_SMP
+#include <asm/cputable.h>
+#define smt_capable()          (cpu_has_feature(CPU_FTR_SMT))
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
index 0e234e201bd6b587375e17cdb450daae6c47b22c..98a6c613589dcbe29205d0705ebf871b6f577b22 100644 (file)
@@ -1,6 +1,9 @@
 #ifndef _ASM_SPARC64_TOPOLOGY_H
 #define _ASM_SPARC64_TOPOLOGY_H
 
+#include <asm/spitfire.h>
+#define smt_capable()  (tlb_type == hypervisor)
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */
index 1b2ac55d3204e9ed6bbf3816fd82c50fa774f461..931877462788473f71d39b04198dfa56986d79eb 100644 (file)
@@ -124,7 +124,7 @@ asmlinkage void IRQ_NAME(nr); \
 __asm__( \
 "\n.p2align\n" \
 "IRQ" #nr "_interrupt:\n\t" \
-       "push $" #nr "-256 ; " \
+       "push $~(" #nr ") ; " \
        "jmp common_interrupt");
 
 #if defined(CONFIG_X86_IO_APIC)
index c4e46e7fa7ba781f9c086bf2ace317d105d9ba56..6e7a2e976b04fc8bc8a98213522fabf5c623af02 100644 (file)
@@ -59,6 +59,8 @@ extern int __node_distance(int, int);
 #define topology_core_id(cpu)                  (cpu_data[cpu].cpu_core_id)
 #define topology_core_siblings(cpu)            (cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)          (cpu_sibling_map[cpu])
+#define mc_capable()                   (boot_cpu_data.x86_max_cores > 1)
+#define smt_capable()                  (smp_num_siblings > 1)
 #endif
 
 #include <asm-generic/topology.h>
index 90d6df1551ed06964d6f326866790531e7c04780..88b5dfd8ee125be2c3a025c7ea1e03183eaccbe4 100644 (file)
@@ -528,12 +528,18 @@ static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; }
 
 #ifdef CONFIG_ACPI_NUMA
 int acpi_get_pxm(acpi_handle handle);
+int acpi_get_node(acpi_handle *handle);
 #else
 static inline int acpi_get_pxm(acpi_handle handle)
 {
        return 0;
 }
+static inline int acpi_get_node(acpi_handle *handle)
+{
+       return 0;
+}
 #endif
+extern int acpi_paddr_to_node(u64 start_addr, u64 size);
 
 extern int pnpacpi_disabled;
 
index fb7e9b7ccbe312bf29ecada865da27dc152fb51d..737e407d0cd11e7479e6d38d3cf48562eec4f596 100644 (file)
@@ -149,7 +149,6 @@ void create_empty_buffers(struct page *, unsigned long,
                        unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-void end_buffer_async_write(struct buffer_head *bh, int uptodate);
 
 /* Things to do with buffers at mapping->private_list */
 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
@@ -214,6 +213,7 @@ int nobh_truncate_page(struct address_space *, loff_t);
 int nobh_writepage(struct page *page, get_block_t *get_block,
                         struct writeback_control *wbc);
 
+void buffer_init(void);
 
 /*
  * inline definitions
index 08d50c53aab401ef79c20e85213027d854e5d199..a3caf6866bae588885c43618d9965a6285f0913c 100644 (file)
@@ -31,17 +31,23 @@ struct cpu {
        struct sys_device sysdev;
 };
 
-extern int register_cpu(struct cpu *, int, struct node *);
+extern int register_cpu(struct cpu *cpu, int num);
 extern struct sys_device *get_cpu_sysdev(unsigned cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-extern void unregister_cpu(struct cpu *, struct node *);
+extern void unregister_cpu(struct cpu *cpu);
 #endif
 struct notifier_block;
 
 #ifdef CONFIG_SMP
 /* Need to know about CPUs going up/down? */
 extern int register_cpu_notifier(struct notifier_block *nb);
+#ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu_notifier(struct notifier_block *nb);
+#else
+static inline void unregister_cpu_notifier(struct notifier_block *nb)
+{
+}
+#endif
 extern int current_in_cpu_hotplug(void);
 
 int cpu_up(unsigned int cpu);
@@ -73,6 +79,8 @@ extern int lock_cpu_hotplug_interruptible(void);
                { .notifier_call = fn, .priority = pri };       \
        register_cpu_notifier(&fn##_nb);                        \
 }
+#define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
+#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
 int cpu_down(unsigned int cpu);
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
 #else
@@ -80,6 +88,8 @@ int cpu_down(unsigned int cpu);
 #define unlock_cpu_hotplug()   do { } while (0)
 #define lock_cpu_hotplug_interruptible() 0
 #define hotcpu_notifier(fn, pri)
+#define register_hotcpu_notifier(nb)
+#define unregister_hotcpu_notifier(nb)
 
 /* CPUs don't go offline once they're online w/o CONFIG_HOTPLUG_CPU */
 static inline int cpu_is_offline(int cpu) { return 0; }
index 78b236ca04f801ec188f1f78306ee44d10e02d95..272010a6078a9181dfb4a49a51f3820d47ee1b01 100644 (file)
@@ -20,7 +20,7 @@
  */
 #ifndef DMAENGINE_H
 #define DMAENGINE_H
-#include <linux/config.h>
+
 #ifdef CONFIG_DMA_ENGINE
 
 #include <linux/device.h>
index 966a5b3da439b40232c269f44026733db5a76244..34c3a215f2cd9affe3583451063a09887ee0669c 100644 (file)
@@ -12,6 +12,9 @@
 #define FUTEX_REQUEUE          3
 #define FUTEX_CMP_REQUEUE      4
 #define FUTEX_WAKE_OP          5
+#define FUTEX_LOCK_PI          6
+#define FUTEX_UNLOCK_PI                7
+#define FUTEX_TRYLOCK_PI       8
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
@@ -90,18 +93,21 @@ struct robust_list_head {
  */
 #define ROBUST_LIST_LIMIT      2048
 
-long do_futex(unsigned long uaddr, int op, int val,
-               unsigned long timeout, unsigned long uaddr2, int val2,
-               int val3);
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+             u32 __user *uaddr2, u32 val2, u32 val3);
 
 extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
+extern void exit_pi_state_list(struct task_struct *curr);
 #else
 static inline void exit_robust_list(struct task_struct *curr)
 {
 }
+static inline void exit_pi_state_list(struct task_struct *curr)
+{
+}
 #endif
 
 #define FUTEX_OP_SET           0       /* *(int *)UADDR2 = OPARG; */
index e127ef7e8da834dd3300ab875349c530b8a5e60d..3a256957fb56a7f05453dd3ea60cb075ca086b01 100644 (file)
@@ -87,6 +87,7 @@ extern struct group_info init_groups;
        .lock_depth     = -1,                                           \
        .prio           = MAX_PRIO-20,                                  \
        .static_prio    = MAX_PRIO-20,                                  \
+       .normal_prio    = MAX_PRIO-20,                                  \
        .policy         = SCHED_NORMAL,                                 \
        .cpus_allowed   = CPU_MASK_ALL,                                 \
        .mm             = NULL,                                         \
@@ -122,6 +123,8 @@ extern struct group_info init_groups;
        .journal_info   = NULL,                                         \
        .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
        .fs_excl        = ATOMIC_INIT(0),                               \
+       .pi_lock        = SPIN_LOCK_UNLOCKED,                           \
+       INIT_RT_MUTEXES(tsk)                                            \
 }
 
 
index cd6bd001ba4edbc68dfc5b52f08ef3bb832c785e..edfc733b1575c4f4c5d47644784e83e1d0d32a75 100644 (file)
@@ -105,6 +105,9 @@ extern int allocate_resource(struct resource *root, struct resource *new,
 int adjust_resource(struct resource *res, unsigned long start,
                    unsigned long size);
 
+/* get registered SYSTEM_RAM resources in specified area */
+extern int find_next_system_ram(struct resource *res);
+
 /* Convenience shorthand with allocation */
 #define request_region(start,n,name)   __request_region(&ioport_resource, (start), (n), (name))
 #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name))
index 5653b2f23b6a6fdf94f0e1d15ae8cb7436638db0..d09fbeabf1dc976539384f1889299bfc2e69383e 100644 (file)
@@ -210,11 +210,7 @@ struct kernel_ipmi_msg
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/device.h>
-
-#ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
-extern struct proc_dir_entry *proc_ipmi_root;
-#endif /* CONFIG_PROC_FS */
 
 /* Opaque type for an IPMI message user.  One of these is needed to
    send and receive messages. */
index 37ca31b21bb7a29cda46861f5e3d97a25ed83622..6b74adf5297f644f82531a5fbcbc55b5892d13f7 100644 (file)
@@ -4,17 +4,10 @@
 #ifdef __KERNEL__
 
 #include <linux/stddef.h>
+#include <linux/poison.h>
 #include <linux/prefetch.h>
 #include <asm/system.h>
 
-/*
- * These are non-NULL pointers that will result in page faults
- * under normal circumstances, used to verify that nobody uses
- * non-initialized list entries.
- */
-#define LIST_POISON1  ((void *) 0x00100100)
-#define LIST_POISON2  ((void *) 0x00200200)
-
 /*
  * Simple doubly linked list implementation.
  *
index 911206386171ccedc51397d9e36a7c424de9642c..218501cfaeb9255ed62ca9fbf2a197cbb6993f53 100644 (file)
@@ -63,6 +63,76 @@ extern int online_pages(unsigned long, unsigned long);
 /* reasonably generic interface to expand the physical pages in a zone  */
 extern int __add_pages(struct zone *zone, unsigned long start_pfn,
        unsigned long nr_pages);
+
+#ifdef CONFIG_NUMA
+extern int memory_add_physaddr_to_nid(u64 start);
+#else
+static inline int memory_add_physaddr_to_nid(u64 start)
+{
+       return 0;
+}
+#endif
+
+#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
+/*
+ * To support node hot-add, we have to allocate a new pgdat.
+ *
+ * If an arch has a generic-style NODE_DATA(), then
+ * node_data[nid] = kzalloc() works well, but that depends on the architecture.
+ *
+ * In general, generic_alloc_nodedata() is used.
+ * For now, arch_free_nodedata() is only defined for the error path of node hot-add.
+ *
+ */
+extern pg_data_t *arch_alloc_nodedata(int nid);
+extern void arch_free_nodedata(pg_data_t *pgdat);
+extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
+
+#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
+
+#define arch_alloc_nodedata(nid)       generic_alloc_nodedata(nid)
+#define arch_free_nodedata(pgdat)      generic_free_nodedata(pgdat)
+
+#ifdef CONFIG_NUMA
+/*
+ * If ARCH_HAS_NODEDATA_EXTENSION=n, this function is used to allocate the pgdat.
+ * XXX: kmalloc_node() cannot yet be used to get the new node's own memory,
+ *     because the pgdat for the new node is itself not allocated/initialized
+ *     yet. Using the new node's memory will require further work.
+ */
+#define generic_alloc_nodedata(nid)                            \
+({                                                             \
+       kzalloc(sizeof(pg_data_t), GFP_KERNEL);                 \
+})
+/*
+ * This definition is just for error path in node hotadd.
+ * For node hotremove, we have to replace this.
+ */
+#define generic_free_nodedata(pgdat)   kfree(pgdat)
+
+extern pg_data_t *node_data[];
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+       node_data[nid] = pgdat;
+}
+
+#else /* !CONFIG_NUMA */
+
+/* never called */
+static inline pg_data_t *generic_alloc_nodedata(int nid)
+{
+       BUG();
+       return NULL;
+}
+static inline void generic_free_nodedata(pg_data_t *pgdat)
+{
+}
+static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
+{
+}
+#endif /* CONFIG_NUMA */
+#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
+
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
@@ -99,7 +169,8 @@ static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
        return -ENOSYS;
 }
 
-extern int add_memory(u64 start, u64 size);
+extern int add_memory(int nid, u64 start, u64 size);
+extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int remove_memory(u64 start, u64 size);
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
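
With the new signatures above, a caller hot-adding a physical range first resolves the target node and then passes it down explicitly; a minimal sketch under that assumption (the function name and values are hypothetical):

#include <linux/memory_hotplug.h>

static int example_hotadd_range(u64 start, u64 size)
{
        int nid = memory_add_physaddr_to_nid(start);

        /* add_memory() now takes the target node explicitly */
        return add_memory(nid, start, size);
}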
index a929ea197e4844da5d11fc9f43da506138b276f7..c41a1299b8cf354cc8b9d9677f328aa4bc8b8533 100644 (file)
@@ -1030,13 +1030,20 @@ static inline void vm_stat_account(struct mm_struct *mm,
 }
 #endif /* CONFIG_PROC_FS */
 
+static inline void
+debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+       mutex_debug_check_no_locks_freed(from, len);
+       rt_mutex_debug_check_no_locks_freed(from, len);
+}
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
 {
        if (!PageHighMem(page) && !enable)
-               mutex_debug_check_no_locks_freed(page_address(page),
-                                                numpages * PAGE_SIZE);
+               debug_check_no_locks_freed(page_address(page),
+                                          numpages * PAGE_SIZE);
 }
 #endif
 
@@ -1065,5 +1072,7 @@ void drop_slab(void);
 extern int randomize_va_space;
 #endif
 
+const char *arch_vma_name(struct vm_area_struct *vma);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
index 254dc3de650b81e5cce4821f8101dad72005f996..81dcec84cd8f00290037cb29c7f7e0960b51b895 100644 (file)
@@ -26,8 +26,25 @@ struct node {
        struct sys_device       sysdev;
 };
 
+extern struct node node_devices[];
+
 extern int register_node(struct node *, int, struct node *);
 extern void unregister_node(struct node *node);
+extern int register_one_node(int nid);
+extern void unregister_one_node(int nid);
+#ifdef CONFIG_NUMA
+extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
+extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
+#else
+static inline int register_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+       return 0;
+}
+static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
+{
+       return 0;
+}
+#endif
 
 #define to_node(sys_device) container_of(sys_device, struct node, sysdev)
 
diff --git a/include/linux/nsc_gpio.h b/include/linux/nsc_gpio.h
new file mode 100644 (file)
index 0000000..135742c
--- /dev/null
@@ -0,0 +1,42 @@
+/**
+   nsc_gpio.h
+
+   National Semiconductor GPIO common access methods.
+
+   struct nsc_gpio_ops abstracts the low-level access
+   operations for the GPIO units on two NSC chip families: the Geode
+   integrated CPU, and the PC-8736[03456] integrated PC-peripheral
+   chips.
+
+   The GPIO units on these chips have the same pin architecture, but
+   the access methods differ.  Thus, scx200_gpio and pc8736x_gpio
+   implement their own versions of these routines and use the common
+   file-operation routines implemented in the nsc_gpio module.
+
+   Copyright (c) 2005 Jim Cromie <jim.cromie@gmail.com>
+
+   NB: this work was tested on the Geode SC-1100 and PC-87366 chips.
+   NSC sold the GEODE line to AMD, and the PC-8736x line to Winbond.
+*/
+
+struct nsc_gpio_ops {
+       struct module*  owner;
+       u32     (*gpio_config)  (unsigned iminor, u32 mask, u32 bits);
+       void    (*gpio_dump)    (struct nsc_gpio_ops *amp, unsigned iminor);
+       int     (*gpio_get)     (unsigned iminor);
+       void    (*gpio_set)     (unsigned iminor, int state);
+       void    (*gpio_set_high)(unsigned iminor);
+       void    (*gpio_set_low) (unsigned iminor);
+       void    (*gpio_change)  (unsigned iminor);
+       int     (*gpio_current) (unsigned iminor);
+       struct device*  dev;    /* for dev_dbg() support, set in init  */
+};
+
+extern ssize_t nsc_gpio_write(struct file *file, const char __user *data,
+                             size_t len, loff_t *ppos);
+
+extern ssize_t nsc_gpio_read(struct file *file, char __user *buf,
+                            size_t len, loff_t *ppos);
+
+extern void nsc_gpio_dump(struct nsc_gpio_ops *amp, unsigned index);
+
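A sketch of how a chip driver (scx200_gpio or pc8736x_gpio, per the comment above) would plug into this abstraction by filling in an nsc_gpio_ops table; the example_* accessors are hypothetical stand-ins for the driver's own routines:

#include <linux/module.h>
#include <linux/nsc_gpio.h>

/* example_* accessors below are hypothetical driver routines */
static struct nsc_gpio_ops example_gpio_ops = {
        .owner         = THIS_MODULE,
        .gpio_config   = example_gpio_config,
        .gpio_dump     = nsc_gpio_dump,         /* common dump helper */
        .gpio_get      = example_gpio_get,
        .gpio_set      = example_gpio_set,
        .gpio_set_high = example_gpio_set_high,
        .gpio_set_low  = example_gpio_set_low,
        .gpio_change   = example_gpio_change,
        .gpio_current  = example_gpio_current,
        /* .dev is filled in at init time for dev_dbg() */
};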
diff --git a/include/linux/plist.h b/include/linux/plist.h
new file mode 100644 (file)
index 0000000..3404fae
--- /dev/null
@@ -0,0 +1,247 @@
+/*
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker <dwalker@mvista.com>
+ *
+ * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Simplifications of the original code by
+ * Oleg Nesterov <oleg@tv-sign.ru>
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ *
+ * This is a priority-sorted list of nodes; each node has a
+ * priority from INT_MIN (highest) to INT_MAX (lowest).
+ *
+ * Addition is O(K), removal is O(1), change of priority of a node is
+ * O(K) and K is the number of RT priority levels used in the system.
+ * (1 <= K <= 99)
+ *
+ * This list is really a list of lists:
+ *
+ *  - The tier 1 list is the prio_list, different priority nodes.
+ *
+ *  - The tier 2 list is the node_list, serialized nodes.
+ *
+ * Simple ASCII art explanation:
+ *
+ * |HEAD          |
+ * |              |
+ * |prio_list.prev|<------------------------------------|
+ * |prio_list.next|<->|pl|<->|pl|<--------------->|pl|<-|
+ * |10            |   |10|   |21|   |21|   |21|   |40|   (prio)
+ * |              |   |  |   |  |   |  |   |  |   |  |
+ * |              |   |  |   |  |   |  |   |  |   |  |
+ * |node_list.next|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
+ * |node_list.prev|<------------------------------------|
+ *
+ * The nodes on the prio_list list are sorted by priority to simplify
+ * the insertion of new nodes. There are no nodes with duplicate
+ * priorities on the list.
+ *
+ * The nodes on the node_list are ordered by priority and can contain
+ * entries which have the same priority. Those entries are ordered
+ * FIFO.
+ *
+ * Addition means: look for the prio_list node in the prio_list
+ * for the priority of the node and insert it before the node_list
+ * entry of the next prio_list node. If it is the first node of
+ * that priority, add it to the prio_list in the right position and
+ * insert it into the serialized node_list list.
+ *
+ * Removal means remove it from the node_list and remove it from
+ * the prio_list if the node_list list_head is non-empty. In case
+ * of removal from the prio_list it must be checked whether other
+ * entries of the same priority are on the list or not. If there
+ * is another entry of the same priority then this entry has to
+ * replace the removed entry on the prio_list. If the entry which
+ * is removed is the only entry of this priority then a simple
+ * removal from both lists is sufficient.
+ *
+ * INT_MIN is the highest priority, INT_MAX the lowest, and 0 sits
+ * in the middle.
+ *
+ * No locking is done, up to the caller.
+ *
+ */
+#ifndef _LINUX_PLIST_H_
+#define _LINUX_PLIST_H_
+
+#include <linux/list.h>
+#include <linux/spinlock_types.h>
+
+struct plist_head {
+       struct list_head prio_list;
+       struct list_head node_list;
+#ifdef CONFIG_DEBUG_PI_LIST
+       spinlock_t *lock;
+#endif
+};
+
+struct plist_node {
+       int                     prio;
+       struct plist_head       plist;
+};
+
+#ifdef CONFIG_DEBUG_PI_LIST
+# define PLIST_HEAD_LOCK_INIT(_lock)   .lock = _lock
+#else
+# define PLIST_HEAD_LOCK_INIT(_lock)
+#endif
+
+/**
+ * #PLIST_HEAD_INIT - static struct plist_head initializer
+ *
+ * @head:      struct plist_head variable name
+ * @_lock:     spinlock protecting the list (only used with CONFIG_DEBUG_PI_LIST)
+ */
+#define PLIST_HEAD_INIT(head, _lock)                   \
+{                                                      \
+       .prio_list = LIST_HEAD_INIT((head).prio_list),  \
+       .node_list = LIST_HEAD_INIT((head).node_list),  \
+       PLIST_HEAD_LOCK_INIT(&(_lock))                  \
+}
+
+/**
+ * #PLIST_NODE_INIT - static struct plist_node initializer
+ *
+ * @node:      struct plist_node variable name
+ * @__prio:    initial node priority
+ */
+#define PLIST_NODE_INIT(node, __prio)                  \
+{                                                      \
+       .prio  = (__prio),                              \
+       .plist = PLIST_HEAD_INIT((node).plist, NULL),   \
+}
+
+/**
+ * plist_head_init - dynamic struct plist_head initializer
+ *
+ * @head:      &struct plist_head pointer
+ * @lock:      spinlock protecting the list (only used with CONFIG_DEBUG_PI_LIST)
+ */
+static inline void
+plist_head_init(struct plist_head *head, spinlock_t *lock)
+{
+       INIT_LIST_HEAD(&head->prio_list);
+       INIT_LIST_HEAD(&head->node_list);
+#ifdef CONFIG_DEBUG_PI_LIST
+       head->lock = lock;
+#endif
+}
+
+/**
+ * plist_node_init - Dynamic struct plist_node initializer
+ *
+ * @node:      &struct plist_node pointer
+ * @prio:      initial node priority
+ */
+static inline void plist_node_init(struct plist_node *node, int prio)
+{
+       node->prio = prio;
+       plist_head_init(&node->plist, NULL);
+}
+
+extern void plist_add(struct plist_node *node, struct plist_head *head);
+extern void plist_del(struct plist_node *node, struct plist_head *head);
+
+/**
+ * plist_for_each - iterate over the plist
+ *
+ * @pos:       the type * to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define plist_for_each(pos, head)      \
+        list_for_each_entry(pos, &(head)->node_list, plist.node_list)
+
+/**
+ * plist_for_each_safe - iterate over a plist safe against removal of
+ * list entries
+ *
+ * @pos:       the type * to use as a loop counter.
+ * @n:         another type * to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define plist_for_each_safe(pos, n, head)      \
+        list_for_each_entry_safe(pos, n, &(head)->node_list, plist.node_list)
+
+/**
+ * plist_for_each_entry        - iterate over list of given type
+ *
+ * @pos:       the type * to use as a loop counter.
+ * @head:      the head for your list.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define plist_for_each_entry(pos, head, mem)   \
+        list_for_each_entry(pos, &(head)->node_list, mem.plist.node_list)
+
+/**
+ * plist_for_each_entry_safe - iterate over list of given type safe against
+ * removal of list entry
+ *
+ * @pos:       the type * to use as a loop counter.
+ * @n:         another type * to use as temporary storage
+ * @head:      the head for your list.
+ * @m:         the name of the list_struct within the struct.
+ */
+#define plist_for_each_entry_safe(pos, n, head, m)     \
+       list_for_each_entry_safe(pos, n, &(head)->node_list, m.plist.node_list)
+
+/**
+ * plist_head_empty - return !0 if a plist_head is empty
+ *
+ * @head:      &struct plist_head pointer
+ */
+static inline int plist_head_empty(const struct plist_head *head)
+{
+       return list_empty(&head->node_list);
+}
+
+/**
+ * plist_node_empty - return !0 if plist_node is not on a list
+ *
+ * @node:      &struct plist_node pointer
+ */
+static inline int plist_node_empty(const struct plist_node *node)
+{
+       return plist_head_empty(&node->plist);
+}
+
+/* All functions below assume the plist_head is not empty. */
+
+/**
+ * plist_first_entry - get the struct for the first entry
+ *
+ * @ptr:       the &struct plist_head pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ */
+#ifdef CONFIG_DEBUG_PI_LIST
+# define plist_first_entry(head, type, member) \
+({ \
+       WARN_ON(plist_head_empty(head)); \
+       container_of(plist_first(head), type, member); \
+})
+#else
+# define plist_first_entry(head, type, member) \
+       container_of(plist_first(head), type, member)
+#endif
+
+/**
+ * plist_first - return the first node (and thus, highest priority)
+ *
+ * @head:      the &struct plist_head pointer
+ *
+ * Assumes the plist is _not_ empty.
+ */
+static inline struct plist_node* plist_first(const struct plist_head *head)
+{
+       return list_entry(head->node_list.next,
+                         struct plist_node, plist.node_list);
+}
+
+#endif
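
A short usage sketch of the plist API declared above; the waiter structure and all example_* names are hypothetical:

#include <linux/plist.h>
#include <linux/spinlock.h>

struct example_waiter {
        struct plist_node node;
        int data;
};

static DEFINE_SPINLOCK(example_lock);
static struct plist_head example_head =
        PLIST_HEAD_INIT(example_head, example_lock);

static void example_use(struct example_waiter *w, int prio)
{
        plist_node_init(&w->node, prio);
        plist_add(&w->node, &example_head);     /* O(K) insertion */

        /* highest priority entry == numerically lowest prio value */
        w = plist_first_entry(&example_head, struct example_waiter, node);
        plist_del(&w->node, &example_head);     /* O(1) removal */
}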
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644 (file)
index 0000000..a5347c0
--- /dev/null
@@ -0,0 +1,58 @@
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define        RED_INACTIVE    0x5A2CF071UL    /* when obj is inactive */
+#define        RED_ACTIVE      0x170FC2A5UL    /* when obj is active */
+
+/* ...and for poisoning */
+#define        POISON_INUSE    0x5a    /* for use-uninitialised poisoning */
+#define POISON_FREE    0x6b    /* for use-after-free poisoning */
+#define        POISON_END      0xa5    /* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM    0xcc
+
+/********** arch/x86_64/mm/init.c **********/
+#define        POISON_FREE_INITDATA    0xba
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE        0x5b
+
+/********** drivers/base/dmapool.c **********/
+#define        POOL_POISON_FREED       0xa7    /* !inuse */
+#define        POOL_POISON_ALLOCATED   0xa9    /* !initted */
+
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE                0x12
+
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT       0x11
+#define MUTEX_DEBUG_FREE       0x22
+
+/********** security/ **********/
+#define KEY_DESTROY            0xbd
+
+/********** sound/oss/ **********/
+#define OSS_POISON_FREE                0xAB
+
+#endif
index 6312758393b61a3e3e741825f78d49956024ea26..48dfe00070c70bbcd27574870b19288f7655364c 100644 (file)
@@ -258,6 +258,7 @@ extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
 extern void rcu_restart_cpu(int cpu);
 extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
 
 /* Exported interfaces */
 extern void FASTCALL(call_rcu(struct rcu_head *head, 
diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
new file mode 100644 (file)
index 0000000..fa4a3b8
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef __LINUX_RT_MUTEX_H
+#define __LINUX_RT_MUTEX_H
+
+#include <linux/linkage.h>
+#include <linux/plist.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * The rt_mutex structure
+ *
+ * @wait_lock: spinlock to protect the structure
+ * @wait_list: plist head to enqueue waiters in priority order
+ * @owner:     the mutex owner
+ */
+struct rt_mutex {
+       spinlock_t              wait_lock;
+       struct plist_head       wait_list;
+       struct task_struct      *owner;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+       int                     save_state;
+       struct list_head        held_list_entry;
+       unsigned long           acquire_ip;
+       const char              *name, *file;
+       int                     line;
+       void                    *magic;
+#endif
+};
+
+struct rt_mutex_waiter;
+struct hrtimer_sleeper;
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ extern int rt_mutex_debug_check_no_locks_freed(const void *from,
+                                               unsigned long len);
+ extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task);
+#else
+ static inline int rt_mutex_debug_check_no_locks_freed(const void *from,
+                                                      unsigned long len)
+ {
+       return 0;
+ }
+# define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
+#endif
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
+       , .name = #mutexname, .file = __FILE__, .line = __LINE__
+# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __FUNCTION__)
+ extern void rt_mutex_debug_task_free(struct task_struct *tsk);
+#else
+# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
+# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
+# define rt_mutex_debug_task_free(t)                   do { } while (0)
+#endif
+
+#define __RT_MUTEX_INITIALIZER(mutexname) \
+       { .wait_lock = SPIN_LOCK_UNLOCKED \
+       , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list, mutexname.wait_lock) \
+       , .owner = NULL \
+       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
+
+#define DEFINE_RT_MUTEX(mutexname) \
+       struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
+
+/***
+ * rt_mutex_is_locked - is the mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns 1 if the mutex is locked, 0 if unlocked.
+ */
+static inline int rt_mutex_is_locked(struct rt_mutex *lock)
+{
+       return lock->owner != NULL;
+}
+
+extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void rt_mutex_destroy(struct rt_mutex *lock);
+
+extern void rt_mutex_lock(struct rt_mutex *lock);
+extern int rt_mutex_lock_interruptible(struct rt_mutex *lock,
+                                               int detect_deadlock);
+extern int rt_mutex_timed_lock(struct rt_mutex *lock,
+                                       struct hrtimer_sleeper *timeout,
+                                       int detect_deadlock);
+
+extern int rt_mutex_trylock(struct rt_mutex *lock);
+
+extern void rt_mutex_unlock(struct rt_mutex *lock);
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define INIT_RT_MUTEX_DEBUG(tsk)                                      \
+       .held_list_head = LIST_HEAD_INIT(tsk.held_list_head),           \
+       .held_list_lock = SPIN_LOCK_UNLOCKED
+#else
+# define INIT_RT_MUTEX_DEBUG(tsk)
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+# define INIT_RT_MUTEXES(tsk)                                          \
+       .pi_waiters     = PLIST_HEAD_INIT(tsk.pi_waiters, tsk.pi_lock), \
+       INIT_RT_MUTEX_DEBUG(tsk)
+#else
+# define INIT_RT_MUTEXES(tsk)
+#endif
+
+#endif
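
A minimal usage sketch of the rt_mutex API declared above; the lock and function names are hypothetical:

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(example_mutex);

static void example_critical_section(void)
{
        rt_mutex_lock(&example_mutex);
        /* if a higher-priority task blocks here, the owner's priority is boosted */
        rt_mutex_unlock(&example_mutex);

        if (rt_mutex_trylock(&example_mutex)) {
                /* acquired without blocking */
                rt_mutex_unlock(&example_mutex);
        }
}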
index 122a25c1b997e46641b636db8a740ca9af9e3788..821f0481ebe190c0b5e171a4e6ab887802b3015c 100644 (file)
@@ -73,6 +73,7 @@ struct sched_param {
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
 #include <linux/futex.h>
+#include <linux/rtmutex.h>
 
 #include <linux/time.h>
 #include <linux/param.h>
@@ -83,6 +84,7 @@ struct sched_param {
 #include <asm/processor.h>
 
 struct exec_domain;
+struct futex_pi_state;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -123,6 +125,7 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern unsigned long weighted_cpuload(const int cpu);
 
 
 /*
@@ -494,8 +497,11 @@ struct signal_struct {
 
 #define MAX_PRIO               (MAX_RT_PRIO + 40)
 
-#define rt_task(p)             (unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)          unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)             rt_prio((p)->prio)
 #define batch_task(p)          (unlikely((p)->policy == SCHED_BATCH))
+#define has_rt_policy(p) \
+       unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -558,9 +564,9 @@ enum idle_type
 /*
  * sched-domains (multiprocessor balancing) declarations:
  */
-#ifdef CONFIG_SMP
 #define SCHED_LOAD_SCALE       128UL   /* increase resolution of load */
 
+#ifdef CONFIG_SMP
 #define SD_LOAD_BALANCE                1       /* Do load balancing on this domain. */
 #define SD_BALANCE_NEWIDLE     2       /* Balance when about to become idle */
 #define SD_BALANCE_EXEC                4       /* Balance on exec */
@@ -569,6 +575,11 @@ enum idle_type
 #define SD_WAKE_AFFINE         32      /* Wake task to waking CPU */
 #define SD_WAKE_BALANCE                64      /* Perform balancing at task wakeup */
 #define SD_SHARE_CPUPOWER      128     /* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE        256     /* Balance for power savings */
+
+#define BALANCE_FOR_POWER      ((sched_mc_power_savings || sched_smt_power_savings) \
+                                ? SD_POWERSAVINGS_BALANCE : 0)
+
 
 struct sched_group {
        struct sched_group *next;       /* Must be a circular list */
@@ -638,7 +649,7 @@ struct sched_domain {
 #endif
 };
 
-extern void partition_sched_domains(cpumask_t *partition1,
+extern int partition_sched_domains(cpumask_t *partition1,
                                    cpumask_t *partition2);
 
 /*
@@ -713,10 +724,13 @@ struct task_struct {
 
        int lock_depth;         /* BKL lock depth */
 
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#ifdef CONFIG_SMP
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
        int oncpu;
 #endif
-       int prio, static_prio;
+#endif
+       int load_weight;        /* for niceness load balancing purposes */
+       int prio, static_prio, normal_prio;
        struct list_head run_list;
        prio_array_t *array;
 
@@ -843,6 +857,20 @@ struct task_struct {
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
        spinlock_t alloc_lock;
 
+       /* Protection of the PI data structures: */
+       spinlock_t pi_lock;
+
+#ifdef CONFIG_RT_MUTEXES
+       /* PI waiters blocked on a rt_mutex held by this task */
+       struct plist_head pi_waiters;
+       /* Deadlock detection and priority inheritance handling */
+       struct rt_mutex_waiter *pi_blocked_on;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+       spinlock_t held_list_lock;
+       struct list_head held_list_head;
+# endif
+#endif
+
 #ifdef CONFIG_DEBUG_MUTEXES
        /* mutex deadlock detection */
        struct mutex_waiter *blocked_on;
@@ -888,6 +916,8 @@ struct task_struct {
 #ifdef CONFIG_COMPAT
        struct compat_robust_list_head __user *compat_robust_list;
 #endif
+       struct list_head pi_state_list;
+       struct futex_pi_state *pi_state_cache;
 
        atomic_t fs_excl;       /* holding fs exclusive resources */
        struct rcu_head rcu;
@@ -955,6 +985,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SPREAD_PAGE 0x01000000      /* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB 0x02000000      /* Spread some slab caches over cpuset */
 #define PF_MEMPOLICY   0x10000000      /* Non-default NUMA mempolicy */
+#define PF_MUTEX_TESTER        0x20000000      /* Thread belongs to the rt mutex tester */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1009,6 +1040,19 @@ static inline void idle_task_exit(void) {}
 #endif
 
 extern void sched_idle_next(void);
+
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+extern void rt_mutex_adjust_pi(task_t *p);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+       return p->normal_prio;
+}
+# define rt_mutex_adjust_pi(p)         do { } while (0)
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
@@ -1408,6 +1452,11 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
+#include <linux/sysdev.h>
+extern int sched_mc_power_savings, sched_smt_power_savings;
+extern struct sysdev_attribute attr_sched_mc_power_savings, attr_sched_smt_power_savings;
+extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
+
 extern void normalize_rt_tasks(void);
 
 #ifdef CONFIG_PM
index a22f9e173ad2d3b953dc3b90fb9df231b70557b2..693c0557e70bdf6b13baa898a4dca5c82b8ae919 100644 (file)
@@ -49,10 +49,3 @@ extern unsigned scx200_cb_base;
 #define SCx200_REV 0x3d                /* Revision Register */
 #define SCx200_CBA 0x3e                /* Configuration Base Address Register */
 #define SCx200_CBA_SCRATCH 0x64        /* Configuration Base Address Scratchpad */
-
-/*
-    Local variables:
-        compile-command: "make -C ../.. bzImage modules"
-        c-basic-offset: 8
-    End:
-*/
index 30cdd648ba79355d5dfed28fcbbb64ccf6be4de2..90dd069cc145c4b26a6fefe020aacee17d28aa65 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/spinlock.h>
 
-u32 scx200_gpio_configure(int index, u32 set, u32 clear);
+u32 scx200_gpio_configure(unsigned index, u32 set, u32 clear);
 
 extern unsigned scx200_gpio_base;
 extern long scx200_gpio_shadow[2];
@@ -17,7 +17,7 @@ extern long scx200_gpio_shadow[2];
 
 /* returns the value of the GPIO pin */
 
-static inline int scx200_gpio_get(int index) {
+static inline int scx200_gpio_get(unsigned index) {
        __SCx200_GPIO_BANK;
        __SCx200_GPIO_IOADDR + 0x04;
        __SCx200_GPIO_INDEX;
@@ -29,7 +29,7 @@ static inline int scx200_gpio_get(int index) {
    driven if the GPIO is configured as an output, it might not be the
    state of the GPIO right now if the GPIO is configured as an input) */
 
-static inline int scx200_gpio_current(int index) {
+static inline int scx200_gpio_current(unsigned index) {
         __SCx200_GPIO_BANK;
        __SCx200_GPIO_INDEX;
                
@@ -38,7 +38,7 @@ static inline int scx200_gpio_current(int index) {
 
 /* drive the GPIO signal high */
 
-static inline void scx200_gpio_set_high(int index) {
+static inline void scx200_gpio_set_high(unsigned index) {
        __SCx200_GPIO_BANK;
        __SCx200_GPIO_IOADDR;
        __SCx200_GPIO_SHADOW;
@@ -49,7 +49,7 @@ static inline void scx200_gpio_set_high(int index) {
 
 /* drive the GPIO signal low */
 
-static inline void scx200_gpio_set_low(int index) {
+static inline void scx200_gpio_set_low(unsigned index) {
        __SCx200_GPIO_BANK;
        __SCx200_GPIO_IOADDR;
        __SCx200_GPIO_SHADOW;
@@ -60,7 +60,7 @@ static inline void scx200_gpio_set_low(int index) {
 
 /* drive the GPIO signal to state */
 
-static inline void scx200_gpio_set(int index, int state) {
+static inline void scx200_gpio_set(unsigned index, int state) {
        __SCx200_GPIO_BANK;
        __SCx200_GPIO_IOADDR;
        __SCx200_GPIO_SHADOW;
@@ -73,7 +73,7 @@ static inline void scx200_gpio_set(int index, int state) {
 }
 
 /* toggle the GPIO signal */
-static inline void scx200_gpio_change(int index) {
+static inline void scx200_gpio_change(unsigned index) {
        __SCx200_GPIO_BANK;
        __SCx200_GPIO_IOADDR;
        __SCx200_GPIO_SHADOW;
@@ -87,10 +87,3 @@ static inline void scx200_gpio_change(int index) {
 #undef __SCx200_GPIO_SHADOW
 #undef __SCx200_GPIO_INDEX
 #undef __SCx200_GPIO_OUT
-
-/*
-    Local variables:
-        compile-command: "make -C ../.. bzImage modules"
-        c-basic-offset: 8
-    End:
-*/
index dc3f3aa0c83e89ec345caac3d4af9fdb1eae35b1..c41e2d6d1acc3dfb4ca036a6a78047e02181d5c0 100644 (file)
@@ -199,6 +199,8 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+extern int kswapd_run(int nid);
+
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
index 33785b79d548a802e346022714d5032cacdbd097..008f04c5673715b33a52536e2bd4375bc064ff92 100644 (file)
@@ -174,9 +174,9 @@ asmlinkage long sys_waitid(int which, pid_t pid,
                           int options, struct rusage __user *ru);
 asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options);
 asmlinkage long sys_set_tid_address(int __user *tidptr);
-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
                        struct timespec __user *utime, u32 __user *uaddr2,
-                       int val3);
+                       u32 val3);
 
 asmlinkage long sys_init_module(void __user *umod, unsigned long len,
                                const char __user *uargs);
index 349ef908a2222d313ea7f87932a3a2d1d44b36b2..46e4d8f2771f9fa5812205c01a0fc85778fa9a44 100644 (file)
@@ -149,6 +149,7 @@ enum
        KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
        KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
        KERN_COMPAT_LOG=73,     /* int: print compat layer  messages */
+       KERN_MAX_LOCK_DEPTH=74,
 };
 
 
@@ -189,6 +190,7 @@ enum
        VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
        VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
        VM_PANIC_ON_OOM=33,     /* panic at out-of-memory */
+       VM_VDSO_ENABLED=34,     /* map VDSO into new processes? */
 };
 
 
index a305ae2e44b6dde305d3afe241768e32c47d8907..ec1eca85290ade96c5f42da4d74acac2a8ae2407 100644 (file)
        .flags                  = SD_LOAD_BALANCE       \
                                | SD_BALANCE_NEWIDLE    \
                                | SD_BALANCE_EXEC       \
-                               | SD_WAKE_AFFINE,       \
+                               | SD_WAKE_AFFINE        \
+                               | BALANCE_FOR_POWER,    \
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
        .nr_balance_failed      = 0,                    \
index df55b36656010b726aeb8b7a3e08984e2940a8da..f70f2fd273c2152708bdac6800d038229b58a8cc 100644 (file)
@@ -339,9 +339,14 @@ config BASE_FULL
          kernel data structures. This saves memory on small machines,
          but may reduce performance.
 
+config RT_MUTEXES
+       boolean
+       select PLIST
+
 config FUTEX
        bool "Enable futex support" if EMBEDDED
        default y
+       select RT_MUTEXES
        help
          Disabling this option will cause the kernel to be built without
          support for "fast userspace mutexes".  The resulting kernel may not
index 80af1a52485fcfd5b272e747d7a1576b0993167f..0d57f6ccb63a1a7c1f1efd47810727feb40e8ae8 100644 (file)
@@ -48,6 +48,7 @@
 #include <linux/mempolicy.h>
 #include <linux/key.h>
 #include <linux/unwind.h>
+#include <linux/buffer_head.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -80,7 +81,6 @@ extern void mca_init(void);
 extern void sbus_init(void);
 extern void sysctl_init(void);
 extern void signals_init(void);
-extern void buffer_init(void);
 extern void pidhash_init(void);
 extern void pidmap_init(void);
 extern void prio_tree_init(void);
index 752bd7d383af34ecf7f72b3472d4fa9ccbd9a519..82fb182f6f618421cb822aac8131914486b8a651 100644 (file)
@@ -16,6 +16,9 @@ obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
index 368c4f03fe0e9fd0b7426b650d83a847badb05d7..126ca43d5d2ba9eb566af4380668c9546266efcd 100644 (file)
@@ -521,6 +521,7 @@ static void do_acct_process(struct file *file)
 
 /**
  * acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
  */
 void acct_init_pacct(struct pacct_struct *pacct)
 {
@@ -576,7 +577,7 @@ void acct_collect(long exitcode, int group_dead)
  *
  * handles process accounting for an exiting task
  */
-void acct_process()
+void acct_process(void)
 {
        struct file *file = NULL;
 
index 7dfac7031bd734f6c117e66b660b436b842e41be..82443fb433efcb9d550126be82dabf80c0308c23 100644 (file)
@@ -818,7 +818,7 @@ err:
  */
 unsigned int audit_serial(void)
 {
-       static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+       static DEFINE_SPINLOCK(serial_lock);
        static unsigned int serial = 0;
 
        unsigned long flags;
index 9ebd96fda2958835a7d0d0aa6dc1993ee11f1098..dc5e3f01efe747c9d1816895ae1a1e40fb527489 100644 (file)
@@ -658,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
        return;
 
 error_path:
-       if (ctx)
-               kfree(ctx);
+       kfree(ctx);
        audit_panic("error in audit_log_task_context");
        return;
 }
@@ -1367,7 +1366,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
  * @mqdes: MQ descriptor
  * @msg_len: Message length
  * @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_abs_timeout: Message timeout in absolute time
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
@@ -1409,8 +1408,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
  * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
  * @mqdes: MQ descriptor
  * @msg_len: Message length
- * @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_msg_prio: Message priority
+ * @u_abs_timeout: Message timeout in absolute time
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
@@ -1558,7 +1557,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
  * @uid: msgq user id
  * @gid: msgq group id
  * @mode: msgq mode (permissions)
- * @ipcp: in-kernel IPC permissions
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
index 03dcd981846a6b8020a07c058312c7277621be49..70fbf2e83766abb527d84bc6c4ddc30646036609 100644 (file)
@@ -18,7 +18,7 @@
 /* This protects CPUs going up and down... */
 static DEFINE_MUTEX(cpucontrol);
 
-static BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
 static struct task_struct *lock_cpu_hotplug_owner;
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
 #endif /* CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
-int register_cpu_notifier(struct notifier_block *nb)
+int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 {
        return blocking_notifier_chain_register(&cpu_chain, nb);
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
-#ifdef CONFIG_HOTPLUG_CPU
 static inline void check_for_tasks(int cpu)
 {
        struct task_struct *p;
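A hedged sketch (not part of the patch) of how the register_cpu_notifier() interface touched above is used by a caller; the symbol names are illustrative, and the CPU_ONLINE/CPU_DEAD action codes are assumed from <linux/notifier.h> of this era.

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

static int __cpuinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		printk(KERN_INFO "example: cpu %ld is now online\n", cpu);
		break;
	case CPU_DEAD:
		printk(KERN_INFO "example: cpu %ld has gone offline\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata example_cpu_nb = {
	.notifier_call	= example_cpu_callback,
};

static int __init example_init(void)
{
	/* wired up via module_init() or an initcall in a real driver */
	register_cpu_notifier(&example_cpu_nb);
	return 0;
}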
index 304ef637be6c700e7b5d73172222a0e05e5f96d8..ab06b9f88f64677ca036dea0285f740ccb54e1e9 100644 (file)
@@ -925,10 +925,19 @@ fastcall NORET_TYPE void do_exit(long code)
        mpol_free(tsk->mempolicy);
        tsk->mempolicy = NULL;
 #endif
+       /*
+        * This must happen late, after the PID is not
+        * hashed anymore:
+        */
+       if (unlikely(!list_empty(&tsk->pi_state_list)))
+               exit_pi_state_list(tsk);
+       if (unlikely(current->pi_state_cache))
+               kfree(current->pi_state_cache);
        /*
         * If DEBUG_MUTEXES is on, make sure we are holding no locks:
         */
        mutex_debug_check_no_locks_held(tsk);
+       rt_mutex_debug_check_no_locks_held(tsk);
 
        if (tsk->io_context)
                exit_io_context();
index 9b4e54ef0225e21f92366302ae9430933b330dbd..628198a4f28a722b25b37c26ecbd401c67919a8d 100644 (file)
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
 void free_task(struct task_struct *tsk)
 {
        free_thread_info(tsk->thread_info);
+       rt_mutex_debug_task_free(tsk);
        free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
        return current->pid;
 }
 
+static inline void rt_mutex_init_task(struct task_struct *p)
+{
+#ifdef CONFIG_RT_MUTEXES
+       spin_lock_init(&p->pi_lock);
+       plist_head_init(&p->pi_waiters, &p->pi_lock);
+       p->pi_blocked_on = NULL;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+       spin_lock_init(&p->held_list_lock);
+       INIT_LIST_HEAD(&p->held_list_head);
+# endif
+#endif
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags,
        mpol_fix_fork_child_flag(p);
 #endif
 
+       rt_mutex_init_task(p);
+
 #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
 #endif
@@ -1076,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_COMPAT
        p->compat_robust_list = NULL;
 #endif
+       INIT_LIST_HEAD(&p->pi_state_list);
+       p->pi_state_cache = NULL;
+
        /*
         * sigaltstack should be cleared when sharing the same VM
         */
index e1a380c77a5a2ac947f4fceee190fa7518488f41..6c91f938005db0719bac62a643fff9b411b7f594 100644 (file)
  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  *
+ *  PI-futex support started by Ingo Molnar and Thomas Gleixner
+ *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
 #include <linux/signal.h>
 #include <asm/futex.h>
 
+#include "rtmutex_common.h"
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -63,7 +69,7 @@ union futex_key {
                int offset;
        } shared;
        struct {
-               unsigned long uaddr;
+               unsigned long address;
                struct mm_struct *mm;
                int offset;
        } private;
@@ -74,6 +80,27 @@ union futex_key {
        } both;
 };
 
+/*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+       /*
+        * list of 'owned' pi_state instances - these have to be
+        * cleaned up in do_exit() if the task exits prematurely:
+        */
+       struct list_head list;
+
+       /*
+        * The PI object:
+        */
+       struct rt_mutex pi_mutex;
+
+       struct task_struct *owner;
+       atomic_t refcount;
+
+       union futex_key key;
+};
+
 /*
  * We use this hashed waitqueue instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
@@ -87,15 +114,19 @@ struct futex_q {
        struct list_head list;
        wait_queue_head_t waiters;
 
-       /* Which hash list lock to use. */
+       /* Which hash list lock to use: */
        spinlock_t *lock_ptr;
 
-       /* Key which the futex is hashed on. */
+       /* Key which the futex is hashed on: */
        union futex_key key;
 
-       /* For fd, sigio sent using these. */
+       /* For fd, sigio sent using these: */
        int fd;
        struct file *filp;
+
+       /* Optional priority inheritance state: */
+       struct futex_pi_state *pi_state;
+       struct task_struct *task;
 };
 
 /*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
  *
  * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
  */
-static int get_futex_key(unsigned long uaddr, union futex_key *key)
+static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 {
+       unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
        /*
         * The futex address must be "naturally" aligned.
         */
-       key->both.offset = uaddr % PAGE_SIZE;
+       key->both.offset = address % PAGE_SIZE;
        if (unlikely((key->both.offset % sizeof(u32)) != 0))
                return -EINVAL;
-       uaddr -= key->both.offset;
+       address -= key->both.offset;
 
        /*
         * The futex is hashed differently depending on whether
         * it's in a shared or private mapping.  So check vma first.
         */
-       vma = find_extend_vma(mm, uaddr);
+       vma = find_extend_vma(mm, address);
        if (unlikely(!vma))
                return -EFAULT;
 
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
         */
        if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
                key->private.mm = mm;
-               key->private.uaddr = uaddr;
+               key->private.address = address;
                return 0;
        }
 
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
        key->shared.inode = vma->vm_file->f_dentry->d_inode;
        key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
        if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-               key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+               key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
                                     + vma->vm_pgoff);
                return 0;
        }
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
         * from swap.  But that's a lot of code to duplicate here
         * for a rare case, so we simply fetch the page.
         */
-       err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+       err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
        if (err >= 0) {
                key->shared.pgoff =
                        page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,17 +278,243 @@ static void drop_key_refs(union futex_key *key)
        }
 }
 
-static inline int get_futex_value_locked(int *dest, int __user *from)
+static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
        int ret;
 
        inc_preempt_count();
-       ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+       ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
        dec_preempt_count();
 
        return ret ? -EFAULT : 0;
 }
 
+/*
+ * Fault handling. Called with current->mm->mmap_sem held.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+       struct vm_area_struct * vma;
+       struct mm_struct *mm = current->mm;
+
+       if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+           vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+               return -EFAULT;
+
+       switch (handle_mm_fault(mm, vma, address, 1)) {
+       case VM_FAULT_MINOR:
+               current->min_flt++;
+               break;
+       case VM_FAULT_MAJOR:
+               current->maj_flt++;
+               break;
+       default:
+               return -EFAULT;
+       }
+       return 0;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+       struct futex_pi_state *pi_state;
+
+       if (likely(current->pi_state_cache))
+               return 0;
+
+       pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+
+       if (!pi_state)
+               return -ENOMEM;
+
+       memset(pi_state, 0, sizeof(*pi_state));
+       INIT_LIST_HEAD(&pi_state->list);
+       /* pi_mutex gets initialized later */
+       pi_state->owner = NULL;
+       atomic_set(&pi_state->refcount, 1);
+
+       current->pi_state_cache = pi_state;
+
+       return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+       struct futex_pi_state *pi_state = current->pi_state_cache;
+
+       WARN_ON(!pi_state);
+       current->pi_state_cache = NULL;
+
+       return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+       if (!atomic_dec_and_test(&pi_state->refcount))
+               return;
+
+       /*
+        * If pi_state->owner is NULL, the owner is most probably dying
+        * and has cleaned up the pi_state already
+        */
+       if (pi_state->owner) {
+               spin_lock_irq(&pi_state->owner->pi_lock);
+               list_del_init(&pi_state->list);
+               spin_unlock_irq(&pi_state->owner->pi_lock);
+
+               rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+       }
+
+       if (current->pi_state_cache)
+               kfree(pi_state);
+       else {
+               /*
+                * pi_state->list is already empty.
+                * clear pi_state->owner.
+                * refcount is at 0 - put it back to 1.
+                */
+               pi_state->owner = NULL;
+               atomic_set(&pi_state->refcount, 1);
+               current->pi_state_cache = pi_state;
+       }
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We don't trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+       struct task_struct *p;
+
+       read_lock(&tasklist_lock);
+       p = find_task_by_pid(pid);
+       if (!p)
+               goto out_unlock;
+       if ((current->euid != p->euid) && (current->euid != p->uid)) {
+               p = NULL;
+               goto out_unlock;
+       }
+       if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+               p = NULL;
+               goto out_unlock;
+       }
+       get_task_struct(p);
+out_unlock:
+       read_unlock(&tasklist_lock);
+
+       return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+       struct futex_hash_bucket *hb;
+       struct list_head *next, *head = &curr->pi_state_list;
+       struct futex_pi_state *pi_state;
+       union futex_key key;
+
+       /*
+        * We are a ZOMBIE and nobody can enqueue itself on
+        * pi_state_list anymore, but we have to be careful
+        * about waiters unqueueing themselves
+        */
+       spin_lock_irq(&curr->pi_lock);
+       while (!list_empty(head)) {
+
+               next = head->next;
+               pi_state = list_entry(next, struct futex_pi_state, list);
+               key = pi_state->key;
+               spin_unlock_irq(&curr->pi_lock);
+
+               hb = hash_futex(&key);
+               spin_lock(&hb->lock);
+
+               spin_lock_irq(&curr->pi_lock);
+               if (head->next != next) {
+                       spin_unlock(&hb->lock);
+                       continue;
+               }
+
+               list_del_init(&pi_state->list);
+
+               WARN_ON(pi_state->owner != curr);
+
+               pi_state->owner = NULL;
+               spin_unlock_irq(&curr->pi_lock);
+
+               rt_mutex_unlock(&pi_state->pi_mutex);
+
+               spin_unlock(&hb->lock);
+
+               spin_lock_irq(&curr->pi_lock);
+       }
+       spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+{
+       struct futex_pi_state *pi_state = NULL;
+       struct futex_q *this, *next;
+       struct list_head *head;
+       struct task_struct *p;
+       pid_t pid;
+
+       head = &hb->chain;
+
+       list_for_each_entry_safe(this, next, head, list) {
+               if (match_futex (&this->key, &me->key)) {
+                       /*
+                        * Another waiter already exists - bump up
+                        * the refcount and return its pi_state:
+                        */
+                       pi_state = this->pi_state;
+                       atomic_inc(&pi_state->refcount);
+                       me->pi_state = pi_state;
+
+                       return 0;
+               }
+       }
+
+       /*
+        * We are the first waiter - try to look up the real owner and
+        * attach the new pi_state to it:
+        */
+       pid = uval & FUTEX_TID_MASK;
+       p = futex_find_get_task(pid);
+       if (!p)
+               return -ESRCH;
+
+       pi_state = alloc_pi_state();
+
+       /*
+        * Initialize the pi_mutex in locked state and make 'p'
+        * the owner of it:
+        */
+       rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+       /* Store the key for possible exit cleanups: */
+       pi_state->key = me->key;
+
+       spin_lock_irq(&p->pi_lock);
+       list_add(&pi_state->list, &p->pi_state_list);
+       pi_state->owner = p;
+       spin_unlock_irq(&p->pi_lock);
+
+       put_task_struct(p);
+
+       me->pi_state = pi_state;
+
+       return 0;
+}
+
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q)
        q->lock_ptr = NULL;
 }
 
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+       struct task_struct *new_owner;
+       struct futex_pi_state *pi_state = this->pi_state;
+       u32 curval, newval;
+
+       if (!pi_state)
+               return -EINVAL;
+
+       new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+       /*
+        * This happens when we have stolen the lock and the original
+        * pending owner did not enqueue itself back on the rt_mutex.
+        * That's not a tragedy. That way we know that a lock waiter
+        * is in flight. We make the futex_q waiter the pending owner.
+        */
+       if (!new_owner)
+               new_owner = this->task;
+
+       /*
+        * We pass it to the next owner. (The WAITERS bit is always
+        * kept enabled while there is PI state around. We must also
+        * preserve the owner died bit.)
+        */
+       newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+
+       inc_preempt_count();
+       curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+       dec_preempt_count();
+
+       if (curval == -EFAULT)
+               return -EFAULT;
+       if (curval != uval)
+               return -EINVAL;
+
+       list_del_init(&pi_state->owner->pi_state_list);
+       list_add(&pi_state->list, &new_owner->pi_state_list);
+       pi_state->owner = new_owner;
+       rt_mutex_unlock(&pi_state->pi_mutex);
+
+       return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+       u32 oldval;
+
+       /*
+        * There is no waiter, so we unlock the futex. The owner died
+        * bit does not have to be preserved here. We are the owner:
+        */
+       inc_preempt_count();
+       oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+       dec_preempt_count();
+
+       if (oldval == -EFAULT)
+               return oldval;
+       if (oldval != uval)
+               return -EAGAIN;
+
+       return 0;
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(unsigned long uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, int nr_wake)
 {
-       union futex_key key;
-       struct futex_hash_bucket *bh;
-       struct list_head *head;
+       struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
+       struct list_head *head;
+       union futex_key key;
        int ret;
 
        down_read(&current->mm->mmap_sem);
@@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
        if (unlikely(ret != 0))
                goto out;
 
-       bh = hash_futex(&key);
-       spin_lock(&bh->lock);
-       head = &bh->chain;
+       hb = hash_futex(&key);
+       spin_lock(&hb->lock);
+       head = &hb->chain;
 
        list_for_each_entry_safe(this, next, head, list) {
                if (match_futex (&this->key, &key)) {
+                       if (this->pi_state)
+                               return -EINVAL;
                        wake_futex(this);
                        if (++ret >= nr_wake)
                                break;
                }
        }
 
-       spin_unlock(&bh->lock);
+       spin_unlock(&hb->lock);
 out:
        up_read(&current->mm->mmap_sem);
        return ret;
@@ -324,10 +648,12 @@ out:
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+static int
+futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+             int nr_wake, int nr_wake2, int op)
 {
        union futex_key key1, key2;
-       struct futex_hash_bucket *bh1, *bh2;
+       struct futex_hash_bucket *hb1, *hb2;
        struct list_head *head;
        struct futex_q *this, *next;
        int ret, op_ret, attempt = 0;
@@ -342,27 +668,29 @@ retryfull:
        if (unlikely(ret != 0))
                goto out;
 
-       bh1 = hash_futex(&key1);
-       bh2 = hash_futex(&key2);
+       hb1 = hash_futex(&key1);
+       hb2 = hash_futex(&key2);
 
 retry:
-       if (bh1 < bh2)
-               spin_lock(&bh1->lock);
-       spin_lock(&bh2->lock);
-       if (bh1 > bh2)
-               spin_lock(&bh1->lock);
+       if (hb1 < hb2)
+               spin_lock(&hb1->lock);
+       spin_lock(&hb2->lock);
+       if (hb1 > hb2)
+               spin_lock(&hb1->lock);
 
-       op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+       op_ret = futex_atomic_op_inuser(op, uaddr2);
        if (unlikely(op_ret < 0)) {
-               int dummy;
+               u32 dummy;
 
-               spin_unlock(&bh1->lock);
-               if (bh1 != bh2)
-                       spin_unlock(&bh2->lock);
+               spin_unlock(&hb1->lock);
+               if (hb1 != hb2)
+                       spin_unlock(&hb2->lock);
 
 #ifndef CONFIG_MMU
-               /* we don't get EFAULT from MMU faults if we don't have an MMU,
-                * but we might get them from range checking */
+               /*
+                * we don't get EFAULT from MMU faults if we don't have an MMU,
+                * but we might get them from range checking
+                */
                ret = op_ret;
                goto out;
 #endif
@@ -372,47 +700,34 @@ retry:
                        goto out;
                }
 
-               /* futex_atomic_op_inuser needs to both read and write
+               /*
+                * futex_atomic_op_inuser needs to both read and write
                 * *(int __user *)uaddr2, but we can't modify it
                 * non-atomically.  Therefore, if get_user below is not
                 * enough, we need to handle the fault ourselves, while
-                * still holding the mmap_sem.  */
+                * still holding the mmap_sem.
+                */
                if (attempt++) {
-                       struct vm_area_struct * vma;
-                       struct mm_struct *mm = current->mm;
-
-                       ret = -EFAULT;
-                       if (attempt >= 2 ||
-                           !(vma = find_vma(mm, uaddr2)) ||
-                           vma->vm_start > uaddr2 ||
-                           !(vma->vm_flags & VM_WRITE))
-                               goto out;
-
-                       switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
-                       case VM_FAULT_MINOR:
-                               current->min_flt++;
-                               break;
-                       case VM_FAULT_MAJOR:
-                               current->maj_flt++;
-                               break;
-                       default:
+                       if (futex_handle_fault((unsigned long)uaddr2,
+                                              attempt))
                                goto out;
-                       }
                        goto retry;
                }
 
-               /* If we would have faulted, release mmap_sem,
-                * fault it in and start all over again.  */
+               /*
+                * If we would have faulted, release mmap_sem,
+                * fault it in and start all over again.
+                */
                up_read(&current->mm->mmap_sem);
 
-               ret = get_user(dummy, (int __user *)uaddr2);
+               ret = get_user(dummy, uaddr2);
                if (ret)
                        return ret;
 
                goto retryfull;
        }
 
-       head = &bh1->chain;
+       head = &hb1->chain;
 
        list_for_each_entry_safe(this, next, head, list) {
                if (match_futex (&this->key, &key1)) {
@@ -423,7 +738,7 @@ retry:
        }
 
        if (op_ret > 0) {
-               head = &bh2->chain;
+               head = &hb2->chain;
 
                op_ret = 0;
                list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +751,9 @@ retry:
                ret += op_ret;
        }
 
-       spin_unlock(&bh1->lock);
-       if (bh1 != bh2)
-               spin_unlock(&bh2->lock);
+       spin_unlock(&hb1->lock);
+       if (hb1 != hb2)
+               spin_unlock(&hb2->lock);
 out:
        up_read(&current->mm->mmap_sem);
        return ret;
@@ -448,11 +763,11 @@ out:
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-                        int nr_wake, int nr_requeue, int *valp)
+static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+                        int nr_wake, int nr_requeue, u32 *cmpval)
 {
        union futex_key key1, key2;
-       struct futex_hash_bucket *bh1, *bh2;
+       struct futex_hash_bucket *hb1, *hb2;
        struct list_head *head1;
        struct futex_q *this, *next;
        int ret, drop_count = 0;
@@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
        if (unlikely(ret != 0))
                goto out;
 
-       bh1 = hash_futex(&key1);
-       bh2 = hash_futex(&key2);
+       hb1 = hash_futex(&key1);
+       hb2 = hash_futex(&key2);
 
-       if (bh1 < bh2)
-               spin_lock(&bh1->lock);
-       spin_lock(&bh2->lock);
-       if (bh1 > bh2)
-               spin_lock(&bh1->lock);
+       if (hb1 < hb2)
+               spin_lock(&hb1->lock);
+       spin_lock(&hb2->lock);
+       if (hb1 > hb2)
+               spin_lock(&hb1->lock);
 
-       if (likely(valp != NULL)) {
-               int curval;
+       if (likely(cmpval != NULL)) {
+               u32 curval;
 
-               ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+               ret = get_futex_value_locked(&curval, uaddr1);
 
                if (unlikely(ret)) {
-                       spin_unlock(&bh1->lock);
-                       if (bh1 != bh2)
-                               spin_unlock(&bh2->lock);
+                       spin_unlock(&hb1->lock);
+                       if (hb1 != hb2)
+                               spin_unlock(&hb2->lock);
 
-                       /* If we would have faulted, release mmap_sem, fault
+                       /*
+                        * If we would have faulted, release mmap_sem, fault
                         * it in and start all over again.
                         */
                        up_read(&current->mm->mmap_sem);
 
-                       ret = get_user(curval, (int __user *)uaddr1);
+                       ret = get_user(curval, uaddr1);
 
                        if (!ret)
                                goto retry;
 
                        return ret;
                }
-               if (curval != *valp) {
+               if (curval != *cmpval) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
        }
 
-       head1 = &bh1->chain;
+       head1 = &hb1->chain;
        list_for_each_entry_safe(this, next, head1, list) {
                if (!match_futex (&this->key, &key1))
                        continue;
                if (++ret <= nr_wake) {
                        wake_futex(this);
                } else {
-                       list_move_tail(&this->list, &bh2->chain);
-                       this->lock_ptr = &bh2->lock;
+                       /*
+                        * If key1 and key2 hash to the same bucket, no need to
+                        * requeue.
+                        */
+                       if (likely(head1 != &hb2->chain)) {
+                               list_move_tail(&this->list, &hb2->chain);
+                               this->lock_ptr = &hb2->lock;
+                       }
                        this->key = key2;
                        get_key_refs(&key2);
                        drop_count++;
 
                        if (ret - nr_wake >= nr_requeue)
                                break;
-                       /* Make sure to stop if key1 == key2 */
-                       if (head1 == &bh2->chain && head1 != &next->list)
-                               head1 = &this->list;
                }
        }
 
 out_unlock:
-       spin_unlock(&bh1->lock);
-       if (bh1 != bh2)
-               spin_unlock(&bh2->lock);
+       spin_unlock(&hb1->lock);
+       if (hb1 != hb2)
+               spin_unlock(&hb2->lock);
 
        /* drop_key_refs() must be called outside the spinlocks. */
        while (--drop_count >= 0)
@@ -543,7 +862,7 @@ out:
 static inline struct futex_hash_bucket *
 queue_lock(struct futex_q *q, int fd, struct file *filp)
 {
-       struct futex_hash_bucket *bh;
+       struct futex_hash_bucket *hb;
 
        q->fd = fd;
        q->filp = filp;
@@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
        init_waitqueue_head(&q->waiters);
 
        get_key_refs(&q->key);
-       bh = hash_futex(&q->key);
-       q->lock_ptr = &bh->lock;
+       hb = hash_futex(&q->key);
+       q->lock_ptr = &hb->lock;
 
-       spin_lock(&bh->lock);
-       return bh;
+       spin_lock(&hb->lock);
+       return hb;
 }
 
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-       list_add_tail(&q->list, &bh->chain);
-       spin_unlock(&bh->lock);
+       list_add_tail(&q->list, &hb->chain);
+       q->task = current;
+       spin_unlock(&hb->lock);
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-       spin_unlock(&bh->lock);
+       spin_unlock(&hb->lock);
        drop_key_refs(&q->key);
 }
 
@@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
 /* The key must be already stored in q->key. */
 static void queue_me(struct futex_q *q, int fd, struct file *filp)
 {
-       struct futex_hash_bucket *bh;
-       bh = queue_lock(q, fd, filp);
-       __queue_me(q, bh);
+       struct futex_hash_bucket *hb;
+
+       hb = queue_lock(q, fd, filp);
+       __queue_me(q, hb);
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
-       int ret = 0;
        spinlock_t *lock_ptr;
+       int ret = 0;
 
        /* In the common case we don't take the spinlock, which is nice. */
  retry:
@@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q)
                }
                WARN_ON(list_empty(&q->list));
                list_del(&q->list);
+
+               BUG_ON(q->pi_state);
+
                spin_unlock(lock_ptr);
                ret = 1;
        }
@@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q)
        return ret;
 }
 
-static int futex_wait(unsigned long uaddr, int val, unsigned long time)
+/*
+ * PI futexes cannot be requeued and must remove themselves from the
+ * hash bucket. The hash bucket lock is held on entry and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-       DECLARE_WAITQUEUE(wait, current);
-       int ret, curval;
+       WARN_ON(list_empty(&q->list));
+       list_del(&q->list);
+
+       BUG_ON(!q->pi_state);
+       free_pi_state(q->pi_state);
+       q->pi_state = NULL;
+
+       spin_unlock(&hb->lock);
+
+       drop_key_refs(&q->key);
+}
+
+static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+{
+       struct task_struct *curr = current;
+       DECLARE_WAITQUEUE(wait, curr);
+       struct futex_hash_bucket *hb;
        struct futex_q q;
-       struct futex_hash_bucket *bh;
+       u32 uval;
+       int ret;
 
+       q.pi_state = NULL;
  retry:
-       down_read(&current->mm->mmap_sem);
+       down_read(&curr->mm->mmap_sem);
 
        ret = get_futex_key(uaddr, &q.key);
        if (unlikely(ret != 0))
                goto out_release_sem;
 
-       bh = queue_lock(&q, -1, NULL);
+       hb = queue_lock(&q, -1, NULL);
 
        /*
         * Access the page AFTER the futex is queued.
@@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
         * We hold the mmap semaphore, so the mapping cannot have changed
         * since we looked it up in get_futex_key.
         */
-
-       ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+       ret = get_futex_value_locked(&uval, uaddr);
 
        if (unlikely(ret)) {
-               queue_unlock(&q, bh);
+               queue_unlock(&q, hb);
 
-               /* If we would have faulted, release mmap_sem, fault it in and
+               /*
+                * If we would have faulted, release mmap_sem, fault it in and
                 * start all over again.
                 */
-               up_read(&current->mm->mmap_sem);
+               up_read(&curr->mm->mmap_sem);
 
-               ret = get_user(curval, (int __user *)uaddr);
+               ret = get_user(uval, uaddr);
 
                if (!ret)
                        goto retry;
                return ret;
        }
-       if (curval != val) {
-               ret = -EWOULDBLOCK;
-               queue_unlock(&q, bh);
-               goto out_release_sem;
-       }
+       ret = -EWOULDBLOCK;
+       if (uval != val)
+               goto out_unlock_release_sem;
 
        /* Only actually queue if *uaddr contained val.  */
-       __queue_me(&q, bh);
+       __queue_me(&q, hb);
 
        /*
         * Now the futex is queued and we have checked the data, we
         * don't want to hold mmap_sem while we sleep.
-        */     
-       up_read(&current->mm->mmap_sem);
+        */
+       up_read(&curr->mm->mmap_sem);
 
        /*
         * There might have been scheduling since the queue_me(), as we
@@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
                return 0;
        if (time == 0)
                return -ETIMEDOUT;
-       /* We expect signal_pending(current), but another thread may
-        * have handled it for us already. */
+       /*
+        * We expect signal_pending(current), but another thread may
+        * have handled it for us already.
+        */
        return -EINTR;
 
+ out_unlock_release_sem:
+       queue_unlock(&q, hb);
+
  out_release_sem:
+       up_read(&curr->mm->mmap_sem);
+       return ret;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
+                           struct hrtimer_sleeper *to)
+{
+       struct task_struct *curr = current;
+       struct futex_hash_bucket *hb;
+       u32 uval, newval, curval;
+       struct futex_q q;
+       int ret, attempt = 0;
+
+       if (refill_pi_state_cache())
+               return -ENOMEM;
+
+       q.pi_state = NULL;
+ retry:
+       down_read(&curr->mm->mmap_sem);
+
+       ret = get_futex_key(uaddr, &q.key);
+       if (unlikely(ret != 0))
+               goto out_release_sem;
+
+       hb = queue_lock(&q, -1, NULL);
+
+ retry_locked:
+       /*
+        * To avoid races, we attempt to take the lock here again
+        * (by doing a 0 -> TID atomic cmpxchg), while holding all
+        * the locks. It will most likely not succeed.
+        */
+       newval = current->pid;
+
+       inc_preempt_count();
+       curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+       dec_preempt_count();
+
+       if (unlikely(curval == -EFAULT))
+               goto uaddr_faulted;
+
+       /* We own the lock already */
+       if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+               if (!detect && 0)
+                       force_sig(SIGKILL, current);
+               ret = -EDEADLK;
+               goto out_unlock_release_sem;
+       }
+
+       /*
+        * Surprise - we got the lock. Just return
+        * to userspace:
+        */
+       if (unlikely(!curval))
+               goto out_unlock_release_sem;
+
+       uval = curval;
+       newval = uval | FUTEX_WAITERS;
+
+       inc_preempt_count();
+       curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+       dec_preempt_count();
+
+       if (unlikely(curval == -EFAULT))
+               goto uaddr_faulted;
+       if (unlikely(curval != uval))
+               goto retry_locked;
+
+       /*
+        * We don't have the lock. Look up the PI state (or create it if
+        * we are the first waiter):
+        */
+       ret = lookup_pi_state(uval, hb, &q);
+
+       if (unlikely(ret)) {
+               /*
+                * There were no waiters and the owner task lookup
+                * failed. When the OWNER_DIED bit is set, then we
+                * know that this is a robust futex and we actually
+                * take the lock. This is safe as we are protected by
+                * the hash bucket lock. We also set the waiters bit
+                * unconditionally here, to simplify glibc handling of
+                * multiple tasks racing to acquire the lock and
+                * cleanup the problems which were left by the dead
+                * clean up the problems which were left by the dead
+                */
+               if (curval & FUTEX_OWNER_DIED) {
+                       uval = newval;
+                       newval = current->pid |
+                               FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+                       inc_preempt_count();
+                       curval = futex_atomic_cmpxchg_inatomic(uaddr,
+                                                              uval, newval);
+                       dec_preempt_count();
+
+                       if (unlikely(curval == -EFAULT))
+                               goto uaddr_faulted;
+                       if (unlikely(curval != uval))
+                               goto retry_locked;
+                       ret = 0;
+               }
+               goto out_unlock_release_sem;
+       }
+
+       /*
+        * Only actually queue now that the atomic ops are done:
+        */
+       __queue_me(&q, hb);
+
+       /*
+        * Now the futex is queued and we have checked the data, we
+        * don't want to hold mmap_sem while we sleep.
+        */
+       up_read(&curr->mm->mmap_sem);
+
+       WARN_ON(!q.pi_state);
+       /*
+        * Block on the PI mutex:
+        */
+       if (!trylock)
+               ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+       else {
+               ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+               /* Fixup the trylock return value: */
+               ret = ret ? 0 : -EWOULDBLOCK;
+       }
+
+       down_read(&curr->mm->mmap_sem);
+       hb = queue_lock(&q, -1, NULL);
+
+       /*
+        * Got the lock. We might not be the anticipated owner if we
+        * did a lock-steal - fix up the PI-state in that case.
+        */
+       if (!ret && q.pi_state->owner != curr) {
+               u32 newtid = current->pid | FUTEX_WAITERS;
+
+               /* Owner died? */
+               if (q.pi_state->owner != NULL) {
+                       spin_lock_irq(&q.pi_state->owner->pi_lock);
+                       list_del_init(&q.pi_state->list);
+                       spin_unlock_irq(&q.pi_state->owner->pi_lock);
+               } else
+                       newtid |= FUTEX_OWNER_DIED;
+
+               q.pi_state->owner = current;
+
+               spin_lock_irq(&current->pi_lock);
+               list_add(&q.pi_state->list, &current->pi_state_list);
+               spin_unlock_irq(&current->pi_lock);
+
+               /* Unqueue and drop the lock */
+               unqueue_me_pi(&q, hb);
+               up_read(&curr->mm->mmap_sem);
+               /*
+                * We own it, so we have to replace the pending owner
+                * TID. This must be atomic, as we have to preserve the
+                * owner died bit here.
+                */
+               ret = get_user(uval, uaddr);
+               while (!ret) {
+                       newval = (uval & FUTEX_OWNER_DIED) | newtid;
+                       curval = futex_atomic_cmpxchg_inatomic(uaddr,
+                                                              uval, newval);
+                       if (curval == -EFAULT)
+                               ret = -EFAULT;
+                       if (curval == uval)
+                               break;
+                       uval = curval;
+               }
+       } else {
+               /*
+                * Catch the rare case, where the lock was released
+                * when we were on the way back before we locked
+                * the hash bucket.
+                */
+               if (ret && q.pi_state->owner == curr) {
+                       if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+                               ret = 0;
+               }
+               /* Unqueue and drop the lock */
+               unqueue_me_pi(&q, hb);
+               up_read(&curr->mm->mmap_sem);
+       }
+
+       if (!detect && ret == -EDEADLK && 0)
+               force_sig(SIGKILL, current);
+
+       return ret;
+
+ out_unlock_release_sem:
+       queue_unlock(&q, hb);
+
+ out_release_sem:
+       up_read(&curr->mm->mmap_sem);
+       return ret;
+
+ uaddr_faulted:
+       /*
+        * We have to r/w  *(int __user *)uaddr, but we can't modify it
+        * non-atomically.  Therefore, if get_user below is not
+        * enough, we need to handle the fault ourselves, while
+        * still holding the mmap_sem.
+        */
+       if (attempt++) {
+               if (futex_handle_fault((unsigned long)uaddr, attempt))
+                       goto out_unlock_release_sem;
+
+               goto retry_locked;
+       }
+
+       queue_unlock(&q, hb);
+       up_read(&curr->mm->mmap_sem);
+
+       ret = get_user(uval, uaddr);
+       if (!ret && (uval != -EFAULT))
+               goto retry;
+
+       return ret;
+}
+
+/*
+ * Restart handler
+ */
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+       struct hrtimer_sleeper timeout, *to = NULL;
+       int ret;
+
+       restart->fn = do_no_restart_syscall;
+
+       if (restart->arg2 || restart->arg3) {
+               to = &timeout;
+               hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+               hrtimer_init_sleeper(to, current);
+               to->timer.expires.tv64 = ((u64)restart->arg3 << 32) |
+                       (u64) restart->arg2;
+       }
+
+       pr_debug("lock_pi restart: %p, %d\n",
+                (u32 __user *)restart->arg0, current->pid);
+
+       ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
+                              0, to);
+
+       if (ret != -EINTR)
+               return ret;
+
+       restart->fn = futex_lock_pi_restart;
+
+       /* The other values are filled in */
+       return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Called from the syscall entry below.
+ */
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+                        long nsec, int trylock)
+{
+       struct hrtimer_sleeper timeout, *to = NULL;
+       struct restart_block *restart;
+       int ret;
+
+       if (sec != MAX_SCHEDULE_TIMEOUT) {
+               to = &timeout;
+               hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+               hrtimer_init_sleeper(to, current);
+               to->timer.expires = ktime_set(sec, nsec);
+       }
+
+       ret = do_futex_lock_pi(uaddr, detect, trylock, to);
+
+       if (ret != -EINTR)
+               return ret;
+
+       pr_debug("lock_pi interrupted: %p, %d\n", uaddr, current->pid);
+
+       restart = &current_thread_info()->restart_block;
+       restart->fn = futex_lock_pi_restart;
+       restart->arg0 = (unsigned long) uaddr;
+       restart->arg1 = detect;
+       if (to) {
+               restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
+               restart->arg3 = to->timer.expires.tv64 >> 32;
+       } else
+               restart->arg2 = restart->arg3 = 0;
+
+       return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr)
+{
+       struct futex_hash_bucket *hb;
+       struct futex_q *this, *next;
+       u32 uval;
+       struct list_head *head;
+       union futex_key key;
+       int ret, attempt = 0;
+
+retry:
+       if (get_user(uval, uaddr))
+               return -EFAULT;
+       /*
+        * We release only a lock we actually own:
+        */
+       if ((uval & FUTEX_TID_MASK) != current->pid)
+               return -EPERM;
+       /*
+        * First take all the futex related locks:
+        */
+       down_read(&current->mm->mmap_sem);
+
+       ret = get_futex_key(uaddr, &key);
+       if (unlikely(ret != 0))
+               goto out;
+
+       hb = hash_futex(&key);
+       spin_lock(&hb->lock);
+
+retry_locked:
+       /*
+        * To avoid races, try to do the TID -> 0 atomic transition
+        * again. If it succeeds then we can return without waking
+        * anyone else up:
+        */
+       inc_preempt_count();
+       uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+       dec_preempt_count();
+
+       if (unlikely(uval == -EFAULT))
+               goto pi_faulted;
+       /*
+        * Rare case: we managed to release the lock atomically,
+        * no need to wake anyone else up:
+        */
+       if (unlikely(uval == current->pid))
+               goto out_unlock;
+
+       /*
+        * Ok, other tasks may need to be woken up - check waiters
+        * and do the wakeup if necessary:
+        */
+       head = &hb->chain;
+
+       list_for_each_entry_safe(this, next, head, list) {
+               if (!match_futex (&this->key, &key))
+                       continue;
+               ret = wake_futex_pi(uaddr, uval, this);
+               /*
+                * The atomic access to the futex value
+                * generated a pagefault, so retry the
+                * user-access and the wakeup:
+                */
+               if (ret == -EFAULT)
+                       goto pi_faulted;
+               goto out_unlock;
+       }
+       /*
+        * No waiters - kernel unlocks the futex:
+        */
+       ret = unlock_futex_pi(uaddr, uval);
+       if (ret == -EFAULT)
+               goto pi_faulted;
+
+out_unlock:
+       spin_unlock(&hb->lock);
+out:
        up_read(&current->mm->mmap_sem);
+
+       return ret;
+
+pi_faulted:
+       /*
+        * We have to r/w  *(int __user *)uaddr, but we can't modify it
+        * non-atomically.  Therefore, if get_user below is not
+        * enough, we need to handle the fault ourselves, while
+        * still holding the mmap_sem.
+        */
+       if (attempt++) {
+               if (futex_handle_fault((unsigned long)uaddr, attempt))
+                       goto out_unlock;
+
+               goto retry_locked;
+       }
+
+       spin_unlock(&hb->lock);
+       up_read(&current->mm->mmap_sem);
+
+       ret = get_user(uval, uaddr);
+       if (!ret && (uval != -EFAULT))
+               goto retry;
+
        return ret;
 }
 
@@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp)
 
        unqueue_me(q);
        kfree(q);
+
        return 0;
 }
 
@@ -766,7 +1519,7 @@ static struct file_operations futex_fops = {
  * Signal allows caller to avoid the race which would occur if they
  * set the sigio stuff up afterwards.
  */
-static int futex_fd(unsigned long uaddr, int signal)
+static int futex_fd(u32 __user *uaddr, int signal)
 {
        struct futex_q *q;
        struct file *filp;
@@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal)
                err = -ENOMEM;
                goto error;
        }
+       q->pi_state = NULL;
 
        down_read(&current->mm->mmap_sem);
        err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1594,7 @@ error:
  * Implementation: user-space maintains a per-thread list of locks it
  * is holding. Upon do_exit(), the kernel carefully walks this list,
  * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
  * always manipulated with the lock held, so the list is private and
  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
  * field, to allow the kernel to clean up if the thread dies after
@@ -915,7 +1669,7 @@ err_unlock:
  */
 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-       u32 uval;
+       u32 uval, nval;
 
 retry:
        if (get_user(uval, uaddr))
@@ -932,12 +1686,16 @@ retry:
                 * thread-death.) The rest of the cleanup is done in
                 * userspace.
                 */
-               if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
-                                        uval | FUTEX_OWNER_DIED) != uval)
+               nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
+                                                    uval | FUTEX_OWNER_DIED);
+               if (nval == -EFAULT)
+                       return -1;
+
+               if (nval != uval)
                        goto retry;
 
                if (uval & FUTEX_WAITERS)
-                       futex_wake((unsigned long)uaddr, 1);
+                       futex_wake(uaddr, 1);
        }
        return 0;
 }
@@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr)
        while (entry != &head->list) {
                /*
                 * A pending lock might already be on the list, so
-                * dont process it twice:
+                * don't process it twice:
                 */
                if (entry != pending)
                        if (handle_futex_death((void *)entry + futex_offset,
@@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr)
        }
 }
 
-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-               unsigned long uaddr2, int val2, int val3)
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+               u32 __user *uaddr2, u32 val2, u32 val3)
 {
        int ret;
 
@@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
        case FUTEX_WAKE_OP:
                ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
                break;
+       case FUTEX_LOCK_PI:
+               ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+               break;
+       case FUTEX_UNLOCK_PI:
+               ret = futex_unlock_pi(uaddr);
+               break;
+       case FUTEX_TRYLOCK_PI:
+               ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+               break;
        default:
                ret = -ENOSYS;
        }
@@ -1031,29 +1798,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 }
 
 
-asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
                          struct timespec __user *utime, u32 __user *uaddr2,
-                         int val3)
+                         u32 val3)
 {
        struct timespec t;
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
-       int val2 = 0;
+       u32 val2 = 0;
 
-       if (utime && (op == FUTEX_WAIT)) {
+       if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
                if (copy_from_user(&t, utime, sizeof(t)) != 0)
                        return -EFAULT;
                if (!timespec_valid(&t))
                        return -EINVAL;
-               timeout = timespec_to_jiffies(&t) + 1;
+               if (op == FUTEX_WAIT)
+                       timeout = timespec_to_jiffies(&t) + 1;
+               else {
+                       timeout = t.tv_sec;
+                       val2 = t.tv_nsec;
+               }
        }
        /*
         * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
         */
-       if (op >= FUTEX_REQUEUE)
-               val2 = (int) (unsigned long) utime;
+       if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+               val2 = (u32) (unsigned long) utime;
 
-       return do_futex((unsigned long)uaddr, op, val, timeout,
-                       (unsigned long)uaddr2, val2, val3);
+       return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
 
 static int futexfs_get_sb(struct file_system_type *fs_type,
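To make the FUTEX_LOCK_PI/FUTEX_UNLOCK_PI slowpaths above concrete, here is a hedged userspace sketch of the locking protocol they back. The opcode values (6 and 7), the raw syscall wrapper and the use of gettid() are assumptions about the ABI this series introduces; the point is that the uncontended 0 -> TID and TID -> 0 transitions stay entirely in userspace, and the kernel is entered only on contention.

#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#define FUTEX_LOCK_PI	6	/* assumed opcode values */
#define FUTEX_UNLOCK_PI	7

static long futex(uint32_t *uaddr, int op, uint32_t val,
		  const struct timespec *timeout)
{
	return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

/* Fast path: 0 -> TID with one cmpxchg; on contention the kernel does the PI work. */
static void pi_lock(uint32_t *lock)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	if (!__sync_bool_compare_and_swap(lock, 0, tid))
		futex(lock, FUTEX_LOCK_PI, 0, NULL);
}

/* Fast path: TID -> 0; if waiters are queued, let the kernel hand the lock on. */
static void pi_unlock(uint32_t *lock)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	if (!__sync_bool_compare_and_swap(lock, tid, 0))
		futex(lock, FUTEX_UNLOCK_PI, 0, NULL);
}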
index 1ab6a0ea3d14776e9a84d3b8af71ffd418da5498..d1d92b441fb7d7a327f229def2db99d4359063ee 100644 (file)
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
        unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
        int val2 = 0;
 
-       if (utime && (op == FUTEX_WAIT)) {
+       if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
                if (get_compat_timespec(&t, utime))
                        return -EFAULT;
                if (!timespec_valid(&t))
                        return -EINVAL;
-               timeout = timespec_to_jiffies(&t) + 1;
+               if (op == FUTEX_WAIT)
+                       timeout = timespec_to_jiffies(&t) + 1;
+               else {
+                       timeout = t.tv_sec;
+                       val2 = t.tv_nsec;
+               }
        }
-       if (op >= FUTEX_REQUEUE)
+       if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
                val2 = (int) (unsigned long) utime;
 
-       return do_futex((unsigned long)uaddr, op, val, timeout,
-                       (unsigned long)uaddr2, val2, val3);
+       return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
index 55601b3ce60e92717fa6c6d771bc1a67dea7a03e..8d3dc29ef41ae1fb616d4d8765cb1d077c862852 100644 (file)
@@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int hrtimer_cpu_notify(struct notifier_block *self,
+static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
                                        unsigned long action, void *hcpu)
 {
        long cpu = (long)hcpu;
@@ -857,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block hrtimers_nb = {
+static struct notifier_block __devinitdata hrtimers_nb = {
        .notifier_call = hrtimer_cpu_notify,
 };
 
index 036b6285b15ccb195848a64e03fa741fd20e8d89..e38e4bac97cac6dde3cacd552f189ef77882bb62 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/poison.h>
 #include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/interrupt.h>
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock,
 
 void debug_mutex_init_waiter(struct mutex_waiter *waiter)
 {
-       memset(waiter, 0x11, sizeof(*waiter));
+       memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
        waiter->magic = waiter;
        INIT_LIST_HEAD(&waiter->list);
 }
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
 void debug_mutex_free_waiter(struct mutex_waiter *waiter)
 {
        DEBUG_WARN_ON(!list_empty(&waiter->list));
-       memset(waiter, 0x22, sizeof(*waiter));
+       memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
 }
 
 void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
index fc311a4673a25e451e94c1eef38c00bc425acf5a..857b4fa091244758b5fc0976cf44d58f914f20ec 100644 (file)
@@ -38,13 +38,22 @@ config PM_DEBUG
 
 config PM_TRACE
        bool "Suspend/resume event tracing"
-       depends on PM && PM_DEBUG && X86_32
-       default y
+       depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
+       default n
        ---help---
        This enables some cheesy code to save the last PM event point in the
        RTC across reboots, so that you can debug a machine that just hangs
        during suspend (or more commonly, during resume).
 
+       To use this debugging feature, you should attempt to suspend the machine,
+       then reboot it, then run
+
+               dmesg -s 1000000 | grep 'hash matches'
+
+       CAUTION: this option will cause your machine's real-time clock to be
+       set to an invalid time after a resume.
+
+
 config SOFTWARE_SUSPEND
        bool "Software Suspend"
        depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
index 68afe121e5071f0574e37e7b9e20f1d66bd4c290..5a730fdb1a2cecf6b10c2112ba777fbb5fb7c794 100644 (file)
@@ -299,7 +299,7 @@ out:
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int profile_cpu_callback(struct notifier_block *info,
+static int __devinit profile_cpu_callback(struct notifier_block *info,
                                        unsigned long action, void *__cpu)
 {
        int node, cpu = (unsigned long)__cpu;
index 20e9710fc21c5fa5bc3d6c88342f8ba1dc4f5f0b..f464f5ae3f11a8edfe12d7ff83b609013526cfe6 100644 (file)
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
        return rcu_ctrlblk.completed;
 }
 
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed_bh(void)
+{
+       return rcu_bh_ctrlblk.completed;
+}
+
 static void rcu_barrier_callback(struct rcu_head *notused)
 {
        if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
        tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
 }
 
-static int rcu_cpu_notify(struct notifier_block *self,
+static int __devinit rcu_cpu_notify(struct notifier_block *self,
                                unsigned long action, void *hcpu)
 {
        long cpu = (long)hcpu;
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block rcu_nb = {
+static struct notifier_block __devinitdata rcu_nb = {
        .notifier_call  = rcu_cpu_notify,
 };
 
@@ -619,6 +628,7 @@ module_param(qlowmark, int, 0);
 module_param(rsinterval, int, 0);
 #endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 EXPORT_SYMBOL_GPL(call_rcu);
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 EXPORT_SYMBOL_GPL(synchronize_rcu);
index 8154e7589d1284a7f96b1aa3b587ab2ffc01c299..4d1c3d2471278ebe93e00d7291eb253e83f7d89a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Read-Copy Update /proc-based torture test facility
+ * Read-Copy Update module-based torture test facility
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval;     /* Interval between stats, in seconds. */
 static int verbose;            /* Print more debug info. */
 static int test_no_idle_hz;    /* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
+static char *torture_type = "rcu"; /* What to torture. */
 
 module_param(nreaders, int, 0);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
 module_param(shuffle_interval, int, 0);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-#define TORTURE_FLAG "rcutorture: "
+module_param(torture_type, charp, 0);
+MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
+
+#define TORTURE_FLAG "-torture:"
 #define PRINTK_STRING(s) \
-       do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+       do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
 #define VERBOSE_PRINTK_STRING(s) \
-       do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+       do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
 #define VERBOSE_PRINTK_ERRSTRING(s) \
-       do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
+       do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
 
 static char printk_buf[4096];
 
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
        spin_unlock_bh(&rcu_torture_lock);
 }
 
-static void
-rcu_torture_cb(struct rcu_head *p)
-{
-       int i;
-       struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
-
-       if (fullstop) {
-               /* Test is ending, just drop callbacks on the floor. */
-               /* The next initialization will pick up the pieces. */
-               return;
-       }
-       i = rp->rtort_pipe_count;
-       if (i > RCU_TORTURE_PIPE_LEN)
-               i = RCU_TORTURE_PIPE_LEN;
-       atomic_inc(&rcu_torture_wcount[i]);
-       if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
-               rp->rtort_mbtest = 0;
-               rcu_torture_free(rp);
-       } else
-               call_rcu(p, rcu_torture_cb);
-}
-
 struct rcu_random_state {
        unsigned long rrs_state;
        unsigned long rrs_count;
@@ -190,6 +172,119 @@ rcu_random(struct rcu_random_state *rrsp)
        return swahw32(rrsp->rrs_state);
 }
 
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_torture_ops {
+       void (*init)(void);
+       void (*cleanup)(void);
+       int (*readlock)(void);
+       void (*readunlock)(int idx);
+       int (*completed)(void);
+       void (*deferredfree)(struct rcu_torture *p);
+       int (*stats)(char *page);
+       char *name;
+};
+static struct rcu_torture_ops *cur_ops = NULL;
+
+/*
+ * Definitions for rcu torture testing.
+ */
+
+static int rcu_torture_read_lock(void)
+{
+       rcu_read_lock();
+       return 0;
+}
+
+static void rcu_torture_read_unlock(int idx)
+{
+       rcu_read_unlock();
+}
+
+static int rcu_torture_completed(void)
+{
+       return rcu_batches_completed();
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+       int i;
+       struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+       if (fullstop) {
+               /* Test is ending, just drop callbacks on the floor. */
+               /* The next initialization will pick up the pieces. */
+               return;
+       }
+       i = rp->rtort_pipe_count;
+       if (i > RCU_TORTURE_PIPE_LEN)
+               i = RCU_TORTURE_PIPE_LEN;
+       atomic_inc(&rcu_torture_wcount[i]);
+       if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+               rp->rtort_mbtest = 0;
+               rcu_torture_free(rp);
+       } else
+               cur_ops->deferredfree(rp);
+}
+
+static void rcu_torture_deferred_free(struct rcu_torture *p)
+{
+       call_rcu(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_ops = {
+       .init = NULL,
+       .cleanup = NULL,
+       .readlock = rcu_torture_read_lock,
+       .readunlock = rcu_torture_read_unlock,
+       .completed = rcu_torture_completed,
+       .deferredfree = rcu_torture_deferred_free,
+       .stats = NULL,
+       .name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh torture testing.
+ */
+
+static int rcu_bh_torture_read_lock(void)
+{
+       rcu_read_lock_bh();
+       return 0;
+}
+
+static void rcu_bh_torture_read_unlock(int idx)
+{
+       rcu_read_unlock_bh();
+}
+
+static int rcu_bh_torture_completed(void)
+{
+       return rcu_batches_completed_bh();
+}
+
+static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
+{
+       call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
+}
+
+static struct rcu_torture_ops rcu_bh_ops = {
+       .init = NULL,
+       .cleanup = NULL,
+       .readlock = rcu_bh_torture_read_lock,
+       .readunlock = rcu_bh_torture_read_unlock,
+       .completed = rcu_bh_torture_completed,
+       .deferredfree = rcu_bh_torture_deferred_free,
+       .stats = NULL,
+       .name = "rcu_bh"
+};
+
+static struct rcu_torture_ops *torture_ops[] =
+       { &rcu_ops, &rcu_bh_ops, NULL };
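
With this ops vector, adding another flavour of torture test only means supplying these callbacks and listing the new entry in torture_ops[]; the module is then loaded with the matching torture_type string (e.g. torture_type=rcu_bh for the entry above). A purely hypothetical third entry might look like this (names invented for illustration, not part of the patch):

/* Hypothetical example only: wiring up a third test type. */
static int foo_torture_read_lock(void)
{
	/* enter this flavour's read-side critical section */
	return 0;
}

static void foo_torture_read_unlock(int idx)
{
	/* leave the read-side critical section */
}

static struct rcu_torture_ops foo_ops = {
	.init = NULL,
	.cleanup = NULL,
	.readlock = foo_torture_read_lock,
	.readunlock = foo_torture_read_unlock,
	.completed = rcu_torture_completed,        /* reuse the rcu counters */
	.deferredfree = rcu_torture_deferred_free,
	.stats = NULL,
	.name = "foo"
};

/* ... and &foo_ops would be added to torture_ops[] ahead of the NULL. */
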
+
 /*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
  * for that pointed to by rcu_torture_current, freeing the old structure
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
 
        do {
                schedule_timeout_uninterruptible(1);
-               if (rcu_batches_completed() == oldbatch)
-                       continue;
                if ((rp = rcu_torture_alloc()) == NULL)
                        continue;
                rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
                                i = RCU_TORTURE_PIPE_LEN;
                        atomic_inc(&rcu_torture_wcount[i]);
                        old_rp->rtort_pipe_count++;
-                       call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
+                       cur_ops->deferredfree(old_rp);
                }
                rcu_torture_current_version++;
-               oldbatch = rcu_batches_completed();
+               oldbatch = cur_ops->completed();
        } while (!kthread_should_stop() && !fullstop);
        VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
        while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
 rcu_torture_reader(void *arg)
 {
        int completed;
+       int idx;
        DEFINE_RCU_RANDOM(rand);
        struct rcu_torture *p;
        int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
        set_user_nice(current, 19);
 
        do {
-               rcu_read_lock();
-               completed = rcu_batches_completed();
+               idx = cur_ops->readlock();
+               completed = cur_ops->completed();
                p = rcu_dereference(rcu_torture_current);
                if (p == NULL) {
                        /* Wait for rcu_torture_writer to get underway */
-                       rcu_read_unlock();
+                       cur_ops->readunlock(idx);
                        schedule_timeout_interruptible(HZ);
                        continue;
                }
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
                        pipe_count = RCU_TORTURE_PIPE_LEN;
                }
                ++__get_cpu_var(rcu_torture_count)[pipe_count];
-               completed = rcu_batches_completed() - completed;
+               completed = cur_ops->completed() - completed;
                if (completed > RCU_TORTURE_PIPE_LEN) {
                        /* Should not happen, but... */
                        completed = RCU_TORTURE_PIPE_LEN;
                }
                ++__get_cpu_var(rcu_torture_batch)[completed];
                preempt_enable();
-               rcu_read_unlock();
+               cur_ops->readunlock(idx);
                schedule();
        } while (!kthread_should_stop() && !fullstop);
        VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
                if (pipesummary[i] != 0)
                        break;
        }
-       cnt += sprintf(&page[cnt], "rcutorture: ");
+       cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
        cnt += sprintf(&page[cnt],
                       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
                       "rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
                       atomic_read(&n_rcu_torture_mberror));
        if (atomic_read(&n_rcu_torture_mberror) != 0)
                cnt += sprintf(&page[cnt], " !!!");
-       cnt += sprintf(&page[cnt], "\nrcutorture: ");
+       cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
        if (i > 1) {
                cnt += sprintf(&page[cnt], "!!! ");
                atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
        cnt += sprintf(&page[cnt], "Reader Pipe: ");
        for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
                cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
-       cnt += sprintf(&page[cnt], "\nrcutorture: ");
+       cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
        cnt += sprintf(&page[cnt], "Reader Batch: ");
-       for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+       for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
                cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
-       cnt += sprintf(&page[cnt], "\nrcutorture: ");
+       cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
        cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
        for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
                cnt += sprintf(&page[cnt], " %d",
                               atomic_read(&rcu_torture_wcount[i]));
        }
        cnt += sprintf(&page[cnt], "\n");
+       if (cur_ops->stats != NULL)
+               cnt += cur_ops->stats(&page[cnt]);
        return cnt;
 }
 
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
 static inline void
 rcu_torture_print_module_parms(char *tag)
 {
-       printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
+       printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
                "stat_interval=%d verbose=%d test_no_idle_hz=%d "
                "shuffle_interval = %d\n",
-               tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
-               shuffle_interval);
+               torture_type, tag, nrealreaders, stat_interval, verbose,
+               test_no_idle_hz, shuffle_interval);
 }
 
 static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
        rcu_barrier();
 
        rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
+
+       if (cur_ops->cleanup != NULL)
+               cur_ops->cleanup();
        if (atomic_read(&n_rcu_torture_error))
                rcu_torture_print_module_parms("End of test: FAILURE");
        else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
 
        /* Process args and tell the world that the torturer is on the job. */
 
+       for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
+               cur_ops = torture_ops[i];
+               if (strcmp(torture_type, cur_ops->name) == 0) {
+                       break;
+               }
+       }
+       if (cur_ops == NULL) {
+               printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
+                      torture_type);
+               return (-EINVAL);
+       }
+       if (cur_ops->init != NULL)
+               cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
        if (nreaders >= 0)
                nrealreaders = nreaders;
        else
index e3080fcc66a3b1237e30326782eed2a8fd184323..2404f9b0bc4772411adf8a8e69076642e6ded487 100644 (file)
@@ -232,6 +232,44 @@ int release_resource(struct resource *old)
 
 EXPORT_SYMBOL(release_resource);
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Finds the lowest memory resource that exists within [res->start, res->end).
+ * The caller must specify res->start, res->end and res->flags.
+ * If found, returns 0 and res is overwritten; if not found, returns -1.
+ */
+int find_next_system_ram(struct resource *res)
+{
+       resource_size_t start, end;
+       struct resource *p;
+
+       BUG_ON(!res);
+
+       start = res->start;
+       end = res->end;
+
+       read_lock(&resource_lock);
+       for (p = iomem_resource.child; p ; p = p->sibling) {
+               /* system ram is just marked as IORESOURCE_MEM */
+               if (p->flags != res->flags)
+                       continue;
+               if (p->start > end) {
+                       p = NULL;
+                       break;
+               }
+               if (p->start >= start)
+                       break;
+       }
+       read_unlock(&resource_lock);
+       if (!p)
+               return -1;
+       /* copy data */
+       res->start = p->start;
+       res->end = p->end;
+       return 0;
+}
+#endif
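
A hedged sketch of how a memory-hotplug caller would be expected to walk System RAM with this helper (illustrative only; the hotplug call sites are not part of this hunk):

/* Iterate all System RAM ranges between 'first' and 'last'. */
static void walk_system_ram(resource_size_t first, resource_size_t last)
{
	struct resource res;

	res.start = first;
	res.end = last;
	res.flags = IORESOURCE_MEM;	/* System RAM is marked this way above */

	while (res.start <= last && find_next_system_ram(&res) == 0) {
		/* [res.start, res.end] is one System RAM range */
		res.start = res.end + 1;	/* continue after this range */
		res.end = last;			/* restore the search limit */
	}
}
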
+
 /*
  * Find empty slot in the resource tree given range and alignment.
  */
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644 (file)
index 0000000..4aa8a2c
--- /dev/null
@@ -0,0 +1,513 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ *  Copyright (C) 2004  LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ *  Copyright (C) 2006  Esben Nielsen
+ *  Copyright (C) 2006  Kihon Technologies Inc.,
+ *                     Steven Rostedt <rostedt@goodmis.org>
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+# define TRACE_WARN_ON(x)                      WARN_ON(x)
+# define TRACE_BUG_ON(x)                       BUG_ON(x)
+
+# define TRACE_OFF()                                           \
+do {                                                           \
+       if (rt_trace_on) {                                      \
+               rt_trace_on = 0;                                \
+               console_verbose();                              \
+               if (spin_is_locked(&current->pi_lock))          \
+                       spin_unlock(&current->pi_lock);         \
+               if (spin_is_locked(&current->held_list_lock))   \
+                       spin_unlock(&current->held_list_lock);  \
+       }                                                       \
+} while (0)
+
+# define TRACE_OFF_NOLOCK()                                    \
+do {                                                           \
+       if (rt_trace_on) {                                      \
+               rt_trace_on = 0;                                \
+               console_verbose();                              \
+       }                                                       \
+} while (0)
+
+# define TRACE_BUG_LOCKED()                    \
+do {                                           \
+       TRACE_OFF();                            \
+       BUG();                                  \
+} while (0)
+
+# define TRACE_WARN_ON_LOCKED(c)               \
+do {                                           \
+       if (unlikely(c)) {                      \
+               TRACE_OFF();                    \
+               WARN_ON(1);                     \
+       }                                       \
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c)                        \
+do {                                           \
+       if (unlikely(c))                        \
+               TRACE_BUG_LOCKED();             \
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c)    TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c)    do { } while (0)
+#endif
+
+/*
+ * Deadlock detection flag. We turn it off when we detect
+ * the first problem because we don't want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+int rt_trace_on = 1;
+
+void deadlock_trace_off(void)
+{
+       rt_trace_on = 0;
+}
+
+static void printk_task(task_t *p)
+{
+       if (p)
+               printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+       else
+               printk("<none>");
+}
+
+static void printk_task_short(task_t *p)
+{
+       if (p)
+               printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
+       else
+               printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+       if (lock->name)
+               printk(" [%p] {%s}\n",
+                       lock, lock->name);
+       else
+               printk(" [%p] {%s:%d}\n",
+                       lock, lock->file, lock->line);
+
+       if (print_owner && rt_mutex_owner(lock)) {
+               printk(".. ->owner: %p\n", lock->owner);
+               printk(".. held by:  ");
+               printk_task(rt_mutex_owner(lock));
+               printk("\n");
+       }
+       if (rt_mutex_owner(lock)) {
+               printk("... acquired at:               ");
+               print_symbol("%s\n", lock->acquire_ip);
+       }
+}
+
+static void printk_waiter(struct rt_mutex_waiter *w)
+{
+       printk("-------------------------\n");
+       printk("| waiter struct %p:\n", w);
+       printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+              w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
+              w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
+              w->list_entry.prio);
+       printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
+              w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
+              w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
+              w->pi_list_entry.prio);
+       printk("\n| lock:\n");
+       printk_lock(w->lock, 1);
+       printk("| w->ti->task:\n");
+       printk_task(w->task);
+       printk("| blocked at:  ");
+       print_symbol("%s\n", w->ip);
+       printk("-------------------------\n");
+}
+
+static void show_task_locks(task_t *p)
+{
+       switch (p->state) {
+       case TASK_RUNNING:              printk("R"); break;
+       case TASK_INTERRUPTIBLE:        printk("S"); break;
+       case TASK_UNINTERRUPTIBLE:      printk("D"); break;
+       case TASK_STOPPED:              printk("T"); break;
+       case EXIT_ZOMBIE:               printk("Z"); break;
+       case EXIT_DEAD:                 printk("X"); break;
+       default:                        printk("?"); break;
+       }
+       printk_task(p);
+       if (p->pi_blocked_on) {
+               struct rt_mutex *lock = p->pi_blocked_on->lock;
+
+               printk(" blocked on:");
+               printk_lock(lock, 1);
+       } else
+               printk(" (not blocked)\n");
+}
+
+void rt_mutex_show_held_locks(task_t *task, int verbose)
+{
+       struct list_head *curr, *cursor = NULL;
+       struct rt_mutex *lock;
+       task_t *t;
+       unsigned long flags;
+       int count = 0;
+
+       if (!rt_trace_on)
+               return;
+
+       if (verbose) {
+               printk("------------------------------\n");
+               printk("| showing all locks held by: |  (");
+               printk_task_short(task);
+               printk("):\n");
+               printk("------------------------------\n");
+       }
+
+next:
+       spin_lock_irqsave(&task->held_list_lock, flags);
+       list_for_each(curr, &task->held_list_head) {
+               if (cursor && curr != cursor)
+                       continue;
+               lock = list_entry(curr, struct rt_mutex, held_list_entry);
+               t = rt_mutex_owner(lock);
+               WARN_ON(t != task);
+               count++;
+               cursor = curr->next;
+               spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+               printk("\n#%03d:            ", count);
+               printk_lock(lock, 0);
+               goto next;
+       }
+       spin_unlock_irqrestore(&task->held_list_lock, flags);
+
+       printk("\n");
+}
+
+void rt_mutex_show_all_locks(void)
+{
+       task_t *g, *p;
+       int count = 10;
+       int unlock = 1;
+
+       printk("\n");
+       printk("----------------------\n");
+       printk("| showing all tasks: |\n");
+       printk("----------------------\n");
+
+       /*
+        * Here we try to get the tasklist_lock as hard as possible;
+        * if not successful after 2 seconds we ignore it (but keep
+        * trying). This is to enable a debug printout even if a
+        * tasklist_lock-holding task deadlocks or crashes.
+        */
+retry:
+       if (!read_trylock(&tasklist_lock)) {
+               if (count == 10)
+                       printk("hm, tasklist_lock locked, retrying... ");
+               if (count) {
+                       count--;
+                       printk(" #%d", 10-count);
+                       mdelay(200);
+                       goto retry;
+               }
+               printk(" ignoring it.\n");
+               unlock = 0;
+       }
+       if (count != 10)
+               printk(" locked it.\n");
+
+       do_each_thread(g, p) {
+               show_task_locks(p);
+               if (!unlock)
+                       if (read_trylock(&tasklist_lock))
+                               unlock = 1;
+       } while_each_thread(g, p);
+
+       printk("\n");
+
+       printk("-----------------------------------------\n");
+       printk("| showing all locks held in the system: |\n");
+       printk("-----------------------------------------\n");
+
+       do_each_thread(g, p) {
+               rt_mutex_show_held_locks(p, 0);
+               if (!unlock)
+                       if (read_trylock(&tasklist_lock))
+                               unlock = 1;
+       } while_each_thread(g, p);
+
+
+       printk("=============================================\n\n");
+
+       if (unlock)
+               read_unlock(&tasklist_lock);
+}
+
+void rt_mutex_debug_check_no_locks_held(task_t *task)
+{
+       struct rt_mutex_waiter *w;
+       struct list_head *curr;
+       struct rt_mutex *lock;
+
+       if (!rt_trace_on)
+               return;
+       if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
+               printk("BUG: PI priority boost leaked!\n");
+               printk_task(task);
+               printk("\n");
+       }
+       if (list_empty(&task->held_list_head))
+               return;
+
+       spin_lock(&task->pi_lock);
+       plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
+               TRACE_OFF();
+
+               printk("hm, PI interest held at exit time? Task:\n");
+               printk_task(task);
+               printk_waiter(w);
+               return;
+       }
+       spin_unlock(&task->pi_lock);
+
+       list_for_each(curr, &task->held_list_head) {
+               lock = list_entry(curr, struct rt_mutex, held_list_entry);
+
+               printk("BUG: %s/%d, lock held at task exit time!\n",
+                      task->comm, task->pid);
+               printk_lock(lock, 1);
+               if (rt_mutex_owner(lock) != task)
+                       printk("exiting task is not even the owner??\n");
+       }
+}
+
+int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
+{
+       const void *to = from + len;
+       struct list_head *curr;
+       struct rt_mutex *lock;
+       unsigned long flags;
+       void *lock_addr;
+
+       if (!rt_trace_on)
+               return 0;
+
+       spin_lock_irqsave(&current->held_list_lock, flags);
+       list_for_each(curr, &current->held_list_head) {
+               lock = list_entry(curr, struct rt_mutex, held_list_entry);
+               lock_addr = lock;
+               if (lock_addr < from || lock_addr >= to)
+                       continue;
+               TRACE_OFF();
+
+               printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
+                       current->comm, current->pid, lock, from, to);
+               dump_stack();
+               printk_lock(lock, 1);
+               if (rt_mutex_owner(lock) != current)
+                       printk("freeing task is not even the owner??\n");
+               return 1;
+       }
+       spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+       return 0;
+}
+
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+       WARN_ON(!plist_head_empty(&task->pi_waiters));
+       WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */
+void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+                            struct rt_mutex *lock)
+{
+       struct task_struct *task;
+
+       if (!rt_trace_on || detect || !act_waiter)
+               return;
+
+       task = rt_mutex_owner(act_waiter->lock);
+       if (task && task != current) {
+               act_waiter->deadlock_task_pid = task->pid;
+               act_waiter->deadlock_lock = lock;
+       }
+}
+
+void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
+{
+       struct task_struct *task;
+
+       if (!waiter->deadlock_lock || !rt_trace_on)
+               return;
+
+       task = find_task_by_pid(waiter->deadlock_task_pid);
+       if (!task)
+               return;
+
+       TRACE_OFF_NOLOCK();
+
+       printk("\n============================================\n");
+       printk(  "[ BUG: circular locking deadlock detected! ]\n");
+       printk(  "--------------------------------------------\n");
+       printk("%s/%d is deadlocking current task %s/%d\n\n",
+              task->comm, task->pid, current->comm, current->pid);
+
+       printk("\n1) %s/%d is trying to acquire this lock:\n",
+              current->comm, current->pid);
+       printk_lock(waiter->lock, 1);
+
+       printk("... trying at:                 ");
+       print_symbol("%s\n", waiter->ip);
+
+       printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
+       printk_lock(waiter->deadlock_lock, 1);
+
+       rt_mutex_show_held_locks(current, 1);
+       rt_mutex_show_held_locks(task, 1);
+
+       printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
+       show_stack(task, NULL);
+       printk("\n%s/%d's [current] stackdump:\n\n",
+              current->comm, current->pid);
+       dump_stack();
+       rt_mutex_show_all_locks();
+       printk("[ turning off deadlock detection. "
+              "Please report this trace. ]\n\n");
+       local_irq_disable();
+}
+
+void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
+{
+       unsigned long flags;
+
+       if (rt_trace_on) {
+               TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+               spin_lock_irqsave(&current->held_list_lock, flags);
+               list_add_tail(&lock->held_list_entry, &current->held_list_head);
+               spin_unlock_irqrestore(&current->held_list_lock, flags);
+
+               lock->acquire_ip = ip;
+       }
+}
+
+void debug_rt_mutex_unlock(struct rt_mutex *lock)
+{
+       unsigned long flags;
+
+       if (rt_trace_on) {
+               TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+               TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+               spin_lock_irqsave(&current->held_list_lock, flags);
+               list_del_init(&lock->held_list_entry);
+               spin_unlock_irqrestore(&current->held_list_lock, flags);
+       }
+}
+
+void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+                              struct task_struct *powner __IP_DECL__)
+{
+       unsigned long flags;
+
+       if (rt_trace_on) {
+               TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
+
+               spin_lock_irqsave(&powner->held_list_lock, flags);
+               list_add_tail(&lock->held_list_entry, &powner->held_list_head);
+               spin_unlock_irqrestore(&powner->held_list_lock, flags);
+
+               lock->acquire_ip = ip;
+       }
+}
+
+void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
+{
+       unsigned long flags;
+
+       if (rt_trace_on) {
+               struct task_struct *owner = rt_mutex_owner(lock);
+
+               TRACE_WARN_ON_LOCKED(!owner);
+               TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
+
+               spin_lock_irqsave(&owner->held_list_lock, flags);
+               list_del_init(&lock->held_list_entry);
+               spin_unlock_irqrestore(&owner->held_list_lock, flags);
+       }
+}
+
+void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+       memset(waiter, 0x11, sizeof(*waiter));
+       plist_node_init(&waiter->list_entry, MAX_PRIO);
+       plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
+}
+
+void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
+{
+       TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
+       TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+       TRACE_WARN_ON(waiter->task);
+       memset(waiter, 0x22, sizeof(*waiter));
+}
+
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+       void *addr = lock;
+
+       if (rt_trace_on) {
+               rt_mutex_debug_check_no_locks_freed(addr,
+                                                   sizeof(struct rt_mutex));
+               INIT_LIST_HEAD(&lock->held_list_entry);
+               lock->name = name;
+       }
+}
+
+void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
+{
+}
+
+void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+{
+}
+
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644 (file)
index 0000000..7612fbc
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+#define __IP_DECL__            , unsigned long ip
+#define __IP__                 , ip
+#define __RET_IP__             , (unsigned long)__builtin_return_address(0)
+
+extern void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
+extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
+                                     struct task_struct *powner __IP_DECL__);
+extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+                                   struct rt_mutex *lock);
+extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
+# define debug_rt_mutex_reset_waiter(w)                        \
+       do { (w)->deadlock_lock = NULL; } while (0)
+
+static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+                                                int detect)
+{
+       return (waiter != NULL);
+}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644 (file)
index 0000000..e82c2f8
--- /dev/null
@@ -0,0 +1,440 @@
+/*
+ * RT-Mutex-tester: scriptable tester for rt mutexes
+ *
+ * started by Thomas Gleixner:
+ *
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ */
+#include <linux/config.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/sysdev.h>
+#include <linux/timer.h>
+
+#include "rtmutex.h"
+
+#define MAX_RT_TEST_THREADS    8
+#define MAX_RT_TEST_MUTEXES    8
+
+static spinlock_t rttest_lock;
+static atomic_t rttest_event;
+
+struct test_thread_data {
+       int                     opcode;
+       int                     opdata;
+       int                     mutexes[MAX_RT_TEST_MUTEXES];
+       int                     bkl;
+       int                     event;
+       struct sys_device       sysdev;
+};
+
+static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
+static task_t *threads[MAX_RT_TEST_THREADS];
+static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
+
+enum test_opcodes {
+       RTTEST_NOP = 0,
+       RTTEST_SCHEDOT,         /* 1 Sched other, data = nice */
+       RTTEST_SCHEDRT,         /* 2 Sched fifo, data = prio */
+       RTTEST_LOCK,            /* 3 Lock uninterruptible, data = lockindex */
+       RTTEST_LOCKNOWAIT,      /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
+       RTTEST_LOCKINT,         /* 5 Lock interruptible, data = lockindex */
+       RTTEST_LOCKINTNOWAIT,   /* 6 Lock interruptible no wait in wakeup, data = lockindex */
+       RTTEST_LOCKCONT,        /* 7 Continue locking after the wakeup delay */
+       RTTEST_UNLOCK,          /* 8 Unlock, data = lockindex */
+       RTTEST_LOCKBKL,         /* 9 Lock BKL */
+       RTTEST_UNLOCKBKL,       /* 10 Unlock BKL */
+       RTTEST_SIGNAL,          /* 11 Signal other test thread, data = thread id */
+       RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
+       RTTEST_RESET = 99,      /* 99 Reset all pending operations */
+};
+
+static int handle_op(struct test_thread_data *td, int lockwakeup)
+{
+       int i, id, ret = -EINVAL;
+
+       switch(td->opcode) {
+
+       case RTTEST_NOP:
+               return 0;
+
+       case RTTEST_LOCKCONT:
+               td->mutexes[td->opdata] = 1;
+               td->event = atomic_add_return(1, &rttest_event);
+               return 0;
+
+       case RTTEST_RESET:
+               for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
+                       if (td->mutexes[i] == 4) {
+                               rt_mutex_unlock(&mutexes[i]);
+                               td->mutexes[i] = 0;
+                       }
+               }
+
+               if (!lockwakeup && td->bkl == 4) {
+                       unlock_kernel();
+                       td->bkl = 0;
+               }
+               return 0;
+
+       case RTTEST_RESETEVENT:
+               atomic_set(&rttest_event, 0);
+               return 0;
+
+       default:
+               if (lockwakeup)
+                       return ret;
+       }
+
+       switch(td->opcode) {
+
+       case RTTEST_LOCK:
+       case RTTEST_LOCKNOWAIT:
+               id = td->opdata;
+               if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+                       return ret;
+
+               td->mutexes[id] = 1;
+               td->event = atomic_add_return(1, &rttest_event);
+               rt_mutex_lock(&mutexes[id]);
+               td->event = atomic_add_return(1, &rttest_event);
+               td->mutexes[id] = 4;
+               return 0;
+
+       case RTTEST_LOCKINT:
+       case RTTEST_LOCKINTNOWAIT:
+               id = td->opdata;
+               if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
+                       return ret;
+
+               td->mutexes[id] = 1;
+               td->event = atomic_add_return(1, &rttest_event);
+               ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
+               td->event = atomic_add_return(1, &rttest_event);
+               td->mutexes[id] = ret ? 0 : 4;
+               return ret ? -EINTR : 0;
+
+       case RTTEST_UNLOCK:
+               id = td->opdata;
+               if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
+                       return ret;
+
+               td->event = atomic_add_return(1, &rttest_event);
+               rt_mutex_unlock(&mutexes[id]);
+               td->event = atomic_add_return(1, &rttest_event);
+               td->mutexes[id] = 0;
+               return 0;
+
+       case RTTEST_LOCKBKL:
+               if (td->bkl)
+                       return 0;
+               td->bkl = 1;
+               lock_kernel();
+               td->bkl = 4;
+               return 0;
+
+       case RTTEST_UNLOCKBKL:
+               if (td->bkl != 4)
+                       break;
+               unlock_kernel();
+               td->bkl = 0;
+               return 0;
+
+       default:
+               break;
+       }
+       return ret;
+}
+
+/*
+ * Schedule replacement for rtsem_down(). Only called for threads with
+ * PF_MUTEX_TESTER set.
+ *
+ * This allows us to have fine-grained control over the event flow.
+ *
+ */
+void schedule_rt_mutex_test(struct rt_mutex *mutex)
+{
+       int tid, op, dat;
+       struct test_thread_data *td;
+
+       /* We have to lookup the task */
+       for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
+               if (threads[tid] == current)
+                       break;
+       }
+
+       BUG_ON(tid == MAX_RT_TEST_THREADS);
+
+       td = &thread_data[tid];
+
+       op = td->opcode;
+       dat = td->opdata;
+
+       switch (op) {
+       case RTTEST_LOCK:
+       case RTTEST_LOCKINT:
+       case RTTEST_LOCKNOWAIT:
+       case RTTEST_LOCKINTNOWAIT:
+               if (mutex != &mutexes[dat])
+                       break;
+
+               if (td->mutexes[dat] != 1)
+                       break;
+
+               td->mutexes[dat] = 2;
+               td->event = atomic_add_return(1, &rttest_event);
+               break;
+
+       case RTTEST_LOCKBKL:
+       default:
+               break;
+       }
+
+       schedule();
+
+
+       switch (op) {
+       case RTTEST_LOCK:
+       case RTTEST_LOCKINT:
+               if (mutex != &mutexes[dat])
+                       return;
+
+               if (td->mutexes[dat] != 2)
+                       return;
+
+               td->mutexes[dat] = 3;
+               td->event = atomic_add_return(1, &rttest_event);
+               break;
+
+       case RTTEST_LOCKNOWAIT:
+       case RTTEST_LOCKINTNOWAIT:
+               if (mutex != &mutexes[dat])
+                       return;
+
+               if (td->mutexes[dat] != 2)
+                       return;
+
+               td->mutexes[dat] = 1;
+               td->event = atomic_add_return(1, &rttest_event);
+               return;
+
+       case RTTEST_LOCKBKL:
+               return;
+       default:
+               return;
+       }
+
+       td->opcode = 0;
+
+       for (;;) {
+               set_current_state(TASK_INTERRUPTIBLE);
+
+               if (td->opcode > 0) {
+                       int ret;
+
+                       set_current_state(TASK_RUNNING);
+                       ret = handle_op(td, 1);
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       if (td->opcode == RTTEST_LOCKCONT)
+                               break;
+                       td->opcode = ret;
+               }
+
+               /* Wait for the next command to be executed */
+               schedule();
+       }
+
+       /* Restore previous command and data */
+       td->opcode = op;
+       td->opdata = dat;
+}
+
+static int test_func(void *data)
+{
+       struct test_thread_data *td = data;
+       int ret;
+
+       current->flags |= PF_MUTEX_TESTER;
+       allow_signal(SIGHUP);
+
+       for(;;) {
+
+               set_current_state(TASK_INTERRUPTIBLE);
+
+               if (td->opcode > 0) {
+                       set_current_state(TASK_RUNNING);
+                       ret = handle_op(td, 0);
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       td->opcode = ret;
+               }
+
+               /* Wait for the next command to be executed */
+               schedule();
+
+               if (signal_pending(current))
+                       flush_signals(current);
+
+               if(kthread_should_stop())
+                       break;
+       }
+       return 0;
+}
+
+/**
+ * sysfs_test_command - interface for test commands
+ * @dev:       thread reference
+ * @buf:       command for actual step
+ * @count:     length of buffer
+ *
+ * command syntax:
+ *
+ * opcode:data
+ */
+static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
+                                 size_t count)
+{
+       struct sched_param schedpar;
+       struct test_thread_data *td;
+       char cmdbuf[32];
+       int op, dat, tid, ret;
+
+       td = container_of(dev, struct test_thread_data, sysdev);
+       tid = td->sysdev.id;
+
+       /* strings from sysfs write are not 0 terminated! */
+       if (count >= sizeof(cmdbuf))
+               return -EINVAL;
+
+       /* strip off \n: */
+       if (buf[count-1] == '\n')
+               count--;
+       if (count < 1)
+               return -EINVAL;
+
+       memcpy(cmdbuf, buf, count);
+       cmdbuf[count] = 0;
+
+       if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
+               return -EINVAL;
+
+       switch (op) {
+       case RTTEST_SCHEDOT:
+               schedpar.sched_priority = 0;
+               ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
+               if (ret)
+                       return ret;
+               set_user_nice(current, 0);
+               break;
+
+       case RTTEST_SCHEDRT:
+               schedpar.sched_priority = dat;
+               ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
+               if (ret)
+                       return ret;
+               break;
+
+       case RTTEST_SIGNAL:
+               send_sig(SIGHUP, threads[tid], 0);
+               break;
+
+       default:
+               if (td->opcode > 0)
+                       return -EBUSY;
+               td->opdata = dat;
+               td->opcode = op;
+               wake_up_process(threads[tid]);
+       }
+
+       return count;
+}
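
A hypothetical user-space sketch of driving one tester thread through this interface. The path assumes the "rttest" sysdev class registered further down in this file, i.e. /sys/devices/system/rttest/rttest<N>/{command,status}, which is not guaranteed by this hunk alone:

#include <stdio.h>

/* Send "opcode:data" to tester thread 'tid',
 * e.g. rttest_cmd(0, 3, 2) == RTTEST_LOCK on mutex 2. */
static int rttest_cmd(int tid, int opcode, int data)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/rttest/rttest%d/command", tid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d:%d", opcode, data);
	return fclose(f);
}
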
+
+/**
+ * sysfs_test_status - sysfs interface for rt tester
+ * @dev:       thread to query
+ * @buf:       char buffer to be filled with thread status info
+ */
+static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+{
+       struct test_thread_data *td;
+       char *curr = buf;
+       task_t *tsk;
+       int i;
+
+       td = container_of(dev, struct test_thread_data, sysdev);
+       tsk = threads[td->sysdev.id];
+
+       spin_lock(&rttest_lock);
+
+       curr += sprintf(curr,
+               "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+               td->opcode, td->event, tsk->state,
+                       (MAX_RT_PRIO - 1) - tsk->prio,
+                       (MAX_RT_PRIO - 1) - tsk->normal_prio,
+               tsk->pi_blocked_on, td->bkl);
+
+       for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
+               curr += sprintf(curr, "%d", td->mutexes[i]);
+
+       spin_unlock(&rttest_lock);
+
+       curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
+                       mutexes[td->sysdev.id].owner);
+
+       return curr - buf;
+}
+
+static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
+static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
+
+static struct sysdev_class rttest_sysclass = {
+       set_kset_name("rttest"),
+};
+
+static int init_test_thread(int id)
+{
+       thread_data[id].sysdev.cls = &rttest_sysclass;
+       thread_data[id].sysdev.id = id;
+
+       threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
+       if (IS_ERR(threads[id]))
+               return PTR_ERR(threads[id]);
+
+       return sysdev_register(&thread_data[id].sysdev);
+}
+
+static int init_rttest(void)
+{
+       int ret, i;
+
+       spin_lock_init(&rttest_lock);
+
+       for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
+               rt_mutex_init(&mutexes[i]);
+
+       ret = sysdev_class_register(&rttest_sysclass);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
+               ret = init_test_thread(i);
+               if (ret)
+                       break;
+               ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
+               if (ret)
+                       break;
+               ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
+               if (ret)
+                       break;
+       }
+
+       printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
+
+       return ret;
+}
+
+device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644 (file)
index 0000000..45d6101
--- /dev/null
@@ -0,0 +1,990 @@
+/*
+ * RT-Mutexes: simple blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner.
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
+ *  Copyright (C) 2006 Esben Nielsen
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
+/*
+ * lock->owner state tracking:
+ *
+ * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
+ * are used to keep track of the "owner is pending" and "lock has
+ * waiters" state.
+ *
+ * owner       bit1    bit0
+ * NULL                0       0       lock is free (fast acquire possible)
+ * NULL                0       1       invalid state
+ * NULL                1       0       Transitional State*
+ * NULL                1       1       invalid state
+ * taskpointer 0       0       lock is held (fast release possible)
+ * taskpointer 0       1       task is pending owner
+ * taskpointer 1       0       lock is held and has waiters
+ * taskpointer 1       1       task is pending owner and lock has more waiters
+ *
+ * Pending ownership is assigned to the top (highest priority)
+ * waiter of the lock when the lock is released. The thread is woken
+ * up and can now take the lock. Until the lock is taken (bit 0
+ * cleared) a competing higher priority thread can steal the lock
+ * which puts the woken up thread back on the waiters list.
+ *
+ * The fast atomic compare exchange based acquire and release is only
+ * possible when bit 0 and 1 of lock->owner are 0.
+ *
+ * (*) There's a small window where the owner can be NULL and the
+ * "lock has waiters" bit is set.  This can happen when grabbing the lock.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to set this
+ * bit before looking at the lock, hence the reason this is a transitional
+ * state.
+ */
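
In other words, the owner task_struct pointer and the two state bits share one word. A reader recovers them roughly as sketched below (illustrative only; RT_MUTEX_OWNER_PENDING is assumed to be the bit-0 mask from rtmutex_common.h, which is not shown in this hunk):

/* Sketch: decode the packed lock->owner word. */
static inline struct task_struct *sketch_owner_task(struct rt_mutex *lock)
{
	return (struct task_struct *)
		((unsigned long)lock->owner &
		 ~(RT_MUTEX_HAS_WAITERS | RT_MUTEX_OWNER_PENDING));	/* assumed masks */
}

static inline int sketch_owner_is_pending(struct rt_mutex *lock)
{
	return ((unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING) != 0;
}
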
+
+static void
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+                  unsigned long mask)
+{
+       unsigned long val = (unsigned long)owner | mask;
+
+       if (rt_mutex_has_waiters(lock))
+               val |= RT_MUTEX_HAS_WAITERS;
+
+       lock->owner = (struct task_struct *)val;
+}
+
+static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
+{
+       lock->owner = (struct task_struct *)
+                       ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
+{
+       if (!rt_mutex_has_waiters(lock))
+               clear_rt_mutex_waiters(lock);
+}
+
+/*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n)       (cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+       unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+       do {
+               owner = *p;
+       } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n)       (0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+       lock->owner = (struct task_struct *)
+                       ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
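
Later in this file the lock and unlock entry points try this cmpxchg fast path first and only enter the wait_lock-protected slow path on contention; schematically (a simplified sketch, not the literal code from this patch):

/* Sketch of the intended fast path: an uncontended acquire never takes
 * lock->wait_lock, it just swings the owner word from NULL to current. */
static inline int rt_mutex_fastpath_trylock_sketch(struct rt_mutex *lock)
{
	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
		rt_mutex_deadlock_account_lock(lock, current);
		return 1;
	}
	return 0;	/* contended: the caller falls back to the slow path */
}
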
+
+/*
+ * Calculate task priority from the waiter list priority
+ *
+ * Return task->normal_prio when the waiter list is empty or when
+ * the waiter is not allowed to do priority boosting
+ */
+int rt_mutex_getprio(struct task_struct *task)
+{
+       if (likely(!task_has_pi_waiters(task)))
+               return task->normal_prio;
+
+       return min(task_top_pi_waiter(task)->pi_list_entry.prio,
+                  task->normal_prio);
+}
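
For example, a nice-0 SCHED_NORMAL owner sits at kernel priority 120; if its top PI waiter is a SCHED_FIFO task at user priority 90 (kernel priority 9), the min() above boosts the owner to 9, lower numeric values meaning higher scheduling priority.
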
+
+/*
+ * Adjust the priority of a task, after its pi_waiters got modified.
+ *
+ * This can be both boosting and unboosting. task->pi_lock must be held.
+ */
+static void __rt_mutex_adjust_prio(struct task_struct *task)
+{
+       int prio = rt_mutex_getprio(task);
+
+       if (task->prio != prio)
+               rt_mutex_setprio(task, prio);
+}
+
+/*
+ * Adjust task priority (undo boosting). Called from the exit path of
+ * rt_mutex_slowunlock() and rt_mutex_slowlock().
+ *
+ * (Note: We do this outside of the protection of lock->wait_lock to
+ * allow the lock to be taken while or before we readjust the priority
+ * of task. We do not use the spin_xx_mutex() variants here as we are
+ * outside of the debug path.)
+ */
+static void rt_mutex_adjust_prio(struct task_struct *task)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&task->pi_lock, flags);
+       __rt_mutex_adjust_prio(task);
+       spin_unlock_irqrestore(&task->pi_lock, flags);
+}
+
+/*
+ * Max number of times we'll walk the boosting chain:
+ */
+int max_lock_depth = 1024;
+
+/*
+ * Adjust the priority chain. Also used for deadlock detection.
+ * Decreases task's usage by one - may thus free the task.
+ * Returns 0 or -EDEADLK.
+ */
+static int rt_mutex_adjust_prio_chain(task_t *task,
+                                     int deadlock_detect,
+                                     struct rt_mutex *orig_lock,
+                                     struct rt_mutex_waiter *orig_waiter,
+                                     struct task_struct *top_task
+                                     __IP_DECL__)
+{
+       struct rt_mutex *lock;
+       struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+       int detect_deadlock, ret = 0, depth = 0;
+       unsigned long flags;
+
+       detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
+                                                        deadlock_detect);
+
+       /*
+        * The (de)boosting is a step-by-step approach with a lot of
+        * pitfalls. We want this to be preemptible and we want to hold
+        * a maximum of two locks per step. So we have to check
+        * carefully whether things change under us.
+        */
+ again:
+       if (++depth > max_lock_depth) {
+               static int prev_max;
+
+               /*
+                * Print this only once. If the admin changes the limit,
+                * print a new message when reaching the limit again.
+                */
+               if (prev_max != max_lock_depth) {
+                       prev_max = max_lock_depth;
+                       printk(KERN_WARNING "Maximum lock depth %d reached "
+                              "task: %s (%d)\n", max_lock_depth,
+                              top_task->comm, top_task->pid);
+               }
+               put_task_struct(task);
+
+               return deadlock_detect ? -EDEADLK : 0;
+       }
+ retry:
+       /*
+        * The task cannot go away - we took a reference on it before!
+        */
+       spin_lock_irqsave(&task->pi_lock, flags);
+
+       waiter = task->pi_blocked_on;
+       /*
+        * Check whether the end of the boosting chain has been
+        * reached or the state of the chain has changed while we
+        * dropped the locks.
+        */
+       if (!waiter || !waiter->task)
+               goto out_unlock_pi;
+
+       if (top_waiter && (!task_has_pi_waiters(task) ||
+                          top_waiter != task_top_pi_waiter(task)))
+               goto out_unlock_pi;
+
+       /*
+        * When deadlock detection is off, we check whether further
+        * priority adjustment is necessary.
+        */
+       if (!detect_deadlock && waiter->list_entry.prio == task->prio)
+               goto out_unlock_pi;
+
+       lock = waiter->lock;
+       if (!spin_trylock(&lock->wait_lock)) {
+               spin_unlock_irqrestore(&task->pi_lock, flags);
+               cpu_relax();
+               goto retry;
+       }
+
+       /* Deadlock detection */
+       if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
+               debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+               spin_unlock(&lock->wait_lock);
+               ret = deadlock_detect ? -EDEADLK : 0;
+               goto out_unlock_pi;
+       }
+
+       top_waiter = rt_mutex_top_waiter(lock);
+
+       /* Requeue the waiter */
+       plist_del(&waiter->list_entry, &lock->wait_list);
+       waiter->list_entry.prio = task->prio;
+       plist_add(&waiter->list_entry, &lock->wait_list);
+
+       /* Release the task */
+       spin_unlock_irqrestore(&task->pi_lock, flags);
+       put_task_struct(task);
+
+       /* Grab the next task */
+       task = rt_mutex_owner(lock);
+       spin_lock_irqsave(&task->pi_lock, flags);
+
+       if (waiter == rt_mutex_top_waiter(lock)) {
+               /* Boost the owner */
+               plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
+               waiter->pi_list_entry.prio = waiter->list_entry.prio;
+               plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+               __rt_mutex_adjust_prio(task);
+
+       } else if (top_waiter == waiter) {
+               /* Deboost the owner */
+               plist_del(&waiter->pi_list_entry, &task->pi_waiters);
+               waiter = rt_mutex_top_waiter(lock);
+               waiter->pi_list_entry.prio = waiter->list_entry.prio;
+               plist_add(&waiter->pi_list_entry, &task->pi_waiters);
+               __rt_mutex_adjust_prio(task);
+       }
+
+       get_task_struct(task);
+       spin_unlock_irqrestore(&task->pi_lock, flags);
+
+       top_waiter = rt_mutex_top_waiter(lock);
+       spin_unlock(&lock->wait_lock);
+
+       if (!detect_deadlock && waiter != top_waiter)
+               goto out_put_task;
+
+       goto again;
+
+ out_unlock_pi:
+       spin_unlock_irqrestore(&task->pi_lock, flags);
+ out_put_task:
+       put_task_struct(task);
+       return ret;
+}
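
As a rough mental model of the walk above (leaving out the per-step locking, requeueing and deadlock detection the real code performs), boosting propagates a waiter's priority down the chain of lock owners. The types and numbers below are illustrative only:

#include <stdio.h>

struct lock;
struct task {
	int prio;                /* effective priority, lower == higher */
	struct lock *blocked_on; /* lock this task waits for, or NULL */
};
struct lock {
	struct task *owner;
};

/* Push the waiter's priority down the chain of owners. */
static void boost_chain(struct task *waiter, int max_depth)
{
	struct task *t = waiter;
	int depth = 0;

	while (t->blocked_on && depth++ < max_depth) {
		struct task *owner = t->blocked_on->owner;

		if (owner->prio <= t->prio)
			break;		/* nothing to propagate */
		owner->prio = t->prio;
		t = owner;
	}
}

int main(void)
{
	struct lock l1, l2;
	struct task low = { .prio = 130 }, mid = { .prio = 120 }, high = { .prio = 90 };

	l1.owner = &low;  mid.blocked_on = &l1;   /* mid waits on low */
	l2.owner = &mid;  high.blocked_on = &l2;  /* high waits on mid */
	low.blocked_on = NULL;

	boost_chain(&high, 1024);
	printf("low=%d mid=%d\n", low.prio, mid.prio);	/* both boosted to 90 */
	return 0;
}

Here the prio-90 waiter boosts both intermediate owners, which is the effect the real chain walk achieves one carefully locked step at a time.
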
+
+/*
+ * Optimization: check if we can steal the lock from the
+ * assigned pending owner [which might not have taken the
+ * lock yet]:
+ */
+static inline int try_to_steal_lock(struct rt_mutex *lock)
+{
+       struct task_struct *pendowner = rt_mutex_owner(lock);
+       struct rt_mutex_waiter *next;
+       unsigned long flags;
+
+       if (!rt_mutex_owner_pending(lock))
+               return 0;
+
+       if (pendowner == current)
+               return 1;
+
+       spin_lock_irqsave(&pendowner->pi_lock, flags);
+       if (current->prio >= pendowner->prio) {
+               spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+               return 0;
+       }
+
+       /*
+        * Check if a waiter is enqueued on the pending owner's
+        * pi_waiters list. Remove it and readjust the pending
+        * owner's priority.
+        */
+       if (likely(!rt_mutex_has_waiters(lock))) {
+               spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+               return 1;
+       }
+
+       /* No chain handling, pending owner is not blocked on anything: */
+       next = rt_mutex_top_waiter(lock);
+       plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
+       __rt_mutex_adjust_prio(pendowner);
+       spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+       /*
+        * We are going to steal the lock and a waiter was
+        * enqueued on the pending owner's pi_waiters queue, so
+        * we have to enqueue this waiter into current's
+        * pi_waiters list. This covers the case where current
+        * is boosted because it holds another lock and gets
+        * unboosted because the booster is interrupted;
+        * otherwise we would delay a waiter with higher
+        * priority than current->normal_prio.
+        *
+        * Note: in the rare case of a SCHED_OTHER task changing
+        * its priority and thus stealing the lock, next->task
+        * might be current:
+        */
+       if (likely(next->task != current)) {
+               spin_lock_irqsave(&current->pi_lock, flags);
+               plist_add(&next->pi_list_entry, &current->pi_waiters);
+               __rt_mutex_adjust_prio(current);
+               spin_unlock_irqrestore(&current->pi_lock, flags);
+       }
+       return 1;
+}
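
The decision above reduces to a strict priority comparison, mirroring the "current->prio >= pendowner->prio" test: equal priority does not steal. A trivial sketch:

#include <stdio.h>

/* Toy steal rule: the pending owner keeps the lock unless the contender
 * has strictly higher priority (lower prio value). */
static int can_steal(int contender_prio, int pending_owner_prio)
{
	return contender_prio < pending_owner_prio;
}

int main(void)
{
	printf("%d\n", can_steal(90, 120));	/* 1: higher-prio task steals */
	printf("%d\n", can_steal(120, 120));	/* 0: equal prio does not steal */
	return 0;
}
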
+
+/*
+ * Try to take an rt-mutex
+ *
+ * This fails
+ * - when the lock has a real owner
+ * - when a different pending owner exists and has higher priority than current
+ *
+ * Must be called with lock->wait_lock held.
+ */
+static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
+{
+       /*
+        * We have to be careful here if the atomic speedups are
+        * enabled: when
+        *  - no other waiter is on the lock, and
+        *  - the lock has been released since we did the cmpxchg,
+        * the lock can be released or taken while we are doing the
+        * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+        *
+        * The atomic acquire/release aware variant of
+        * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
+        * the WAITERS bit, the atomic release / acquire can not
+        * happen anymore and lock->wait_lock protects us from the
+        * non-atomic case.
+        *
+        * Note that this might set lock->owner =
+        * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
+        * any more. This is fixed up when we take ownership.
+        * This is the transitional state explained at the top of this file.
+        */
+       mark_rt_mutex_waiters(lock);
+
+       if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+               return 0;
+
+       /* We got the lock. */
+       debug_rt_mutex_lock(lock __IP__);
+
+       rt_mutex_set_owner(lock, current, 0);
+
+       rt_mutex_deadlock_account_lock(lock, current);
+
+       return 1;
+}
+
+/*
+ * Task blocks on lock.
+ *
+ * Prepare waiter and propagate pi chain
+ *
+ * This must be called with lock->wait_lock held.
+ */
+static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+                                  struct rt_mutex_waiter *waiter,
+                                  int detect_deadlock
+                                  __IP_DECL__)
+{
+       struct rt_mutex_waiter *top_waiter = waiter;
+       task_t *owner = rt_mutex_owner(lock);
+       int boost = 0, res;
+       unsigned long flags;
+
+       spin_lock_irqsave(&current->pi_lock, flags);
+       __rt_mutex_adjust_prio(current);
+       waiter->task = current;
+       waiter->lock = lock;
+       plist_node_init(&waiter->list_entry, current->prio);
+       plist_node_init(&waiter->pi_list_entry, current->prio);
+
+       /* Get the top priority waiter on the lock */
+       if (rt_mutex_has_waiters(lock))
+               top_waiter = rt_mutex_top_waiter(lock);
+       plist_add(&waiter->list_entry, &lock->wait_list);
+
+       current->pi_blocked_on = waiter;
+
+       spin_unlock_irqrestore(&current->pi_lock, flags);
+
+       if (waiter == rt_mutex_top_waiter(lock)) {
+               spin_lock_irqsave(&owner->pi_lock, flags);
+               plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+               plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+
+               __rt_mutex_adjust_prio(owner);
+               if (owner->pi_blocked_on) {
+                       boost = 1;
+                       /* gets dropped in rt_mutex_adjust_prio_chain()! */
+                       get_task_struct(owner);
+               }
+               spin_unlock_irqrestore(&owner->pi_lock, flags);
+       }
+       else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
+               spin_lock_irqsave(&owner->pi_lock, flags);
+               if (owner->pi_blocked_on) {
+                       boost = 1;
+                       /* gets dropped in rt_mutex_adjust_prio_chain()! */
+                       get_task_struct(owner);
+               }
+               spin_unlock_irqrestore(&owner->pi_lock, flags);
+       }
+       if (!boost)
+               return 0;
+
+       spin_unlock(&lock->wait_lock);
+
+       res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
+                                        current __IP__);
+
+       spin_lock(&lock->wait_lock);
+
+       return res;
+}
+
+/*
+ * Wake up the next waiter on the lock.
+ *
+ * Remove the top waiter from the current task's pi_waiters list and from
+ * the lock's wait list. Set it as pending owner, then wake it up.
+ *
+ * Called with lock->wait_lock held.
+ */
+static void wakeup_next_waiter(struct rt_mutex *lock)
+{
+       struct rt_mutex_waiter *waiter;
+       struct task_struct *pendowner;
+       unsigned long flags;
+
+       spin_lock_irqsave(&current->pi_lock, flags);
+
+       waiter = rt_mutex_top_waiter(lock);
+       plist_del(&waiter->list_entry, &lock->wait_list);
+
+       /*
+        * Remove it from current->pi_waiters. We do not adjust a
+        * possible priority boost right now. We execute wakeup in the
+        * boosted mode and go back to normal after releasing
+        * lock->wait_lock.
+        */
+       plist_del(&waiter->pi_list_entry, &current->pi_waiters);
+       pendowner = waiter->task;
+       waiter->task = NULL;
+
+       rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+
+       spin_unlock_irqrestore(&current->pi_lock, flags);
+
+       /*
+        * Clear the pi_blocked_on variable and enqueue a possible
+        * waiter into the pi_waiters list of the pending owner. This
+        * ensures the pending owner does not end up unboosted while a
+        * waiter with higher priority than pending-owner->normal_prio
+        * is still blocked on the (pending) owner.
+        */
+       spin_lock_irqsave(&pendowner->pi_lock, flags);
+
+       WARN_ON(!pendowner->pi_blocked_on);
+       WARN_ON(pendowner->pi_blocked_on != waiter);
+       WARN_ON(pendowner->pi_blocked_on->lock != lock);
+
+       pendowner->pi_blocked_on = NULL;
+
+       if (rt_mutex_has_waiters(lock)) {
+               struct rt_mutex_waiter *next;
+
+               next = rt_mutex_top_waiter(lock);
+               plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
+       }
+       spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+
+       wake_up_process(pendowner);
+}
+
+/*
+ * Remove a waiter from a lock
+ *
+ * Must be called with lock->wait_lock held
+ */
+static void remove_waiter(struct rt_mutex *lock,
+                         struct rt_mutex_waiter *waiter  __IP_DECL__)
+{
+       int first = (waiter == rt_mutex_top_waiter(lock));
+       int boost = 0;
+       task_t *owner = rt_mutex_owner(lock);
+       unsigned long flags;
+
+       spin_lock_irqsave(&current->pi_lock, flags);
+       plist_del(&waiter->list_entry, &lock->wait_list);
+       waiter->task = NULL;
+       current->pi_blocked_on = NULL;
+       spin_unlock_irqrestore(&current->pi_lock, flags);
+
+       if (first && owner != current) {
+
+               spin_lock_irqsave(&owner->pi_lock, flags);
+
+               plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
+
+               if (rt_mutex_has_waiters(lock)) {
+                       struct rt_mutex_waiter *next;
+
+                       next = rt_mutex_top_waiter(lock);
+                       plist_add(&next->pi_list_entry, &owner->pi_waiters);
+               }
+               __rt_mutex_adjust_prio(owner);
+
+               if (owner->pi_blocked_on) {
+                       boost = 1;
+                       /* gets dropped in rt_mutex_adjust_prio_chain()! */
+                       get_task_struct(owner);
+               }
+               spin_unlock_irqrestore(&owner->pi_lock, flags);
+       }
+
+       WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+
+       if (!boost)
+               return;
+
+       spin_unlock(&lock->wait_lock);
+
+       rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);
+
+       spin_lock(&lock->wait_lock);
+}
+
+/*
+ * Recheck the pi chain, in case we got a priority setting
+ *
+ * Called from sched_setscheduler
+ */
+void rt_mutex_adjust_pi(struct task_struct *task)
+{
+       struct rt_mutex_waiter *waiter;
+       unsigned long flags;
+
+       spin_lock_irqsave(&task->pi_lock, flags);
+
+       waiter = task->pi_blocked_on;
+       if (!waiter || waiter->list_entry.prio == task->prio) {
+               spin_unlock_irqrestore(&task->pi_lock, flags);
+               return;
+       }
+
+       /* gets dropped in rt_mutex_adjust_prio_chain()! */
+       get_task_struct(task);
+       spin_unlock_irqrestore(&task->pi_lock, flags);
+
+       rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+                 struct hrtimer_sleeper *timeout,
+                 int detect_deadlock __IP_DECL__)
+{
+       struct rt_mutex_waiter waiter;
+       int ret = 0;
+
+       debug_rt_mutex_init_waiter(&waiter);
+       waiter.task = NULL;
+
+       spin_lock(&lock->wait_lock);
+
+       /* Try to acquire the lock again: */
+       if (try_to_take_rt_mutex(lock __IP__)) {
+               spin_unlock(&lock->wait_lock);
+               return 0;
+       }
+
+       set_current_state(state);
+
+       /* Set up the timer when timeout != NULL */
+       if (unlikely(timeout))
+               hrtimer_start(&timeout->timer, timeout->timer.expires,
+                             HRTIMER_ABS);
+
+       for (;;) {
+               /* Try to acquire the lock: */
+               if (try_to_take_rt_mutex(lock __IP__))
+                       break;
+
+               /*
+                * TASK_INTERRUPTIBLE checks for signals and
+                * timeout. Ignored otherwise.
+                */
+               if (unlikely(state == TASK_INTERRUPTIBLE)) {
+                       /* Signal pending? */
+                       if (signal_pending(current))
+                               ret = -EINTR;
+                       if (timeout && !timeout->task)
+                               ret = -ETIMEDOUT;
+                       if (ret)
+                               break;
+               }
+
+               /*
+                * waiter.task is NULL the first time we come here and
+                * when we have been woken up by the previous owner
+                * but the lock got stolen by a higher prio task.
+                */
+               if (!waiter.task) {
+                       ret = task_blocks_on_rt_mutex(lock, &waiter,
+                                                     detect_deadlock __IP__);
+                       /*
+                        * If we got woken up by the owner then start loop
+                        * all over without going into schedule to try
+                        * to get the lock now:
+                        */
+                       if (unlikely(!waiter.task))
+                               continue;
+
+                       if (unlikely(ret))
+                               break;
+               }
+
+               spin_unlock(&lock->wait_lock);
+
+               debug_rt_mutex_print_deadlock(&waiter);
+
+               if (waiter.task)
+                       schedule_rt_mutex(lock);
+
+               spin_lock(&lock->wait_lock);
+               set_current_state(state);
+       }
+
+       set_current_state(TASK_RUNNING);
+
+       if (unlikely(waiter.task))
+               remove_waiter(lock, &waiter __IP__);
+
+       /*
+        * try_to_take_rt_mutex() sets the waiter bit
+        * unconditionally. We might have to fix that up.
+        */
+       fixup_rt_mutex_waiters(lock);
+
+       spin_unlock(&lock->wait_lock);
+
+       /* Remove pending timer: */
+       if (unlikely(timeout))
+               hrtimer_cancel(&timeout->timer);
+
+       /*
+        * Readjust priority, when we did not get the lock. We might
+        * have been the pending owner and boosted. Since we did not
+        * take the lock, the PI boost has to go.
+        */
+       if (unlikely(ret))
+               rt_mutex_adjust_prio(current);
+
+       debug_rt_mutex_free_waiter(&waiter);
+
+       return ret;
+}
+
+/*
+ * Slow path try-lock function:
+ */
+static inline int
+rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
+{
+       int ret = 0;
+
+       spin_lock(&lock->wait_lock);
+
+       if (likely(rt_mutex_owner(lock) != current)) {
+
+               ret = try_to_take_rt_mutex(lock __IP__);
+               /*
+                * try_to_take_rt_mutex() sets the lock waiters
+                * bit unconditionally. Clean this up.
+                */
+               fixup_rt_mutex_waiters(lock);
+       }
+
+       spin_unlock(&lock->wait_lock);
+
+       return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+       spin_lock(&lock->wait_lock);
+
+       debug_rt_mutex_unlock(lock);
+
+       rt_mutex_deadlock_account_unlock(current);
+
+       if (!rt_mutex_has_waiters(lock)) {
+               lock->owner = NULL;
+               spin_unlock(&lock->wait_lock);
+               return;
+       }
+
+       wakeup_next_waiter(lock);
+
+       spin_unlock(&lock->wait_lock);
+
+       /* Undo pi boosting if necessary: */
+       rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug-aware fast / slowpath lock, trylock, unlock
+ *
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+                 int detect_deadlock,
+                 int (*slowfn)(struct rt_mutex *lock, int state,
+                               struct hrtimer_sleeper *timeout,
+                               int detect_deadlock __IP_DECL__))
+{
+       if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+               rt_mutex_deadlock_account_lock(lock, current);
+               return 0;
+       } else
+               return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
+}
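
The same fast-path/slow-path split can be sketched in user space with one compare-and-swap on an owner pointer (illustrative only; the kernel's rt_mutex_cmpxchg() is a macro over cmpxchg() and coexists with the state bits described earlier):

#include <stdatomic.h>
#include <stdio.h>

typedef _Atomic(void *) owner_t;

static int slowlock(owner_t *owner)	/* stand-in for rt_mutex_slowlock() */
{
	printf("taking the slow path\n");
	return 0;
}

/* Fast path: a single CAS from "no owner" to "current"; anything else
 * (contention, debug builds, no cmpxchg) falls back to the slow path. */
static int fastlock(owner_t *owner, void *current_task)
{
	void *expected = NULL;

	if (atomic_compare_exchange_strong(owner, &expected, current_task))
		return 0;
	return slowlock(owner);
}

int main(void)
{
	owner_t owner = NULL;
	int me;

	fastlock(&owner, &me);		/* uncontended: CAS succeeds */
	fastlock(&owner, &me);		/* already owned: slow path */
	return 0;
}

Uncontended lock/unlock therefore costs a single atomic operation; everything involving waiters, PI boosting or debugging goes through the slow path.
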
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+                       struct hrtimer_sleeper *timeout, int detect_deadlock,
+                       int (*slowfn)(struct rt_mutex *lock, int state,
+                                     struct hrtimer_sleeper *timeout,
+                                     int detect_deadlock __IP_DECL__))
+{
+       if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+               rt_mutex_deadlock_account_lock(lock, current);
+               return 0;
+       } else
+               return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+                    int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
+{
+       if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+               rt_mutex_deadlock_account_lock(lock, current);
+               return 1;
+       }
+       return slowfn(lock __RET_IP__);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+                   void (*slowfn)(struct rt_mutex *lock))
+{
+       if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+               rt_mutex_deadlock_account_unlock(current);
+       else
+               slowfn(lock);
+}
+
+/**
+ * rt_mutex_lock - lock a rt_mutex
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+       might_sleep();
+
+       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptibly
+ *
+ * @lock:              the rt_mutex to be locked
+ * @detect_deadlock:   deadlock detection on/off
+ *
+ * Returns:
+ *  0          on success
+ * -EINTR      when interrupted by a signal
+ * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
+                                                int detect_deadlock)
+{
+       might_sleep();
+
+       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
+                                detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_timed_lock - lock a rt_mutex interruptibly, with a timeout;
+ *                      the timeout structure is provided
+ *                      by the caller
+ *
+ * @lock:              the rt_mutex to be locked
+ * @timeout:           timeout structure or NULL (no timeout)
+ * @detect_deadlock:   deadlock detection on/off
+ *
+ * Returns:
+ *  0          on success
+ * -EINTR      when interrupted by a signal
+ * -ETIMEDOUT  when the timeout expired
+ * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
+                   int detect_deadlock)
+{
+       might_sleep();
+
+       return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+                                      detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock:      the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+       return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+       rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/**
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+       WARN_ON(rt_mutex_is_locked(lock));
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+       lock->magic = NULL;
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ *
+ * @lock: the rt lock to be initialized
+ *
+ * Initialize the rt lock to unlocked state.
+ *
+ * Initializing a locked rt lock is not allowed.
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+       lock->owner = NULL;
+       spin_lock_init(&lock->wait_lock);
+       plist_head_init(&lock->wait_list, &lock->wait_lock);
+
+       debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ *                             proxy owner
+ *
+ * @lock:      the rt_mutex to be locked
+ * @proxy_owner: the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+                               struct task_struct *proxy_owner)
+{
+       __rt_mutex_init(lock, NULL);
+       debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__);
+       rt_mutex_set_owner(lock, proxy_owner, 0);
+       rt_mutex_deadlock_account_lock(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock:      the rt_mutex to be unlocked
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+                          struct task_struct *proxy_owner)
+{
+       debug_rt_mutex_proxy_unlock(lock);
+       rt_mutex_set_owner(lock, NULL, 0);
+       rt_mutex_deadlock_account_unlock(proxy_owner);
+}
+
+/**
+ * rt_mutex_next_owner - return the next owner of the lock
+ *
+ * @lock: the rt lock to query
+ *
+ * Returns the next owner of the lock or NULL
+ *
+ * Caller has to serialize against other accessors to the lock
+ * itself.
+ *
+ * Special API call for PI-futex support
+ */
+struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+{
+       if (!rt_mutex_has_waiters(lock))
+               return NULL;
+
+       return rt_mutex_top_waiter(lock)->task;
+}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644 (file)
index 0000000..1e0fca1
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define __IP_DECL__
+#define __IP__
+#define __RET_IP__
+#define rt_mutex_deadlock_check(l)                     (0)
+#define rt_mutex_deadlock_account_lock(m, t)           do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l)            do { } while (0)
+#define debug_rt_mutex_init_waiter(w)                  do { } while (0)
+#define debug_rt_mutex_free_waiter(w)                  do { } while (0)
+#define debug_rt_mutex_lock(l)                         do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p)                 do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l)                 do { } while (0)
+#define debug_rt_mutex_unlock(l)                       do { } while (0)
+#define debug_rt_mutex_init(m, n)                      do { } while (0)
+#define debug_rt_mutex_deadlock(d, a, l)               do { } while (0)
+#define debug_rt_mutex_print_deadlock(w)               do { } while (0)
+#define debug_rt_mutex_detect_deadlock(w,d)            (d)
+#define debug_rt_mutex_reset_waiter(w)                 do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644 (file)
index 0000000..9c75856
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include <linux/rtmutex.h>
+
+/*
+ * The rt_mutex in-kernel tester is independent of rt_mutex debugging. We
+ * call schedule_rt_mutex_test() instead of schedule() for the tasks which
+ * belong to the tester. That way we can delay the wakeup path of those
+ * threads to provoke lock stealing and testing of complex boosting scenarios.
+ */
+#ifdef CONFIG_RT_MUTEX_TESTER
+
+extern void schedule_rt_mutex_test(struct rt_mutex *lock);
+
+#define schedule_rt_mutex(_lock)                               \
+  do {                                                         \
+       if (!(current->flags & PF_MUTEX_TESTER))                \
+               schedule();                                     \
+       else                                                    \
+               schedule_rt_mutex_test(_lock);                  \
+  } while (0)
+
+#else
+# define schedule_rt_mutex(_lock)                      schedule()
+#endif
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack of the blocked task.
+ *
+ * @list_entry:                pi node to enqueue into the mutex waiters list
+ * @pi_list_entry:     pi node to enqueue into the mutex owner waiters list
+ * @task:              task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+       struct plist_node       list_entry;
+       struct plist_node       pi_list_entry;
+       struct task_struct      *task;
+       struct rt_mutex         *lock;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+       unsigned long           ip;
+       pid_t                   deadlock_task_pid;
+       struct rt_mutex         *deadlock_lock;
+#endif
+};
+
+/*
+ * Various helpers to access the waiters-plist:
+ */
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+       return !plist_head_empty(&lock->wait_list);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+       struct rt_mutex_waiter *w;
+
+       w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
+                              list_entry);
+       BUG_ON(w->lock != lock);
+
+       return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+       return !plist_head_empty(&p->pi_waiters);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+       return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
+                                 pi_list_entry);
+}
+
+/*
+ * lock->owner state tracking:
+ */
+#define RT_MUTEX_OWNER_PENDING 1UL
+#define RT_MUTEX_HAS_WAITERS   2UL
+#define RT_MUTEX_OWNER_MASKALL 3UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+       return (struct task_struct *)
+               ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
+}
+
+static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
+{
+       return (struct task_struct *)
+               ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
+}
+
+static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
+{
+       return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
+}
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+                                      struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+                                 struct task_struct *proxy_owner);
+#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index a856040c200a4213bd8b0b2d8b469e25bcc1d57f..2629c1711fd62be84574153e0ae62077895f3b36 100644 (file)
  */
 
 #define SCALE_PRIO(x, prio) \
-       max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
+       max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 
-static unsigned int task_timeslice(task_t *p)
+static unsigned int static_prio_timeslice(int static_prio)
 {
-       if (p->static_prio < NICE_TO_PRIO(0))
-               return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
+       if (static_prio < NICE_TO_PRIO(0))
+               return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
        else
-               return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
+               return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
+
+static inline unsigned int task_timeslice(task_t *p)
+{
+       return static_prio_timeslice(p->static_prio);
+}
+
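
With the constants this kernel uses (DEF_TIMESLICE of 100 ms, MIN_TIMESLICE of 5 ms) and assuming HZ=1000 so that one jiffy equals one millisecond, the scaling works out as in this stand-alone sketch:

#include <stdio.h>

#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define MAX_USER_PRIO	40
#define NICE_TO_PRIO(n)	(MAX_RT_PRIO + (n) + 20)
#define DEF_TIMESLICE	100	/* ms, assuming HZ == 1000 */
#define MIN_TIMESLICE	5	/* ms */

static long max_l(long a, long b) { return a > b ? a : b; }

#define SCALE_PRIO(x, prio) \
	max_l((long)(x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static long static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	printf("nice -20 -> %ld ms\n", static_prio_timeslice(NICE_TO_PRIO(-20)));
	printf("nice   0 -> %ld ms\n", static_prio_timeslice(NICE_TO_PRIO(0)));
	printf("nice  19 -> %ld ms\n", static_prio_timeslice(NICE_TO_PRIO(19)));
	return 0;
}

So a nice -20 task gets roughly 800 ms, nice 0 gets 100 ms, and nice 19 gets the 5 ms floor.
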
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)      \
                                < (long long) (sd)->cache_hot_time)
 
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p)
  * These are the runqueue data structures:
  */
 
-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
-
 typedef struct runqueue runqueue_t;
 
 struct prio_array {
        unsigned int nr_active;
-       unsigned long bitmap[BITMAP_SIZE];
+       DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
        struct list_head queue[MAX_PRIO];
 };
 
@@ -209,6 +213,7 @@ struct runqueue {
         * remote CPUs use both these fields when doing load calculation.
         */
        unsigned long nr_running;
+       unsigned long raw_weighted_load;
 #ifdef CONFIG_SMP
        unsigned long cpu_load[3];
 #endif
@@ -239,7 +244,6 @@ struct runqueue {
 
        task_t *migration_thread;
        struct list_head migration_queue;
-       int cpu;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -350,12 +354,31 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
+/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called with interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+       __acquires(rq->lock)
+{
+       struct runqueue *rq;
+
+repeat_lock_task:
+       rq = task_rq(p);
+       spin_lock(&rq->lock);
+       if (unlikely(rq != task_rq(p))) {
+               spin_unlock(&rq->lock);
+               goto repeat_lock_task;
+       }
+       return rq;
+}
+
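
The lock-then-recheck idiom used by __task_rq_lock() can be sketched in user space (illustrative types, compile with -pthread): lock whatever runqueue the task appears to be on, then verify the task did not migrate while we waited for the lock.

#include <pthread.h>
#include <stdio.h>

struct rq {
	pthread_mutex_t lock;
	int cpu;
};

struct task {
	struct rq *rq;	/* runqueue the task currently resides on */
};

static struct rq *toy_task_rq_lock(struct task *p)
{
	struct rq *rq;

	for (;;) {
		rq = p->rq;
		pthread_mutex_lock(&rq->lock);
		if (rq == p->rq)	/* task did not migrate meanwhile */
			return rq;
		pthread_mutex_unlock(&rq->lock);
	}
}

int main(void)
{
	struct rq rq0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct task p = { &rq0 };
	struct rq *rq = toy_task_rq_lock(&p);

	printf("locked runqueue of cpu %d\n", rq->cpu);
	pthread_mutex_unlock(&rq->lock);
	return 0;
}
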
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
  */
-static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
        __acquires(rq->lock)
 {
        struct runqueue *rq;
@@ -371,6 +394,12 @@ repeat_lock_task:
        return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+       __releases(rq->lock)
+{
+       spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
        __releases(rq->lock)
 {
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
        int bonus, prio;
 
-       if (rt_task(p))
-               return p->prio;
-
        bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
        prio = p->static_prio - bonus;
@@ -664,6 +691,106 @@ static int effective_prio(task_t *p)
        return prio;
 }
 
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+/*
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * If static_prio_timeslice() is ever changed to break this assumption then
+ * this code will need modification
+ */
+#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define LOAD_WEIGHT(lp) \
+       (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
+#define PRIO_TO_LOAD_WEIGHT(prio) \
+       LOAD_WEIGHT(static_prio_timeslice(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+       (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+
+static void set_load_weight(task_t *p)
+{
+       if (has_rt_policy(p)) {
+#ifdef CONFIG_SMP
+               if (p == task_rq(p)->migration_thread)
+                       /*
+                        * The migration thread does the actual balancing.
+                        * Giving its load any weight will skew balancing
+                        * adversely.
+                        */
+                       p->load_weight = 0;
+               else
+#endif
+                       p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+       } else
+               p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
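
Continuing the timeslice sketch, and assuming SCHED_LOAD_SCALE is 128 as elsewhere in this tree, the per-task weights come out as below; a nice-0 task contributes exactly SCHED_LOAD_SCALE, so raw_weighted_load generalizes the old nr_running * SCHED_LOAD_SCALE estimate.

#include <stdio.h>

#define SCHED_LOAD_SCALE	128UL	/* assumed value, see include/linux/sched.h */
#define DEF_TIMESLICE		100	/* ms timeslice of a nice-0 task */
#define TIME_SLICE_NICE_ZERO	DEF_TIMESLICE

#define LOAD_WEIGHT(lp) \
	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)

int main(void)
{
	/* timeslices (ms) for nice -20 / 0 / 19, per static_prio_timeslice() */
	printf("nice -20 weight = %lu\n", LOAD_WEIGHT(800));	/* 1024 */
	printf("nice   0 weight = %lu\n", LOAD_WEIGHT(100));	/* 128 == SCHED_LOAD_SCALE */
	printf("nice  19 weight = %lu\n", LOAD_WEIGHT(5));	/* 6 */
	return 0;
}
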
+
+static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+       rq->raw_weighted_load += p->load_weight;
+}
+
+static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+{
+       rq->raw_weighted_load -= p->load_weight;
+}
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+       rq->nr_running++;
+       inc_raw_weighted_load(rq, p);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+       rq->nr_running--;
+       dec_raw_weighted_load(rq, p);
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+       int prio;
+
+       if (has_rt_policy(p))
+               prio = MAX_RT_PRIO-1 - p->rt_priority;
+       else
+               prio = __normal_prio(p);
+       return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+       p->normal_prio = normal_prio(p);
+       /*
+        * If we are RT tasks or we were boosted to RT priority,
+        * keep the priority unchanged. Otherwise, update priority
+        * to the normal priority:
+        */
+       if (!rt_prio(p->prio))
+               return p->normal_prio;
+       return p->prio;
+}
+
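
To make the new split concrete: normal_prio ignores PI boosting, so an RT task's value depends only on its rt_priority, while a SCHED_NORMAL task keeps its interactivity-adjusted value. A toy sketch (lower value means higher priority):

#include <stdio.h>

#define MAX_RT_PRIO	100

/* Illustrative only: static_prio_based stands in for __normal_prio(). */
static int toy_normal_prio(int has_rt_policy, int rt_priority, int static_prio_based)
{
	return has_rt_policy ? MAX_RT_PRIO - 1 - rt_priority : static_prio_based;
}

int main(void)
{
	/* SCHED_FIFO, rt_priority 50 -> prio 49 */
	printf("%d\n", toy_normal_prio(1, 50, 0));
	/* SCHED_NORMAL, interactivity-adjusted prio 118 stays 118 */
	printf("%d\n", toy_normal_prio(0, 0, 118));
	return 0;
}
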
 /*
  * __activate_task - move a task to the runqueue.
  */
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
        if (batch_task(p))
                target = rq->expired;
        enqueue_task(p, target);
-       rq->nr_running++;
+       inc_nr_running(p, rq);
 }
 
 /*
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
        enqueue_task_head(p, rq->active);
-       rq->nr_running++;
+       inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
        /* Caller must always ensure 'now >= p->timestamp' */
-       unsigned long long __sleep_time = now - p->timestamp;
-       unsigned long sleep_time;
+       unsigned long sleep_time = now - p->timestamp;
 
        if (batch_task(p))
                sleep_time = 0;
-       else {
-               if (__sleep_time > NS_MAX_SLEEP_AVG)
-                       sleep_time = NS_MAX_SLEEP_AVG;
-               else
-                       sleep_time = (unsigned long)__sleep_time;
-       }
 
        if (likely(sleep_time > 0)) {
                /*
-                * User tasks that sleep a long time are categorised as
-                * idle. They will only have their sleep_avg increased to a
-                * level that makes them just interactive priority to stay
-                * active yet prevent them suddenly becoming cpu hogs and
-                * starving other processes.
+                * This ceiling is set to the lowest priority that would allow
+                * a task to be reinserted into the active array on timeslice
+                * completion.
                 */
-               if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
-                               unsigned long ceiling;
+               unsigned long ceiling = INTERACTIVE_SLEEP(p);
 
-                               ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
-                                       DEF_TIMESLICE);
-                               if (p->sleep_avg < ceiling)
-                                       p->sleep_avg = ceiling;
+               if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
+                       /*
+                        * Prevents user tasks from achieving best priority
+                        * with a single, large enough sleep.
+                        */
+                       p->sleep_avg = ceiling;
+                       /*
+                        * Using INTERACTIVE_SLEEP() as a ceiling places a
+                        * nice(0) task 1ms sleep away from promotion, and
+                        * gives it 700ms to round-robin with no chance of
+                        * being demoted.  This is more than generous, so
+                        * mark this sleep as non-interactive to prevent the
+                        * on-runqueue bonus logic from intervening should
+                        * this task not receive cpu immediately.
+                        */
+                       p->sleep_type = SLEEP_NONINTERACTIVE;
                } else {
                        /*
                         * Tasks waking from uninterruptible sleep are
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
                         * are likely to be waiting on I/O
                         */
                        if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
-                               if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
+                               if (p->sleep_avg >= ceiling)
                                        sleep_time = 0;
                                else if (p->sleep_avg + sleep_time >=
-                                               INTERACTIVE_SLEEP(p)) {
-                                       p->sleep_avg = INTERACTIVE_SLEEP(p);
-                                       sleep_time = 0;
+                                        ceiling) {
+                                               p->sleep_avg = ceiling;
+                                               sleep_time = 0;
                                }
                        }
 
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
                         */
                        p->sleep_avg += sleep_time;
 
-                       if (p->sleep_avg > NS_MAX_SLEEP_AVG)
-                               p->sleep_avg = NS_MAX_SLEEP_AVG;
                }
+               if (p->sleep_avg > NS_MAX_SLEEP_AVG)
+                       p->sleep_avg = NS_MAX_SLEEP_AVG;
        }
 
        return effective_prio(p);
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-       rq->nr_running--;
+       dec_nr_running(p, rq);
        dequeue_task(p, p->array);
        p->array = NULL;
 }
@@ -860,6 +993,12 @@ inline int task_curr(const task_t *p)
        return cpu_curr(task_cpu(p)) == p;
 }
 
+/* Used instead of source_load when we know the type == 0 */
+unsigned long weighted_cpuload(const int cpu)
+{
+       return cpu_rq(cpu)->raw_weighted_load;
+}
+
 #ifdef CONFIG_SMP
 typedef struct {
        struct list_head list;
@@ -949,7 +1088,8 @@ void kick_process(task_t *p)
 }
 
 /*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
  *
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
@@ -957,24 +1097,36 @@ void kick_process(task_t *p)
 static inline unsigned long source_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
-       unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
        if (type == 0)
-               return load_now;
+               return rq->raw_weighted_load;
 
-       return min(rq->cpu_load[type-1], load_now);
+       return min(rq->cpu_load[type-1], rq->raw_weighted_load);
 }
 
 /*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
  */
 static inline unsigned long target_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
-       unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
        if (type == 0)
-               return load_now;
+               return rq->raw_weighted_load;
+
+       return max(rq->cpu_load[type-1], rq->raw_weighted_load);
+}
+
+/*
+ * Return the average load per task on the cpu's run queue
+ */
+static inline unsigned long cpu_avg_load_per_task(int cpu)
+{
+       runqueue_t *rq = cpu_rq(cpu);
+       unsigned long n = rq->nr_running;
 
-       return max(rq->cpu_load[type-1], load_now);
+       return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
 }
 
 /*
@@ -1047,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
        cpus_and(tmp, group->cpumask, p->cpus_allowed);
 
        for_each_cpu_mask(i, tmp) {
-               load = source_load(i, 0);
+               load = weighted_cpuload(i);
 
                if (load < min_load || (load == min_load && i == this_cpu)) {
                        min_load = load;
@@ -1074,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag)
        struct task_struct *t = current;
        struct sched_domain *tmp, *sd = NULL;
 
-       for_each_domain(cpu, tmp)
+       for_each_domain(cpu, tmp) {
+               /*
+                * If power savings logic is enabled for a domain, stop there.
+                */
+               if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+                       break;
                if (tmp->flags & flag)
                        sd = tmp;
+       }
 
        while (sd) {
                cpumask_t span;
@@ -1226,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 
                if (this_sd->flags & SD_WAKE_AFFINE) {
                        unsigned long tl = this_load;
+                       unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
+
                        /*
                         * If sync wakeup then subtract the (maximum possible)
                         * effect of the currently running task from the load
                         * of the current CPU:
                         */
                        if (sync)
-                               tl -= SCHED_LOAD_SCALE;
+                               tl -= current->load_weight;
 
                        if ((tl <= load &&
-                               tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
-                               100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+                               tl + target_load(cpu, idx) <= tl_per_task) ||
+                               100*(tl + p->load_weight) <= imbalance*load) {
                                /*
                                 * This domain has SD_WAKE_AFFINE and
                                 * p is cache cold in this domain, and
@@ -1353,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
         * event cannot wake it up and insert it on the runqueue either.
         */
        p->state = TASK_RUNNING;
+
+       /*
+        * Make sure we do not leak PI boosting priority to the child:
+        */
+       p->prio = current->normal_prio;
+
        INIT_LIST_HEAD(&p->run_list);
        p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1432,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
                                __activate_task(p, rq);
                        else {
                                p->prio = current->prio;
+                               p->normal_prio = current->normal_prio;
                                list_add_tail(&p->run_list, &current->run_list);
                                p->array = current->array;
                                p->array->nr_active++;
-                               rq->nr_running++;
+                               inc_nr_running(p, rq);
                        }
                        set_need_resched();
                } else
@@ -1653,7 +1820,8 @@ unsigned long nr_uninterruptible(void)
 
 unsigned long long nr_context_switches(void)
 {
-       unsigned long long i, sum = 0;
+       int i;
+       unsigned long long sum = 0;
 
        for_each_possible_cpu(i)
                sum += cpu_rq(i)->nr_switches;
@@ -1691,9 +1859,6 @@ unsigned long nr_active(void)
 /*
  * double_rq_lock - safely lock two runqueues
  *
- * We must take them in cpu order to match code in
- * dependent_sleeper and wake_dependent_sleeper.
- *
  * Note this does not disable interrupts like task_rq_lock,
  * you need to do so manually before calling.
  */
@@ -1705,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
                spin_lock(&rq1->lock);
                __acquire(rq2->lock);   /* Fake it out ;) */
        } else {
-               if (rq1->cpu < rq2->cpu) {
+               if (rq1 < rq2) {
                        spin_lock(&rq1->lock);
                        spin_lock(&rq2->lock);
                } else {
@@ -1741,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
        __acquires(this_rq->lock)
 {
        if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest->cpu < this_rq->cpu) {
+               if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
                        spin_lock(&busiest->lock);
                        spin_lock(&this_rq->lock);
@@ -1804,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
               runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
        dequeue_task(p, src_array);
-       src_rq->nr_running--;
+       dec_nr_running(p, src_rq);
        set_task_cpu(p, this_cpu);
-       this_rq->nr_running++;
+       inc_nr_running(p, this_rq);
        enqueue_task(p, this_array);
        p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
                                + this_rq->timestamp_last_tick;
@@ -1853,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
        return 1;
 }
 
+#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
 /*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
  *
  * Called with both runqueues locked.
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-                     unsigned long max_nr_move, struct sched_domain *sd,
-                     enum idle_type idle, int *all_pinned)
+                     unsigned long max_nr_move, unsigned long max_load_move,
+                     struct sched_domain *sd, enum idle_type idle,
+                     int *all_pinned)
 {
        prio_array_t *array, *dst_array;
        struct list_head *head, *curr;
-       int idx, pulled = 0, pinned = 0;
+       int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
+       int busiest_best_prio_seen;
+       int skip_for_load; /* skip the task based on weighted load issues */
+       long rem_load_move;
        task_t *tmp;
 
-       if (max_nr_move == 0)
+       if (max_nr_move == 0 || max_load_move == 0)
                goto out;
 
+       rem_load_move = max_load_move;
        pinned = 1;
+       this_best_prio = rq_best_prio(this_rq);
+       busiest_best_prio = rq_best_prio(busiest);
+       /*
+        * Enable handling of the case where there is more than one task
+        * with the best priority. If the currently running task is one
+        * of those with prio == busiest_best_prio, we know it won't be moved
+        * and therefore it's safe to override the skip (based on load) of
+        * any task we find with that prio.
+        */
+       busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
 
        /*
         * We first consider expired tasks. Those will likely not be
@@ -1912,7 +2093,17 @@ skip_queue:
 
        curr = curr->prev;
 
-       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+       /*
+        * To help distribute high-priority tasks across CPUs we don't
+        * skip a task if it will be the highest-priority task (i.e. smallest
+        * prio value) on its new queue, regardless of its load weight.
+        */
+       skip_for_load = tmp->load_weight > rem_load_move;
+       if (skip_for_load && idx < this_best_prio)
+               skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
+       if (skip_for_load ||
+           !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+               busiest_best_prio_seen |= idx == busiest_best_prio;
                if (curr != head)
                        goto skip_queue;
                idx++;
@@ -1926,9 +2117,15 @@ skip_queue:
 
        pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
        pulled++;
+       rem_load_move -= tmp->load_weight;
 
-       /* We only want to steal up to the prescribed number of tasks. */
-       if (pulled < max_nr_move) {
+       /*
+        * We only want to steal up to the prescribed number of tasks
+        * and the prescribed amount of weighted load.
+        */
+       if (pulled < max_nr_move && rem_load_move > 0) {
+               if (idx < this_best_prio)
+                       this_best_prio = idx;
                if (curr != head)
                        goto skip_queue;
                idx++;
@@ -1949,7 +2146,7 @@ out:
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the number of tasks which should be
+ * domain. It calculates and returns the amount of weighted load which should be
  * moved to restore balance via the imbalance parameter.
  */
 static struct sched_group *
@@ -1959,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
        unsigned long max_pull;
+       unsigned long busiest_load_per_task, busiest_nr_running;
+       unsigned long this_load_per_task, this_nr_running;
        int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       int power_savings_balance = 1;
+       unsigned long leader_nr_running = 0, min_load_per_task = 0;
+       unsigned long min_nr_running = ULONG_MAX;
+       struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
 
        max_load = this_load = total_load = total_pwr = 0;
+       busiest_load_per_task = busiest_nr_running = 0;
+       this_load_per_task = this_nr_running = 0;
        if (idle == NOT_IDLE)
                load_idx = sd->busy_idx;
        else if (idle == NEWLY_IDLE)
@@ -1970,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                load_idx = sd->idle_idx;
 
        do {
-               unsigned long load;
+               unsigned long load, group_capacity;
                int local_group;
                int i;
+               unsigned long sum_nr_running, sum_weighted_load;
 
                local_group = cpu_isset(this_cpu, group->cpumask);
 
                /* Tally up the load of all CPUs in the group */
-               avg_load = 0;
+               sum_weighted_load = sum_nr_running = avg_load = 0;
 
                for_each_cpu_mask(i, group->cpumask) {
+                       runqueue_t *rq = cpu_rq(i);
+
                        if (*sd_idle && !idle_cpu(i))
                                *sd_idle = 0;
 
@@ -1990,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                                load = source_load(i, load_idx);
 
                        avg_load += load;
+                       sum_nr_running += rq->nr_running;
+                       sum_weighted_load += rq->raw_weighted_load;
                }
 
                total_load += avg_load;
@@ -1998,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                /* Adjust by relative CPU power of the group */
                avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
+               group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
                if (local_group) {
                        this_load = avg_load;
                        this = group;
-               } else if (avg_load > max_load) {
+                       this_nr_running = sum_nr_running;
+                       this_load_per_task = sum_weighted_load;
+               } else if (avg_load > max_load &&
+                          sum_nr_running > group_capacity) {
                        max_load = avg_load;
                        busiest = group;
+                       busiest_nr_running = sum_nr_running;
+                       busiest_load_per_task = sum_weighted_load;
                }
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+               /*
+                * Busy processors will not participate in power savings
+                * balance.
+                */
+               if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+                       goto group_next;
+
+               /*
+                * If the local group is idle or completely loaded there is
+                * no need to do power savings balance at this domain
+                */
+               if (local_group && (this_nr_running >= group_capacity ||
+                                   !this_nr_running))
+                       power_savings_balance = 0;
+
+               /*
+                * If a group is already running at full capacity or idle,
+                * don't include that group in power savings calculations
+                */
+               if (!power_savings_balance || sum_nr_running >= group_capacity
+                   || !sum_nr_running)
+                       goto group_next;
+
+               /*
+                * Calculate the group which has the least non-idle load.
+                * This is the group from which we need to pick up load
+                * in order to save power
+                */
+               if ((sum_nr_running < min_nr_running) ||
+                   (sum_nr_running == min_nr_running &&
+                    first_cpu(group->cpumask) <
+                    first_cpu(group_min->cpumask))) {
+                       group_min = group;
+                       min_nr_running = sum_nr_running;
+                       min_load_per_task = sum_weighted_load /
+                                               sum_nr_running;
+               }
+
+               /*
+                * Calculate the group which is nearly at its capacity
+                * but still has some room to pick up load from other
+                * groups and save more power
+                */
+               if (sum_nr_running <= group_capacity - 1)
+                       if (sum_nr_running > leader_nr_running ||
+                           (sum_nr_running == leader_nr_running &&
+                            first_cpu(group->cpumask) >
+                             first_cpu(group_leader->cpumask))) {
+                               group_leader = group;
+                               leader_nr_running = sum_nr_running;
+                       }
+
+group_next:
+#endif
                group = group->next;
        } while (group != sd->groups);
 
-       if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+       if (!busiest || this_load >= max_load || busiest_nr_running == 0)
                goto out_balanced;
 
        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
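The CONFIG_SCHED_MC/CONFIG_SCHED_SMT block added above tracks two extra groups while the statistics are gathered: group_min, the non-idle, non-full group with the fewest running tasks, and group_leader, the group closest to (but still below) its capacity. Reading the out_balanced path further down, the intent appears to be that when the balancing CPU sits in group_leader and that group differs from group_min, group_min is returned with *imbalance set to its per-task load, so its few tasks get pulled over and the nearly idle package can be powered down. As a rough worked example: take two dual-core packages each running a single task. Both are below capacity, so one ends up as group_min and the other as group_leader (the first_cpu() comparisons break the tie); an idle core in the leader package then pulls the lone task from group_min, leaving that whole package free to idle.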
@@ -2017,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                        100*max_load <= sd->imbalance_pct*this_load)
                goto out_balanced;
 
+       busiest_load_per_task /= busiest_nr_running;
        /*
         * We're trying to get all the cpus to the average_load, so we don't
         * want to push ourselves above the average load, nor do we wish to
@@ -2028,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         * by pulling tasks to us.  Be careful of negative numbers as they'll
         * appear as very large values with unsigned longs.
         */
+       if (max_load <= busiest_load_per_task)
+               goto out_balanced;
+
+       /*
+        * In the presence of smp nice balancing, certain scenarios can have
+        * max load less than avg load (as we skip the groups at or below
+        * their cpu_power while calculating max_load)
+        */
+       if (max_load < avg_load) {
+               *imbalance = 0;
+               goto small_imbalance;
+       }
 
        /* Don't want to pull so many tasks that a group would go idle */
-       max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+       max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
        /* How much load to actually move to equalise the imbalance */
        *imbalance = min(max_pull * busiest->cpu_power,
                                (avg_load - this_load) * this->cpu_power)
                        / SCHED_LOAD_SCALE;
 
-       if (*imbalance < SCHED_LOAD_SCALE) {
-               unsigned long pwr_now = 0, pwr_move = 0;
+       /*
+        * If *imbalance is less than the average load per runnable task
+        * there is no guarantee that any tasks will be moved, so consider
+        * bumping its value to force at least one task to be moved
+        */
+       if (*imbalance < busiest_load_per_task) {
+               unsigned long pwr_now, pwr_move;
                unsigned long tmp;
+               unsigned int imbn;
+
+small_imbalance:
+               pwr_move = pwr_now = 0;
+               imbn = 2;
+               if (this_nr_running) {
+                       this_load_per_task /= this_nr_running;
+                       if (busiest_load_per_task > this_load_per_task)
+                               imbn = 1;
+               } else
+                       this_load_per_task = SCHED_LOAD_SCALE;
 
-               if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
-                       *imbalance = 1;
+               if (max_load - this_load >= busiest_load_per_task * imbn) {
+                       *imbalance = busiest_load_per_task;
                        return busiest;
                }
 
@@ -2052,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 * moving them.
                 */
 
-               pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-               pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+               pwr_now += busiest->cpu_power *
+                       min(busiest_load_per_task, max_load);
+               pwr_now += this->cpu_power *
+                       min(this_load_per_task, this_load);
                pwr_now /= SCHED_LOAD_SCALE;
 
                /* Amount of load we'd subtract */
-               tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+               tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
                if (max_load > tmp)
-                       pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-                                                       max_load - tmp);
+                       pwr_move += busiest->cpu_power *
+                               min(busiest_load_per_task, max_load - tmp);
 
                /* Amount of load we'd add */
                if (max_load*busiest->cpu_power <
-                               SCHED_LOAD_SCALE*SCHED_LOAD_SCALE)
+                               busiest_load_per_task*SCHED_LOAD_SCALE)
                        tmp = max_load*busiest->cpu_power/this->cpu_power;
                else
-                       tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-               pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
+                       tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
+               pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;
 
                /* Move if we gain throughput */
                if (pwr_move <= pwr_now)
                        goto out_balanced;
 
-               *imbalance = 1;
-               return busiest;
+               *imbalance = busiest_load_per_task;
        }
 
-       /* Get rid of the scaling factor, rounding down as we divide */
-       *imbalance = *imbalance / SCHED_LOAD_SCALE;
        return busiest;
 
 out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+               goto ret;
 
+       if (this == group_leader && group_leader != group_min) {
+               *imbalance = min_load_per_task;
+               return group_min;
+       }
+ret:
+#endif
        *imbalance = 0;
        return NULL;
 }
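With loads now expressed as weights rather than task counts, the SCHED_LOAD_SCALE thresholds above are replaced by busiest_load_per_task. Purely illustrative numbers for the small-imbalance path: suppose this_load = 300, max_load = 520, and both groups average 100 of weighted load per task. If the computed *imbalance comes out below 100 it might move nothing at all; this_load_per_task (100) is not below busiest_load_per_task, so imbn stays 2, and because max_load - this_load = 220 >= 2 * 100 the code simply sets *imbalance = 100, i.e. exactly one average task is moved. When that shortcut does not apply, the pwr_now/pwr_move comparison decides whether moving one such task would still improve throughput.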
@@ -2093,18 +2406,21 @@ out_balanced:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static runqueue_t *find_busiest_queue(struct sched_group *group,
-       enum idle_type idle)
+       enum idle_type idle, unsigned long imbalance)
 {
-       unsigned long load, max_load = 0;
-       runqueue_t *busiest = NULL;
+       unsigned long max_load = 0;
+       runqueue_t *busiest = NULL, *rqi;
        int i;
 
        for_each_cpu_mask(i, group->cpumask) {
-               load = source_load(i, 0);
+               rqi = cpu_rq(i);
+
+               if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
+                       continue;
 
-               if (load > max_load) {
-                       max_load = load;
-                       busiest = cpu_rq(i);
+               if (rqi->raw_weighted_load > max_load) {
+                       max_load = rqi->raw_weighted_load;
+                       busiest = rqi;
                }
        }
 
@@ -2117,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
  */
 #define MAX_PINNED_INTERVAL    512
 
+#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
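minus_1_or_zero() is used further down as the max_nr_move argument to move_tasks(), as minus_1_or_zero(busiest->nr_running), which looks intended to always leave at least one task on the busiest runqueue: minus_1_or_zero(1) evaluates to 0, so a queue with a single running task is never drained, and minus_1_or_zero(0) stays 0 instead of wrapping around.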
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2133,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
        int active_balance = 0;
        int sd_idle = 0;
 
-       if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+       if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+           !sched_smt_power_savings)
                sd_idle = 1;
 
        schedstat_inc(sd, lb_cnt[idle]);
@@ -2144,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                goto out_balanced;
        }
 
-       busiest = find_busiest_queue(group, idle);
+       busiest = find_busiest_queue(group, idle, imbalance);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[idle]);
                goto out_balanced;
@@ -2164,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 */
                double_rq_lock(this_rq, busiest);
                nr_moved = move_tasks(this_rq, this_cpu, busiest,
+                                       minus_1_or_zero(busiest->nr_running),
                                        imbalance, sd, idle, &all_pinned);
                double_rq_unlock(this_rq, busiest);
 
@@ -2221,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                        sd->balance_interval *= 2;
        }
 
-       if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+       if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+           !sched_smt_power_savings)
                return -1;
        return nr_moved;
 
@@ -2236,7 +2556,7 @@ out_one_pinned:
                        (sd->balance_interval < sd->max_interval))
                sd->balance_interval *= 2;
 
-       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                return -1;
        return 0;
 }
@@ -2257,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
        int nr_moved = 0;
        int sd_idle = 0;
 
-       if (sd->flags & SD_SHARE_CPUPOWER)
+       if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                sd_idle = 1;
 
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2267,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
                goto out_balanced;
        }
 
-       busiest = find_busiest_queue(group, NEWLY_IDLE);
+       busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
                goto out_balanced;
@@ -2282,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
                /* Attempt to move tasks */
                double_lock_balance(this_rq, busiest);
                nr_moved = move_tasks(this_rq, this_cpu, busiest,
+                                       minus_1_or_zero(busiest->nr_running),
                                        imbalance, sd, NEWLY_IDLE, NULL);
                spin_unlock(&busiest->lock);
        }
@@ -2297,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
 out_balanced:
        schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
-       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                return -1;
        sd->nr_balance_failed = 0;
        return 0;
@@ -2352,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
        double_lock_balance(busiest_rq, target_rq);
 
        /* Search for an sd spanning us and the target CPU. */
-       for_each_domain(target_cpu, sd)
+       for_each_domain(target_cpu, sd) {
                if ((sd->flags & SD_LOAD_BALANCE) &&
                        cpu_isset(busiest_cpu, sd->span))
                                break;
+       }
 
        if (unlikely(sd == NULL))
                goto out;
 
        schedstat_inc(sd, alb_cnt);
 
-       if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+       if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+                       RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
                schedstat_inc(sd, alb_pushed);
        else
                schedstat_inc(sd, alb_failed);
@@ -2390,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
        struct sched_domain *sd;
        int i;
 
-       this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+       this_load = this_rq->raw_weighted_load;
        /* Update our load */
        for (i = 0; i < 3; i++) {
                unsigned long new_load = this_load;
@@ -2691,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
                resched_task(rq->idle);
 }
 
-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+/*
+ * Called with interrupts disabled and this_rq's runqueue locked.
+ */
+static void wake_sleeping_dependent(int this_cpu)
 {
        struct sched_domain *tmp, *sd = NULL;
-       cpumask_t sibling_map;
        int i;
 
-       for_each_domain(this_cpu, tmp)
-               if (tmp->flags & SD_SHARE_CPUPOWER)
+       for_each_domain(this_cpu, tmp) {
+               if (tmp->flags & SD_SHARE_CPUPOWER) {
                        sd = tmp;
+                       break;
+               }
+       }
 
        if (!sd)
                return;
 
-       /*
-        * Unlock the current runqueue because we have to lock in
-        * CPU order to avoid deadlocks. Caller knows that we might
-        * unlock. We keep IRQs disabled.
-        */
-       spin_unlock(&this_rq->lock);
-
-       sibling_map = sd->span;
-
-       for_each_cpu_mask(i, sibling_map)
-               spin_lock(&cpu_rq(i)->lock);
-       /*
-        * We clear this CPU from the mask. This both simplifies the
-        * inner loop and keps this_rq locked when we exit:
-        */
-       cpu_clear(this_cpu, sibling_map);
-
-       for_each_cpu_mask(i, sibling_map) {
+       for_each_cpu_mask(i, sd->span) {
                runqueue_t *smt_rq = cpu_rq(i);
 
+               if (i == this_cpu)
+                       continue;
+               if (unlikely(!spin_trylock(&smt_rq->lock)))
+                       continue;
+
                wakeup_busy_runqueue(smt_rq);
+               spin_unlock(&smt_rq->lock);
        }
-
-       for_each_cpu_mask(i, sibling_map)
-               spin_unlock(&cpu_rq(i)->lock);
-       /*
-        * We exit with this_cpu's rq still held and IRQs
-        * still disabled:
-        */
 }
 
 /*
@@ -2745,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
        return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }
 
-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+/*
+ * To minimise lock contention and not have to drop this_rq's runqueue lock we
+ * only trylock the sibling runqueues and bypass those runqueues if we fail to
+ * acquire their lock. As we only trylock, the normal locking order does not
+ * need to be obeyed.
+ */
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
 {
        struct sched_domain *tmp, *sd = NULL;
-       cpumask_t sibling_map;
-       prio_array_t *array;
        int ret = 0, i;
-       task_t *p;
 
-       for_each_domain(this_cpu, tmp)
-               if (tmp->flags & SD_SHARE_CPUPOWER)
+       /* kernel/rt threads do not participate in dependent sleeping */
+       if (!p->mm || rt_task(p))
+               return 0;
+
+       for_each_domain(this_cpu, tmp) {
+               if (tmp->flags & SD_SHARE_CPUPOWER) {
                        sd = tmp;
+                       break;
+               }
+       }
 
        if (!sd)
                return 0;
 
-       /*
-        * The same locking rules and details apply as for
-        * wake_sleeping_dependent():
-        */
-       spin_unlock(&this_rq->lock);
-       sibling_map = sd->span;
-       for_each_cpu_mask(i, sibling_map)
-               spin_lock(&cpu_rq(i)->lock);
-       cpu_clear(this_cpu, sibling_map);
+       for_each_cpu_mask(i, sd->span) {
+               runqueue_t *smt_rq;
+               task_t *smt_curr;
 
-       /*
-        * Establish next task to be run - it might have gone away because
-        * we released the runqueue lock above:
-        */
-       if (!this_rq->nr_running)
-               goto out_unlock;
-       array = this_rq->active;
-       if (!array->nr_active)
-               array = this_rq->expired;
-       BUG_ON(!array->nr_active);
+               if (i == this_cpu)
+                       continue;
 
-       p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
-               task_t, run_list);
+               smt_rq = cpu_rq(i);
+               if (unlikely(!spin_trylock(&smt_rq->lock)))
+                       continue;
 
-       for_each_cpu_mask(i, sibling_map) {
-               runqueue_t *smt_rq = cpu_rq(i);
-               task_t *smt_curr = smt_rq->curr;
+               smt_curr = smt_rq->curr;
 
-               /* Kernel threads do not participate in dependent sleeping */
-               if (!p->mm || !smt_curr->mm || rt_task(p))
-                       goto check_smt_task;
+               if (!smt_curr->mm)
+                       goto unlock;
 
                /*
                 * If a user task with lower static priority than the
@@ -2808,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
                        if ((jiffies % DEF_TIMESLICE) >
                                (sd->per_cpu_gain * DEF_TIMESLICE / 100))
                                        ret = 1;
-               } else
+               } else {
                        if (smt_curr->static_prio < p->static_prio &&
                                !TASK_PREEMPTS_CURR(p, smt_rq) &&
                                smt_slice(smt_curr, sd) > task_timeslice(p))
                                        ret = 1;
-
-check_smt_task:
-               if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
-                       rt_task(smt_curr))
-                               continue;
-               if (!p->mm) {
-                       wakeup_busy_runqueue(smt_rq);
-                       continue;
-               }
-
-               /*
-                * Reschedule a lower priority task on the SMT sibling for
-                * it to be put to sleep, or wake it up if it has been put to
-                * sleep for priority reasons to see if it should run now.
-                */
-               if (rt_task(p)) {
-                       if ((jiffies % DEF_TIMESLICE) >
-                               (sd->per_cpu_gain * DEF_TIMESLICE / 100))
-                                       resched_task(smt_curr);
-               } else {
-                       if (TASK_PREEMPTS_CURR(p, smt_rq) &&
-                               smt_slice(p, sd) > task_timeslice(smt_curr))
-                                       resched_task(smt_curr);
-                       else
-                               wakeup_busy_runqueue(smt_rq);
                }
+unlock:
+               spin_unlock(&smt_rq->lock);
        }
-out_unlock:
-       for_each_cpu_mask(i, sibling_map)
-               spin_unlock(&cpu_rq(i)->lock);
        return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
+static inline void wake_sleeping_dependent(int this_cpu)
 {
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
+                                       task_t *p)
 {
        return 0;
 }
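The rewritten wake_sleeping_dependent()/dependent_sleeper() above replace the old lock-all-siblings-in-CPU-order scheme with opportunistic spin_trylock() on each sibling runqueue. The pattern in isolation, as a hedged user-space sketch (pthread spinlocks stand in for runqueue locks; this is not kernel code):

    #include <pthread.h>

    #define NSIBLINGS 4
    static pthread_spinlock_t sibling_lock[NSIBLINGS];  /* assume initialised
                                                          * via pthread_spin_init() */

    static void visit_siblings(int this_cpu)
    {
            int i;

            for (i = 0; i < NSIBLINGS; i++) {
                    if (i == this_cpu)
                            continue;
                    /* Never block: if the sibling's lock is contended, skip it
                     * this round, so no global lock ordering is needed and the
                     * caller can keep holding its own lock throughout. */
                    if (pthread_spin_trylock(&sibling_lock[i]))
                            continue;
                    /* ... inspect or kick the sibling here ... */
                    pthread_spin_unlock(&sibling_lock[i]);
            }
    }

The trade-off, as the comment in the patch says, is that a contended sibling is simply bypassed for that invocation rather than waited for.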
@@ -2972,32 +3251,13 @@ need_resched_nonpreemptible:
 
        cpu = smp_processor_id();
        if (unlikely(!rq->nr_running)) {
-go_idle:
                idle_balance(cpu, rq);
                if (!rq->nr_running) {
                        next = rq->idle;
                        rq->expired_timestamp = 0;
-                       wake_sleeping_dependent(cpu, rq);
-                       /*
-                        * wake_sleeping_dependent() might have released
-                        * the runqueue, so break out if we got new
-                        * tasks meanwhile:
-                        */
-                       if (!rq->nr_running)
-                               goto switch_tasks;
-               }
-       } else {
-               if (dependent_sleeper(cpu, rq)) {
-                       next = rq->idle;
+                       wake_sleeping_dependent(cpu);
                        goto switch_tasks;
                }
-               /*
-                * dependent_sleeper() releases and reacquires the runqueue
-                * lock, hence go into the idle loop if the rq went
-                * empty meanwhile:
-                */
-               if (unlikely(!rq->nr_running))
-                       goto go_idle;
        }
 
        array = rq->active;
@@ -3035,6 +3295,8 @@ go_idle:
                }
        }
        next->sleep_type = SLEEP_NORMAL;
+       if (dependent_sleeper(cpu, rq, next))
+               next = rq->idle;
 switch_tasks:
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
@@ -3478,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+       unsigned long flags;
+       prio_array_t *array;
+       runqueue_t *rq;
+       int oldprio;
+
+       BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+       rq = task_rq_lock(p, &flags);
+
+       oldprio = p->prio;
+       array = p->array;
+       if (array)
+               dequeue_task(p, array);
+       p->prio = prio;
+
+       if (array) {
+               /*
+                * If changing to an RT priority then queue it
+                * in the active array!
+                */
+               if (rt_task(p))
+                       array = rq->active;
+               enqueue_task(p, array);
+               /*
+                * Reschedule if we are currently running on this runqueue and
+                * our priority decreased, or if we are not currently running on
+                * this runqueue and our priority is higher than the current's
+                */
+               if (task_running(rq, p)) {
+                       if (p->prio > oldprio)
+                               resched_task(rq->curr);
+               } else if (TASK_PREEMPTS_CURR(p, rq))
+                       resched_task(rq->curr);
+       }
+       task_rq_unlock(rq, &flags);
+}
+
+#endif
+
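rt_mutex_setprio() gives the rt-mutex code a single entry point for boosting and un-boosting a lock owner. A hedged sketch of how a caller might use it (hypothetical helpers, not the actual rt_mutex implementation; lower prio values mean higher priority):

    /* Boost the lock owner to the priority of a newly blocked waiter. */
    static void boost_owner(task_t *owner, task_t *waiter)
    {
            if (waiter->prio < owner->prio)
                    rt_mutex_setprio(owner, waiter->prio);
    }

    /* On unlock, fall back to whatever rt_mutex_getprio() reports, which is
     * expected to be the highest remaining waiter priority or normal_prio. */
    static void deboost_owner(task_t *owner)
    {
            rt_mutex_setprio(owner, rt_mutex_getprio(owner));
    }

Because rt_mutex_setprio() leaves ->normal_prio alone, dropping the boost restores whatever priority the task's policy and nice value would give it.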
 void set_user_nice(task_t *p, long nice)
 {
        unsigned long flags;
        prio_array_t *array;
        runqueue_t *rq;
-       int old_prio, new_prio, delta;
+       int old_prio, delta;
 
        if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
                return;
@@ -3498,22 +3813,25 @@ void set_user_nice(task_t *p, long nice)
         * it won't have any effect on scheduling until the task is
         * not SCHED_NORMAL/SCHED_BATCH:
         */
-       if (rt_task(p)) {
+       if (has_rt_policy(p)) {
                p->static_prio = NICE_TO_PRIO(nice);
                goto out_unlock;
        }
        array = p->array;
-       if (array)
+       if (array) {
                dequeue_task(p, array);
+               dec_raw_weighted_load(rq, p);
+       }
 
-       old_prio = p->prio;
-       new_prio = NICE_TO_PRIO(nice);
-       delta = new_prio - old_prio;
        p->static_prio = NICE_TO_PRIO(nice);
-       p->prio += delta;
+       set_load_weight(p);
+       old_prio = p->prio;
+       p->prio = effective_prio(p);
+       delta = p->prio - old_prio;
 
        if (array) {
                enqueue_task(p, array);
+               inc_raw_weighted_load(rq, p);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -3524,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
 out_unlock:
        task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3639,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
        BUG_ON(p->array);
        p->policy = policy;
        p->rt_priority = prio;
-       if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-               p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-       } else {
-               p->prio = p->static_prio;
-               /*
-                * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-                */
-               if (policy == SCHED_BATCH)
-                       p->sleep_avg = 0;
-       }
+       p->normal_prio = normal_prio(p);
+       /* we are holding p->pi_lock already */
+       p->prio = rt_mutex_getprio(p);
+       /*
+        * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+        */
+       if (policy == SCHED_BATCH)
+               p->sleep_avg = 0;
+       set_load_weight(p);
 }
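With this split, p->normal_prio is what policy and nice alone dictate, while p->prio is the effective value and may be boosted via rt_mutex_getprio(). As an illustrative scenario: a SCHED_NORMAL task that transiently owns an rt-mutex wanted by a SCHED_FIFO waiter has its ->prio pulled into the RT range by the PI code, but its ->normal_prio is untouched, so policy or nice changes made while boosted still record the correct base value and the boost can be dropped cleanly afterwards.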
 
 /**
@@ -3667,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy,
        unsigned long flags;
        runqueue_t *rq;
 
+       /* may grab non-irq protected spin_locks */
+       BUG_ON(in_interrupt());
 recheck:
        /* double check policy once rq lock held */
        if (policy < 0)
@@ -3714,15 +4032,21 @@ recheck:
        retval = security_task_setscheduler(p, policy, param);
        if (retval)
                return retval;
+       /*
+        * make sure no PI-waiters arrive (or leave) while we are
+        * changing the priority of the task:
+        */
+       spin_lock_irqsave(&p->pi_lock, flags);
        /*
         * To be able to change p->policy safely, the apropriate
         * runqueue lock must be held.
         */
-       rq = task_rq_lock(p, &flags);
+       rq = __task_rq_lock(p);
        /* recheck policy now with rq lock held */
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
-               task_rq_unlock(rq, &flags);
+               __task_rq_unlock(rq);
+               spin_unlock_irqrestore(&p->pi_lock, flags);
                goto recheck;
        }
        array = p->array;
@@ -3743,7 +4067,11 @@ recheck:
                } else if (TASK_PREEMPTS_CURR(p, rq))
                        resched_task(rq->curr);
        }
-       task_rq_unlock(rq, &flags);
+       __task_rq_unlock(rq);
+       spin_unlock_irqrestore(&p->pi_lock, flags);
+
+       rt_mutex_adjust_pi(p);
+
        return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -3765,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
                read_unlock_irq(&tasklist_lock);
                return -ESRCH;
        }
-       retval = sched_setscheduler(p, policy, &lparam);
+       get_task_struct(p);
        read_unlock_irq(&tasklist_lock);
+       retval = sched_setscheduler(p, policy, &lparam);
+       put_task_struct(p);
        return retval;
 }
 
@@ -4378,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu)
        idle->timestamp = sched_clock();
        idle->sleep_avg = 0;
        idle->array = NULL;
-       idle->prio = MAX_PRIO;
+       idle->prio = idle->normal_prio = MAX_PRIO;
        idle->state = TASK_RUNNING;
        idle->cpus_allowed = cpumask_of_cpu(cpu);
        set_task_cpu(idle, cpu);
@@ -4474,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
  *
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if the task was successfully migrated.
  */
-static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
        runqueue_t *rq_dest, *rq_src;
+       int ret = 0;
 
        if (unlikely(cpu_is_offline(dest_cpu)))
-               return;
+               return ret;
 
        rq_src = cpu_rq(src_cpu);
        rq_dest = cpu_rq(dest_cpu);
@@ -4508,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
                if (TASK_PREEMPTS_CURR(p, rq_dest))
                        resched_task(rq_dest->curr);
        }
-
+       ret = 1;
 out:
        double_rq_unlock(rq_src, rq_dest);
+       return ret;
 }
 
 /*
@@ -4580,9 +4914,12 @@ wait_to_die:
 /* Figure out where task on dead CPU should go, use force if neccessary. */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
 {
+       runqueue_t *rq;
+       unsigned long flags;
        int dest_cpu;
        cpumask_t mask;
 
+restart:
        /* On same node? */
        mask = node_to_cpumask(cpu_to_node(dead_cpu));
        cpus_and(mask, mask, tsk->cpus_allowed);
@@ -4594,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
 
        /* No more Mr. Nice Guy. */
        if (dest_cpu == NR_CPUS) {
+               rq = task_rq_lock(tsk, &flags);
                cpus_setall(tsk->cpus_allowed);
                dest_cpu = any_online_cpu(tsk->cpus_allowed);
+               task_rq_unlock(rq, &flags);
 
                /*
                 * Don't tell them about moving exiting tasks or
@@ -4607,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
                               "longer affine to cpu%d\n",
                               tsk->pid, tsk->comm, dead_cpu);
        }
-       __migrate_task(tsk, dead_cpu, dest_cpu);
+       if (!__migrate_task(tsk, dead_cpu, dest_cpu))
+               goto restart;
 }
 
 /*
@@ -4734,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
  */
-static int migration_call(struct notifier_block *nfb, unsigned long action,
-                         void *hcpu)
+static int __cpuinit migration_call(struct notifier_block *nfb,
+                       unsigned long action,
+                       void *hcpu)
 {
        int cpu = (long)hcpu;
        struct task_struct *p;
@@ -4805,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
 /* Register at highest priority so that task migration (migrate_all_tasks)
  * happens before everything else.
  */
-static struct notifier_block migration_notifier = {
+static struct notifier_block __cpuinitdata migration_notifier = {
        .notifier_call = migration_call,
        .priority = 10
 };
@@ -5606,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node)
 }
 #endif
 
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 /*
  * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
  * can switch it on easily if needed.
@@ -5621,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu)
 
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static struct sched_group *sched_group_core_bycpu[NR_CPUS];
 #endif
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5637,7 +5979,7 @@ static int cpu_to_core_group(int cpu)
 #endif
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
 static int cpu_to_phys_group(int cpu)
 {
 #if defined(CONFIG_SCHED_MC)
@@ -5694,13 +6036,74 @@ next_sg:
 }
 #endif
 
+/* Free memory allocated for various sched_group structures */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+       int cpu;
+#ifdef CONFIG_NUMA
+       int i;
+
+       for_each_cpu_mask(cpu, *cpu_map) {
+               struct sched_group *sched_group_allnodes
+                       = sched_group_allnodes_bycpu[cpu];
+               struct sched_group **sched_group_nodes
+                       = sched_group_nodes_bycpu[cpu];
+
+               if (sched_group_allnodes) {
+                       kfree(sched_group_allnodes);
+                       sched_group_allnodes_bycpu[cpu] = NULL;
+               }
+
+               if (!sched_group_nodes)
+                       continue;
+
+               for (i = 0; i < MAX_NUMNODES; i++) {
+                       cpumask_t nodemask = node_to_cpumask(i);
+                       struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+                       cpus_and(nodemask, nodemask, *cpu_map);
+                       if (cpus_empty(nodemask))
+                               continue;
+
+                       if (sg == NULL)
+                               continue;
+                       sg = sg->next;
+next_sg:
+                       oldsg = sg;
+                       sg = sg->next;
+                       kfree(oldsg);
+                       if (oldsg != sched_group_nodes[i])
+                               goto next_sg;
+               }
+               kfree(sched_group_nodes);
+               sched_group_nodes_bycpu[cpu] = NULL;
+       }
+#endif
+       for_each_cpu_mask(cpu, *cpu_map) {
+               if (sched_group_phys_bycpu[cpu]) {
+                       kfree(sched_group_phys_bycpu[cpu]);
+                       sched_group_phys_bycpu[cpu] = NULL;
+               }
+#ifdef CONFIG_SCHED_MC
+               if (sched_group_core_bycpu[cpu]) {
+                       kfree(sched_group_core_bycpu[cpu]);
+                       sched_group_core_bycpu[cpu] = NULL;
+               }
+#endif
+       }
+}
+
 /*
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-void build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const cpumask_t *cpu_map)
 {
        int i;
+       struct sched_group *sched_group_phys = NULL;
+#ifdef CONFIG_SCHED_MC
+       struct sched_group *sched_group_core = NULL;
+#endif
 #ifdef CONFIG_NUMA
        struct sched_group **sched_group_nodes = NULL;
        struct sched_group *sched_group_allnodes = NULL;
@@ -5708,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
        /*
         * Allocate the per-node list of sched groups
         */
-       sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
-                                          GFP_ATOMIC);
+       sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+                                          GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return;
+               return -ENOMEM;
        }
        sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 #endif
@@ -5738,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
                                if (!sched_group_allnodes) {
                                        printk(KERN_WARNING
                                        "Can not alloc allnodes sched group\n");
-                                       break;
+                                       goto error;
                                }
                                sched_group_allnodes_bycpu[i]
                                                = sched_group_allnodes;
@@ -5759,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
                cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
+               if (!sched_group_phys) {
+                       sched_group_phys
+                               = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+                                         GFP_KERNEL);
+                       if (!sched_group_phys) {
+                               printk(KERN_WARNING "Can not alloc phys sched "
+                                                    "group\n");
+                               goto error;
+                       }
+                       sched_group_phys_bycpu[i] = sched_group_phys;
+               }
+
                p = sd;
                sd = &per_cpu(phys_domains, i);
                group = cpu_to_phys_group(i);
@@ -5768,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
                sd->groups = &sched_group_phys[group];
 
 #ifdef CONFIG_SCHED_MC
+               if (!sched_group_core) {
+                       sched_group_core
+                               = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+                                         GFP_KERNEL);
+                       if (!sched_group_core) {
+                               printk(KERN_WARNING "Can not alloc core sched "
+                                                    "group\n");
+                               goto error;
+                       }
+                       sched_group_core_bycpu[i] = sched_group_core;
+               }
+
                p = sd;
                sd = &per_cpu(core_domains, i);
                group = cpu_to_core_group(i);
@@ -5851,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
                domainspan = sched_domain_node_span(i);
                cpus_and(domainspan, domainspan, *cpu_map);
 
-               sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               if (!sg) {
+                       printk(KERN_WARNING "Can not alloc domain group for "
+                               "node %d\n", i);
+                       goto error;
+               }
                sched_group_nodes[i] = sg;
                for_each_cpu_mask(j, nodemask) {
                        struct sched_domain *sd;
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
-                       if (sd->groups == NULL) {
-                               /* Turn off balancing if we have no groups */
-                               sd->flags = 0;
-                       }
-               }
-               if (!sg) {
-                       printk(KERN_WARNING
-                       "Can not alloc domain group for node %d\n", i);
-                       continue;
                }
                sg->cpu_power = 0;
                sg->cpumask = nodemask;
+               sg->next = sg;
                cpus_or(covered, covered, nodemask);
                prev = sg;
 
@@ -5887,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
                        if (cpus_empty(tmp))
                                continue;
 
-                       sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+                       sg = kmalloc_node(sizeof(struct sched_group),
+                                         GFP_KERNEL, i);
                        if (!sg) {
                                printk(KERN_WARNING
                                "Can not alloc domain group for node %d\n", j);
-                               break;
+                               goto error;
                        }
                        sg->cpu_power = 0;
                        sg->cpumask = tmp;
+                       sg->next = prev->next;
                        cpus_or(covered, covered, tmp);
                        prev->next = sg;
                        prev = sg;
                }
-               prev->next = sched_group_nodes[i];
        }
 #endif
 
        /* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
        for_each_cpu_mask(i, *cpu_map) {
-               int power;
                struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
-               power = SCHED_LOAD_SCALE;
-               sd->groups->cpu_power = power;
+               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+       }
 #endif
 #ifdef CONFIG_SCHED_MC
+       for_each_cpu_mask(i, *cpu_map) {
+               int power;
+               struct sched_domain *sd;
                sd = &per_cpu(core_domains, i);
-               power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+               if (sched_smt_power_savings)
+                       power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+               else
+                       power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
                                            * SCHED_LOAD_SCALE / 10;
                sd->groups->cpu_power = power;
+       }
+#endif
 
+       for_each_cpu_mask(i, *cpu_map) {
+               struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
                sd = &per_cpu(phys_domains, i);
+               if (i != first_cpu(sd->groups->cpumask))
+                       continue;
 
-               /*
-                * This has to be < 2 * SCHED_LOAD_SCALE
-                * Lets keep it SCHED_LOAD_SCALE, so that
-                * while calculating NUMA group's cpu_power
-                * we can simply do
-                *  numa_group->cpu_power += phys_group->cpu_power;
-                *
-                * See "only add power once for each physical pkg"
-                * comment below
-                */
-               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+               sd->groups->cpu_power = 0;
+               if (sched_mc_power_savings || sched_smt_power_savings) {
+                       int j;
+
+                       for_each_cpu_mask(j, sd->groups->cpumask) {
+                               struct sched_domain *sd1;
+                               sd1 = &per_cpu(core_domains, j);
+                               /*
+                                * for each core we will add once
+                                * to the group in physical domain
+                                */
+                               if (j != first_cpu(sd1->groups->cpumask))
+                                       continue;
+
+                               if (sched_smt_power_savings)
+                                       sd->groups->cpu_power += sd1->groups->cpu_power;
+                               else
+                                       sd->groups->cpu_power += SCHED_LOAD_SCALE;
+                       }
+               } else
+                       /*
+                        * This has to be < 2 * SCHED_LOAD_SCALE
+                        * Lets keep it SCHED_LOAD_SCALE, so that
+                        * while calculating NUMA group's cpu_power
+                        * we can simply do
+                        *  numa_group->cpu_power += phys_group->cpu_power;
+                        *
+                        * See "only add power once for each physical pkg"
+                        * comment below
+                        */
+                       sd->groups->cpu_power = SCHED_LOAD_SCALE;
 #else
+               int power;
                sd = &per_cpu(phys_domains, i);
-               power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                               (cpus_weight(sd->groups->cpumask)-1) / 10;
+               if (sched_smt_power_savings)
+                       power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+               else
+                       power = SCHED_LOAD_SCALE;
                sd->groups->cpu_power = power;
 #endif
        }
@@ -5962,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
         * Tune cache-hot values:
         */
        calibrate_migration_costs(cpu_map);
+
+       return 0;
+
+error:
+       free_sched_groups(cpu_map);
+       return -ENOMEM;
 }
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
        cpumask_t cpu_default_map;
+       int err;
 
        /*
         * Setup mask for cpus without special case scheduling requirements.
@@ -5977,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
         */
        cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
 
-       build_sched_domains(&cpu_default_map);
+       err = build_sched_domains(&cpu_default_map);
+
+       return err;
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-#ifdef CONFIG_NUMA
-       int i;
-       int cpu;
-
-       for_each_cpu_mask(cpu, *cpu_map) {
-               struct sched_group *sched_group_allnodes
-                       = sched_group_allnodes_bycpu[cpu];
-               struct sched_group **sched_group_nodes
-                       = sched_group_nodes_bycpu[cpu];
-
-               if (sched_group_allnodes) {
-                       kfree(sched_group_allnodes);
-                       sched_group_allnodes_bycpu[cpu] = NULL;
-               }
-
-               if (!sched_group_nodes)
-                       continue;
-
-               for (i = 0; i < MAX_NUMNODES; i++) {
-                       cpumask_t nodemask = node_to_cpumask(i);
-                       struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-                       cpus_and(nodemask, nodemask, *cpu_map);
-                       if (cpus_empty(nodemask))
-                               continue;
-
-                       if (sg == NULL)
-                               continue;
-                       sg = sg->next;
-next_sg:
-                       oldsg = sg;
-                       sg = sg->next;
-                       kfree(oldsg);
-                       if (oldsg != sched_group_nodes[i])
-                               goto next_sg;
-               }
-               kfree(sched_group_nodes);
-               sched_group_nodes_bycpu[cpu] = NULL;
-       }
-#endif
+       free_sched_groups(cpu_map);
 }
 
 /*
@@ -6046,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
  * correct sched domains
  * Call with hotplug lock held
  */
-void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 {
        cpumask_t change_map;
+       int err = 0;
 
        cpus_and(*partition1, *partition1, cpu_online_map);
        cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6057,10 +6488,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
        /* Detach sched domains from all of the affected cpus */
        detach_destroy_domains(&change_map);
        if (!cpus_empty(*partition1))
-               build_sched_domains(partition1);
-       if (!cpus_empty(*partition2))
-               build_sched_domains(partition2);
+               err = build_sched_domains(partition1);
+       if (!err && !cpus_empty(*partition2))
+               err = build_sched_domains(partition2);
+
+       return err;
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+       int err;
+
+       lock_cpu_hotplug();
+       detach_destroy_domains(&cpu_online_map);
+       err = arch_init_sched_domains(&cpu_online_map);
+       unlock_cpu_hotplug();
+
+       return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+       int ret;
+
+       if (buf[0] != '0' && buf[0] != '1')
+               return -EINVAL;
+
+       if (smt)
+               sched_smt_power_savings = (buf[0] == '1');
+       else
+               sched_mc_power_savings = (buf[0] == '1');
+
+       ret = arch_reinit_sched_domains();
+
+       return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+       int err = 0;
+#ifdef CONFIG_SCHED_SMT
+       if (smt_capable())
+               err = sysfs_create_file(&cls->kset.kobj,
+                                       &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+       if (!err && mc_capable())
+               err = sysfs_create_file(&cls->kset.kobj,
+                                       &attr_sched_mc_power_savings.attr);
+#endif
+       return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+       return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+       return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+           sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+       return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+       return sched_power_savings_store(buf, count, 1);
 }
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+           sched_smt_power_savings_store);
+#endif
+
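Assuming sched_create_sysfs_power_savings_entries() is hooked up to the cpu sysdev class by the architecture code (as its cls->kset.kobj argument suggests), the two attributes should surface in sysfs, e.g. as /sys/devices/system/cpu/sched_mc_power_savings and .../sched_smt_power_savings. Writing '1' enables the corresponding power-savings balancing policy, anything other than '0' or '1' is rejected with -EINVAL, and every accepted write rebuilds the scheduler domains via arch_reinit_sched_domains().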
 
 #ifdef CONFIG_HOTPLUG_CPU
 /*
@@ -6143,7 +6650,6 @@ void __init sched_init(void)
                rq->push_cpu = 0;
                rq->migration_thread = NULL;
                INIT_LIST_HEAD(&rq->migration_queue);
-               rq->cpu = i;
 #endif
                atomic_set(&rq->nr_iowait, 0);
 
@@ -6158,6 +6664,7 @@ void __init sched_init(void)
                }
        }
 
+       set_load_weight(&init_task);
        /*
         * The boot idle thread does lazy MMU switching as well:
         */
@@ -6204,11 +6711,12 @@ void normalize_rt_tasks(void)
        runqueue_t *rq;
 
        read_lock_irq(&tasklist_lock);
-       for_each_process (p) {
+       for_each_process(p) {
                if (!rt_task(p))
                        continue;
 
-               rq = task_rq_lock(p, &flags);
+               spin_lock_irqsave(&p->pi_lock, flags);
+               rq = __task_rq_lock(p);
 
                array = p->array;
                if (array)
@@ -6219,7 +6727,8 @@ void normalize_rt_tasks(void)
                        resched_task(rq->curr);
                }
 
-               task_rq_unlock(rq, &flags);
+               __task_rq_unlock(rq);
+               spin_unlock_irqrestore(&p->pi_lock, flags);
        }
        read_unlock_irq(&tasklist_lock);
 }
index 9e2f1c6e73d7b341958c74baa8af54169c88c119..8f03e3b89b5540a0e21b1bab99dcb455bb74ffd5 100644 (file)
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
                                  unsigned long action,
                                  void *hcpu)
 {
@@ -486,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb,
        return NOTIFY_OK;
 }
 
-static struct notifier_block cpu_nfb = {
+static struct notifier_block __devinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
 
index b5c3b94e01ce7408a9f3d0dae310cff0a6939a36..6b76caa229818811f61aaacf16afd9c2b6da5dc5 100644 (file)
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
 /*
  * Create/destroy watchdog threads as CPUs come and go:
  */
-static int
+static int __devinit
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
        int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
        return NOTIFY_OK;
 }
 
-static struct notifier_block cpu_nfb = {
+static struct notifier_block __devinitdata cpu_nfb = {
        .notifier_call = cpu_callback
 };
 
index f1a4eb1a655e31a8a44800f5e6a8ecdd87065071..93a2c53986488f5c29cf05318db80241118bda74 100644 (file)
@@ -133,6 +133,10 @@ extern int acct_parm[];
 extern int no_unaligned_warning;
 #endif
 
+#ifdef CONFIG_RT_MUTEXES
+extern int max_lock_depth;
+#endif
+
 static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
                       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -688,6 +692,17 @@ static ctl_table kern_table[] = {
                .proc_handler   = &proc_dointvec,
        },
 #endif
+#ifdef CONFIG_RT_MUTEXES
+       {
+               .ctl_name       = KERN_MAX_LOCK_DEPTH,
+               .procname       = "max_lock_depth",
+               .data           = &max_lock_depth,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
+
        { .ctl_name = 0 }
 };
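Given the procname above, the rt-mutex PI chain depth limit should become tunable at run time as /proc/sys/kernel/max_lock_depth (equivalently, sysctl kernel.max_lock_depth), writable by root per the 0644 mode. max_lock_depth itself is only declared extern here and is presumably defined in the new rt-mutex code added elsewhere in this series.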
 
@@ -927,6 +942,18 @@ static ctl_table vm_table[] = {
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
+#endif
+#ifdef CONFIG_X86_32
+       {
+               .ctl_name       = VM_VDSO_ENABLED,
+               .procname       = "vdso_enabled",
+               .data           = &vdso_enabled,
+               .maxlen         = sizeof(vdso_enabled),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+       },
 #endif
        { .ctl_name = 0 }
 };
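
The new kern_table entry above exposes the rt-mutex deadlock-detection
chain-walk limit through proc_dointvec, which, given the "max_lock_depth"
procname, should surface as /proc/sys/kernel/max_lock_depth. A minimal
userspace sketch reading that file; the path is inferred from the entry, not
stated in the patch:

        /* Read the rt-mutex chain-walk limit exposed by the sysctl above. */
        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/proc/sys/kernel/max_lock_depth", "r");
                int depth;

                if (!f) {
                        perror("fopen");
                        return 1;
                }
                if (fscanf(f, "%d", &depth) == 1)
                        printf("rt-mutex chain walk limit: %d\n", depth);
                fclose(f);
                return 0;
        }
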
index 5bb6b7976eecf6c215b561b42b4d79c4197507eb..5a8960253063d00495366a48d92e06e5eb4133fa 100644 (file)
@@ -1652,7 +1652,7 @@ static void __devinit migrate_timers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int timer_cpu_notify(struct notifier_block *self,
+static int __devinit timer_cpu_notify(struct notifier_block *self,
                                unsigned long action, void *hcpu)
 {
        long cpu = (long)hcpu;
@@ -1672,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
-static struct notifier_block timers_nb = {
+static struct notifier_block __devinitdata timers_nb = {
        .notifier_call  = timer_cpu_notify,
 };
 
index 565cf7a1febda94b88582c6e9326d782fb29f96c..59f0b42bd89e0e819a1a67145f48a11f2898bbd2 100644 (file)
@@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 }
 
 /* We're holding the cpucontrol mutex here */
-static int workqueue_cpu_callback(struct notifier_block *nfb,
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                                  unsigned long action,
                                  void *hcpu)
 {
index 3de93357f5ab1e05c370164e5e390eb7d0595f9f..f6299342b882d86f56b70655879b5c8ba792f3d4 100644 (file)
@@ -86,4 +86,10 @@ config TEXTSEARCH_BM
 config TEXTSEARCH_FSM
        tristate
 
+#
+# plist support is selected if needed
+#
+config PLIST
+       boolean
+
 endmenu
index 8bab0102ac739d181bbddc7fc5a14f2d562353c3..5330911ebd303ed875657c3e739e5bb966dd5c71 100644 (file)
@@ -107,6 +107,24 @@ config DEBUG_MUTEXES
         This allows mutex semantics violations and mutex related deadlocks
         (lockups) to be detected and reported automatically.
 
+config DEBUG_RT_MUTEXES
+       bool "RT Mutex debugging, deadlock detection"
+       depends on DEBUG_KERNEL && RT_MUTEXES
+       help
+        This allows rt mutex semantics violations and rt mutex related
+        deadlocks (lockups) to be detected and reported automatically.
+
+config DEBUG_PI_LIST
+       bool
+       default y
+       depends on DEBUG_RT_MUTEXES
+
+config RT_MUTEX_TESTER
+       bool "Built-in scriptable tester for rt-mutexes"
+       depends on DEBUG_KERNEL && RT_MUTEXES
+       help
+         This option enables a rt-mutex tester.
+
 config DEBUG_SPINLOCK
        bool "Spinlock debugging"
        depends on DEBUG_KERNEL
index 79358ad1f11353d3ada95577e41a9a857f0f3805..10c13c9d7824d21aa6425001b3a1d0c0403edfa5 100644 (file)
@@ -25,6 +25,7 @@ lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
+obj-$(CONFIG_PLIST) += plist.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 
 ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
diff --git a/lib/plist.c b/lib/plist.c
new file mode 100644 (file)
index 0000000..3074a02
--- /dev/null
@@ -0,0 +1,118 @@
+/*
+ * lib/plist.c
+ *
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker <dwalker@mvista.com>
+ *
+ * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Simplifications of the original code by
+ * Oleg Nesterov <oleg@tv-sign.ru>
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ *
+ * This file contains the add / del functions which are considered to
+ * be too large to inline. See include/linux/plist.h for further
+ * information.
+ */
+
+#include <linux/plist.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_DEBUG_PI_LIST
+
+static void plist_check_prev_next(struct list_head *t, struct list_head *p,
+                                 struct list_head *n)
+{
+       if (n->prev != p || p->next != n) {
+               printk("top: %p, n: %p, p: %p\n", t, t->next, t->prev);
+               printk("prev: %p, n: %p, p: %p\n", p, p->next, p->prev);
+               printk("next: %p, n: %p, p: %p\n", n, n->next, n->prev);
+               WARN_ON(1);
+       }
+}
+
+static void plist_check_list(struct list_head *top)
+{
+       struct list_head *prev = top, *next = top->next;
+
+       plist_check_prev_next(top, prev, next);
+       while (next != top) {
+               prev = next;
+               next = prev->next;
+               plist_check_prev_next(top, prev, next);
+       }
+}
+
+static void plist_check_head(struct plist_head *head)
+{
+       WARN_ON(!head->lock);
+       if (head->lock)
+               WARN_ON_SMP(!spin_is_locked(head->lock));
+       plist_check_list(&head->prio_list);
+       plist_check_list(&head->node_list);
+}
+
+#else
+# define plist_check_head(h)   do { } while (0)
+#endif
+
+/**
+ * plist_add - add @node to @head
+ *
+ * @node:      &struct plist_node pointer
+ * @head:      &struct plist_head pointer
+ */
+void plist_add(struct plist_node *node, struct plist_head *head)
+{
+       struct plist_node *iter;
+
+       plist_check_head(head);
+       WARN_ON(!plist_node_empty(node));
+
+       list_for_each_entry(iter, &head->prio_list, plist.prio_list) {
+               if (node->prio < iter->prio)
+                       goto lt_prio;
+               else if (node->prio == iter->prio) {
+                       iter = list_entry(iter->plist.prio_list.next,
+                                       struct plist_node, plist.prio_list);
+                       goto eq_prio;
+               }
+       }
+
+lt_prio:
+       list_add_tail(&node->plist.prio_list, &iter->plist.prio_list);
+eq_prio:
+       list_add_tail(&node->plist.node_list, &iter->plist.node_list);
+
+       plist_check_head(head);
+}
+
+/**
+ * plist_del - Remove a @node from plist.
+ *
+ * @node:      &struct plist_node pointer - entry to be removed
+ * @head:      &struct plist_head pointer - list head
+ */
+void plist_del(struct plist_node *node, struct plist_head *head)
+{
+       plist_check_head(head);
+
+       if (!list_empty(&node->plist.prio_list)) {
+               struct plist_node *next = plist_first(&node->plist);
+
+               list_move_tail(&next->plist.prio_list, &node->plist.prio_list);
+               list_del_init(&node->plist.prio_list);
+       }
+
+       list_del_init(&node->plist.node_list);
+
+       plist_check_head(head);
+}
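
lib/plist.c above only carries the out-of-line add/del operations; the head
and node types live in include/linux/plist.h from the same series. A minimal
usage sketch, assuming that header's plist_head_init()/plist_node_init()
helpers; all names below are illustrative:

        #include <linux/plist.h>
        #include <linux/spinlock.h>

        static DEFINE_SPINLOCK(example_lock);
        static struct plist_head example_head;

        static void example_plist_usage(void)
        {
                struct plist_node a, b;

                plist_head_init(&example_head, &example_lock);
                plist_node_init(&a, 10);
                plist_node_init(&b, 5);         /* lower prio value sorts first */

                spin_lock(&example_lock);
                plist_add(&a, &example_head);
                plist_add(&b, &example_head);   /* b is now first on the list   */
                plist_del(&a, &example_head);
                spin_unlock(&example_lock);
        }
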
index 02a16eacb72dabb50861193db448004b175bd3dc..d84560c076d83cfca8a0d2d19b6214ed1cac23ca 100644 (file)
       bytes, which is the maximum length that can be coded.  inflate_fast()
       requires strm->avail_out >= 258 for each loop to avoid checking for
       output space.
+
+    - @start:  inflate()'s starting value for strm->avail_out
  */
-void inflate_fast(strm, start)
-z_streamp strm;
-unsigned start;         /* inflate()'s starting value for strm->avail_out */
+void inflate_fast(z_streamp strm, unsigned start)
 {
     struct inflate_state *state;
     unsigned char *in;      /* local strm->next_in */
index da665fbb16aaf4e6b9fdf8f69c25b46bc4785f3e..3fe6ce5b53e51999b0ad15f97843b93c313cb60f 100644 (file)
    table index bits.  It will differ if the request is greater than the
    longest code or if it is less than the shortest code.
  */
-int zlib_inflate_table(type, lens, codes, table, bits, work)
-codetype type;
-unsigned short *lens;
-unsigned codes;
-code **table;
-unsigned *bits;
-unsigned short *work;
+int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes,
+                       code **table, unsigned *bits, unsigned short *work)
 {
     unsigned len;               /* a code's length in bits */
     unsigned sym;               /* index of code symbols */
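
The two zlib hunks above convert old-style (K&R) function definitions, where
parameter types follow the parameter list, into ANSI C prototypes. The same
transformation on a hypothetical function:

        /* Before: K&R-style definition */
        int scale(value, factor)
        int value;
        unsigned factor;
        {
            return value * (int)factor;
        }

        /* After: ANSI definition carrying the types in the signature */
        int scale(int value, unsigned factor)
        {
            return value * (int)factor;
        }
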
index 66e65ab3942651e86f2fbbadcf914fd600e41600..e76c023eb0bb4696be89ffb8185dc1ff502c0ed9 100644 (file)
@@ -116,6 +116,7 @@ config SPARSEMEM_EXTREME
 config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
        depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+       depends on (IA64 || X86 || PPC64)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
        depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
index 9c7334bafda8da5150f4a9f0a7556bda113b8a19..d504d6e98886f2d0d2f5307c7bbe962b8af41e72 100644 (file)
@@ -2095,14 +2095,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
        do {
                unsigned long index;
                unsigned long offset;
-               unsigned long maxlen;
                size_t copied;
 
                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
                index = pos >> PAGE_CACHE_SHIFT;
                bytes = PAGE_CACHE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
+
+               /* Limit the size of the copy to the caller's write size */
+               bytes = min(bytes, count);
+
+               /*
+                * Limit the size of the copy to that of the current segment,
+                * because fault_in_pages_readable() doesn't know how to walk
+                * segments.
+                */
+               bytes = min(bytes, cur_iov->iov_len - iov_base);
 
                /*
                 * Bring in the user page that we will copy from _first_.
@@ -2110,10 +2117,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 */
-               maxlen = cur_iov->iov_len - iov_base;
-               if (maxlen > bytes)
-                       maxlen = bytes;
-               fault_in_pages_readable(buf, maxlen);
+               fault_in_pages_readable(buf, bytes);
 
                page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
                if (!page) {
index 841a077d5aeb0d3423ecba8ed5d81efdd72c2442..ea4038838b0a2b4c95a46cc3d4f1eaad25556f5f 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
+#include <linux/ioport.h>
 
 #include <asm/tlbflush.h>
 
@@ -126,6 +127,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
        unsigned long i;
        unsigned long flags;
        unsigned long onlined_pages = 0;
+       struct resource res;
+       u64 section_end;
+       unsigned long start_pfn;
        struct zone *zone;
        int need_zonelists_rebuild = 0;
 
@@ -148,10 +152,27 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
        if (!populated_zone(zone))
                need_zonelists_rebuild = 1;
 
-       for (i = 0; i < nr_pages; i++) {
-               struct page *page = pfn_to_page(pfn + i);
-               online_page(page);
-               onlined_pages++;
+       res.start = (u64)pfn << PAGE_SHIFT;
+       res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1;
+       res.flags = IORESOURCE_MEM; /* we just need system ram */
+       section_end = res.end;
+
+       while (find_next_system_ram(&res) >= 0) {
+               start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
+               nr_pages = (unsigned long)
+                           ((res.end + 1 - res.start) >> PAGE_SHIFT);
+
+               if (PageReserved(pfn_to_page(start_pfn))) {
+                       /* this region's pages are not onlined yet */
+                       for (i = 0; i < nr_pages; i++) {
+                               struct page *page = pfn_to_page(start_pfn + i);
+                               online_page(page);
+                               onlined_pages++;
+                       }
+               }
+
+               res.start = res.end + 1;
+               res.end = section_end;
        }
        zone->present_pages += onlined_pages;
        zone->zone_pgdat->node_present_pages += onlined_pages;
@@ -163,3 +184,100 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
        vm_total_pages = nr_free_pagecache_pages();
        return 0;
 }
+
+static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+{
+       struct pglist_data *pgdat;
+       unsigned long zones_size[MAX_NR_ZONES] = {0};
+       unsigned long zholes_size[MAX_NR_ZONES] = {0};
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+
+       pgdat = arch_alloc_nodedata(nid);
+       if (!pgdat)
+               return NULL;
+
+       arch_refresh_nodedata(nid, pgdat);
+
+       /* we can use NODE_DATA(nid) from here */
+
+       /* init node's zones as empty zones; we don't have any present pages. */
+       free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
+
+       return pgdat;
+}
+
+static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
+{
+       arch_refresh_nodedata(nid, NULL);
+       arch_free_nodedata(pgdat);
+       return;
+}
+
+/* add this memory to iomem resource */
+static void register_memory_resource(u64 start, u64 size)
+{
+       struct resource *res;
+
+       res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+       BUG_ON(!res);
+
+       res->name = "System RAM";
+       res->start = start;
+       res->end = start + size - 1;
+       res->flags = IORESOURCE_MEM;
+       if (request_resource(&iomem_resource, res) < 0) {
+               printk("System RAM resource %llx - %llx cannot be added\n",
+               (unsigned long long)res->start, (unsigned long long)res->end);
+               kfree(res);
+       }
+}
+
+
+
+int add_memory(int nid, u64 start, u64 size)
+{
+       pg_data_t *pgdat = NULL;
+       int new_pgdat = 0;
+       int ret;
+
+       if (!node_online(nid)) {
+               pgdat = hotadd_new_pgdat(nid, start);
+               if (!pgdat)
+                       return -ENOMEM;
+               new_pgdat = 1;
+               ret = kswapd_run(nid);
+               if (ret)
+                       goto error;
+       }
+
+       /* call arch's memory hotadd */
+       ret = arch_add_memory(nid, start, size);
+
+       if (ret < 0)
+               goto error;
+
+       /* We online the node here; we can't roll back from this point. */
+       node_set_online(nid);
+
+       if (new_pgdat) {
+               ret = register_one_node(nid);
+               /*
+                * If the sysfs files for the new node can't be created,
+                * CPUs on that node can't be hot-added. There is no way
+                * to roll back at this point, so catch it with BUG_ON().
+                */
+               BUG_ON(ret);
+       }
+
+       /* register this memory as resource */
+       register_memory_resource(start, size);
+
+       return ret;
+error:
+       /* rollback pgdat allocation and others */
+       if (new_pgdat)
+               rollback_node_hotadd(nid, pgdat);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
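
add_memory() above allocates a pgdat for a previously offline node, starts
kswapd on it, calls the architecture hot-add hook, marks the node online,
registers it in sysfs and claims the range as a "System RAM" resource. A
sketch of a hypothetical caller (for example a probe path in a platform
memory hot-add driver); the nid/start/size values are illustrative:

        /* Hypothetical hot-add call site; not part of this patch. */
        static int example_probe_memory(int nid, u64 start, u64 size)
        {
                int ret = add_memory(nid, start, size);

                if (ret)
                        printk(KERN_ERR "memory hot-add at 0x%llx (%llu bytes) failed: %d\n",
                               (unsigned long long)start,
                               (unsigned long long)size, ret);
                return ret;
        }
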
index 8ccf6f1b1473c1e26e0da73e5f8f95c4b041def9..4ec7026c7bab14e1f4a9e512742e318a6630607f 100644 (file)
@@ -516,14 +516,14 @@ static void set_ratelimit(void)
                ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
 }
 
-static int
+static int __cpuinit
 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 {
        set_ratelimit();
        return 0;
 }
 
-static struct notifier_block ratelimit_nb = {
+static struct notifier_block __cpuinitdata ratelimit_nb = {
        .notifier_call  = ratelimit_handler,
        .next           = NULL,
 };
index 9f86191bb632955a224d94ddf6c53980c64e9244..084a2de7e52a8c8b4ceaa3c78997199d17f770e5 100644 (file)
@@ -446,8 +446,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 
        arch_free_page(page, order);
        if (!PageHighMem(page))
-               mutex_debug_check_no_locks_freed(page_address(page),
-                                                PAGE_SIZE<<order);
+               debug_check_no_locks_freed(page_address(page),
+                                          PAGE_SIZE<<order);
 
        for (i = 0 ; i < (1 << order) ; ++i)
                reserved += free_pages_check(page + i);
@@ -2009,7 +2009,7 @@ static inline void free_zone_pagesets(int cpu)
        }
 }
 
-static int pageset_cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
                unsigned long action,
                void *hcpu)
 {
@@ -2031,7 +2031,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb,
        return ret;
 }
 
-static struct notifier_block pageset_notifier =
+static struct notifier_block __cpuinitdata pageset_notifier =
        { &pageset_cpuup_callback, NULL, 0 };
 
 void __init setup_per_cpu_pageset(void)
index 98ac20bc0de9a3ad443ef9f8bd54cb1c58ed5de4..233e39d14caf5ae2966c9df788c9c73dc6a563e3 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -89,6 +89,7 @@
 #include       <linux/config.h>
 #include       <linux/slab.h>
 #include       <linux/mm.h>
+#include       <linux/poison.h>
 #include       <linux/swap.h>
 #include       <linux/cache.h>
 #include       <linux/interrupt.h>
 #include       <linux/nodemask.h>
 #include       <linux/mempolicy.h>
 #include       <linux/mutex.h>
+#include       <linux/rtmutex.h>
 
 #include       <asm/uaccess.h>
 #include       <asm/cacheflush.h>
@@ -492,17 +494,6 @@ struct kmem_cache {
 #endif
 
 #if DEBUG
-/*
- * Magic nums for obj red zoning.
- * Placed in the first word before and the first word after an obj.
- */
-#define        RED_INACTIVE    0x5A2CF071UL    /* when obj is inactive */
-#define        RED_ACTIVE      0x170FC2A5UL    /* when obj is active */
-
-/* ...and for poisoning */
-#define        POISON_INUSE    0x5a    /* for use-uninitialised poisoning */
-#define POISON_FREE    0x6b    /* for use-after-free poisoning */
-#define        POISON_END      0xa5    /* end-byte of poisoning */
 
 /*
  * memory layout of objects:
@@ -1083,7 +1074,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 
 #endif
 
-static int cpuup_callback(struct notifier_block *nfb,
+static int __devinit cpuup_callback(struct notifier_block *nfb,
                                    unsigned long action, void *hcpu)
 {
        long cpu = (long)hcpu;
@@ -1265,7 +1256,9 @@ bad:
        return NOTIFY_BAD;
 }
 
-static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
+static struct notifier_block __cpuinitdata cpucache_notifier = {
+       &cpuup_callback, NULL, 0
+};
 
 /*
  * swap the static kmem_list3 with kmalloced memory
@@ -3405,7 +3398,7 @@ void kfree(const void *objp)
        local_irq_save(flags);
        kfree_debugcheck(objp);
        c = virt_to_cache(objp);
-       mutex_debug_check_no_locks_freed(objp, obj_size(c));
+       debug_check_no_locks_freed(objp, obj_size(c));
        __cache_free(c, (void *)objp);
        local_irq_restore(flags);
 }
index e0a3fe48aa3745bebd710ff80d8cad6215dd244d..c7a2b3a0e46b2961fbc6fe6794e562be66def97e 100644 (file)
@@ -45,7 +45,7 @@ static struct mem_section *sparse_index_alloc(int nid)
 
 static int sparse_index_init(unsigned long section_nr, int nid)
 {
-       static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED;
+       static DEFINE_SPINLOCK(index_init_lock);
        unsigned long root = SECTION_NR_TO_ROOT(section_nr);
        struct mem_section *section;
        int ret = 0;
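
The sparse.c hunk above, and the ipv6, sunrpc and tipc hunks that follow,
replace the SPIN_LOCK_UNLOCKED/RW_LOCK_UNLOCKED initialisers with
DEFINE_SPINLOCK()/DEFINE_RWLOCK() for static locks, and with spin_lock_init()
for locks embedded in dynamically allocated or zeroed structures. A minimal
sketch of both forms, hypothetical names:

        #include <linux/spinlock.h>

        /* Static locks: initialise at compile time. */
        static DEFINE_SPINLOCK(example_lock);
        static DEFINE_RWLOCK(example_rwlock);

        /* Lock embedded in a runtime-allocated object: initialise explicitly. */
        struct example_obj {
                spinlock_t lock;
                int data;
        };

        static void example_obj_init(struct example_obj *obj)
        {
                spin_lock_init(&obj->lock);
                obj->data = 0;
        }
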
index 72babac71deaba28f9c75a1b6d2ccacb8ba537fa..eeacb0d695c35233e57688e4d20314a149d1d22c 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1223,7 +1224,6 @@ static int kswapd(void *p)
        };
        cpumask_t cpumask;
 
-       daemonize("kswapd%d", pgdat->node_id);
        cpumask = node_to_cpumask(pgdat->node_id);
        if (!cpus_empty(cpumask))
                set_cpus_allowed(tsk, cpumask);
@@ -1450,7 +1450,7 @@ out:
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
                                  unsigned long action, void *hcpu)
 {
        pg_data_t *pgdat;
@@ -1468,20 +1468,35 @@ static int cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+/*
+ * This kswapd start function is called at init time and on node hot-add.
+ * On node hot-add, kswapd is moved to the proper CPUs once they are hot-added.
+ */
+int kswapd_run(int nid)
+{
+       pg_data_t *pgdat = NODE_DATA(nid);
+       int ret = 0;
+
+       if (pgdat->kswapd)
+               return 0;
+
+       pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+       if (IS_ERR(pgdat->kswapd)) {
+               /* failure at boot is fatal */
+               BUG_ON(system_state == SYSTEM_BOOTING);
+               printk("Failed to start kswapd on node %d\n",nid);
+               ret = -1;
+       }
+       return ret;
+}
+
 static int __init kswapd_init(void)
 {
-       pg_data_t *pgdat;
+       int nid;
 
        swap_setup();
-       for_each_online_pgdat(pgdat) {
-               pid_t pid;
-
-               pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-               BUG_ON(pid < 0);
-               read_lock(&tasklist_lock);
-               pgdat->kswapd = find_task_by_pid(pid);
-               read_unlock(&tasklist_lock);
-       }
+       for_each_online_node(nid)
+               kswapd_run(nid);
        hotcpu_notifier(cpu_callback, 0);
        return 0;
 }
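
The vmscan.c hunks above replace the kernel_thread() + daemonize() +
find_task_by_pid() sequence with kthread_run(), which creates, names and
wakes the kswapd thread in one call and returns a task_struct pointer or an
ERR_PTR on failure. The same pattern for a hypothetical per-node worker:

        #include <linux/kthread.h>
        #include <linux/err.h>

        static int example_worker(void *data)
        {
                /* per-node work loop would live here */
                return 0;
        }

        /* Hypothetical starter mirroring the kswapd_run() conversion above. */
        static struct task_struct *example_start(int nid, void *data)
        {
                struct task_struct *task;

                task = kthread_run(example_worker, data, "example/%d", nid);
                return IS_ERR(task) ? NULL : task;
        }
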
index 8a777932786d7d4c0975fd6941489039abad3d06..e728980160d2243400ed49cd2124bd7dfec5bb34 100644 (file)
@@ -349,7 +349,7 @@ static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
            (strict & RT6_SELECT_F_REACHABLE) &&
            last && last != rt0) {
                /* no entries matched; do round-robin */
-               static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+               static DEFINE_SPINLOCK(lock);
                spin_lock(&lock);
                *head = rt0->u.next;
                rt0->u.next = last->u.next;
index f43311221a72b9a8b4f56cd177a33e5cd0296b41..2f312164d6d5611559aabab099e37ef8e55f7459 100644 (file)
@@ -70,7 +70,7 @@
 # define RPCDBG_FACILITY        RPCDBG_AUTH
 #endif
 
-spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(krb5_seq_lock);
 
 u32
 gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
index 54128040a1245ad9231ff00db3ebaf12644c273e..1bb75703f3848b8e50bdf57ad561d28f829e8132 100644 (file)
@@ -117,7 +117,7 @@ struct bclink {
 static struct bcbearer *bcbearer = NULL;
 static struct bclink *bclink = NULL;
 static struct link *bcl = NULL;
-static spinlock_t bc_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bc_lock);
 
 char tipc_bclink_name[] = "multicast-link";
 
@@ -796,7 +796,7 @@ int tipc_bclink_init(void)
        memset(bclink, 0, sizeof(struct bclink));
        INIT_LIST_HEAD(&bcl->waiting_ports);
        bcl->next_out_no = 1;
-       bclink->node.lock =  SPIN_LOCK_UNLOCKED;        
+       spin_lock_init(&bclink->node.lock);
        bcl->owner = &bclink->node;
         bcl->max_pkt = MAX_PKT_DEFAULT_MCAST;
        tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
index 4fa24b5e8914a559d3cc025731355bb0664e856c..7ef17a449cfdb33d696c19428430ab891c4c316c 100644 (file)
@@ -566,7 +566,7 @@ restart:
                b_ptr->link_req = tipc_disc_init_link_req(b_ptr, &m_ptr->bcast_addr,
                                                          bcast_scope, 2);
        }
-       b_ptr->publ.lock = SPIN_LOCK_UNLOCKED;
+       spin_lock_init(&b_ptr->publ.lock);
        write_unlock_bh(&tipc_net_lock);
        info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
             name, addr_string_fill(addr_string, bcast_scope), priority);
index 3ec502fac8c34b29a0ed66253341e35dfb60a83c..285e1bc2d8808502f53e952eed76a1dd72308534 100644 (file)
@@ -63,7 +63,7 @@ struct manager {
 
 static struct manager mng = { 0};
 
-static spinlock_t config_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(config_lock);
 
 static const void *req_tlv_area;       /* request message TLV area */
 static int req_tlv_space;              /* request message TLV area size */
index 26ef95d5fe38601611e6d95a4d6248276235b4cb..55130655e1edbe306a484f552c2a20080496cb15 100644 (file)
@@ -41,7 +41,7 @@
 #define MAX_STRING 512
 
 static char print_string[MAX_STRING];
-static spinlock_t print_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(print_lock);
 
 static struct print_buf cons_buf = { NULL, 0, NULL, NULL };
 struct print_buf *TIPC_CONS = &cons_buf;
index 966f70a1b60800012c14a09e7377eeea2414c0d0..ae6ddf00a1aaea8aaaa17d708e4d4eaca4bfe059 100644 (file)
@@ -44,7 +44,7 @@ struct queue_item {
 
 static kmem_cache_t *tipc_queue_item_cache;
 static struct list_head signal_queue_head;
-static spinlock_t qitem_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(qitem_lock);
 static int handler_enabled = 0;
 
 static void process_signal_queue(unsigned long dummy);
index 38571306aba5ab0bb60a86be10a0c8202b28f40a..a6926ff07bcc3e2365d0424e30b721df1e455d21 100644 (file)
@@ -101,7 +101,7 @@ struct name_table {
 
 static struct name_table table = { NULL } ;
 static atomic_t rsv_publ_ok = ATOMIC_INIT(0);
-rwlock_t tipc_nametbl_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(tipc_nametbl_lock);
 
 
 static int hash(int x)
@@ -172,7 +172,7 @@ static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_hea
        }
 
        memset(nseq, 0, sizeof(*nseq));
-       nseq->lock = SPIN_LOCK_UNLOCKED;
+       spin_lock_init(&nseq->lock);
        nseq->type = type;
        nseq->sseqs = sseq;
        dbg("tipc_nameseq_create(): nseq = %p, type %u, ssseqs %p, ff: %u\n",
index f7c8223ddf7dd0f3e9ce9c148e2cca1d1915cb3f..e5a359ab49308025f10a1c065cfffa23e7991a5f 100644 (file)
  *     - A local spin_lock protecting the queue of subscriber events.
 */
 
-rwlock_t tipc_net_lock = RW_LOCK_UNLOCKED;
+DEFINE_RWLOCK(tipc_net_lock);
 struct network tipc_net = { NULL };
 
 struct node *tipc_net_select_remote_node(u32 addr, u32 ref) 
index ce9678efa98a5822b5e205d879fb158b3f8ddb44..861322b935daf1be3aee4ac4e5eca284c4a7c89a 100644 (file)
@@ -77,7 +77,7 @@ struct node *tipc_node_create(u32 addr)
                
        memset(n_ptr, 0, sizeof(*n_ptr));
        n_ptr->addr = addr;
-       n_ptr->lock =  SPIN_LOCK_UNLOCKED;      
+       spin_lock_init(&n_ptr->lock);
        INIT_LIST_HEAD(&n_ptr->nsub);
        n_ptr->owner = c_ptr;
        tipc_cltr_attach_node(c_ptr, n_ptr);
index 47d97404e3ee06a3de6d9f2721af8211a78fd8d3..3251c8d8e53c3bcc401a13e53d6636d410b6b5d0 100644 (file)
@@ -57,8 +57,8 @@
 static struct sk_buff *msg_queue_head = NULL;
 static struct sk_buff *msg_queue_tail = NULL;
 
-spinlock_t tipc_port_list_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t queue_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(tipc_port_list_lock);
+static DEFINE_SPINLOCK(queue_lock);
 
 static LIST_HEAD(ports);
 static void port_handle_node_down(unsigned long ref);
index d2f0cce10e2046c0e9693b113c6e234b30792f1a..596d3c8ff75006a95545e9f1a9407bd5e6b9d062 100644 (file)
@@ -63,7 +63,7 @@
 
 struct ref_table tipc_ref_table = { NULL };
 
-static rwlock_t ref_table_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(ref_table_lock);
 
 /**
  * tipc_ref_table_init - create reference table for objects
@@ -87,7 +87,7 @@ int tipc_ref_table_init(u32 requested_size, u32 start)
        index_mask = sz - 1;
        for (i = sz - 1; i >= 0; i--) {
                table[i].object = NULL;
-               table[i].lock = SPIN_LOCK_UNLOCKED;
+               spin_lock_init(&table[i].lock);
                table[i].data.next_plus_upper = (start & ~index_mask) + i - 1;
        }
        tipc_ref_table.entries = table;
index fc171875660c5eda665c7674509ab9dc72fac057..e19b4bcd67ec2bf44b6a2e7f4186751984782c4b 100644 (file)
@@ -457,7 +457,7 @@ int tipc_subscr_start(void)
        int res = -1;
 
        memset(&topsrv, 0, sizeof (topsrv));
-       topsrv.lock = SPIN_LOCK_UNLOCKED;
+       spin_lock_init(&topsrv.lock);
        INIT_LIST_HEAD(&topsrv.subscriber_list);
 
        spin_lock_bh(&topsrv.lock);
index 3f3f933976e9dc27603cff2deec0b74150faebc4..1e3ae57c722872f2e2d74d28891ecb39d53d0f21 100644 (file)
@@ -67,7 +67,7 @@ struct tipc_user {
 
 static struct tipc_user *users = NULL;
 static u32 next_free_user = MAX_USERID + 1;
-static spinlock_t reg_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(reg_lock);
 
 /**
  * reg_init - create TIPC user registry (but don't activate it)
index ac5f275b0283f1f9807fcc281c3a443e30955eda..b0d067be739056089fbb09a357bbacc5c0daf6b4 100644 (file)
@@ -12,11 +12,6 @@ space   := $(empty) $(empty)
 # contain a comma
 depfile = $(subst $(comma),_,$(@D)/.$(@F).d)
 
-###
-# basetarget equals the filename of the target with no extension.
-# So 'foo/bar.o' becomes 'bar'
-basetarget = $(basename $(notdir $@))
-
 ###
 # Escape single quote for use in echo statements
 escsq = $(subst $(squote),'\$(squote)',$1)
index 3cb445cc7432fdac293fcea287a62e437fee921f..02a7eea5fdbcb77221eca244d264d33c1eada5f5 100644 (file)
@@ -117,7 +117,7 @@ $(real-objs-m:.o=.lst): quiet_modtag := [M]
 $(obj-m)              : quiet_modtag := [M]
 
 # Default for not multi-part modules
-modname = $(basetarget)
+modname = $(*F)
 
 $(multi-objs-m)         : modname = $(modname-multi)
 $(multi-objs-m:.o=.i)   : modname = $(modname-multi)
index 18ecd4d5df7fe4ae76352cc926605cc304040958..2b066d12af2c30fe65c51a0b204cdfb6ac494f2b 100644 (file)
@@ -80,10 +80,8 @@ obj-dirs += $(host-objdirs)
 #####
 # Handle options to gcc. Support building with separate output directory
 
-_hostc_flags   = $(HOSTCFLAGS)   $(HOST_EXTRACFLAGS)   \
-                 $(HOSTCFLAGS_$(basetarget).o)
-_hostcxx_flags = $(HOSTCXXFLAGS) $(HOST_EXTRACXXFLAGS) \
-                 $(HOSTCXXFLAGS_$(basetarget).o)
+_hostc_flags   = $(HOSTCFLAGS)   $(HOST_EXTRACFLAGS)   $(HOSTCFLAGS_$(*F).o)
+_hostcxx_flags = $(HOSTCXXFLAGS) $(HOST_EXTRACXXFLAGS) $(HOSTCXXFLAGS_$(*F).o)
 
 ifeq ($(KBUILD_SRC),)
 __hostc_flags  = $(_hostc_flags)
index fc498fee68edefe6c88afc2d546e0e459b9396d9..2cb4935e85d1b0a8d3638783a8bd964489f984c1 100644 (file)
@@ -82,12 +82,12 @@ obj-dirs    := $(addprefix $(obj)/,$(obj-dirs))
 #       than one module. In that case KBUILD_MODNAME will be set to foo_bar,
 #       where foo and bar are the name of the modules.
 name-fix = $(subst $(comma),_,$(subst -,_,$1))
-basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
+basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(*F)))"
 modname_flags  = $(if $(filter 1,$(words $(modname))),\
                  -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
 
-_c_flags       = $(CFLAGS) $(EXTRA_CFLAGS) $(CFLAGS_$(basetarget).o)
-_a_flags       = $(AFLAGS) $(EXTRA_AFLAGS) $(AFLAGS_$(basetarget).o)
+_c_flags       = $(CFLAGS) $(EXTRA_CFLAGS) $(CFLAGS_$(*F).o)
+_a_flags       = $(AFLAGS) $(EXTRA_AFLAGS) $(AFLAGS_$(*F).o)
 _cpp_flags     = $(CPPFLAGS) $(EXTRA_CPPFLAGS) $(CPPFLAGS_$(@F))
 
 # If building the kernel in a separate objtree expand all occurrences
index e83613e0e82726b66541f0c28ace5232d07b365a..576cce5e387f4472bc31a20fcfca2e08fcea6fdd 100644 (file)
@@ -72,7 +72,7 @@ $(modules:.ko=.mod.c): __modpost ;
 # Step 5), compile all *.mod.c files
 
 # modname is set to make c_flags define KBUILD_MODNAME
-modname = $(basetarget)
+modname = $(*F)
 
 quiet_cmd_cc_o_c = CC      $@
       cmd_cc_o_c = $(CC) $(c_flags) $(CFLAGS_MODULE)   \
diff --git a/scripts/rt-tester/check-all.sh b/scripts/rt-tester/check-all.sh
new file mode 100644 (file)
index 0000000..43098af
--- /dev/null
@@ -0,0 +1,22 @@
+
+
+function testit ()
+{
+ printf "%-30s: " $1
+ ./rt-tester.py $1 | grep Pass
+}
+
+testit t2-l1-2rt-sameprio.tst
+testit t2-l1-pi.tst
+testit t2-l1-signal.tst
+#testit t2-l2-2rt-deadlock.tst
+testit t3-l1-pi-1rt.tst
+testit t3-l1-pi-2rt.tst
+testit t3-l1-pi-3rt.tst
+testit t3-l1-pi-signal.tst
+testit t3-l1-pi-steal.tst
+testit t3-l2-pi.tst
+testit t4-l2-pi-deboost.tst
+testit t5-l4-pi-boost-deboost.tst
+testit t5-l4-pi-boost-deboost-setsched.tst
+
diff --git a/scripts/rt-tester/rt-tester.py b/scripts/rt-tester/rt-tester.py
new file mode 100644 (file)
index 0000000..4c79660
--- /dev/null
@@ -0,0 +1,222 @@
+#!/usr/bin/env python
+#
+# rt-mutex tester
+#
+# (C) 2006 Thomas Gleixner <tglx@linutronix.de>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+import os
+import sys
+import getopt
+import shutil
+import string
+
+# Globals
+quiet = 0
+test = 0
+comments = 0
+
+sysfsprefix = "/sys/devices/system/rttest/rttest"
+statusfile = "/status"
+commandfile = "/command"
+
+# Command opcodes
+cmd_opcodes = {
+    "schedother"    : "1",
+    "schedfifo"     : "2",
+    "lock"          : "3",
+    "locknowait"    : "4",
+    "lockint"       : "5",
+    "lockintnowait" : "6",
+    "lockcont"      : "7",
+    "unlock"        : "8",
+    "lockbkl"       : "9",
+    "unlockbkl"     : "10",
+    "signal"        : "11",
+    "resetevent"    : "98",
+    "reset"         : "99",
+    }
+
+test_opcodes = {
+    "prioeq"        : ["P" , "eq" , None],
+    "priolt"        : ["P" , "lt" , None],
+    "priogt"        : ["P" , "gt" , None],
+    "nprioeq"       : ["N" , "eq" , None],
+    "npriolt"       : ["N" , "lt" , None],
+    "npriogt"       : ["N" , "gt" , None],
+    "unlocked"      : ["M" , "eq" , 0],
+    "trylock"       : ["M" , "eq" , 1],
+    "blocked"       : ["M" , "eq" , 2],
+    "blockedwake"   : ["M" , "eq" , 3],
+    "locked"        : ["M" , "eq" , 4],
+    "opcodeeq"      : ["O" , "eq" , None],
+    "opcodelt"      : ["O" , "lt" , None],
+    "opcodegt"      : ["O" , "gt" , None],
+    "eventeq"       : ["E" , "eq" , None],
+    "eventlt"       : ["E" , "lt" , None],
+    "eventgt"       : ["E" , "gt" , None],
+    }
+
+# Print usage information
+def usage():
+    print "rt-tester.py <-c -h -q -t> <testfile>"
+    print " -c    display comments after first command"
+    print " -h    help"
+    print " -q    quiet mode"
+    print " -t    test mode (syntax check)"
+    print " testfile: read test specification from testfile"
+    print " otherwise from stdin"
+    return
+
+# Print progress when not in quiet mode
+def progress(str):
+    if not quiet:
+        print str
+
+# Analyse a status value
+def analyse(val, top, arg):
+
+    intval = int(val)
+
+    if top[0] == "M":
+        intval = intval / (10 ** int(arg))
+       intval = intval % 10
+        argval = top[2]
+    elif top[0] == "O":
+        argval = int(cmd_opcodes.get(arg, arg))
+    else:
+        argval = int(arg)
+
+    # progress("%d %s %d" %(intval, top[1], argval))
+
+    if top[1] == "eq" and intval == argval:
+       return 1
+    if top[1] == "lt" and intval < argval:
+        return 1
+    if top[1] == "gt" and intval > argval:
+       return 1
+    return 0
+
+# Parse the commandline
+try:
+    (options, arguments) = getopt.getopt(sys.argv[1:],'chqt')
+except getopt.GetoptError, ex:
+    usage()
+    sys.exit(1)
+
+# Parse commandline options
+for option, value in options:
+    if option == "-c":
+        comments = 1
+    elif option == "-q":
+        quiet = 1
+    elif option == "-t":
+        test = 1
+    elif option == '-h':
+        usage()
+        sys.exit(0)
+
+# Select the input source
+if arguments:
+    try:
+        fd = open(arguments[0])
+    except Exception,ex:
+        sys.stderr.write("File not found %s\n" %(arguments[0]))
+        sys.exit(1)
+else:
+    fd = sys.stdin
+
+linenr = 0
+
+# Read the test patterns
+while 1:
+
+    linenr = linenr + 1
+    line = fd.readline()
+    if not len(line):
+        break
+
+    line = line.strip()
+    parts = line.split(":")
+
+    if not parts or len(parts) < 1:
+        continue
+
+    if len(parts[0]) == 0:
+        continue
+
+    if parts[0].startswith("#"):
+       if comments > 1:
+           progress(line)
+       continue
+
+    if comments == 1:
+       comments = 2
+
+    progress(line)
+
+    cmd = parts[0].strip().lower()
+    opc = parts[1].strip().lower()
+    tid = parts[2].strip()
+    dat = parts[3].strip()
+
+    try:
+        # Test or wait for a status value
+        if cmd == "t" or cmd == "w":
+            testop = test_opcodes[opc]
+
+            fname = "%s%s%s" %(sysfsprefix, tid, statusfile)
+            if test:
+               print fname
+                continue
+
+            while 1:
+                query = 1
+                fsta = open(fname, 'r')
+                status = fsta.readline().strip()
+                fsta.close()
+                stat = status.split(",")
+                for s in stat:
+                   s = s.strip()
+                    if s.startswith(testop[0]):
+                        # Separate status value
+                        val = s[2:].strip()
+                        query = analyse(val, testop, dat)
+                        break
+                if query or cmd == "t":
+                    break
+
+            progress("   " + status)
+
+            if not query:
+                sys.stderr.write("Test failed in line %d\n" %(linenr))
+               sys.exit(1)
+
+        # Issue a command to the tester
+        elif cmd == "c":
+            cmdnr = cmd_opcodes[opc]
+            # Build command string and sys filename
+            cmdstr = "%s:%s" %(cmdnr, dat)
+            fname = "%s%s%s" %(sysfsprefix, tid, commandfile)
+            if test:
+               print fname
+                continue
+            fcmd = open(fname, 'w')
+            fcmd.write(cmdstr)
+            fcmd.close()
+
+    except Exception,ex:
+       sys.stderr.write(str(ex))
+        sys.stderr.write("\nSyntax error in line %d\n" %(linenr))
+        if not test:
+            fd.close()
+            sys.exit(1)
+
+# Normal exit pass
+print "Pass"
+sys.exit(0)
+
+
diff --git a/scripts/rt-tester/t2-l1-2rt-sameprio.tst b/scripts/rt-tester/t2-l1-2rt-sameprio.tst
new file mode 100644 (file)
index 0000000..8821f27
--- /dev/null
@@ -0,0 +1,99 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       0
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    don't care
+# blockedbkl   don't care
+# unlockedbkl  don't care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 2 threads 1 lock
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedfifo:          0:      80
+C: schedfifo:          1:      80
+
+# T0 lock L0
+C: locknowait:         0:      0
+C: locknowait:         1:      0
+W: locked:             0:      0
+W: blocked:            1:      0
+T: prioeq:             0:      80
+
+# T0 unlock L0
+C: unlock:             0:      0
+W: locked:             1:      0
+
+# Verify T0
+W: unlocked:           0:      0
+T: prioeq:             0:      80
+
+# Unlock
+C: unlock:             1:      0
+W: unlocked:           1:      0
+
+# T1,T0 lock L0
+C: locknowait:         1:      0
+C: locknowait:         0:      0
+W: locked:             1:      0
+W: blocked:            0:      0
+T: prioeq:             1:      80
+
+# T1 unlock L0
+C: unlock:             1:      0
+W: locked:             0:      0
+
+# Verify T1
+W: unlocked:           1:      0
+T: prioeq:             1:      80
+
+# Unlock and exit
+C: unlock:             0:      0
+W: unlocked:           0:      0
+
diff --git a/scripts/rt-tester/t2-l1-pi.tst b/scripts/rt-tester/t2-l1-pi.tst
new file mode 100644 (file)
index 0000000..cde1f18
--- /dev/null
@@ -0,0 +1,82 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       0
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    don't care
+# blockedbkl   don't care
+# unlockedbkl  don't care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 2 threads 1 lock with priority inversion
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedfifo:          1:      80
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L0
+C: locknowait:         1:      0
+W: blocked:            1:      0
+T: prioeq:             0:      80
+
+# T0 unlock L0
+C: unlock:             0:      0
+W: locked:             1:      0
+
+# Verify T1
+W: unlocked:           0:      0
+T: priolt:             0:      1
+
+# Unlock and exit
+C: unlock:             1:      0
+W: unlocked:           1:      0
+
diff --git a/scripts/rt-tester/t2-l1-signal.tst b/scripts/rt-tester/t2-l1-signal.tst
new file mode 100644 (file)
index 0000000..3ab0bfc
--- /dev/null
@@ -0,0 +1,77 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       0
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    don't care
+# blockedbkl   don't care
+# unlockedbkl  don't care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 2 threads 1 lock with priority inversion
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedother:         1:      0
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L0
+C: lockintnowait:      1:      0
+W: blocked:            1:      0
+
+# Interrupt T1
+C: signal:             1:      0
+W: unlocked:           1:      0
+T: opcodeeq:           1:      -4
+
+# Unlock and exit
+C: unlock:             0:      0
+W: unlocked:           0:      0
diff --git a/scripts/rt-tester/t2-l2-2rt-deadlock.tst b/scripts/rt-tester/t2-l2-2rt-deadlock.tst
new file mode 100644 (file)
index 0000000..f4b5d5d
--- /dev/null
@@ -0,0 +1,89 @@
+#
+# RT-Mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       0
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    don't care
+# blockedbkl   don't care
+# unlockedbkl  don't care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 2 threads 2 lock
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedfifo:          0:      80
+C: schedfifo:          1:      80
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L1
+C: locknowait:         1:      1
+W: locked:             1:      1
+
+# T0 lock L1
+C: lockintnowait:      0:      1
+W: blocked:            0:      1
+
+# T1 lock L0
+C: lockintnowait:      1:      0
+W: blocked:            1:      0
+
+# Make deadlock go away
+C: signal:             1:      0
+W: unlocked:           1:      0
+C: signal:             0:      0
+W: unlocked:           0:      1
+
+# Unlock and exit
+C: unlock:             0:      0
+W: unlocked:           0:      0
+C: unlock:             1:      1
+W: unlocked:           1:      1
+
diff --git a/scripts/rt-tester/t3-l1-pi-1rt.tst b/scripts/rt-tester/t3-l1-pi-1rt.tst
new file mode 100644 (file)
index 0000000..63440ca
--- /dev/null
@@ -0,0 +1,92 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    don't care
+# blockedbkl   don't care
+# unlockedbkl  don't care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedother:         1:      0
+C: schedfifo:          2:      82
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L0
+C: locknowait:         1:      0
+W: blocked:            1:      0
+T: priolt:             0:      1
+
+# T2 lock L0
+C: locknowait:         2:      0
+W: blocked:            2:      0
+T: prioeq:             0:      82
+
+# T0 unlock L0
+C: unlock:             0:      0
+
+# Wait until T2 got the lock
+W: locked:             2:      0
+W: unlocked:           0:      0
+T: priolt:             0:      1
+
+# T2 unlock L0
+C: unlock:             2:      0
+
+W: unlocked:           2:      0
+W: locked:             1:      0
+
+C: unlock:             1:      0
+W: unlocked:           1:      0
diff --git a/scripts/rt-tester/t3-l1-pi-2rt.tst b/scripts/rt-tester/t3-l1-pi-2rt.tst
new file mode 100644 (file)
index 0000000..e5816fe
--- /dev/null
@@ -0,0 +1,93 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    don't care
+# blockedbkl   don't care
+# unlockedbkl  don't care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedfifo:          1:      81
+C: schedfifo:          2:      82
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L0
+C: locknowait:         1:      0
+W: blocked:            1:      0
+T: prioeq:             0:      81
+
+# T2 lock L0
+C: locknowait:         2:      0
+W: blocked:            2:      0
+T: prioeq:             0:      82
+T: prioeq:             1:      81
+
+# T0 unlock L0
+C: unlock:             0:      0
+
+# Wait until T2 got the lock
+W: locked:             2:      0
+W: unlocked:           0:      0
+T: priolt:             0:      1
+
+# T2 unlock L0
+C: unlock:             2:      0
+
+W: unlocked:           2:      0
+W: locked:             1:      0
+
+C: unlock:             1:      0
+W: unlocked:           1:      0
diff --git a/scripts/rt-tester/t3-l1-pi-3rt.tst b/scripts/rt-tester/t3-l1-pi-3rt.tst
new file mode 100644 (file)
index 0000000..718b82b
--- /dev/null
@@ -0,0 +1,92 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    don't care
+# blockedbkl   don't care
+# unlockedbkl  don't care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 3 threads 1 lock PI
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedfifo:          0:      80
+C: schedfifo:          1:      81
+C: schedfifo:          2:      82
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L0
+C: locknowait:         1:      0
+W: blocked:            1:      0
+T: prioeq:             0:      81
+
+# T2 lock L0
+C: locknowait:         2:      0
+W: blocked:            2:      0
+T: prioeq:             0:      82
+
+# T0 unlock L0
+C: unlock:             0:      0
+
+# Wait until T2 got the lock
+W: locked:             2:      0
+W: unlocked:           0:      0
+T: prioeq:             0:      80
+
+# T2 unlock L0
+C: unlock:             2:      0
+
+W: locked:             1:      0
+W: unlocked:           2:      0
+
+C: unlock:             1:      0
+W: unlocked:           1:      0
diff --git a/scripts/rt-tester/t3-l1-pi-signal.tst b/scripts/rt-tester/t3-l1-pi-signal.tst
new file mode 100644 (file)
index 0000000..c6e2135
--- /dev/null
@@ -0,0 +1,98 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    don't care
+# blockedbkl   don't care
+# unlockedbkl  don't care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+# Reset event counter
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set priorities
+C: schedother:         0:      0
+C: schedfifo:          1:      80
+C: schedfifo:          2:      81
+
+# T0 lock L0
+C: lock:               0:      0
+W: locked:             0:      0
+
+# T1 lock L0, no wait in the wakeup path
+C: locknowait:         1:      0
+W: blocked:            1:      0
+T: prioeq:             0:      80
+T: prioeq:             1:      80
+
+# T2 lock L0 interruptible, no wait in the wakeup path
+C: lockintnowait:      2:      0
+W: blocked:            2:      0
+T: prioeq:             0:      81
+T: prioeq:             1:      80
+
+# Interrupt T2
+C: signal:             2:      2
+W: unlocked:           2:      0
+T: prioeq:             1:      80
+T: prioeq:             0:      80
+
+T: locked:             0:      0
+T: blocked:            1:      0
+
+# T0 unlock L0
+C: unlock:             0:      0
+
+# Wait until T1 has locked L0 and exit
+W: locked:             1:      0
+W: unlocked:           0:      0
+T: priolt:             0:      1
+
+C: unlock:             1:      0
+W: unlocked:           1:      0
+
+
+
diff --git a/scripts/rt-tester/t3-l1-pi-steal.tst b/scripts/rt-tester/t3-l1-pi-steal.tst
new file mode 100644 (file)
index 0000000..f53749d
--- /dev/null
@@ -0,0 +1,96 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    dont care
+# blockedbkl   dont care
+# unlockedbkl  dont care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 3 threads 1 lock PI steal pending ownership
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedfifo:          1:      80
+C: schedfifo:          2:      81
+
+# T0 lock L0
+C: lock:               0:      0
+W: locked:             0:      0
+
+# T1 lock L0
+C: lock:               1:      0
+W: blocked:            1:      0
+T: prioeq:             0:      80
+
+# T0 unlock L0
+C: unlock:             0:      0
+
+# Wait until T1 is in the wakeup loop
+W: blockedwake:                1:      0
+T: priolt:             0:      1
+
+# T2 lock L0
+C: lock:               2:      0
+# T1 leaves the wakeup loop
+C: lockcont:           1:      0
+
+# T2 must have the lock and T1 must be blocked
+W: locked:             2:      0
+W: blocked:            1:      0
+
+# T2 unlock L0
+C: unlock:             2:      0
+
+# Wait until T1 is in the wakeup loop and let it run
+W: blockedwake:                1:      0
+C: lockcont:           1:      0
+W: locked:             1:      0
+C: unlock:             1:      0
+W: unlocked:           1:      0
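
The scenario above exercises the "pending ownership" rule: after T0's unlock, T1 (fifo 80) is only a pending owner until it runs lockcont, and the higher-priority T2 (fifo 81) may take the lock away in the meantime. A minimal sketch of that decision follows; can_steal is a made-up helper for illustration, not the kernel's rt-mutex code, and it uses the script's convention that a larger SCHED_FIFO number means higher priority.

#include <assert.h>

/* Illustrative helper: a contender may take a lock from a pending owner
 * only when it has higher priority, as in the script above (81 vs 80). */
static int can_steal(int contender_prio, int pending_owner_prio)
{
        return contender_prio > pending_owner_prio;
}

int main(void)
{
        assert(can_steal(81, 80));      /* T2 (81) steals from pending owner T1 (80) */
        assert(!can_steal(80, 80));     /* equal priority: ownership stays with T1 */
        return 0;
}
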
diff --git a/scripts/rt-tester/t3-l2-pi.tst b/scripts/rt-tester/t3-l2-pi.tst
new file mode 100644 (file)
index 0000000..cdc3e4f
--- /dev/null
@@ -0,0 +1,92 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    dont care
+# blockedbkl   dont care
+# unlockedbkl  dont care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 3 threads 2 lock PI
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedother:         1:      0
+C: schedfifo:          2:      82
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L0
+C: locknowait:         1:      0
+W: blocked:            1:      0
+T: priolt:             0:      1
+
+# T2 lock L0
+C: locknowait:         2:      0
+W: blocked:            2:      0
+T: prioeq:             0:      82
+
+# T0 unlock L0
+C: unlock:             0:      0
+
+# Wait until T2 got the lock
+W: locked:             2:      0
+W: unlocked:           0:      0
+T: priolt:             0:      1
+
+# T2 unlock L0
+C: unlock:             2:      0
+
+W: unlocked:           2:      0
+W: locked:             1:      0
+
+C: unlock:             1:      0
+W: unlocked:           1:      0
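
The prioeq/priolt checks in this script encode the basic priority-inheritance rule: while the SCHED_OTHER thread T0 owns the contended lock, it runs at the priority of its highest-priority waiter (82), and it drops back below priority 1 once it has released the lock. A rough model of that rule is sketched below; the names are invented, SCHED_OTHER is represented as priority 0, and a higher number means higher priority.

#include <assert.h>
#include <stdio.h>

struct task {
        int own_prio;   /* priority the task asked for (0 here for SCHED_OTHER) */
        int eff_prio;   /* effective, possibly boosted, priority */
};

/* Recompute the owner's effective priority from its current waiters.
 * pi_adjust() is an illustrative helper, not the kernel implementation. */
static void pi_adjust(struct task *owner, const int *waiter_prios, int nwaiters)
{
        int prio = owner->own_prio;
        int i;

        for (i = 0; i < nwaiters; i++)
                if (waiter_prios[i] > prio)
                        prio = waiter_prios[i];         /* boost to the top waiter */
        owner->eff_prio = prio;                         /* own_prio again once no waiters remain */
}

int main(void)
{
        struct task t0 = { 0, 0 };              /* T0: SCHED_OTHER owner of L0 */
        int waiters[] = { 0, 82 };              /* T1 (SCHED_OTHER), T2 (fifo 82) */

        pi_adjust(&t0, waiters, 2);
        assert(t0.eff_prio == 82);              /* matches "T: prioeq: 0: 82" */

        pi_adjust(&t0, NULL, 0);                /* T0 has unlocked L0 */
        assert(t0.eff_prio == 0);               /* matches "T: priolt: 0: 1" */
        printf("boost/deboost as expected\n");
        return 0;
}
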
diff --git a/scripts/rt-tester/t4-l2-pi-deboost.tst b/scripts/rt-tester/t4-l2-pi-deboost.tst
new file mode 100644 (file)
index 0000000..baa1413
--- /dev/null
@@ -0,0 +1,123 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    dont care
+# blockedbkl   dont care
+# unlockedbkl  dont care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 4 threads 2 lock PI
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedother:         1:      0
+C: schedfifo:          2:      82
+C: schedfifo:          3:      83
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L1
+C: locknowait:         1:      1
+W: locked:             1:      1
+
+# T3 lock L0
+C: lockintnowait:      3:      0
+W: blocked:            3:      0
+T: prioeq:             0:      83
+
+# T0 lock L1
+C: lock:               0:      1
+W: blocked:            0:      1
+T: prioeq:             1:      83
+
+# T1 unlock L1
+C: unlock:             1:      1
+
+# Wait until T0 is in the wakeup code
+W: blockedwake:                0:      1
+
+# Verify that T1 is unboosted
+W: unlocked:           1:      1
+T: priolt:             1:      1
+
+# T2 lock L1 (T0 is boosted and pending owner !)
+C: locknowait:         2:      1
+W: blocked:            2:      1
+T: prioeq:             0:      83
+
+# Interrupt T3 and wait until T3 returned
+C: signal:             3:      0
+W: unlocked:           3:      0
+
+# Verify prio of T0 (still pending owner,
+# but T2 is enqueued due to the previous boost by T3)
+T: prioeq:             0:      82
+
+# Let T0 continue
+C: lockcont:           0:      1
+W: locked:             0:      1
+
+# Unlock L1 and let T2 get L1
+C: unlock:             0:      1
+W: locked:             2:      1
+
+# Verify that T0 is unboosted
+W: unlocked:           0:      1
+T: priolt:             0:      1
+
+# Unlock everything and exit
+C: unlock:             2:      1
+W: unlocked:           2:      1
+
+C: unlock:             0:      0
+W: unlocked:           0:      0
+
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost-setsched.tst
new file mode 100644 (file)
index 0000000..e6ec0c8
--- /dev/null
@@ -0,0 +1,183 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    dont care
+# blockedbkl   dont care
+# unlockedbkl  dont care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 5 threads 4 lock PI - modify priority of blocked threads
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedfifo:          1:      81
+C: schedfifo:          2:      82
+C: schedfifo:          3:      83
+C: schedfifo:          4:      84
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L1
+C: locknowait:         1:      1
+W: locked:             1:      1
+
+# T1 lock L0
+C: lockintnowait:      1:      0
+W: blocked:            1:      0
+T: prioeq:             0:      81
+
+# T2 lock L2
+C: locknowait:         2:      2
+W: locked:             2:      2
+
+# T2 lock L1
+C: lockintnowait:      2:      1
+W: blocked:            2:      1
+T: prioeq:             0:      82
+T: prioeq:             1:      82
+
+# T3 lock L3
+C: locknowait:         3:      3
+W: locked:             3:      3
+
+# T3 lock L2
+C: lockintnowait:      3:      2
+W: blocked:            3:      2
+T: prioeq:             0:      83
+T: prioeq:             1:      83
+T: prioeq:             2:      83
+
+# T4 lock L3
+C: lockintnowait:      4:      3
+W: blocked:            4:      3
+T: prioeq:             0:      84
+T: prioeq:             1:      84
+T: prioeq:             2:      84
+T: prioeq:             3:      84
+
+# Reduce prio of T4
+C: schedfifo:          4:      80
+T: prioeq:             0:      83
+T: prioeq:             1:      83
+T: prioeq:             2:      83
+T: prioeq:             3:      83
+T: prioeq:             4:      80
+
+# Increase prio of T4
+C: schedfifo:          4:      84
+T: prioeq:             0:      84
+T: prioeq:             1:      84
+T: prioeq:             2:      84
+T: prioeq:             3:      84
+T: prioeq:             4:      84
+
+# Reduce prio of T3
+C: schedfifo:          3:      80
+T: prioeq:             0:      84
+T: prioeq:             1:      84
+T: prioeq:             2:      84
+T: prioeq:             3:      84
+T: prioeq:             4:      84
+
+# Increase prio of T3
+C: schedfifo:          3:      85
+T: prioeq:             0:      85
+T: prioeq:             1:      85
+T: prioeq:             2:      85
+T: prioeq:             3:      85
+T: prioeq:             4:      84
+
+# Reduce prio of T3
+C: schedfifo:          3:      83
+T: prioeq:             0:      84
+T: prioeq:             1:      84
+T: prioeq:             2:      84
+T: prioeq:             3:      84
+T: prioeq:             4:      84
+
+# Signal T4
+C: signal:             4:      0
+W: unlocked:           4:      3
+T: prioeq:             0:      83
+T: prioeq:             1:      83
+T: prioeq:             2:      83
+T: prioeq:             3:      83
+
+# Signal T3
+C: signal:             3:      0
+W: unlocked:           3:      2
+T: prioeq:             0:      82
+T: prioeq:             1:      82
+T: prioeq:             2:      82
+
+# Signal T2
+C: signal:             2:      0
+W: unlocked:           2:      1
+T: prioeq:             0:      81
+T: prioeq:             1:      81
+
+# Signal T1
+C: signal:             1:      0
+W: unlocked:           1:      0
+T: priolt:             0:      1
+
+# Unlock and exit
+C: unlock:             3:      3
+C: unlock:             2:      2
+C: unlock:             1:      1
+C: unlock:             0:      0
+
+W: unlocked:           3:      3
+W: unlocked:           2:      2
+W: unlocked:           1:      1
+W: unlocked:           0:      0
+
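
This script builds the chain T4 -> L3 -> T3 -> L2 -> T2 -> L1 -> T1 -> L0 -> T0 and then changes the priority of threads that are already blocked; the expected prioeq values follow from re-propagating the highest waiter priority down the whole chain after each change. The small model below is illustrative only (invented names): index i owns lock Li and, for i > 0, blocks on L(i-1).

#include <assert.h>

#define NTASKS 5

/* own[i]: requested priority of Ti; eff[i]: effective priority.
 * propagate() is an illustrative model, not the kernel's chain walk. */
static void propagate(const int *own, int *eff, int n)
{
        int i;

        eff[n - 1] = own[n - 1];
        for (i = n - 2; i >= 0; i--)
                eff[i] = own[i] > eff[i + 1] ? own[i] : eff[i + 1];
}

int main(void)
{
        int own[NTASKS] = { 0, 81, 82, 83, 84 };        /* T0..T4 as set above */
        int eff[NTASKS];
        int i;

        propagate(own, eff, NTASKS);
        for (i = 0; i < 4; i++)
                assert(eff[i] == 84);   /* everyone boosted to T4's 84 */

        own[4] = 80;                    /* "Reduce prio of T4" */
        propagate(own, eff, NTASKS);
        for (i = 0; i < 4; i++)
                assert(eff[i] == 83);   /* chain now capped by T3's 83 */
        assert(eff[4] == 80);
        return 0;
}
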
diff --git a/scripts/rt-tester/t5-l4-pi-boost-deboost.tst b/scripts/rt-tester/t5-l4-pi-boost-deboost.tst
new file mode 100644 (file)
index 0000000..ca64f8b
--- /dev/null
@@ -0,0 +1,143 @@
+#
+# rt-mutex test
+#
+# Op: C(ommand)/T(est)/W(ait)
+# |  opcode
+# |  |     threadid: 0-7
+# |  |     |  opcode argument
+# |  |     |  |
+# C: lock: 0: 0
+#
+# Commands
+#
+# opcode       opcode argument
+# schedother   nice value
+# schedfifo    priority
+# lock         lock nr (0-7)
+# locknowait   lock nr (0-7)
+# lockint      lock nr (0-7)
+# lockintnowait        lock nr (0-7)
+# lockcont     lock nr (0-7)
+# unlock       lock nr (0-7)
+# lockbkl      lock nr (0-7)
+# unlockbkl    lock nr (0-7)
+# signal       thread to signal (0-7)
+# reset                0
+# resetevent   0
+#
+# Tests / Wait
+#
+# opcode       opcode argument
+#
+# prioeq       priority
+# priolt       priority
+# priogt       priority
+# nprioeq      normal priority
+# npriolt      normal priority
+# npriogt      normal priority
+# locked       lock nr (0-7)
+# blocked      lock nr (0-7)
+# blockedwake  lock nr (0-7)
+# unlocked     lock nr (0-7)
+# lockedbkl    dont care
+# blockedbkl   dont care
+# unlockedbkl  dont care
+# opcodeeq     command opcode or number
+# opcodelt     number
+# opcodegt     number
+# eventeq      number
+# eventgt      number
+# eventlt      number
+
+#
+# 5 threads 4 lock PI
+#
+C: resetevent:         0:      0
+W: opcodeeq:           0:      0
+
+# Set schedulers
+C: schedother:         0:      0
+C: schedfifo:          1:      81
+C: schedfifo:          2:      82
+C: schedfifo:          3:      83
+C: schedfifo:          4:      84
+
+# T0 lock L0
+C: locknowait:         0:      0
+W: locked:             0:      0
+
+# T1 lock L1
+C: locknowait:         1:      1
+W: locked:             1:      1
+
+# T1 lock L0
+C: lockintnowait:      1:      0
+W: blocked:            1:      0
+T: prioeq:             0:      81
+
+# T2 lock L2
+C: locknowait:         2:      2
+W: locked:             2:      2
+
+# T2 lock L1
+C: lockintnowait:      2:      1
+W: blocked:            2:      1
+T: prioeq:             0:      82
+T: prioeq:             1:      82
+
+# T3 lock L3
+C: locknowait:         3:      3
+W: locked:             3:      3
+
+# T3 lock L2
+C: lockintnowait:      3:      2
+W: blocked:            3:      2
+T: prioeq:             0:      83
+T: prioeq:             1:      83
+T: prioeq:             2:      83
+
+# T4 lock L3
+C: lockintnowait:      4:      3
+W: blocked:            4:      3
+T: prioeq:             0:      84
+T: prioeq:             1:      84
+T: prioeq:             2:      84
+T: prioeq:             3:      84
+
+# Signal T4
+C: signal:             4:      0
+W: unlocked:           4:      3
+T: prioeq:             0:      83
+T: prioeq:             1:      83
+T: prioeq:             2:      83
+T: prioeq:             3:      83
+
+# Signal T3
+C: signal:             3:      0
+W: unlocked:           3:      2
+T: prioeq:             0:      82
+T: prioeq:             1:      82
+T: prioeq:             2:      82
+
+# Signal T2
+C: signal:             2:      0
+W: unlocked:           2:      1
+T: prioeq:             0:      81
+T: prioeq:             1:      81
+
+# Signal T1
+C: signal:             1:      0
+W: unlocked:           1:      0
+T: priolt:             0:      1
+
+# Unlock and exit
+C: unlock:             3:      3
+C: unlock:             2:      2
+C: unlock:             1:      1
+C: unlock:             0:      0
+
+W: unlocked:           3:      3
+W: unlocked:           2:      2
+W: unlocked:           1:      1
+W: unlocked:           0:      0
+
index 43295ca37b5dcb7b3461dc16645513a6df84ecf8..80de8c3e9cc3ea49c79e6525b0ce17b22076ac3e 100644 (file)
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/poison.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/security.h>
@@ -988,7 +989,7 @@ void unregister_key_type(struct key_type *ktype)
                if (key->type == ktype) {
                        if (ktype->destroy)
                                ktype->destroy(key);
-                       memset(&key->payload, 0xbd, sizeof(key->payload));
+                       memset(&key->payload, KEY_DESTROY, sizeof(key->payload));
                }
        }
 
index ac7f2b2e39240024a70cfe06b464e93050447410..28832e689800b9c9250b54cc4c0be918adebb51d 100644 (file)
@@ -1532,8 +1532,9 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm)
        /* Default to the current task SID. */
        bsec->sid = tsec->sid;
 
-       /* Reset create and sockcreate SID on execve. */
+       /* Reset fs, key, and sock SIDs on execve. */
        tsec->create_sid = 0;
+       tsec->keycreate_sid = 0;
        tsec->sockcreate_sid = 0;
 
        if (tsec->exec_sid) {
@@ -2586,9 +2587,10 @@ static int selinux_task_alloc_security(struct task_struct *tsk)
        tsec2->osid = tsec1->osid;
        tsec2->sid = tsec1->sid;
 
-       /* Retain the exec, create, and sock SIDs across fork */
+       /* Retain the exec, fs, key, and sock SIDs across fork */
        tsec2->exec_sid = tsec1->exec_sid;
        tsec2->create_sid = tsec1->create_sid;
+       tsec2->keycreate_sid = tsec1->keycreate_sid;
        tsec2->sockcreate_sid = tsec1->sockcreate_sid;
 
        /* Retain ptracer SID across fork, if any.
index 1a921ee71aba2d567c5e48d186343905fa981550..2343dedd44ae305850661b236becb6456a0014bc 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/poison.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
@@ -3522,7 +3523,7 @@ err_out_have_mixer:
 
 err_out_kfree:
 #ifndef VIA_NDEBUG
-       memset (card, 0xAB, sizeof (*card)); /* poison memory */
+       memset (card, OSS_POISON_FREE, sizeof (*card)); /* poison memory */
 #endif
        kfree (card);
 
@@ -3559,7 +3560,7 @@ static void __devexit via_remove_one (struct pci_dev *pdev)
        via_ac97_cleanup (card);
 
 #ifndef VIA_NDEBUG
-       memset (card, 0xAB, sizeof (*card)); /* poison memory */
+       memset (card, OSS_POISON_FREE, sizeof (*card)); /* poison memory */
 #endif
        kfree (card);
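
Both hunks above replace hard-coded magic bytes (0xbd in the key code, 0xAB in the via82cxxx OSS driver) with named constants from the new <linux/poison.h>, so freed or destroyed objects are filled with one recognizable, centrally defined pattern. A minimal userspace sketch of the same poison-on-free idea follows; LOCAL_POISON_FREE and card_free are invented for this example and are not the kernel's names.

#include <stdlib.h>
#include <string.h>

#define LOCAL_POISON_FREE 0xAB  /* local stand-in for a <linux/poison.h> value */

struct card {
        int     id;
        char    name[32];
};

/* Fill the structure with the poison byte before freeing it, so a later
 * use-after-free shows up as 0xABAB... in a debugger or crash dump. */
static void card_free(struct card *card)
{
        memset(card, LOCAL_POISON_FREE, sizeof(*card));
        free(card);
}

int main(void)
{
        struct card *card = calloc(1, sizeof(*card));

        if (card)
                card_free(card);
        return 0;
}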