git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
Merge branch 'core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 31 Dec 2008 00:10:19 +0000 (16:10 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 31 Dec 2008 00:10:19 +0000 (16:10 -0800)
* 'core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (63 commits)
  stacktrace: provide save_stack_trace_tsk() weak alias
  rcu: provide RCU options on non-preempt architectures too
  printk: fix discarding message when recursion_bug
  futex: clean up futex_(un)lock_pi fault handling
  "Tree RCU": scalable classic RCU implementation
  futex: rename field in futex_q to clarify single waiter semantics
  x86/swiotlb: add default swiotlb_arch_range_needs_mapping
  x86/swiotlb: add default phys<->bus conversion
  x86: unify pci iommu setup and allow swiotlb to compile for 32 bit
  x86: add swiotlb allocation functions
  swiotlb: consolidate swiotlb info message printing
  swiotlb: support bouncing of HighMem pages
  swiotlb: factor out copy to/from device
  swiotlb: add arch hook to force mapping
  swiotlb: allow architectures to override phys<->bus<->phys conversions
  swiotlb: add comment where we handle the overflow of a dma mask on 32 bit
  rcu: fix rcutorture behavior during reboot
  resources: skip sanity check of busy resources
  swiotlb: move some definitions to header
  swiotlb: allow architectures to override swiotlb pool allocation
  ...

Fix up trivial conflicts in
  arch/x86/kernel/Makefile
  arch/x86/mm/init_32.c
  include/linux/hardirq.h
as per Ingo's suggestions.
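
For context: in the combined diff below, lines prefixed with "++" were introduced by the
merge commit itself, i.e. by the conflict resolution. The include/linux/hardirq.h conflict
is a typical example: mainline already carried the multi-line nmi_enter()/nmi_exit() macros
with the ftrace hooks, while this branch added rcu_nmi_enter()/rcu_nmi_exit() to the old
one-line versions. An illustrative reconstruction of roughly what git would have left in
the working tree (not the literal conflict text) looks like this; the resolution simply
keeps both sets of calls:

/* Illustrative reconstruction only; not the literal conflict hunk. */
<<<<<<< HEAD
#define nmi_enter()                           \
      do {                                    \
              ftrace_nmi_enter();             \
              lockdep_off();                  \
              __irq_enter();                  \
      } while (0)
=======
#define nmi_enter()           do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0)
>>>>>>> core-for-linus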

20 files changed:
arch/x86/include/asm/dma-mapping.h
arch/x86/include/asm/iommu.h
arch/x86/include/asm/pci.h
arch/x86/include/asm/uaccess.h
arch/x86/kernel/Makefile
arch/x86/kernel/pci-dma.c
arch/x86/mm/init_32.c
include/asm-generic/bug.h
include/linux/hardirq.h
include/linux/kernel.h
include/linux/rcupdate.h
init/Kconfig
kernel/Makefile
kernel/exit.c
kernel/extable.c
kernel/futex.c
kernel/lockdep.c
kernel/sched.c
kernel/sys.c
mm/memory.c

diff --combined arch/x86/include/asm/dma-mapping.h
index dc22c0733282b9ce631250d69cbcc00e0cd46617,3b43a65894c4ac44c5c96e6a736d12ec1ceeb664..4035357f5b9d6b830a16b09f39d30e90810aae9e
@@@ -65,16 -65,18 +65,16 @@@ static inline struct dma_mapping_ops *g
                return dma_ops;
        else
                return dev->archdata.dma_ops;
- #endif /* _ASM_X86_DMA_MAPPING_H */
+ #endif
  }
  
  /* Make sure we keep the same behaviour */
  static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
  {
 -#ifdef CONFIG_X86_64
        struct dma_mapping_ops *ops = get_dma_ops(dev);
        if (ops->mapping_error)
                return ops->mapping_error(dev, dma_addr);
  
 -#endif
        return (dma_addr == bad_dma_address);
  }
  
diff --combined arch/x86/include/asm/iommu.h
index 295b13193f4df09b05aad53ec09c81c601a17a9f,35276ec5925b0ee7dcaad9aaadcb7d1a1382cb27..a6ee9e6f530f89cc2e86a5607ceff6597c9d1757
@@@ -7,9 -7,40 +7,7 @@@ extern struct dma_mapping_ops nommu_dma
  extern int force_iommu, no_iommu;
  extern int iommu_detected;
  
- extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
  /* 10 seconds */
  #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
  
 -#ifdef CONFIG_GART_IOMMU
 -extern int gart_iommu_aperture;
 -extern int gart_iommu_aperture_allowed;
 -extern int gart_iommu_aperture_disabled;
 -
 -extern void early_gart_iommu_check(void);
 -extern void gart_iommu_init(void);
 -extern void gart_iommu_shutdown(void);
 -extern void __init gart_parse_options(char *);
 -extern void gart_iommu_hole_init(void);
 -
 -#else
 -#define gart_iommu_aperture            0
 -#define gart_iommu_aperture_allowed    0
 -#define gart_iommu_aperture_disabled   1
 -
 -static inline void early_gart_iommu_check(void)
 -{
 -}
 -static inline void gart_iommu_init(void)
 -{
 -}
 -static inline void gart_iommu_shutdown(void)
 -{
 -}
 -static inline void gart_parse_options(char *options)
 -{
 -}
 -static inline void gart_iommu_hole_init(void)
 -{
 -}
 -#endif
 -
  #endif /* _ASM_X86_IOMMU_H */
diff --combined arch/x86/include/asm/pci.h
index 647781298e7ef7cbc98b14ad75f2552b0092507f,50ac542c9382ec371aa336e937bd6bc779a52581..66834c41c0493eccf1b117b1565443c10ec706b6
@@@ -19,8 -19,6 +19,8 @@@ struct pci_sysdata 
  };
  
  extern int pci_routeirq;
 +extern int noioapicquirk;
 +extern int noioapicreroute;
  
  /* scan a bus after allocating a pci_sysdata for it */
  extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
@@@ -84,6 -82,8 +84,8 @@@ static inline void pci_dma_burst_advice
  static inline void early_quirks(void) { }
  #endif
  
+ extern void pci_iommu_alloc(void);
  #endif  /* __KERNEL__ */
  
  #ifdef CONFIG_X86_32
diff --combined arch/x86/include/asm/uaccess.h
index 580c3ee6c58c4d0479dbce9ea491c78049255e04,99192bb55a53bf68afc30efeae89b9b40b5f4b4d..4340055b755918fb8fd777df62f811ba5e26820c
@@@ -157,6 -157,7 +157,7 @@@ extern int __get_user_bad(void)
        int __ret_gu;                                                   \
        unsigned long __val_gu;                                         \
        __chk_user_ptr(ptr);                                            \
+       might_fault();                                                  \
        switch (sizeof(*(ptr))) {                                       \
        case 1:                                                         \
                __get_user_x(1, __ret_gu, __val_gu, ptr);               \
@@@ -241,6 -242,7 +242,7 @@@ extern void __put_user_8(void)
        int __ret_pu;                                           \
        __typeof__(*(ptr)) __pu_val;                            \
        __chk_user_ptr(ptr);                                    \
+       might_fault();                                          \
        __pu_val = x;                                           \
        switch (sizeof(*(ptr))) {                               \
        case 1:                                                 \
@@@ -350,14 -352,14 +352,14 @@@ do {                                                                    
  
  #define __put_user_nocheck(x, ptr, size)                      \
  ({                                                            \
 -      long __pu_err;                                          \
 +      int __pu_err;                                           \
        __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
        __pu_err;                                               \
  })
  
  #define __get_user_nocheck(x, ptr, size)                              \
  ({                                                                    \
 -      long __gu_err;                                                  \
 +      int __gu_err;                                                   \
        unsigned long __gu_val;                                         \
        __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT);    \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                     \
diff --combined arch/x86/kernel/Makefile
index 88dd768eab6d34980d14aff41178115ed881045e,a9c656f2d6613c0339d05c4fb100022d80c1a755..d364df03c1d6419ce4473e23b420d86167343775
@@@ -12,7 -12,6 +12,7 @@@ CFLAGS_REMOVE_tsc.o = -p
  CFLAGS_REMOVE_rtc.o = -pg
  CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
  CFLAGS_REMOVE_ftrace.o = -pg
 +CFLAGS_REMOVE_early_printk.o = -pg
  endif
  
  #
@@@ -24,9 -23,9 +24,9 @@@ CFLAGS_vsyscall_64.o  := $(PROFILING) -g
  CFLAGS_hpet.o         := $(nostackp)
  CFLAGS_tsc.o          := $(nostackp)
  
 -obj-y                 := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 +obj-y                 := process_$(BITS).o signal.o entry_$(BITS).o
  obj-y                 += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 -obj-y                 += time_$(BITS).o ioport.o ldt.o
 +obj-y                 += time_$(BITS).o ioport.o ldt.o dumpstack.o
  obj-y                 += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
  obj-$(CONFIG_X86_VISWS)       += visws_quirks.o
  obj-$(CONFIG_X86_32)  += probe_roms_32.o
@@@ -66,7 -65,6 +66,7 @@@ obj-$(CONFIG_X86_LOCAL_APIC)  += apic.o 
  obj-$(CONFIG_X86_IO_APIC)     += io_apic.o
  obj-$(CONFIG_X86_REBOOTFIXUPS)        += reboot_fixups_32.o
  obj-$(CONFIG_DYNAMIC_FTRACE)  += ftrace.o
 +obj-$(CONFIG_FUNCTION_GRAPH_TRACER)   += ftrace.o
  obj-$(CONFIG_KEXEC)           += machine_kexec_$(BITS).o
  obj-$(CONFIG_KEXEC)           += relocate_kernel_$(BITS).o crash.o
  obj-$(CONFIG_CRASH_DUMP)      += crash_dump_$(BITS).o
@@@ -107,8 -105,8 +107,10 @@@ microcode-$(CONFIG_MICROCODE_INTEL)       += 
  microcode-$(CONFIG_MICROCODE_AMD)     += microcode_amd.o
  obj-$(CONFIG_MICROCODE)                       += microcode.o
  
 +obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 +
+ obj-$(CONFIG_SWIOTLB)                 += pci-swiotlb_64.o # NB rename without _64
  ###
  # 64 bit specific files
  ifeq ($(CONFIG_X86_64),y)
          obj-$(CONFIG_GART_IOMMU)      += pci-gart_64.o aperture_64.o
          obj-$(CONFIG_CALGARY_IOMMU)   += pci-calgary_64.o tce_64.o
          obj-$(CONFIG_AMD_IOMMU)               += amd_iommu_init.o amd_iommu.o
-         obj-$(CONFIG_SWIOTLB)         += pci-swiotlb_64.o
  
          obj-$(CONFIG_PCI_MMCONFIG)    += mmconf-fam10h_64.o
  endif
diff --combined arch/x86/kernel/pci-dma.c
index 7a3dfceb90e47a341295ca3f1201c232b5beb814,00e07447a5bd4881f474e843f5a93abdf52fcb49..19a1044a0cd94fbdc64b6ffd1e3df8ecde4530c9
@@@ -6,7 -6,6 +6,7 @@@
  #include <asm/proto.h>
  #include <asm/dma.h>
  #include <asm/iommu.h>
 +#include <asm/gart.h>
  #include <asm/calgary.h>
  #include <asm/amd_iommu.h>
  
@@@ -31,6 -30,11 +31,6 @@@ int no_iommu __read_mostly
  /* Set this to 1 if there is a HW IOMMU in the system */
  int iommu_detected __read_mostly = 0;
  
 -/* This tells the BIO block layer to assume merging. Default to off
 -   because we cannot guarantee merging later. */
 -int iommu_bio_merge __read_mostly = 0;
 -EXPORT_SYMBOL(iommu_bio_merge);
 -
  dma_addr_t bad_dma_address __read_mostly = 0;
  EXPORT_SYMBOL(bad_dma_address);
  
@@@ -101,11 -105,15 +101,15 @@@ static void __init dma32_free_bootmem(v
        dma32_bootmem_ptr = NULL;
        dma32_bootmem_size = 0;
  }
+ #endif
  
  void __init pci_iommu_alloc(void)
  {
+ #ifdef CONFIG_X86_64
        /* free the range so iommu could get some range less than 4G */
        dma32_free_bootmem();
+ #endif
        /*
         * The order of these functions is important for
         * fall-back/fail-over reasons
        pci_swiotlb_init();
  }
  
- unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
- {
-       unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
-       return size >> PAGE_SHIFT;
- }
- EXPORT_SYMBOL(iommu_nr_pages);
- #endif
  void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                 dma_addr_t *dma_addr, gfp_t flag)
  {
@@@ -184,6 -183,7 +179,6 @@@ static __init int iommu_setup(char *p
                }
  
                if (!strncmp(p, "biomerge", 8)) {
 -                      iommu_bio_merge = 4096;
                        iommu_merge = 1;
                        force_iommu = 1;
                }
@@@ -295,8 -295,8 +290,8 @@@ fs_initcall(pci_iommu_init)
  static __devinit void via_no_dac(struct pci_dev *dev)
  {
        if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
 -              printk(KERN_INFO "PCI: VIA PCI bridge detected."
 -                               "Disabling DAC.\n");
 +              printk(KERN_INFO
 +                      "PCI: VIA PCI bridge detected. Disabling DAC.\n");
                forbid_dac = 1;
        }
  }
diff --combined arch/x86/mm/init_32.c
index 800e1d94c1b5580e627ddb0a2ef81cfd10d031cd,2b4b14fc0c048ba683776f59282b5d64574c4bd9..8655b5bb0963f807a0fe49dcd3da367db19379f0
@@@ -21,6 -21,7 +21,7 @@@
  #include <linux/init.h>
  #include <linux/highmem.h>
  #include <linux/pagemap.h>
+ #include <linux/pci.h>
  #include <linux/pfn.h>
  #include <linux/poison.h>
  #include <linux/bootmem.h>
@@@ -67,7 -68,7 +68,7 @@@ static unsigned long __meminitdata tabl
  
  static int __initdata after_init_bootmem;
  
 -static __init void *alloc_low_page(unsigned long *phys)
 +static __init void *alloc_low_page(void)
  {
        unsigned long pfn = table_end++;
        void *adr;
@@@ -77,6 -78,7 +78,6 @@@
  
        adr = __va(pfn * PAGE_SIZE);
        memset(adr, 0, PAGE_SIZE);
 -      *phys  = pfn * PAGE_SIZE;
        return adr;
  }
  
@@@ -91,17 -93,16 +92,17 @@@ static pmd_t * __init one_md_table_init
        pmd_t *pmd_table;
  
  #ifdef CONFIG_X86_PAE
 -      unsigned long phys;
        if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
                if (after_init_bootmem)
                        pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
                else
 -                      pmd_table = (pmd_t *)alloc_low_page(&phys);
 +                      pmd_table = (pmd_t *)alloc_low_page();
                paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
                set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
                pud = pud_offset(pgd, 0);
                BUG_ON(pmd_table != pmd_offset(pud, 0));
 +
 +              return pmd_table;
        }
  #endif
        pud = pud_offset(pgd, 0);
@@@ -126,8 -127,10 +127,8 @@@ static pte_t * __init one_page_table_in
                        if (!page_table)
                                page_table =
                                (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
 -              } else {
 -                      unsigned long phys;
 -                      page_table = (pte_t *)alloc_low_page(&phys);
 -              }
 +              } else
 +                      page_table = (pte_t *)alloc_low_page();
  
                paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
                set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@@ -967,6 -970,10 +968,8 @@@ void __init mem_init(void
        int codesize, reservedpages, datasize, initsize;
        int tmp;
  
 -      start_periodic_check_for_corruption();
 -
+       pci_iommu_alloc();
  #ifdef CONFIG_FLATMEM
        BUG_ON(!mem_map);
  #endif
                (unsigned long)&_text, (unsigned long)&_etext,
                ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
  
 +      /*
 +       * Check boundaries twice: Some fundamental inconsistencies can
 +       * be detected at build time already.
 +       */
 +#define __FIXADDR_TOP (-PAGE_SIZE)
 +#ifdef CONFIG_HIGHMEM
 +      BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE  > FIXADDR_START);
 +      BUILD_BUG_ON(VMALLOC_END                        > PKMAP_BASE);
 +#endif
 +#define high_memory (-128UL << 20)
 +      BUILD_BUG_ON(VMALLOC_START                      >= VMALLOC_END);
 +#undef high_memory
 +#undef __FIXADDR_TOP
 +
  #ifdef CONFIG_HIGHMEM
        BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);
        BUG_ON(VMALLOC_END                              > PKMAP_BASE);
  #endif
 -      BUG_ON(VMALLOC_START                            > VMALLOC_END);
 +      BUG_ON(VMALLOC_START                            >= VMALLOC_END);
        BUG_ON((unsigned long)high_memory               > VMALLOC_START);
  
        if (boot_cpu_data.wp_works_ok < 0)
diff --combined include/asm-generic/bug.h
index 4c794d73fb8484e47fb0708beb7a470724707046,b8ba6941f587c7d12ac850f621826efd00107c35..8af276361bf26c662bd268fcec2c1431254ce03d
@@@ -8,17 -8,9 +8,17 @@@
  #ifdef CONFIG_GENERIC_BUG
  #ifndef __ASSEMBLY__
  struct bug_entry {
 +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
        unsigned long   bug_addr;
 +#else
 +      signed int      bug_addr_disp;
 +#endif
  #ifdef CONFIG_DEBUG_BUGVERBOSE
 +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
        const char      *file;
 +#else
 +      signed int      file_disp;
 +#endif
        unsigned short  line;
  #endif
        unsigned short  flags;
  
  #ifndef __WARN
  #ifndef __ASSEMBLY__
- extern void warn_on_slowpath(const char *file, const int line);
  extern void warn_slowpath(const char *file, const int line,
                const char *fmt, ...) __attribute__((format(printf, 3, 4)));
  #define WANT_WARN_ON_SLOWPATH
  #endif
- #define __WARN() warn_on_slowpath(__FILE__, __LINE__)
- #define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg)
+ #define __WARN()              warn_slowpath(__FILE__, __LINE__, NULL)
+ #define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg)
  #else
- #define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0)
+ #define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0)
  #endif
  
  #ifndef WARN_ON
diff --combined include/linux/hardirq.h
index 89a56d79e4c6c4987531a10ad8fed17f3d597bf7,9b70b9231693fc07a446f892220d2c91a6300672..f83288347dda3455e2deaa057112707accacaa93
@@@ -4,7 -4,6 +4,7 @@@
  #include <linux/preempt.h>
  #include <linux/smp_lock.h>
  #include <linux/lockdep.h>
 +#include <linux/ftrace_irq.h>
  #include <asm/hardirq.h>
  #include <asm/system.h>
  
@@@ -119,13 -118,17 +119,17 @@@ static inline void account_system_vtime
  }
  #endif
  
- #if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+ #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
  extern void rcu_irq_enter(void);
  extern void rcu_irq_exit(void);
+ extern void rcu_nmi_enter(void);
+ extern void rcu_nmi_exit(void);
  #else
  # define rcu_irq_enter() do { } while (0)
  # define rcu_irq_exit() do { } while (0)
- #endif /* CONFIG_PREEMPT_RCU */
+ # define rcu_nmi_enter() do { } while (0)
+ # define rcu_nmi_exit() do { } while (0)
+ #endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
  
  /*
   * It is safe to do non-atomic ops on ->hardirq_context,
   */
  #define __irq_enter()                                 \
        do {                                            \
-               rcu_irq_enter();                        \
                account_system_vtime(current);          \
                add_preempt_count(HARDIRQ_OFFSET);      \
                trace_hardirq_enter();                  \
@@@ -154,7 -156,6 +157,6 @@@ extern void irq_enter(void)
                trace_hardirq_exit();                   \
                account_system_vtime(current);          \
                sub_preempt_count(HARDIRQ_OFFSET);      \
-               rcu_irq_exit();                         \
        } while (0)
  
  /*
   */
  extern void irq_exit(void);
  
 -#define nmi_enter()           do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0)
 -#define nmi_exit()            do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0)
 +#define nmi_enter()                           \
 +      do {                                    \
 +              ftrace_nmi_enter();             \
 +              lockdep_off();                  \
++              rcu_nmi_enter();                \
 +              __irq_enter();                  \
 +      } while (0)
++
 +#define nmi_exit()                            \
 +      do {                                    \
 +              __irq_exit();                   \
++              rcu_nmi_exit();                 \
 +              lockdep_on();                   \
 +              ftrace_nmi_exit();              \
 +      } while (0)
  
  #endif /* LINUX_HARDIRQ_H */
diff --combined include/linux/kernel.h
index 6002ae76785c9aeba493c0960aa5de508b139b49,269df5a17b30af1b7349c131da05abec8aa95046..ca9ff6411dfa778fed80eae8360b6136529f8f11
@@@ -141,6 -141,15 +141,15 @@@ extern int _cond_resched(void)
                (__x < 0) ? -__x : __x;         \
        })
  
+ #ifdef CONFIG_PROVE_LOCKING
+ void might_fault(void);
+ #else
+ static inline void might_fault(void)
+ {
+       might_sleep();
+ }
+ #endif
  extern struct atomic_notifier_head panic_notifier_list;
  extern long (*panic_blink)(long time);
  NORET_TYPE void panic(const char * fmt, ...)
@@@ -188,6 -197,8 +197,8 @@@ extern unsigned long long memparse(cons
  extern int core_kernel_text(unsigned long addr);
  extern int __kernel_text_address(unsigned long addr);
  extern int kernel_text_address(unsigned long addr);
+ extern int func_ptr_is_kernel_text(void *ptr);
  struct pid;
  extern struct pid *session_of_pgrp(struct pid *pgrp);
  
@@@ -361,6 -372,18 +372,6 @@@ static inline char *pack_hex_byte(char 
        ((unsigned char *)&addr)[3]
  #define NIPQUAD_FMT "%u.%u.%u.%u"
  
 -#define NIP6(addr) \
 -      ntohs((addr).s6_addr16[0]), \
 -      ntohs((addr).s6_addr16[1]), \
 -      ntohs((addr).s6_addr16[2]), \
 -      ntohs((addr).s6_addr16[3]), \
 -      ntohs((addr).s6_addr16[4]), \
 -      ntohs((addr).s6_addr16[5]), \
 -      ntohs((addr).s6_addr16[6]), \
 -      ntohs((addr).s6_addr16[7])
 -#define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
 -#define NIP6_SEQFMT "%04x%04x%04x%04x%04x%04x%04x%04x"
 -
  #if defined(__LITTLE_ENDIAN)
  #define HIPQUAD(addr) \
        ((unsigned char *)&addr)[3], \
diff --combined include/linux/rcupdate.h
index 895dc9c1088c767ce4706814b806c20248423241,bfd289aff5766bdd2921314fdbea014a7bf84a7f..1168fbcea8d4bc6a42d4abccf5b05e47266aa418
@@@ -52,11 -52,15 +52,15 @@@ struct rcu_head 
        void (*func)(struct rcu_head *head);
  };
  
- #ifdef CONFIG_CLASSIC_RCU
+ #if defined(CONFIG_CLASSIC_RCU)
  #include <linux/rcuclassic.h>
- #else /* #ifdef CONFIG_CLASSIC_RCU */
+ #elif defined(CONFIG_TREE_RCU)
+ #include <linux/rcutree.h>
+ #elif defined(CONFIG_PREEMPT_RCU)
  #include <linux/rcupreempt.h>
- #endif /* #else #ifdef CONFIG_CLASSIC_RCU */
+ #else
+ #error "Unknown RCU implementation specified to kernel configuration"
+ #endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
  
  #define RCU_HEAD_INIT         { .next = NULL, .func = NULL }
  #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
   * on the write-side to insure proper synchronization.
   */
  #define rcu_read_lock_sched() preempt_disable()
 +#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
  
  /*
   * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
   * See rcu_read_lock_sched for more information.
   */
  #define rcu_read_unlock_sched() preempt_enable()
 +#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
  
  
  
diff --combined init/Kconfig
index 8a63c404ef449d391a08ffd5760f46325f76babf,6b0fdedf3596a44aabe5015af2afffc6b3c6e7ab..13627191a60d194de08aaa4b410aa752cfd7cb21
@@@ -588,13 -588,6 +588,13 @@@ config KALLSYMS_AL
  
           Say N.
  
 +config KALLSYMS_STRIP_GENERATED
 +      bool "Strip machine generated symbols from kallsyms"
 +      depends on KALLSYMS_ALL
 +      default y
 +      help
 +        Say N if you want kallsyms to retain even machine generated symbols.
 +
  config KALLSYMS_EXTRA_PASS
        bool "Do an extra kallsyms pass"
        depends on KALLSYMS
@@@ -815,7 -808,6 +815,7 @@@ config TRACEPOINT
  
  config MARKERS
        bool "Activate markers"
 +      depends on TRACEPOINTS
        help
          Place an empty function call at each marker site. Can be
          dynamically changed for a probe function.
@@@ -936,10 -928,90 +936,90 @@@ source "block/Kconfig
  config PREEMPT_NOTIFIERS
        bool
  
+ choice
+       prompt "RCU Implementation"
+       default CLASSIC_RCU
  config CLASSIC_RCU
-       def_bool !PREEMPT_RCU
+       bool "Classic RCU"
        help
          This option selects the classic RCU implementation that is
          designed for best read-side performance on non-realtime
-         systems.  Classic RCU is the default.  Note that the
-         PREEMPT_RCU symbol is used to select/deselect this option.
+         systems.
+         Select this option if you are unsure.
+ config TREE_RCU
+       bool "Tree-based hierarchical RCU"
+       help
+         This option selects the RCU implementation that is
+         designed for very large SMP system with hundreds or
+         thousands of CPUs.
+ config PREEMPT_RCU
+       bool "Preemptible RCU"
+       depends on PREEMPT
+       help
+         This option reduces the latency of the kernel by making certain
+         RCU sections preemptible. Normally RCU code is non-preemptible, if
+         this option is selected then read-only RCU sections become
+         preemptible. This helps latency, but may expose bugs due to
+         now-naive assumptions about each RCU read-side critical section
+         remaining on a given CPU through its execution.
+ endchoice
+ config RCU_TRACE
+       bool "Enable tracing for RCU"
+       depends on TREE_RCU || PREEMPT_RCU
+       help
+         This option provides tracing in RCU which presents stats
+         in debugfs for debugging RCU implementation.
+         Say Y here if you want to enable RCU tracing
+         Say N if you are unsure.
+ config RCU_FANOUT
+       int "Tree-based hierarchical RCU fanout value"
+       range 2 64 if 64BIT
+       range 2 32 if !64BIT
+       depends on TREE_RCU
+       default 64 if 64BIT
+       default 32 if !64BIT
+       help
+         This option controls the fanout of hierarchical implementations
+         of RCU, allowing RCU to work efficiently on machines with
+         large numbers of CPUs.  This value must be at least the cube
+         root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+         systems and up to 262,144 for 64-bit systems.
+         Select a specific number if testing RCU itself.
+         Take the default if unsure.
+ config RCU_FANOUT_EXACT
+       bool "Disable tree-based hierarchical RCU auto-balancing"
+       depends on TREE_RCU
+       default n
+       help
+         This option forces use of the exact RCU_FANOUT value specified,
+         regardless of imbalances in the hierarchy.  This is useful for
+         testing RCU itself, and might one day be useful on systems with
+         strong NUMA behavior.
+         Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+         Say N if unsure.
+ config TREE_RCU_TRACE
+       def_bool RCU_TRACE && TREE_RCU
+       select DEBUG_FS
+       help
+         This option provides tracing for the TREE_RCU implementation,
+         permitting Makefile to trivially select kernel/rcutree_trace.c.
+ config PREEMPT_RCU_TRACE
+       def_bool RCU_TRACE && PREEMPT_RCU
+       select DEBUG_FS
+       help
+         This option provides tracing for the PREEMPT_RCU implementation,
+         permitting Makefile to trivially select kernel/rcupreempt_trace.c.
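
As a sanity check on the RCU_FANOUT help text above (which requires the fanout to be at
least the cube root of NR_CPUS), a three-level tree with fanout F covers F * F * F CPUs.
The stand-alone sketch below is not part of this merge; it just reproduces the numbers
quoted in the help text:

#include <stdio.h>

/* Not kernel code; a user-space sketch of the RCU_FANOUT arithmetic:
 * with three levels in the hierarchy, fanout F covers F * F * F CPUs. */
int main(void)
{
	long fanout;

	for (fanout = 32; fanout <= 64; fanout *= 2)
		printf("RCU_FANOUT=%ld covers up to %ld CPUs\n",
		       fanout, fanout * fanout * fanout);
	return 0;	/* prints 32768 (32-bit default) and 262144 (64-bit default) */
}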
diff --combined kernel/Makefile
index 027edda6351137b5f8c1ee085c2c5320d204e904,b4fdbbff5ec069feeaa21250c1ebd2f6cc28464b..e1c5bf3365c0a4cdee8e0279f31b3c6742a9c6ef
@@@ -9,7 -9,7 +9,7 @@@ obj-y     = sched.o fork.o exec_domain.
            rcupdate.o extable.o params.o posix-timers.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 -          notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 +          notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
  
  ifdef CONFIG_FUNCTION_TRACER
  # Do not trace debug files and internal ftrace files
@@@ -19,6 -19,7 +19,6 @@@ CFLAGS_REMOVE_mutex-debug.o = -p
  CFLAGS_REMOVE_rtmutex-debug.o = -pg
  CFLAGS_REMOVE_cgroup-debug.o = -pg
  CFLAGS_REMOVE_sched_clock.o = -pg
 -CFLAGS_REMOVE_sched.o = -pg
  endif
  
  obj-$(CONFIG_FREEZER) += freezer.o
@@@ -73,10 -74,10 +73,10 @@@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq
  obj-$(CONFIG_SECCOMP) += seccomp.o
  obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
  obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+ obj-$(CONFIG_TREE_RCU) += rcutree.o
  obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
- ifeq ($(CONFIG_PREEMPT_RCU),y)
- obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
- endif
+ obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+ obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
  obj-$(CONFIG_RELAY) += relay.o
  obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@@ -89,7 -90,7 +89,7 @@@ obj-$(CONFIG_FUNCTION_TRACER) += trace
  obj-$(CONFIG_TRACING) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
  
 -ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
  # needed for x86 only.  Why this used to be enabled for all architectures is beyond
  # me.  I suspect most platforms don't need this, but until we know that for sure
diff --combined kernel/exit.c
index c7422ca920382b47445ace263bdba0fe6fbf6bf8,30fcdf16737a2bb013a78b62bc6b7f1eb6eb165b..a946221879d7c03b2263ae4842e2de15b00e30a9
  #include <linux/blkdev.h>
  #include <linux/task_io_accounting_ops.h>
  #include <linux/tracehook.h>
 +#include <linux/init_task.h>
  #include <trace/sched.h>
  
  #include <asm/uaccess.h>
  #include <asm/unistd.h>
  #include <asm/pgtable.h>
  #include <asm/mmu_context.h>
 +#include "cred-internals.h"
 +
 +DEFINE_TRACE(sched_process_free);
 +DEFINE_TRACE(sched_process_exit);
 +DEFINE_TRACE(sched_process_wait);
  
  static void exit_mm(struct task_struct * tsk);
  
@@@ -170,10 -164,7 +170,10 @@@ void release_task(struct task_struct * 
        int zap_leader;
  repeat:
        tracehook_prepare_release_task(p);
 -      atomic_dec(&p->user->processes);
 +      /* don't need to get the RCU readlock here - the process is dead and
 +       * can't be modifying its own credentials */
 +      atomic_dec(&__task_cred(p)->user->processes);
 +
        proc_flush_task(p);
        write_lock_irq(&tasklist_lock);
        tracehook_finish_release_task(p);
@@@ -348,12 -339,12 +348,12 @@@ static void reparent_to_kthreadd(void
        /* cpus_allowed? */
        /* rt_priority? */
        /* signals? */
 -      security_task_reparent_to_init(current);
        memcpy(current->signal->rlim, init_task.signal->rlim,
               sizeof(current->signal->rlim));
 -      atomic_inc(&(INIT_USER->__count));
 +
 +      atomic_inc(&init_cred.usage);
 +      commit_creds(&init_cred);
        write_unlock_irq(&tasklist_lock);
 -      switch_uid(INIT_USER);
  }
  
  void __set_special_pids(struct pid *pid)
@@@ -1087,6 -1078,7 +1087,6 @@@ NORET_TYPE void do_exit(long code
        check_stack_usage();
        exit_thread();
        cgroup_exit(tsk, 1);
 -      exit_keys(tsk);
  
        if (group_dead && tsk->signal->leader)
                disassociate_ctty(1);
        preempt_disable();
        /* causes final put_task_struct in finish_task_switch(). */
        tsk->state = TASK_DEAD;
 -
        schedule();
        BUG();
        /* Avoid "noreturn function does return".  */
@@@ -1270,12 -1263,12 +1270,12 @@@ static int wait_task_zombie(struct task
        unsigned long state;
        int retval, status, traced;
        pid_t pid = task_pid_vnr(p);
 +      uid_t uid = __task_cred(p)->uid;
  
        if (!likely(options & WEXITED))
                return 0;
  
        if (unlikely(options & WNOWAIT)) {
 -              uid_t uid = p->uid;
                int exit_code = p->exit_code;
                int why, status;
  
                 * group, which consolidates times for all threads in the
                 * group including the group leader.
                 */
+               thread_group_cputime(p, &cputime);
                spin_lock_irq(&p->parent->sighand->siglock);
                psig = p->parent->signal;
                sig = p->signal;
-               thread_group_cputime(p, &cputime);
                psig->cutime =
                        cputime_add(psig->cutime,
                        cputime_add(cputime.utime,
        if (!retval && infop)
                retval = put_user(pid, &infop->si_pid);
        if (!retval && infop)
 -              retval = put_user(p->uid, &infop->si_uid);
 +              retval = put_user(uid, &infop->si_uid);
        if (!retval)
                retval = pid;
  
@@@ -1461,8 -1454,7 +1461,8 @@@ static int wait_task_stopped(int ptrace
        if (!unlikely(options & WNOWAIT))
                p->exit_code = 0;
  
 -      uid = p->uid;
 +      /* don't need the RCU readlock here as we're holding a spinlock */
 +      uid = __task_cred(p)->uid;
  unlock_sig:
        spin_unlock_irq(&p->sighand->siglock);
        if (!exit_code)
@@@ -1536,10 -1528,10 +1536,10 @@@ static int wait_task_continued(struct t
        }
        if (!unlikely(options & WNOWAIT))
                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 +      uid = __task_cred(p)->uid;
        spin_unlock_irq(&p->sighand->siglock);
  
        pid = task_pid_vnr(p);
 -      uid = p->uid;
        get_task_struct(p);
        read_unlock(&tasklist_lock);
  
diff --combined kernel/extable.c
index feb0317cf09ab5960d0288a31ec00d03ae5a6335,adf0cc9c02d6873d50c48edf4c34b44a32bf48b9..e136ed8d82ba56ab81283a48fe3d6cfb41c23c79
@@@ -17,7 -17,6 +17,7 @@@
  */
  #include <linux/module.h>
  #include <linux/init.h>
 +#include <linux/ftrace.h>
  #include <asm/uaccess.h>
  #include <asm/sections.h>
  
@@@ -41,7 -40,7 +41,7 @@@ const struct exception_table_entry *sea
        return e;
  }
  
 -int core_kernel_text(unsigned long addr)
 +__notrace_funcgraph int core_kernel_text(unsigned long addr)
  {
        if (addr >= (unsigned long)_stext &&
            addr <= (unsigned long)_etext)
@@@ -54,7 -53,7 +54,7 @@@
        return 0;
  }
  
 -int __kernel_text_address(unsigned long addr)
 +__notrace_funcgraph int __kernel_text_address(unsigned long addr)
  {
        if (core_kernel_text(addr))
                return 1;
@@@ -67,3 -66,19 +67,19 @@@ int kernel_text_address(unsigned long a
                return 1;
        return module_text_address(addr) != NULL;
  }
+ /*
+  * On some architectures (PPC64, IA64) function pointers
+  * are actually only tokens to some data that then holds the
+  * real function address. As a result, to find if a function
+  * pointer is part of the kernel text, we need to do some
+  * special dereferencing first.
+  */
+ int func_ptr_is_kernel_text(void *ptr)
+ {
+       unsigned long addr;
+       addr = (unsigned long) dereference_function_descriptor(ptr);
+       if (core_kernel_text(addr))
+               return 1;
+       return module_text_address(addr) != NULL;
+ }
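
A minimal, hypothetical usage sketch of the new helper (the caller name validate_callback
is made up here and not part of this merge): a subsystem handed a callback pointer can use
func_ptr_is_kernel_text() to confirm the pointer resolves to kernel or module text even on
descriptor-based architectures such as PPC64 and IA64.

/* Hypothetical caller, not taken from this merge: reject a callback
 * whose function pointer does not resolve to kernel or module text. */
static int validate_callback(void *fn)
{
	if (!func_ptr_is_kernel_text(fn))
		return -EINVAL;
	return 0;
}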
diff --combined kernel/futex.c
index 4fe790e89d0f34af1cc24359d32bca3b58e970ca,b4f87bac91c1afb824404fe8fb6ce90d43669d41..7c6cbabe52b3c0368e800790b638e82eb00aa657
@@@ -92,11 -92,12 +92,12 @@@ struct futex_pi_state 
   * A futex_q has a woken state, just like tasks have TASK_RUNNING.
   * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
   * The order of wakup is always to make the first condition true, then
-  * wake up q->waiters, then make the second condition true.
+  * wake up q->waiter, then make the second condition true.
   */
  struct futex_q {
        struct plist_node list;
-       wait_queue_head_t waiters;
+       /* There can only be a single waiter */
+       wait_queue_head_t waiter;
  
        /* Which hash list lock to use: */
        spinlock_t *lock_ptr;
@@@ -122,24 -123,6 +123,6 @@@ struct futex_hash_bucket 
  
  static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
  
- /*
-  * Take mm->mmap_sem, when futex is shared
-  */
- static inline void futex_lock_mm(struct rw_semaphore *fshared)
- {
-       if (fshared)
-               down_read(fshared);
- }
- /*
-  * Release mm->mmap_sem, when the futex is shared
-  */
- static inline void futex_unlock_mm(struct rw_semaphore *fshared)
- {
-       if (fshared)
-               up_read(fshared);
- }
  /*
   * We hash on the keys returned from get_futex_key (see below).
   */
@@@ -161,6 -144,45 +144,45 @@@ static inline int match_futex(union fut
                && key1->both.offset == key2->both.offset);
  }
  
+ /*
+  * Take a reference to the resource addressed by a key.
+  * Can be called while holding spinlocks.
+  *
+  */
+ static void get_futex_key_refs(union futex_key *key)
+ {
+       if (!key->both.ptr)
+               return;
+       switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+       case FUT_OFF_INODE:
+               atomic_inc(&key->shared.inode->i_count);
+               break;
+       case FUT_OFF_MMSHARED:
+               atomic_inc(&key->private.mm->mm_count);
+               break;
+       }
+ }
+ /*
+  * Drop a reference to the resource addressed by a key.
+  * The hash bucket spinlock must not be held.
+  */
+ static void drop_futex_key_refs(union futex_key *key)
+ {
+       if (!key->both.ptr)
+               return;
+       switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+       case FUT_OFF_INODE:
+               iput(key->shared.inode);
+               break;
+       case FUT_OFF_MMSHARED:
+               mmdrop(key->private.mm);
+               break;
+       }
+ }
  /**
   * get_futex_key - Get parameters which are the keys for a futex.
   * @uaddr: virtual address of the futex
   * For other futexes, it points to &current->mm->mmap_sem and
   * caller must have taken the reader lock. but NOT any spinlocks.
   */
- static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
-                        union futex_key *key)
+ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
  {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
        struct page *page;
        int err;
  
                        return -EFAULT;
                key->private.mm = mm;
                key->private.address = address;
+               get_futex_key_refs(key);
                return 0;
        }
-       /*
-        * The futex is hashed differently depending on whether
-        * it's in a shared or private mapping.  So check vma first.
-        */
-       vma = find_extend_vma(mm, address);
-       if (unlikely(!vma))
-               return -EFAULT;
  
-       /*
-        * Permissions.
-        */
-       if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
-               return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+ again:
+       err = get_user_pages_fast(address, 1, 0, &page);
+       if (err < 0)
+               return err;
+       lock_page(page);
+       if (!page->mapping) {
+               unlock_page(page);
+               put_page(page);
+               goto again;
+       }
  
        /*
         * Private mappings are handled in a simple way.
         *
         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
         * it's a read-only handle, it's expected that futexes attach to
-        * the object not the particular process.  Therefore we use
-        * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
-        * mappings of _writable_ handles.
+        * the object not the particular process.
         */
-       if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
-               key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
+       if (PageAnon(page)) {
+               key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                key->private.mm = mm;
                key->private.address = address;
-               return 0;
+       } else {
+               key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+               key->shared.inode = page->mapping->host;
+               key->shared.pgoff = page->index;
        }
  
-       /*
-        * Linear file mappings are also simple.
-        */
-       key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
-       key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
-       if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-               key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
-                                    + vma->vm_pgoff);
-               return 0;
-       }
+       get_futex_key_refs(key);
  
-       /*
-        * We could walk the page table to read the non-linear
-        * pte, and get the page index without fetching the page
-        * from swap.  But that's a lot of code to duplicate here
-        * for a rare case, so we simply fetch the page.
-        */
-       err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
-       if (err >= 0) {
-               key->shared.pgoff =
-                       page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-               put_page(page);
-               return 0;
-       }
-       return err;
- }
- /*
-  * Take a reference to the resource addressed by a key.
-  * Can be called while holding spinlocks.
-  *
-  */
- static void get_futex_key_refs(union futex_key *key)
- {
-       if (key->both.ptr == NULL)
-               return;
-       switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-               case FUT_OFF_INODE:
-                       atomic_inc(&key->shared.inode->i_count);
-                       break;
-               case FUT_OFF_MMSHARED:
-                       atomic_inc(&key->private.mm->mm_count);
-                       break;
-       }
+       unlock_page(page);
+       put_page(page);
+       return 0;
  }
  
- /*
-  * Drop a reference to the resource addressed by a key.
-  * The hash bucket spinlock must not be held.
-  */
- static void drop_futex_key_refs(union futex_key *key)
+ static inline
+ void put_futex_key(int fshared, union futex_key *key)
  {
-       if (!key->both.ptr)
-               return;
-       switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-               case FUT_OFF_INODE:
-                       iput(key->shared.inode);
-                       break;
-               case FUT_OFF_MMSHARED:
-                       mmdrop(key->private.mm);
-                       break;
-       }
+       drop_futex_key_refs(key);
  }
  
  static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@@ -328,10 -298,8 +298,8 @@@ static int get_futex_value_locked(u32 *
  
  /*
   * Fault handling.
-  * if fshared is non NULL, current->mm->mmap_sem is already held
   */
- static int futex_handle_fault(unsigned long address,
-                             struct rw_semaphore *fshared, int attempt)
+ static int futex_handle_fault(unsigned long address, int attempt)
  {
        struct vm_area_struct * vma;
        struct mm_struct *mm = current->mm;
        if (attempt > 2)
                return ret;
  
-       if (!fshared)
-               down_read(&mm->mmap_sem);
+       down_read(&mm->mmap_sem);
        vma = find_vma(mm, address);
        if (vma && address >= vma->vm_start &&
            (vma->vm_flags & VM_WRITE)) {
                                current->min_flt++;
                }
        }
-       if (!fshared)
-               up_read(&mm->mmap_sem);
+       up_read(&mm->mmap_sem);
        return ret;
  }
  
@@@ -385,6 -351,7 +351,7 @@@ static int refill_pi_state_cache(void
        /* pi_mutex gets initialized later */
        pi_state->owner = NULL;
        atomic_set(&pi_state->refcount, 1);
+       pi_state->key = FUTEX_KEY_INIT;
  
        current->pi_state_cache = pi_state;
  
@@@ -439,20 -406,13 +406,20 @@@ static void free_pi_state(struct futex_
  static struct task_struct * futex_find_get_task(pid_t pid)
  {
        struct task_struct *p;
 +      const struct cred *cred = current_cred(), *pcred;
  
        rcu_read_lock();
        p = find_task_by_vpid(pid);
 -      if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
 +      if (!p) {
                p = ERR_PTR(-ESRCH);
 -      else
 -              get_task_struct(p);
 +      } else {
 +              pcred = __task_cred(p);
 +              if (cred->euid != pcred->euid &&
 +                  cred->euid != pcred->uid)
 +                      p = ERR_PTR(-ESRCH);
 +              else
 +                      get_task_struct(p);
 +      }
  
        rcu_read_unlock();
  
@@@ -469,7 -429,7 +436,7 @@@ void exit_pi_state_list(struct task_str
        struct list_head *next, *head = &curr->pi_state_list;
        struct futex_pi_state *pi_state;
        struct futex_hash_bucket *hb;
-       union futex_key key;
+       union futex_key key = FUTEX_KEY_INIT;
  
        if (!futex_cmpxchg_enabled)
                return;
@@@ -614,7 -574,7 +581,7 @@@ static void wake_futex(struct futex_q *
         * The lock in wake_up_all() is a crucial memory barrier after the
         * plist_del() and also before assigning to q->lock_ptr.
         */
-       wake_up_all(&q->waiters);
+       wake_up(&q->waiter);
        /*
         * The waiting task can free the futex_q as soon as this is written,
         * without taking any locks.  This must come last.
@@@ -726,20 -686,17 +693,17 @@@ double_lock_hb(struct futex_hash_bucke
   * Wake up all waiters hashed on the physical page that is mapped
   * to this virtual address:
   */
- static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
-                     int nr_wake, u32 bitset)
+ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
  {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
        struct plist_head *head;
-       union futex_key key;
+       union futex_key key = FUTEX_KEY_INIT;
        int ret;
  
        if (!bitset)
                return -EINVAL;
  
-       futex_lock_mm(fshared);
        ret = get_futex_key(uaddr, fshared, &key);
        if (unlikely(ret != 0))
                goto out;
  
        spin_unlock(&hb->lock);
  out:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &key);
        return ret;
  }
  
   * to this virtual address:
   */
  static int
- futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
-             u32 __user *uaddr2,
+ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
              int nr_wake, int nr_wake2, int op)
  {
-       union futex_key key1, key2;
+       union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb1, *hb2;
        struct plist_head *head;
        struct futex_q *this, *next;
        int ret, op_ret, attempt = 0;
  
  retryfull:
-       futex_lock_mm(fshared);
        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
@@@ -833,18 -787,12 +794,12 @@@ retry
                 */
                if (attempt++) {
                        ret = futex_handle_fault((unsigned long)uaddr2,
-                                                fshared, attempt);
+                                                attempt);
                        if (ret)
                                goto out;
                        goto retry;
                }
  
-               /*
-                * If we would have faulted, release mmap_sem,
-                * fault it in and start all over again.
-                */
-               futex_unlock_mm(fshared);
                ret = get_user(dummy, uaddr2);
                if (ret)
                        return ret;
        if (hb1 != hb2)
                spin_unlock(&hb2->lock);
  out:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &key2);
+       put_futex_key(fshared, &key1);
  
        return ret;
  }
   * Requeue all waiters hashed on one physical page to another
   * physical page.
   */
- static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
-                        u32 __user *uaddr2,
+ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
                         int nr_wake, int nr_requeue, u32 *cmpval)
  {
-       union futex_key key1, key2;
+       union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb1, *hb2;
        struct plist_head *head1;
        struct futex_q *this, *next;
        int ret, drop_count = 0;
  
   retry:
-       futex_lock_mm(fshared);
        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
                        if (hb1 != hb2)
                                spin_unlock(&hb2->lock);
  
-                       /*
-                        * If we would have faulted, release mmap_sem, fault
-                        * it in and start all over again.
-                        */
-                       futex_unlock_mm(fshared);
                        ret = get_user(curval, uaddr1);
  
                        if (!ret)
@@@ -981,7 -921,8 +928,8 @@@ out_unlock
                drop_futex_key_refs(&key1);
  
  out:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &key2);
+       put_futex_key(fshared, &key1);
        return ret;
  }
  
@@@ -990,7 -931,7 +938,7 @@@ static inline struct futex_hash_bucket 
  {
        struct futex_hash_bucket *hb;
  
-       init_waitqueue_head(&q->waiters);
+       init_waitqueue_head(&q->waiter);
  
        get_futex_key_refs(&q->key);
        hb = hash_futex(&q->key);
@@@ -1103,8 -1044,7 +1051,7 @@@ static void unqueue_me_pi(struct futex_
   * private futexes.
   */
  static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-                               struct task_struct *newowner,
-                               struct rw_semaphore *fshared)
+                               struct task_struct *newowner, int fshared)
  {
        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
@@@ -1183,7 -1123,7 +1130,7 @@@ retry
  handle_fault:
        spin_unlock(q->lock_ptr);
  
-       ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
+       ret = futex_handle_fault((unsigned long)uaddr, attempt++);
  
        spin_lock(q->lock_ptr);
  
   * In case we must use restart_block to restart a futex_wait,
   * we encode in the 'flags' shared capability
   */
- #define FLAGS_SHARED  1
+ #define FLAGS_SHARED          0x01
+ #define FLAGS_CLOCKRT         0x02
  
  static long futex_wait_restart(struct restart_block *restart);
  
- static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
-                     u32 val, ktime_t *abs_time, u32 bitset)
+ static int futex_wait(u32 __user *uaddr, int fshared,
+                     u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
  {
        struct task_struct *curr = current;
        DECLARE_WAITQUEUE(wait, curr);
        q.pi_state = NULL;
        q.bitset = bitset;
   retry:
-       futex_lock_mm(fshared);
+       q.key = FUTEX_KEY_INIT;
        ret = get_futex_key(uaddr, fshared, &q.key);
        if (unlikely(ret != 0))
                goto out_release_sem;
        if (unlikely(ret)) {
                queue_unlock(&q, hb);
  
-               /*
-                * If we would have faulted, release mmap_sem, fault it in and
-                * start all over again.
-                */
-               futex_unlock_mm(fshared);
                ret = get_user(uval, uaddr);
  
                if (!ret)
        /* Only actually queue if *uaddr contained val.  */
        queue_me(&q, hb);
  
-       /*
-        * Now the futex is queued and we have checked the data, we
-        * don't want to hold mmap_sem while we sleep.
-        */
-       futex_unlock_mm(fshared);
        /*
         * There might have been scheduling since the queue_me(), as we
         * cannot hold a spinlock across the get_user() in case it
  
        /* add_wait_queue is the barrier after __set_current_state. */
        __set_current_state(TASK_INTERRUPTIBLE);
-       add_wait_queue(&q.waiters, &wait);
+       add_wait_queue(&q.waiter, &wait);
        /*
         * !plist_node_empty() is safe here without any lock.
         * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
                        slack = current->timer_slack_ns;
                        if (rt_task(current))
                                slack = 0;
-                       hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
-                                               HRTIMER_MODE_ABS);
+                       hrtimer_init_on_stack(&t.timer,
+                                             clockrt ? CLOCK_REALTIME :
+                                             CLOCK_MONOTONIC,
+                                             HRTIMER_MODE_ABS);
                        hrtimer_init_sleeper(&t, current);
                        hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
  
  
                if (fshared)
                        restart->futex.flags |= FLAGS_SHARED;
+               if (clockrt)
+                       restart->futex.flags |= FLAGS_CLOCKRT;
                return -ERESTART_RESTARTBLOCK;
        }
  
        queue_unlock(&q, hb);
  
   out_release_sem:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &q.key);
        return ret;
  }
  
  static long futex_wait_restart(struct restart_block *restart)
  {
        u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
-       struct rw_semaphore *fshared = NULL;
+       int fshared = 0;
        ktime_t t;
  
        t.tv64 = restart->futex.time;
        restart->fn = do_no_restart_syscall;
        if (restart->futex.flags & FLAGS_SHARED)
-               fshared = &current->mm->mmap_sem;
+               fshared = 1;
        return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
-                               restart->futex.bitset);
+                               restart->futex.bitset,
+                               restart->futex.flags & FLAGS_CLOCKRT);
  }
  
  
   * if there are waiters then it will block, it does PI, etc. (Due to
   * races the kernel might see a 0 value of the futex too.)
   */
- static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ static int futex_lock_pi(u32 __user *uaddr, int fshared,
                         int detect, ktime_t *time, int trylock)
  {
        struct hrtimer_sleeper timeout, *to = NULL;
  
        q.pi_state = NULL;
   retry:
-       futex_lock_mm(fshared);
+       q.key = FUTEX_KEY_INIT;
        ret = get_futex_key(uaddr, fshared, &q.key);
        if (unlikely(ret != 0))
                goto out_release_sem;
                         * exit to complete.
                         */
                        queue_unlock(&q, hb);
-                       futex_unlock_mm(fshared);
                        cond_resched();
                        goto retry;
  
         */
        queue_me(&q, hb);
  
-       /*
-        * Now the futex is queued and we have checked the data, we
-        * don't want to hold mmap_sem while we sleep.
-        */
-       futex_unlock_mm(fshared);
        WARN_ON(!q.pi_state);
        /*
         * Block on the PI mutex:
                ret = ret ? 0 : -EWOULDBLOCK;
        }
  
-       futex_lock_mm(fshared);
        spin_lock(q.lock_ptr);
  
        if (!ret) {
  
        /* Unqueue and drop the lock */
        unqueue_me_pi(&q);
-       futex_unlock_mm(fshared);
  
        if (to)
                destroy_hrtimer_on_stack(&to->timer);
        queue_unlock(&q, hb);
  
   out_release_sem:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &q.key);
        if (to)
                destroy_hrtimer_on_stack(&to->timer);
        return ret;
  
   uaddr_faulted:
        /*
-        * We have to r/w  *(int __user *)uaddr, but we can't modify it
-        * non-atomically.  Therefore, if get_user below is not
-        * enough, we need to handle the fault ourselves, while
-        * still holding the mmap_sem.
-        *
-        * ... and hb->lock. :-) --ANK
+        * We have to r/w  *(int __user *)uaddr, and we have to modify it
+        * atomically.  Therefore, if we continue to fault after get_user()
+        * below, we need to handle the fault ourselves, while still holding
+        * the mmap_sem.  This can occur if the uaddr is under contention as
+        * we have to drop the mmap_sem in order to call get_user().
         */
        queue_unlock(&q, hb);
  
        if (attempt++) {
-               ret = futex_handle_fault((unsigned long)uaddr, fshared,
-                                        attempt);
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
                if (ret)
                        goto out_release_sem;
                goto retry_unlocked;
        }
  
-       futex_unlock_mm(fshared);
        ret = get_user(uval, uaddr);
-       if (!ret && (uval != -EFAULT))
+       if (!ret)
                goto retry;
  
        if (to)
   * This is the in-kernel slowpath: we look up the PI state (if any),
   * and do the rt-mutex unlock.
   */
- static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
+ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
  {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
        u32 uval;
        struct plist_head *head;
-       union futex_key key;
+       union futex_key key = FUTEX_KEY_INIT;
        int ret, attempt = 0;
  
  retry:
         */
        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
                return -EPERM;
-       /*
-        * First take all the futex related locks:
-        */
-       futex_lock_mm(fshared);
  
        ret = get_futex_key(uaddr, fshared, &key);
        if (unlikely(ret != 0))
@@@ -1754,34 -1669,30 +1676,30 @@@ retry_unlocked
  out_unlock:
        spin_unlock(&hb->lock);
  out:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &key);
  
        return ret;
  
  pi_faulted:
        /*
-        * We have to r/w  *(int __user *)uaddr, but we can't modify it
-        * non-atomically.  Therefore, if get_user below is not
-        * enough, we need to handle the fault ourselves, while
-        * still holding the mmap_sem.
-        *
-        * ... and hb->lock. --ANK
+        * We have to r/w  *(int __user *)uaddr, and we have to modify it
+        * atomically.  Therefore, if we continue to fault after get_user()
+        * below, we need to handle the fault ourselves, while still holding
+        * the mmap_sem.  This can occur if the uaddr is under contention as
+        * we have to drop the mmap_sem in order to call get_user().
         */
        spin_unlock(&hb->lock);
  
        if (attempt++) {
-               ret = futex_handle_fault((unsigned long)uaddr, fshared,
-                                        attempt);
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
                if (ret)
                        goto out;
                uval = 0;
                goto retry_unlocked;
        }
  
-       futex_unlock_mm(fshared);
        ret = get_user(uval, uaddr);
-       if (!ret && (uval != -EFAULT))
+       if (!ret)
                goto retry;
  
        return ret;
@@@ -1836,7 -1747,6 +1754,7 @@@ sys_get_robust_list(int pid, struct rob
  {
        struct robust_list_head __user *head;
        unsigned long ret;
 +      const struct cred *cred = current_cred(), *pcred;
  
        if (!futex_cmpxchg_enabled)
                return -ENOSYS;
                if (!p)
                        goto err_unlock;
                ret = -EPERM;
 -              if ((current->euid != p->euid) && (current->euid != p->uid) &&
 -                              !capable(CAP_SYS_PTRACE))
 +              pcred = __task_cred(p);
 +              if (cred->euid != pcred->euid &&
 +                  cred->euid != pcred->uid &&
 +                  !capable(CAP_SYS_PTRACE))
                        goto err_unlock;
                head = p->robust_list;
                rcu_read_unlock();
@@@ -1908,8 -1816,7 +1826,7 @@@ retry
                 * PI futexes happens in exit_pi_state():
                 */
                if (!pi && (uval & FUTEX_WAITERS))
-                       futex_wake(uaddr, &curr->mm->mmap_sem, 1,
-                                  FUTEX_BITSET_MATCH_ANY);
+                       futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
        }
        return 0;
  }
@@@ -2003,18 -1910,22 +1920,22 @@@ void exit_robust_list(struct task_struc
  long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                u32 __user *uaddr2, u32 val2, u32 val3)
  {
-       int ret = -ENOSYS;
+       int clockrt, ret = -ENOSYS;
        int cmd = op & FUTEX_CMD_MASK;
-       struct rw_semaphore *fshared = NULL;
+       int fshared = 0;
  
        if (!(op & FUTEX_PRIVATE_FLAG))
-               fshared = &current->mm->mmap_sem;
+               fshared = 1;
+       clockrt = op & FUTEX_CLOCK_REALTIME;
+       if (clockrt && cmd != FUTEX_WAIT_BITSET)
+               return -ENOSYS;
  
        switch (cmd) {
        case FUTEX_WAIT:
                val3 = FUTEX_BITSET_MATCH_ANY;
        case FUTEX_WAIT_BITSET:
-               ret = futex_wait(uaddr, fshared, val, timeout, val3);
+               ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
                break;
        case FUTEX_WAKE:
                val3 = FUTEX_BITSET_MATCH_ANY;
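The kernel/futex.c hunks above replace the old rw_semaphore-based fshared handling (mmap_sem) with a plain int flag plus reference-counted keys: every get_futex_key() is now balanced by a put_futex_key(), and faults on the futex word are serviced by get_user() with no locks held before the lookup is retried. The following is a minimal illustrative sketch of that pin/use/unpin shape, not code from the patch; it only assumes the get_futex_key()/put_futex_key() signatures visible in the hunks above.

	/* Illustrative sketch only -- mirrors the lookup shape of futex_lock_pi() above. */
	static int futex_key_pattern(u32 __user *uaddr, int fshared)
	{
		union futex_key key = FUTEX_KEY_INIT;
		u32 uval;
		int ret;

		ret = get_futex_key(uaddr, fshared, &key);	/* pin the key; mmap_sem is not held across the op */
		if (unlikely(ret != 0))
			return ret;

		ret = get_user(uval, uaddr);			/* may sleep and fault the page in: no hb->lock held */
		if (!ret) {
			/* ... act on uval, queue the waiter, etc. ... */
		}

		put_futex_key(fshared, &key);			/* always balance the key reference */
		return ret;
	}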
diff --combined kernel/lockdep.c
index 74b1878b8bb8170a21957e74600465437aa225bd,4fa6eeb4e8a7f0c6111a0cb5d67fa6f9671bad5b..06b0c3568f0b230a8c6b669d055c8c442a7eb790
@@@ -25,7 -25,6 +25,7 @@@
   * Thanks to Arjan van de Ven for coming up with the initial idea of
   * mapping lock dependencies runtime.
   */
 +#define DISABLE_BRANCH_PROFILING
  #include <linux/mutex.h>
  #include <linux/sched.h>
  #include <linux/delay.h>
@@@ -137,16 -136,16 +137,16 @@@ static inline struct lock_class *hlock_
  #ifdef CONFIG_LOCK_STAT
  static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
  
- static int lock_contention_point(struct lock_class *class, unsigned long ip)
+ static int lock_point(unsigned long points[], unsigned long ip)
  {
        int i;
  
-       for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
-               if (class->contention_point[i] == 0) {
-                       class->contention_point[i] = ip;
+       for (i = 0; i < LOCKSTAT_POINTS; i++) {
+               if (points[i] == 0) {
+                       points[i] = ip;
                        break;
                }
-               if (class->contention_point[i] == ip)
+               if (points[i] == ip)
                        break;
        }
  
@@@ -186,6 -185,9 +186,9 @@@ struct lock_class_stats lock_stats(stru
                for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
                        stats.contention_point[i] += pcs->contention_point[i];
  
+               for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+                       stats.contending_point[i] += pcs->contending_point[i];
                lock_time_add(&pcs->read_waittime, &stats.read_waittime);
                lock_time_add(&pcs->write_waittime, &stats.write_waittime);
  
@@@ -210,6 -212,7 +213,7 @@@ void clear_lock_stats(struct lock_clas
                memset(cpu_stats, 0, sizeof(struct lock_class_stats));
        }
        memset(class->contention_point, 0, sizeof(class->contention_point));
+       memset(class->contending_point, 0, sizeof(class->contending_point));
  }
  
  static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@@ -288,14 -291,12 +292,12 @@@ void lockdep_off(void
  {
        current->lockdep_recursion++;
  }
  EXPORT_SYMBOL(lockdep_off);
  
  void lockdep_on(void)
  {
        current->lockdep_recursion--;
  }
  EXPORT_SYMBOL(lockdep_on);
  
  /*
@@@ -577,7 -578,8 +579,8 @@@ static void print_lock_class_header(str
  /*
   * printk all lock dependencies starting at <entry>:
   */
- static void print_lock_dependencies(struct lock_class *class, int depth)
+ static void __used
+ print_lock_dependencies(struct lock_class *class, int depth)
  {
        struct lock_list *entry;
  
@@@ -2509,7 -2511,6 +2512,6 @@@ void lockdep_init_map(struct lockdep_ma
        if (subclass)
                register_lock_class(lock, subclass, 1);
  }
  EXPORT_SYMBOL_GPL(lockdep_init_map);
  
  /*
@@@ -2690,8 -2691,9 +2692,9 @@@ static int check_unlock(struct task_str
  }
  
  static int
- __lock_set_subclass(struct lockdep_map *lock,
-                   unsigned int subclass, unsigned long ip)
+ __lock_set_class(struct lockdep_map *lock, const char *name,
+                struct lock_class_key *key, unsigned int subclass,
+                unsigned long ip)
  {
        struct task_struct *curr = current;
        struct held_lock *hlock, *prev_hlock;
        return print_unlock_inbalance_bug(curr, lock, ip);
  
  found_it:
+       lockdep_init_map(lock, name, key, 0);
        class = register_lock_class(lock, subclass, 0);
        hlock->class_idx = class - lock_classes + 1;
  
@@@ -2902,9 -2905,9 +2906,9 @@@ static void check_flags(unsigned long f
  #endif
  }
  
- void lock_set_subclass(struct lockdep_map *lock,
-                 unsigned int subclass, unsigned long ip)
+ void lock_set_class(struct lockdep_map *lock, const char *name,
+                   struct lock_class_key *key, unsigned int subclass,
+                   unsigned long ip)
  {
        unsigned long flags;
  
        raw_local_irq_save(flags);
        current->lockdep_recursion = 1;
        check_flags(flags);
-       if (__lock_set_subclass(lock, subclass, ip))
+       if (__lock_set_class(lock, name, key, subclass, ip))
                check_chain_key(current);
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
  }
- EXPORT_SYMBOL_GPL(lock_set_subclass);
+ EXPORT_SYMBOL_GPL(lock_set_class);
  
  /*
   * We are not always called with irqs disabled - do that here,
@@@ -2944,7 -2946,6 +2947,6 @@@ void lock_acquire(struct lockdep_map *l
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
  }
  EXPORT_SYMBOL_GPL(lock_acquire);
  
  void lock_release(struct lockdep_map *lock, int nested,
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
  }
  EXPORT_SYMBOL_GPL(lock_release);
  
  #ifdef CONFIG_LOCK_STAT
@@@ -3000,7 -3000,7 +3001,7 @@@ __lock_contended(struct lockdep_map *lo
        struct held_lock *hlock, *prev_hlock;
        struct lock_class_stats *stats;
        unsigned int depth;
-       int i, point;
+       int i, contention_point, contending_point;
  
        depth = curr->lockdep_depth;
        if (DEBUG_LOCKS_WARN_ON(!depth))
  found_it:
        hlock->waittime_stamp = sched_clock();
  
-       point = lock_contention_point(hlock_class(hlock), ip);
+       contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+       contending_point = lock_point(hlock_class(hlock)->contending_point,
+                                     lock->ip);
  
        stats = get_lock_stats(hlock_class(hlock));
-       if (point < ARRAY_SIZE(stats->contention_point))
-               stats->contention_point[point]++;
+       if (contention_point < LOCKSTAT_POINTS)
+               stats->contention_point[contention_point]++;
+       if (contending_point < LOCKSTAT_POINTS)
+               stats->contending_point[contending_point]++;
        if (lock->cpu != smp_processor_id())
                stats->bounces[bounce_contended + !!hlock->read]++;
        put_lock_stats(stats);
  }
  
  static void
- __lock_acquired(struct lockdep_map *lock)
+ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
  {
        struct task_struct *curr = current;
        struct held_lock *hlock, *prev_hlock;
@@@ -3084,6 -3088,7 +3089,7 @@@ found_it
        put_lock_stats(stats);
  
        lock->cpu = cpu;
+       lock->ip = ip;
  }
  
  void lock_contended(struct lockdep_map *lock, unsigned long ip)
  }
  EXPORT_SYMBOL_GPL(lock_contended);
  
- void lock_acquired(struct lockdep_map *lock)
+ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
  {
        unsigned long flags;
  
        raw_local_irq_save(flags);
        check_flags(flags);
        current->lockdep_recursion = 1;
-       __lock_acquired(lock);
+       __lock_acquired(lock, ip);
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
  }
@@@ -3442,7 -3447,6 +3448,6 @@@ retry
        if (unlock)
                read_unlock(&tasklist_lock);
  }
  EXPORT_SYMBOL_GPL(debug_show_all_locks);
  
  /*
@@@ -3463,7 -3467,6 +3468,6 @@@ void debug_show_held_locks(struct task_
  {
                __debug_show_held_locks(task);
  }
  EXPORT_SYMBOL_GPL(debug_show_held_locks);
  
  void lockdep_sys_exit(void)
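The kernel/lockdep.c hunks rename __lock_set_subclass()/lock_set_subclass() to __lock_set_class()/lock_set_class(), which now also carry the new name and key, and split the lock statistics into contention_point[] (where a waiter blocked) and contending_point[] (where the holder took the lock, recorded via the new lock->ip). Since kernel/sched.c below still calls lock_set_subclass(), the old entry point presumably survives as a thin wrapper along these lines; this wrapper is an assumption for illustration and is not part of the merge shown here.

	/*
	 * Assumed wrapper (not shown in this merge): keep old call sites
	 * working by re-registering the lock under its existing name and key.
	 */
	static inline void lock_set_subclass(struct lockdep_map *lock,
					     unsigned int subclass,
					     unsigned long ip)
	{
		lock_set_class(lock, lock->name, lock->key, subclass, ip);
	}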
diff --combined kernel/sched.c
index 748ff924a29056e57f5c30058ce39ea7e470b23e,3e70963120a0c9e57daf482f9363fe751f90eacc..22aa9cab3fe5b350f38aa86d596325adf486382c
   */
  #define RUNTIME_INF   ((u64)~0ULL)
  
 +DEFINE_TRACE(sched_wait_task);
 +DEFINE_TRACE(sched_wakeup);
 +DEFINE_TRACE(sched_wakeup_new);
 +DEFINE_TRACE(sched_switch);
 +DEFINE_TRACE(sched_migrate_task);
 +
  #ifdef CONFIG_SMP
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@@ -267,10 -261,6 +267,10 @@@ struct task_group 
        struct cgroup_subsys_state css;
  #endif
  
 +#ifdef CONFIG_USER_SCHED
 +      uid_t uid;
 +#endif
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
  
  #ifdef CONFIG_USER_SCHED
  
 +/* Helper function to pass uid information to create_sched_user() */
 +void set_tg_uid(struct user_struct *user)
 +{
 +      user->tg->uid = user->uid;
 +}
 +
  /*
   * Root task group.
   *    Every UID task group (including init_task_group aka UID-0) will
@@@ -361,9 -345,7 +361,9 @@@ static inline struct task_group *task_g
        struct task_group *tg;
  
  #ifdef CONFIG_USER_SCHED
 -      tg = p->user->tg;
 +      rcu_read_lock();
 +      tg = __task_cred(p)->user->tg;
 +      rcu_read_unlock();
  #elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
@@@ -604,8 -586,6 +604,8 @@@ struct rq 
  #ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
 +      unsigned long long rq_cpu_time;
 +      /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
        unsigned int yld_exp_empty;
@@@ -723,18 -703,45 +723,18 @@@ static __read_mostly char *sched_feat_n
  
  #undef SCHED_FEAT
  
 -static int sched_feat_open(struct inode *inode, struct file *filp)
 -{
 -      filp->private_data = inode->i_private;
 -      return 0;
 -}
 -
 -static ssize_t
 -sched_feat_read(struct file *filp, char __user *ubuf,
 -              size_t cnt, loff_t *ppos)
 +static int sched_feat_show(struct seq_file *m, void *v)
  {
 -      char *buf;
 -      int r = 0;
 -      int len = 0;
        int i;
  
        for (i = 0; sched_feat_names[i]; i++) {
 -              len += strlen(sched_feat_names[i]);
 -              len += 4;
 -      }
 -
 -      buf = kmalloc(len + 2, GFP_KERNEL);
 -      if (!buf)
 -              return -ENOMEM;
 -
 -      for (i = 0; sched_feat_names[i]; i++) {
 -              if (sysctl_sched_features & (1UL << i))
 -                      r += sprintf(buf + r, "%s ", sched_feat_names[i]);
 -              else
 -                      r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
 +              if (!(sysctl_sched_features & (1UL << i)))
 +                      seq_puts(m, "NO_");
 +              seq_printf(m, "%s ", sched_feat_names[i]);
        }
 +      seq_puts(m, "\n");
  
 -      r += sprintf(buf + r, "\n");
 -      WARN_ON(r >= len + 2);
 -
 -      r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 -
 -      kfree(buf);
 -
 -      return r;
 +      return 0;
  }
  
  static ssize_t
@@@ -779,17 -786,10 +779,17 @@@ sched_feat_write(struct file *filp, con
        return cnt;
  }
  
 +static int sched_feat_open(struct inode *inode, struct file *filp)
 +{
 +      return single_open(filp, sched_feat_show, NULL);
 +}
 +
  static struct file_operations sched_feat_fops = {
 -      .open   = sched_feat_open,
 -      .read   = sched_feat_read,
 -      .write  = sched_feat_write,
 +      .open           = sched_feat_open,
 +      .write          = sched_feat_write,
 +      .read           = seq_read,
 +      .llseek         = seq_lseek,
 +      .release        = single_release,
  };
  
  static __init int sched_init_debug(void)
@@@ -1474,13 -1474,27 +1474,13 @@@ static voi
  update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
  {
 -      int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
  
        if (!tg->se[cpu])
                return;
  
 -      rq_weight = tg->cfs_rq[cpu]->load.weight;
 -
 -      /*
 -       * If there are currently no tasks on the cpu pretend there is one of
 -       * average load so that when a new task gets to run here it will not
 -       * get delayed by group starvation.
 -       */
 -      if (!rq_weight) {
 -              boost = 1;
 -              rq_weight = NICE_0_LOAD;
 -      }
 -
 -      if (unlikely(rq_weight > sd_rq_weight))
 -              rq_weight = sd_rq_weight;
 +      rq_weight = tg->cfs_rq[cpu]->rq_weight;
  
        /*
         *           \Sum shares * rq_weight
         *               \Sum rq_weight
         *
         */
 -      shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 +      shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
  
        if (abs(shares - tg->se[cpu]->load.weight) >
                unsigned long flags;
  
                spin_lock_irqsave(&rq->lock, flags);
 -              /*
 -               * record the actual number of shares, not the boosted amount.
 -               */
 -              tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 -              tg->cfs_rq[cpu]->rq_weight = rq_weight;
 +              tg->cfs_rq[cpu]->shares = shares;
  
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
 -      unsigned long rq_weight = 0;
 +      unsigned long weight, rq_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
  
        for_each_cpu_mask(i, sd->span) {
 -              rq_weight += tg->cfs_rq[i]->load.weight;
 +              /*
 +               * If there are currently no tasks on the cpu pretend there
 +               * is one of average load so that when a new task gets to
 +               * run here it will not get delayed by group starvation.
 +               */
 +              weight = tg->cfs_rq[i]->load.weight;
 +              if (!weight)
 +                      weight = NICE_0_LOAD;
 +
 +              tg->cfs_rq[i]->rq_weight = weight;
 +              rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
  
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
  
 -      if (!rq_weight)
 -              rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
 -
        for_each_cpu_mask(i, sd->span)
                update_group_shares_cpu(tg, i, shares, rq_weight);
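The reworked tg_shares_up()/update_group_shares_cpu() pair above folds the "idle cpu pretends to run one average task" fix-up into the per-cpu weight accumulation (weight becomes NICE_0_LOAD when the cfs_rq is empty), so each cpu's share is simply the group's sd_shares scaled by its weight fraction. A small worked example with invented numbers, following the \Sum-shares formula in the comment above:

	/*
	 * Illustrative helper, not from the patch: one cpu's share of a group.
	 * Example (invented numbers): sd_shares = 1024 over two cpus with rq
	 * weights 3072 and 1024 (total 4096) gives 768 and 256 respectively.
	 * An idle cpu contributes NICE_0_LOAD instead of 0, so it still gets
	 * a non-zero slice and a newly woken task is not starved there.
	 */
	static unsigned long group_share_example(unsigned long sd_shares,
						 unsigned long rq_weight,
						 unsigned long sd_rq_weight)
	{
		unsigned long shares = sd_shares * rq_weight / sd_rq_weight;

		return clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
	}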
  
@@@ -1601,39 -1612,6 +1601,39 @@@ static inline void update_shares_locked
  
  #endif
  
 +/*
 + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 + */
 +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 +      __releases(this_rq->lock)
 +      __acquires(busiest->lock)
 +      __acquires(this_rq->lock)
 +{
 +      int ret = 0;
 +
 +      if (unlikely(!irqs_disabled())) {
 +              /* printk() doesn't work good under rq->lock */
 +              spin_unlock(&this_rq->lock);
 +              BUG_ON(1);
 +      }
 +      if (unlikely(!spin_trylock(&busiest->lock))) {
 +              if (busiest < this_rq) {
 +                      spin_unlock(&this_rq->lock);
 +                      spin_lock(&busiest->lock);
 +                      spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
 +                      ret = 1;
 +              } else
 +                      spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
 +      }
 +      return ret;
 +}
 +
 +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 +      __releases(busiest->lock)
 +{
 +      spin_unlock(&busiest->lock);
 +      lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
 +}
  #endif
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
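double_lock_balance(), moved above in this hunk, avoids ABBA deadlock between two runqueue locks by ordering them by address: when the opportunistic trylock fails and busiest has the lower address, this_rq->lock is dropped and both locks are retaken in address order, with the return value 1 telling the caller its runqueue was briefly unlocked. A generic, illustrative sketch of that ordering rule (not from the patch):

	/*
	 * Illustrative only: take two distinct spinlocks in address order so
	 * that two cpus locking the same pair can never wait on each other's
	 * second lock.  The real double_lock_balance() additionally reports
	 * whether the already-held lock had to be dropped, so callers can
	 * revalidate their state afterwards.
	 */
	static void lock_pair_ordered(spinlock_t *a, spinlock_t *b)
	{
		if (a < b) {
			spin_lock(a);
			spin_lock_nested(b, SINGLE_DEPTH_NESTING);
		} else {
			spin_lock(b);
			spin_lock_nested(a, SINGLE_DEPTH_NESTING);
		}
	}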
@@@ -1867,8 -1845,6 +1867,8 @@@ void set_task_cpu(struct task_struct *p
  
        clock_offset = old_rq->clock - new_rq->clock;
  
 +      trace_sched_migrate_task(p, task_cpu(p), new_cpu);
 +
  #ifdef CONFIG_SCHEDSTATS
        if (p->se.wait_start)
                p->se.wait_start -= clock_offset;
@@@ -2278,7 -2254,6 +2278,7 @@@ static int try_to_wake_up(struct task_s
  
        smp_wmb();
        rq = task_rq_lock(p, &flags);
 +      update_rq_clock(rq);
        old_state = p->state;
        if (!(old_state & state))
                goto out;
@@@ -2336,11 -2311,12 +2336,11 @@@ out_activate
                schedstat_inc(p, se.nr_wakeups_local);
        else
                schedstat_inc(p, se.nr_wakeups_remote);
 -      update_rq_clock(rq);
        activate_task(rq, p, 1);
        success = 1;
  
  out_running:
 -      trace_sched_wakeup(rq, p);
 +      trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
  
        p->state = TASK_RUNNING;
@@@ -2473,7 -2449,7 +2473,7 @@@ void wake_up_new_task(struct task_struc
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
 -      trace_sched_wakeup_new(rq, p);
 +      trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@@ -2835,6 -2811,40 +2835,6 @@@ static void double_rq_unlock(struct rq 
                __release(rq2->lock);
  }
  
 -/*
 - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 - */
 -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 -      __releases(this_rq->lock)
 -      __acquires(busiest->lock)
 -      __acquires(this_rq->lock)
 -{
 -      int ret = 0;
 -
 -      if (unlikely(!irqs_disabled())) {
 -              /* printk() doesn't work good under rq->lock */
 -              spin_unlock(&this_rq->lock);
 -              BUG_ON(1);
 -      }
 -      if (unlikely(!spin_trylock(&busiest->lock))) {
 -              if (busiest < this_rq) {
 -                      spin_unlock(&this_rq->lock);
 -                      spin_lock(&busiest->lock);
 -                      spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
 -                      ret = 1;
 -              } else
 -                      spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
 -      }
 -      return ret;
 -}
 -
 -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 -      __releases(busiest->lock)
 -{
 -      spin_unlock(&busiest->lock);
 -      lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
 -}
 -
  /*
   * If dest_cpu is allowed for this process, migrate the task to it.
   * This is accomplished by forcing the cpu_allowed mask to only
@@@ -2852,6 -2862,7 +2852,6 @@@ static void sched_migrate_task(struct t
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
 -      trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@@ -3696,7 -3707,7 +3696,7 @@@ out_balanced
  static void idle_balance(int this_cpu, struct rq *this_rq)
  {
        struct sched_domain *sd;
 -      int pulled_task = -1;
 +      int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
        cpumask_t tmpmask;
  
@@@ -4192,7 -4203,6 +4192,6 @@@ void account_steal_time(struct task_str
  
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
-               account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
@@@ -4328,7 -4338,7 +4327,7 @@@ void __kprobes sub_preempt_count(int va
        /*
         * Underflow?
         */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+        if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                return;
        /*
         * Is the spinlock portion underflowing?
@@@ -5123,22 -5133,6 +5122,22 @@@ __setscheduler(struct rq *rq, struct ta
        set_load_weight(p);
  }
  
 +/*
 + * check the target process has a UID that matches the current process's
 + */
 +static bool check_same_owner(struct task_struct *p)
 +{
 +      const struct cred *cred = current_cred(), *pcred;
 +      bool match;
 +
 +      rcu_read_lock();
 +      pcred = __task_cred(p);
 +      match = (cred->euid == pcred->euid ||
 +               cred->euid == pcred->uid);
 +      rcu_read_unlock();
 +      return match;
 +}
 +
  static int __sched_setscheduler(struct task_struct *p, int policy,
                                struct sched_param *param, bool user)
  {
@@@ -5198,7 -5192,8 +5197,7 @@@ recheck
                        return -EPERM;
  
                /* can't change other user's priorities */
 -              if ((current->euid != p->euid) &&
 -                  (current->euid != p->uid))
 +              if (!check_same_owner(p))
                        return -EPERM;
        }
  
@@@ -5430,7 -5425,8 +5429,7 @@@ long sched_setaffinity(pid_t pid, cons
        read_unlock(&tasklist_lock);
  
        retval = -EPERM;
 -      if ((current->euid != p->euid) && (current->euid != p->uid) &&
 -                      !capable(CAP_SYS_NICE))
 +      if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
  
        retval = security_task_setscheduler(p, 0, NULL);
@@@ -5899,7 -5895,6 +5898,7 @@@ void __cpuinit init_idle(struct task_st
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
 +      ftrace_graph_init_task(idle);
  }
  
  /*
@@@ -6130,6 -6125,7 +6129,6 @@@ static int __migrate_task_irq(struct ta
  
  /*
   * Figure out where task on dead CPU should go, use force if necessary.
 - * NOTE: interrupts should be disabled by the caller
   */
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
@@@ -6641,6 -6637,28 +6640,6 @@@ early_initcall(migration_init)
  
  #ifdef CONFIG_SCHED_DEBUG
  
 -static inline const char *sd_level_to_string(enum sched_domain_level lvl)
 -{
 -      switch (lvl) {
 -      case SD_LV_NONE:
 -                      return "NONE";
 -      case SD_LV_SIBLING:
 -                      return "SIBLING";
 -      case SD_LV_MC:
 -                      return "MC";
 -      case SD_LV_CPU:
 -                      return "CPU";
 -      case SD_LV_NODE:
 -                      return "NODE";
 -      case SD_LV_ALLNODES:
 -                      return "ALLNODES";
 -      case SD_LV_MAX:
 -                      return "MAX";
 -
 -      }
 -      return "MAX";
 -}
 -
  static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  cpumask_t *groupmask)
  {
                return -1;
        }
  
 -      printk(KERN_CONT "span %s level %s\n",
 -              str, sd_level_to_string(sd->level));
 +      printk(KERN_CONT "span %s level %s\n", str, sd->name);
  
        if (!cpu_isset(cpu, sd->span)) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
@@@ -6796,8 -6815,6 +6795,8 @@@ sd_parent_degenerate(struct sched_domai
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES);
 +              if (nr_node_ids == 1)
 +                      pflags &= ~SD_SERIALIZE;
        }
        if (~cflags & pflags)
                return 0;
@@@ -7318,21 -7335,13 +7317,21 @@@ struct allmasks 
  };
  
  #if   NR_CPUS > 128
 -#define       SCHED_CPUMASK_ALLOC             1
 -#define       SCHED_CPUMASK_FREE(v)           kfree(v)
 -#define       SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
 +#define SCHED_CPUMASK_DECLARE(v)      struct allmasks *v
 +static inline void sched_cpumask_alloc(struct allmasks **masks)
 +{
 +      *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
 +}
 +static inline void sched_cpumask_free(struct allmasks *masks)
 +{
 +      kfree(masks);
 +}
  #else
 -#define       SCHED_CPUMASK_ALLOC             0
 -#define       SCHED_CPUMASK_FREE(v)
 -#define       SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
 +#define SCHED_CPUMASK_DECLARE(v)      struct allmasks _v, *v = &_v
 +static inline void sched_cpumask_alloc(struct allmasks **masks)
 +{ }
 +static inline void sched_cpumask_free(struct allmasks *masks)
 +{ }
  #endif
  
  #define       SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
@@@ -7408,8 -7417,9 +7407,8 @@@ static int __build_sched_domains(const 
                return -ENOMEM;
        }
  
 -#if SCHED_CPUMASK_ALLOC
        /* get space for all scratch cpumask variables */
 -      allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
 +      sched_cpumask_alloc(&allmasks);
        if (!allmasks) {
                printk(KERN_WARNING "Cannot alloc cpumask array\n");
                kfree(rd);
  #endif
                return -ENOMEM;
        }
 -#endif
 +
        tmpmask = (cpumask_t *)allmasks;
  
  
                cpu_attach_domain(sd, rd, i);
        }
  
 -      SCHED_CPUMASK_FREE((void *)allmasks);
 +      sched_cpumask_free(allmasks);
        return 0;
  
  #ifdef CONFIG_NUMA
  error:
        free_sched_groups(cpu_map, tmpmask);
 -      SCHED_CPUMASK_FREE((void *)allmasks);
 +      sched_cpumask_free(allmasks);
        kfree(rd);
        return -ENOMEM;
  #endif
@@@ -7701,14 -7711,8 +7700,14 @@@ static struct sched_domain_attr *dattr_
   */
  static cpumask_t fallback_doms;
  
 -void __attribute__((weak)) arch_update_cpu_topology(void)
 +/*
 + * arch_update_cpu_topology lets virtualized architectures update the
 + * cpu core maps. It is supposed to return 1 if the topology changed
 + * or 0 if it stayed the same.
 + */
 +int __attribute__((weak)) arch_update_cpu_topology(void)
  {
 +      return 0;
  }
  
  /*
@@@ -7748,6 -7752,8 +7747,6 @@@ static void detach_destroy_domains(cons
        cpumask_t tmpmask;
        int i;
  
 -      unregister_sched_domain_sysctl();
 -
        for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
@@@ -7800,21 -7806,17 +7799,21 @@@ void partition_sched_domains(int ndoms_
                             struct sched_domain_attr *dattr_new)
  {
        int i, j, n;
 +      int new_topology;
  
        mutex_lock(&sched_domains_mutex);
  
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
  
 +      /* Let architecture update cpu core mappings. */
 +      new_topology = arch_update_cpu_topology();
 +
        n = doms_new ? ndoms_new : 0;
  
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
 -              for (j = 0; j < n; j++) {
 +              for (j = 0; j < n && !new_topology; j++) {
                        if (cpus_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
@@@ -7829,12 -7831,12 +7828,12 @@@ match1
                ndoms_cur = 0;
                doms_new = &fallback_doms;
                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
 -              dattr_new = NULL;
 +              WARN_ON_ONCE(dattr_new);
        }
  
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
 -              for (j = 0; j < ndoms_cur; j++) {
 +              for (j = 0; j < ndoms_cur && !new_topology; j++) {
                        if (cpus_equal(doms_new[i], doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
@@@ -8489,7 -8491,7 +8488,7 @@@ stati
  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  {
        struct cfs_rq *cfs_rq;
 -      struct sched_entity *se, *parent_se;
 +      struct sched_entity *se;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
  
 -              cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
 -                              GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 +              cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 +                                    GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                        goto err;
  
 -              se = kmalloc_node(sizeof(struct sched_entity),
 -                              GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 +              se = kzalloc_node(sizeof(struct sched_entity),
 +                                GFP_KERNEL, cpu_to_node(i));
                if (!se)
                        goto err;
  
 -              parent_se = parent ? parent->se[i] : NULL;
 -              init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
 +              init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
  
        return 1;
@@@ -8576,7 -8579,7 +8575,7 @@@ stati
  int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
        struct rt_rq *rt_rq;
 -      struct sched_rt_entity *rt_se, *parent_se;
 +      struct sched_rt_entity *rt_se;
        struct rq *rq;
        int i;
  
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
  
 -              rt_rq = kmalloc_node(sizeof(struct rt_rq),
 -                              GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 +              rt_rq = kzalloc_node(sizeof(struct rt_rq),
 +                                   GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
  
 -              rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
 -                              GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 +              rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 +                                   GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
                        goto err;
  
 -              parent_se = parent ? parent->rt_se[i] : NULL;
 -              init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
 +              init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
  
        return 1;
@@@ -9246,12 -9250,11 +9245,12 @@@ struct cgroup_subsys cpu_cgroup_subsys 
   * (balbir@in.ibm.com).
   */
  
 -/* track cpu usage of a group of tasks */
 +/* track cpu usage of a group of tasks and its child groups */
  struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
 +      struct cpuacct *parent;
  };
  
  struct cgroup_subsys cpuacct_subsys;
@@@ -9285,9 -9288,6 +9284,9 @@@ static struct cgroup_subsys_state *cpua
                return ERR_PTR(-ENOMEM);
        }
  
 +      if (cgrp->parent)
 +              ca->parent = cgroup_ca(cgrp->parent);
 +
        return &ca->css;
  }
  
@@@ -9301,41 -9301,6 +9300,41 @@@ cpuacct_destroy(struct cgroup_subsys *s
        kfree(ca);
  }
  
 +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 +{
 +      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +      u64 data;
 +
 +#ifndef CONFIG_64BIT
 +      /*
 +       * Take rq->lock to make 64-bit read safe on 32-bit platforms.
 +       */
 +      spin_lock_irq(&cpu_rq(cpu)->lock);
 +      data = *cpuusage;
 +      spin_unlock_irq(&cpu_rq(cpu)->lock);
 +#else
 +      data = *cpuusage;
 +#endif
 +
 +      return data;
 +}
 +
 +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 +{
 +      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +
 +#ifndef CONFIG_64BIT
 +      /*
 +       * Take rq->lock to make 64-bit write safe on 32-bit platforms.
 +       */
 +      spin_lock_irq(&cpu_rq(cpu)->lock);
 +      *cpuusage = val;
 +      spin_unlock_irq(&cpu_rq(cpu)->lock);
 +#else
 +      *cpuusage = val;
 +#endif
 +}
 +
  /* return total cpu usage (in nanoseconds) of a group */
  static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
  {
        u64 totalcpuusage = 0;
        int i;
  
 -      for_each_possible_cpu(i) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
 -
 -              /*
 -               * Take rq->lock to make 64-bit addition safe on 32-bit
 -               * platforms.
 -               */
 -              spin_lock_irq(&cpu_rq(i)->lock);
 -              totalcpuusage += *cpuusage;
 -              spin_unlock_irq(&cpu_rq(i)->lock);
 -      }
 +      for_each_present_cpu(i)
 +              totalcpuusage += cpuacct_cpuusage_read(ca, i);
  
        return totalcpuusage;
  }
@@@ -9361,39 -9335,23 +9360,39 @@@ static int cpuusage_write(struct cgrou
                goto out;
        }
  
 -      for_each_possible_cpu(i) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
 +      for_each_present_cpu(i)
 +              cpuacct_cpuusage_write(ca, i, 0);
  
 -              spin_lock_irq(&cpu_rq(i)->lock);
 -              *cpuusage = 0;
 -              spin_unlock_irq(&cpu_rq(i)->lock);
 -      }
  out:
        return err;
  }
  
 +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
 +                                 struct seq_file *m)
 +{
 +      struct cpuacct *ca = cgroup_ca(cgroup);
 +      u64 percpu;
 +      int i;
 +
 +      for_each_present_cpu(i) {
 +              percpu = cpuacct_cpuusage_read(ca, i);
 +              seq_printf(m, "%llu ", (unsigned long long) percpu);
 +      }
 +      seq_printf(m, "\n");
 +      return 0;
 +}
 +
  static struct cftype files[] = {
        {
                .name = "usage",
                .read_u64 = cpuusage_read,
                .write_u64 = cpuusage_write,
        },
 +      {
 +              .name = "usage_percpu",
 +              .read_seq_string = cpuacct_percpu_seq_read,
 +      },
 +
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
  static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  {
        struct cpuacct *ca;
 +      int cpu;
  
        if (!cpuacct_subsys.active)
                return;
  
 +      cpu = task_cpu(tsk);
        ca = task_ca(tsk);
 -      if (ca) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
  
 +      for (; ca; ca = ca->parent) {
 +              u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
  }
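The cpuacct hunks above make the controller hierarchical: each group records its parent at creation and cpuacct_charge() walks the parent chain so usage is also accounted to every ancestor, while the new cpuacct_cpuusage_read()/write() helpers take rq->lock on 32-bit so the 64-bit counters can be read and reset safely. The charge walk, restated as a stand-alone sketch for clarity (illustrative, not a replacement for the code above):

	/* Restated for clarity: charge tsk's group and every ancestor on tsk's cpu. */
	static void cpuacct_charge_sketch(struct task_struct *tsk, u64 cputime)
	{
		int cpu = task_cpu(tsk);
		struct cpuacct *ca;

		for (ca = task_ca(tsk); ca; ca = ca->parent) {
			u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);

			*cpuusage += cputime;	/* add to this level's per-cpu counter */
		}
	}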
diff --combined kernel/sys.c
index ebe65c2c9873382a6017c35fb2954ad5bb54029b,5fc3a0cfb9946e01efc3907f06523964f39d59c1..d356d79e84ac5682553e76d1bb7f449e5f808e69
@@@ -112,17 -112,12 +112,17 @@@ EXPORT_SYMBOL(cad_pid)
  
  void (*pm_power_off_prepare)(void);
  
 +/*
 + * set the priority of a task
 + * - the caller must hold the RCU read lock
 + */
  static int set_one_prio(struct task_struct *p, int niceval, int error)
  {
 +      const struct cred *cred = current_cred(), *pcred = __task_cred(p);
        int no_nice;
  
 -      if (p->uid != current->euid &&
 -              p->euid != current->euid && !capable(CAP_SYS_NICE)) {
 +      if (pcred->uid  != cred->euid &&
 +          pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
                error = -EPERM;
                goto out;
        }
@@@ -146,7 -141,6 +146,7 @@@ asmlinkage long sys_setpriority(int whi
  {
        struct task_struct *g, *p;
        struct user_struct *user;
 +      const struct cred *cred = current_cred();
        int error = -EINVAL;
        struct pid *pgrp;
  
                        } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                        break;
                case PRIO_USER:
 -                      user = current->user;
 +                      user = (struct user_struct *) cred->user;
                        if (!who)
 -                              who = current->uid;
 -                      else
 -                              if ((who != current->uid) && !(user = find_user(who)))
 -                                      goto out_unlock;        /* No processes for this user */
 +                              who = cred->uid;
 +                      else if ((who != cred->uid) &&
 +                               !(user = find_user(who)))
 +                              goto out_unlock;        /* No processes for this user */
  
                        do_each_thread(g, p)
 -                              if (p->uid == who)
 +                              if (__task_cred(p)->uid == who)
                                        error = set_one_prio(p, niceval, error);
                        while_each_thread(g, p);
 -                      if (who != current->uid)
 +                      if (who != cred->uid)
                                free_uid(user);         /* For find_user() */
                        break;
        }
@@@ -211,7 -205,6 +211,7 @@@ asmlinkage long sys_getpriority(int whi
  {
        struct task_struct *g, *p;
        struct user_struct *user;
 +      const struct cred *cred = current_cred();
        long niceval, retval = -ESRCH;
        struct pid *pgrp;
  
                        } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                        break;
                case PRIO_USER:
 -                      user = current->user;
 +                      user = (struct user_struct *) cred->user;
                        if (!who)
 -                              who = current->uid;
 -                      else
 -                              if ((who != current->uid) && !(user = find_user(who)))
 -                                      goto out_unlock;        /* No processes for this user */
 +                              who = cred->uid;
 +                      else if ((who != cred->uid) &&
 +                               !(user = find_user(who)))
 +                              goto out_unlock;        /* No processes for this user */
  
                        do_each_thread(g, p)
 -                              if (p->uid == who) {
 +                              if (__task_cred(p)->uid == who) {
                                        niceval = 20 - task_nice(p);
                                        if (niceval > retval)
                                                retval = niceval;
                                }
                        while_each_thread(g, p);
 -                      if (who != current->uid)
 +                      if (who != cred->uid)
                                free_uid(user);         /* for find_user() */
                        break;
        }
@@@ -479,48 -472,46 +479,48 @@@ void ctrl_alt_del(void
   */
  asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
  {
 -      int old_rgid = current->gid;
 -      int old_egid = current->egid;
 -      int new_rgid = old_rgid;
 -      int new_egid = old_egid;
 +      const struct cred *old;
 +      struct cred *new;
        int retval;
  
 +      new = prepare_creds();
 +      if (!new)
 +              return -ENOMEM;
 +      old = current_cred();
 +
        retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
        if (retval)
 -              return retval;
 +              goto error;
  
 +      retval = -EPERM;
        if (rgid != (gid_t) -1) {
 -              if ((old_rgid == rgid) ||
 -                  (current->egid==rgid) ||
 +              if (old->gid == rgid ||
 +                  old->egid == rgid ||
                    capable(CAP_SETGID))
 -                      new_rgid = rgid;
 +                      new->gid = rgid;
                else
 -                      return -EPERM;
 +                      goto error;
        }
        if (egid != (gid_t) -1) {
 -              if ((old_rgid == egid) ||
 -                  (current->egid == egid) ||
 -                  (current->sgid == egid) ||
 +              if (old->gid == egid ||
 +                  old->egid == egid ||
 +                  old->sgid == egid ||
                    capable(CAP_SETGID))
 -                      new_egid = egid;
 +                      new->egid = egid;
                else
 -                      return -EPERM;
 -      }
 -      if (new_egid != old_egid) {
 -              set_dumpable(current->mm, suid_dumpable);
 -              smp_wmb();
 +                      goto error;
        }
 +
        if (rgid != (gid_t) -1 ||
 -          (egid != (gid_t) -1 && egid != old_rgid))
 -              current->sgid = new_egid;
 -      current->fsgid = new_egid;
 -      current->egid = new_egid;
 -      current->gid = new_rgid;
 -      key_fsgid_changed(current);
 -      proc_id_connector(current, PROC_EVENT_GID);
 -      return 0;
 +          (egid != (gid_t) -1 && egid != old->gid))
 +              new->sgid = new->egid;
 +      new->fsgid = new->egid;
 +
 +      return commit_creds(new);
 +
 +error:
 +      abort_creds(new);
 +      return retval;
  }
  
  /*
   */
  asmlinkage long sys_setgid(gid_t gid)
  {
 -      int old_egid = current->egid;
 +      const struct cred *old;
 +      struct cred *new;
        int retval;
  
 +      new = prepare_creds();
 +      if (!new)
 +              return -ENOMEM;
 +      old = current_cred();
 +
        retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
        if (retval)
 -              return retval;
 +              goto error;
  
 -      if (capable(CAP_SETGID)) {
 -              if (old_egid != gid) {
 -                      set_dumpable(current->mm, suid_dumpable);
 -                      smp_wmb();
 -              }
 -              current->gid = current->egid = current->sgid = current->fsgid = gid;
 -      } else if ((gid == current->gid) || (gid == current->sgid)) {
 -              if (old_egid != gid) {
 -                      set_dumpable(current->mm, suid_dumpable);
 -                      smp_wmb();
 -              }
 -              current->egid = current->fsgid = gid;
 -      }
 +      retval = -EPERM;
 +      if (capable(CAP_SETGID))
 +              new->gid = new->egid = new->sgid = new->fsgid = gid;
 +      else if (gid == old->gid || gid == old->sgid)
 +              new->egid = new->fsgid = gid;
        else
 -              return -EPERM;
 +              goto error;
  
 -      key_fsgid_changed(current);
 -      proc_id_connector(current, PROC_EVENT_GID);
 -      return 0;
 +      return commit_creds(new);
 +
 +error:
 +      abort_creds(new);
 +      return retval;
  }
    
 -static int set_user(uid_t new_ruid, int dumpclear)
 +/*
 + * change the user struct in a credentials set to match the new UID
 + */
 +static int set_user(struct cred *new)
  {
        struct user_struct *new_user;
  
 -      new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
 +      new_user = alloc_uid(current_user_ns(), new->uid);
        if (!new_user)
                return -EAGAIN;
  
        if (atomic_read(&new_user->processes) >=
                                current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
 -                      new_user != current->nsproxy->user_ns->root_user) {
 +                      new_user != INIT_USER) {
                free_uid(new_user);
                return -EAGAIN;
        }
  
 -      switch_uid(new_user);
 -
 -      if (dumpclear) {
 -              set_dumpable(current->mm, suid_dumpable);
 -              smp_wmb();
 -      }
 -      current->uid = new_ruid;
 +      free_uid(new->user);
 +      new->user = new_user;
        return 0;
  }
  
   */
  asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
  {
 -      int old_ruid, old_euid, old_suid, new_ruid, new_euid;
 +      const struct cred *old;
 +      struct cred *new;
        int retval;
  
 +      new = prepare_creds();
 +      if (!new)
 +              return -ENOMEM;
 +      old = current_cred();
 +
        retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
        if (retval)
 -              return retval;
 -
 -      new_ruid = old_ruid = current->uid;
 -      new_euid = old_euid = current->euid;
 -      old_suid = current->suid;
 +              goto error;
  
 +      retval = -EPERM;
        if (ruid != (uid_t) -1) {
 -              new_ruid = ruid;
 -              if ((old_ruid != ruid) &&
 -                  (current->euid != ruid) &&
 +              new->uid = ruid;
 +              if (old->uid != ruid &&
 +                  old->euid != ruid &&
                    !capable(CAP_SETUID))
 -                      return -EPERM;
 +                      goto error;
        }
  
        if (euid != (uid_t) -1) {
 -              new_euid = euid;
 -              if ((old_ruid != euid) &&
 -                  (current->euid != euid) &&
 -                  (current->suid != euid) &&
 +              new->euid = euid;
 +              if (old->uid != euid &&
 +                  old->euid != euid &&
 +                  old->suid != euid &&
                    !capable(CAP_SETUID))
 -                      return -EPERM;
 +                      goto error;
        }
  
 -      if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
 -              return -EAGAIN;
 +      retval = -EAGAIN;
 +      if (new->uid != old->uid && set_user(new) < 0)
 +              goto error;
  
 -      if (new_euid != old_euid) {
 -              set_dumpable(current->mm, suid_dumpable);
 -              smp_wmb();
 -      }
 -      current->fsuid = current->euid = new_euid;
        if (ruid != (uid_t) -1 ||
 -          (euid != (uid_t) -1 && euid != old_ruid))
 -              current->suid = current->euid;
 -      current->fsuid = current->euid;
 +          (euid != (uid_t) -1 && euid != old->uid))
 +              new->suid = new->euid;
 +      new->fsuid = new->euid;
  
 -      key_fsuid_changed(current);
 -      proc_id_connector(current, PROC_EVENT_UID);
 -
 -      return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
 -}
 +      retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
 +      if (retval < 0)
 +              goto error;
  
 +      return commit_creds(new);
  
 +error:
 +      abort_creds(new);
 +      return retval;
 +}
                
  /*
   * setuid() is implemented like SysV with SAVED_IDS 
   */
  asmlinkage long sys_setuid(uid_t uid)
  {
 -      int old_euid = current->euid;
 -      int old_ruid, old_suid, new_suid;
 +      const struct cred *old;
 +      struct cred *new;
        int retval;
  
 +      new = prepare_creds();
 +      if (!new)
 +              return -ENOMEM;
 +      old = current_cred();
 +
        retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
        if (retval)
 -              return retval;
 +              goto error;
  
 -      old_ruid = current->uid;
 -      old_suid = current->suid;
 -      new_suid = old_suid;
 -      
 +      retval = -EPERM;
        if (capable(CAP_SETUID)) {
 -              if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
 -                      return -EAGAIN;
 -              new_suid = uid;
 -      } else if ((uid != current->uid) && (uid != new_suid))
 -              return -EPERM;
 -
 -      if (old_euid != uid) {
 -              set_dumpable(current->mm, suid_dumpable);
 -              smp_wmb();
 +              new->suid = new->uid = uid;
 +              if (uid != old->uid && set_user(new) < 0) {
 +                      retval = -EAGAIN;
 +                      goto error;
 +              }
 +      } else if (uid != old->uid && uid != new->suid) {
 +              goto error;
        }
 -      current->fsuid = current->euid = uid;
 -      current->suid = new_suid;
  
 -      key_fsuid_changed(current);
 -      proc_id_connector(current, PROC_EVENT_UID);
 +      new->fsuid = new->euid = uid;
 +
 +      retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
 +      if (retval < 0)
 +              goto error;
  
 -      return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
 +      return commit_creds(new);
 +
 +error:
 +      abort_creds(new);
 +      return retval;
  }
  
  
   */
  asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
  {
 -      int old_ruid = current->uid;
 -      int old_euid = current->euid;
 -      int old_suid = current->suid;
 +      const struct cred *old;
 +      struct cred *new;
        int retval;
  
 +      new = prepare_creds();
 +      if (!new)
 +              return -ENOMEM;
 +
        retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
        if (retval)
 -              return retval;
 +              goto error;
 +      old = current_cred();
  
 +      retval = -EPERM;
        if (!capable(CAP_SETUID)) {
 -              if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
 -                  (ruid != current->euid) && (ruid != current->suid))
 -                      return -EPERM;
 -              if ((euid != (uid_t) -1) && (euid != current->uid) &&
 -                  (euid != current->euid) && (euid != current->suid))
 -                      return -EPERM;
 -              if ((suid != (uid_t) -1) && (suid != current->uid) &&
 -                  (suid != current->euid) && (suid != current->suid))
 -                      return -EPERM;
 +              if (ruid != (uid_t) -1 && ruid != old->uid &&
 +                  ruid != old->euid  && ruid != old->suid)
 +                      goto error;
 +              if (euid != (uid_t) -1 && euid != old->uid &&
 +                  euid != old->euid  && euid != old->suid)
 +                      goto error;
 +              if (suid != (uid_t) -1 && suid != old->uid &&
 +                  suid != old->euid  && suid != old->suid)
 +                      goto error;
        }
 +
 +      retval = -EAGAIN;
        if (ruid != (uid_t) -1) {
 -              if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
 -                      return -EAGAIN;
 +              new->uid = ruid;
 +              if (ruid != old->uid && set_user(new) < 0)
 +                      goto error;
        }
 -      if (euid != (uid_t) -1) {
 -              if (euid != current->euid) {
 -                      set_dumpable(current->mm, suid_dumpable);
 -                      smp_wmb();
 -              }
 -              current->euid = euid;
 -      }
 -      current->fsuid = current->euid;
 +      if (euid != (uid_t) -1)
 +              new->euid = euid;
        if (suid != (uid_t) -1)
 -              current->suid = suid;
 +              new->suid = suid;
 +      new->fsuid = new->euid;
 +
 +      retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
 +      if (retval < 0)
 +              goto error;
  
 -      key_fsuid_changed(current);
 -      proc_id_connector(current, PROC_EVENT_UID);
 +      return commit_creds(new);
  
 -      return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
 +error:
 +      abort_creds(new);
 +      return retval;
  }
  
  asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
  {
 +      const struct cred *cred = current_cred();
        int retval;
  
 -      if (!(retval = put_user(current->uid, ruid)) &&
 -          !(retval = put_user(current->euid, euid)))
 -              retval = put_user(current->suid, suid);
 +      if (!(retval   = put_user(cred->uid,  ruid)) &&
 +          !(retval   = put_user(cred->euid, euid)))
 +              retval = put_user(cred->suid, suid);
  
        return retval;
  }
   */
  asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
  {
 +      const struct cred *old;
 +      struct cred *new;
        int retval;
  
 +      new = prepare_creds();
 +      if (!new)
 +              return -ENOMEM;
 +      old = current_cred();
 +
        retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
        if (retval)
 -              return retval;
 +              goto error;
  
 +      retval = -EPERM;
        if (!capable(CAP_SETGID)) {
 -              if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
 -                  (rgid != current->egid) && (rgid != current->sgid))
 -                      return -EPERM;
 -              if ((egid != (gid_t) -1) && (egid != current->gid) &&
 -                  (egid != current->egid) && (egid != current->sgid))
 -                      return -EPERM;
 -              if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
 -                  (sgid != current->egid) && (sgid != current->sgid))
 -                      return -EPERM;
 +              if (rgid != (gid_t) -1 && rgid != old->gid &&
 +                  rgid != old->egid  && rgid != old->sgid)
 +                      goto error;
 +              if (egid != (gid_t) -1 && egid != old->gid &&
 +                  egid != old->egid  && egid != old->sgid)
 +                      goto error;
 +              if (sgid != (gid_t) -1 && sgid != old->gid &&
 +                  sgid != old->egid  && sgid != old->sgid)
 +                      goto error;
        }
 -      if (egid != (gid_t) -1) {
 -              if (egid != current->egid) {
 -                      set_dumpable(current->mm, suid_dumpable);
 -                      smp_wmb();
 -              }
 -              current->egid = egid;
 -      }
 -      current->fsgid = current->egid;
 +
        if (rgid != (gid_t) -1)
 -              current->gid = rgid;
 +              new->gid = rgid;
 +      if (egid != (gid_t) -1)
 +              new->egid = egid;
        if (sgid != (gid_t) -1)
 -              current->sgid = sgid;
 +              new->sgid = sgid;
 +      new->fsgid = new->egid;
  
 -      key_fsgid_changed(current);
 -      proc_id_connector(current, PROC_EVENT_GID);
 -      return 0;
 +      return commit_creds(new);
 +
 +error:
 +      abort_creds(new);
 +      return retval;
  }
  
  asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
  {
 +      const struct cred *cred = current_cred();
        int retval;
  
 -      if (!(retval = put_user(current->gid, rgid)) &&
 -          !(retval = put_user(current->egid, egid)))
 -              retval = put_user(current->sgid, sgid);
 +      if (!(retval   = put_user(cred->gid,  rgid)) &&
 +          !(retval   = put_user(cred->egid, egid)))
 +              retval = put_user(cred->sgid, sgid);
  
        return retval;
  }
   */
  asmlinkage long sys_setfsuid(uid_t uid)
  {
 -      int old_fsuid;
 +      const struct cred *old;
 +      struct cred *new;
 +      uid_t old_fsuid;
  
 -      old_fsuid = current->fsuid;
 -      if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
 -              return old_fsuid;
 +      new = prepare_creds();
 +      if (!new)
 +              return current_fsuid();
 +      old = current_cred();
 +      old_fsuid = old->fsuid;
  
 -      if (uid == current->uid || uid == current->euid ||
 -          uid == current->suid || uid == current->fsuid || 
 +      if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
 +              goto error;
 +
 +      if (uid == old->uid  || uid == old->euid  ||
 +          uid == old->suid || uid == old->fsuid ||
            capable(CAP_SETUID)) {
                if (uid != old_fsuid) {
 -                      set_dumpable(current->mm, suid_dumpable);
 -                      smp_wmb();
 +                      new->fsuid = uid;
 +                      if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
 +                              goto change_okay;
                }
 -              current->fsuid = uid;
        }
  
 -      key_fsuid_changed(current);
 -      proc_id_connector(current, PROC_EVENT_UID);
 -
 -      security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
 +error:
 +      abort_creds(new);
 +      return old_fsuid;
  
 +change_okay:
 +      commit_creds(new);
        return old_fsuid;
  }
  
   */
  asmlinkage long sys_setfsgid(gid_t gid)
  {
 -      int old_fsgid;
 +      const struct cred *old;
 +      struct cred *new;
 +      gid_t old_fsgid;
 +
 +      new = prepare_creds();
 +      if (!new)
 +              return current_fsgid();
 +      old = current_cred();
 +      old_fsgid = old->fsgid;
  
 -      old_fsgid = current->fsgid;
        if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
 -              return old_fsgid;
 +              goto error;
  
 -      if (gid == current->gid || gid == current->egid ||
 -          gid == current->sgid || gid == current->fsgid || 
 +      if (gid == old->gid  || gid == old->egid  ||
 +          gid == old->sgid || gid == old->fsgid ||
            capable(CAP_SETGID)) {
                if (gid != old_fsgid) {
 -                      set_dumpable(current->mm, suid_dumpable);
 -                      smp_wmb();
 +                      new->fsgid = gid;
 +                      goto change_okay;
                }
 -              current->fsgid = gid;
 -              key_fsgid_changed(current);
 -              proc_id_connector(current, PROC_EVENT_GID);
        }
 +
 +error:
 +      abort_creds(new);
 +      return old_fsgid;
 +
 +change_okay:
 +      commit_creds(new);
        return old_fsgid;
  }
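
Note that sys_setfsuid()/sys_setfsgid() keep their historical calling convention: they return the previous fsuid/fsgid whether or not the change was applied, so there is no direct error code. A hedged user-space sketch of that convention (setfsuid() is declared in <sys/fsuid.h> on glibc; setting the fsuid to the caller's own real uid is always permitted):

#include <stdio.h>
#include <unistd.h>
#include <sys/fsuid.h>

int main(void)
{
	/* The return value is the *previous* fsuid, not 0 or -1. */
	int prev = setfsuid(getuid());

	printf("previous fsuid was %d\n", prev);
	return 0;
}
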
  
@@@ -907,8 -858,8 +907,8 @@@ void do_sys_times(struct tms *tms
        struct task_cputime cputime;
        cputime_t cutime, cstime;
  
-       spin_lock_irq(&current->sighand->siglock);
        thread_group_cputime(current, &cputime);
+       spin_lock_irq(&current->sighand->siglock);
        cutime = current->signal->cutime;
        cstime = current->signal->cstime;
        spin_unlock_irq(&current->sighand->siglock);
@@@ -1167,7 -1118,7 +1167,7 @@@ EXPORT_SYMBOL(groups_free)
  
  /* export the group_info to a user-space array */
  static int groups_to_user(gid_t __user *grouplist,
 -    struct group_info *group_info)
 +                        const struct group_info *group_info)
  {
        int i;
        unsigned int count = group_info->ngroups;
@@@ -1235,7 -1186,7 +1235,7 @@@ static void groups_sort(struct group_in
  }
  
  /* a simple bsearch */
 -int groups_search(struct group_info *group_info, gid_t grp)
 +int groups_search(const struct group_info *group_info, gid_t grp)
  {
        unsigned int left, right;
  
        return 0;
  }
  
 -/* validate and set current->group_info */
 -int set_current_groups(struct group_info *group_info)
 +/**
 + * set_groups - Change a group subscription in a set of credentials
 + * @new: The newly prepared set of credentials to alter
 + * @group_info: The group list to install
 + *
 + * Validate a group subscription and, if valid, insert it into a set
 + * of credentials.
 + */
 +int set_groups(struct cred *new, struct group_info *group_info)
  {
        int retval;
 -      struct group_info *old_info;
  
        retval = security_task_setgroups(group_info);
        if (retval)
                return retval;
  
 +      put_group_info(new->group_info);
        groups_sort(group_info);
        get_group_info(group_info);
 +      new->group_info = group_info;
 +      return 0;
 +}
 +
 +EXPORT_SYMBOL(set_groups);
  
 -      task_lock(current);
 -      old_info = current->group_info;
 -      current->group_info = group_info;
 -      task_unlock(current);
 +/**
 + * set_current_groups - Change current's group subscription
 + * @group_info: The group list to impose
 + *
 + * Validate a group subscription and, if valid, impose it upon current's task
 + * security record.
 + */
 +int set_current_groups(struct group_info *group_info)
 +{
 +      struct cred *new;
 +      int ret;
  
 -      put_group_info(old_info);
 +      new = prepare_creds();
 +      if (!new)
 +              return -ENOMEM;
  
 -      return 0;
 +      ret = set_groups(new, group_info);
 +      if (ret < 0) {
 +              abort_creds(new);
 +              return ret;
 +      }
 +
 +      return commit_creds(new);
  }
  
  EXPORT_SYMBOL(set_current_groups);
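
set_groups() is split out so that code which already holds a prepared credential set can splice a group list into it, while set_current_groups() keeps the old one-call interface for existing callers. A hedged in-kernel sketch of the latter (the function name is illustrative; set_current_groups() takes its own reference via set_groups(), so the caller still drops the allocation reference):

static int example_install_single_group(gid_t gid)
{
	struct group_info *gi;
	int ret;

	gi = groups_alloc(1);
	if (!gi)
		return -ENOMEM;
	GROUP_AT(gi, 0) = gid;		/* fill the single slot */

	ret = set_current_groups(gi);	/* sorts, validates and installs */
	put_group_info(gi);		/* drop our allocation reference */
	return ret;
}
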
  
  asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
  {
 -      int i = 0;
 -
 -      /*
 -       *      SMP: Nobody else can change our grouplist. Thus we are
 -       *      safe.
 -       */
 +      const struct cred *cred = current_cred();
 +      int i;
  
        if (gidsetsize < 0)
                return -EINVAL;
  
        /* no need to grab task_lock here; it cannot change */
 -      i = current->group_info->ngroups;
 +      i = cred->group_info->ngroups;
        if (gidsetsize) {
                if (i > gidsetsize) {
                        i = -EINVAL;
                        goto out;
                }
 -              if (groups_to_user(grouplist, current->group_info)) {
 +              if (groups_to_user(grouplist, cred->group_info)) {
                        i = -EFAULT;
                        goto out;
                }
@@@ -1368,11 -1296,9 +1368,11 @@@ asmlinkage long sys_setgroups(int gidse
   */
  int in_group_p(gid_t grp)
  {
 +      const struct cred *cred = current_cred();
        int retval = 1;
 -      if (grp != current->fsgid)
 -              retval = groups_search(current->group_info, grp);
 +
 +      if (grp != cred->fsgid)
 +              retval = groups_search(cred->group_info, grp);
        return retval;
  }
  
@@@ -1380,11 -1306,9 +1380,11 @@@ EXPORT_SYMBOL(in_group_p)
  
  int in_egroup_p(gid_t grp)
  {
 +      const struct cred *cred = current_cred();
        int retval = 1;
 -      if (grp != current->egid)
 -              retval = groups_search(current->group_info, grp);
 +
 +      if (grp != cred->egid)
 +              retval = groups_search(cred->group_info, grp);
        return retval;
  }
  
@@@ -1700,56 -1624,50 +1700,56 @@@ asmlinkage long sys_umask(int mask
  asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
                          unsigned long arg4, unsigned long arg5)
  {
 -      long error = 0;
 +      struct task_struct *me = current;
 +      unsigned char comm[sizeof(me->comm)];
 +      long error;
  
 -      if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
 +      error = security_task_prctl(option, arg2, arg3, arg4, arg5);
 +      if (error != -ENOSYS)
                return error;
  
 +      error = 0;
        switch (option) {
                case PR_SET_PDEATHSIG:
                        if (!valid_signal(arg2)) {
                                error = -EINVAL;
                                break;
                        }
 -                      current->pdeath_signal = arg2;
 +                      me->pdeath_signal = arg2;
 +                      error = 0;
                        break;
                case PR_GET_PDEATHSIG:
 -                      error = put_user(current->pdeath_signal, (int __user *)arg2);
 +                      error = put_user(me->pdeath_signal, (int __user *)arg2);
                        break;
                case PR_GET_DUMPABLE:
 -                      error = get_dumpable(current->mm);
 +                      error = get_dumpable(me->mm);
                        break;
                case PR_SET_DUMPABLE:
                        if (arg2 < 0 || arg2 > 1) {
                                error = -EINVAL;
                                break;
                        }
 -                      set_dumpable(current->mm, arg2);
 +                      set_dumpable(me->mm, arg2);
 +                      error = 0;
                        break;
  
                case PR_SET_UNALIGN:
 -                      error = SET_UNALIGN_CTL(current, arg2);
 +                      error = SET_UNALIGN_CTL(me, arg2);
                        break;
                case PR_GET_UNALIGN:
 -                      error = GET_UNALIGN_CTL(current, arg2);
 +                      error = GET_UNALIGN_CTL(me, arg2);
                        break;
                case PR_SET_FPEMU:
 -                      error = SET_FPEMU_CTL(current, arg2);
 +                      error = SET_FPEMU_CTL(me, arg2);
                        break;
                case PR_GET_FPEMU:
 -                      error = GET_FPEMU_CTL(current, arg2);
 +                      error = GET_FPEMU_CTL(me, arg2);
                        break;
                case PR_SET_FPEXC:
 -                      error = SET_FPEXC_CTL(current, arg2);
 +                      error = SET_FPEXC_CTL(me, arg2);
                        break;
                case PR_GET_FPEXC:
 -                      error = GET_FPEXC_CTL(current, arg2);
 +                      error = GET_FPEXC_CTL(me, arg2);
                        break;
                case PR_GET_TIMING:
                        error = PR_TIMING_STATISTICAL;
                case PR_SET_TIMING:
                        if (arg2 != PR_TIMING_STATISTICAL)
                                error = -EINVAL;
 +                      else
 +                              error = 0;
                        break;
  
 -              case PR_SET_NAME: {
 -                      struct task_struct *me = current;
 -                      unsigned char ncomm[sizeof(me->comm)];
 -
 -                      ncomm[sizeof(me->comm)-1] = 0;
 -                      if (strncpy_from_user(ncomm, (char __user *)arg2,
 -                                              sizeof(me->comm)-1) < 0)
 +              case PR_SET_NAME:
 +                      comm[sizeof(me->comm)-1] = 0;
 +                      if (strncpy_from_user(comm, (char __user *)arg2,
 +                                            sizeof(me->comm) - 1) < 0)
                                return -EFAULT;
 -                      set_task_comm(me, ncomm);
 +                      set_task_comm(me, comm);
                        return 0;
 -              }
 -              case PR_GET_NAME: {
 -                      struct task_struct *me = current;
 -                      unsigned char tcomm[sizeof(me->comm)];
 -
 -                      get_task_comm(tcomm, me);
 -                      if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
 +              case PR_GET_NAME:
 +                      get_task_comm(comm, me);
 +                      if (copy_to_user((char __user *)arg2, comm,
 +                                       sizeof(comm)))
                                return -EFAULT;
                        return 0;
 -              }
                case PR_GET_ENDIAN:
 -                      error = GET_ENDIAN(current, arg2);
 +                      error = GET_ENDIAN(me, arg2);
                        break;
                case PR_SET_ENDIAN:
 -                      error = SET_ENDIAN(current, arg2);
 +                      error = SET_ENDIAN(me, arg2);
                        break;
  
                case PR_GET_SECCOMP:
                                        current->default_timer_slack_ns;
                        else
                                current->timer_slack_ns = arg2;
 +                      error = 0;
                        break;
                default:
                        error = -EINVAL;
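
The PR_SET_NAME/PR_GET_NAME cases above now share one on-stack comm[] buffer, but the user-visible behaviour is unchanged: a 16-byte, NUL-terminated task name, truncated if longer. A small runnable check:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	char name[16];		/* TASK_COMM_LEN: 15 characters + NUL */

	if (prctl(PR_SET_NAME, "example-worker", 0, 0, 0) != 0)
		perror("PR_SET_NAME");

	if (prctl(PR_GET_NAME, name, 0, 0, 0) != 0)
		perror("PR_GET_NAME");
	else
		printf("comm is now \"%s\"\n", name);
	return 0;
}
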
diff --combined mm/memory.c
index f01b7eed6e16c4e3f32e039b7f2223ca598f40aa,fc031d68327e5fad33130b15d50a020b7b9b31a8..0a2010a9518c499efe694c9ed0d42f5bd18e459b
@@@ -669,16 -669,6 +669,16 @@@ int copy_page_range(struct mm_struct *d
        if (is_vm_hugetlb_page(vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, vma);
  
 +      if (unlikely(is_pfn_mapping(vma))) {
 +              /*
 +               * We do not free on error cases below as remove_vma
 +               * gets called on error from higher level routine
 +               */
 +              ret = track_pfn_vma_copy(vma);
 +              if (ret)
 +                      return ret;
 +      }
 +
        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
@@@ -925,9 -915,6 +925,9 @@@ unsigned long unmap_vmas(struct mmu_gat
                if (vma->vm_flags & VM_ACCOUNT)
                        *nr_accounted += (end - start) >> PAGE_SHIFT;
  
 +              if (unlikely(is_pfn_mapping(vma)))
 +                      untrack_pfn_vma(vma, 0, 0);
 +
                while (start != end) {
                        if (!tlb_start_valid) {
                                tlb_start = start;
@@@ -1443,7 -1430,6 +1443,7 @@@ out
  int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
  {
 +      int ret;
        /*
         * Technically, architectures with pte_special can avoid all these
         * restrictions (same for remap_pfn_range).  However we would like
  
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
 -      return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
 +      if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE))
 +              return -EINVAL;
 +
 +      ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot);
 +
 +      if (ret)
 +              untrack_pfn_vma(vma, pfn, PAGE_SIZE);
 +
 +      return ret;
  }
  EXPORT_SYMBOL(vm_insert_pfn);
  
@@@ -1597,17 -1575,14 +1597,17 @@@ int remap_pfn_range(struct vm_area_stru
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         */
 -      if (is_cow_mapping(vma->vm_flags)) {
 -              if (addr != vma->vm_start || end != vma->vm_end)
 -                      return -EINVAL;
 +      if (addr == vma->vm_start && end == vma->vm_end)
                vma->vm_pgoff = pfn;
 -      }
 +      else if (is_cow_mapping(vma->vm_flags))
 +              return -EINVAL;
  
        vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
  
 +      err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size));
 +      if (err)
 +              return -EINVAL;
 +
        BUG_ON(addr >= end);
        pfn -= addr >> PAGE_SHIFT;
        pgd = pgd_offset(mm, addr);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
 +
 +      if (err)
 +              untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
 +
        return err;
  }
  EXPORT_SYMBOL(remap_pfn_range);
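
remap_pfn_range() is the path most drivers hit, so the track_pfn_vma_new()/untrack_pfn_vma() pairing added above is what gives the PFN tracker a record of their mappings. A hedged sketch of the usual caller, a character-device ->mmap handler (EXAMPLE_PHYS_BASE is a made-up register base, not something from this tree):

static int example_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	unsigned long pfn = EXAMPLE_PHYS_BASE >> PAGE_SHIFT;	/* hypothetical */

	/* remap_pfn_range() now registers the range with the PFN tracker
	 * and unwinds the registration if the page-table walk fails. */
	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}
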
@@@ -2894,9 -2865,9 +2894,9 @@@ int in_gate_area_no_task(unsigned long 
  #endif        /* __HAVE_ARCH_GATE_AREA */
  
  #ifdef CONFIG_HAVE_IOREMAP_PROT
 -static resource_size_t follow_phys(struct vm_area_struct *vma,
 -                      unsigned long address, unsigned int flags,
 -                      unsigned long *prot)
 +int follow_phys(struct vm_area_struct *vma,
 +              unsigned long address, unsigned int flags,
 +              unsigned long *prot, resource_size_t *phys)
  {
        pgd_t *pgd;
        pud_t *pud;
        spinlock_t *ptl;
        resource_size_t phys_addr = 0;
        struct mm_struct *mm = vma->vm_mm;
 +      int ret = -EINVAL;
  
 -      VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
 +      if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
 +              goto out;
  
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 -              goto no_page_table;
 +              goto out;
  
        pud = pud_offset(pgd, address);
        if (pud_none(*pud) || unlikely(pud_bad(*pud)))
 -              goto no_page_table;
 +              goto out;
  
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
 -              goto no_page_table;
 +              goto out;
  
        /* We cannot handle huge page PFN maps. Luckily they don't exist. */
        if (pmd_huge(*pmd))
 -              goto no_page_table;
 +              goto out;
  
        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!ptep)
        phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
  
        *prot = pgprot_val(pte_pgprot(pte));
 +      *phys = phys_addr;
 +      ret = 0;
  
  unlock:
        pte_unmap_unlock(ptep, ptl);
  out:
 -      return phys_addr;
 -no_page_table:
 -      return 0;
 +      return ret;
  }
  
  int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
        void *maddr;
        int offset = addr & (PAGE_SIZE-1);
  
 -      if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
 -              return -EINVAL;
 -
 -      phys_addr = follow_phys(vma, addr, write, &prot);
 -
 -      if (!phys_addr)
 +      if (follow_phys(vma, addr, write, &prot, &phys_addr))
                return -EINVAL;
  
        maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
@@@ -3075,3 -3049,18 +3075,18 @@@ void print_vma_addr(char *prefix, unsig
        }
        up_read(&current->mm->mmap_sem);
  }
+ #ifdef CONFIG_PROVE_LOCKING
+ void might_fault(void)
+ {
+       might_sleep();
+       /*
+        * it would be nicer only to annotate paths which are not under
+        * pagefault_disable, however that requires a larger audit and
+        * providing helpers like get_user_atomic.
+        */
+       if (!in_atomic() && current->mm)
+               might_lock_read(&current->mm->mmap_sem);
+ }
+ EXPORT_SYMBOL(might_fault);
+ #endif
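
might_fault() is the lockdep annotation for any path that may take a page fault on a user address: it asserts that the caller can sleep and, outside of atomic context, records a potential read acquisition of mmap_sem. A hedged sketch of the kind of helper that would carry it; this wrapper is illustrative only, the real annotations belong in the user-copy paths themselves:

static inline unsigned long
example_copy_to_user(void __user *to, const void *from, unsigned long n)
{
	might_fault();		/* may sleep and take mmap_sem for read */
	return copy_to_user(to, from, n);
}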