* 'core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (63 commits)
stacktrace: provide save_stack_trace_tsk() weak alias
rcu: provide RCU options on non-preempt architectures too
printk: fix discarding message when recursion_bug
futex: clean up futex_(un)lock_pi fault handling
"Tree RCU": scalable classic RCU implementation
futex: rename field in futex_q to clarify single waiter semantics
x86/swiotlb: add default swiotlb_arch_range_needs_mapping
x86/swiotlb: add default phys<->bus conversion
x86: unify pci iommu setup and allow swiotlb to compile for 32 bit
x86: add swiotlb allocation functions
swiotlb: consolidate swiotlb info message printing
swiotlb: support bouncing of HighMem pages
swiotlb: factor out copy to/from device
swiotlb: add arch hook to force mapping
swiotlb: allow architectures to override phys<->bus<->phys conversions
swiotlb: add comment where we handle the overflow of a dma mask on 32 bit
rcu: fix rcutorture behavior during reboot
resources: skip sanity check of busy resources
swiotlb: move some definitions to header
swiotlb: allow architectures to override swiotlb pool allocation
...
Fix up trivial conflicts in
arch/x86/kernel/Makefile
arch/x86/mm/init_32.c
include/linux/hardirq.h
as per Ingo's suggestions.
return dma_ops;
else
return dev->archdata.dma_ops;
- #endif /* _ASM_X86_DMA_MAPPING_H */
+ #endif
}
/* Make sure we keep the same behaviour */
static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
-#ifdef CONFIG_X86_64
struct dma_mapping_ops *ops = get_dma_ops(dev);
if (ops->mapping_error)
return ops->mapping_error(dev, dma_addr);
-#endif
return (dma_addr == bad_dma_address);
}
extern int force_iommu, no_iommu;
extern int iommu_detected;
- extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
-
/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
-#ifdef CONFIG_GART_IOMMU
-extern int gart_iommu_aperture;
-extern int gart_iommu_aperture_allowed;
-extern int gart_iommu_aperture_disabled;
-
-extern void early_gart_iommu_check(void);
-extern void gart_iommu_init(void);
-extern void gart_iommu_shutdown(void);
-extern void __init gart_parse_options(char *);
-extern void gart_iommu_hole_init(void);
-
-#else
-#define gart_iommu_aperture 0
-#define gart_iommu_aperture_allowed 0
-#define gart_iommu_aperture_disabled 1
-
-static inline void early_gart_iommu_check(void)
-{
-}
-static inline void gart_iommu_init(void)
-{
-}
-static inline void gart_iommu_shutdown(void)
-{
-}
-static inline void gart_parse_options(char *options)
-{
-}
-static inline void gart_iommu_hole_init(void)
-{
-}
-#endif
-
#endif /* _ASM_X86_IOMMU_H */
};
extern int pci_routeirq;
+extern int noioapicquirk;
+extern int noioapicreroute;
/* scan a bus after allocating a pci_sysdata for it */
extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
static inline void early_quirks(void) { }
#endif
+ extern void pci_iommu_alloc(void);
+
#endif /* __KERNEL__ */
#ifdef CONFIG_X86_32
int __ret_gu; \
unsigned long __val_gu; \
__chk_user_ptr(ptr); \
+ might_fault(); \
switch (sizeof(*(ptr))) { \
case 1: \
__get_user_x(1, __ret_gu, __val_gu, ptr); \
int __ret_pu; \
__typeof__(*(ptr)) __pu_val; \
__chk_user_ptr(ptr); \
+ might_fault(); \
__pu_val = x; \
switch (sizeof(*(ptr))) { \
case 1: \
#define __put_user_nocheck(x, ptr, size) \
({ \
- long __pu_err; \
+ int __pu_err; \
__put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
__pu_err; \
})
#define __get_user_nocheck(x, ptr, size) \
({ \
- long __gu_err; \
+ int __gu_err; \
unsigned long __gu_val; \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
CFLAGS_REMOVE_rtc.o = -pg
CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
endif
#
CFLAGS_hpet.o := $(nostackp)
CFLAGS_tsc.o := $(nostackp)
-obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
+obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time_$(BITS).o ioport.o ldt.o
+obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
- obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
endif
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/iommu.h>
+#include <asm/gart.h>
#include <asm/calgary.h>
#include <asm/amd_iommu.h>
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
-/* This tells the BIO block layer to assume merging. Default to off
- because we cannot guarantee merging later. */
-int iommu_bio_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_bio_merge);
-
dma_addr_t bad_dma_address __read_mostly = 0;
EXPORT_SYMBOL(bad_dma_address);
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
+ #endif
void __init pci_iommu_alloc(void)
{
+ #ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
+ #endif
+
/*
* The order of these functions is important for
* fall-back/fail-over reasons
pci_swiotlb_init();
}
- unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
- {
- unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
-
- return size >> PAGE_SHIFT;
- }
- EXPORT_SYMBOL(iommu_nr_pages);
- #endif
-
void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag)
{
}
if (!strncmp(p, "biomerge", 8)) {
- iommu_bio_merge = 4096;
iommu_merge = 1;
force_iommu = 1;
}
static __devinit void via_no_dac(struct pci_dev *dev)
{
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO "PCI: VIA PCI bridge detected."
- "Disabling DAC.\n");
+ printk(KERN_INFO
+ "PCI: VIA PCI bridge detected. Disabling DAC.\n");
forbid_dac = 1;
}
}
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+ #include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
static int __initdata after_init_bootmem;
-static __init void *alloc_low_page(unsigned long *phys)
+static __init void *alloc_low_page(void)
{
unsigned long pfn = table_end++;
void *adr;
adr = __va(pfn * PAGE_SIZE);
memset(adr, 0, PAGE_SIZE);
- *phys = pfn * PAGE_SIZE;
return adr;
}
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
- unsigned long phys;
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
if (after_init_bootmem)
pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
else
- pmd_table = (pmd_t *)alloc_low_page(&phys);
+ pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+ return pmd_table;
}
#endif
pud = pud_offset(pgd, 0);
if (!page_table)
page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- } else {
- unsigned long phys;
- page_table = (pte_t *)alloc_low_page(&phys);
- }
+ } else
+ page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
int codesize, reservedpages, datasize, initsize;
int tmp;
- start_periodic_check_for_corruption();
-
+ pci_iommu_alloc();
+
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
(unsigned long)&_text, (unsigned long)&_etext,
((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+ /*
+ * Check boundaries twice: Some fundamental inconsistencies can
+ * be detected at build time already.
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+ BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+ BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
- BUG_ON(VMALLOC_START > VMALLOC_END);
+ BUG_ON(VMALLOC_START >= VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
if (boot_cpu_data.wp_works_ok < 0)
#ifdef CONFIG_GENERIC_BUG
#ifndef __ASSEMBLY__
struct bug_entry {
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
unsigned long bug_addr;
+#else
+ signed int bug_addr_disp;
+#endif
#ifdef CONFIG_DEBUG_BUGVERBOSE
+#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
const char *file;
+#else
+ signed int file_disp;
+#endif
unsigned short line;
#endif
unsigned short flags;
#ifndef __WARN
#ifndef __ASSEMBLY__
- extern void warn_on_slowpath(const char *file, const int line);
extern void warn_slowpath(const char *file, const int line,
const char *fmt, ...) __attribute__((format(printf, 3, 4)));
#define WANT_WARN_ON_SLOWPATH
#endif
- #define __WARN() warn_on_slowpath(__FILE__, __LINE__)
- #define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg)
+ #define __WARN() warn_slowpath(__FILE__, __LINE__, NULL)
+ #define __WARN_printf(arg...) warn_slowpath(__FILE__, __LINE__, arg)
#else
- #define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0)
+ #define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0)
#endif
#ifndef WARN_ON
#include <linux/preempt.h>
#include <linux/smp_lock.h>
#include <linux/lockdep.h>
+#include <linux/ftrace_irq.h>
#include <asm/hardirq.h>
#include <asm/system.h>
}
#endif
- #if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+ #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
extern void rcu_irq_enter(void);
extern void rcu_irq_exit(void);
+ extern void rcu_nmi_enter(void);
+ extern void rcu_nmi_exit(void);
#else
# define rcu_irq_enter() do { } while (0)
# define rcu_irq_exit() do { } while (0)
- #endif /* CONFIG_PREEMPT_RCU */
+ # define rcu_nmi_enter() do { } while (0)
+ # define rcu_nmi_exit() do { } while (0)
+ #endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
/*
* It is safe to do non-atomic ops on ->hardirq_context,
*/
#define __irq_enter() \
do { \
- rcu_irq_enter(); \
account_system_vtime(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
trace_hardirq_exit(); \
account_system_vtime(current); \
sub_preempt_count(HARDIRQ_OFFSET); \
- rcu_irq_exit(); \
} while (0)
/*
*/
extern void irq_exit(void);
-#define nmi_enter() do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0)
-#define nmi_exit() do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0)
+#define nmi_enter() \
+ do { \
+ ftrace_nmi_enter(); \
+ lockdep_off(); \
++ rcu_nmi_enter(); \
+ __irq_enter(); \
+ } while (0)
++
+#define nmi_exit() \
+ do { \
+ __irq_exit(); \
++ rcu_nmi_exit(); \
+ lockdep_on(); \
+ ftrace_nmi_exit(); \
+ } while (0)
#endif /* LINUX_HARDIRQ_H */
(__x < 0) ? -__x : __x; \
})
+ #ifdef CONFIG_PROVE_LOCKING
+ void might_fault(void);
+ #else
+ static inline void might_fault(void)
+ {
+ might_sleep();
+ }
+ #endif
+
extern struct atomic_notifier_head panic_notifier_list;
extern long (*panic_blink)(long time);
NORET_TYPE void panic(const char * fmt, ...)
extern int core_kernel_text(unsigned long addr);
extern int __kernel_text_address(unsigned long addr);
extern int kernel_text_address(unsigned long addr);
+ extern int func_ptr_is_kernel_text(void *ptr);
+
struct pid;
extern struct pid *session_of_pgrp(struct pid *pgrp);
((unsigned char *)&addr)[3]
#define NIPQUAD_FMT "%u.%u.%u.%u"
-#define NIP6(addr) \
- ntohs((addr).s6_addr16[0]), \
- ntohs((addr).s6_addr16[1]), \
- ntohs((addr).s6_addr16[2]), \
- ntohs((addr).s6_addr16[3]), \
- ntohs((addr).s6_addr16[4]), \
- ntohs((addr).s6_addr16[5]), \
- ntohs((addr).s6_addr16[6]), \
- ntohs((addr).s6_addr16[7])
-#define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
-#define NIP6_SEQFMT "%04x%04x%04x%04x%04x%04x%04x%04x"
-
#if defined(__LITTLE_ENDIAN)
#define HIPQUAD(addr) \
((unsigned char *)&addr)[3], \
void (*func)(struct rcu_head *head);
};
- #ifdef CONFIG_CLASSIC_RCU
+ #if defined(CONFIG_CLASSIC_RCU)
#include <linux/rcuclassic.h>
- #else /* #ifdef CONFIG_CLASSIC_RCU */
+ #elif defined(CONFIG_TREE_RCU)
+ #include <linux/rcutree.h>
+ #elif defined(CONFIG_PREEMPT_RCU)
#include <linux/rcupreempt.h>
- #endif /* #else #ifdef CONFIG_CLASSIC_RCU */
+ #else
+ #error "Unknown RCU implementation specified to kernel configuration"
+ #endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
* on the write-side to insure proper synchronization.
*/
#define rcu_read_lock_sched() preempt_disable()
+#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
/*
* rcu_read_unlock_sched - marks the end of a RCU-classic critical section
* See rcu_read_lock_sched for more information.
*/
#define rcu_read_unlock_sched() preempt_enable()
+#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
Say N.
+config KALLSYMS_STRIP_GENERATED
+ bool "Strip machine generated symbols from kallsyms"
+ depends on KALLSYMS_ALL
+ default y
+ help
+ Say N if you want kallsyms to retain even machine generated symbols.
+
config KALLSYMS_EXTRA_PASS
bool "Do an extra kallsyms pass"
depends on KALLSYMS
config MARKERS
bool "Activate markers"
+ depends on TRACEPOINTS
help
Place an empty function call at each marker site. Can be
dynamically changed for a probe function.
config PREEMPT_NOTIFIERS
bool
+ choice
+ prompt "RCU Implementation"
+ default CLASSIC_RCU
+
config CLASSIC_RCU
- def_bool !PREEMPT_RCU
+ bool "Classic RCU"
help
This option selects the classic RCU implementation that is
designed for best read-side performance on non-realtime
- systems. Classic RCU is the default. Note that the
- PREEMPT_RCU symbol is used to select/deselect this option.
+ systems.
+
+ Select this option if you are unsure.
+
+ config TREE_RCU
+ bool "Tree-based hierarchical RCU"
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP system with hundreds or
+ thousands of CPUs.
+
+ config PREEMPT_RCU
+ bool "Preemptible RCU"
+ depends on PREEMPT
+ help
+ This option reduces the latency of the kernel by making certain
+ RCU sections preemptible. Normally RCU code is non-preemptible, if
+ this option is selected then read-only RCU sections become
+ preemptible. This helps latency, but may expose bugs due to
+ now-naive assumptions about each RCU read-side critical section
+ remaining on a given CPU through its execution.
+
+ endchoice
+
+ config RCU_TRACE
+ bool "Enable tracing for RCU"
+ depends on TREE_RCU || PREEMPT_RCU
+ help
+ This option provides tracing in RCU which presents stats
+ in debugfs for debugging RCU implementation.
+
+ Say Y here if you want to enable RCU tracing
+ Say N if you are unsure.
+
+ config RCU_FANOUT
+ int "Tree-based hierarchical RCU fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on TREE_RCU
+ default 64 if 64BIT
+ default 32 if !64BIT
+ help
+ This option controls the fanout of hierarchical implementations
+ of RCU, allowing RCU to work efficiently on machines with
+ large numbers of CPUs. This value must be at least the cube
+ root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+ systems and up to 262,144 for 64-bit systems.
+
+ Select a specific number if testing RCU itself.
+ Take the default if unsure.
+
+ config RCU_FANOUT_EXACT
+ bool "Disable tree-based hierarchical RCU auto-balancing"
+ depends on TREE_RCU
+ default n
+ help
+ This option forces use of the exact RCU_FANOUT value specified,
+ regardless of imbalances in the hierarchy. This is useful for
+ testing RCU itself, and might one day be useful on systems with
+ strong NUMA behavior.
+
+ Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+ Say N if unsure.
+
+ config TREE_RCU_TRACE
+ def_bool RCU_TRACE && TREE_RCU
+ select DEBUG_FS
+ help
+ This option provides tracing for the TREE_RCU implementation,
+ permitting Makefile to trivially select kernel/rcutree_trace.c.
+
+ config PREEMPT_RCU_TRACE
+ def_bool RCU_TRACE && PREEMPT_RCU
+ select DEBUG_FS
+ help
+ This option provides tracing for the PREEMPT_RCU implementation,
+ permitting Makefile to trivially select kernel/rcupreempt_trace.c.
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -pg
endif
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+ obj-$(CONFIG_TREE_RCU) += rcutree.o
obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
- ifeq ($(CONFIG_PREEMPT_RCU),y)
- obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
- endif
+ obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+ obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
-ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
# me. I suspect most platforms don't need this, but until we know that for sure
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
+#include <linux/init_task.h>
#include <trace/sched.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
+#include "cred-internals.h"
+
+DEFINE_TRACE(sched_process_free);
+DEFINE_TRACE(sched_process_exit);
+DEFINE_TRACE(sched_process_wait);
static void exit_mm(struct task_struct * tsk);
int zap_leader;
repeat:
tracehook_prepare_release_task(p);
- atomic_dec(&p->user->processes);
+ /* don't need to get the RCU readlock here - the process is dead and
+ * can't be modifying its own credentials */
+ atomic_dec(&__task_cred(p)->user->processes);
+
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
tracehook_finish_release_task(p);
/* cpus_allowed? */
/* rt_priority? */
/* signals? */
- security_task_reparent_to_init(current);
memcpy(current->signal->rlim, init_task.signal->rlim,
sizeof(current->signal->rlim));
- atomic_inc(&(INIT_USER->__count));
+
+ atomic_inc(&init_cred.usage);
+ commit_creds(&init_cred);
write_unlock_irq(&tasklist_lock);
- switch_uid(INIT_USER);
}
void __set_special_pids(struct pid *pid)
check_stack_usage();
exit_thread();
cgroup_exit(tsk, 1);
- exit_keys(tsk);
if (group_dead && tsk->signal->leader)
disassociate_ctty(1);
preempt_disable();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
-
schedule();
BUG();
/* Avoid "noreturn function does return". */
unsigned long state;
int retval, status, traced;
pid_t pid = task_pid_vnr(p);
+ uid_t uid = __task_cred(p)->uid;
if (!likely(options & WEXITED))
return 0;
if (unlikely(options & WNOWAIT)) {
- uid_t uid = p->uid;
int exit_code = p->exit_code;
int why, status;
* group, which consolidates times for all threads in the
* group including the group leader.
*/
+ thread_group_cputime(p, &cputime);
spin_lock_irq(&p->parent->sighand->siglock);
psig = p->parent->signal;
sig = p->signal;
- thread_group_cputime(p, &cputime);
psig->cutime =
cputime_add(psig->cutime,
cputime_add(cputime.utime,
if (!retval && infop)
retval = put_user(pid, &infop->si_pid);
if (!retval && infop)
- retval = put_user(p->uid, &infop->si_uid);
+ retval = put_user(uid, &infop->si_uid);
if (!retval)
retval = pid;
if (!unlikely(options & WNOWAIT))
p->exit_code = 0;
- uid = p->uid;
+ /* don't need the RCU readlock here as we're holding a spinlock */
+ uid = __task_cred(p)->uid;
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
}
if (!unlikely(options & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
+ uid = __task_cred(p)->uid;
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
- uid = p->uid;
get_task_struct(p);
read_unlock(&tasklist_lock);
*/
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/ftrace.h>
#include <asm/uaccess.h>
#include <asm/sections.h>
return e;
}
-int core_kernel_text(unsigned long addr)
+__notrace_funcgraph int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr <= (unsigned long)_etext)
return 0;
}
-int __kernel_text_address(unsigned long addr)
+__notrace_funcgraph int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return 1;
return module_text_address(addr) != NULL;
}
+
+ /*
+ * On some architectures (PPC64, IA64) function pointers
+ * are actually only tokens to some data that then holds the
+ * real function address. As a result, to find if a function
+ * pointer is part of the kernel text, we need to do some
+ * special dereferencing first.
+ */
+ int func_ptr_is_kernel_text(void *ptr)
+ {
+ unsigned long addr;
+ addr = (unsigned long) dereference_function_descriptor(ptr);
+ if (core_kernel_text(addr))
+ return 1;
+ return module_text_address(addr) != NULL;
+ }
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakup is always to make the first condition true, then
- * wake up q->waiters, then make the second condition true.
+ * wake up q->waiter, then make the second condition true.
*/
struct futex_q {
struct plist_node list;
- wait_queue_head_t waiters;
+ /* There can only be a single waiter */
+ wait_queue_head_t waiter;
/* Which hash list lock to use: */
spinlock_t *lock_ptr;
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
- /*
- * Take mm->mmap_sem, when futex is shared
- */
- static inline void futex_lock_mm(struct rw_semaphore *fshared)
- {
- if (fshared)
- down_read(fshared);
- }
-
- /*
- * Release mm->mmap_sem, when the futex is shared
- */
- static inline void futex_unlock_mm(struct rw_semaphore *fshared)
- {
- if (fshared)
- up_read(fshared);
- }
-
/*
* We hash on the keys returned from get_futex_key (see below).
*/
&& key1->both.offset == key2->both.offset);
}
+ /*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ */
+ static void get_futex_key_refs(union futex_key *key)
+ {
+ if (!key->both.ptr)
+ return;
+
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
+ atomic_inc(&key->shared.inode->i_count);
+ break;
+ case FUT_OFF_MMSHARED:
+ atomic_inc(&key->private.mm->mm_count);
+ break;
+ }
+ }
+
+ /*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+ static void drop_futex_key_refs(union futex_key *key)
+ {
+ if (!key->both.ptr)
+ return;
+
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
+ iput(key->shared.inode);
+ break;
+ case FUT_OFF_MMSHARED:
+ mmdrop(key->private.mm);
+ break;
+ }
+ }
+
/**
* get_futex_key - Get parameters which are the keys for a futex.
* @uaddr: virtual address of the futex
* For other futexes, it points to ¤t->mm->mmap_sem and
* caller must have taken the reader lock. but NOT any spinlocks.
*/
- static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
- union futex_key *key)
+ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
struct page *page;
int err;
return -EFAULT;
key->private.mm = mm;
key->private.address = address;
+ get_futex_key_refs(key);
return 0;
}
- /*
- * The futex is hashed differently depending on whether
- * it's in a shared or private mapping. So check vma first.
- */
- vma = find_extend_vma(mm, address);
- if (unlikely(!vma))
- return -EFAULT;
- /*
- * Permissions.
- */
- if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
- return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+ again:
+ err = get_user_pages_fast(address, 1, 0, &page);
+ if (err < 0)
+ return err;
+
+ lock_page(page);
+ if (!page->mapping) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
/*
* Private mappings are handled in a simple way.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
- * the object not the particular process. Therefore we use
- * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
- * mappings of _writable_ handles.
+ * the object not the particular process.
*/
- if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
- key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
+ if (PageAnon(page)) {
+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
- return 0;
+ } else {
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+ key->shared.inode = page->mapping->host;
+ key->shared.pgoff = page->index;
}
- /*
- * Linear file mappings are also simple.
- */
- key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
- if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
- key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
- + vma->vm_pgoff);
- return 0;
- }
+ get_futex_key_refs(key);
- /*
- * We could walk the page table to read the non-linear
- * pte, and get the page index without fetching the page
- * from swap. But that's a lot of code to duplicate here
- * for a rare case, so we simply fetch the page.
- */
- err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
- if (err >= 0) {
- key->shared.pgoff =
- page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- put_page(page);
- return 0;
- }
- return err;
- }
-
- /*
- * Take a reference to the resource addressed by a key.
- * Can be called while holding spinlocks.
- *
- */
- static void get_futex_key_refs(union futex_key *key)
- {
- if (key->both.ptr == NULL)
- return;
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- atomic_inc(&key->shared.inode->i_count);
- break;
- case FUT_OFF_MMSHARED:
- atomic_inc(&key->private.mm->mm_count);
- break;
- }
+ unlock_page(page);
+ put_page(page);
+ return 0;
}
- /*
- * Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
- */
- static void drop_futex_key_refs(union futex_key *key)
+ static inline
+ void put_futex_key(int fshared, union futex_key *key)
{
- if (!key->both.ptr)
- return;
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- iput(key->shared.inode);
- break;
- case FUT_OFF_MMSHARED:
- mmdrop(key->private.mm);
- break;
- }
+ drop_futex_key_refs(key);
}
static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
/*
* Fault handling.
- * if fshared is non NULL, current->mm->mmap_sem is already held
*/
- static int futex_handle_fault(unsigned long address,
- struct rw_semaphore *fshared, int attempt)
+ static int futex_handle_fault(unsigned long address, int attempt)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
if (attempt > 2)
return ret;
- if (!fshared)
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
if (vma && address >= vma->vm_start &&
(vma->vm_flags & VM_WRITE)) {
current->min_flt++;
}
}
- if (!fshared)
- up_read(&mm->mmap_sem);
+ up_read(&mm->mmap_sem);
return ret;
}
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
atomic_set(&pi_state->refcount, 1);
+ pi_state->key = FUTEX_KEY_INIT;
current->pi_state_cache = pi_state;
static struct task_struct * futex_find_get_task(pid_t pid)
{
struct task_struct *p;
+ const struct cred *cred = current_cred(), *pcred;
rcu_read_lock();
p = find_task_by_vpid(pid);
- if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+ if (!p) {
p = ERR_PTR(-ESRCH);
- else
- get_task_struct(p);
+ } else {
+ pcred = __task_cred(p);
+ if (cred->euid != pcred->euid &&
+ cred->euid != pcred->uid)
+ p = ERR_PTR(-ESRCH);
+ else
+ get_task_struct(p);
+ }
rcu_read_unlock();
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
struct futex_hash_bucket *hb;
- union futex_key key;
+ union futex_key key = FUTEX_KEY_INIT;
if (!futex_cmpxchg_enabled)
return;
* The lock in wake_up_all() is a crucial memory barrier after the
* plist_del() and also before assigning to q->lock_ptr.
*/
- wake_up_all(&q->waiters);
+ wake_up(&q->waiter);
/*
* The waiting task can free the futex_q as soon as this is written,
* without taking any locks. This must come last.
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
- static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
- int nr_wake, u32 bitset)
+ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
struct plist_head *head;
- union futex_key key;
+ union futex_key key = FUTEX_KEY_INIT;
int ret;
if (!bitset)
return -EINVAL;
- futex_lock_mm(fshared);
-
ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
spin_unlock(&hb->lock);
out:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &key);
return ret;
}
* to this virtual address:
*/
static int
- futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
- u32 __user *uaddr2,
+ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
- union futex_key key1, key2;
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
retryfull:
- futex_lock_mm(fshared);
-
ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
*/
if (attempt++) {
ret = futex_handle_fault((unsigned long)uaddr2,
- fshared, attempt);
+ attempt);
if (ret)
goto out;
goto retry;
}
- /*
- * If we would have faulted, release mmap_sem,
- * fault it in and start all over again.
- */
- futex_unlock_mm(fshared);
-
ret = get_user(dummy, uaddr2);
if (ret)
return ret;
if (hb1 != hb2)
spin_unlock(&hb2->lock);
out:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
return ret;
}
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
- static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
- u32 __user *uaddr2,
+ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
- union futex_key key1, key2;
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
retry:
- futex_lock_mm(fshared);
-
ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
if (hb1 != hb2)
spin_unlock(&hb2->lock);
- /*
- * If we would have faulted, release mmap_sem, fault
- * it in and start all over again.
- */
- futex_unlock_mm(fshared);
-
ret = get_user(curval, uaddr1);
if (!ret)
drop_futex_key_refs(&key1);
out:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
return ret;
}
{
struct futex_hash_bucket *hb;
- init_waitqueue_head(&q->waiters);
+ init_waitqueue_head(&q->waiter);
get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner,
- struct rw_semaphore *fshared)
+ struct task_struct *newowner, int fshared)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
handle_fault:
spin_unlock(q->lock_ptr);
- ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
+ ret = futex_handle_fault((unsigned long)uaddr, attempt++);
spin_lock(q->lock_ptr);
* In case we must use restart_block to restart a futex_wait,
* we encode in the 'flags' shared capability
*/
- #define FLAGS_SHARED 1
+ #define FLAGS_SHARED 0x01
+ #define FLAGS_CLOCKRT 0x02
static long futex_wait_restart(struct restart_block *restart);
- static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
- u32 val, ktime_t *abs_time, u32 bitset)
+ static int futex_wait(u32 __user *uaddr, int fshared,
+ u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
q.pi_state = NULL;
q.bitset = bitset;
retry:
- futex_lock_mm(fshared);
-
+ q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
if (unlikely(ret)) {
queue_unlock(&q, hb);
- /*
- * If we would have faulted, release mmap_sem, fault it in and
- * start all over again.
- */
- futex_unlock_mm(fshared);
-
ret = get_user(uval, uaddr);
if (!ret)
/* Only actually queue if *uaddr contained val. */
queue_me(&q, hb);
- /*
- * Now the futex is queued and we have checked the data, we
- * don't want to hold mmap_sem while we sleep.
- */
- futex_unlock_mm(fshared);
-
/*
* There might have been scheduling since the queue_me(), as we
* cannot hold a spinlock across the get_user() in case it
/* add_wait_queue is the barrier after __set_current_state. */
__set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&q.waiters, &wait);
+ add_wait_queue(&q.waiter, &wait);
/*
* !plist_node_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
slack = current->timer_slack_ns;
if (rt_task(current))
slack = 0;
- hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&t.timer,
+ clockrt ? CLOCK_REALTIME :
+ CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(&t, current);
hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
if (fshared)
restart->futex.flags |= FLAGS_SHARED;
+ if (clockrt)
+ restart->futex.flags |= FLAGS_CLOCKRT;
return -ERESTART_RESTARTBLOCK;
}
queue_unlock(&q, hb);
out_release_sem:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &q.key);
return ret;
}
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
- struct rw_semaphore *fshared = NULL;
+ int fshared = 0;
ktime_t t;
t.tv64 = restart->futex.time;
restart->fn = do_no_restart_syscall;
if (restart->futex.flags & FLAGS_SHARED)
- fshared = ¤t->mm->mmap_sem;
+ fshared = 1;
return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
- restart->futex.bitset);
+ restart->futex.bitset,
+ restart->futex.flags & FLAGS_CLOCKRT);
}
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
- static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ static int futex_lock_pi(u32 __user *uaddr, int fshared,
int detect, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
q.pi_state = NULL;
retry:
- futex_lock_mm(fshared);
-
+ q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
* exit to complete.
*/
queue_unlock(&q, hb);
- futex_unlock_mm(fshared);
cond_resched();
goto retry;
*/
queue_me(&q, hb);
- /*
- * Now the futex is queued and we have checked the data, we
- * don't want to hold mmap_sem while we sleep.
- */
- futex_unlock_mm(fshared);
-
WARN_ON(!q.pi_state);
/*
* Block on the PI mutex:
ret = ret ? 0 : -EWOULDBLOCK;
}
- futex_lock_mm(fshared);
spin_lock(q.lock_ptr);
if (!ret) {
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
- futex_unlock_mm(fshared);
if (to)
destroy_hrtimer_on_stack(&to->timer);
queue_unlock(&q, hb);
out_release_sem:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &q.key);
if (to)
destroy_hrtimer_on_stack(&to->timer);
return ret;
uaddr_faulted:
/*
- * We have to r/w *(int __user *)uaddr, but we can't modify it
- * non-atomically. Therefore, if get_user below is not
- * enough, we need to handle the fault ourselves, while
- * still holding the mmap_sem.
- *
- * ... and hb->lock. :-) --ANK
+ * We have to r/w *(int __user *)uaddr, and we have to modify it
+ * atomically. Therefore, if we continue to fault after get_user()
+ * below, we need to handle the fault ourselves, while still holding
+ * the mmap_sem. This can occur if the uaddr is under contention as
+ * we have to drop the mmap_sem in order to call get_user().
*/
queue_unlock(&q, hb);
if (attempt++) {
- ret = futex_handle_fault((unsigned long)uaddr, fshared,
- attempt);
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
if (ret)
goto out_release_sem;
goto retry_unlocked;
}
- futex_unlock_mm(fshared);
-
ret = get_user(uval, uaddr);
- if (!ret && (uval != -EFAULT))
+ if (!ret)
goto retry;
if (to)
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
- static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
+ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
u32 uval;
struct plist_head *head;
- union futex_key key;
+ union futex_key key = FUTEX_KEY_INIT;
int ret, attempt = 0;
retry:
*/
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- /*
- * First take all the futex related locks:
- */
- futex_lock_mm(fshared);
ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
out_unlock:
spin_unlock(&hb->lock);
out:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &key);
return ret;
pi_faulted:
/*
- * We have to r/w *(int __user *)uaddr, but we can't modify it
- * non-atomically. Therefore, if get_user below is not
- * enough, we need to handle the fault ourselves, while
- * still holding the mmap_sem.
- *
- * ... and hb->lock. --ANK
+ * We have to r/w *(int __user *)uaddr, and we have to modify it
+ * atomically. Therefore, if we continue to fault after get_user()
+ * below, we need to handle the fault ourselves, while still holding
+ * the mmap_sem. This can occur if the uaddr is under contention as
+ * we have to drop the mmap_sem in order to call get_user().
*/
spin_unlock(&hb->lock);
if (attempt++) {
- ret = futex_handle_fault((unsigned long)uaddr, fshared,
- attempt);
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
if (ret)
goto out;
uval = 0;
goto retry_unlocked;
}
- futex_unlock_mm(fshared);
-
ret = get_user(uval, uaddr);
- if (!ret && (uval != -EFAULT))
+ if (!ret)
goto retry;
return ret;
{
struct robust_list_head __user *head;
unsigned long ret;
+ const struct cred *cred = current_cred(), *pcred;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
if (!p)
goto err_unlock;
ret = -EPERM;
- if ((current->euid != p->euid) && (current->euid != p->uid) &&
- !capable(CAP_SYS_PTRACE))
+ pcred = __task_cred(p);
+ if (cred->euid != pcred->euid &&
+ cred->euid != pcred->uid &&
+ !capable(CAP_SYS_PTRACE))
goto err_unlock;
head = p->robust_list;
rcu_read_unlock();
* PI futexes happens in exit_pi_state():
*/
if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, &curr->mm->mmap_sem, 1,
- FUTEX_BITSET_MATCH_ANY);
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
}
return 0;
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int ret = -ENOSYS;
+ int clockrt, ret = -ENOSYS;
int cmd = op & FUTEX_CMD_MASK;
- struct rw_semaphore *fshared = NULL;
+ int fshared = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
- fshared = ¤t->mm->mmap_sem;
+ fshared = 1;
+
+ clockrt = op & FUTEX_CLOCK_REALTIME;
+ if (clockrt && cmd != FUTEX_WAIT_BITSET)
+ return -ENOSYS;
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, fshared, val, timeout, val3);
+ ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
break;
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
* Thanks to Arjan van de Ven for coming up with the initial idea of
* mapping lock dependencies runtime.
*/
+#define DISABLE_BRANCH_PROFILING
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/delay.h>
#ifdef CONFIG_LOCK_STAT
static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
- static int lock_contention_point(struct lock_class *class, unsigned long ip)
+ static int lock_point(unsigned long points[], unsigned long ip)
{
int i;
- for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
- if (class->contention_point[i] == 0) {
- class->contention_point[i] = ip;
+ for (i = 0; i < LOCKSTAT_POINTS; i++) {
+ if (points[i] == 0) {
+ points[i] = ip;
break;
}
- if (class->contention_point[i] == ip)
+ if (points[i] == ip)
break;
}
for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
stats.contention_point[i] += pcs->contention_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+ stats.contending_point[i] += pcs->contending_point[i];
+
lock_time_add(&pcs->read_waittime, &stats.read_waittime);
lock_time_add(&pcs->write_waittime, &stats.write_waittime);
memset(cpu_stats, 0, sizeof(struct lock_class_stats));
}
memset(class->contention_point, 0, sizeof(class->contention_point));
+ memset(class->contending_point, 0, sizeof(class->contending_point));
}
static struct lock_class_stats *get_lock_stats(struct lock_class *class)
{
current->lockdep_recursion++;
}
-
EXPORT_SYMBOL(lockdep_off);
void lockdep_on(void)
{
current->lockdep_recursion--;
}
-
EXPORT_SYMBOL(lockdep_on);
/*
/*
* printk all lock dependencies starting at <entry>:
*/
- static void print_lock_dependencies(struct lock_class *class, int depth)
+ static void __used
+ print_lock_dependencies(struct lock_class *class, int depth)
{
struct lock_list *entry;
if (subclass)
register_lock_class(lock, subclass, 1);
}
-
EXPORT_SYMBOL_GPL(lockdep_init_map);
/*
}
static int
- __lock_set_subclass(struct lockdep_map *lock,
- unsigned int subclass, unsigned long ip)
+ __lock_set_class(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, unsigned int subclass,
+ unsigned long ip)
{
struct task_struct *curr = current;
struct held_lock *hlock, *prev_hlock;
return print_unlock_inbalance_bug(curr, lock, ip);
found_it:
+ lockdep_init_map(lock, name, key, 0);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes + 1;
#endif
}
- void
- lock_set_subclass(struct lockdep_map *lock,
- unsigned int subclass, unsigned long ip)
+ void lock_set_class(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, unsigned int subclass,
+ unsigned long ip)
{
unsigned long flags;
raw_local_irq_save(flags);
current->lockdep_recursion = 1;
check_flags(flags);
- if (__lock_set_subclass(lock, subclass, ip))
+ if (__lock_set_class(lock, name, key, subclass, ip))
check_chain_key(current);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
-
- EXPORT_SYMBOL_GPL(lock_set_subclass);
+ EXPORT_SYMBOL_GPL(lock_set_class);
/*
* We are not always called with irqs disabled - do that here,
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
-
EXPORT_SYMBOL_GPL(lock_acquire);
void lock_release(struct lockdep_map *lock, int nested,
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
-
EXPORT_SYMBOL_GPL(lock_release);
#ifdef CONFIG_LOCK_STAT
struct held_lock *hlock, *prev_hlock;
struct lock_class_stats *stats;
unsigned int depth;
- int i, point;
+ int i, contention_point, contending_point;
depth = curr->lockdep_depth;
if (DEBUG_LOCKS_WARN_ON(!depth))
found_it:
hlock->waittime_stamp = sched_clock();
- point = lock_contention_point(hlock_class(hlock), ip);
+ contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+ contending_point = lock_point(hlock_class(hlock)->contending_point,
+ lock->ip);
stats = get_lock_stats(hlock_class(hlock));
- if (point < ARRAY_SIZE(stats->contention_point))
- stats->contention_point[point]++;
+ if (contention_point < LOCKSTAT_POINTS)
+ stats->contention_point[contention_point]++;
+ if (contending_point < LOCKSTAT_POINTS)
+ stats->contending_point[contending_point]++;
if (lock->cpu != smp_processor_id())
stats->bounces[bounce_contended + !!hlock->read]++;
put_lock_stats(stats);
}
static void
- __lock_acquired(struct lockdep_map *lock)
+ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
struct held_lock *hlock, *prev_hlock;
put_lock_stats(stats);
lock->cpu = cpu;
+ lock->ip = ip;
}
void lock_contended(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_contended);
- void lock_acquired(struct lockdep_map *lock)
+ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
raw_local_irq_save(flags);
check_flags(flags);
current->lockdep_recursion = 1;
- __lock_acquired(lock);
+ __lock_acquired(lock, ip);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
if (unlock)
read_unlock(&tasklist_lock);
}
-
EXPORT_SYMBOL_GPL(debug_show_all_locks);
/*
{
__debug_show_held_locks(task);
}
-
EXPORT_SYMBOL_GPL(debug_show_held_locks);
void lockdep_sys_exit(void)
*/
#define RUNTIME_INF ((u64)~0ULL)
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
#ifdef CONFIG_SMP
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
struct cgroup_subsys_state css;
#endif
+#ifdef CONFIG_USER_SCHED
+ uid_t uid;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
#ifdef CONFIG_USER_SCHED
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+ user->tg->uid = user->uid;
+}
+
/*
* Root task group.
* Every UID task group (including init_task_group aka UID-0) will
struct task_group *tg;
#ifdef CONFIG_USER_SCHED
- tg = p->user->tg;
+ rcu_read_lock();
+ tg = __task_cred(p)->user->tg;
+ rcu_read_unlock();
#elif defined(CONFIG_CGROUP_SCHED)
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css);
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_exp_empty;
#undef SCHED_FEAT
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
- filp->private_data = inode->i_private;
- return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int sched_feat_show(struct seq_file *m, void *v)
{
- char *buf;
- int r = 0;
- int len = 0;
int i;
for (i = 0; sched_feat_names[i]; i++) {
- len += strlen(sched_feat_names[i]);
- len += 4;
- }
-
- buf = kmalloc(len + 2, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- for (i = 0; sched_feat_names[i]; i++) {
- if (sysctl_sched_features & (1UL << i))
- r += sprintf(buf + r, "%s ", sched_feat_names[i]);
- else
- r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+ if (!(sysctl_sched_features & (1UL << i)))
+ seq_puts(m, "NO_");
+ seq_printf(m, "%s ", sched_feat_names[i]);
}
+ seq_puts(m, "\n");
- r += sprintf(buf + r, "\n");
- WARN_ON(r >= len + 2);
-
- r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
- kfree(buf);
-
- return r;
+ return 0;
}
static ssize_t
return cnt;
}
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_feat_show, NULL);
+}
+
static struct file_operations sched_feat_fops = {
- .open = sched_feat_open,
- .read = sched_feat_read,
- .write = sched_feat_write,
+ .open = sched_feat_open,
+ .write = sched_feat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
};
static __init int sched_init_debug(void)
update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long sd_shares, unsigned long sd_rq_weight)
{
- int boost = 0;
unsigned long shares;
unsigned long rq_weight;
if (!tg->se[cpu])
return;
- rq_weight = tg->cfs_rq[cpu]->load.weight;
-
- /*
- * If there are currently no tasks on the cpu pretend there is one of
- * average load so that when a new task gets to run here it will not
- * get delayed by group starvation.
- */
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- if (unlikely(rq_weight > sd_rq_weight))
- rq_weight = sd_rq_weight;
+ rq_weight = tg->cfs_rq[cpu]->rq_weight;
/*
* \Sum shares * rq_weight
* \Sum rq_weight
*
*/
- shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ shares = (sd_shares * rq_weight) / sd_rq_weight;
shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
if (abs(shares - tg->se[cpu]->load.weight) >
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ tg->cfs_rq[cpu]->shares = shares;
__set_se_shares(tg->se[cpu], shares);
spin_unlock_irqrestore(&rq->lock, flags);
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
- unsigned long rq_weight = 0;
+ unsigned long weight, rq_weight = 0;
unsigned long shares = 0;
struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
- rq_weight += tg->cfs_rq[i]->load.weight;
+ /*
+ * If there are currently no tasks on the cpu pretend there
+ * is one of average load so that when a new task gets to
+ * run here it will not get delayed by group starvation.
+ */
+ weight = tg->cfs_rq[i]->load.weight;
+ if (!weight)
+ weight = NICE_0_LOAD;
+
+ tg->cfs_rq[i]->rq_weight = weight;
+ rq_weight += weight;
shares += tg->cfs_rq[i]->shares;
}
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
shares = tg->shares;
- if (!rq_weight)
- rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
for_each_cpu_mask(i, sd->span)
update_group_shares_cpu(tg, i, shares, rq_weight);
#endif
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ int ret = 0;
+
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+ if (unlikely(!spin_trylock(&busiest->lock))) {
+ if (busiest < this_rq) {
+ spin_unlock(&this_rq->lock);
+ spin_lock(&busiest->lock);
+ spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+ ret = 1;
+ } else
+ spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+ }
+ return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
clock_offset = old_rq->clock - new_rq->clock;
+ trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
p->se.wait_start -= clock_offset;
smp_wmb();
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
old_state = p->state;
if (!(old_state & state))
goto out;
schedstat_inc(p, se.nr_wakeups_local);
else
schedstat_inc(p, se.nr_wakeups_remote);
- update_rq_clock(rq);
activate_task(rq, p, 1);
success = 1;
out_running:
- trace_sched_wakeup(rq, p);
+ trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
p->sched_class->task_new(rq, p);
inc_nr_running(rq);
}
- trace_sched_wakeup_new(rq, p);
+ trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
__release(rq2->lock);
}
-/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(this_rq->lock)
- __acquires(busiest->lock)
- __acquires(this_rq->lock)
-{
- int ret = 0;
-
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
- spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
- if (unlikely(!spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- spin_unlock(&this_rq->lock);
- spin_lock(&busiest->lock);
- spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
- }
- return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(busiest->lock)
-{
- spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
|| unlikely(!cpu_active(dest_cpu)))
goto out;
- trace_sched_migrate_task(rq, p, dest_cpu);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
- int pulled_task = -1;
+ int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
cpumask_t tmpmask;
if (p == rq->idle) {
p->stime = cputime_add(p->stime, steal);
- account_group_system_time(p, steal);
if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
else
/*
* Underflow?
*/
- if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
return;
/*
* Is the spinlock portion underflowing?
set_load_weight(p);
}
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ bool match;
+
+ rcu_read_lock();
+ pcred = __task_cred(p);
+ match = (cred->euid == pcred->euid ||
+ cred->euid == pcred->uid);
+ rcu_read_unlock();
+ return match;
+}
+
static int __sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param, bool user)
{
return -EPERM;
/* can't change other user's priorities */
- if ((current->euid != p->euid) &&
- (current->euid != p->uid))
+ if (!check_same_owner(p))
return -EPERM;
}
read_unlock(&tasklist_lock);
retval = -EPERM;
- if ((current->euid != p->euid) && (current->euid != p->uid) &&
- !capable(CAP_SYS_NICE))
+ if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p, 0, NULL);
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
+ ftrace_graph_init_task(idle);
}
/*
/*
* Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
*/
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
#ifdef CONFIG_SCHED_DEBUG
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
- switch (lvl) {
- case SD_LV_NONE:
- return "NONE";
- case SD_LV_SIBLING:
- return "SIBLING";
- case SD_LV_MC:
- return "MC";
- case SD_LV_CPU:
- return "CPU";
- case SD_LV_NODE:
- return "NODE";
- case SD_LV_ALLNODES:
- return "ALLNODES";
- case SD_LV_MAX:
- return "MAX";
-
- }
- return "MAX";
-}
-
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_t *groupmask)
{
return -1;
}
- printk(KERN_CONT "span %s level %s\n",
- str, sd_level_to_string(sd->level));
+ printk(KERN_CONT "span %s level %s\n", str, sd->name);
if (!cpu_isset(cpu, sd->span)) {
printk(KERN_ERR "ERROR: domain->span does not contain "
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES);
+ if (nr_node_ids == 1)
+ pflags &= ~SD_SERIALIZE;
}
if (~cflags & pflags)
return 0;
};
#if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC 1
-#define SCHED_CPUMASK_FREE(v) kfree(v)
-#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+ *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+ kfree(masks);
+}
#else
-#define SCHED_CPUMASK_ALLOC 0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
#endif
#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
return -ENOMEM;
}
-#if SCHED_CPUMASK_ALLOC
/* get space for all scratch cpumask variables */
- allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+ sched_cpumask_alloc(&allmasks);
if (!allmasks) {
printk(KERN_WARNING "Cannot alloc cpumask array\n");
kfree(rd);
#endif
return -ENOMEM;
}
-#endif
+
tmpmask = (cpumask_t *)allmasks;
cpu_attach_domain(sd, rd, i);
}
- SCHED_CPUMASK_FREE((void *)allmasks);
+ sched_cpumask_free(allmasks);
return 0;
#ifdef CONFIG_NUMA
error:
free_sched_groups(cpu_map, tmpmask);
- SCHED_CPUMASK_FREE((void *)allmasks);
+ sched_cpumask_free(allmasks);
kfree(rd);
return -ENOMEM;
#endif
*/
static cpumask_t fallback_doms;
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
{
+ return 0;
}
/*
cpumask_t tmpmask;
int i;
- unregister_sched_domain_sysctl();
-
for_each_cpu_mask_nr(i, *cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
synchronize_sched();
struct sched_domain_attr *dattr_new)
{
int i, j, n;
+ int new_topology;
mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
+ /* Let architecture update cpu core mappings. */
+ new_topology = arch_update_cpu_topology();
+
n = doms_new ? ndoms_new : 0;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < n; j++) {
+ for (j = 0; j < n && !new_topology; j++) {
if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
ndoms_cur = 0;
doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
- dattr_new = NULL;
+ WARN_ON_ONCE(dattr_new);
}
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
- for (j = 0; j < ndoms_cur; j++) {
+ for (j = 0; j < ndoms_cur && !new_topology; j++) {
if (cpus_equal(doms_new[i], doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct cfs_rq *cfs_rq;
- struct sched_entity *se, *parent_se;
+ struct sched_entity *se;
struct rq *rq;
int i;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+ GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
goto err;
- se = kmalloc_node(sizeof(struct sched_entity),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ se = kzalloc_node(sizeof(struct sched_entity),
+ GFP_KERNEL, cpu_to_node(i));
if (!se)
goto err;
- parent_se = parent ? parent->se[i] : NULL;
- init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
}
return 1;
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
struct rt_rq *rt_rq;
- struct sched_rt_entity *rt_se, *parent_se;
+ struct sched_rt_entity *rt_se;
struct rq *rq;
int i;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- rt_rq = kmalloc_node(sizeof(struct rt_rq),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ rt_rq = kzalloc_node(sizeof(struct rt_rq),
+ GFP_KERNEL, cpu_to_node(i));
if (!rt_rq)
goto err;
- rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+ GFP_KERNEL, cpu_to_node(i));
if (!rt_se)
goto err;
- parent_se = parent ? parent->rt_se[i] : NULL;
- init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
}
return 1;
* (balbir@in.ibm.com).
*/
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
+ struct cpuacct *parent;
};
struct cgroup_subsys cpuacct_subsys;
return ERR_PTR(-ENOMEM);
}
+ if (cgrp->parent)
+ ca->parent = cgroup_ca(cgrp->parent);
+
return &ca->css;
}
kfree(ca);
}
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
/* return total cpu usage (in nanoseconds) of a group */
static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
{
u64 totalcpuusage = 0;
int i;
- for_each_possible_cpu(i) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
- /*
- * Take rq->lock to make 64-bit addition safe on 32-bit
- * platforms.
- */
- spin_lock_irq(&cpu_rq(i)->lock);
- totalcpuusage += *cpuusage;
- spin_unlock_irq(&cpu_rq(i)->lock);
- }
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
return totalcpuusage;
}
goto out;
}
- for_each_possible_cpu(i) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
- spin_lock_irq(&cpu_rq(i)->lock);
- *cpuusage = 0;
- spin_unlock_irq(&cpu_rq(i)->lock);
- }
out:
return err;
}
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct cpuacct *ca = cgroup_ca(cgroup);
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
static struct cftype files[] = {
{
.name = "usage",
.read_u64 = cpuusage_read,
.write_u64 = cpuusage_write,
},
+ {
+ .name = "usage_percpu",
+ .read_seq_string = cpuacct_percpu_seq_read,
+ },
+
};
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
+ int cpu;
if (!cpuacct_subsys.active)
return;
+ cpu = task_cpu(tsk);
ca = task_ca(tsk);
- if (ca) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
+ for (; ca; ca = ca->parent) {
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
}
void (*pm_power_off_prepare)(void);
+/*
+ * set the priority of a task
+ * - the caller must hold the RCU read lock
+ */
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
+ const struct cred *cred = current_cred(), *pcred = __task_cred(p);
int no_nice;
- if (p->uid != current->euid &&
- p->euid != current->euid && !capable(CAP_SYS_NICE)) {
+ if (pcred->uid != cred->euid &&
+ pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
error = -EPERM;
goto out;
}
{
struct task_struct *g, *p;
struct user_struct *user;
+ const struct cred *cred = current_cred();
int error = -EINVAL;
struct pid *pgrp;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case PRIO_USER:
- user = current->user;
+ user = (struct user_struct *) cred->user;
if (!who)
- who = current->uid;
- else
- if ((who != current->uid) && !(user = find_user(who)))
- goto out_unlock; /* No processes for this user */
+ who = cred->uid;
+ else if ((who != cred->uid) &&
+ !(user = find_user(who)))
+ goto out_unlock; /* No processes for this user */
do_each_thread(g, p)
- if (p->uid == who)
+ if (__task_cred(p)->uid == who)
error = set_one_prio(p, niceval, error);
while_each_thread(g, p);
- if (who != current->uid)
+ if (who != cred->uid)
free_uid(user); /* For find_user() */
break;
}
{
struct task_struct *g, *p;
struct user_struct *user;
+ const struct cred *cred = current_cred();
long niceval, retval = -ESRCH;
struct pid *pgrp;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case PRIO_USER:
- user = current->user;
+ user = (struct user_struct *) cred->user;
if (!who)
- who = current->uid;
- else
- if ((who != current->uid) && !(user = find_user(who)))
- goto out_unlock; /* No processes for this user */
+ who = cred->uid;
+ else if ((who != cred->uid) &&
+ !(user = find_user(who)))
+ goto out_unlock; /* No processes for this user */
do_each_thread(g, p)
- if (p->uid == who) {
+ if (__task_cred(p)->uid == who) {
niceval = 20 - task_nice(p);
if (niceval > retval)
retval = niceval;
}
while_each_thread(g, p);
- if (who != current->uid)
+ if (who != cred->uid)
free_uid(user); /* for find_user() */
break;
}
*/
asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
{
- int old_rgid = current->gid;
- int old_egid = current->egid;
- int new_rgid = old_rgid;
- int new_egid = old_egid;
+ const struct cred *old;
+ struct cred *new;
int retval;
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
if (retval)
- return retval;
+ goto error;
+ retval = -EPERM;
if (rgid != (gid_t) -1) {
- if ((old_rgid == rgid) ||
- (current->egid==rgid) ||
+ if (old->gid == rgid ||
+ old->egid == rgid ||
capable(CAP_SETGID))
- new_rgid = rgid;
+ new->gid = rgid;
else
- return -EPERM;
+ goto error;
}
if (egid != (gid_t) -1) {
- if ((old_rgid == egid) ||
- (current->egid == egid) ||
- (current->sgid == egid) ||
+ if (old->gid == egid ||
+ old->egid == egid ||
+ old->sgid == egid ||
capable(CAP_SETGID))
- new_egid = egid;
+ new->egid = egid;
else
- return -EPERM;
- }
- if (new_egid != old_egid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
+ goto error;
}
+
if (rgid != (gid_t) -1 ||
- (egid != (gid_t) -1 && egid != old_rgid))
- current->sgid = new_egid;
- current->fsgid = new_egid;
- current->egid = new_egid;
- current->gid = new_rgid;
- key_fsgid_changed(current);
- proc_id_connector(current, PROC_EVENT_GID);
- return 0;
+ (egid != (gid_t) -1 && egid != old->gid))
+ new->sgid = new->egid;
+ new->fsgid = new->egid;
+
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
/*
*/
asmlinkage long sys_setgid(gid_t gid)
{
- int old_egid = current->egid;
+ const struct cred *old;
+ struct cred *new;
int retval;
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
if (retval)
- return retval;
+ goto error;
- if (capable(CAP_SETGID)) {
- if (old_egid != gid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
- }
- current->gid = current->egid = current->sgid = current->fsgid = gid;
- } else if ((gid == current->gid) || (gid == current->sgid)) {
- if (old_egid != gid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
- }
- current->egid = current->fsgid = gid;
- }
+ retval = -EPERM;
+ if (capable(CAP_SETGID))
+ new->gid = new->egid = new->sgid = new->fsgid = gid;
+ else if (gid == old->gid || gid == old->sgid)
+ new->egid = new->fsgid = gid;
else
- return -EPERM;
+ goto error;
- key_fsgid_changed(current);
- proc_id_connector(current, PROC_EVENT_GID);
- return 0;
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
-static int set_user(uid_t new_ruid, int dumpclear)
+/*
+ * change the user struct in a credentials set to match the new UID
+ */
+static int set_user(struct cred *new)
{
struct user_struct *new_user;
- new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
+ new_user = alloc_uid(current_user_ns(), new->uid);
if (!new_user)
return -EAGAIN;
if (atomic_read(&new_user->processes) >=
current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
- new_user != current->nsproxy->user_ns->root_user) {
+ new_user != INIT_USER) {
free_uid(new_user);
return -EAGAIN;
}
- switch_uid(new_user);
-
- if (dumpclear) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
- }
- current->uid = new_ruid;
+ free_uid(new->user);
+ new->user = new_user;
return 0;
}
*/
asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
{
- int old_ruid, old_euid, old_suid, new_ruid, new_euid;
+ const struct cred *old;
+ struct cred *new;
int retval;
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
if (retval)
- return retval;
-
- new_ruid = old_ruid = current->uid;
- new_euid = old_euid = current->euid;
- old_suid = current->suid;
+ goto error;
+ retval = -EPERM;
if (ruid != (uid_t) -1) {
- new_ruid = ruid;
- if ((old_ruid != ruid) &&
- (current->euid != ruid) &&
+ new->uid = ruid;
+ if (old->uid != ruid &&
+ old->euid != ruid &&
!capable(CAP_SETUID))
- return -EPERM;
+ goto error;
}
if (euid != (uid_t) -1) {
- new_euid = euid;
- if ((old_ruid != euid) &&
- (current->euid != euid) &&
- (current->suid != euid) &&
+ new->euid = euid;
+ if (old->uid != euid &&
+ old->euid != euid &&
+ old->suid != euid &&
!capable(CAP_SETUID))
- return -EPERM;
+ goto error;
}
- if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
- return -EAGAIN;
+ retval = -EAGAIN;
+ if (new->uid != old->uid && set_user(new) < 0)
+ goto error;
- if (new_euid != old_euid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
- }
- current->fsuid = current->euid = new_euid;
if (ruid != (uid_t) -1 ||
- (euid != (uid_t) -1 && euid != old_ruid))
- current->suid = current->euid;
- current->fsuid = current->euid;
+ (euid != (uid_t) -1 && euid != old->uid))
+ new->suid = new->euid;
+ new->fsuid = new->euid;
- key_fsuid_changed(current);
- proc_id_connector(current, PROC_EVENT_UID);
-
- return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
-}
+ retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
+ if (retval < 0)
+ goto error;
+ return commit_creds(new);
+error:
+ abort_creds(new);
+ return retval;
+}
/*
* setuid() is implemented like SysV with SAVED_IDS
*/
asmlinkage long sys_setuid(uid_t uid)
{
- int old_euid = current->euid;
- int old_ruid, old_suid, new_suid;
+ const struct cred *old;
+ struct cred *new;
int retval;
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
if (retval)
- return retval;
+ goto error;
- old_ruid = current->uid;
- old_suid = current->suid;
- new_suid = old_suid;
-
+ retval = -EPERM;
if (capable(CAP_SETUID)) {
- if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
- return -EAGAIN;
- new_suid = uid;
- } else if ((uid != current->uid) && (uid != new_suid))
- return -EPERM;
-
- if (old_euid != uid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
+ new->suid = new->uid = uid;
+ if (uid != old->uid && set_user(new) < 0) {
+ retval = -EAGAIN;
+ goto error;
+ }
+ } else if (uid != old->uid && uid != new->suid) {
+ goto error;
}
- current->fsuid = current->euid = uid;
- current->suid = new_suid;
- key_fsuid_changed(current);
- proc_id_connector(current, PROC_EVENT_UID);
+ new->fsuid = new->euid = uid;
+
+ retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
+ if (retval < 0)
+ goto error;
- return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
*/
asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
{
- int old_ruid = current->uid;
- int old_euid = current->euid;
- int old_suid = current->suid;
+ const struct cred *old;
+ struct cred *new;
int retval;
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+
retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
if (retval)
- return retval;
+ goto error;
+ old = current_cred();
+ retval = -EPERM;
if (!capable(CAP_SETUID)) {
- if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
- (ruid != current->euid) && (ruid != current->suid))
- return -EPERM;
- if ((euid != (uid_t) -1) && (euid != current->uid) &&
- (euid != current->euid) && (euid != current->suid))
- return -EPERM;
- if ((suid != (uid_t) -1) && (suid != current->uid) &&
- (suid != current->euid) && (suid != current->suid))
- return -EPERM;
+ if (ruid != (uid_t) -1 && ruid != old->uid &&
+ ruid != old->euid && ruid != old->suid)
+ goto error;
+ if (euid != (uid_t) -1 && euid != old->uid &&
+ euid != old->euid && euid != old->suid)
+ goto error;
+ if (suid != (uid_t) -1 && suid != old->uid &&
+ suid != old->euid && suid != old->suid)
+ goto error;
}
+
+ retval = -EAGAIN;
if (ruid != (uid_t) -1) {
- if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
- return -EAGAIN;
+ new->uid = ruid;
+ if (ruid != old->uid && set_user(new) < 0)
+ goto error;
}
- if (euid != (uid_t) -1) {
- if (euid != current->euid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
- }
- current->euid = euid;
- }
- current->fsuid = current->euid;
+ if (euid != (uid_t) -1)
+ new->euid = euid;
if (suid != (uid_t) -1)
- current->suid = suid;
+ new->suid = suid;
+ new->fsuid = new->euid;
+
+ retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
+ if (retval < 0)
+ goto error;
- key_fsuid_changed(current);
- proc_id_connector(current, PROC_EVENT_UID);
+ return commit_creds(new);
- return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
+error:
+ abort_creds(new);
+ return retval;
}
asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
{
+ const struct cred *cred = current_cred();
int retval;
- if (!(retval = put_user(current->uid, ruid)) &&
- !(retval = put_user(current->euid, euid)))
- retval = put_user(current->suid, suid);
+ if (!(retval = put_user(cred->uid, ruid)) &&
+ !(retval = put_user(cred->euid, euid)))
+ retval = put_user(cred->suid, suid);
return retval;
}
*/
asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
{
+ const struct cred *old;
+ struct cred *new;
int retval;
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
+ old = current_cred();
+
retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
if (retval)
- return retval;
+ goto error;
+ retval = -EPERM;
if (!capable(CAP_SETGID)) {
- if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
- (rgid != current->egid) && (rgid != current->sgid))
- return -EPERM;
- if ((egid != (gid_t) -1) && (egid != current->gid) &&
- (egid != current->egid) && (egid != current->sgid))
- return -EPERM;
- if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
- (sgid != current->egid) && (sgid != current->sgid))
- return -EPERM;
+ if (rgid != (gid_t) -1 && rgid != old->gid &&
+ rgid != old->egid && rgid != old->sgid)
+ goto error;
+ if (egid != (gid_t) -1 && egid != old->gid &&
+ egid != old->egid && egid != old->sgid)
+ goto error;
+ if (sgid != (gid_t) -1 && sgid != old->gid &&
+ sgid != old->egid && sgid != old->sgid)
+ goto error;
}
- if (egid != (gid_t) -1) {
- if (egid != current->egid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
- }
- current->egid = egid;
- }
- current->fsgid = current->egid;
+
if (rgid != (gid_t) -1)
- current->gid = rgid;
+ new->gid = rgid;
+ if (egid != (gid_t) -1)
+ new->egid = egid;
if (sgid != (gid_t) -1)
- current->sgid = sgid;
+ new->sgid = sgid;
+ new->fsgid = new->egid;
- key_fsgid_changed(current);
- proc_id_connector(current, PROC_EVENT_GID);
- return 0;
+ return commit_creds(new);
+
+error:
+ abort_creds(new);
+ return retval;
}
asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
{
+ const struct cred *cred = current_cred();
int retval;
- if (!(retval = put_user(current->gid, rgid)) &&
- !(retval = put_user(current->egid, egid)))
- retval = put_user(current->sgid, sgid);
+ if (!(retval = put_user(cred->gid, rgid)) &&
+ !(retval = put_user(cred->egid, egid)))
+ retval = put_user(cred->sgid, sgid);
return retval;
}
*/
asmlinkage long sys_setfsuid(uid_t uid)
{
- int old_fsuid;
+ const struct cred *old;
+ struct cred *new;
+ uid_t old_fsuid;
- old_fsuid = current->fsuid;
- if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
- return old_fsuid;
+ new = prepare_creds();
+ if (!new)
+ return current_fsuid();
+ old = current_cred();
+ old_fsuid = old->fsuid;
- if (uid == current->uid || uid == current->euid ||
- uid == current->suid || uid == current->fsuid ||
+ if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
+ goto error;
+
+ if (uid == old->uid || uid == old->euid ||
+ uid == old->suid || uid == old->fsuid ||
capable(CAP_SETUID)) {
if (uid != old_fsuid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
+ new->fsuid = uid;
+ if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
+ goto change_okay;
}
- current->fsuid = uid;
}
- key_fsuid_changed(current);
- proc_id_connector(current, PROC_EVENT_UID);
-
- security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
+error:
+ abort_creds(new);
+ return old_fsuid;
+change_okay:
+ commit_creds(new);
return old_fsuid;
}
*/
asmlinkage long sys_setfsgid(gid_t gid)
{
- int old_fsgid;
+ const struct cred *old;
+ struct cred *new;
+ gid_t old_fsgid;
+
+ new = prepare_creds();
+ if (!new)
+ return current_fsgid();
+ old = current_cred();
+ old_fsgid = old->fsgid;
- old_fsgid = current->fsgid;
if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
- return old_fsgid;
+ goto error;
- if (gid == current->gid || gid == current->egid ||
- gid == current->sgid || gid == current->fsgid ||
+ if (gid == old->gid || gid == old->egid ||
+ gid == old->sgid || gid == old->fsgid ||
capable(CAP_SETGID)) {
if (gid != old_fsgid) {
- set_dumpable(current->mm, suid_dumpable);
- smp_wmb();
+ new->fsgid = gid;
+ goto change_okay;
}
- current->fsgid = gid;
- key_fsgid_changed(current);
- proc_id_connector(current, PROC_EVENT_GID);
}
+
+error:
+ abort_creds(new);
+ return old_fsgid;
+
+change_okay:
+ commit_creds(new);
return old_fsgid;
}
struct task_cputime cputime;
cputime_t cutime, cstime;
- spin_lock_irq(¤t->sighand->siglock);
thread_group_cputime(current, &cputime);
+ spin_lock_irq(¤t->sighand->siglock);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
spin_unlock_irq(¤t->sighand->siglock);
/* export the group_info to a user-space array */
static int groups_to_user(gid_t __user *grouplist,
- struct group_info *group_info)
+ const struct group_info *group_info)
{
int i;
unsigned int count = group_info->ngroups;
}
/* a simple bsearch */
-int groups_search(struct group_info *group_info, gid_t grp)
+int groups_search(const struct group_info *group_info, gid_t grp)
{
unsigned int left, right;
return 0;
}
-/* validate and set current->group_info */
-int set_current_groups(struct group_info *group_info)
+/**
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
+ *
+ * Validate a group subscription and, if valid, insert it into a set
+ * of credentials.
+ */
+int set_groups(struct cred *new, struct group_info *group_info)
{
int retval;
- struct group_info *old_info;
retval = security_task_setgroups(group_info);
if (retval)
return retval;
+ put_group_info(new->group_info);
groups_sort(group_info);
get_group_info(group_info);
+ new->group_info = group_info;
+ return 0;
+}
+
+EXPORT_SYMBOL(set_groups);
- task_lock(current);
- old_info = current->group_info;
- current->group_info = group_info;
- task_unlock(current);
+/**
+ * set_current_groups - Change current's group subscription
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon current's task
+ * security record.
+ */
+int set_current_groups(struct group_info *group_info)
+{
+ struct cred *new;
+ int ret;
- put_group_info(old_info);
+ new = prepare_creds();
+ if (!new)
+ return -ENOMEM;
- return 0;
+ ret = set_groups(new, group_info);
+ if (ret < 0) {
+ abort_creds(new);
+ return ret;
+ }
+
+ return commit_creds(new);
}
EXPORT_SYMBOL(set_current_groups);
asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
{
- int i = 0;
-
- /*
- * SMP: Nobody else can change our grouplist. Thus we are
- * safe.
- */
+ const struct cred *cred = current_cred();
+ int i;
if (gidsetsize < 0)
return -EINVAL;
/* no need to grab task_lock here; it cannot change */
- i = current->group_info->ngroups;
+ i = cred->group_info->ngroups;
if (gidsetsize) {
if (i > gidsetsize) {
i = -EINVAL;
goto out;
}
- if (groups_to_user(grouplist, current->group_info)) {
+ if (groups_to_user(grouplist, cred->group_info)) {
i = -EFAULT;
goto out;
}
*/
int in_group_p(gid_t grp)
{
+ const struct cred *cred = current_cred();
int retval = 1;
- if (grp != current->fsgid)
- retval = groups_search(current->group_info, grp);
+
+ if (grp != cred->fsgid)
+ retval = groups_search(cred->group_info, grp);
return retval;
}
int in_egroup_p(gid_t grp)
{
+ const struct cred *cred = current_cred();
int retval = 1;
- if (grp != current->egid)
- retval = groups_search(current->group_info, grp);
+
+ if (grp != cred->egid)
+ retval = groups_search(cred->group_info, grp);
return retval;
}
asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
- long error = 0;
+ struct task_struct *me = current;
+ unsigned char comm[sizeof(me->comm)];
+ long error;
- if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
+ error = security_task_prctl(option, arg2, arg3, arg4, arg5);
+ if (error != -ENOSYS)
return error;
+ error = 0;
switch (option) {
case PR_SET_PDEATHSIG:
if (!valid_signal(arg2)) {
error = -EINVAL;
break;
}
- current->pdeath_signal = arg2;
+ me->pdeath_signal = arg2;
+ error = 0;
break;
case PR_GET_PDEATHSIG:
- error = put_user(current->pdeath_signal, (int __user *)arg2);
+ error = put_user(me->pdeath_signal, (int __user *)arg2);
break;
case PR_GET_DUMPABLE:
- error = get_dumpable(current->mm);
+ error = get_dumpable(me->mm);
break;
case PR_SET_DUMPABLE:
if (arg2 < 0 || arg2 > 1) {
error = -EINVAL;
break;
}
- set_dumpable(current->mm, arg2);
+ set_dumpable(me->mm, arg2);
+ error = 0;
break;
case PR_SET_UNALIGN:
- error = SET_UNALIGN_CTL(current, arg2);
+ error = SET_UNALIGN_CTL(me, arg2);
break;
case PR_GET_UNALIGN:
- error = GET_UNALIGN_CTL(current, arg2);
+ error = GET_UNALIGN_CTL(me, arg2);
break;
case PR_SET_FPEMU:
- error = SET_FPEMU_CTL(current, arg2);
+ error = SET_FPEMU_CTL(me, arg2);
break;
case PR_GET_FPEMU:
- error = GET_FPEMU_CTL(current, arg2);
+ error = GET_FPEMU_CTL(me, arg2);
break;
case PR_SET_FPEXC:
- error = SET_FPEXC_CTL(current, arg2);
+ error = SET_FPEXC_CTL(me, arg2);
break;
case PR_GET_FPEXC:
- error = GET_FPEXC_CTL(current, arg2);
+ error = GET_FPEXC_CTL(me, arg2);
break;
case PR_GET_TIMING:
error = PR_TIMING_STATISTICAL;
case PR_SET_TIMING:
if (arg2 != PR_TIMING_STATISTICAL)
error = -EINVAL;
+ else
+ error = 0;
break;
- case PR_SET_NAME: {
- struct task_struct *me = current;
- unsigned char ncomm[sizeof(me->comm)];
-
- ncomm[sizeof(me->comm)-1] = 0;
- if (strncpy_from_user(ncomm, (char __user *)arg2,
- sizeof(me->comm)-1) < 0)
+ case PR_SET_NAME:
+ comm[sizeof(me->comm)-1] = 0;
+ if (strncpy_from_user(comm, (char __user *)arg2,
+ sizeof(me->comm) - 1) < 0)
return -EFAULT;
- set_task_comm(me, ncomm);
+ set_task_comm(me, comm);
return 0;
- }
- case PR_GET_NAME: {
- struct task_struct *me = current;
- unsigned char tcomm[sizeof(me->comm)];
-
- get_task_comm(tcomm, me);
- if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
+ case PR_GET_NAME:
+ get_task_comm(comm, me);
+ if (copy_to_user((char __user *)arg2, comm,
+ sizeof(comm)))
return -EFAULT;
return 0;
- }
case PR_GET_ENDIAN:
- error = GET_ENDIAN(current, arg2);
+ error = GET_ENDIAN(me, arg2);
break;
case PR_SET_ENDIAN:
- error = SET_ENDIAN(current, arg2);
+ error = SET_ENDIAN(me, arg2);
break;
case PR_GET_SECCOMP:
current->default_timer_slack_ns;
else
current->timer_slack_ns = arg2;
+ error = 0;
break;
default:
error = -EINVAL;
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ if (unlikely(is_pfn_mapping(vma))) {
+ /*
+ * We do not free on error cases below as remove_vma
+ * gets called on error from higher level routine
+ */
+ ret = track_pfn_vma_copy(vma);
+ if (ret)
+ return ret;
+ }
+
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
if (vma->vm_flags & VM_ACCOUNT)
*nr_accounted += (end - start) >> PAGE_SHIFT;
+ if (unlikely(is_pfn_mapping(vma)))
+ untrack_pfn_vma(vma, 0, 0);
+
while (start != end) {
if (!tlb_start_valid) {
tlb_start = start;
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
{
+ int ret;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+ if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE))
+ return -EINVAL;
+
+ ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+
+ if (ret)
+ untrack_pfn_vma(vma, pfn, PAGE_SIZE);
+
+ return ret;
}
EXPORT_SYMBOL(vm_insert_pfn);
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
*/
- if (is_cow_mapping(vma->vm_flags)) {
- if (addr != vma->vm_start || end != vma->vm_end)
- return -EINVAL;
+ if (addr == vma->vm_start && end == vma->vm_end)
vma->vm_pgoff = pfn;
- }
+ else if (is_cow_mapping(vma->vm_flags))
+ return -EINVAL;
vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size));
+ if (err)
+ return -EINVAL;
+
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
if (err)
break;
} while (pgd++, addr = next, addr != end);
+
+ if (err)
+ untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
+
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
#endif /* __HAVE_ARCH_GATE_AREA */
#ifdef CONFIG_HAVE_IOREMAP_PROT
-static resource_size_t follow_phys(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned long *prot)
+int follow_phys(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags,
+ unsigned long *prot, resource_size_t *phys)
{
pgd_t *pgd;
pud_t *pud;
spinlock_t *ptl;
resource_size_t phys_addr = 0;
struct mm_struct *mm = vma->vm_mm;
+ int ret = -EINVAL;
- VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- goto no_page_table;
+ goto out;
pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
- goto no_page_table;
+ goto out;
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto no_page_table;
+ goto out;
/* We cannot handle huge page PFN maps. Luckily they don't exist. */
if (pmd_huge(*pmd))
- goto no_page_table;
+ goto out;
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
*prot = pgprot_val(pte_pgprot(pte));
+ *phys = phys_addr;
+ ret = 0;
unlock:
pte_unmap_unlock(ptep, ptl);
out:
- return phys_addr;
-no_page_table:
- return 0;
+ return ret;
}
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *maddr;
int offset = addr & (PAGE_SIZE-1);
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- return -EINVAL;
-
- phys_addr = follow_phys(vma, addr, write, &prot);
-
- if (!phys_addr)
+ if (follow_phys(vma, addr, write, &prot, &phys_addr))
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
}
up_read(¤t->mm->mmap_sem);
}
+
+ #ifdef CONFIG_PROVE_LOCKING
+ void might_fault(void)
+ {
+ might_sleep();
+ /*
+ * it would be nicer only to annotate paths which are not under
+ * pagefault_disable, however that requires a larger audit and
+ * providing helpers like get_user_atomic.
+ */
+ if (!in_atomic() && current->mm)
+ might_lock_read(¤t->mm->mmap_sem);
+ }
+ EXPORT_SYMBOL(might_fault);
+ #endif