Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 24 May 2012 00:42:39 +0000 (17:42 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 24 May 2012 00:42:39 +0000 (17:42 -0700)
Pull user namespace enhancements from Eric Biederman:
 "This is a course correction for the user namespace, so that we can
  reach an inexpensive, maintainable, and reasonably complete
  implementation.

  Highlights:
   - Config guards make it impossible to enable the user namespace
     together with code that has not been converted to be user
     namespace safe.

   - Use of the new kuid_t type ensures that if you somehow get past
     the config guards, the kernel will encounter type errors when you
     enable user namespaces and attempt to compile in code whose
     permission checks have not been updated to be user namespace safe.

   - All uids from child user namespaces are mapped into the initial
     user namespace before they are processed, removing the need for an
     additional check that the compared uids come from the same user
     namespace.

   - With user namespaces compiled out, performance is as good as or
     better than it is today.

   - For most operations absolutely nothing changes, either in
     performance or operationally, with the user namespace enabled.

   - The worst-case performance I could come up with was timing 1
     billion cache-cold stat operations with the user namespace code
     enabled.  This went from 156s to 164s on my laptop (or 156ns to
     164ns per stat operation).

   - (uid_t)-1 and (gid_t)-1 are reserved as internal error values.
     Most uid/gid-setting system calls treat these values specially
     anyway, so attempting to use -1 as a uid would likely cause
     entertaining failures in userspace.

   - If setuid is called with a uid that cannot be mapped, setuid
     fails.  I have looked at sendmail, login, ssh and every other
     program I could think of that would call setuid, and they all
     check for and handle the case where setuid fails.

   - If stat or a similar system call is called from a context in which
     we cannot map a uid, we lie and return overflowuid.  The LFS
     experience suggests that not lying and returning an error code
     might be better, but the historical precedent with uids is
     different, and I cannot think of anything that would break by
     lying about a uid we can't map.

   - Capabilities are localized to the current user namespace making it
     safe to give the initial user in a user namespace all capabilities.

  My git tree covers all of the modifications needed to convert the core
  kernel and enough changes to make a system bootable to runlevel 1."
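
As a rough illustration of the conversion pattern the series applies
everywhere (make_kuid, from_kuid_munged and uid_eq are the real helpers
visible in the diffs below; the surrounding function is a hypothetical
sketch, not code from this merge):

#include <linux/cred.h>
#include <linux/uidgid.h>

/* Hypothetical permission check showing the typed-kuid pattern: raw
 * uid_t values from userspace are converted with make_kuid(),
 * kernel-internal ids are compared with uid_eq(), and ids reported
 * back to userspace go through from_kuid_munged().
 */
static bool may_access(const struct cred *cred, kuid_t owner)
{
	/* Typed comparison; with user namespaces enabled, a raw
	 * "cred->fsuid == owner" fails to compile, which is exactly
	 * the type-error safety net described above. */
	return uid_eq(cred->fsuid, owner);
}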

Fix up trivial conflicts due to nearby independent changes in fs/stat.c
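
To make the setuid bullet above concrete, here is a minimal userspace
sketch of the check-and-handle pattern that sendmail, login, ssh and
friends already follow (the target uid is an arbitrary example; only
standard POSIX calls are used):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	uid_t target = 1000;	/* arbitrary; may have no mapping in the
				 * current user namespace */

	if (setuid(target) != 0) {
		/* With this series, an unmappable uid makes setuid
		 * fail cleanly instead of misbehaving later. */
		perror("setuid");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}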

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (46 commits)
  userns: Silence silly gcc warning.
  cred: use correct cred accessor with regards to rcu read lock
  userns: Convert the move_pages, and migrate_pages permission checks to use uid_eq
  userns: Convert cgroup permission checks to use uid_eq
  userns: Convert tmpfs to use kuid and kgid where appropriate
  userns: Convert sysfs to use kgid/kuid where appropriate
  userns: Convert sysctl permission checks to use kuid and kgids.
  userns: Convert proc to use kuid/kgid where appropriate
  userns: Convert ext4 to use kuid/kgid where appropriate
  userns: Convert ext3 to use kuid/kgid where appropriate
  userns: Convert ext2 to use kuid/kgid where appropriate.
  userns: Convert devpts to use kuid/kgid where appropriate
  userns: Convert binary formats to use kuid/kgid where appropriate
  userns: Add negative depends on entries to avoid building code that is userns unsafe
  userns: signal remove unnecessary map_cred_ns
  userns: Teach inode_capable to understand inodes whose uids map to other namespaces.
  userns: Fail exec for suid and sgid binaries with ids outside our user namespace.
  userns: Convert stat to return values mapped from kuids and kgids
  userns: Convert user specified uids and gids in chown into kuids and kgids
  userns: Use uid_eq gid_eq helpers when comparing kuids and kgids in the vfs
  ...
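
The stat conversion listed above is where the "lie and return
overflowuid" behaviour and the reserved (uid_t)-1 error value meet.  A
sketch of the equivalent logic (the function name is illustrative;
from_kuid() and overflowuid are the real kernel symbols):

#include <linux/highuid.h>
#include <linux/uidgid.h>

/* Equivalent logic to from_kuid_munged(): from_kuid() yields
 * (uid_t)-1 for a kuid with no mapping in @targ, and the munged
 * variant substitutes the overflow uid rather than exposing -1.
 */
static uid_t from_kuid_munged_sketch(struct user_namespace *targ, kuid_t kuid)
{
	uid_t uid = from_kuid(targ, kuid);

	if (uid == (uid_t)-1)
		uid = overflowuid;
	return uid;
}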

30 files changed:
arch/x86/ia32/sys_ia32.c
fs/binfmt_elf.c
fs/binfmt_elf_fdpic.c
fs/exec.c
fs/ext4/ext4.h
fs/ext4/super.c
fs/locks.c
fs/namei.c
fs/open.c
fs/proc/base.c
fs/proc/root.c
fs/stat.c
include/linux/capability.h
include/linux/fs.h
include/linux/sched.h
init/Kconfig
kernel/cgroup.c
kernel/cred.c
kernel/sched/core.c
kernel/signal.c
kernel/sys.c
kernel/timer.c
mm/mempolicy.c
mm/migrate.c
net/core/sock.c
net/ipv4/ping.c
security/commoncap.c
security/keys/key.c
security/keys/permission.c
security/keys/process_keys.c
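
Before the diffs, one pattern worth calling out: the "capabilities are
localized" bullet is what inode_capable() (used in the fs/namei.c hunks
below) encodes.  Roughly, as a sketch rather than the verbatim helper:

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/uidgid.h>

/* Rough shape of the localized-capability check: the caller must hold
 * the capability in its current user namespace, and the inode's owner
 * must actually be mapped in that namespace.
 */
static bool inode_capable_sketch(const struct inode *inode, int cap)
{
	struct user_namespace *ns = current_user_ns();

	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
}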

diff --combined arch/x86/ia32/sys_ia32.c
index edca9c0a79ccbecb73f2cc90973525f658ba66d7,d5c820a5459016b128f232a5d5fb4c33f40f8421..4540bece09466149a3122a48c8538cef9622554b
@@@ -71,8 -71,8 +71,8 @@@ static int cp_stat64(struct stat64 __us
  {
        typeof(ubuf->st_uid) uid = 0;
        typeof(ubuf->st_gid) gid = 0;
-       SET_UID(uid, stat->uid);
-       SET_GID(gid, stat->gid);
+       SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid));
+       SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid));
        if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
            __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
            __put_user(stat->ino, &ubuf->__st_ino) ||
@@@ -287,6 -287,11 +287,6 @@@ asmlinkage long sys32_sigaction(int sig
        return ret;
  }
  
 -asmlinkage long sys32_alarm(unsigned int seconds)
 -{
 -      return alarm_setitimer(seconds);
 -}
 -
  asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
                              int options)
  {
  
  /* 32-bit timeval and related flotsam.  */
  
 -asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2)
 -{
 -      return sys_sysfs(option, arg1, arg2);
 -}
 -
  asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid,
                                    struct compat_timespec __user *interval)
  {
@@@ -365,6 -375,19 +365,6 @@@ asmlinkage long sys32_pwrite(unsigned i
  }
  
  
 -asmlinkage long sys32_personality(unsigned long personality)
 -{
 -      int ret;
 -
 -      if (personality(current->personality) == PER_LINUX32 &&
 -              personality == PER_LINUX)
 -              personality = PER_LINUX32;
 -      ret = sys_personality(personality);
 -      if (ret == PER_LINUX32)
 -              ret = PER_LINUX;
 -      return ret;
 -}
 -
  asmlinkage long sys32_sendfile(int out_fd, int in_fd,
                               compat_off_t __user *offset, s32 count)
  {
diff --combined fs/binfmt_elf.c
index 16f7354170725e050e69bf971aeb63eb57598c3e,efc673163ef5ea26ecee2d9952ba9822a13b1a6f..e658dd134b95fb375b371a931e739baa95d249a8
@@@ -82,7 -82,9 +82,7 @@@ static int set_brk(unsigned long start
        end = ELF_PAGEALIGN(end);
        if (end > start) {
                unsigned long addr;
 -              down_write(&current->mm->mmap_sem);
 -              addr = do_brk(start, end - start);
 -              up_write(&current->mm->mmap_sem);
 +              addr = vm_brk(start, end - start);
                if (BAD_ADDR(addr))
                        return addr;
        }
@@@ -226,10 -228,10 +226,10 @@@ create_elf_tables(struct linux_binprm *
        NEW_AUX_ENT(AT_BASE, interp_load_addr);
        NEW_AUX_ENT(AT_FLAGS, 0);
        NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
-       NEW_AUX_ENT(AT_UID, cred->uid);
-       NEW_AUX_ENT(AT_EUID, cred->euid);
-       NEW_AUX_ENT(AT_GID, cred->gid);
-       NEW_AUX_ENT(AT_EGID, cred->egid);
+       NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
+       NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
+       NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid));
+       NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
        NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
        NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
        NEW_AUX_ENT(AT_EXECFN, bprm->exec);
@@@ -512,7 -514,9 +512,7 @@@ static unsigned long load_elf_interp(st
                elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
  
                /* Map the last of the bss segment */
 -              down_write(&current->mm->mmap_sem);
 -              error = do_brk(elf_bss, last_bss - elf_bss);
 -              up_write(&current->mm->mmap_sem);
 +              error = vm_brk(elf_bss, last_bss - elf_bss);
                if (BAD_ADDR(error))
                        goto out_close;
        }
@@@ -958,8 -962,10 +958,8 @@@ static int load_elf_binary(struct linux
                   and some applications "depend" upon this behavior.
                   Since we do not have the power to recompile these, we
                   emulate the SVr4 behavior. Sigh. */
 -              down_write(&current->mm->mmap_sem);
 -              error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
 +              error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
                                MAP_FIXED | MAP_PRIVATE, 0);
 -              up_write(&current->mm->mmap_sem);
        }
  
  #ifdef ELF_PLAT_INIT
@@@ -1044,7 -1050,8 +1044,7 @@@ static int load_elf_library(struct fil
                eppnt++;
  
        /* Now use mmap to map the library into memory. */
 -      down_write(&current->mm->mmap_sem);
 -      error = do_mmap(file,
 +      error = vm_mmap(file,
                        ELF_PAGESTART(eppnt->p_vaddr),
                        (eppnt->p_filesz +
                         ELF_PAGEOFFSET(eppnt->p_vaddr)),
                        MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
                        (eppnt->p_offset -
                         ELF_PAGEOFFSET(eppnt->p_vaddr)));
 -      up_write(&current->mm->mmap_sem);
        if (error != ELF_PAGESTART(eppnt->p_vaddr))
                goto out_free_ph;
  
        len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
                            ELF_MIN_ALIGN - 1);
        bss = eppnt->p_memsz + eppnt->p_vaddr;
 -      if (bss > len) {
 -              down_write(&current->mm->mmap_sem);
 -              do_brk(len, bss - len);
 -              up_write(&current->mm->mmap_sem);
 -      }
 +      if (bss > len)
 +              vm_brk(len, bss - len);
        error = 0;
  
  out_free_ph:
@@@ -1356,8 -1367,8 +1356,8 @@@ static int fill_psinfo(struct elf_prpsi
        psinfo->pr_flag = p->flags;
        rcu_read_lock();
        cred = __task_cred(p);
-       SET_UID(psinfo->pr_uid, cred->uid);
-       SET_GID(psinfo->pr_gid, cred->gid);
+       SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+       SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
        rcu_read_unlock();
        strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
        
diff --combined fs/binfmt_elf_fdpic.c
index d390a0fffc65e1794c1985a2a626a87ed16c124b,82bf0ed0cd105078bdd433f6355e02044b27a4b0..3d77cf81ba3c82c8dbe7d80531e34bea4355250c
@@@ -390,17 -390,21 +390,17 @@@ static int load_elf_fdpic_binary(struc
            (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
                stack_prot |= PROT_EXEC;
  
 -      down_write(&current->mm->mmap_sem);
 -      current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
 +      current->mm->start_brk = vm_mmap(NULL, 0, stack_size, stack_prot,
                                         MAP_PRIVATE | MAP_ANONYMOUS |
                                         MAP_UNINITIALIZED | MAP_GROWSDOWN,
                                         0);
  
        if (IS_ERR_VALUE(current->mm->start_brk)) {
 -              up_write(&current->mm->mmap_sem);
                retval = current->mm->start_brk;
                current->mm->start_brk = 0;
                goto error_kill;
        }
  
 -      up_write(&current->mm->mmap_sem);
 -
        current->mm->brk = current->mm->start_brk;
        current->mm->context.end_brk = current->mm->start_brk;
        current->mm->context.end_brk +=
@@@ -627,10 -631,10 +627,10 @@@ static int create_elf_fdpic_tables(stru
        NEW_AUX_ENT(AT_BASE,    interp_params->elfhdr_addr);
        NEW_AUX_ENT(AT_FLAGS,   0);
        NEW_AUX_ENT(AT_ENTRY,   exec_params->entry_addr);
-       NEW_AUX_ENT(AT_UID,     (elf_addr_t) cred->uid);
-       NEW_AUX_ENT(AT_EUID,    (elf_addr_t) cred->euid);
-       NEW_AUX_ENT(AT_GID,     (elf_addr_t) cred->gid);
-       NEW_AUX_ENT(AT_EGID,    (elf_addr_t) cred->egid);
+       NEW_AUX_ENT(AT_UID,     (elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid));
+       NEW_AUX_ENT(AT_EUID,    (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid));
+       NEW_AUX_ENT(AT_GID,     (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid));
+       NEW_AUX_ENT(AT_EGID,    (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid));
        NEW_AUX_ENT(AT_SECURE,  security_bprm_secureexec(bprm));
        NEW_AUX_ENT(AT_EXECFN,  bprm->exec);
  
@@@ -951,8 -955,10 +951,8 @@@ static int elf_fdpic_map_file_constdisp
        if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
                mflags |= MAP_EXECUTABLE;
  
 -      down_write(&mm->mmap_sem);
 -      maddr = do_mmap(NULL, load_addr, top - base,
 +      maddr = vm_mmap(NULL, load_addr, top - base,
                        PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0);
 -      up_write(&mm->mmap_sem);
        if (IS_ERR_VALUE(maddr))
                return (int) maddr;
  
@@@ -1090,8 -1096,10 +1090,8 @@@ static int elf_fdpic_map_file_by_direct
  
                /* create the mapping */
                disp = phdr->p_vaddr & ~PAGE_MASK;
 -              down_write(&mm->mmap_sem);
 -              maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
 +              maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
                                phdr->p_offset - disp);
 -              up_write(&mm->mmap_sem);
  
                kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
                       loop, phdr->p_memsz + disp, prot, flags,
                        unsigned long xmaddr;
  
                        flags |= MAP_FIXED | MAP_ANONYMOUS;
 -                      down_write(&mm->mmap_sem);
 -                      xmaddr = do_mmap(NULL, xaddr, excess - excess1,
 +                      xmaddr = vm_mmap(NULL, xaddr, excess - excess1,
                                         prot, flags, 0);
 -                      up_write(&mm->mmap_sem);
  
                        kdebug("mmap[%d] <anon>"
                               " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx",
@@@ -1421,8 -1431,8 +1421,8 @@@ static int fill_psinfo(struct elf_prpsi
        psinfo->pr_flag = p->flags;
        rcu_read_lock();
        cred = __task_cred(p);
-       SET_UID(psinfo->pr_uid, cred->uid);
-       SET_GID(psinfo->pr_gid, cred->gid);
+       SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+       SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
        rcu_read_unlock();
        strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
  
diff --combined fs/exec.c
index 1e8efdc80412794d0e566db69783f2a310408aa6,e001bdfac5306391a74ded9ea0a5ac4efd856892..52c9e2ff6e6bd8b6f763e56ceafda431731cea9b
+++ b/fs/exec.c
@@@ -1139,7 -1139,7 +1139,7 @@@ void setup_new_exec(struct linux_binpr
        /* This is the point of no return */
        current->sas_ss_sp = current->sas_ss_size = 0;
  
-       if (current_euid() == current_uid() && current_egid() == current_gid())
+       if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
                set_dumpable(current->mm, 1);
        else
                set_dumpable(current->mm, suid_dumpable);
        current->mm->task_size = TASK_SIZE;
  
        /* install the new credentials */
-       if (bprm->cred->uid != current_euid() ||
-           bprm->cred->gid != current_egid()) {
+       if (!uid_eq(bprm->cred->uid, current_euid()) ||
+           !gid_eq(bprm->cred->gid, current_egid())) {
                current->pdeath_signal = 0;
        } else {
                would_dump(bprm, bprm->file);
@@@ -1245,13 -1245,6 +1245,13 @@@ static int check_unsafe_exec(struct lin
                        bprm->unsafe |= LSM_UNSAFE_PTRACE;
        }
  
 +      /*
 +       * This isn't strictly necessary, but it makes it harder for LSMs to
 +       * mess up.
 +       */
 +      if (current->no_new_privs)
 +              bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
 +
        n_fs = 1;
        spin_lock(&p->fs->lock);
        rcu_read_lock();
@@@ -1295,12 -1288,14 +1295,15 @@@ int prepare_binprm(struct linux_binprm 
        bprm->cred->euid = current_euid();
        bprm->cred->egid = current_egid();
  
 -      if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
 +      if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
 +          !current->no_new_privs) {
                /* Set-uid? */
                if (mode & S_ISUID) {
+                       if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
+                               return -EPERM;
                        bprm->per_clear |= PER_CLEAR_ON_SETID;
                        bprm->cred->euid = inode->i_uid;
                }
  
                /* Set-gid? */
                 * executable.
                 */
                if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+                       if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
+                               return -EPERM;
                        bprm->per_clear |= PER_CLEAR_ON_SETID;
                        bprm->cred->egid = inode->i_gid;
                }
@@@ -1379,7 -1376,7 +1384,7 @@@ int search_binary_handler(struct linux_
        unsigned int depth = bprm->recursion_depth;
        int try,retval;
        struct linux_binfmt *fmt;
 -      pid_t old_pid;
 +      pid_t old_pid, old_vpid;
  
        retval = security_bprm_check(bprm);
        if (retval)
                return retval;
  
        /* Need to fetch pid before load_binary changes it */
 +      old_pid = current->pid;
        rcu_read_lock();
 -      old_pid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
 +      old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
        rcu_read_unlock();
  
        retval = -ENOENT;
                        if (retval >= 0) {
                                if (depth == 0) {
                                        trace_sched_process_exec(current, old_pid, bprm);
 -                                      ptrace_event(PTRACE_EVENT_EXEC, old_pid);
 +                                      ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
                                }
                                put_binfmt(fmt);
                                allow_write_access(bprm->file);
@@@ -1938,21 -1934,8 +1943,21 @@@ static int coredump_wait(int exit_code
                core_waiters = zap_threads(tsk, mm, core_state, exit_code);
        up_write(&mm->mmap_sem);
  
 -      if (core_waiters > 0)
 +      if (core_waiters > 0) {
 +              struct core_thread *ptr;
 +
                wait_for_completion(&core_state->startup);
 +              /*
 +               * Wait for all the threads to become inactive, so that
 +               * all the thread context (extended register state, like
 +               * fpu etc) gets copied to the memory.
 +               */
 +              ptr = core_state->dumper.next;
 +              while (ptr != NULL) {
 +                      wait_task_inactive(ptr->task, 0);
 +                      ptr = ptr->next;
 +              }
 +      }
  
        return core_waiters;
  }
@@@ -2142,7 -2125,7 +2147,7 @@@ void do_coredump(long signr, int exit_c
        if (__get_dumpable(cprm.mm_flags) == 2) {
                /* Setuid core dump mode */
                flag = O_EXCL;          /* Stop rewrite attacks */
-               cred->fsuid = 0;        /* Dump root private */
+               cred->fsuid = GLOBAL_ROOT_UID;  /* Dump root private */
        }
  
        retval = coredump_wait(exit_code, &core_state);
                 * Dont allow local users get cute and trick others to coredump
                 * into their pre-created files.
                 */
-               if (inode->i_uid != current_fsuid())
+               if (!uid_eq(inode->i_uid, current_fsuid()))
                        goto close_fail;
                if (!cprm.file->f_op || !cprm.file->f_op->write)
                        goto close_fail;
diff --combined fs/ext4/ext4.h
index 0e01e90add8bc42f1492a73a2e1e78331b8ae134,0b4aeb24593cfdc267f306385fa9e880046063db..c21b1de51afbb42191adea4fc4a357e3906c8489
@@@ -1153,8 -1153,8 +1153,8 @@@ struct ext4_sb_info 
        unsigned int s_mount_flags;
        unsigned int s_def_mount_opt;
        ext4_fsblk_t s_sb_block;
-       uid_t s_resuid;
-       gid_t s_resgid;
+       kuid_t s_resuid;
+       kgid_t s_resgid;
        unsigned short s_mount_state;
        unsigned short s_pad;
        int s_addr_per_block_bits;
        unsigned long s_ext_blocks;
        unsigned long s_ext_extents;
  #endif
 -      /* ext4 extent cache stats */
 -      unsigned long extent_cache_hits;
 -      unsigned long extent_cache_misses;
  
        /* for buddy allocator */
        struct ext4_group_info ***s_group_info;
diff --combined fs/ext4/super.c
index e1fb1d5de58eab4150792974f9784751f4638557,9d8eba0de27d02f69dc45e71227ce2ea782f24b1..436b4223df66a889dc4e8f685cc89195afec8650
@@@ -1305,20 -1305,20 +1305,20 @@@ static int set_qf_name(struct super_blo
                ext4_msg(sb, KERN_ERR,
                        "Cannot change journaled "
                        "quota options when quota turned on");
 -              return 0;
 +              return -1;
        }
        qname = match_strdup(args);
        if (!qname) {
                ext4_msg(sb, KERN_ERR,
                        "Not enough memory for storing quotafile name");
 -              return 0;
 +              return -1;
        }
        if (sbi->s_qf_names[qtype] &&
                strcmp(sbi->s_qf_names[qtype], qname)) {
                ext4_msg(sb, KERN_ERR,
                        "%s quota file already specified", QTYPE2NAME(qtype));
                kfree(qname);
 -              return 0;
 +              return -1;
        }
        sbi->s_qf_names[qtype] = qname;
        if (strchr(sbi->s_qf_names[qtype], '/')) {
                        "quotafile must be on filesystem root");
                kfree(sbi->s_qf_names[qtype]);
                sbi->s_qf_names[qtype] = NULL;
 -              return 0;
 +              return -1;
        }
        set_opt(sb, QUOTA);
        return 1;
@@@ -1341,7 -1341,7 +1341,7 @@@ static int clear_qf_name(struct super_b
                sbi->s_qf_names[qtype]) {
                ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
                        " when quota turned on");
 -              return 0;
 +              return -1;
        }
        /*
         * The space will be released later when all options are confirmed
@@@ -1448,18 -1448,10 +1448,20 @@@ static int handle_mount_opt(struct supe
  {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        const struct mount_opts *m;
+       kuid_t uid;
+       kgid_t gid;
        int arg = 0;
  
 +#ifdef CONFIG_QUOTA
 +      if (token == Opt_usrjquota)
 +              return set_qf_name(sb, USRQUOTA, &args[0]);
 +      else if (token == Opt_grpjquota)
 +              return set_qf_name(sb, GRPQUOTA, &args[0]);
 +      else if (token == Opt_offusrjquota)
 +              return clear_qf_name(sb, USRQUOTA);
 +      else if (token == Opt_offgrpjquota)
 +              return clear_qf_name(sb, GRPQUOTA);
 +#endif
        if (args->from && match_int(args, &arg))
                return -1;
        switch (token) {
                         "Ignoring removed %s option", opt);
                return 1;
        case Opt_resuid:
-               sbi->s_resuid = arg;
+               uid = make_kuid(current_user_ns(), arg);
+               if (!uid_valid(uid)) {
+                       ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
+                       return -1;
+               }
+               sbi->s_resuid = uid;
                return 1;
        case Opt_resgid:
-               sbi->s_resgid = arg;
+               gid = make_kgid(current_user_ns(), arg);
+               if (!gid_valid(gid)) {
+                       ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
+                       return -1;
+               }
+               sbi->s_resgid = gid;
                return 1;
        case Opt_abort:
                sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
                                sbi->s_mount_opt |= m->mount_opt;
                        }
  #ifdef CONFIG_QUOTA
 -              } else if (token == Opt_usrjquota) {
 -                      if (!set_qf_name(sb, USRQUOTA, &args[0]))
 -                              return -1;
 -              } else if (token == Opt_grpjquota) {
 -                      if (!set_qf_name(sb, GRPQUOTA, &args[0]))
 -                              return -1;
 -              } else if (token == Opt_offusrjquota) {
 -                      if (!clear_qf_name(sb, USRQUOTA))
 -                              return -1;
 -              } else if (token == Opt_offgrpjquota) {
 -                      if (!clear_qf_name(sb, GRPQUOTA))
 -                              return -1;
                } else if (m->flags & MOPT_QFMT) {
                        if (sb_any_quota_loaded(sb) &&
                            sbi->s_jquota_fmt != m->mount_opt) {
@@@ -1597,9 -1611,7 +1609,9 @@@ static int parse_options(char *options
                         unsigned int *journal_ioprio,
                         int is_remount)
  {
 +#ifdef CONFIG_QUOTA
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 +#endif
        char *p;
        substring_t args[MAX_OPT_ARGS];
        int token;
@@@ -1732,12 -1744,14 +1744,14 @@@ static int _ext4_show_options(struct se
                SEQ_OPTS_PRINT("%s", token2str(m->token));
        }
  
-       if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
+       if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
            le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
-               SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
-       if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
+               SEQ_OPTS_PRINT("resuid=%u",
+                               from_kuid_munged(&init_user_ns, sbi->s_resuid));
+       if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
            le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
-               SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
+               SEQ_OPTS_PRINT("resgid=%u",
+                               from_kgid_munged(&init_user_ns, sbi->s_resgid));
        def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
        if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
                SEQ_OPTS_PUTS("errors=remount-ro");
@@@ -2366,6 -2380,18 +2380,6 @@@ static ssize_t lifetime_write_kbytes_sh
                          EXT4_SB(sb)->s_sectors_written_start) >> 1)));
  }
  
 -static ssize_t extent_cache_hits_show(struct ext4_attr *a,
 -                                    struct ext4_sb_info *sbi, char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
 -}
 -
 -static ssize_t extent_cache_misses_show(struct ext4_attr *a,
 -                                      struct ext4_sb_info *sbi, char *buf)
 -{
 -      return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
 -}
 -
  static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
                                          struct ext4_sb_info *sbi,
                                          const char *buf, size_t count)
@@@ -2423,6 -2449,8 +2437,6 @@@ static struct ext4_attr ext4_attr_##nam
  EXT4_RO_ATTR(delayed_allocation_blocks);
  EXT4_RO_ATTR(session_write_kbytes);
  EXT4_RO_ATTR(lifetime_write_kbytes);
 -EXT4_RO_ATTR(extent_cache_hits);
 -EXT4_RO_ATTR(extent_cache_misses);
  EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
                 inode_readahead_blks_store, s_inode_readahead_blks);
  EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@@ -2438,6 -2466,8 +2452,6 @@@ static struct attribute *ext4_attrs[] 
        ATTR_LIST(delayed_allocation_blocks),
        ATTR_LIST(session_write_kbytes),
        ATTR_LIST(lifetime_write_kbytes),
 -      ATTR_LIST(extent_cache_hits),
 -      ATTR_LIST(extent_cache_misses),
        ATTR_LIST(inode_readahead_blks),
        ATTR_LIST(inode_goal),
        ATTR_LIST(mb_stats),
@@@ -2980,8 -3010,8 +2994,8 @@@ static int ext4_fill_super(struct super
        }
        sb->s_fs_info = sbi;
        sbi->s_mount_opt = 0;
-       sbi->s_resuid = EXT4_DEF_RESUID;
-       sbi->s_resgid = EXT4_DEF_RESGID;
+       sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
+       sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sb_block = sb_block;
        if (sb->s_bdev->bd_part)
        if (def_mount_opts & EXT4_DEFM_DISCARD)
                set_opt(sb, DISCARD);
  
-       sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
-       sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+       sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
+       sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
        sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
@@@ -4213,8 -4243,8 +4227,8 @@@ static int ext4_unfreeze(struct super_b
  struct ext4_mount_options {
        unsigned long s_mount_opt;
        unsigned long s_mount_opt2;
-       uid_t s_resuid;
-       gid_t s_resgid;
+       kuid_t s_resuid;
+       kgid_t s_resgid;
        unsigned long s_commit_interval;
        u32 s_min_batch_time, s_max_batch_time;
  #ifdef CONFIG_QUOTA
diff --combined fs/locks.c
index 0d68f1f817996bef79ab16a4b0c51ceecd409d65,3e946cda98c65d87f8b1df46cf154b7f1c7a4b64..4f441e46cef47bc67b08a3e82b78f389dfbbf818
@@@ -510,13 -510,12 +510,13 @@@ static void __locks_delete_block(struc
  
  /*
   */
 -static void locks_delete_block(struct file_lock *waiter)
 +void locks_delete_block(struct file_lock *waiter)
  {
        lock_flocks();
        __locks_delete_block(waiter);
        unlock_flocks();
  }
 +EXPORT_SYMBOL(locks_delete_block);
  
  /* Insert waiter into blocker's block list.
   * We use a circular list so that processes can be easily woken up in
@@@ -1446,7 -1445,7 +1446,7 @@@ int generic_setlease(struct file *filp
        struct inode *inode = dentry->d_inode;
        int error;
  
-       if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
+       if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
                return -EACCES;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
diff --combined fs/namei.c
index f9e883c1b856526fe65ac485c4c225a7407d3fa9,86512b4d38fd126d3ca2c8701d7f276a6774a229..e70ebab9624bb8666b9c0b70598c83be14bfc63c
   * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
   * PATH_MAX includes the nul terminator --RR.
   */
 -static int do_getname(const char __user *filename, char *page)
 -{
 -      int retval;
 -      unsigned long len = PATH_MAX;
 -
 -      if (!segment_eq(get_fs(), KERNEL_DS)) {
 -              if ((unsigned long) filename >= TASK_SIZE)
 -                      return -EFAULT;
 -              if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
 -                      len = TASK_SIZE - (unsigned long) filename;
 -      }
 -
 -      retval = strncpy_from_user(page, filename, len);
 -      if (retval > 0) {
 -              if (retval < len)
 -                      return 0;
 -              return -ENAMETOOLONG;
 -      } else if (!retval)
 -              retval = -ENOENT;
 -      return retval;
 -}
 -
  static char *getname_flags(const char __user *filename, int flags, int *empty)
  {
 -      char *result = __getname();
 -      int retval;
 +      char *result = __getname(), *err;
 +      int len;
  
 -      if (!result)
 +      if (unlikely(!result))
                return ERR_PTR(-ENOMEM);
  
 -      retval = do_getname(filename, result);
 -      if (retval < 0) {
 -              if (retval == -ENOENT && empty)
 +      len = strncpy_from_user(result, filename, PATH_MAX);
 +      err = ERR_PTR(len);
 +      if (unlikely(len < 0))
 +              goto error;
 +
 +      /* The empty path is special. */
 +      if (unlikely(!len)) {
 +              if (empty)
                        *empty = 1;
 -              if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
 -                      __putname(result);
 -                      return ERR_PTR(retval);
 -              }
 +              err = ERR_PTR(-ENOENT);
 +              if (!(flags & LOOKUP_EMPTY))
 +                      goto error;
 +      }
 +
 +      err = ERR_PTR(-ENAMETOOLONG);
 +      if (likely(len < PATH_MAX)) {
 +              audit_getname(result);
 +              return result;
        }
 -      audit_getname(result);
 -      return result;
 +
 +error:
 +      __putname(result);
 +      return err;
  }
  
  char *getname(const char __user * filename)
@@@ -218,10 -228,7 +218,7 @@@ static int acl_permission_check(struct 
  {
        unsigned int mode = inode->i_mode;
  
-       if (current_user_ns() != inode_userns(inode))
-               goto other_perms;
-       if (likely(current_fsuid() == inode->i_uid))
+       if (likely(uid_eq(current_fsuid(), inode->i_uid)))
                mode >>= 6;
        else {
                if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
                        mode >>= 3;
        }
  
- other_perms:
        /*
         * If the DACs are ok we don't need any capability check.
         */
@@@ -270,10 -276,10 +266,10 @@@ int generic_permission(struct inode *in
  
        if (S_ISDIR(inode->i_mode)) {
                /* DACs are overridable for directories */
-               if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
+               if (inode_capable(inode, CAP_DAC_OVERRIDE))
                        return 0;
                if (!(mask & MAY_WRITE))
-                       if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
+                       if (inode_capable(inode, CAP_DAC_READ_SEARCH))
                                return 0;
                return -EACCES;
        }
         * at least one exec bit set.
         */
        if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
-               if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
+               if (inode_capable(inode, CAP_DAC_OVERRIDE))
                        return 0;
  
        /*
         */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ)
-               if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
+               if (inode_capable(inode, CAP_DAC_READ_SEARCH))
                        return 0;
  
        return -EACCES;
@@@ -1144,25 -1150,12 +1140,25 @@@ static int do_lookup(struct nameidata *
         */
        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;
 -              *inode = nd->inode;
 -              dentry = __d_lookup_rcu(parent, name, &seq, inode);
 +              dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
                if (!dentry)
                        goto unlazy;
  
 -              /* Memory barrier in read_seqcount_begin of child is enough */
 +              /*
 +               * This sequence count validates that the inode matches
 +               * the dentry name information from lookup.
 +               */
 +              *inode = dentry->d_inode;
 +              if (read_seqcount_retry(&dentry->d_seq, seq))
 +                      return -ECHILD;
 +
 +              /*
 +               * This sequence count validates that the parent had no
 +               * changes while we did the lookup of the dentry above.
 +               *
 +               * The memory barrier in read_seqcount_begin of child is
 +               *  enough, we can use __read_seqcount_retry here.
 +               */
                if (__read_seqcount_retry(&parent->d_seq, nd->seq))
                        return -ECHILD;
                nd->seq = seq;
@@@ -1410,9 -1403,18 +1406,9 @@@ static inline int can_lookup(struct ino
   */
  #ifdef CONFIG_DCACHE_WORD_ACCESS
  
 -#ifdef CONFIG_64BIT
 +#include <asm/word-at-a-time.h>
  
 -/*
 - * Jan Achrenius on G+: microoptimized version of
 - * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
 - * that works for the bytemasks without having to
 - * mask them first.
 - */
 -static inline long count_masked_bytes(unsigned long mask)
 -{
 -      return mask*0x0001020304050608ul >> 56;
 -}
 +#ifdef CONFIG_64BIT
  
  static inline unsigned int fold_hash(unsigned long hash)
  {
  
  #else /* 32-bit case */
  
 -/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
 -static inline long count_masked_bytes(long mask)
 -{
 -      /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
 -      long a = (0x0ff0001+mask) >> 23;
 -      /* Fix the 1 for 00 case */
 -      return a & mask;
 -}
 -
  #define fold_hash(x) (x)
  
  #endif
@@@ -1432,7 -1443,7 +1428,7 @@@ unsigned int full_name_hash(const unsig
        unsigned long hash = 0;
  
        for (;;) {
 -              a = *(unsigned long *)name;
 +              a = load_unaligned_zeropad(name);
                if (len < sizeof(unsigned long))
                        break;
                hash += a;
@@@ -1449,6 -1460,17 +1445,6 @@@ done
  }
  EXPORT_SYMBOL(full_name_hash);
  
 -#define REPEAT_BYTE(x)        ((~0ul / 0xff) * (x))
 -#define ONEBYTES      REPEAT_BYTE(0x01)
 -#define SLASHBYTES    REPEAT_BYTE('/')
 -#define HIGHBITS      REPEAT_BYTE(0x80)
 -
 -/* Return the high bit set in the first byte that is a zero */
 -static inline unsigned long has_zero(unsigned long a)
 -{
 -      return ((a - ONEBYTES) & ~a) & HIGHBITS;
 -}
 -
  /*
   * Calculate the length and hash of the path component, and
   * return the length of the component;
@@@ -1462,9 -1484,9 +1458,9 @@@ static inline unsigned long hash_name(c
        do {
                hash = (hash + a) * 9;
                len += sizeof(unsigned long);
 -              a = *(unsigned long *)(name+len);
 +              a = load_unaligned_zeropad(name+len);
                /* Do we have any NUL or '/' bytes in this word? */
 -              mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
 +              mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/'));
        } while (!mask);
  
        /* The mask *below* the first high bit set */
@@@ -1934,19 -1956,15 +1930,15 @@@ static int user_path_parent(int dfd, co
   */
  static inline int check_sticky(struct inode *dir, struct inode *inode)
  {
-       uid_t fsuid = current_fsuid();
+       kuid_t fsuid = current_fsuid();
  
        if (!(dir->i_mode & S_ISVTX))
                return 0;
-       if (current_user_ns() != inode_userns(inode))
-               goto other_userns;
-       if (inode->i_uid == fsuid)
+       if (uid_eq(inode->i_uid, fsuid))
                return 0;
-       if (dir->i_uid == fsuid)
+       if (uid_eq(dir->i_uid, fsuid))
                return 0;
- other_userns:
-       return !ns_capable(inode_userns(inode), CAP_FOWNER);
+       return !inode_capable(inode, CAP_FOWNER);
  }
  
  /*
@@@ -2534,8 -2552,7 +2526,7 @@@ int vfs_mknod(struct inode *dir, struc
        if (error)
                return error;
  
-       if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
-           !ns_capable(inode_userns(dir), CAP_MKNOD))
+       if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
                return -EPERM;
  
        if (!dir->i_op->mknod)
diff --combined fs/open.c
index 5eccdcea2d1b977dde9482cabf8976e54e172bf8,e166801383238f948c2c2218a175be4ec0887c52..d54301219d04f1c8fed18d6de15ed590a593e2ab
+++ b/fs/open.c
@@@ -316,7 -316,8 +316,8 @@@ SYSCALL_DEFINE3(faccessat, int, dfd, co
  
        if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                /* Clear the capabilities if we switch to a non-root user */
-               if (override_cred->uid)
+               kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
+               if (!uid_eq(override_cred->uid, root_uid))
                        cap_clear(override_cred->cap_effective);
                else
                        override_cred->cap_effective =
@@@ -505,15 -506,24 +506,24 @@@ static int chown_common(struct path *pa
        struct inode *inode = path->dentry->d_inode;
        int error;
        struct iattr newattrs;
+       kuid_t uid;
+       kgid_t gid;
+       uid = make_kuid(current_user_ns(), user);
+       gid = make_kgid(current_user_ns(), group);
  
        newattrs.ia_valid =  ATTR_CTIME;
        if (user != (uid_t) -1) {
+               if (!uid_valid(uid))
+                       return -EINVAL;
                newattrs.ia_valid |= ATTR_UID;
-               newattrs.ia_uid = user;
+               newattrs.ia_uid = uid;
        }
        if (group != (gid_t) -1) {
+               if (!gid_valid(gid))
+                       return -EINVAL;
                newattrs.ia_valid |= ATTR_GID;
-               newattrs.ia_gid = group;
+               newattrs.ia_gid = gid;
        }
        if (!S_ISDIR(inode->i_mode))
                newattrs.ia_valid |=
@@@ -681,7 -691,7 +691,7 @@@ static struct file *__dentry_open(struc
  
        f->f_op = fops_get(inode->i_fop);
  
 -      error = security_dentry_open(f, cred);
 +      error = security_file_open(f, cred);
        if (error)
                goto cleanup_all;
  
diff --combined fs/proc/base.c
index 57b8159f26f328e4fdd6308ef1b656a587cf7a28,c47904994b78248fc1984914eab3b3c382952139..d2d3108a611c8cf96b6d1aa275270a3929556ccf
@@@ -81,6 -81,7 +81,7 @@@
  #include <linux/oom.h>
  #include <linux/elf.h>
  #include <linux/pid_namespace.h>
+ #include <linux/user_namespace.h>
  #include <linux/fs_struct.h>
  #include <linux/slab.h>
  #include <linux/flex_array.h>
@@@ -1561,8 -1562,8 +1562,8 @@@ int pid_getattr(struct vfsmount *mnt, s
        generic_fillattr(inode, stat);
  
        rcu_read_lock();
-       stat->uid = 0;
-       stat->gid = 0;
+       stat->uid = GLOBAL_ROOT_UID;
+       stat->gid = GLOBAL_ROOT_GID;
        task = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (task) {
                if (!has_pid_permissions(pid, task, 2)) {
@@@ -1622,8 -1623,8 +1623,8 @@@ int pid_revalidate(struct dentry *dentr
                        inode->i_gid = cred->egid;
                        rcu_read_unlock();
                } else {
-                       inode->i_uid = 0;
-                       inode->i_gid = 0;
+                       inode->i_uid = GLOBAL_ROOT_UID;
+                       inode->i_gid = GLOBAL_ROOT_GID;
                }
                inode->i_mode &= ~(S_ISUID | S_ISGID);
                security_task_to_inode(task, inode);
@@@ -1799,15 -1800,10 +1800,15 @@@ static int tid_fd_revalidate(struct den
        if (task) {
                files = get_files_struct(task);
                if (files) {
 +                      struct file *file;
                        rcu_read_lock();
 -                      if (fcheck_files(files, fd)) {
 +                      file = fcheck_files(files, fd);
 +                      if (file) {
 +                              unsigned i_mode, f_mode = file->f_mode;
 +
                                rcu_read_unlock();
                                put_files_struct(files);
 +
                                if (task_dumpable(task)) {
                                        rcu_read_lock();
                                        cred = __task_cred(task);
                                        inode->i_gid = cred->egid;
                                        rcu_read_unlock();
                                } else {
-                                       inode->i_uid = 0;
-                                       inode->i_gid = 0;
+                                       inode->i_uid = GLOBAL_ROOT_UID;
+                                       inode->i_gid = GLOBAL_ROOT_GID;
                                }
 -                              inode->i_mode &= ~(S_ISUID | S_ISGID);
 +
 +                              i_mode = S_IFLNK;
 +                              if (f_mode & FMODE_READ)
 +                                      i_mode |= S_IRUSR | S_IXUSR;
 +                              if (f_mode & FMODE_WRITE)
 +                                      i_mode |= S_IWUSR | S_IXUSR;
 +                              inode->i_mode = i_mode;
 +
                                security_task_to_inode(task, inode);
                                put_task_struct(task);
                                return 1;
@@@ -1849,6 -1838,8 +1850,6 @@@ static struct dentry *proc_fd_instantia
        struct dentry *dentry, struct task_struct *task, const void *ptr)
  {
        unsigned fd = *(const unsigned *)ptr;
 -      struct file *file;
 -      struct files_struct *files;
        struct inode *inode;
        struct proc_inode *ei;
        struct dentry *error = ERR_PTR(-ENOENT);
                goto out;
        ei = PROC_I(inode);
        ei->fd = fd;
 -      files = get_files_struct(task);
 -      if (!files)
 -              goto out_iput;
 -      inode->i_mode = S_IFLNK;
 -
 -      /*
 -       * We are not taking a ref to the file structure, so we must
 -       * hold ->file_lock.
 -       */
 -      spin_lock(&files->file_lock);
 -      file = fcheck_files(files, fd);
 -      if (!file)
 -              goto out_unlock;
 -      if (file->f_mode & FMODE_READ)
 -              inode->i_mode |= S_IRUSR | S_IXUSR;
 -      if (file->f_mode & FMODE_WRITE)
 -              inode->i_mode |= S_IWUSR | S_IXUSR;
 -      spin_unlock(&files->file_lock);
 -      put_files_struct(files);
  
        inode->i_op = &proc_pid_link_inode_operations;
        inode->i_size = 64;
  
   out:
        return error;
 -out_unlock:
 -      spin_unlock(&files->file_lock);
 -      put_files_struct(files);
 -out_iput:
 -      iput(inode);
 -      goto out;
  }
  
  static struct dentry *proc_lookupfd_common(struct inode *dir,
@@@ -2045,8 -2061,8 +2046,8 @@@ static int map_files_d_revalidate(struc
                        inode->i_gid = cred->egid;
                        rcu_read_unlock();
                } else {
-                       inode->i_uid = 0;
-                       inode->i_gid = 0;
+                       inode->i_uid = GLOBAL_ROOT_UID;
+                       inode->i_gid = GLOBAL_ROOT_GID;
                }
                security_task_to_inode(task, inode);
                status = 1;
@@@ -2162,16 -2178,16 +2163,16 @@@ static struct dentry *proc_map_files_lo
                goto out;
  
        result = ERR_PTR(-EACCES);
 -      if (lock_trace(task))
 +      if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out_put_task;
  
        result = ERR_PTR(-ENOENT);
        if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
 -              goto out_unlock;
 +              goto out_put_task;
  
        mm = get_task_mm(task);
        if (!mm)
 -              goto out_unlock;
 +              goto out_put_task;
  
        down_read(&mm->mmap_sem);
        vma = find_exact_vma(mm, vm_start, vm_end);
  out_no_vma:
        up_read(&mm->mmap_sem);
        mmput(mm);
 -out_unlock:
 -      unlock_trace(task);
  out_put_task:
        put_task_struct(task);
  out:
@@@ -2216,7 -2234,7 +2217,7 @@@ proc_map_files_readdir(struct file *fil
                goto out;
  
        ret = -EACCES;
 -      if (lock_trace(task))
 +      if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out_put_task;
  
        ret = 0;
        case 0:
                ino = inode->i_ino;
                if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
 -                      goto out_unlock;
 +                      goto out_put_task;
                filp->f_pos++;
        case 1:
                ino = parent_ino(dentry);
                if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
 -                      goto out_unlock;
 +                      goto out_put_task;
                filp->f_pos++;
        default:
        {
  
                mm = get_task_mm(task);
                if (!mm)
 -                      goto out_unlock;
 +                      goto out_put_task;
                down_read(&mm->mmap_sem);
  
                nr_files = 0;
                                        flex_array_free(fa);
                                up_read(&mm->mmap_sem);
                                mmput(mm);
 -                              goto out_unlock;
 +                              goto out_put_task;
                        }
                        for (i = 0, vma = mm->mmap, pos = 2; vma;
                                        vma = vma->vm_next) {
        }
        }
  
 -out_unlock:
 -      unlock_trace(task);
  out_put_task:
        put_task_struct(task);
  out:
@@@ -2924,6 -2944,74 +2925,74 @@@ static int proc_tgid_io_accounting(stru
  }
  #endif /* CONFIG_TASK_IO_ACCOUNTING */
  
+ #ifdef CONFIG_USER_NS
+ static int proc_id_map_open(struct inode *inode, struct file *file,
+       struct seq_operations *seq_ops)
+ {
+       struct user_namespace *ns = NULL;
+       struct task_struct *task;
+       struct seq_file *seq;
+       int ret = -EINVAL;
+       task = get_proc_task(inode);
+       if (task) {
+               rcu_read_lock();
+               ns = get_user_ns(task_cred_xxx(task, user_ns));
+               rcu_read_unlock();
+               put_task_struct(task);
+       }
+       if (!ns)
+               goto err;
+       ret = seq_open(file, seq_ops);
+       if (ret)
+               goto err_put_ns;
+       seq = file->private_data;
+       seq->private = ns;
+       return 0;
+ err_put_ns:
+       put_user_ns(ns);
+ err:
+       return ret;
+ }
+ static int proc_id_map_release(struct inode *inode, struct file *file)
+ {
+       struct seq_file *seq = file->private_data;
+       struct user_namespace *ns = seq->private;
+       put_user_ns(ns);
+       return seq_release(inode, file);
+ }
+ static int proc_uid_map_open(struct inode *inode, struct file *file)
+ {
+       return proc_id_map_open(inode, file, &proc_uid_seq_operations);
+ }
+ static int proc_gid_map_open(struct inode *inode, struct file *file)
+ {
+       return proc_id_map_open(inode, file, &proc_gid_seq_operations);
+ }
+ static const struct file_operations proc_uid_map_operations = {
+       .open           = proc_uid_map_open,
+       .write          = proc_uid_map_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = proc_id_map_release,
+ };
+ static const struct file_operations proc_gid_map_operations = {
+       .open           = proc_gid_map_open,
+       .write          = proc_gid_map_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = proc_id_map_release,
+ };
+ #endif /* CONFIG_USER_NS */
  static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
  {
@@@ -3026,6 -3114,10 +3095,10 @@@ static const struct pid_entry tgid_base
  #ifdef CONFIG_HARDWALL
        INF("hardwall",   S_IRUGO, proc_pid_hardwall),
  #endif
+ #ifdef CONFIG_USER_NS
+       REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
+       REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
+ #endif
  };
  
  static int proc_tgid_base_readdir(struct file * filp,
@@@ -3381,6 -3473,10 +3454,10 @@@ static const struct pid_entry tid_base_
  #ifdef CONFIG_HARDWALL
        INF("hardwall",   S_IRUGO, proc_pid_hardwall),
  #endif
+ #ifdef CONFIG_USER_NS
+       REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
+       REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
+ #endif
  };
  
  static int proc_tid_base_readdir(struct file * filp,
diff --combined fs/proc/root.c
index eed44bfc85db7c6ea420233dd09c2e5b9b5f0350,df4e4561dbbf3c1497c7ebf9a524cd190acc9821..7c30fce037c0e9cbf6d848aeb7cc1f8c25a13e83
@@@ -67,7 -67,7 +67,7 @@@ static int proc_parse_options(char *opt
                case Opt_gid:
                        if (match_int(&args[0], &option))
                                return 0;
-                       pid->pid_gid = option;
+                       pid->pid_gid = make_kgid(current_user_ns(), option);
                        break;
                case Opt_hidepid:
                        if (match_int(&args[0], &option))
@@@ -115,13 -115,12 +115,13 @@@ static struct dentry *proc_mount(struc
        if (IS_ERR(sb))
                return ERR_CAST(sb);
  
 +      if (!proc_parse_options(options, ns)) {
 +              deactivate_locked_super(sb);
 +              return ERR_PTR(-EINVAL);
 +      }
 +
        if (!sb->s_root) {
                sb->s_flags = flags;
 -              if (!proc_parse_options(options, ns)) {
 -                      deactivate_locked_super(sb);
 -                      return ERR_PTR(-EINVAL);
 -              }
                err = proc_fill_super(sb);
                if (err) {
                        deactivate_locked_super(sb);
diff --combined fs/stat.c
index 0cef3366a919db83f7e57b4b44be81d755ded16e,31acca5f5a0cd55152acae75f381f9337d8a9afa..b6ff11825fc8a9c37f8d45ccf01e1fbdc1115868
+++ b/fs/stat.c
@@@ -57,13 -57,12 +57,13 @@@ EXPORT_SYMBOL(vfs_getattr)
  
  int vfs_fstat(unsigned int fd, struct kstat *stat)
  {
 -      struct file *f = fget(fd);
 +      int fput_needed;
 +      struct file *f = fget_light(fd, &fput_needed);
        int error = -EBADF;
  
        if (f) {
                error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
 -              fput(f);
 +              fput_light(f, fput_needed);
        }
        return error;
  }
@@@ -138,8 -137,8 +138,8 @@@ static int cp_old_stat(struct kstat *st
        tmp.st_nlink = stat->nlink;
        if (tmp.st_nlink != stat->nlink)
                return -EOVERFLOW;
-       SET_UID(tmp.st_uid, stat->uid);
-       SET_GID(tmp.st_gid, stat->gid);
+       SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+       SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
        tmp.st_rdev = old_encode_dev(stat->rdev);
  #if BITS_PER_LONG == 32
        if (stat->size > MAX_NON_LFS)
@@@ -191,32 -190,24 +191,32 @@@ SYSCALL_DEFINE2(fstat, unsigned int, fd
  
  #endif /* __ARCH_WANT_OLD_STAT */
  
 +#if BITS_PER_LONG == 32
 +#  define choose_32_64(a,b) a
 +#else
 +#  define choose_32_64(a,b) b
 +#endif
 +
 +#define valid_dev(x)  choose_32_64(old_valid_dev,new_valid_dev)(x)
 +#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
 +
 +#ifndef INIT_STRUCT_STAT_PADDING
 +#  define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
 +#endif
 +
  static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
  {
        struct stat tmp;
  
 -#if BITS_PER_LONG == 32
 -      if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
 +      if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
                return -EOVERFLOW;
 -#else
 -      if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
 +#if BITS_PER_LONG == 32
 +      if (stat->size > MAX_NON_LFS)
                return -EOVERFLOW;
  #endif
  
 -      memset(&tmp, 0, sizeof(tmp));
 -#if BITS_PER_LONG == 32
 -      tmp.st_dev = old_encode_dev(stat->dev);
 -#else
 -      tmp.st_dev = new_encode_dev(stat->dev);
 -#endif
 +      INIT_STRUCT_STAT_PADDING(tmp);
 +      tmp.st_dev = encode_dev(stat->dev);
        tmp.st_ino = stat->ino;
        if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
                return -EOVERFLOW;
        tmp.st_nlink = stat->nlink;
        if (tmp.st_nlink != stat->nlink)
                return -EOVERFLOW;
-       SET_UID(tmp.st_uid, stat->uid);
-       SET_GID(tmp.st_gid, stat->gid);
+       SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+       SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
 -#if BITS_PER_LONG == 32
 -      tmp.st_rdev = old_encode_dev(stat->rdev);
 -#else
 -      tmp.st_rdev = new_encode_dev(stat->rdev);
 -#endif
 -#if BITS_PER_LONG == 32
 -      if (stat->size > MAX_NON_LFS)
 -              return -EOVERFLOW;
 -#endif        
 +      tmp.st_rdev = encode_dev(stat->rdev);
        tmp.st_size = stat->size;
        tmp.st_atime = stat->atime.tv_sec;
        tmp.st_mtime = stat->mtime.tv_sec;
@@@ -328,15 -327,11 +328,15 @@@ SYSCALL_DEFINE3(readlink, const char __
  /* ---------- LFS-64 ----------- */
  #ifdef __ARCH_WANT_STAT64
  
 +#ifndef INIT_STRUCT_STAT64_PADDING
 +#  define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st))
 +#endif
 +
  static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
  {
        struct stat64 tmp;
  
 -      memset(&tmp, 0, sizeof(struct stat64));
 +      INIT_STRUCT_STAT64_PADDING(tmp);
  #ifdef CONFIG_MIPS
        /* mips has weird padding, so we don't get 64 bits there */
        if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
  #endif
        tmp.st_mode = stat->mode;
        tmp.st_nlink = stat->nlink;
-       tmp.st_uid = stat->uid;
-       tmp.st_gid = stat->gid;
+       tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
+       tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid);
        tmp.st_atime = stat->atime.tv_sec;
        tmp.st_atime_nsec = stat->atime.tv_nsec;
        tmp.st_mtime = stat->mtime.tv_sec;
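The new INIT_STRUCT_STAT_PADDING()/INIT_STRUCT_STAT64_PADDING() hooks default to a full memset but let an architecture that knows its exact padding layout zero only the holes. A hypothetical per-arch override (the __pad field names are invented for illustration):

	/* In a hypothetical <asm/stat.h>: clear just the padding fields
	 * and skip the bulk memset, since every other field is assigned
	 * explicitly by cp_new_stat64(). */
	#define INIT_STRUCT_STAT64_PADDING(st) do {	\
		(st).__pad0 = 0;			\
		(st).__pad1 = 0;			\
	} while (0)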
diff --combined include/linux/capability.h
index c398cff3dab7b3b2e74bedef7c5a6cf8bbf54050,a76eca90747077fe24bb2c98e91765ee431159ba..68d56effc32860ac29a26487316fcf75b4642261
@@@ -360,11 -360,8 +360,11 @@@ struct cpu_vfs_cap_data 
  
  #define CAP_WAKE_ALARM            35
  
 +/* Allow preventing system suspends while epoll events are pending */
  
 -#define CAP_LAST_CAP         CAP_WAKE_ALARM
 +#define CAP_EPOLLWAKEUP      36
 +
 +#define CAP_LAST_CAP         CAP_EPOLLWAKEUP
  
  #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
  
  
  #ifdef __KERNEL__
  
+ struct inode;
  struct dentry;
  struct user_namespace;
  
@@@ -551,6 -549,7 +552,7 @@@ extern bool has_ns_capability_noaudit(s
  extern bool capable(int cap);
  extern bool ns_capable(struct user_namespace *ns, int cap);
  extern bool nsown_capable(int cap);
+ extern bool inode_capable(const struct inode *inode, int cap);
  
  /* audit system wants to get cap info from files as well */
  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
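inode_capable() is the namespace-aware replacement for capability checks that guard an inode: the caller must be capable in its own user namespace and the inode's owner must be representable there. A sketch of the semantics, modeled on kernel/capability.c from this series (an assumption):

	bool inode_capable(const struct inode *inode, int cap)
	{
		struct user_namespace *ns = current_user_ns();

		/* Capable in the caller's namespace, and the inode's uid
		 * must have a mapping there; unmappable owners fail closed. */
		return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
	}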
diff --combined include/linux/fs.h
index 25c40b9f848afeac9313be994c6e8ad03683a0c5,797eb262d9f1d63428cd3af4a5390eb89f30b71e..c0e53372b082f445a6d713839452015de367e8fb
@@@ -402,6 -402,7 +402,7 @@@ struct inodes_stat_t 
  #include <linux/atomic.h>
  #include <linux/shrinker.h>
  #include <linux/migrate_mode.h>
+ #include <linux/uidgid.h>
  
  #include <asm/byteorder.h>
  
@@@ -469,8 -470,8 +470,8 @@@ typedef void (dio_iodone_t)(struct kioc
  struct iattr {
        unsigned int    ia_valid;
        umode_t         ia_mode;
-       uid_t           ia_uid;
-       gid_t           ia_gid;
+       kuid_t          ia_uid;
+       kgid_t          ia_gid;
        loff_t          ia_size;
        struct timespec ia_atime;
        struct timespec ia_mtime;
@@@ -761,8 -762,8 +762,8 @@@ struct posix_acl
  struct inode {
        umode_t                 i_mode;
        unsigned short          i_opflags;
-       uid_t                   i_uid;
-       gid_t                   i_gid;
+       kuid_t                  i_uid;
+       kgid_t                  i_gid;
        unsigned int            i_flags;
  
  #ifdef CONFIG_FS_POSIX_ACL
@@@ -927,6 -928,31 +928,31 @@@ static inline void i_size_write(struct 
  #endif
  }
  
+ /* Helper functions so that in most cases filesystems will
+  * not need to deal directly with kuid_t and kgid_t and can
+  * instead deal with the raw numeric values that are stored
+  * in the filesystem.
+  */
+ static inline uid_t i_uid_read(const struct inode *inode)
+ {
+       return from_kuid(&init_user_ns, inode->i_uid);
+ }
+ static inline gid_t i_gid_read(const struct inode *inode)
+ {
+       return from_kgid(&init_user_ns, inode->i_gid);
+ }
+ static inline void i_uid_write(struct inode *inode, uid_t uid)
+ {
+       inode->i_uid = make_kuid(&init_user_ns, uid);
+ }
+ static inline void i_gid_write(struct inode *inode, gid_t gid)
+ {
+       inode->i_gid = make_kgid(&init_user_ns, gid);
+ }
  static inline unsigned iminor(const struct inode *inode)
  {
        return MINOR(inode->i_rdev);
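With these helpers a disk filesystem keeps storing raw numeric ids and converts only at the in-core inode boundary, always relative to init_user_ns. A hypothetical read-side use (the examplefs names and on-disk struct are invented):

	struct examplefs_dinode {
		__le32 di_uid;
		__le32 di_gid;
		/* ... */
	};

	static void examplefs_read_ids(struct inode *inode,
				       const struct examplefs_dinode *raw)
	{
		/* Raw on-disk ids become kuid_t/kgid_t in the initial ns. */
		i_uid_write(inode, le32_to_cpu(raw->di_uid));
		i_gid_write(inode, le32_to_cpu(raw->di_gid));
	}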
@@@ -943,7 -969,7 +969,7 @@@ struct fown_struct 
        rwlock_t lock;          /* protects pid, uid, euid fields */
        struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
        enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
-       uid_t uid, euid;        /* uid/euid of process setting the owner */
+       kuid_t uid, euid;       /* uid/euid of process setting the owner */
        int signum;             /* posix.1b rt signal to be delivered on IO */
  };
  
@@@ -1215,7 -1241,6 +1241,7 @@@ extern int vfs_setlease(struct file *, 
  extern int lease_modify(struct file_lock **, int);
  extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
  extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
 +extern void locks_delete_block(struct file_lock *waiter);
  extern void lock_flocks(void);
  extern void unlock_flocks(void);
  #else /* !CONFIG_FILE_LOCKING */
@@@ -1360,10 -1385,6 +1386,10 @@@ static inline int lock_may_write(struc
        return 1;
  }
  
 +static inline void locks_delete_block(struct file_lock *waiter)
 +{
 +}
 +
  static inline void lock_flocks(void)
  {
  }
@@@ -1527,12 -1548,6 +1553,6 @@@ enum 
  #define vfs_check_frozen(sb, level) \
        wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
  
- /*
-  * until VFS tracks user namespaces for inodes, just make all files
-  * belong to init_user_ns
-  */
- extern struct user_namespace init_user_ns;
- #define inode_userns(inode) (&init_user_ns)
  extern bool inode_owner_or_capable(const struct inode *inode);
  
  /* not quite ready to be deprecated, but... */
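With the inode_userns() stub gone, owner checks compare kuid_t values directly and fall back to the namespace-aware capability check. A sketch of inode_owner_or_capable() after this merge, modeled on fs/inode.c (an assumption):

	bool inode_owner_or_capable(const struct inode *inode)
	{
		if (uid_eq(current_fsuid(), inode->i_uid))
			return true;
		if (inode_capable(inode, CAP_FOWNER))
			return true;
		return false;
	}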
@@@ -2051,7 -2066,6 +2071,7 @@@ extern void unregister_blkdev(unsigned 
  extern struct block_device *bdget(dev_t);
  extern struct block_device *bdgrab(struct block_device *bdev);
  extern void bd_set_size(struct block_device *, loff_t size);
 +extern sector_t blkdev_max_block(struct block_device *bdev);
  extern void bd_forget(struct inode *inode);
  extern void bdput(struct block_device *);
  extern void invalidate_bdev(struct block_device *);
@@@ -2512,7 -2526,6 +2532,7 @@@ extern int dcache_readdir(struct file *
  extern int simple_setattr(struct dentry *, struct iattr *);
  extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *);
  extern int simple_statfs(struct dentry *, struct kstatfs *);
 +extern int simple_open(struct inode *inode, struct file *file);
  extern int simple_link(struct dentry *, struct inode *, struct dentry *);
  extern int simple_unlink(struct inode *, struct dentry *);
  extern int simple_rmdir(struct inode *, struct dentry *);
diff --combined include/linux/sched.h
index 28fa9d02fd59cd011db1cceff056662961a30872,5fdc1ebbcbc4658500f6471c1b91e580efcbae98..5ea8baea938743777ec69efcce61b357e904ba36
@@@ -90,6 -90,7 +90,7 @@@ struct sched_param 
  #include <linux/latencytop.h>
  #include <linux/cred.h>
  #include <linux/llist.h>
+ #include <linux/uidgid.h>
  
  #include <asm/processor.h>
  
@@@ -728,8 -729,7 +729,7 @@@ struct user_struct 
  
        /* Hash table maintenance information */
        struct hlist_node uidhash_node;
-       uid_t uid;
-       struct user_namespace *user_ns;
+       kuid_t uid;
  
  #ifdef CONFIG_PERF_EVENTS
        atomic_long_t locked_vm;
  
  extern int uids_sysfs_init(void);
  
- extern struct user_struct *find_user(uid_t);
+ extern struct user_struct *find_user(kuid_t);
  
  extern struct user_struct root_user;
  #define INIT_USER (&root_user)
@@@ -855,14 -855,61 +855,14 @@@ enum cpu_idle_type 
  #define SD_WAKE_AFFINE                0x0020  /* Wake task to waking CPU */
  #define SD_PREFER_LOCAL               0x0040  /* Prefer to keep tasks local to this domain */
  #define SD_SHARE_CPUPOWER     0x0080  /* Domain members share cpu power */
 -#define SD_POWERSAVINGS_BALANCE       0x0100  /* Balance for power savings */
  #define SD_SHARE_PKG_RESOURCES        0x0200  /* Domain members share cpu pkg resources */
  #define SD_SERIALIZE          0x0400  /* Only a single load balancing instance */
  #define SD_ASYM_PACKING               0x0800  /* Place busy groups earlier in the domain */
  #define SD_PREFER_SIBLING     0x1000  /* Prefer to place tasks in a sibling domain */
  #define SD_OVERLAP            0x2000  /* sched_domains of this level overlap */
  
 -enum powersavings_balance_level {
 -      POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
 -      POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
 -                                       * first for long running threads
 -                                       */
 -      POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
 -                                       * cpu package for power savings
 -                                       */
 -      MAX_POWERSAVINGS_BALANCE_LEVELS
 -};
 -
 -extern int sched_mc_power_savings, sched_smt_power_savings;
 -
 -static inline int sd_balance_for_mc_power(void)
 -{
 -      if (sched_smt_power_savings)
 -              return SD_POWERSAVINGS_BALANCE;
 -
 -      if (!sched_mc_power_savings)
 -              return SD_PREFER_SIBLING;
 -
 -      return 0;
 -}
 -
 -static inline int sd_balance_for_package_power(void)
 -{
 -      if (sched_mc_power_savings | sched_smt_power_savings)
 -              return SD_POWERSAVINGS_BALANCE;
 -
 -      return SD_PREFER_SIBLING;
 -}
 -
  extern int __weak arch_sd_sibiling_asym_packing(void);
  
 -/*
 - * Optimise SD flags for power savings:
 - * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
 - * Keep default SD flags if sched_{smt,mc}_power_saving=0
 - */
 -
 -static inline int sd_power_saving_flags(void)
 -{
 -      if (sched_mc_power_savings | sched_smt_power_savings)
 -              return SD_BALANCE_NEWIDLE;
 -
 -      return 0;
 -}
 -
  struct sched_group_power {
        atomic_t ref;
        /*
@@@ -1294,8 -1341,6 +1294,8 @@@ struct task_struct 
                                 * execve */
        unsigned in_iowait:1;
  
 +      /* task may not gain privileges */
 +      unsigned no_new_privs:1;
  
        /* Revert to default priority/policy when forking */
        unsigned sched_reset_on_fork:1;
        uid_t loginuid;
        unsigned int sessionid;
  #endif
 -      seccomp_t seccomp;
 +      struct seccomp seccomp;
  
  /* Thread group tracking */
        u32 parent_exec_id;
@@@ -1860,22 -1905,12 +1860,22 @@@ static inline void rcu_copy_process(str
        INIT_LIST_HEAD(&p->rcu_node_entry);
  }
  
 +static inline void rcu_switch_from(struct task_struct *prev)
 +{
 +      if (prev->rcu_read_lock_nesting != 0)
 +              rcu_preempt_note_context_switch();
 +}
 +
  #else
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
  }
  
 +static inline void rcu_switch_from(struct task_struct *prev)
 +{
 +}
 +
  #endif
  
  #ifdef CONFIG_SMP
@@@ -1915,7 -1950,7 +1915,7 @@@ static inline int set_cpus_allowed(stru
   */
  extern unsigned long long notrace sched_clock(void);
  /*
 - * See the comment in kernel/sched_clock.c
 + * See the comment in kernel/sched/clock.c
   */
  extern u64 cpu_clock(int cpu);
  extern u64 local_clock(void);
@@@ -2142,14 -2177,13 +2142,13 @@@ extern struct task_struct *find_task_by
  extern void __set_special_pids(struct pid *pid);
  
  /* per-UID process charging. */
- extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
+ extern struct user_struct * alloc_uid(kuid_t);
  static inline struct user_struct *get_uid(struct user_struct *u)
  {
        atomic_inc(&u->__count);
        return u;
  }
  extern void free_uid(struct user_struct *);
- extern void release_uids(struct user_namespace *ns);
  
  #include <asm/current.h>
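Since a kuid_t already encodes the namespace, alloc_uid() and find_user() lose their user_namespace argument and callers convert at the syscall boundary. A hedged sketch of the resulting calling pattern in a setuid()-style path:

	kuid_t kuid = make_kuid(current_user_ns(), uid);

	/* An unmappable uid must make the call fail, hence the
	 * validity check before any lookup. */
	if (!uid_valid(kuid))
		return -EINVAL;
	new_user = alloc_uid(kuid);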
  
diff --combined init/Kconfig
index a30fe085940ee8a51810e3204469a12cb06f8048,b5dff4d1e1ded461decce3ce227b0baef569b9a4..ccb5248474c2222b25f7aa4a017636f1fd78f9b4
@@@ -27,9 -27,6 +27,9 @@@ config IRQ_WOR
        bool
        depends on HAVE_IRQ_WORK
  
 +config BUILDTIME_EXTABLE_SORT
 +      bool
 +
  menu "General setup"
  
  config EXPERIMENTAL
@@@ -461,33 -458,6 +461,33 @@@ config RCU_FANOU
          Select a specific number if testing RCU itself.
          Take the default if unsure.
  
 +config RCU_FANOUT_LEAF
 +      int "Tree-based hierarchical RCU leaf-level fanout value"
 +      range 2 RCU_FANOUT if 64BIT
 +      range 2 RCU_FANOUT if !64BIT
 +      depends on TREE_RCU || TREE_PREEMPT_RCU
 +      default 16
 +      help
 +        This option controls the leaf-level fanout of hierarchical
 +        implementations of RCU, and allows trading off cache misses
 +        against lock contention.  Systems that synchronize their
 +        scheduling-clock interrupts for energy-efficiency reasons will
 +        want the default because the smaller leaf-level fanout keeps
 +        lock contention levels acceptably low.  Very large systems
 +        (hundreds or thousands of CPUs) will instead want to set this
 +        value to the maximum value possible in order to reduce the
 +        number of cache misses incurred during RCU's grace-period
 +        initialization.  These systems tend to run CPU-bound, so they
 +        are not helped by synchronized interrupts and instead tend to
 +        skew them, which reduces lock contention enough that large
 +        leaf-level fanouts work well.
 +
 +        Select a specific number if testing RCU itself.
 +
 +        Select the maximum permissible value for large systems.
 +
 +        Take the default if unsure.
 +
  config RCU_FANOUT_EXACT
        bool "Disable tree-based hierarchical RCU auto-balancing"
        depends on TREE_RCU || TREE_PREEMPT_RCU
@@@ -545,25 -515,10 +545,25 @@@ config RCU_BOOST_PRI
        depends on RCU_BOOST
        default 1
        help
 -        This option specifies the real-time priority to which preempted
 -        RCU readers are to be boosted.  If you are working with CPU-bound
 -        real-time applications, you should specify a priority higher then
 -        the highest-priority CPU-bound application.
 +        This option specifies the real-time priority to which long-term
 +        preempted RCU readers are to be boosted.  If you are working
 +        with a real-time application that has one or more CPU-bound
 +        threads running at a real-time priority level, you should set
 +        RCU_BOOST_PRIO to a priority higher than the highest-priority
 +        real-time CPU-bound thread.  The default RCU_BOOST_PRIO value
 +        of 1 is appropriate in the common case, which is real-time
 +        applications that do not have any CPU-bound threads.
 +
 +        Some real-time applications might not have a single real-time
 +        thread that saturates a given CPU, but instead might have
 +        multiple real-time threads that, taken together, fully utilize
 +        that CPU.  In this case, you should set RCU_BOOST_PRIO to
 +        a priority higher than the lowest-priority thread that is
 +        conspiring to prevent the CPU from running any non-real-time
 +        tasks.  For example, if one thread at priority 10 and another
 +        thread at priority 5 together fully consume
 +        the CPU time on a given CPU, then RCU_BOOST_PRIO should be
 +        set to priority 6 or higher.
  
          Specify the real-time priority, or take the default if unsure.
  
@@@ -873,7 -828,10 +873,10 @@@ config IPC_N
  config USER_NS
        bool "User namespace (EXPERIMENTAL)"
        depends on EXPERIMENTAL
-       default y
+       depends on UIDGID_CONVERTED
+       select UIDGID_STRICT_TYPE_CHECKS
+       default n
        help
          This allows containers, i.e. vservers, to use user namespaces
          to provide different user info for different servers.
@@@ -897,6 -855,131 +900,131 @@@ config NET_N
  
  endif # NAMESPACES
  
+ config UIDGID_CONVERTED
+       # True if all of the selected software components are known
+       # to have uid_t and gid_t converted to kuid_t and kgid_t
+       # where appropriate and are otherwise safe to use with
+       # the user namespace.
+       bool
+       default y
+       # List of kernel pieces that need user namespace work
+       # Features
+       depends on SYSVIPC = n
+       depends on IMA = n
+       depends on EVM = n
+       depends on KEYS = n
+       depends on AUDIT = n
+       depends on AUDITSYSCALL = n
+       depends on TASKSTATS = n
+       depends on TRACING = n
+       depends on FS_POSIX_ACL = n
+       depends on QUOTA = n
+       depends on QUOTACTL = n
+       depends on DEBUG_CREDENTIALS = n
+       depends on BSD_PROCESS_ACCT = n
+       depends on DRM = n
+       depends on PROC_EVENTS = n
+       # Networking
+       depends on NET = n
+       depends on NET_9P = n
+       depends on IPX = n
+       depends on PHONET = n
+       depends on NET_CLS_FLOW = n
+       depends on NETFILTER_XT_MATCH_OWNER = n
+       depends on NETFILTER_XT_MATCH_RECENT = n
+       depends on NETFILTER_XT_TARGET_LOG = n
+       depends on NETFILTER_NETLINK_LOG = n
+       depends on INET = n
+       depends on IPV6 = n
+       depends on IP_SCTP = n
+       depends on AF_RXRPC = n
+       depends on LLC2 = n
+       depends on NET_KEY = n
+       depends on INET_DIAG = n
+       depends on DNS_RESOLVER = n
+       depends on AX25 = n
+       depends on ATALK = n
+       # Filesystems
+       depends on USB_DEVICEFS = n
+       depends on USB_GADGETFS = n
+       depends on USB_FUNCTIONFS = n
+       depends on DEVTMPFS = n
+       depends on XENFS = n
+       depends on 9P_FS = n
+       depends on ADFS_FS = n
+       depends on AFFS_FS = n
+       depends on AFS_FS = n
+       depends on AUTOFS4_FS = n
+       depends on BEFS_FS = n
+       depends on BFS_FS = n
+       depends on BTRFS_FS = n
+       depends on CEPH_FS = n
+       depends on CIFS = n
+       depends on CODA_FS = n
+       depends on CONFIGFS_FS = n
+       depends on CRAMFS = n
+       depends on DEBUG_FS = n
+       depends on ECRYPT_FS = n
+       depends on EFS_FS = n
+       depends on EXOFS_FS = n
+       depends on FAT_FS = n
+       depends on FUSE_FS = n
+       depends on GFS2_FS = n
+       depends on HFS_FS = n
+       depends on HFSPLUS_FS = n
+       depends on HPFS_FS = n
+       depends on HUGETLBFS = n
+       depends on ISO9660_FS = n
+       depends on JFFS2_FS = n
+       depends on JFS_FS = n
+       depends on LOGFS = n
+       depends on MINIX_FS = n
+       depends on NCP_FS = n
+       depends on NFSD = n
+       depends on NFS_FS = n
+       depends on NILFS2_FS = n
+       depends on NTFS_FS = n
+       depends on OCFS2_FS = n
+       depends on OMFS_FS = n
+       depends on QNX4FS_FS = n
+       depends on QNX6FS_FS = n
+       depends on REISERFS_FS = n
+       depends on SQUASHFS = n
+       depends on SYSV_FS = n
+       depends on UBIFS_FS = n
+       depends on UDF_FS = n
+       depends on UFS_FS = n
+       depends on VXFS_FS = n
+       depends on XFS_FS = n
+       depends on !UML || HOSTFS = n
+       # The rare drivers that won't build
+       depends on AIRO = n
+       depends on AIRO_CS = n
+       depends on TUN = n
+       depends on INFINIBAND_QIB = n
+       depends on BLK_DEV_LOOP = n
+       depends on ANDROID_BINDER_IPC = n
+       # Security modules
+       depends on SECURITY_TOMOYO = n
+       depends on SECURITY_APPARMOR = n
+ 
+ config UIDGID_STRICT_TYPE_CHECKS
+       bool "Require conversions between uid/gids and their internal representation"
+       depends on UIDGID_CONVERTED
+       default n
+       help
+        While the necessary conversions are being added to all subsystems, this
+        option allows the code to continue to build for unconverted subsystems.
+ 
+        Say Y here if you want strict type checking enabled.
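What UIDGID_STRICT_TYPE_CHECKS toggles lives in include/linux/uidgid.h: with it set, kuid_t/kgid_t are distinct structs, so a stray `uid == kuid` comparison or implicit conversion becomes a compile error; without it they degrade to plain integers so unconverted code keeps building. A sketch quoted from memory (treat as an assumption):

	#ifdef CONFIG_UIDGID_STRICT_TYPE_CHECKS

	typedef struct {
		uid_t val;
	} kuid_t;

	#define KUIDT_INIT(value) (kuid_t){ value }

	#else

	typedef uid_t kuid_t;

	#define KUIDT_INIT(value) ((kuid_t) value)

	#endif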
  config SCHED_AUTOGROUP
        bool "Automatic process group scheduling"
        select EVENTFD
@@@ -1201,7 -1284,7 +1329,7 @@@ menu "Kernel Performance Events And Cou
  
  config PERF_EVENTS
        bool "Kernel performance events and counters"
 -      default y if (PROFILING || PERF_COUNTERS)
 +      default y if PROFILING
        depends on HAVE_PERF_EVENTS
        select ANON_INODES
        select IRQ_WORK
  
          Say Y if unsure.
  
 -config PERF_COUNTERS
 -      bool "Kernel performance counters (old config option)"
 -      depends on HAVE_PERF_EVENTS
 -      help
 -        This config has been obsoleted by the PERF_EVENTS
 -        config option - please see that one for details.
 -
 -        It has no effect on the kernel whether you enable
 -        it or not, it is a compatibility placeholder.
 -
 -        Say N if unsure.
 -
  config DEBUG_PERF_USE_VMALLOC
        default n
        bool "Debug: use vmalloc to back perf mmap() buffers"
@@@ -1447,8 -1542,8 +1575,8 @@@ endif # MODULE
  config INIT_ALL_POSSIBLE
        bool
        help
 -        Back when each arch used to define their own cpu_online_map and
 -        cpu_possible_map, some of them chose to initialize cpu_possible_map
 +        Back when each arch used to define their own cpu_online_mask and
 +        cpu_possible_mask, some of them chose to initialize cpu_possible_mask
          with all 1s, and others with all 0s.  When they were centralised,
          it was better to provide this option than to break all the archs
          and have several arch maintainers pursuing me down dark alleys.
diff --combined kernel/cgroup.c
index ad8eae5bb801f52128ad3c0dab0cbbdebd685e96,c8329b0c25762bbd85422a4df53848c22677eb0a..a0c6af34d50063b31f6bd1fd1fc1de8b299d10ce
  #include <linux/eventfd.h>
  #include <linux/poll.h>
  #include <linux/flex_array.h> /* used in cgroup_attach_proc */
 +#include <linux/kthread.h>
  
  #include <linux/atomic.h>
  
 +/* css deactivation bias, makes css->refcnt negative to deny new trygets */
 +#define CSS_DEACT_BIAS                INT_MIN
 +
  /*
   * cgroup_mutex is the master lock.  Any modification to cgroup or its
   * hierarchy must be performed while holding it.
@@@ -131,9 -127,6 +131,9 @@@ struct cgroupfs_root 
        /* A list running through the active hierarchies */
        struct list_head root_list;
  
 +      /* All cgroups on this root, cgroup_mutex protected */
 +      struct list_head allcg_list;
 +
        /* Hierarchy-specific flags */
        unsigned long flags;
  
   */
  static struct cgroupfs_root rootnode;
  
 +/*
 + * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
 + */
 +struct cfent {
 +      struct list_head                node;
 +      struct dentry                   *dentry;
 +      struct cftype                   *type;
 +};
 +
  /*
   * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
   * cgroup_subsys->use_id != 0.
@@@ -255,14 -239,6 +255,14 @@@ int cgroup_lock_is_held(void
  
  EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
  
 +/* the current nr of refs, always >= 0 whether @css is deactivated or not */
 +static int css_refcnt(struct cgroup_subsys_state *css)
 +{
 +      int v = atomic_read(&css->refcnt);
 +
 +      return v >= 0 ? v : v - CSS_DEACT_BIAS;
 +}
 +
  /* convenient tests for these bits */
  inline int cgroup_is_removed(const struct cgroup *cgrp)
  {
@@@ -303,21 -279,6 +303,21 @@@ list_for_each_entry(_ss, &_root->subsys
  #define for_each_active_root(_root) \
  list_for_each_entry(_root, &roots, root_list)
  
 +static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 +{
 +      return dentry->d_fsdata;
 +}
 +
 +static inline struct cfent *__d_cfe(struct dentry *dentry)
 +{
 +      return dentry->d_fsdata;
 +}
 +
 +static inline struct cftype *__d_cft(struct dentry *dentry)
 +{
 +      return __d_cfe(dentry)->type;
 +}
 +
  /* the list of cgroups eligible for automatic release. Protected by
   * release_list_lock */
  static LIST_HEAD(release_list);
@@@ -855,17 -816,12 +855,17 @@@ static int cgroup_call_pre_destroy(stru
        struct cgroup_subsys *ss;
        int ret = 0;
  
 -      for_each_subsys(cgrp->root, ss)
 -              if (ss->pre_destroy) {
 -                      ret = ss->pre_destroy(cgrp);
 -                      if (ret)
 -                              break;
 +      for_each_subsys(cgrp->root, ss) {
 +              if (!ss->pre_destroy)
 +                      continue;
 +
 +              ret = ss->pre_destroy(cgrp);
 +              if (ret) {
 +                      /* ->pre_destroy() failure is being deprecated */
 +                      WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
 +                      break;
                }
 +      }
  
        return ret;
  }
@@@ -908,14 -864,6 +908,14 @@@ static void cgroup_diput(struct dentry 
                BUG_ON(!list_empty(&cgrp->pidlists));
  
                kfree_rcu(cgrp, rcu_head);
 +      } else {
 +              struct cfent *cfe = __d_cfe(dentry);
 +              struct cgroup *cgrp = dentry->d_parent->d_fsdata;
 +
 +              WARN_ONCE(!list_empty(&cfe->node) &&
 +                        cgrp != &cgrp->root->top_cgroup,
 +                        "cfe still linked for %s\n", cfe->type->name);
 +              kfree(cfe);
        }
        iput(inode);
  }
@@@ -934,36 -882,34 +934,36 @@@ static void remove_dir(struct dentry *d
        dput(parent);
  }
  
 -static void cgroup_clear_directory(struct dentry *dentry)
 -{
 -      struct list_head *node;
 -
 -      BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
 -      spin_lock(&dentry->d_lock);
 -      node = dentry->d_subdirs.next;
 -      while (node != &dentry->d_subdirs) {
 -              struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
 -
 -              spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
 -              list_del_init(node);
 -              if (d->d_inode) {
 -                      /* This should never be called on a cgroup
 -                       * directory with child cgroups */
 -                      BUG_ON(d->d_inode->i_mode & S_IFDIR);
 -                      dget_dlock(d);
 -                      spin_unlock(&d->d_lock);
 -                      spin_unlock(&dentry->d_lock);
 -                      d_delete(d);
 -                      simple_unlink(dentry->d_inode, d);
 -                      dput(d);
 -                      spin_lock(&dentry->d_lock);
 -              } else
 -                      spin_unlock(&d->d_lock);
 -              node = dentry->d_subdirs.next;
 +static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 +{
 +      struct cfent *cfe;
 +
 +      lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
 +      lockdep_assert_held(&cgroup_mutex);
 +
 +      list_for_each_entry(cfe, &cgrp->files, node) {
 +              struct dentry *d = cfe->dentry;
 +
 +              if (cft && cfe->type != cft)
 +                      continue;
 +
 +              dget(d);
 +              d_delete(d);
 +              simple_unlink(d->d_inode, d);
 +              list_del_init(&cfe->node);
 +              dput(d);
 +
 +              return 0;
        }
 -      spin_unlock(&dentry->d_lock);
 +      return -ENOENT;
 +}
 +
 +static void cgroup_clear_directory(struct dentry *dir)
 +{
 +      struct cgroup *cgrp = __d_cgrp(dir);
 +
 +      while (!list_empty(&cgrp->files))
 +              cgroup_rm_file(cgrp, NULL);
  }
  
  /*
@@@ -1348,11 -1294,6 +1348,11 @@@ static int cgroup_remount(struct super_
        if (ret)
                goto out_unlock;
  
 +      /* See feature-removal-schedule.txt */
 +      if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
 +              pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 +                         task_tgid_nr(current), current->comm);
 +
        /* Don't allow flags or name to change at remount */
        if (opts.flags != root->flags ||
            (opts.name && strcmp(opts.name, root->name))) {
                goto out_unlock;
        }
  
 -      /* (re)populate subsystem files */
 +      /* clear out any existing files and repopulate subsystem files */
 +      cgroup_clear_directory(cgrp->dentry);
        cgroup_populate_dir(cgrp);
  
        if (opts.release_agent)
@@@ -1393,7 -1333,6 +1393,7 @@@ static void init_cgroup_housekeeping(st
  {
        INIT_LIST_HEAD(&cgrp->sibling);
        INIT_LIST_HEAD(&cgrp->children);
 +      INIT_LIST_HEAD(&cgrp->files);
        INIT_LIST_HEAD(&cgrp->css_sets);
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
  static void init_cgroup_root(struct cgroupfs_root *root)
  {
        struct cgroup *cgrp = &root->top_cgroup;
 +
        INIT_LIST_HEAD(&root->subsys_list);
        INIT_LIST_HEAD(&root->root_list);
 +      INIT_LIST_HEAD(&root->allcg_list);
        root->number_of_cgroups = 1;
        cgrp->root = root;
        cgrp->top_cgroup = cgrp;
 +      list_add_tail(&cgrp->allcg_node, &root->allcg_list);
        init_cgroup_housekeeping(cgrp);
  }
  
@@@ -1756,6 -1692,16 +1756,6 @@@ static struct file_system_type cgroup_f
  
  static struct kobject *cgroup_kobj;
  
 -static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 -{
 -      return dentry->d_fsdata;
 -}
 -
 -static inline struct cftype *__d_cft(struct dentry *dentry)
 -{
 -      return dentry->d_fsdata;
 -}
 -
  /**
   * cgroup_path - generate the path of a cgroup
   * @cgrp: the cgroup in question
@@@ -2214,9 -2160,9 +2214,9 @@@ retry_find_task
                 * only need to check permissions on one of them.
                 */
                tcred = __task_cred(tsk);
-               if (cred->euid &&
-                   cred->euid != tcred->uid &&
-                   cred->euid != tcred->suid) {
+               if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+                   !uid_eq(cred->euid, tcred->uid) &&
+                   !uid_eq(cred->euid, tcred->suid)) {
                        rcu_read_unlock();
                        ret = -EACCES;
                        goto out_unlock_cgroup;
  
        if (threadgroup)
                tsk = tsk->group_leader;
 +
 +      /*
 +       * Workqueue threads may acquire PF_THREAD_BOUND and become
 +       * trapped in a cpuset, or an RT worker may be born in a cgroup
 +       * with no rt_runtime allocated.  Just say no.
 +       */
 +      if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
 +              ret = -EINVAL;
 +              rcu_read_unlock();
 +              goto out_unlock_cgroup;
 +      }
 +
        get_task_struct(tsk);
        rcu_read_unlock();
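The euid checks above are rewritten with uid_eq() and GLOBAL_ROOT_UID so they compile under both type modes. A sketch of the comparison helper, modeled on include/linux/uidgid.h (an assumption):

	static inline bool uid_eq(kuid_t left, kuid_t right)
	{
		/* __kuid_val() unwraps the struct (or is a no-op when the
		 * strict type checks are compiled out). */
		return __kuid_val(left) == __kuid_val(right);
	}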
  
@@@ -2669,191 -2603,50 +2669,191 @@@ static umode_t cgroup_file_mode(const s
        return mode;
  }
  
 -int cgroup_add_file(struct cgroup *cgrp,
 -                     struct cgroup_subsys *subsys,
 -                     const struct cftype *cft)
 +static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 +                         const struct cftype *cft)
  {
        struct dentry *dir = cgrp->dentry;
 +      struct cgroup *parent = __d_cgrp(dir);
        struct dentry *dentry;
 +      struct cfent *cfe;
        int error;
        umode_t mode;
 -
        char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 +
 +      /* does @cft->flags tell us to skip creation on @cgrp? */
 +      if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
 +              return 0;
 +      if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
 +              return 0;
 +
        if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
                strcpy(name, subsys->name);
                strcat(name, ".");
        }
        strcat(name, cft->name);
 +
        BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
 +
 +      cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
 +      if (!cfe)
 +              return -ENOMEM;
 +
        dentry = lookup_one_len(name, dir, strlen(name));
 -      if (!IS_ERR(dentry)) {
 -              mode = cgroup_file_mode(cft);
 -              error = cgroup_create_file(dentry, mode | S_IFREG,
 -                                              cgrp->root->sb);
 -              if (!error)
 -                      dentry->d_fsdata = (void *)cft;
 -              dput(dentry);
 -      } else
 +      if (IS_ERR(dentry)) {
                error = PTR_ERR(dentry);
 +              goto out;
 +      }
 +
 +      mode = cgroup_file_mode(cft);
 +      error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
 +      if (!error) {
 +              cfe->type = (void *)cft;
 +              cfe->dentry = dentry;
 +              dentry->d_fsdata = cfe;
 +              list_add_tail(&cfe->node, &parent->files);
 +              cfe = NULL;
 +      }
 +      dput(dentry);
 +out:
 +      kfree(cfe);
        return error;
  }
 -EXPORT_SYMBOL_GPL(cgroup_add_file);
  
 -int cgroup_add_files(struct cgroup *cgrp,
 -                      struct cgroup_subsys *subsys,
 -                      const struct cftype cft[],
 -                      int count)
 +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 +                            const struct cftype cfts[], bool is_add)
  {
 -      int i, err;
 -      for (i = 0; i < count; i++) {
 -              err = cgroup_add_file(cgrp, subsys, &cft[i]);
 -              if (err)
 -                      return err;
 +      const struct cftype *cft;
 +      int err, ret = 0;
 +
 +      for (cft = cfts; cft->name[0] != '\0'; cft++) {
 +              if (is_add)
 +                      err = cgroup_add_file(cgrp, subsys, cft);
 +              else
 +                      err = cgroup_rm_file(cgrp, cft);
 +              if (err) {
 +                      pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
 +                                 is_add ? "add" : "remove", cft->name, err);
 +                      ret = err;
 +              }
 +      }
 +      return ret;
 +}
 +
 +static DEFINE_MUTEX(cgroup_cft_mutex);
 +
 +static void cgroup_cfts_prepare(void)
 +      __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
 +{
 +      /*
 +       * Thanks to the entanglement with vfs inode locking, we can't walk
 +       * the existing cgroups under cgroup_mutex and create files.
 +       * Instead, we increment reference on all cgroups and build list of
 +       * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure
 +       * exclusive access to the field.
 +       */
 +      mutex_lock(&cgroup_cft_mutex);
 +      mutex_lock(&cgroup_mutex);
 +}
 +
 +static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 +                             const struct cftype *cfts, bool is_add)
 +      __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
 +{
 +      LIST_HEAD(pending);
 +      struct cgroup *cgrp, *n;
 +
 +      /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
 +      if (cfts && ss->root != &rootnode) {
 +              list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
 +                      dget(cgrp->dentry);
 +                      list_add_tail(&cgrp->cft_q_node, &pending);
 +              }
 +      }
 +
 +      mutex_unlock(&cgroup_mutex);
 +
 +      /*
 +       * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm
 +       * files for all cgroups which were created before.
 +       */
 +      list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
 +              struct inode *inode = cgrp->dentry->d_inode;
 +
 +              mutex_lock(&inode->i_mutex);
 +              mutex_lock(&cgroup_mutex);
 +              if (!cgroup_is_removed(cgrp))
 +                      cgroup_addrm_files(cgrp, ss, cfts, is_add);
 +              mutex_unlock(&cgroup_mutex);
 +              mutex_unlock(&inode->i_mutex);
 +
 +              list_del_init(&cgrp->cft_q_node);
 +              dput(cgrp->dentry);
        }
 +
 +      mutex_unlock(&cgroup_cft_mutex);
 +}
 +
 +/**
 + * cgroup_add_cftypes - add an array of cftypes to a subsystem
 + * @ss: target cgroup subsystem
 + * @cfts: zero-length name terminated array of cftypes
 + *
 + * Register @cfts to @ss.  Files described by @cfts are created for all
 + * existing cgroups to which @ss is attached and all future cgroups will
 + * have them too.  This function can be called anytime whether @ss is
 + * attached or not.
 + *
 + * Returns 0 on successful registration, -errno on failure.  Note that this
 + * function currently returns 0 as long as @cfts registration is successful
 + * even if some file creation attempts on existing cgroups fail.
 + */
 +int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
 +{
 +      struct cftype_set *set;
 +
 +      set = kzalloc(sizeof(*set), GFP_KERNEL);
 +      if (!set)
 +              return -ENOMEM;
 +
 +      cgroup_cfts_prepare();
 +      set->cfts = cfts;
 +      list_add_tail(&set->node, &ss->cftsets);
 +      cgroup_cfts_commit(ss, cfts, true);
 +
        return 0;
  }
 -EXPORT_SYMBOL_GPL(cgroup_add_files);
 +EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
 +
 +/**
 + * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 + * @ss: target cgroup subsystem
 + * @cfts: zero-length name terminated array of cftypes
 + *
 + * Unregister @cfts from @ss.  Files described by @cfts are removed from
 + * all existing cgroups to which @ss is attached and all future cgroups
 + * won't have them either.  This function can be called anytime whether @ss
 + * is attached or not.
 + *
 + * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 + * registered with @ss.
 + */
 +int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
 +{
 +      struct cftype_set *set;
 +
 +      cgroup_cfts_prepare();
 +
 +      list_for_each_entry(set, &ss->cftsets, node) {
 +              if (set->cfts == cfts) {
 +                      list_del_init(&set->node);
 +                      cgroup_cfts_commit(ss, cfts, false);
 +                      return 0;
 +              }
 +      }
 +
 +      cgroup_cfts_commit(ss, NULL, false);
 +      return -ENOENT;
 +}
  
  /**
   * cgroup_task_count - count the number of tasks in a cgroup.
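Controllers now hand cgroup core an empty-entry-terminated cftype array, either statically via ->base_cftypes (see the debug subsystem below) or dynamically via cgroup_add_cftypes(). A hypothetical dynamic registration (all examplecg names are invented):

	/* examplecg_subsys and examplecg_count_read defined elsewhere. */
	static struct cftype examplecg_files[] = {
		{
			.name = "examplecg.count",
			.read_u64 = examplecg_count_read,
		},
		{ }	/* terminate */
	};

	static int __init examplecg_register(void)
	{
		return cgroup_add_cftypes(&examplecg_subsys, examplecg_files);
	}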
@@@ -3832,14 -3625,13 +3832,14 @@@ static struct cftype files[] = 
                .read_u64 = cgroup_clone_children_read,
                .write_u64 = cgroup_clone_children_write,
        },
 -};
 -
 -static struct cftype cft_release_agent = {
 -      .name = "release_agent",
 -      .read_seq_string = cgroup_release_agent_show,
 -      .write_string = cgroup_release_agent_write,
 -      .max_write_len = PATH_MAX,
 +      {
 +              .name = "release_agent",
 +              .flags = CFTYPE_ONLY_ON_ROOT,
 +              .read_seq_string = cgroup_release_agent_show,
 +              .write_string = cgroup_release_agent_write,
 +              .max_write_len = PATH_MAX,
 +      },
 +      { }     /* terminate */
  };
  
  static int cgroup_populate_dir(struct cgroup *cgrp)
        int err;
        struct cgroup_subsys *ss;
  
 -      /* First clear out any existing files */
 -      cgroup_clear_directory(cgrp->dentry);
 -
 -      err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
 +      err = cgroup_addrm_files(cgrp, NULL, files, true);
        if (err < 0)
                return err;
  
 -      if (cgrp == cgrp->top_cgroup) {
 -              if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
 -                      return err;
 -      }
 -
 +      /* process cftsets of each subsystem */
        for_each_subsys(cgrp->root, ss) {
 -              if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 -                      return err;
 +              struct cftype_set *set;
 +
 +              list_for_each_entry(set, &ss->cftsets, node)
 +                      cgroup_addrm_files(cgrp, ss, set->cfts, true);
        }
 +
        /* This cgroup is ready now */
        for_each_subsys(cgrp->root, ss) {
                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
        return 0;
  }
  
 +static void css_dput_fn(struct work_struct *work)
 +{
 +      struct cgroup_subsys_state *css =
 +              container_of(work, struct cgroup_subsys_state, dput_work);
 +
 +      dput(css->cgroup->dentry);
 +}
 +
  static void init_cgroup_css(struct cgroup_subsys_state *css,
                               struct cgroup_subsys *ss,
                               struct cgroup *cgrp)
                set_bit(CSS_ROOT, &css->flags);
        BUG_ON(cgrp->subsys[ss->subsys_id]);
        cgrp->subsys[ss->subsys_id] = css;
 +
 +      /*
 +       * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
 +       * which is put on the last css_put().  dput() requires process
 +       * context, which css_put() may be called without.  @css->dput_work
 +       * will be used to invoke dput() asynchronously from css_put().
 +       */
 +      INIT_WORK(&css->dput_work, css_dput_fn);
 +      if (ss->__DEPRECATED_clear_css_refs)
 +              set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
  }
  
  static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@@ -4006,16 -3784,9 +4006,16 @@@ static long cgroup_create(struct cgrou
        if (err < 0)
                goto err_remove;
  
 +      /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
 +      for_each_subsys(root, ss)
 +              if (!ss->__DEPRECATED_clear_css_refs)
 +                      dget(dentry);
 +
        /* The cgroup directory was pre-locked for us */
        BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
  
 +      list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 +
        err = cgroup_populate_dir(cgrp);
        /* If err < 0, we have a half-filled directory - oh well ;) */
  
@@@ -4055,19 -3826,18 +4055,19 @@@ static int cgroup_mkdir(struct inode *d
        return cgroup_create(c_parent, dentry, mode | S_IFDIR);
  }
  
 +/*
 + * Check the reference count on each subsystem. Since we already
 + * established that there are no tasks in the cgroup, if the css refcount
 + * is also 1, then there should be no outstanding references, so the
 + * subsystem is safe to destroy. We scan across all subsystems rather than
 + * using the per-hierarchy linked list of mounted subsystems since we can
 + * be called via check_for_release() with no synchronization other than
 + * RCU, and the subsystem linked list isn't RCU-safe.
 + */
  static int cgroup_has_css_refs(struct cgroup *cgrp)
  {
 -      /* Check the reference count on each subsystem. Since we
 -       * already established that there are no tasks in the
 -       * cgroup, if the css refcount is also 1, then there should
 -       * be no outstanding references, so the subsystem is safe to
 -       * destroy. We scan across all subsystems rather than using
 -       * the per-hierarchy linked list of mounted subsystems since
 -       * we can be called via check_for_release() with no
 -       * synchronization other than RCU, and the subsystem linked
 -       * list isn't RCU-safe */
        int i;
 +
        /*
         * We won't need to lock the subsys array, because the subsystems
         * we're concerned about aren't going anywhere since our cgroup root
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                struct cgroup_subsys_state *css;
 +
                /* Skip subsystems not present or not in this hierarchy */
                if (ss == NULL || ss->root != cgrp->root)
                        continue;
 +
                css = cgrp->subsys[ss->subsys_id];
 -              /* When called from check_for_release() it's possible
 +              /*
 +               * When called from check_for_release() it's possible
                 * that by this point the cgroup has been removed
                 * and the css deleted. But a false-positive doesn't
                 * matter, since it can only happen if the cgroup
                 * has been deleted and hence no longer needs the
 -               * release agent to be called anyway. */
 -              if (css && (atomic_read(&css->refcnt) > 1))
 +               * release agent to be called anyway.
 +               */
 +              if (css && css_refcnt(css) > 1)
                        return 1;
        }
        return 0;
   * Atomically mark all (or else none) of the cgroup's CSS objects as
   * CSS_REMOVED. Return true on success, or false if the cgroup has
   * busy subsystems. Call with cgroup_mutex held
 + *
 + * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
 + * not, cgroup removal behaves differently.
 + *
 + * If clear is set, css refcnt for the subsystem should be zero before
 + * cgroup removal can be committed.  This is implemented by
 + * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
 + * called multiple times until all css refcnts reach zero and is allowed to
 + * veto removal on any invocation.  This behavior is deprecated and will be
 + * removed as soon as the existing user (memcg) is updated.
 + *
 + * If clear is not set, each css holds an extra reference to the cgroup's
 + * dentry and cgroup removal proceeds regardless of css refs.
 + * ->pre_destroy() will be called at least once and is not allowed to fail.
 + * On the last put of each css, whenever that may be, the extra dentry ref
 + * is put so that dentry destruction happens only after all css's are
 + * released.
   */
 -
  static int cgroup_clear_css_refs(struct cgroup *cgrp)
  {
        struct cgroup_subsys *ss;
        unsigned long flags;
        bool failed = false;
 +
        local_irq_save(flags);
 +
 +      /*
 +       * Block new css_tryget() by deactivating refcnt.  If all refcnts
 +       * for subsystems w/ clear_css_refs set were 1 at the moment of
 +       * deactivation, we succeeded.
 +       */
        for_each_subsys(cgrp->root, ss) {
                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 -              int refcnt;
 -              while (1) {
 -                      /* We can only remove a CSS with a refcnt==1 */
 -                      refcnt = atomic_read(&css->refcnt);
 -                      if (refcnt > 1) {
 -                              failed = true;
 -                              goto done;
 -                      }
 -                      BUG_ON(!refcnt);
 -                      /*
 -                       * Drop the refcnt to 0 while we check other
 -                       * subsystems. This will cause any racing
 -                       * css_tryget() to spin until we set the
 -                       * CSS_REMOVED bits or abort
 -                       */
 -                      if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
 -                              break;
 -                      cpu_relax();
 -              }
 +
 +              WARN_ON(atomic_read(&css->refcnt) < 0);
 +              atomic_add(CSS_DEACT_BIAS, &css->refcnt);
 +
 +              if (ss->__DEPRECATED_clear_css_refs)
 +                      failed |= css_refcnt(css) != 1;
        }
 - done:
 +
 +      /*
 +       * If succeeded, set REMOVED and put all the base refs; otherwise,
 +       * restore refcnts to positive values.  Either way, all in-progress
 +       * css_tryget() will be released.
 +       */
        for_each_subsys(cgrp->root, ss) {
                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 -              if (failed) {
 -                      /*
 -                       * Restore old refcnt if we previously managed
 -                       * to clear it from 1 to 0
 -                       */
 -                      if (!atomic_read(&css->refcnt))
 -                              atomic_set(&css->refcnt, 1);
 -              } else {
 -                      /* Commit the fact that the CSS is removed */
 +
 +              if (!failed) {
                        set_bit(CSS_REMOVED, &css->flags);
 +                      css_put(css);
 +              } else {
 +                      atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
                }
        }
 +
        local_irq_restore(flags);
        return !failed;
  }
@@@ -4241,8 -3995,6 +4241,8 @@@ again
        list_del_init(&cgrp->sibling);
        cgroup_unlock_hierarchy(cgrp->root);
  
 +      list_del_init(&cgrp->allcg_node);
 +
        d = dget(cgrp->dentry);
  
        cgroup_d_remove_dir(d);
        return 0;
  }
  
 +static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
 +{
 +      INIT_LIST_HEAD(&ss->cftsets);
 +
 +      /*
 +       * base_cftset is embedded in subsys itself, no need to worry about
 +       * deregistration.
 +       */
 +      if (ss->base_cftypes) {
 +              ss->base_cftset.cfts = ss->base_cftypes;
 +              list_add_tail(&ss->base_cftset.node, &ss->cftsets);
 +      }
 +}
 +
  static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  {
        struct cgroup_subsys_state *css;
  
        printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
  
 +      /* init base cftset */
 +      cgroup_init_cftsets(ss);
 +
        /* Create the top cgroup state for this subsystem */
        list_add(&ss->sibling, &rootnode.subsys_list);
        ss->root = &rootnode;
@@@ -4361,9 -4096,6 +4361,9 @@@ int __init_or_module cgroup_load_subsys
                return 0;
        }
  
 +      /* init base cftset */
 +      cgroup_init_cftsets(ss);
 +
        /*
         * need to register a subsys id before anything else - for example,
         * init_cgroup_css needs it.
@@@ -4953,41 -4685,21 +4953,41 @@@ static void check_for_release(struct cg
  }
  
  /* Caller must verify that the css is not for root cgroup */
 -void __css_put(struct cgroup_subsys_state *css, int count)
 +bool __css_tryget(struct cgroup_subsys_state *css)
 +{
 +      do {
 +              int v = css_refcnt(css);
 +
 +              if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
 +                      return true;
 +              cpu_relax();
 +      } while (!test_bit(CSS_REMOVED, &css->flags));
 +
 +      return false;
 +}
 +EXPORT_SYMBOL_GPL(__css_tryget);
 +
 +/* Caller must verify that the css is not for root cgroup */
 +void __css_put(struct cgroup_subsys_state *css)
  {
        struct cgroup *cgrp = css->cgroup;
 -      int val;
 +
        rcu_read_lock();
 -      val = atomic_sub_return(count, &css->refcnt);
 -      if (val == 1) {
 +      atomic_dec(&css->refcnt);
 +      switch (css_refcnt(css)) {
 +      case 1:
                if (notify_on_release(cgrp)) {
                        set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
                cgroup_wakeup_rmdir_waiter(cgrp);
 +              break;
 +      case 0:
 +              if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
 +                      schedule_work(&css->dput_work);
 +              break;
        }
        rcu_read_unlock();
 -      WARN_ON_ONCE(val < 1);
  }
  EXPORT_SYMBOL_GPL(__css_put);
  
@@@ -5106,7 -4818,7 +5106,7 @@@ unsigned short css_id(struct cgroup_sub
         * on this or this is under rcu_read_lock(). Once css->id is allocated,
         * it's unchanged until freed.
         */
 -      cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
 +      cssid = rcu_dereference_check(css->id, css_refcnt(css));
  
        if (cssid)
                return cssid->id;
@@@ -5118,7 -4830,7 +5118,7 @@@ unsigned short css_depth(struct cgroup_
  {
        struct css_id *cssid;
  
 -      cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
 +      cssid = rcu_dereference_check(css->id, css_refcnt(css));
  
        if (cssid)
                return cssid->depth;
@@@ -5499,15 -5211,19 +5499,15 @@@ static struct cftype debug_files[] =  
                .name = "releasable",
                .read_u64 = releasable_read,
        },
 -};
  
 -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 -{
 -      return cgroup_add_files(cont, ss, debug_files,
 -                              ARRAY_SIZE(debug_files));
 -}
 +      { }     /* terminate */
 +};
  
  struct cgroup_subsys debug_subsys = {
        .name = "debug",
        .create = debug_create,
        .destroy = debug_destroy,
 -      .populate = debug_populate,
        .subsys_id = debug_subsys_id,
 +      .base_cftypes = debug_files,
  };
  #endif /* CONFIG_CGROUP_DEBUG */
diff --combined kernel/cred.c
index e70683d9ec32f00bf58a0ea2dc64be2d74fb8415,eddc5e2e9587e37a342a1c646e62395c4a02a211..430557ea488f3625243455afcdd6b2f9f481ac19
@@@ -49,6 -49,14 +49,14 @@@ struct cred init_cred = 
        .subscribers            = ATOMIC_INIT(2),
        .magic                  = CRED_MAGIC,
  #endif
+       .uid                    = GLOBAL_ROOT_UID,
+       .gid                    = GLOBAL_ROOT_GID,
+       .suid                   = GLOBAL_ROOT_UID,
+       .sgid                   = GLOBAL_ROOT_GID,
+       .euid                   = GLOBAL_ROOT_UID,
+       .egid                   = GLOBAL_ROOT_GID,
+       .fsuid                  = GLOBAL_ROOT_UID,
+       .fsgid                  = GLOBAL_ROOT_GID,
        .securebits             = SECUREBITS_DEFAULT,
        .cap_inheritable        = CAP_EMPTY_SET,
        .cap_permitted          = CAP_FULL_SET,
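init_cred's ids are pinned with constants that denote uid/gid 0 in the initial user namespace; a sketch of their definitions, modeled on include/linux/uidgid.h (an assumption):

	#define GLOBAL_ROOT_UID KUIDT_INIT(0)
	#define GLOBAL_ROOT_GID KGIDT_INIT(0)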
@@@ -148,6 -156,7 +156,7 @@@ static void put_cred_rcu(struct rcu_hea
        if (cred->group_info)
                put_group_info(cred->group_info);
        free_uid(cred->user);
+       put_user_ns(cred->user_ns);
        kmem_cache_free(cred_jar, cred);
  }
  
@@@ -303,6 -312,7 +312,7 @@@ struct cred *prepare_creds(void
        set_cred_subscribers(new, 0);
        get_group_info(new->group_info);
        get_uid(new->user);
+       get_user_ns(new->user_ns);
  
  #ifdef CONFIG_KEYS
        key_get(new->thread_keyring);
@@@ -386,8 -396,6 +396,8 @@@ int copy_creds(struct task_struct *p, u
        struct cred *new;
        int ret;
  
 +      p->replacement_session_keyring = NULL;
 +
        if (
  #ifdef CONFIG_KEYS
                !p->cred->thread_keyring &&
                        goto error_put;
        }
  
-       /* cache user_ns in cred.  Doesn't need a refcount because it will
-        * stay pinned by cred->user
-        */
-       new->user_ns = new->user->user_ns;
  #ifdef CONFIG_KEYS
        /* new threads get their own thread keyrings if their parent already
         * had one */
@@@ -493,10 -496,10 +498,10 @@@ int commit_creds(struct cred *new
        get_cred(new); /* we will require a ref for the subj creds too */
  
        /* dumpability changes */
-       if (old->euid != new->euid ||
-           old->egid != new->egid ||
-           old->fsuid != new->fsuid ||
-           old->fsgid != new->fsgid ||
+       if (!uid_eq(old->euid, new->euid) ||
+           !gid_eq(old->egid, new->egid) ||
+           !uid_eq(old->fsuid, new->fsuid) ||
+           !gid_eq(old->fsgid, new->fsgid) ||
            !cap_issubset(new->cap_permitted, old->cap_permitted)) {
                if (task->mm)
                        set_dumpable(task->mm, suid_dumpable);
        }
  
        /* alter the thread keyring */
-       if (new->fsuid != old->fsuid)
+       if (!uid_eq(new->fsuid, old->fsuid))
                key_fsuid_changed(task);
-       if (new->fsgid != old->fsgid)
+       if (!gid_eq(new->fsgid, old->fsgid))
                key_fsgid_changed(task);
  
        /* do it
        alter_cred_subscribers(old, -2);
  
        /* send notifications */
-       if (new->uid   != old->uid  ||
-           new->euid  != old->euid ||
-           new->suid  != old->suid ||
-           new->fsuid != old->fsuid)
+       if (!uid_eq(new->uid,   old->uid)  ||
+           !uid_eq(new->euid,  old->euid) ||
+           !uid_eq(new->suid,  old->suid) ||
+           !uid_eq(new->fsuid, old->fsuid))
                proc_id_connector(task, PROC_EVENT_UID);
  
-       if (new->gid   != old->gid  ||
-           new->egid  != old->egid ||
-           new->sgid  != old->sgid ||
-           new->fsgid != old->fsgid)
+       if (!gid_eq(new->gid,   old->gid)  ||
+           !gid_eq(new->egid,  old->egid) ||
+           !gid_eq(new->sgid,  old->sgid) ||
+           !gid_eq(new->fsgid, old->fsgid))
                proc_id_connector(task, PROC_EVENT_GID);
  
        /* release the old obj and subj refs both */
@@@ -678,6 -681,7 +683,7 @@@ struct cred *prepare_kernel_cred(struc
        atomic_set(&new->usage, 1);
        set_cred_subscribers(new, 0);
        get_uid(new->user);
+       get_user_ns(new->user_ns);
        get_group_info(new->group_info);
  
  #ifdef CONFIG_KEYS
diff --combined kernel/sched/core.c
index a5a9d39b845cfaed2838a1fe1e0afd49f0609a3a,b189fecaef906043fdc8948089633c4a0ce7846d..39eb6011bc38e3f20188c942cb31a173fce74093
@@@ -83,7 -83,6 +83,7 @@@
  
  #include "sched.h"
  #include "../workqueue_sched.h"
 +#include "../smpboot.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@@ -693,6 -692,8 +693,6 @@@ int tg_nop(struct task_group *tg, void 
  }
  #endif
  
 -void update_cpu_load(struct rq *this_rq);
 -
  static void set_load_weight(struct task_struct *p)
  {
        int prio = p->static_prio - MAX_RT_PRIO;
@@@ -2082,7 -2083,6 +2082,7 @@@ context_switch(struct rq *rq, struct ta
  #endif
  
        /* Here we just switch the register state and the stack. */
 +      rcu_switch_from(prev);
        switch_to(prev, next, prev);
  
        barrier();
@@@ -2486,13 -2486,22 +2486,13 @@@ decay_load_missed(unsigned long load, u
   * scheduler tick (TICK_NSEC). With tickless idle this will not be called
   * every tick. We fix it up based on jiffies.
   */
 -void update_cpu_load(struct rq *this_rq)
 +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 +                            unsigned long pending_updates)
  {
 -      unsigned long this_load = this_rq->load.weight;
 -      unsigned long curr_jiffies = jiffies;
 -      unsigned long pending_updates;
        int i, scale;
  
        this_rq->nr_load_updates++;
  
 -      /* Avoid repeated calls on same jiffy, when moving in and out of idle */
 -      if (curr_jiffies == this_rq->last_load_update_tick)
 -              return;
 -
 -      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
 -      this_rq->last_load_update_tick = curr_jiffies;
 -
        /* Update our load: */
        this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
        sched_avg_update(this_rq);
  }
  
 +/*
 + * Called from nohz_idle_balance() to update the load ratings before doing the
 + * idle balance.
 + */
 +void update_idle_cpu_load(struct rq *this_rq)
 +{
 +      unsigned long curr_jiffies = jiffies;
 +      unsigned long load = this_rq->load.weight;
 +      unsigned long pending_updates;
 +
 +      /*
 +       * Bloody broken means of dealing with nohz, but better than nothing..
 +       * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
 +       * update and see 0 difference the one time and 2 the next, even though
 +       * we ticked at roughly the same rate.
 +       *
 +       * Hence we only use this from nohz_idle_balance() and skip this
 +       * nonsense when called from the scheduler_tick() since that's
 +       * guaranteed a stable rate.
 +       */
 +      if (load || curr_jiffies == this_rq->last_load_update_tick)
 +              return;
 +
 +      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
 +      this_rq->last_load_update_tick = curr_jiffies;
 +
 +      __update_cpu_load(this_rq, load, pending_updates);
 +}
 +
 +/*
 + * Called from scheduler_tick()
 + */
  static void update_cpu_load_active(struct rq *this_rq)
  {
 -      update_cpu_load(this_rq);
 +      /*
 +       * See the mess in update_idle_cpu_load().
 +       */
 +      this_rq->last_load_update_tick = jiffies;
 +      __update_cpu_load(this_rq, this_rq->load.weight, 1);
  
        calc_load_account_active(this_rq);
  }
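
The split into __update_cpu_load()/update_idle_cpu_load() exists so an idle CPU can fold all of its missed ticks into a single update: each cpu_load[i] slot decays by (2^i - 1)/2^i per tick, applied pending_updates times. A toy model of that decay (the kernel's decay_load_missed() uses precomputed lookup tables rather than a loop):

#include <stdio.h>

/* decay `load` as if `ticks` updates were missed at index i:
 * one tick multiplies by (2^i - 1) / 2^i, i.e. load -= load >> i */
static unsigned long decay_missed(unsigned long load, unsigned long ticks, int i)
{
	while (ticks--)
		load -= load >> i;
	return load;
}

int main(void)
{
	unsigned long cpu_load[5] = { 2048, 2048, 2048, 2048, 2048 };
	unsigned long pending_updates = 3;	/* jiffies missed while idle */

	for (int i = 1; i < 5; i++)		/* index 0 is fast-tracked */
		cpu_load[i] = decay_missed(cpu_load[i], pending_updates, i);

	for (int i = 0; i < 5; i++)
		printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
	return 0;
}
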
@@@ -3140,7 -3113,6 +3140,7 @@@ static noinline void __schedule_bug(str
        if (irqs_disabled())
                print_irqtrace_events(prev);
        dump_stack();
 +      add_taint(TAINT_WARN);
  }
  
  /*
@@@ -4070,11 -4042,8 +4070,8 @@@ static bool check_same_owner(struct tas
  
        rcu_read_lock();
        pcred = __task_cred(p);
-       if (cred->user->user_ns == pcred->user->user_ns)
-               match = (cred->euid == pcred->euid ||
-                        cred->euid == pcred->uid);
-       else
-               match = false;
+       match = (uid_eq(cred->euid, pcred->euid) ||
+                uid_eq(cred->euid, pcred->uid));
        rcu_read_unlock();
        return match;
  }
@@@ -5588,8 -5557,7 +5585,8 @@@ static int sched_domain_debug_one(struc
                        break;
                }
  
 -              if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
 +              if (!(sd->flags & SD_OVERLAP) &&
 +                  cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
@@@ -5927,11 -5895,99 +5924,11 @@@ static int __init isolated_cpu_setup(ch
  
  __setup("isolcpus=", isolated_cpu_setup);
  
 -#ifdef CONFIG_NUMA
 -
 -/**
 - * find_next_best_node - find the next node to include in a sched_domain
 - * @node: node whose sched_domain we're building
 - * @used_nodes: nodes already in the sched_domain
 - *
 - * Find the next node to include in a given scheduling domain. Simply
 - * finds the closest node not already in the @used_nodes map.
 - *
 - * Should use nodemask_t.
 - */
 -static int find_next_best_node(int node, nodemask_t *used_nodes)
 -{
 -      int i, n, val, min_val, best_node = -1;
 -
 -      min_val = INT_MAX;
 -
 -      for (i = 0; i < nr_node_ids; i++) {
 -              /* Start at @node */
 -              n = (node + i) % nr_node_ids;
 -
 -              if (!nr_cpus_node(n))
 -                      continue;
 -
 -              /* Skip already used nodes */
 -              if (node_isset(n, *used_nodes))
 -                      continue;
 -
 -              /* Simple min distance search */
 -              val = node_distance(node, n);
 -
 -              if (val < min_val) {
 -                      min_val = val;
 -                      best_node = n;
 -              }
 -      }
 -
 -      if (best_node != -1)
 -              node_set(best_node, *used_nodes);
 -      return best_node;
 -}
 -
 -/**
 - * sched_domain_node_span - get a cpumask for a node's sched_domain
 - * @node: node whose cpumask we're constructing
 - * @span: resulting cpumask
 - *
 - * Given a node, construct a good cpumask for its sched_domain to span. It
 - * should be one that prevents unnecessary balancing, but also spreads tasks
 - * out optimally.
 - */
 -static void sched_domain_node_span(int node, struct cpumask *span)
 -{
 -      nodemask_t used_nodes;
 -      int i;
 -
 -      cpumask_clear(span);
 -      nodes_clear(used_nodes);
 -
 -      cpumask_or(span, span, cpumask_of_node(node));
 -      node_set(node, used_nodes);
 -
 -      for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
 -              int next_node = find_next_best_node(node, &used_nodes);
 -              if (next_node < 0)
 -                      break;
 -              cpumask_or(span, span, cpumask_of_node(next_node));
 -      }
 -}
 -
 -static const struct cpumask *cpu_node_mask(int cpu)
 -{
 -      lockdep_assert_held(&sched_domains_mutex);
 -
 -      sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
 -
 -      return sched_domains_tmpmask;
 -}
 -
 -static const struct cpumask *cpu_allnodes_mask(int cpu)
 -{
 -      return cpu_possible_mask;
 -}
 -#endif /* CONFIG_NUMA */
 -
  static const struct cpumask *cpu_cpu_mask(int cpu)
  {
        return cpumask_of_node(cpu_to_node(cpu));
  }
  
 -int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 -
  struct sd_data {
        struct sched_domain **__percpu sd;
        struct sched_group **__percpu sg;
@@@ -5961,7 -6017,6 +5958,7 @@@ struct sched_domain_topology_level 
        sched_domain_init_f init;
        sched_domain_mask_f mask;
        int                 flags;
 +      int                 numa_level;
        struct sd_data      data;
  };
  
@@@ -6153,6 -6208,10 +6150,6 @@@ sd_init_##type(struct sched_domain_topo
  }
  
  SD_INIT_FUNC(CPU)
 -#ifdef CONFIG_NUMA
 - SD_INIT_FUNC(ALLNODES)
 - SD_INIT_FUNC(NODE)
 -#endif
  #ifdef CONFIG_SCHED_SMT
   SD_INIT_FUNC(SIBLING)
  #endif
@@@ -6274,184 -6333,15 +6271,184 @@@ static struct sched_domain_topology_lev
        { sd_init_BOOK, cpu_book_mask, },
  #endif
        { sd_init_CPU, cpu_cpu_mask, },
 -#ifdef CONFIG_NUMA
 -      { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
 -      { sd_init_ALLNODES, cpu_allnodes_mask, },
 -#endif
        { NULL, },
  };
  
  static struct sched_domain_topology_level *sched_domain_topology = default_topology;
  
 +#ifdef CONFIG_NUMA
 +
 +static int sched_domains_numa_levels;
 +static int sched_domains_numa_scale;
 +static int *sched_domains_numa_distance;
 +static struct cpumask ***sched_domains_numa_masks;
 +static int sched_domains_curr_level;
 +
 +static inline int sd_local_flags(int level)
 +{
 +      if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
 +              return 0;
 +
 +      return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
 +}
 +
 +static struct sched_domain *
 +sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
 +{
 +      struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
 +      int level = tl->numa_level;
 +      int sd_weight = cpumask_weight(
 +                      sched_domains_numa_masks[level][cpu_to_node(cpu)]);
 +
 +      *sd = (struct sched_domain){
 +              .min_interval           = sd_weight,
 +              .max_interval           = 2*sd_weight,
 +              .busy_factor            = 32,
 +              .imbalance_pct          = 125,
 +              .cache_nice_tries       = 2,
 +              .busy_idx               = 3,
 +              .idle_idx               = 2,
 +              .newidle_idx            = 0,
 +              .wake_idx               = 0,
 +              .forkexec_idx           = 0,
 +
 +              .flags                  = 1*SD_LOAD_BALANCE
 +                                      | 1*SD_BALANCE_NEWIDLE
 +                                      | 0*SD_BALANCE_EXEC
 +                                      | 0*SD_BALANCE_FORK
 +                                      | 0*SD_BALANCE_WAKE
 +                                      | 0*SD_WAKE_AFFINE
 +                                      | 0*SD_PREFER_LOCAL
 +                                      | 0*SD_SHARE_CPUPOWER
 +                                      | 0*SD_SHARE_PKG_RESOURCES
 +                                      | 1*SD_SERIALIZE
 +                                      | 0*SD_PREFER_SIBLING
 +                                      | sd_local_flags(level)
 +                                      ,
 +              .last_balance           = jiffies,
 +              .balance_interval       = sd_weight,
 +      };
 +      SD_INIT_NAME(sd, NUMA);
 +      sd->private = &tl->data;
 +
 +      /*
 +       * Ugly hack to pass state to sd_numa_mask()...
 +       */
 +      sched_domains_curr_level = tl->numa_level;
 +
 +      return sd;
 +}
 +
 +static const struct cpumask *sd_numa_mask(int cpu)
 +{
 +      return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
 +}
 +
 +static void sched_init_numa(void)
 +{
 +      int next_distance, curr_distance = node_distance(0, 0);
 +      struct sched_domain_topology_level *tl;
 +      int level = 0;
 +      int i, j, k;
 +
 +      sched_domains_numa_scale = curr_distance;
 +      sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
 +      if (!sched_domains_numa_distance)
 +              return;
 +
 +      /*
 +       * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 +       * unique distances in the node_distance() table.
 +       *
 +       * Assumes node_distance(0,j) includes all distances in
 +       * node_distance(i,j) in order to avoid cubic time.
 +       *
 +       * XXX: could be optimized to O(n log n) by using sort()
 +       */
 +      next_distance = curr_distance;
 +      for (i = 0; i < nr_node_ids; i++) {
 +              for (j = 0; j < nr_node_ids; j++) {
 +                      int distance = node_distance(0, j);
 +                      if (distance > curr_distance &&
 +                                      (distance < next_distance ||
 +                                       next_distance == curr_distance))
 +                              next_distance = distance;
 +              }
 +              if (next_distance != curr_distance) {
 +                      sched_domains_numa_distance[level++] = next_distance;
 +                      sched_domains_numa_levels = level;
 +                      curr_distance = next_distance;
 +              } else break;
 +      }
 +      /*
 +       * 'level' contains the number of unique distances, excluding the
 +       * identity distance node_distance(i,i).
 +       *
 +       * The sched_domains_numa_distance[] array includes the actual distance
 +       * numbers.
 +       */
 +
 +      sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
 +      if (!sched_domains_numa_masks)
 +              return;
 +
 +      /*
 +       * Now for each level, construct a mask per node which contains all
 +       * cpus of nodes that are that many hops away from us.
 +       */
 +      for (i = 0; i < level; i++) {
 +              sched_domains_numa_masks[i] =
 +                      kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 +              if (!sched_domains_numa_masks[i])
 +                      return;
 +
 +              for (j = 0; j < nr_node_ids; j++) {
 +                      struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
 +                      if (!mask)
 +                              return;
 +
 +                      sched_domains_numa_masks[i][j] = mask;
 +
 +                      for (k = 0; k < nr_node_ids; k++) {
 +                              if (node_distance(j, k) > sched_domains_numa_distance[i])
 +                                      continue;
 +
 +                              cpumask_or(mask, mask, cpumask_of_node(k));
 +                      }
 +              }
 +      }
 +
 +      tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
 +                      sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 +      if (!tl)
 +              return;
 +
 +      /*
 +       * Copy the default topology bits..
 +       */
 +      for (i = 0; default_topology[i].init; i++)
 +              tl[i] = default_topology[i];
 +
 +      /*
 +       * .. and append 'j' levels of NUMA goodness.
 +       */
 +      for (j = 0; j < level; i++, j++) {
 +              tl[i] = (struct sched_domain_topology_level){
 +                      .init = sd_numa_init,
 +                      .mask = sd_numa_mask,
 +                      .flags = SDTL_OVERLAP,
 +                      .numa_level = j,
 +              };
 +      }
 +
 +      sched_domain_topology = tl;
 +}
 +#else
 +static inline void sched_init_numa(void)
 +{
 +}
 +#endif /* CONFIG_NUMA */
 +
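
sched_init_numa() first has to discover how many distinct hop levels the machine has. A stand-alone sketch of that unique-distance pass over node_distance(0, j), using a made-up 4-node distance table (the kernel reads the real values from the firmware SLIT):

#include <stdio.h>

#define NR 4
static const int dist[NR][NR] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int curr = dist[0][0], levels = 0;

	for (;;) {
		int next = curr;

		/* smallest distance strictly greater than `curr` in row 0 */
		for (int j = 0; j < NR; j++) {
			int d = dist[0][j];
			if (d > curr && (d < next || next == curr))
				next = d;
		}
		if (next == curr)
			break;
		printf("level %d: distance %d\n", levels++, next);
		curr = next;
	}
	return 0;	/* prints the unique distances 20 and 30 */
}
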
  static int __sdt_alloc(const struct cpumask *cpu_map)
  {
        struct sched_domain_topology_level *tl;
                        if (!sg)
                                return -ENOMEM;
  
 +                      sg->next = sg;
 +
                        *per_cpu_ptr(sdd->sg, j) = sg;
  
                        sgp = kzalloc_node(sizeof(struct sched_group_power),
@@@ -6514,26 -6402,16 +6511,26 @@@ static void __sdt_free(const struct cpu
                struct sd_data *sdd = &tl->data;
  
                for_each_cpu(j, cpu_map) {
 -                      struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
 -                      if (sd && (sd->flags & SD_OVERLAP))
 -                              free_sched_groups(sd->groups, 0);
 -                      kfree(*per_cpu_ptr(sdd->sd, j));
 -                      kfree(*per_cpu_ptr(sdd->sg, j));
 -                      kfree(*per_cpu_ptr(sdd->sgp, j));
 +                      struct sched_domain *sd;
 +
 +                      if (sdd->sd) {
 +                              sd = *per_cpu_ptr(sdd->sd, j);
 +                              if (sd && (sd->flags & SD_OVERLAP))
 +                                      free_sched_groups(sd->groups, 0);
 +                              kfree(*per_cpu_ptr(sdd->sd, j));
 +                      }
 +
 +                      if (sdd->sg)
 +                              kfree(*per_cpu_ptr(sdd->sg, j));
 +                      if (sdd->sgp)
 +                              kfree(*per_cpu_ptr(sdd->sgp, j));
                }
                free_percpu(sdd->sd);
 +              sdd->sd = NULL;
                free_percpu(sdd->sg);
 +              sdd->sg = NULL;
                free_percpu(sdd->sgp);
 +              sdd->sgp = NULL;
        }
  }
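
The reworked __sdt_free() above is deliberately defensive: every per-cpu table is checked before it is walked and NULLed after it is freed, so teardown can safely run again on a partially built topology, for instance from an allocation-failure path. The idiom in miniature (stand-in types, not the scheduler's):

#include <stdlib.h>

struct sd_data { int **sd; };	/* stand-in for the per-cpu tables */

static void sdt_free(struct sd_data *sdd)
{
	if (sdd->sd) {		/* tolerate a partially built structure */
		free(*sdd->sd);
		free(sdd->sd);
	}
	sdd->sd = NULL;		/* make a second call harmless */
}

int main(void)
{
	struct sd_data sdd = { 0 };

	sdd.sd = malloc(sizeof(*sdd.sd));
	if (sdd.sd)
		*sdd.sd = malloc(sizeof(int));

	sdt_free(&sdd);
	sdt_free(&sdd);		/* idempotent: no double free */
	return 0;
}
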
  
@@@ -6819,6 -6697,97 +6816,6 @@@ match2
        mutex_unlock(&sched_domains_mutex);
  }
  
 -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -static void reinit_sched_domains(void)
 -{
 -      get_online_cpus();
 -
 -      /* Destroy domains first to force the rebuild */
 -      partition_sched_domains(0, NULL, NULL);
 -
 -      rebuild_sched_domains();
 -      put_online_cpus();
 -}
 -
 -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 -{
 -      unsigned int level = 0;
 -
 -      if (sscanf(buf, "%u", &level) != 1)
 -              return -EINVAL;
 -
 -      /*
 -       * level is always positive so don't check for
 -       * level < POWERSAVINGS_BALANCE_NONE which is 0
 -       * What happens on a 0 or 1 byte write; do we
 -       * need to check count as well?
 -       */
 -
 -      if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
 -              return -EINVAL;
 -
 -      if (smt)
 -              sched_smt_power_savings = level;
 -      else
 -              sched_mc_power_savings = level;
 -
 -      reinit_sched_domains();
 -
 -      return count;
 -}
 -
 -#ifdef CONFIG_SCHED_MC
 -static ssize_t sched_mc_power_savings_show(struct device *dev,
 -                                         struct device_attribute *attr,
 -                                         char *buf)
 -{
 -      return sprintf(buf, "%u\n", sched_mc_power_savings);
 -}
 -static ssize_t sched_mc_power_savings_store(struct device *dev,
 -                                          struct device_attribute *attr,
 -                                          const char *buf, size_t count)
 -{
 -      return sched_power_savings_store(buf, count, 0);
 -}
 -static DEVICE_ATTR(sched_mc_power_savings, 0644,
 -                 sched_mc_power_savings_show,
 -                 sched_mc_power_savings_store);
 -#endif
 -
 -#ifdef CONFIG_SCHED_SMT
 -static ssize_t sched_smt_power_savings_show(struct device *dev,
 -                                          struct device_attribute *attr,
 -                                          char *buf)
 -{
 -      return sprintf(buf, "%u\n", sched_smt_power_savings);
 -}
 -static ssize_t sched_smt_power_savings_store(struct device *dev,
 -                                          struct device_attribute *attr,
 -                                           const char *buf, size_t count)
 -{
 -      return sched_power_savings_store(buf, count, 1);
 -}
 -static DEVICE_ATTR(sched_smt_power_savings, 0644,
 -                 sched_smt_power_savings_show,
 -                 sched_smt_power_savings_store);
 -#endif
 -
 -int __init sched_create_sysfs_power_savings_entries(struct device *dev)
 -{
 -      int err = 0;
 -
 -#ifdef CONFIG_SCHED_SMT
 -      if (smt_capable())
 -              err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
 -#endif
 -#ifdef CONFIG_SCHED_MC
 -      if (!err && mc_capable())
 -              err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
 -#endif
 -      return err;
 -}
 -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 -
  /*
   * Update cpusets according to cpu_active mask.  If cpusets are
   * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@@ -6856,8 -6825,6 +6853,8 @@@ void __init sched_init_smp(void
        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
  
 +      sched_init_numa();
 +
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
        init_sched_domains(cpu_active_mask);
@@@ -7079,7 -7046,6 +7076,7 @@@ void __init sched_init(void
        /* May be allocated at isolcpus cmdline parse time */
        if (cpu_isolated_map == NULL)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 +      idle_thread_set_boot_cpu();
  #endif
        init_sched_fair_class();
  
@@@ -8001,9 -7967,13 +7998,9 @@@ static struct cftype cpu_files[] = 
                .write_u64 = cpu_rt_period_write_uint,
        },
  #endif
 +      { }     /* terminate */
  };
  
 -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 -{
 -      return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
 -}
 -
  struct cgroup_subsys cpu_cgroup_subsys = {
        .name           = "cpu",
        .create         = cpu_cgroup_create,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
 -      .populate       = cpu_cgroup_populate,
        .subsys_id      = cpu_cgroup_subsys_id,
 +      .base_cftypes   = cpu_files,
        .early_init     = 1,
  };
  
@@@ -8197,9 -8167,13 +8194,9 @@@ static struct cftype files[] = 
                .name = "stat",
                .read_map = cpuacct_stats_show,
        },
 +      { }     /* terminate */
  };
  
 -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 -{
 -      return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
 -}
 -
  /*
   * charge this task's execution time to its accounting group.
   *
@@@ -8231,7 -8205,7 +8228,7 @@@ struct cgroup_subsys cpuacct_subsys = 
        .name = "cpuacct",
        .create = cpuacct_create,
        .destroy = cpuacct_destroy,
 -      .populate = cpuacct_populate,
        .subsys_id = cpuacct_subsys_id,
 +      .base_cftypes = files,
  };
  #endif        /* CONFIG_CGROUP_CPUACCT */
diff --combined kernel/signal.c
index 1a006b5d9d9d7329e1e827a1b0cf79bae7e10513,833ea516685569952e1e62d8d9939db525c78c30..21ebe75ff85f0b2745e6c3251f787b067c223d90
@@@ -160,7 -160,7 +160,7 @@@ void recalc_sigpending(void
  
  #define SYNCHRONOUS_MASK \
        (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
 -       sigmask(SIGTRAP) | sigmask(SIGFPE))
 +       sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
  
  int next_signal(struct sigpending *pending, sigset_t *mask)
  {
@@@ -767,14 -767,13 +767,13 @@@ static int kill_ok_by_cred(struct task_
        const struct cred *cred = current_cred();
        const struct cred *tcred = __task_cred(t);
  
-       if (cred->user->user_ns == tcred->user->user_ns &&
-           (cred->euid == tcred->suid ||
-            cred->euid == tcred->uid ||
-            cred->uid  == tcred->suid ||
-            cred->uid  == tcred->uid))
+       if (uid_eq(cred->euid, tcred->suid) ||
+           uid_eq(cred->euid, tcred->uid)  ||
+           uid_eq(cred->uid,  tcred->suid) ||
+           uid_eq(cred->uid,  tcred->uid))
                return 1;
  
-       if (ns_capable(tcred->user->user_ns, CAP_KILL))
+       if (ns_capable(tcred->user_ns, CAP_KILL))
                return 1;
  
        return 0;
@@@ -1020,15 -1019,6 +1019,6 @@@ static inline int legacy_queue(struct s
        return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
  }
  
- /*
-  * map the uid in struct cred into user namespace *ns
-  */
- static inline uid_t map_cred_ns(const struct cred *cred,
-                               struct user_namespace *ns)
- {
-       return user_ns_map_uid(ns, cred, cred->uid);
- }
  #ifdef CONFIG_USER_NS
  static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
  {
        if (SI_FROMKERNEL(info))
                return;
  
-       info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
-                                       current_cred(), info->si_uid);
+       rcu_read_lock();
+       info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
+                                       make_kuid(current_user_ns(), info->si_uid));
+       rcu_read_unlock();
  }
  #else
  static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
@@@ -1106,7 -1098,7 +1098,7 @@@ static int __send_signal(int sig, struc
                        q->info.si_code = SI_USER;
                        q->info.si_pid = task_tgid_nr_ns(current,
                                                        task_active_pid_ns(t));
-                       q->info.si_uid = current_uid();
+                       q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
                        break;
                case (unsigned long) SEND_SIG_PRIV:
                        q->info.si_signo = sig;
@@@ -1387,10 -1379,8 +1379,8 @@@ static int kill_as_cred_perm(const stru
                             struct task_struct *target)
  {
        const struct cred *pcred = __task_cred(target);
-       if (cred->user_ns != pcred->user_ns)
-               return 0;
-       if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
-           cred->uid  != pcred->suid && cred->uid  != pcred->uid)
+       if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) &&
+           !uid_eq(cred->uid,  pcred->suid) && !uid_eq(cred->uid,  pcred->uid))
                return 0;
        return 1;
  }
@@@ -1678,8 -1668,8 +1668,8 @@@ bool do_notify_parent(struct task_struc
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
-       info.si_uid = map_cred_ns(__task_cred(tsk),
-                       task_cred_xxx(tsk->parent, user_ns));
+       info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
+                                      task_uid(tsk));
        rcu_read_unlock();
  
        info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@@ -1762,8 -1752,7 +1752,7 @@@ static void do_notify_parent_cldstop(st
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
-       info.si_uid = map_cred_ns(__task_cred(tsk),
-                       task_cred_xxx(parent, user_ns));
+       info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
        rcu_read_unlock();
  
        info.si_utime = cputime_to_clock_t(tsk->utime);
@@@ -1973,7 -1962,7 +1962,7 @@@ static void ptrace_do_notify(int signr
        info.si_signo = signr;
        info.si_code = exit_code;
        info.si_pid = task_pid_vnr(current);
-       info.si_uid = current_uid();
+       info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
  
        /* Let the debugger run.  */
        ptrace_stop(exit_code, why, 1, &info);
@@@ -2181,8 -2170,8 +2170,8 @@@ static int ptrace_signal(int signr, sig
                info->si_code = SI_USER;
                rcu_read_lock();
                info->si_pid = task_pid_vnr(current->parent);
-               info->si_uid = map_cred_ns(__task_cred(current->parent),
-                               current_user_ns());
+               info->si_uid = from_kuid_munged(current_user_ns(),
+                                               task_uid(current->parent));
                rcu_read_unlock();
        }
  
@@@ -2706,13 -2695,6 +2695,13 @@@ int copy_siginfo_to_user(siginfo_t __us
                err |= __put_user(from->si_uid, &to->si_uid);
                err |= __put_user(from->si_ptr, &to->si_ptr);
                break;
 +#ifdef __ARCH_SIGSYS
 +      case __SI_SYS:
 +              err |= __put_user(from->si_call_addr, &to->si_call_addr);
 +              err |= __put_user(from->si_syscall, &to->si_syscall);
 +              err |= __put_user(from->si_arch, &to->si_arch);
 +              break;
 +#endif
        default: /* this is just in case for now ... */
                err |= __put_user(from->si_pid, &to->si_pid);
                err |= __put_user(from->si_uid, &to->si_uid);
@@@ -2835,7 -2817,7 +2824,7 @@@ SYSCALL_DEFINE2(kill, pid_t, pid, int, 
        info.si_errno = 0;
        info.si_code = SI_USER;
        info.si_pid = task_tgid_vnr(current);
-       info.si_uid = current_uid();
+       info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
  
        return kill_something_info(sig, &info, pid);
  }
@@@ -2878,7 -2860,7 +2867,7 @@@ static int do_tkill(pid_t tgid, pid_t p
        info.si_errno = 0;
        info.si_code = SI_TKILL;
        info.si_pid = task_tgid_vnr(current);
-       info.si_uid = current_uid();
+       info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
  
        return do_send_specific(tgid, pid, sig, &info);
  }
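
All of the si_uid fixups above follow the same recipe: convert the raw uid into a kernel-global kuid in the sender's namespace with make_kuid(), then project it into the receiver's namespace with from_kuid_munged(), which substitutes overflowuid instead of failing when no mapping exists. A simplified single-extent model (assumed shapes; the kernel supports multiple mapping extents per namespace):

#include <stdio.h>

#define OVERFLOWUID 65534u
#define INVALID_UID ((unsigned int)-1)

struct user_ns { unsigned int first, lower_first, count; };

/* uid in `ns` -> kernel-global kuid */
static unsigned int make_kuid(const struct user_ns *ns, unsigned int uid)
{
	if (uid >= ns->first && uid < ns->first + ns->count)
		return ns->lower_first + (uid - ns->first);
	return INVALID_UID;
}

/* kernel-global kuid -> uid in `ns`, lying with overflowuid if unmapped */
static unsigned int from_kuid_munged(const struct user_ns *ns, unsigned int kuid)
{
	if (kuid >= ns->lower_first && kuid < ns->lower_first + ns->count)
		return ns->first + (kuid - ns->lower_first);
	return OVERFLOWUID;
}

int main(void)
{
	struct user_ns sender   = { 0, 100000, 1000 };	/* uids 0..999 -> 100000.. */
	struct user_ns receiver = { 0, 200000, 1000 };

	unsigned int kuid = make_kuid(&sender, 42);
	printf("%u\n", from_kuid_munged(&sender, kuid));	/* 42 */
	printf("%u\n", from_kuid_munged(&receiver, kuid));	/* 65534 */
	return 0;
}
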
diff --combined kernel/sys.c
index ba0ae8eea6fbc32fb159c6dbf8d9868bcbaa1452,f484077b6b148190816cbc9b8f6ccb19096b5262..6df42624e454aeb236ab1c9413d6fdf5f676365f
  int overflowuid = DEFAULT_OVERFLOWUID;
  int overflowgid = DEFAULT_OVERFLOWGID;
  
- #ifdef CONFIG_UID16
  EXPORT_SYMBOL(overflowuid);
  EXPORT_SYMBOL(overflowgid);
- #endif
  
  /*
   * the same as above, but for filesystems which can only store a 16-bit
@@@ -133,11 -131,10 +131,10 @@@ static bool set_one_prio_perm(struct ta
  {
        const struct cred *cred = current_cred(), *pcred = __task_cred(p);
  
-       if (pcred->user->user_ns == cred->user->user_ns &&
-           (pcred->uid  == cred->euid ||
-            pcred->euid == cred->euid))
+       if (uid_eq(pcred->uid,  cred->euid) ||
+           uid_eq(pcred->euid, cred->euid))
                return true;
-       if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
+       if (ns_capable(pcred->user_ns, CAP_SYS_NICE))
                return true;
        return false;
  }
@@@ -177,6 -174,7 +174,7 @@@ SYSCALL_DEFINE3(setpriority, int, which
        const struct cred *cred = current_cred();
        int error = -EINVAL;
        struct pid *pgrp;
+       kuid_t uid;
  
        if (which > PRIO_USER || which < PRIO_PROCESS)
                goto out;
                        } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                        break;
                case PRIO_USER:
-                       user = (struct user_struct *) cred->user;
+                       uid = make_kuid(cred->user_ns, who);
+                       user = cred->user;
                        if (!who)
-                               who = cred->uid;
-                       else if ((who != cred->uid) &&
-                                !(user = find_user(who)))
+                               uid = cred->uid;
+                       else if (!uid_eq(uid, cred->uid) &&
+                                !(user = find_user(uid)))
                                goto out_unlock;        /* No processes for this user */
  
                        do_each_thread(g, p) {
-                               if (__task_cred(p)->uid == who)
+                               if (uid_eq(task_uid(p), uid))
                                        error = set_one_prio(p, niceval, error);
                        } while_each_thread(g, p);
-                       if (who != cred->uid)
+                       if (!uid_eq(uid, cred->uid))
                                free_uid(user);         /* For find_user() */
                        break;
        }
@@@ -244,6 -243,7 +243,7 @@@ SYSCALL_DEFINE2(getpriority, int, which
        const struct cred *cred = current_cred();
        long niceval, retval = -ESRCH;
        struct pid *pgrp;
+       kuid_t uid;
  
        if (which > PRIO_USER || which < PRIO_PROCESS)
                return -EINVAL;
                        } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
                        break;
                case PRIO_USER:
-                       user = (struct user_struct *) cred->user;
+                       uid = make_kuid(cred->user_ns, who);
+                       user = cred->user;
                        if (!who)
-                               who = cred->uid;
-                       else if ((who != cred->uid) &&
-                                !(user = find_user(who)))
+                               uid = cred->uid;
+                       else if (!uid_eq(uid, cred->uid) &&
+                                !(user = find_user(uid)))
                                goto out_unlock;        /* No processes for this user */
  
                        do_each_thread(g, p) {
-                               if (__task_cred(p)->uid == who) {
+                               if (uid_eq(task_uid(p), uid)) {
                                        niceval = 20 - task_nice(p);
                                        if (niceval > retval)
                                                retval = niceval;
                                }
                        } while_each_thread(g, p);
-                       if (who != cred->uid)
+                       if (!uid_eq(uid, cred->uid))
                                free_uid(user);         /* for find_user() */
                        break;
        }
@@@ -553,9 -554,19 +554,19 @@@ void ctrl_alt_del(void
   */
  SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
  {
+       struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
+       kgid_t krgid, kegid;
+       krgid = make_kgid(ns, rgid);
+       kegid = make_kgid(ns, egid);
+       if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+               return -EINVAL;
+       if ((egid != (gid_t) -1) && !gid_valid(kegid))
+               return -EINVAL;
  
        new = prepare_creds();
        if (!new)
  
        retval = -EPERM;
        if (rgid != (gid_t) -1) {
-               if (old->gid == rgid ||
-                   old->egid == rgid ||
+               if (gid_eq(old->gid, krgid) ||
+                   gid_eq(old->egid, krgid) ||
                    nsown_capable(CAP_SETGID))
-                       new->gid = rgid;
+                       new->gid = krgid;
                else
                        goto error;
        }
        if (egid != (gid_t) -1) {
-               if (old->gid == egid ||
-                   old->egid == egid ||
-                   old->sgid == egid ||
+               if (gid_eq(old->gid, kegid) ||
+                   gid_eq(old->egid, kegid) ||
+                   gid_eq(old->sgid, kegid) ||
                    nsown_capable(CAP_SETGID))
-                       new->egid = egid;
+                       new->egid = kegid;
                else
                        goto error;
        }
  
        if (rgid != (gid_t) -1 ||
-           (egid != (gid_t) -1 && egid != old->gid))
+           (egid != (gid_t) -1 && !gid_eq(kegid, old->gid)))
                new->sgid = new->egid;
        new->fsgid = new->egid;
  
@@@ -600,9 -611,15 +611,15 @@@ error
   */
  SYSCALL_DEFINE1(setgid, gid_t, gid)
  {
+       struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
+       kgid_t kgid;
+       kgid = make_kgid(ns, gid);
+       if (!gid_valid(kgid))
+               return -EINVAL;
  
        new = prepare_creds();
        if (!new)
  
        retval = -EPERM;
        if (nsown_capable(CAP_SETGID))
-               new->gid = new->egid = new->sgid = new->fsgid = gid;
-       else if (gid == old->gid || gid == old->sgid)
-               new->egid = new->fsgid = gid;
+               new->gid = new->egid = new->sgid = new->fsgid = kgid;
+       else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
+               new->egid = new->fsgid = kgid;
        else
                goto error;
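
Every set*id path above now shares the same prologue: map the userspace ids into kgids/kuids first, treating -1 as "leave unchanged", and fail with -EINVAL before any credentials are prepared. A compact sketch of that validate-first shape (stand-in make_kgid(); the real one consults the caller's namespace mappings):

#include <stdio.h>

#define EINVAL 22

typedef struct { unsigned int val; } kgid_t;

static int gid_valid(kgid_t g) { return g.val != (unsigned int)-1; }

/* stand-in: pretend only gids below 1000 are mapped in this namespace */
static kgid_t make_kgid(unsigned int gid)
{
	kgid_t k = { gid < 1000 ? gid : (unsigned int)-1 };
	return k;
}

static int setregid_prologue(unsigned int rgid, unsigned int egid)
{
	kgid_t krgid = make_kgid(rgid);
	kgid_t kegid = make_kgid(egid);

	/* -1 means "don't change", so only a real, unmappable id fails */
	if (rgid != (unsigned int)-1 && !gid_valid(krgid))
		return -EINVAL;
	if (egid != (unsigned int)-1 && !gid_valid(kegid))
		return -EINVAL;
	return 0;
}

int main(void)
{
	printf("%d\n", setregid_prologue(100, (unsigned int)-1));	/* 0 */
	printf("%d\n", setregid_prologue(5000, 100));			/* -22 */
	return 0;
}
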
  
@@@ -631,7 -648,7 +648,7 @@@ static int set_user(struct cred *new
  {
        struct user_struct *new_user;
  
-       new_user = alloc_uid(current_user_ns(), new->uid);
+       new_user = alloc_uid(new->uid);
        if (!new_user)
                return -EAGAIN;
  
   */
  SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
  {
+       struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
+       kuid_t kruid, keuid;
+       kruid = make_kuid(ns, ruid);
+       keuid = make_kuid(ns, euid);
+       if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+               return -EINVAL;
+       if ((euid != (uid_t) -1) && !uid_valid(keuid))
+               return -EINVAL;
  
        new = prepare_creds();
        if (!new)
  
        retval = -EPERM;
        if (ruid != (uid_t) -1) {
-               new->uid = ruid;
-               if (old->uid != ruid &&
-                   old->euid != ruid &&
+               new->uid = kruid;
+               if (!uid_eq(old->uid, kruid) &&
+                   !uid_eq(old->euid, kruid) &&
                    !nsown_capable(CAP_SETUID))
                        goto error;
        }
  
        if (euid != (uid_t) -1) {
-               new->euid = euid;
-               if (old->uid != euid &&
-                   old->euid != euid &&
-                   old->suid != euid &&
+               new->euid = keuid;
+               if (!uid_eq(old->uid, keuid) &&
+                   !uid_eq(old->euid, keuid) &&
+                   !uid_eq(old->suid, keuid) &&
                    !nsown_capable(CAP_SETUID))
                        goto error;
        }
  
-       if (new->uid != old->uid) {
+       if (!uid_eq(new->uid, old->uid)) {
                retval = set_user(new);
                if (retval < 0)
                        goto error;
        }
        if (ruid != (uid_t) -1 ||
-           (euid != (uid_t) -1 && euid != old->uid))
+           (euid != (uid_t) -1 && !uid_eq(keuid, old->uid)))
                new->suid = new->euid;
        new->fsuid = new->euid;
  
@@@ -731,9 -758,15 +758,15 @@@ error
   */
  SYSCALL_DEFINE1(setuid, uid_t, uid)
  {
+       struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
+       kuid_t kuid;
+       kuid = make_kuid(ns, uid);
+       if (!uid_valid(kuid))
+               return -EINVAL;
  
        new = prepare_creds();
        if (!new)
  
        retval = -EPERM;
        if (nsown_capable(CAP_SETUID)) {
-               new->suid = new->uid = uid;
-               if (uid != old->uid) {
+               new->suid = new->uid = kuid;
+               if (!uid_eq(kuid, old->uid)) {
                        retval = set_user(new);
                        if (retval < 0)
                                goto error;
                }
-       } else if (uid != old->uid && uid != new->suid) {
+       } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) {
                goto error;
        }
  
-       new->fsuid = new->euid = uid;
+       new->fsuid = new->euid = kuid;
  
        retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
        if (retval < 0)
@@@ -772,9 -805,24 +805,24 @@@ error
   */
  SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
  {
+       struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
+       kuid_t kruid, keuid, ksuid;
+       kruid = make_kuid(ns, ruid);
+       keuid = make_kuid(ns, euid);
+       ksuid = make_kuid(ns, suid);
+       if ((ruid != (uid_t) -1) && !uid_valid(kruid))
+               return -EINVAL;
+       if ((euid != (uid_t) -1) && !uid_valid(keuid))
+               return -EINVAL;
+       if ((suid != (uid_t) -1) && !uid_valid(ksuid))
+               return -EINVAL;
  
        new = prepare_creds();
        if (!new)
  
        retval = -EPERM;
        if (!nsown_capable(CAP_SETUID)) {
-               if (ruid != (uid_t) -1 && ruid != old->uid &&
-                   ruid != old->euid  && ruid != old->suid)
+               if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
+                   !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
                        goto error;
-               if (euid != (uid_t) -1 && euid != old->uid &&
-                   euid != old->euid  && euid != old->suid)
+               if (euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
+                   !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
                        goto error;
-               if (suid != (uid_t) -1 && suid != old->uid &&
-                   suid != old->euid  && suid != old->suid)
+               if (suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
+                   !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
                        goto error;
        }
  
        if (ruid != (uid_t) -1) {
-               new->uid = ruid;
-               if (ruid != old->uid) {
+               new->uid = kruid;
+               if (!uid_eq(kruid, old->uid)) {
                        retval = set_user(new);
                        if (retval < 0)
                                goto error;
                }
        }
        if (euid != (uid_t) -1)
-               new->euid = euid;
+               new->euid = keuid;
        if (suid != (uid_t) -1)
-               new->suid = suid;
+               new->suid = ksuid;
        new->fsuid = new->euid;
  
        retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
@@@ -820,14 -868,19 +868,19 @@@ error
        return retval;
  }
  
- SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
+ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp)
  {
        const struct cred *cred = current_cred();
        int retval;
+       uid_t ruid, euid, suid;
+       ruid = from_kuid_munged(cred->user_ns, cred->uid);
+       euid = from_kuid_munged(cred->user_ns, cred->euid);
+       suid = from_kuid_munged(cred->user_ns, cred->suid);
  
-       if (!(retval   = put_user(cred->uid,  ruid)) &&
-           !(retval   = put_user(cred->euid, euid)))
-               retval = put_user(cred->suid, suid);
+       if (!(retval   = put_user(ruid, ruidp)) &&
+           !(retval   = put_user(euid, euidp)))
+               retval = put_user(suid, suidp);
  
        return retval;
  }
   */
  SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
  {
+       struct user_namespace *ns = current_user_ns();
        const struct cred *old;
        struct cred *new;
        int retval;
+       kgid_t krgid, kegid, ksgid;
+       krgid = make_kgid(ns, rgid);
+       kegid = make_kgid(ns, egid);
+       ksgid = make_kgid(ns, sgid);
+       if ((rgid != (gid_t) -1) && !gid_valid(krgid))
+               return -EINVAL;
+       if ((egid != (gid_t) -1) && !gid_valid(kegid))
+               return -EINVAL;
+       if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
+               return -EINVAL;
  
        new = prepare_creds();
        if (!new)
  
        retval = -EPERM;
        if (!nsown_capable(CAP_SETGID)) {
-               if (rgid != (gid_t) -1 && rgid != old->gid &&
-                   rgid != old->egid  && rgid != old->sgid)
+               if (rgid != (gid_t) -1        && !gid_eq(krgid, old->gid) &&
+                   !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
                        goto error;
-               if (egid != (gid_t) -1 && egid != old->gid &&
-                   egid != old->egid  && egid != old->sgid)
+               if (egid != (gid_t) -1        && !gid_eq(kegid, old->gid) &&
+                   !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
                        goto error;
-               if (sgid != (gid_t) -1 && sgid != old->gid &&
-                   sgid != old->egid  && sgid != old->sgid)
+               if (sgid != (gid_t) -1        && !gid_eq(ksgid, old->gid) &&
+                   !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
                        goto error;
        }
  
        if (rgid != (gid_t) -1)
-               new->gid = rgid;
+               new->gid = krgid;
        if (egid != (gid_t) -1)
-               new->egid = egid;
+               new->egid = kegid;
        if (sgid != (gid_t) -1)
-               new->sgid = sgid;
+               new->sgid = ksgid;
        new->fsgid = new->egid;
  
        return commit_creds(new);
@@@ -874,14 -940,19 +940,19 @@@ error
        return retval;
  }
  
- SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
+ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp)
  {
        const struct cred *cred = current_cred();
        int retval;
+       gid_t rgid, egid, sgid;
+       rgid = from_kgid_munged(cred->user_ns, cred->gid);
+       egid = from_kgid_munged(cred->user_ns, cred->egid);
+       sgid = from_kgid_munged(cred->user_ns, cred->sgid);
  
-       if (!(retval   = put_user(cred->gid,  rgid)) &&
-           !(retval   = put_user(cred->egid, egid)))
-               retval = put_user(cred->sgid, sgid);
+       if (!(retval   = put_user(rgid, rgidp)) &&
+           !(retval   = put_user(egid, egidp)))
+               retval = put_user(sgid, sgidp);
  
        return retval;
  }
@@@ -898,18 -969,24 +969,24 @@@ SYSCALL_DEFINE1(setfsuid, uid_t, uid
        const struct cred *old;
        struct cred *new;
        uid_t old_fsuid;
+       kuid_t kuid;
+       old = current_cred();
+       old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
+       kuid = make_kuid(old->user_ns, uid);
+       if (!uid_valid(kuid))
+               return old_fsuid;
  
        new = prepare_creds();
        if (!new)
-               return current_fsuid();
-       old = current_cred();
-       old_fsuid = old->fsuid;
+               return old_fsuid;
  
-       if (uid == old->uid  || uid == old->euid  ||
-           uid == old->suid || uid == old->fsuid ||
+       if (uid_eq(kuid, old->uid)  || uid_eq(kuid, old->euid)  ||
+           uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
            nsown_capable(CAP_SETUID)) {
-               if (uid != old_fsuid) {
-                       new->fsuid = uid;
+               if (!uid_eq(kuid, old->fsuid)) {
+                       new->fsuid = kuid;
                        if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
                                goto change_okay;
                }
@@@ -931,18 -1008,24 +1008,24 @@@ SYSCALL_DEFINE1(setfsgid, gid_t, gid
        const struct cred *old;
        struct cred *new;
        gid_t old_fsgid;
+       kgid_t kgid;
+       old = current_cred();
+       old_fsgid = from_kgid_munged(old->user_ns, old->fsgid);
+       kgid = make_kgid(old->user_ns, gid);
+       if (!gid_valid(kgid))
+               return old_fsgid;
  
        new = prepare_creds();
        if (!new)
-               return current_fsgid();
-       old = current_cred();
-       old_fsgid = old->fsgid;
+               return old_fsgid;
  
-       if (gid == old->gid  || gid == old->egid  ||
-           gid == old->sgid || gid == old->fsgid ||
+       if (gid_eq(kgid, old->gid)  || gid_eq(kgid, old->egid)  ||
+           gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
            nsown_capable(CAP_SETGID)) {
-               if (gid != old_fsgid) {
-                       new->fsgid = gid;
+               if (!gid_eq(kgid, old->fsgid)) {
+                       new->fsgid = kgid;
                        goto change_okay;
                }
        }
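
Note the contract setfsuid()/setfsgid() keep above: they have no error return and always hand back the previous fsuid/fsgid, so an unmappable id is rejected simply by computing old_fsuid before validation and returning it untouched. In miniature (permission checks omitted):

#include <stdio.h>

static unsigned int fsuid = 1000;

/* `valid` stands in for uid_valid(make_kuid(ns, uid)) */
static unsigned int sys_setfsuid(unsigned int uid, int valid)
{
	unsigned int old_fsuid = fsuid;

	if (!valid)
		return old_fsuid;	/* no errno: callers compare the return */

	fsuid = uid;
	return old_fsuid;
}

int main(void)
{
	printf("%u\n", sys_setfsuid(2000, 1));	/* 1000 */
	printf("%u\n", sys_setfsuid(4242, 0));	/* 2000, fsuid unchanged */
	return 0;
}
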
@@@ -1498,15 -1581,14 +1581,14 @@@ static int check_prlimit_permission(str
                return 0;
  
        tcred = __task_cred(task);
-       if (cred->user->user_ns == tcred->user->user_ns &&
-           (cred->uid == tcred->euid &&
-            cred->uid == tcred->suid &&
-            cred->uid == tcred->uid  &&
-            cred->gid == tcred->egid &&
-            cred->gid == tcred->sgid &&
-            cred->gid == tcred->gid))
+       if (uid_eq(cred->uid, tcred->euid) &&
+           uid_eq(cred->uid, tcred->suid) &&
+           uid_eq(cred->uid, tcred->uid)  &&
+           gid_eq(cred->gid, tcred->egid) &&
+           gid_eq(cred->gid, tcred->sgid) &&
+           gid_eq(cred->gid, tcred->gid))
                return 0;
-       if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+       if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
                return 0;
  
        return -EPERM;
@@@ -1908,7 -1990,7 +1990,7 @@@ SYSCALL_DEFINE5(prctl, int, option, uns
                        error = prctl_get_seccomp();
                        break;
                case PR_SET_SECCOMP:
 -                      error = prctl_set_seccomp(arg2);
 +                      error = prctl_set_seccomp(arg2, (char __user *)arg3);
                        break;
                case PR_GET_TSC:
                        error = GET_TSC_CTL(arg2);
                        error = put_user(me->signal->is_child_subreaper,
                                         (int __user *) arg2);
                        break;
 +              case PR_SET_NO_NEW_PRIVS:
 +                      if (arg2 != 1 || arg3 || arg4 || arg5)
 +                              return -EINVAL;
 +
 +                      current->no_new_privs = 1;
 +                      break;
 +              case PR_GET_NO_NEW_PRIVS:
 +                      if (arg2 || arg3 || arg4 || arg5)
 +                              return -EINVAL;
 +                      return current->no_new_privs ? 1 : 0;
                default:
                        error = -EINVAL;
                        break;
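
The PR_SET_NO_NEW_PRIVS/PR_GET_NO_NEW_PRIVS pair added to the prctl() switch above is callable from userspace like so (constants per this series; the fallback defines are only needed on older headers). Note the strict argument checks in the kernel: the extra arguments must be zero, and the flag is one-way.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
#endif

int main(void)
{
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0)
		perror("PR_SET_NO_NEW_PRIVS");

	/* one-way: from now on execve() cannot grant privileges */
	printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
	return 0;
}
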
diff --combined kernel/timer.c
index 09de9a941cd706fbd43abada917044fca8909a3e,67316cb6a777494a4b53b6ba357ec17780585d6f..6ec7e7e0db435d722cecd601090ca4a8b3dc994a
@@@ -861,13 -861,7 +861,13 @@@ EXPORT_SYMBOL(mod_timer)
   *
   * mod_timer_pinned() is a way to update the expire field of an
   * active timer (if the timer is inactive it will be activated)
 - * and not allow the timer to be migrated to a different CPU.
 + * and to ensure that the timer is scheduled on the current CPU.
 + *
 + * Note that this does not prevent the timer from being migrated
 + * when the current CPU goes offline.  If this is a problem for
 + * you, use CPU-hotplug notifiers to handle it correctly, for
 + * example, cancelling the timer when the corresponding CPU goes
 + * offline.
   *
   * mod_timer_pinned(timer, expires) is equivalent to:
   *
@@@ -1108,9 -1102,7 +1108,9 @@@ static void call_timer_fn(struct timer_
         * warnings as well as problems when looking into
         * timer->lockdep_map, make a copy and use that here.
         */
 -      struct lockdep_map lockdep_map = timer->lockdep_map;
 +      struct lockdep_map lockdep_map;
 +
 +      lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
  #endif
        /*
         * Couple the lock chain with the lock chain at
@@@ -1435,25 -1427,25 +1435,25 @@@ SYSCALL_DEFINE0(getppid
  SYSCALL_DEFINE0(getuid)
  {
        /* Only we change this so SMP safe */
-       return current_uid();
+       return from_kuid_munged(current_user_ns(), current_uid());
  }
  
  SYSCALL_DEFINE0(geteuid)
  {
        /* Only we change this so SMP safe */
-       return current_euid();
+       return from_kuid_munged(current_user_ns(), current_euid());
  }
  
  SYSCALL_DEFINE0(getgid)
  {
        /* Only we change this so SMP safe */
-       return current_gid();
+       return from_kgid_munged(current_user_ns(), current_gid());
  }
  
  SYSCALL_DEFINE0(getegid)
  {
        /* Only we change this so SMP safe */
-       return  current_egid();
+       return from_kgid_munged(current_user_ns(), current_egid());
  }
  
  #endif
diff --combined mm/mempolicy.c
index b19569137529221163e6b51bd96e3a161805a97f,7b44fc8ec99c6e55d69db3251514d0344a931e1b..39fd416ae14f7cc1da341c50e85fa1bdf93a5904
@@@ -1334,8 -1334,8 +1334,8 @@@ SYSCALL_DEFINE4(migrate_pages, pid_t, p
         * userid as the target process.
         */
        tcred = __task_cred(task);
-       if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
-           cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
+       if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+           !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
            !capable(CAP_SYS_NICE)) {
                rcu_read_unlock();
                err = -EPERM;
  
        mm = get_task_mm(task);
        put_task_struct(task);
 -      if (mm)
 -              err = do_migrate_pages(mm, old, new,
 -                      capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 -      else
 +
 +      if (!mm) {
                err = -EINVAL;
 +              goto out;
 +      }
 +
 +      err = do_migrate_pages(mm, old, new,
 +              capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
  
        mmput(mm);
  out:
diff --combined mm/migrate.c
index 11072383ae12e5698498be5b3da5b8d991192535,1cf5252c3b9948e580e4b1bc16290f8dfe893943..ab81d482ae6f1cac508fce796c8902432e32c3ad
@@@ -1371,8 -1371,8 +1371,8 @@@ SYSCALL_DEFINE6(move_pages, pid_t, pid
         * userid as the target process.
         */
        tcred = __task_cred(task);
-       if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
-           cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
+       if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+           !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
            !capable(CAP_SYS_NICE)) {
                rcu_read_unlock();
                err = -EPERM;
        mm = get_task_mm(task);
        put_task_struct(task);
  
 -      if (mm) {
 -              if (nodes)
 -                      err = do_pages_move(mm, task_nodes, nr_pages, pages,
 -                                          nodes, status, flags);
 -              else
 -                      err = do_pages_stat(mm, nr_pages, pages, status);
 -      } else
 -              err = -EINVAL;
 +      if (!mm)
 +              return -EINVAL;
 +
 +      if (nodes)
 +              err = do_pages_move(mm, task_nodes, nr_pages, pages,
 +                                  nodes, status, flags);
 +      else
 +              err = do_pages_stat(mm, nr_pages, pages, status);
  
        mmput(mm);
        return err;
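
Besides the uid_eq() conversions, both hunks above restructure their tails the same way: instead of nesting the whole success path inside `if (mm) { ... } else err = -EINVAL;`, they bail out early (via return, or `goto out` where cleanup is shared) so the main path reads linearly. A sketch of the shape with stand-ins for get_task_mm()/do_pages_move()/mmput():

#include <stdio.h>
#include <stdlib.h>

#define EINVAL 22

static int do_pages_move_stub(int *mm) { return *mm; }

static int move_pages_tail(int *mm)
{
	int err;

	if (!mm) {
		err = -EINVAL;
		goto out;	/* shared exit label, as in the mempolicy.c hunk */
	}

	err = do_pages_move_stub(mm);
	free(mm);		/* stands in for mmput(mm) */
out:
	return err;
}

int main(void)
{
	int *mm = calloc(1, sizeof(*mm));

	printf("%d %d\n", move_pages_tail(NULL), mm ? move_pages_tail(mm) : -1);
	return 0;
}
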
diff --combined net/core/sock.c
index f372d9bf497675a436aa2cbc7cba00b5464faf9b,e1ec8ba1381cf596d0065cd6429da2406232319f..653f8c0aedc54aafb08c6f451157f7ca7e432efa
@@@ -89,8 -89,6 +89,8 @@@
   *            2 of the License, or (at your option) any later version.
   */
  
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <linux/capability.h>
  #include <linux/errno.h>
  #include <linux/types.h>
  #include <linux/user_namespace.h>
  #include <linux/static_key.h>
  #include <linux/memcontrol.h>
 +#include <linux/prefetch.h>
  
  #include <asm/uaccess.h>
  
@@@ -143,7 -140,7 +143,7 @@@ static DEFINE_MUTEX(proto_list_mutex)
  static LIST_HEAD(proto_list);
  
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 -int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
 +int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
  {
        struct proto *proto;
        int ret = 0;
        mutex_lock(&proto_list_mutex);
        list_for_each_entry(proto, &proto_list, node) {
                if (proto->init_cgroup) {
 -                      ret = proto->init_cgroup(cgrp, ss);
 +                      ret = proto->init_cgroup(memcg, ss);
                        if (ret)
                                goto out;
                }
  out:
        list_for_each_entry_continue_reverse(proto, &proto_list, node)
                if (proto->destroy_cgroup)
 -                      proto->destroy_cgroup(cgrp);
 +                      proto->destroy_cgroup(memcg);
        mutex_unlock(&proto_list_mutex);
        return ret;
  }
  
 -void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
 +void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
  {
        struct proto *proto;
  
        mutex_lock(&proto_list_mutex);
        list_for_each_entry_reverse(proto, &proto_list, node)
                if (proto->destroy_cgroup)
 -                      proto->destroy_cgroup(cgrp);
 +                      proto->destroy_cgroup(memcg);
        mutex_unlock(&proto_list_mutex);
  }
  #endif
@@@ -261,9 -258,7 +261,9 @@@ static struct lock_class_key af_callbac
  
  /* Run time adjustable parameters. */
  __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 +EXPORT_SYMBOL(sysctl_wmem_max);
  __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 +EXPORT_SYMBOL(sysctl_rmem_max);
  __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
  __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
  
@@@ -299,8 -294,9 +299,8 @@@ static int sock_set_timeout(long *timeo
                *timeo_p = 0;
                if (warned < 10 && net_ratelimit()) {
                        warned++;
 -                      printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 -                             "tries to set negative timeout\n",
 -                              current->comm, task_pid_nr(current));
 +                      pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 +                              __func__, current->comm, task_pid_nr(current));
                }
                return 0;
        }
@@@ -318,8 -314,8 +318,8 @@@ static void sock_warn_obsolete_bsdism(c
        static char warncomm[TASK_COMM_LEN];
        if (strcmp(warncomm, current->comm) && warned < 5) {
                strcpy(warncomm,  current->comm);
 -              printk(KERN_WARNING "process `%s' is using obsolete "
 -                     "%s SO_BSDCOMPAT\n", warncomm, name);
 +              pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 +                      warncomm, name);
                warned++;
        }
  }
@@@ -393,7 -389,7 +393,7 @@@ int sk_receive_skb(struct sock *sk, str
  
        skb->dev = NULL;
  
 -      if (sk_rcvqueues_full(sk, skb)) {
 +      if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }
                rc = sk_backlog_rcv(sk, skb);
  
                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 -      } else if (sk_add_backlog(sk, skb)) {
 +      } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
                bh_unlock_sock(sk);
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
@@@ -565,7 -561,7 +565,7 @@@ int sock_setsockopt(struct socket *sock
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
 -              sk->sk_reuse = valbool;
 +              sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_TYPE:
        case SO_PROTOCOL:
                break;
        case SO_SNDBUF:
                /* Don't error on this BSD doesn't and if you think
 -                 about it this is right. Otherwise apps have to
 -                 play 'guess the biggest size' games. RCVBUF/SNDBUF
 -                 are treated in BSD as hints */
 -
 -              if (val > sysctl_wmem_max)
 -                      val = sysctl_wmem_max;
 +               * about it this is right. Otherwise apps have to
 +               * play 'guess the biggest size' games. RCVBUF/SNDBUF
 +               * are treated in BSD as hints
 +               */
 +              val = min_t(u32, val, sysctl_wmem_max);
  set_sndbuf:
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 -              if ((val * 2) < SOCK_MIN_SNDBUF)
 -                      sk->sk_sndbuf = SOCK_MIN_SNDBUF;
 -              else
 -                      sk->sk_sndbuf = val * 2;
 -
 -              /*
 -               *      Wake up sending tasks if we
 -               *      upped the value.
 -               */
 +              sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
 +              /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;
  
  
        case SO_RCVBUF:
                /* Don't error on this BSD doesn't and if you think
 -                 about it this is right. Otherwise apps have to
 -                 play 'guess the biggest size' games. RCVBUF/SNDBUF
 -                 are treated in BSD as hints */
 -
 -              if (val > sysctl_rmem_max)
 -                      val = sysctl_rmem_max;
 +               * about it this is right. Otherwise apps have to
 +               * play 'guess the biggest size' games. RCVBUF/SNDBUF
 +               * are treated in BSD as hints
 +               */
 +              val = min_t(u32, val, sysctl_rmem_max);
  set_rcvbuf:
                sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
                /*
                 * returning the value we actually used in getsockopt
                 * is the most desirable behavior.
                 */
 -              if ((val * 2) < SOCK_MIN_RCVBUF)
 -                      sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
 -              else
 -                      sk->sk_rcvbuf = val * 2;
 +              sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
                break;
  
        case SO_RCVBUFFORCE:
@@@ -813,8 -821,8 +813,8 @@@ void cred_to_ucred(struct pid *pid, con
        if (cred) {
                struct user_namespace *current_ns = current_user_ns();
  
-               ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
-               ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
+               ucred->uid = from_kuid(current_ns, cred->euid);
+               ucred->gid = from_kgid(current_ns, cred->egid);
        }
  }
  EXPORT_SYMBOL_GPL(cred_to_ucred);
@@@ -850,7 -858,7 +850,7 @@@ int sock_getsockopt(struct socket *sock
                break;
  
        case SO_BROADCAST:
 -              v.val = !!sock_flag(sk, SOCK_BROADCAST);
 +              v.val = sock_flag(sk, SOCK_BROADCAST);
                break;
  
        case SO_SNDBUF:
                break;
  
        case SO_KEEPALIVE:
 -              v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
 +              v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;
  
        case SO_TYPE:
                break;
  
        case SO_OOBINLINE:
 -              v.val = !!sock_flag(sk, SOCK_URGINLINE);
 +              v.val = sock_flag(sk, SOCK_URGINLINE);
                break;
  
        case SO_NO_CHECK:
  
        case SO_LINGER:
                lv              = sizeof(v.ling);
 -              v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
 +              v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger = sk->sk_lingertime / HZ;
                break;
  
                break;
  
        case SO_PASSCRED:
 -              v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
 +              v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
                break;
  
        case SO_PEERCRED:
                break;
  
        case SO_PASSSEC:
 -              v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
 +              v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
                break;
  
        case SO_PEERSEC:
                break;
  
        case SO_RXQ_OVFL:
 -              v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
 +              v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;
  
        case SO_WIFI_STATUS:
 -              v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
 +              v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;
  
        case SO_PEEK_OFF:
                v.val = sk->sk_peek_off;
                break;
        case SO_NOFCS:
 -              v.val = !!sock_flag(sk, SOCK_NOFCS);
 +              v.val = sock_flag(sk, SOCK_NOFCS);
                break;
        default:
                return -ENOPROTOOPT;
@@@ -1239,8 -1247,8 +1239,8 @@@ static void __sk_free(struct sock *sk
        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
  
        if (atomic_read(&sk->sk_omem_alloc))
 -              printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
 -                     __func__, atomic_read(&sk->sk_omem_alloc));
 +              pr_debug("%s: optmem leakage (%d bytes) detected\n",
 +                       __func__, atomic_read(&sk->sk_omem_alloc));
  
        if (sk->sk_peer_cred)
                put_cred(sk->sk_peer_cred);
@@@ -1526,7 -1534,7 +1526,7 @@@ struct sk_buff *sock_rmalloc(struct soc
   */
  void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
  {
 -      if ((unsigned)size <= sysctl_optmem_max &&
 +      if ((unsigned int)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
@@@ -1704,7 -1712,6 +1704,7 @@@ static void __release_sock(struct sock 
                do {
                        struct sk_buff *next = skb->next;
  
 +                      prefetch(next);
                        WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb->next = NULL;
                        sk_backlog_rcv(sk, skb);
@@@ -2425,7 -2432,7 +2425,7 @@@ static void assign_proto_idx(struct pro
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
  
        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
 -              printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
 +              pr_err("PROTO_INUSE_NR exhausted\n");
                return;
        }
  
@@@ -2455,8 -2462,8 +2455,8 @@@ int proto_register(struct proto *prot, 
                                        NULL);
  
                if (prot->slab == NULL) {
 -                      printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
 -                             prot->name);
 +                      pr_crit("%s: Can't create sock SLAB cache!\n",
 +                              prot->name);
                        goto out;
                }
  
                                                                 SLAB_HWCACHE_ALIGN, NULL);
  
                        if (prot->rsk_prot->slab == NULL) {
 -                              printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
 -                                     prot->name);
 +                              pr_crit("%s: Can't create request sock SLAB cache!\n",
 +                                      prot->name);
                                goto out_free_request_sock_slab_name;
                        }
                }
@@@ -2569,7 -2576,7 +2569,7 @@@ static char proto_method_implemented(co
  }
  static long sock_prot_memory_allocated(struct proto *proto)
  {
 -      return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
 +      return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
  }
  
  static char *sock_prot_memory_pressure(struct proto *proto)
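
The SO_SNDBUF/SO_RCVBUF hunks above replace open-coded if/else clamping with min_t()/max_t(). Below is a minimal userspace sketch of that clamping pattern; MIN_T, MAX_T, WMEM_MAX and SOCK_MIN_SNDBUF_ are illustrative stand-ins for the kernel's min_t()/max_t() macros and tunables, not the real definitions.

#include <stdint.h>
#include <stdio.h>

#define MIN_T(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))
#define MAX_T(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

#define WMEM_MAX         212992u  /* stand-in for sysctl_wmem_max */
#define SOCK_MIN_SNDBUF_ 2048u    /* stand-in for the kernel constant */

static uint32_t clamp_sndbuf(int val)
{
        /* Cap the request at the sysctl limit... */
        uint32_t v = MIN_T(uint32_t, val, WMEM_MAX);

        /* ...double it for bookkeeping overhead, and enforce a floor. */
        return MAX_T(uint32_t, v * 2, SOCK_MIN_SNDBUF_);
}

int main(void)
{
        printf("%u\n", (unsigned)clamp_sndbuf(512));     /* 2048: floored */
        printf("%u\n", (unsigned)clamp_sndbuf(1 << 20)); /* 425984: capped */
        return 0;
}

The one-liners read the same as the old branches but make the intent (clamp to a range) explicit and keep the type of the comparison fixed at u32.
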
diff --combined net/ipv4/ping.c
index 6e930c7174dd2e9d755e6c64c5e562974c33aa72,9d3044ff45b93d4a2bcfa916b1577f597863074c..2c00e8bf684d1a3272401652c343b66fa69a3e54
@@@ -51,16 -51,15 +51,16 @@@ static struct ping_table ping_table
  
  static u16 ping_port_rover;
  
 -static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
 +static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask)
  {
        int res = (num + net_hash_mix(net)) & mask;
 +
        pr_debug("hash(%d) = %d\n", num, res);
        return res;
  }
  
  static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
 -                                           struct net *net, unsigned num)
 +                                           struct net *net, unsigned int num)
  {
        return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
  }
@@@ -189,8 -188,7 +189,8 @@@ static void inet_get_ping_group_range_n
                                          gid_t *high)
  {
        gid_t *data = net->ipv4.sysctl_ping_group_range;
 -      unsigned seq;
 +      unsigned int seq;
 +
        do {
                seq = read_seqbegin(&sysctl_local_ports.lock);
  
@@@ -207,17 -205,22 +207,22 @@@ static int ping_init_sock(struct sock *
        gid_t range[2];
        struct group_info *group_info = get_current_groups();
        int i, j, count = group_info->ngroups;
+       kgid_t low, high;
  
        inet_get_ping_group_range_net(net, range, range+1);
+       low = make_kgid(&init_user_ns, range[0]);
+       high = make_kgid(&init_user_ns, range[1]);
+       if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low))
+               return -EACCES;
        if (range[0] <= group && group <= range[1])
                return 0;
  
        for (i = 0; i < group_info->nblocks; i++) {
                int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
                for (j = 0; j < cp_count; j++) {
-                       group = group_info->blocks[i][j];
-                       if (range[0] <= group && group <= range[1])
+                       kgid_t gid = group_info->blocks[i][j];
+                       if (gid_lte(low, gid) && gid_lte(gid, high))
                                return 0;
                }
  
@@@ -412,7 -415,7 +417,7 @@@ struct pingfakehdr 
        __wsum wcheck;
  };
  
 -static int ping_getfrag(void *from, char * to,
 +static int ping_getfrag(void *from, char *to,
                        int offset, int fraglen, int odd, struct sk_buff *skb)
  {
        struct pingfakehdr *pfh = (struct pingfakehdr *)from;
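
The ping_init_sock() hunk above converts the group-range check to kgid_t: both ends of the sysctl range are mapped through make_kgid() and validated before any comparison is made. A rough userspace model of that check follows; the types and helpers mirror the kernel's uidgid pattern but are reimplemented stand-ins, not taken from kernel headers.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int gid_raw_t;           /* plays the role of gid_t */
typedef struct { gid_raw_t val; } kgid;   /* plays the role of kgid_t */

#define INVALID_GID ((gid_raw_t)-1)

static kgid make_kgid_(gid_raw_t gid) { return (kgid){ gid }; }
static bool gid_valid_(kgid g)        { return g.val != INVALID_GID; }
static bool gid_lt_(kgid a, kgid b)   { return a.val < b.val; }
static bool gid_lte_(kgid a, kgid b)  { return a.val <= b.val; }

/* Mirrors the new check: validate the mapped range ends first, then
 * test membership with gid_lte() instead of raw integer compares. */
static int ping_group_ok(gid_raw_t lo, gid_raw_t hi, gid_raw_t egid)
{
        kgid low  = make_kgid_(lo);
        kgid high = make_kgid_(hi);
        kgid gid  = make_kgid_(egid);

        if (!gid_valid_(low) || !gid_valid_(high) || gid_lt_(high, low))
                return -1;      /* the kernel returns -EACCES here */
        return (gid_lte_(low, gid) && gid_lte_(gid, high)) ? 0 : -1;
}

int main(void)
{
        printf("%d\n", ping_group_ok(100, 200, 150)); /* 0: in range */
        printf("%d\n", ping_group_ok(100, 200, 99));  /* -1: below range */
        return 0;
}
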
diff --combined security/commoncap.c
index f80d116093915acb906b7bd80a9424d5d1f98f0b,ff9b113bb07cdb9a32328aa6f4cd9c3f4a2d792e..e771cb1b2d7947f0c85651b38cc7c9c1d3da11d7
@@@ -29,7 -29,6 +29,7 @@@
  #include <linux/securebits.h>
  #include <linux/user_namespace.h>
  #include <linux/binfmts.h>
 +#include <linux/personality.h>
  
  /*
   * If a non-root user executes a setuid-root binary in
@@@ -77,12 -76,12 +77,12 @@@ int cap_capable(const struct cred *cred
                int cap, int audit)
  {
        for (;;) {
-               /* The creator of the user namespace has all caps. */
-               if (targ_ns != &init_user_ns && targ_ns->creator == cred->user)
+               /* The owner of the user namespace has all caps. */
+               if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner, cred->euid))
                        return 0;
  
                /* Do we have the necessary capabilities? */
-               if (targ_ns == cred->user->user_ns)
+               if (targ_ns == cred->user_ns)
                        return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
  
                /* Have we tried all of the parent namespaces? */
@@@ -93,7 -92,7 +93,7 @@@
                 *If you have a capability in a parent user ns, then you have
                 * it over all children user namespaces as well.
                 */
-               targ_ns = targ_ns->creator->user_ns;
+               targ_ns = targ_ns->parent;
        }
  
        /* We never get here */
@@@ -137,10 -136,10 +137,10 @@@ int cap_ptrace_access_check(struct task
        rcu_read_lock();
        cred = current_cred();
        child_cred = __task_cred(child);
-       if (cred->user->user_ns == child_cred->user->user_ns &&
+       if (cred->user_ns == child_cred->user_ns &&
            cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
                goto out;
-       if (ns_capable(child_cred->user->user_ns, CAP_SYS_PTRACE))
+       if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
                goto out;
        ret = -EPERM;
  out:
@@@ -169,10 -168,10 +169,10 @@@ int cap_ptrace_traceme(struct task_stru
        rcu_read_lock();
        cred = __task_cred(parent);
        child_cred = current_cred();
-       if (cred->user->user_ns == child_cred->user->user_ns &&
+       if (cred->user_ns == child_cred->user_ns &&
            cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
                goto out;
-       if (has_ns_capability(parent, child_cred->user->user_ns, CAP_SYS_PTRACE))
+       if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
                goto out;
        ret = -EPERM;
  out:
@@@ -215,7 -214,7 +215,7 @@@ static inline int cap_inh_is_capped(voi
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
-       if (cap_capable(current_cred(), current_cred()->user->user_ns,
+       if (cap_capable(current_cred(), current_cred()->user_ns,
                        CAP_SETPCAP, SECURITY_CAP_AUDIT) == 0)
                return 0;
        return 1;
@@@ -473,19 -472,22 +473,22 @@@ int cap_bprm_set_creds(struct linux_bin
        struct cred *new = bprm->cred;
        bool effective, has_cap = false;
        int ret;
+       kuid_t root_uid;
  
        effective = false;
        ret = get_file_caps(bprm, &effective, &has_cap);
        if (ret < 0)
                return ret;
  
+       root_uid = make_kuid(new->user_ns, 0);
        if (!issecure(SECURE_NOROOT)) {
                /*
                 * If the legacy file capability is set, then don't set privs
                 * for a setuid root binary run by a non-root user.  Do set it
                 * for a root user just to cause least surprise to an admin.
                 */
-               if (has_cap && new->uid != 0 && new->euid == 0) {
+               if (has_cap && !uid_eq(new->uid, root_uid) && uid_eq(new->euid, root_uid)) {
                        warn_setuid_and_fcaps_mixed(bprm->filename);
                        goto skip;
                }
                 *
                 * If only the real uid is 0, we do not set the effective bit.
                 */
-               if (new->euid == 0 || new->uid == 0) {
+               if (uid_eq(new->euid, root_uid) || uid_eq(new->uid, root_uid)) {
                        /* pP' = (cap_bset & ~0) | (pI & ~0) */
                        new->cap_permitted = cap_combine(old->cap_bset,
                                                         old->cap_inheritable);
                }
-               if (new->euid == 0)
+               if (uid_eq(new->euid, root_uid))
                        effective = true;
        }
  skip:
  
 +      /* if we have fs caps, clear dangerous personality flags */
 +      if (!cap_issubset(new->cap_permitted, old->cap_permitted))
 +              bprm->per_clear |= PER_CLEAR_ON_SETID;
 +
 +
        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
 -       * credentials unless they have the appropriate permit
 +       * credentials unless they have the appropriate permit.
 +       *
 +       * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
-       if ((new->euid != old->uid ||
-            new->egid != old->gid ||
+       if ((!uid_eq(new->euid, old->uid) ||
+            !gid_eq(new->egid, old->gid) ||
             !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
            bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
                /* downgrade; they get no more than they had, and maybe less */
 -              if (!capable(CAP_SETUID)) {
 +              if (!capable(CAP_SETUID) ||
 +                  (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
                        new->euid = new->uid;
                        new->egid = new->gid;
                }
         */
        if (!cap_isclear(new->cap_effective)) {
                if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
-                   new->euid != 0 || new->uid != 0 ||
+                   !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) ||
                    issecure(SECURE_NOROOT)) {
                        ret = audit_log_bprm_fcaps(bprm, new, old);
                        if (ret < 0)
  int cap_bprm_secureexec(struct linux_binprm *bprm)
  {
        const struct cred *cred = current_cred();
+       kuid_t root_uid = make_kuid(cred->user_ns, 0);
  
-       if (cred->uid != 0) {
+       if (!uid_eq(cred->uid, root_uid)) {
                if (bprm->cap_effective)
                        return 1;
                if (!cap_isclear(cred->cap_permitted))
                        return 1;
        }
  
-       return (cred->euid != cred->uid ||
-               cred->egid != cred->gid);
+       return (!uid_eq(cred->euid, cred->uid) ||
+               !gid_eq(cred->egid, cred->gid));
  }
  
  /**
@@@ -677,15 -672,21 +681,21 @@@ int cap_inode_removexattr(struct dentr
   */
  static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
  {
-       if ((old->uid == 0 || old->euid == 0 || old->suid == 0) &&
-           (new->uid != 0 && new->euid != 0 && new->suid != 0) &&
+       kuid_t root_uid = make_kuid(old->user_ns, 0);
+       if ((uid_eq(old->uid, root_uid) ||
+            uid_eq(old->euid, root_uid) ||
+            uid_eq(old->suid, root_uid)) &&
+           (!uid_eq(new->uid, root_uid) &&
+            !uid_eq(new->euid, root_uid) &&
+            !uid_eq(new->suid, root_uid)) &&
            !issecure(SECURE_KEEP_CAPS)) {
                cap_clear(new->cap_permitted);
                cap_clear(new->cap_effective);
        }
-       if (old->euid == 0 && new->euid != 0)
+       if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
                cap_clear(new->cap_effective);
-       if (old->euid != 0 && new->euid == 0)
+       if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
                new->cap_effective = new->cap_permitted;
  }
  
@@@ -718,11 -719,12 +728,12 @@@ int cap_task_fix_setuid(struct cred *ne
                 *          if not, we might be a bit too harsh here.
                 */
                if (!issecure(SECURE_NO_SETUID_FIXUP)) {
-                       if (old->fsuid == 0 && new->fsuid != 0)
+                       kuid_t root_uid = make_kuid(old->user_ns, 0);
+                       if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
                                new->cap_effective =
                                        cap_drop_fs_set(new->cap_effective);
  
-                       if (old->fsuid != 0 && new->fsuid == 0)
+                       if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
                                new->cap_effective =
                                        cap_raise_fs_set(new->cap_effective,
                                                         new->cap_permitted);
@@@ -875,7 -877,7 +886,7 @@@ int cap_task_prctl(int option, unsigne
                    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))   /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))   /*[3]*/
                    || (cap_capable(current_cred(),
-                                   current_cred()->user->user_ns, CAP_SETPCAP,
+                                   current_cred()->user_ns, CAP_SETPCAP,
                                    SECURITY_CAP_AUDIT) != 0)           /*[4]*/
                        /*
                         * [1] no changing of bits that are locked
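
The commoncap.c conversion above replaces every literal uid/gid-zero test with uid_eq() against make_kuid(ns, 0). A small sketch of why the struct wrapper matters is below; the types imitate the kernel's kuid_t but the namespace argument that the real make_kuid() takes is omitted for brevity.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int uid_raw_t;           /* plays the role of uid_t */
typedef struct { uid_raw_t val; } kuid;   /* plays the role of kuid_t */

static kuid make_kuid_(uid_raw_t uid) { return (kuid){ uid }; }
static bool uid_eq_(kuid a, kuid b)   { return a.val == b.val; }

static bool is_root(kuid euid)
{
        kuid root = make_kuid_(0);

        /* `euid == 0` no longer compiles: kuid is a struct, so any
         * unconverted permission check is caught at build time. */
        return uid_eq_(euid, root);
}

int main(void)
{
        printf("%d\n", is_root(make_kuid_(0)));    /* 1 */
        printf("%d\n", is_root(make_kuid_(1000))); /* 0 */
        return 0;
}
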
diff --combined security/keys/key.c
index c9bf66ac36e0ff6d566c2d02a358f4551371de66,7e6034793af3845447e69f2673cd47150b8c37cc..50d96d4e06f235c3e8950255c4b8bd5fd64aa7d2
@@@ -253,7 -253,7 +253,7 @@@ struct key *key_alloc(struct key_type *
        quotalen = desclen + type->def_datalen;
  
        /* get hold of the key tracking for this user */
-       user = key_user_lookup(uid, cred->user->user_ns);
+       user = key_user_lookup(uid, cred->user_ns);
        if (!user)
                goto no_memory_1;
  
@@@ -954,28 -954,6 +954,28 @@@ void key_revoke(struct key *key
  }
  EXPORT_SYMBOL(key_revoke);
  
 +/**
 + * key_invalidate - Invalidate a key.
 + * @key: The key to be invalidated.
 + *
 + * Mark a key as being invalidated and have it cleaned up immediately.  The key
 + * is ignored by all searches and other operations from this point.
 + */
 +void key_invalidate(struct key *key)
 +{
 +      kenter("%d", key_serial(key));
 +
 +      key_check(key);
 +
 +      if (!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) {
 +              down_write_nested(&key->sem, 1);
 +              if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags))
 +                      key_schedule_gc_links();
 +              up_write(&key->sem);
 +      }
 +}
 +EXPORT_SYMBOL(key_invalidate);
 +
  /**
   * register_key_type - Register a type of key.
   * @ktype: The new key type.
@@@ -1002,8 -980,6 +1002,8 @@@ int register_key_type(struct key_type *
  
        /* store the type */
        list_add(&ktype->link, &key_types_list);
 +
 +      pr_notice("Key type %s registered\n", ktype->name);
        ret = 0;
  
  out:
@@@ -1026,7 -1002,6 +1026,7 @@@ void unregister_key_type(struct key_typ
        list_del_init(&ktype->link);
        downgrade_write(&key_types_sem);
        key_gc_keytype(ktype);
 +      pr_notice("Key type %s unregistered\n", ktype->name);
        up_read(&key_types_sem);
  }
  EXPORT_SYMBOL(unregister_key_type);
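
The new key_invalidate() above does a cheap unlocked test_bit() pre-check and only then takes the key semaphore and uses test_and_set_bit(), so garbage collection is scheduled at most once. A userspace model of that double-check pattern follows, using C11 atomics and a pthread mutex as stand-ins for the kernel's bitops and rwsem; all names are illustrative.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

#define FLAG_INVALIDATED (1UL << 0)

static atomic_ulong key_flags;
static pthread_mutex_t key_sem = PTHREAD_MUTEX_INITIALIZER;

static void schedule_gc(void) { puts("gc scheduled"); }

static void invalidate(void)
{
        /* Fast path: already invalidated, nothing to do. */
        if (atomic_load(&key_flags) & FLAG_INVALIDATED)
                return;

        pthread_mutex_lock(&key_sem);
        /* fetch_or returns the old value, so only the first caller
         * to flip the bit schedules garbage collection. */
        if (!(atomic_fetch_or(&key_flags, FLAG_INVALIDATED) & FLAG_INVALIDATED))
                schedule_gc();
        pthread_mutex_unlock(&key_sem);
}

int main(void)
{
        invalidate();   /* prints "gc scheduled" */
        invalidate();   /* no-op */
        return 0;
}
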
diff --combined security/keys/permission.c
index 57d96363d7f1ec99b555df2300fd2d938f75617b,5442900d2929c3da7cf70f8a7acd40fa115df969..0b4d019e027d187d42272bbc906b70ca106776b6
@@@ -36,7 -36,7 +36,7 @@@ int key_task_permission(const key_ref_
  
        key = key_ref_to_ptr(key_ref);
  
-       if (key->user->user_ns != cred->user->user_ns)
+       if (key->user->user_ns != cred->user_ns)
                goto use_other_perms;
  
        /* use the second 8-bits of permissions for keys the caller owns */
@@@ -53,7 -53,8 +53,8 @@@
                        goto use_these_perms;
                }
  
-               ret = groups_search(cred->group_info, key->gid);
+               ret = groups_search(cred->group_info,
+                                   make_kgid(current_user_ns(), key->gid));
                if (ret) {
                        kperm = key->perm >> 8;
                        goto use_these_perms;
@@@ -87,29 -88,32 +88,29 @@@ EXPORT_SYMBOL(key_task_permission)
   * key_validate - Validate a key.
   * @key: The key to be validated.
   *
 - * Check that a key is valid, returning 0 if the key is okay, -EKEYREVOKED if
 - * the key's type has been removed or if the key has been revoked or
 - * -EKEYEXPIRED if the key has expired.
 + * Check that a key is valid, returning 0 if the key is okay, -ENOKEY if the
 + * key is invalidated, -EKEYREVOKED if the key's type has been removed or if
 + * the key has been revoked or -EKEYEXPIRED if the key has expired.
   */
 -int key_validate(struct key *key)
 +int key_validate(const struct key *key)
  {
 -      struct timespec now;
 -      int ret = 0;
 -
 -      if (key) {
 -              /* check it's still accessible */
 -              ret = -EKEYREVOKED;
 -              if (test_bit(KEY_FLAG_REVOKED, &key->flags) ||
 -                  test_bit(KEY_FLAG_DEAD, &key->flags))
 -                      goto error;
 -
 -              /* check it hasn't expired */
 -              ret = 0;
 -              if (key->expiry) {
 -                      now = current_kernel_time();
 -                      if (now.tv_sec >= key->expiry)
 -                              ret = -EKEYEXPIRED;
 -              }
 +      unsigned long flags = key->flags;
 +
 +      if (flags & (1 << KEY_FLAG_INVALIDATED))
 +              return -ENOKEY;
 +
 +      /* check it's still accessible */
 +      if (flags & ((1 << KEY_FLAG_REVOKED) |
 +                   (1 << KEY_FLAG_DEAD)))
 +              return -EKEYREVOKED;
 +
 +      /* check it hasn't expired */
 +      if (key->expiry) {
 +              struct timespec now = current_kernel_time();
 +              if (now.tv_sec >= key->expiry)
 +                      return -EKEYEXPIRED;
        }
  
 -error:
 -      return ret;
 +      return 0;
  }
  EXPORT_SYMBOL(key_validate);
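
The rewritten key_validate() above snapshots key->flags once and then tests bit masks with early returns, dropping the old error-label flow. The shape of that control flow, sketched with stand-in flag bits and error values:

#include <stdio.h>

#define F_INVALIDATED (1UL << 0)
#define F_REVOKED     (1UL << 1)
#define F_DEAD        (1UL << 2)

#define ENOKEY_      1
#define EKEYREVOKED_ 2

/* One read of the flag word, then ordered mask tests: invalidation
 * takes precedence over revocation/death, as in the new code. */
static int validate(unsigned long flags)
{
        if (flags & F_INVALIDATED)
                return -ENOKEY_;
        if (flags & (F_REVOKED | F_DEAD))
                return -EKEYREVOKED_;
        return 0;
}

int main(void)
{
        printf("%d\n", validate(0));          /*  0: valid */
        printf("%d\n", validate(F_REVOKED));  /* -2: revoked */
        return 0;
}
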
diff --combined security/keys/process_keys.c
index e137fcd7042c933ed965e56e79847007add678e1,447fb7618ff38d4bda9d7d3282243c31be8f8c5c..d71056db7b67501a085fd4a8feda5c841dd83094
@@@ -732,8 -732,6 +732,8 @@@ try_again
        if (ret < 0)
                goto invalid_key;
  
 +      key->last_used_at = current_kernel_time().tv_sec;
 +
  error:
        put_cred(cred);
        return key_ref;
@@@ -860,7 -858,7 +860,7 @@@ void key_replace_session_keyring(void
        new-> sgid      = old-> sgid;
        new->fsgid      = old->fsgid;
        new->user       = get_uid(old->user);
-       new->user_ns    = new->user->user_ns;
+       new->user_ns    = get_user_ns(new->user_ns);
        new->group_info = get_group_info(old->group_info);
  
        new->securebits = old->securebits;
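
The final hunk changes key_replace_session_keyring() to take its own reference on the user namespace rather than borrowing one through struct user. A sketch of that general pattern, assuming illustrative types: when a cred copy aliases a refcounted pointer, the copy must pin it.

#include <stdatomic.h>

struct user_ns { atomic_int count; };

static struct user_ns *get_user_ns_(struct user_ns *ns)
{
        atomic_fetch_add(&ns->count, 1);        /* pin the namespace */
        return ns;
}

struct cred { struct user_ns *user_ns; };

/* The copied cred aliases the old namespace pointer without a
 * reference of its own; taking one keeps the namespace alive for
 * the lifetime of the new cred. */
static void pin_cred_ns(struct cred *new)
{
        new->user_ns = get_user_ns_(new->user_ns);
}

int main(void)
{
        struct user_ns ns = { 1 };
        struct cred new_cred = { &ns };

        pin_cred_ns(&new_cred);
        return atomic_load(&ns.count) == 2 ? 0 : 1;
}
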