Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm...
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 17 Dec 2012 23:44:47 +0000 (15:44 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 17 Dec 2012 23:44:47 +0000 (15:44 -0800)
Pull user namespace changes from Eric Biederman:
 "While small this set of changes is very significant with respect to
  containers in general and user namespaces in particular.  The user
  space interface is now complete.

  This set of changes adds support for unprivileged users to create user
  namespaces and as a user namespace root to create other namespaces.
  The tyranny of supporting suid root preventing unprivileged users from
  using cool new kernel features is broken.

  This set of changes completes the work on setns, adding support for
  the pid, user, mount namespaces.

  This set of changes includes a bunch of basic pid namespace
  cleanups/simplifications.  Of particular significance is the rework of
  the pid namespace cleanup so it no longer requires sending out
  tendrils into all kinds of unexpected cleanup paths for operation.  At
  least one case of broken error handling is fixed by this cleanup.

  The files under /proc/<pid>/ns/ have been converted from regular files
  to magic symlinks which prevents incorrect caching by the VFS,
  ensuring the files always refer to the namespace the process is
  currently using and ensuring that the ptrace_may_access permission
  checks are always applied.

  The files under /proc/<pid>/ns/ have been given stable inode numbers
  so it is now possible to see if different processes share the same
  namespaces.

  Coming through David Miller's net tree are changes that relax many of
  the permission checks in the networking stack, allowing the user
  namespace root to usefully use the networking stack.  Similar changes
  for the mount namespace and the pid namespace are coming through my
  tree.

  Two small changes to add user namespace support were committed here and
  in David Miller's -net tree so that I could complete the work on the
  /proc/<pid>/ns/ files in this tree.

  Work remains to make it safe to build user namespaces together with
  9p, afs, ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs, so
  the Kconfig guard remains in place, preventing user namespaces from
  being built when any of those filesystems are enabled.

  Future design work remains to allow root users outside of the initial
  user namespace to mount more than just /proc and /sys."
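
A minimal sketch of the new user-space interface described above (illustrative
code, not taken from this tree; error handling and the single-entry uid/gid
mappings are simplified): an unprivileged process creates a user namespace,
maps its own uid/gid to 0, and then, as user-namespace root, unshares a mount
namespace.

/* Hedged sketch: unprivileged creation of a user namespace, mapping the
 * caller's uid/gid to 0 inside it.  The one-line mappings are illustrative;
 * much newer kernels also require writing /proc/self/setgroups before
 * gid_map, but that file postdates this merge.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_map(const char *path, const char *map)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, map, strlen(map)) != (ssize_t)strlen(map))
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	char map[64];
	unsigned uid = getuid(), gid = getgid();

	if (unshare(CLONE_NEWUSER) < 0) {	/* no privilege required now */
		perror("unshare(CLONE_NEWUSER)");
		return 1;
	}
	/* Map our outside uid/gid to 0, making us root in the new userns. */
	snprintf(map, sizeof(map), "0 %u 1\n", uid);
	write_map("/proc/self/uid_map", map);
	snprintf(map, sizeof(map), "0 %u 1\n", gid);
	write_map("/proc/self/gid_map", map);

	/* As user-namespace root we may now create other namespaces too. */
	if (unshare(CLONE_NEWNS) < 0)
		perror("unshare(CLONE_NEWNS)");

	return 0;
}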

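The completed setns() support and the stable /proc/<pid>/ns/ inode numbers
combine naturally.  The sketch below (again illustrative, not code from the
series) compares mount namespaces by stat()ing the ns files and joins the
target's namespace only if it differs:

/* Hedged sketch: compare namespaces via the stable inode numbers of
 * /proc/<pid>/ns/mnt and join the target's mount namespace with setns().
 * Whether the join is permitted depends on capabilities in the user
 * namespace that owns the target mount namespace.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	struct stat self_st, target_st;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/mnt", argv[1]);

	if (stat("/proc/self/ns/mnt", &self_st) || stat(path, &target_st)) {
		perror("stat");
		return 1;
	}
	/* Same device and inode number means the same mount namespace. */
	if (self_st.st_dev == target_st.st_dev &&
	    self_st.st_ino == target_st.st_ino) {
		printf("already in the same mount namespace\n");
		return 0;
	}

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (setns(fd, CLONE_NEWNS) < 0) {
		perror("setns");
		return 1;
	}
	close(fd);
	execlp("sh", "sh", (char *)NULL);	/* shell in the joined namespace */
	perror("execlp");
	return 1;
}
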
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits)
  proc: Usable inode numbers for the namespace file descriptors.
  proc: Fix the namespace inode permission checks.
  proc: Generalize proc inode allocation
  userns: Allow unprivileged mounts of proc and sysfs
  userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file
  procfs: Print task uids and gids in the userns that opened the proc file
  userns: Implement unshare of the user namespace
  userns: Implement proc namespace operations
  userns: Kill task_user_ns
  userns: Make create_new_namespaces take a user_ns parameter
  userns: Allow unprivileged use of setns.
  userns: Allow unprivileged users to create new namespaces
  userns: Allow setting a userns mapping to your current uid.
  userns: Allow chown and setgid preservation
  userns: Allow unprivileged users to create user namespaces.
  userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped
  userns: fix return value on mntns_install() failure
  vfs: Allow unprivileged manipulation of the mount namespace.
  vfs: Only support slave subtrees across different user namespaces
  vfs: Add a user namespace reference from struct mnt_namespace
  ...

17 files changed:
arch/um/drivers/mconsole_kern.c
drivers/staging/android/binder.c
fs/exec.c
fs/proc/array.c
fs/proc/base.c
include/linux/cred.h
include/linux/fs.h
init/Kconfig
init/main.c
kernel/cgroup.c
kernel/events/core.c
kernel/exit.c
kernel/fork.c
kernel/pid.c
kernel/sched/core.c
kernel/signal.c
security/yama/yama_lsm.c

index 49e3b49e552f7f81dea63e708bbb0abf1e32a3f4,7fc71c628267faadd44e31709a7cc95ce4983a3b..4bd82ac0210f27c8ef7c755480399ad9dce2f15b
@@@ -123,7 -123,7 +123,7 @@@ void mconsole_log(struct mc_request *re
  
  void mconsole_proc(struct mc_request *req)
  {
-       struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
+       struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
        char *buf;
        int len;
        struct file *file;
@@@ -648,7 -648,7 +648,7 @@@ static void stack_proc(void *arg
        struct task_struct *from = current, *to = arg;
  
        to->thread.saved_task = from;
 -      rcu_switch(from, to);
 +      rcu_user_hooks_switch(from, to);
        switch_to(from, to, from);
  }
  
index 4a36e9ab8cf7d5ffa66723ca3fd7be255c13aa72,a97bbcd1c9ea3f9bbeead7dba687b804fdeef318..2d12e8a1f82ee06b89f8be1127e644c16ead8994
@@@ -15,8 -15,6 +15,8 @@@
   *
   */
  
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <asm/cacheflush.h>
  #include <linux/fdtable.h>
  #include <linux/file.h>
  #include <linux/uaccess.h>
  #include <linux/vmalloc.h>
  #include <linux/slab.h>
+ #include <linux/pid_namespace.h>
  
  #include "binder.h"
 +#include "binder_trace.h"
  
 -static DEFINE_MUTEX(binder_lock);
 +static DEFINE_MUTEX(binder_main_lock);
  static DEFINE_MUTEX(binder_deferred_lock);
  static DEFINE_MUTEX(binder_mmap_lock);
  
@@@ -414,19 -412,6 +415,19 @@@ static long task_close_fd(struct binder
        return retval;
  }
  
 +static inline void binder_lock(const char *tag)
 +{
 +      trace_binder_lock(tag);
 +      mutex_lock(&binder_main_lock);
 +      trace_binder_locked(tag);
 +}
 +
 +static inline void binder_unlock(const char *tag)
 +{
 +      trace_binder_unlock(tag);
 +      mutex_unlock(&binder_main_lock);
 +}
 +
  static void binder_set_nice(long nice)
  {
        long min_nice;
        }
        min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur;
        binder_debug(BINDER_DEBUG_PRIORITY_CAP,
 -                   "binder: %d: nice value %ld not allowed use "
 -                   "%ld instead\n", current->pid, nice, min_nice);
 +                   "%d: nice value %ld not allowed use %ld instead\n",
 +                    current->pid, nice, min_nice);
        set_user_nice(current, min_nice);
        if (min_nice < 20)
                return;
 -      binder_user_error("binder: %d RLIMIT_NICE not set\n", current->pid);
 +      binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
  }
  
  static size_t binder_buffer_size(struct binder_proc *proc,
@@@ -468,8 -453,8 +469,8 @@@ static void binder_insert_free_buffer(s
        new_buffer_size = binder_buffer_size(proc, new_buffer);
  
        binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                   "binder: %d: add free buffer, size %zd, "
 -                   "at %p\n", proc->pid, new_buffer_size, new_buffer);
 +                   "%d: add free buffer, size %zd, at %p\n",
 +                    proc->pid, new_buffer_size, new_buffer);
  
        while (*p) {
                parent = *p;
@@@ -547,14 -532,12 +548,14 @@@ static int binder_update_page_range(str
        struct mm_struct *mm;
  
        binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                   "binder: %d: %s pages %p-%p\n", proc->pid,
 +                   "%d: %s pages %p-%p\n", proc->pid,
                     allocate ? "allocate" : "free", start, end);
  
        if (end <= start)
                return 0;
  
 +      trace_binder_update_page_range(proc, allocate, start, end);
 +
        if (vma)
                mm = NULL;
        else
                down_write(&mm->mmap_sem);
                vma = proc->vma;
                if (vma && mm != proc->vma_vm_mm) {
 -                      pr_err("binder: %d: vma mm and task mm mismatch\n",
 +                      pr_err("%d: vma mm and task mm mismatch\n",
                                proc->pid);
                        vma = NULL;
                }
                goto free_range;
  
        if (vma == NULL) {
 -              pr_err("binder: %d: binder_alloc_buf failed to "
 -                     "map pages in userspace, no vma\n", proc->pid);
 +              pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
 +                      proc->pid);
                goto err_no_vma;
        }
  
                BUG_ON(*page);
                *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
                if (*page == NULL) {
 -                      pr_err("binder: %d: binder_alloc_buf failed "
 -                             "for page at %p\n", proc->pid, page_addr);
 +                      pr_err("%d: binder_alloc_buf failed for page at %p\n",
 +                              proc->pid, page_addr);
                        goto err_alloc_page_failed;
                }
                tmp_area.addr = page_addr;
                page_array_ptr = page;
                ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr);
                if (ret) {
 -                      pr_err("binder: %d: binder_alloc_buf failed "
 -                             "to map page at %p in kernel\n",
 +                      pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n",
                               proc->pid, page_addr);
                        goto err_map_kernel_failed;
                }
                        (uintptr_t)page_addr + proc->user_buffer_offset;
                ret = vm_insert_page(vma, user_page_addr, page[0]);
                if (ret) {
 -                      pr_err("binder: %d: binder_alloc_buf failed "
 -                             "to map page at %lx in userspace\n",
 +                      pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n",
                               proc->pid, user_page_addr);
                        goto err_vm_insert_page_failed;
                }
@@@ -652,7 -637,7 +653,7 @@@ static struct binder_buffer *binder_all
        size_t size;
  
        if (proc->vma == NULL) {
 -              pr_err("binder: %d: binder_alloc_buf, no vma\n",
 +              pr_err("%d: binder_alloc_buf, no vma\n",
                       proc->pid);
                return NULL;
        }
                ALIGN(offsets_size, sizeof(void *));
  
        if (size < data_size || size < offsets_size) {
 -              binder_user_error("binder: %d: got transaction with invalid "
 -                      "size %zd-%zd\n", proc->pid, data_size, offsets_size);
 +              binder_user_error("%d: got transaction with invalid size %zd-%zd\n",
 +                              proc->pid, data_size, offsets_size);
                return NULL;
        }
  
        if (is_async &&
            proc->free_async_space < size + sizeof(struct binder_buffer)) {
                binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                           "binder: %d: binder_alloc_buf size %zd"
 -                           "failed, no async space left\n", proc->pid, size);
 +                           "%d: binder_alloc_buf size %zd failed, no async space left\n",
 +                            proc->pid, size);
                return NULL;
        }
  
                }
        }
        if (best_fit == NULL) {
 -              pr_err("binder: %d: binder_alloc_buf size %zd failed, "
 -                     "no address space\n", proc->pid, size);
 +              pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
 +                      proc->pid, size);
                return NULL;
        }
        if (n == NULL) {
        }
  
        binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                   "binder: %d: binder_alloc_buf size %zd got buff"
 -                   "er %p size %zd\n", proc->pid, size, buffer, buffer_size);
 +                   "%d: binder_alloc_buf size %zd got buffer %p size %zd\n",
 +                    proc->pid, size, buffer, buffer_size);
  
        has_page_addr =
                (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK);
                binder_insert_free_buffer(proc, new_buffer);
        }
        binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                   "binder: %d: binder_alloc_buf size %zd got "
 -                   "%p\n", proc->pid, size, buffer);
 +                   "%d: binder_alloc_buf size %zd got %p\n",
 +                    proc->pid, size, buffer);
        buffer->data_size = data_size;
        buffer->offsets_size = offsets_size;
        buffer->async_transaction = is_async;
        if (is_async) {
                proc->free_async_space -= size + sizeof(struct binder_buffer);
                binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
 -                           "binder: %d: binder_alloc_buf size %zd "
 -                           "async free %zd\n", proc->pid, size,
 -                           proc->free_async_space);
 +                           "%d: binder_alloc_buf size %zd async free %zd\n",
 +                            proc->pid, size, proc->free_async_space);
        }
  
        return buffer;
@@@ -769,8 -755,8 +770,8 @@@ static void binder_delete_free_buffer(s
                if (buffer_end_page(prev) == buffer_end_page(buffer))
                        free_page_end = 0;
                binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                           "binder: %d: merge free, buffer %p "
 -                           "share page with %p\n", proc->pid, buffer, prev);
 +                           "%d: merge free, buffer %p share page with %p\n",
 +                            proc->pid, buffer, prev);
        }
  
        if (!list_is_last(&buffer->entry, &proc->buffers)) {
                            buffer_start_page(buffer))
                                free_page_start = 0;
                        binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                                   "binder: %d: merge free, buffer"
 -                                   " %p share page with %p\n", proc->pid,
 -                                   buffer, prev);
 +                                   "%d: merge free, buffer %p share page with %p\n",
 +                                    proc->pid, buffer, prev);
                }
        }
        list_del(&buffer->entry);
        if (free_page_start || free_page_end) {
                binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                           "binder: %d: merge free, buffer %p do "
 -                           "not share page%s%s with with %p or %p\n",
 +                           "%d: merge free, buffer %p do not share page%s%s with with %p or %p\n",
                             proc->pid, buffer, free_page_start ? "" : " end",
                             free_page_end ? "" : " start", prev, next);
                binder_update_page_range(proc, 0, free_page_start ?
@@@ -810,8 -798,8 +811,8 @@@ static void binder_free_buf(struct bind
                ALIGN(buffer->offsets_size, sizeof(void *));
  
        binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                   "binder: %d: binder_free_buf %p size %zd buffer"
 -                   "_size %zd\n", proc->pid, buffer, size, buffer_size);
 +                   "%d: binder_free_buf %p size %zd buffer_size %zd\n",
 +                    proc->pid, buffer, size, buffer_size);
  
        BUG_ON(buffer->free);
        BUG_ON(size > buffer_size);
                proc->free_async_space += size + sizeof(struct binder_buffer);
  
                binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
 -                           "binder: %d: binder_free_buf size %zd "
 -                           "async free %zd\n", proc->pid, size,
 -                           proc->free_async_space);
 +                           "%d: binder_free_buf size %zd async free %zd\n",
 +                            proc->pid, size, proc->free_async_space);
        }
  
        binder_update_page_range(proc, 0,
@@@ -906,7 -895,7 +907,7 @@@ static struct binder_node *binder_new_n
        INIT_LIST_HEAD(&node->work.entry);
        INIT_LIST_HEAD(&node->async_todo);
        binder_debug(BINDER_DEBUG_INTERNAL_REFS,
 -                   "binder: %d:%d node %d u%p c%p created\n",
 +                   "%d:%d node %d u%p c%p created\n",
                     proc->pid, current->pid, node->debug_id,
                     node->ptr, node->cookie);
        return node;
@@@ -921,8 -910,8 +922,8 @@@ static int binder_inc_node(struct binde
                            node->internal_strong_refs == 0 &&
                            !(node == binder_context_mgr_node &&
                            node->has_strong_ref)) {
 -                              pr_err("binder: invalid inc strong "
 -                                      "node for %d\n", node->debug_id);
 +                              pr_err("invalid inc strong node for %d\n",
 +                                      node->debug_id);
                                return -EINVAL;
                        }
                        node->internal_strong_refs++;
                        node->local_weak_refs++;
                if (!node->has_weak_ref && list_empty(&node->work.entry)) {
                        if (target_list == NULL) {
 -                              pr_err("binder: invalid inc weak node "
 -                                      "for %d\n", node->debug_id);
 +                              pr_err("invalid inc weak node for %d\n",
 +                                      node->debug_id);
                                return -EINVAL;
                        }
                        list_add_tail(&node->work.entry, target_list);
@@@ -974,12 -963,12 +975,12 @@@ static int binder_dec_node(struct binde
                        if (node->proc) {
                                rb_erase(&node->rb_node, &node->proc->nodes);
                                binder_debug(BINDER_DEBUG_INTERNAL_REFS,
 -                                           "binder: refless node %d deleted\n",
 +                                           "refless node %d deleted\n",
                                             node->debug_id);
                        } else {
                                hlist_del(&node->dead_node);
                                binder_debug(BINDER_DEBUG_INTERNAL_REFS,
 -                                           "binder: dead node %d deleted\n",
 +                                           "dead node %d deleted\n",
                                             node->debug_id);
                        }
                        kfree(node);
@@@ -1065,13 -1054,14 +1066,13 @@@ static struct binder_ref *binder_get_re
                hlist_add_head(&new_ref->node_entry, &node->refs);
  
                binder_debug(BINDER_DEBUG_INTERNAL_REFS,
 -                           "binder: %d new ref %d desc %d for "
 -                           "node %d\n", proc->pid, new_ref->debug_id,
 -                           new_ref->desc, node->debug_id);
 +                           "%d new ref %d desc %d for node %d\n",
 +                            proc->pid, new_ref->debug_id, new_ref->desc,
 +                            node->debug_id);
        } else {
                binder_debug(BINDER_DEBUG_INTERNAL_REFS,
 -                           "binder: %d new ref %d desc %d for "
 -                           "dead node\n", proc->pid, new_ref->debug_id,
 -                            new_ref->desc);
 +                           "%d new ref %d desc %d for dead node\n",
 +                            proc->pid, new_ref->debug_id, new_ref->desc);
        }
        return new_ref;
  }
  static void binder_delete_ref(struct binder_ref *ref)
  {
        binder_debug(BINDER_DEBUG_INTERNAL_REFS,
 -                   "binder: %d delete ref %d desc %d for "
 -                   "node %d\n", ref->proc->pid, ref->debug_id,
 -                   ref->desc, ref->node->debug_id);
 +                   "%d delete ref %d desc %d for node %d\n",
 +                    ref->proc->pid, ref->debug_id, ref->desc,
 +                    ref->node->debug_id);
  
        rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc);
        rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node);
        binder_dec_node(ref->node, 0, 1);
        if (ref->death) {
                binder_debug(BINDER_DEBUG_DEAD_BINDER,
 -                           "binder: %d delete ref %d desc %d "
 -                           "has death notification\n", ref->proc->pid,
 -                           ref->debug_id, ref->desc);
 +                           "%d delete ref %d desc %d has death notification\n",
 +                            ref->proc->pid, ref->debug_id, ref->desc);
                list_del(&ref->death->work.entry);
                kfree(ref->death);
                binder_stats_deleted(BINDER_STAT_DEATH);
@@@ -1128,7 -1119,8 +1129,7 @@@ static int binder_dec_ref(struct binder
  {
        if (strong) {
                if (ref->strong == 0) {
 -                      binder_user_error("binder: %d invalid dec strong, "
 -                                        "ref %d desc %d s %d w %d\n",
 +                      binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n",
                                          ref->proc->pid, ref->debug_id,
                                          ref->desc, ref->strong, ref->weak);
                        return -EINVAL;
                }
        } else {
                if (ref->weak == 0) {
 -                      binder_user_error("binder: %d invalid dec weak, "
 -                                        "ref %d desc %d s %d w %d\n",
 +                      binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n",
                                          ref->proc->pid, ref->debug_id,
                                          ref->desc, ref->strong, ref->weak);
                        return -EINVAL;
@@@ -1187,7 -1180,8 +1188,7 @@@ static void binder_send_failed_reply(st
                        }
                        if (target_thread->return_error == BR_OK) {
                                binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
 -                                           "binder: send failed reply for "
 -                                           "transaction %d to %d:%d\n",
 +                                           "send failed reply for transaction %d to %d:%d\n",
                                              t->debug_id, target_thread->proc->pid,
                                              target_thread->pid);
  
                                target_thread->return_error = error_code;
                                wake_up_interruptible(&target_thread->wait);
                        } else {
 -                              pr_err("binder: reply failed, target "
 -                                      "thread, %d:%d, has error code %d "
 -                                      "already\n", target_thread->proc->pid,
 +                              pr_err("reply failed, target thread, %d:%d, has error code %d already\n",
 +                                      target_thread->proc->pid,
                                        target_thread->pid,
                                        target_thread->return_error);
                        }
                        struct binder_transaction *next = t->from_parent;
  
                        binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
 -                                   "binder: send failed reply "
 -                                   "for transaction %d, target dead\n",
 +                                   "send failed reply for transaction %d, target dead\n",
                                     t->debug_id);
  
                        binder_pop_transaction(target_thread, t);
                        if (next == NULL) {
                                binder_debug(BINDER_DEBUG_DEAD_BINDER,
 -                                           "binder: reply failed,"
 -                                           " no target thread at root\n");
 +                                           "reply failed, no target thread at root\n");
                                return;
                        }
                        t = next;
                        binder_debug(BINDER_DEBUG_DEAD_BINDER,
 -                                   "binder: reply failed, no target "
 -                                   "thread -- retry %d\n", t->debug_id);
 +                                   "reply failed, no target thread -- retry %d\n",
 +                                    t->debug_id);
                }
        }
  }
@@@ -1230,7 -1227,7 +1231,7 @@@ static void binder_transaction_buffer_r
        int debug_id = buffer->debug_id;
  
        binder_debug(BINDER_DEBUG_TRANSACTION,
 -                   "binder: %d buffer release %d, size %zd-%zd, failed at %p\n",
 +                   "%d buffer release %d, size %zd-%zd, failed at %p\n",
                     proc->pid, buffer->debug_id,
                     buffer->data_size, buffer->offsets_size, failed_at);
  
                if (*offp > buffer->data_size - sizeof(*fp) ||
                    buffer->data_size < sizeof(*fp) ||
                    !IS_ALIGNED(*offp, sizeof(void *))) {
 -                      pr_err("binder: transaction release %d bad"
 -                                      "offset %zd, size %zd\n", debug_id,
 -                                      *offp, buffer->data_size);
 +                      pr_err("transaction release %d bad offset %zd, size %zd\n",
 +                       debug_id, *offp, buffer->data_size);
                        continue;
                }
                fp = (struct flat_binder_object *)(buffer->data + *offp);
                case BINDER_TYPE_WEAK_BINDER: {
                        struct binder_node *node = binder_get_node(proc, fp->binder);
                        if (node == NULL) {
 -                              pr_err("binder: transaction release %d"
 -                                     " bad node %p\n", debug_id, fp->binder);
 +                              pr_err("transaction release %d bad node %p\n",
 +                                      debug_id, fp->binder);
                                break;
                        }
                        binder_debug(BINDER_DEBUG_TRANSACTION,
                case BINDER_TYPE_WEAK_HANDLE: {
                        struct binder_ref *ref = binder_get_ref(proc, fp->handle);
                        if (ref == NULL) {
 -                              pr_err("binder: transaction release %d"
 -                                     " bad handle %ld\n", debug_id,
 -                                     fp->handle);
 +                              pr_err("transaction release %d bad handle %ld\n",
 +                               debug_id, fp->handle);
                                break;
                        }
                        binder_debug(BINDER_DEBUG_TRANSACTION,
                        break;
  
                default:
 -                      pr_err("binder: transaction release %d bad "
 -                             "object type %lx\n", debug_id, fp->type);
 +                      pr_err("transaction release %d bad object type %lx\n",
 +                              debug_id, fp->type);
                        break;
                }
        }
@@@ -1322,14 -1321,17 +1323,14 @@@ static void binder_transaction(struct b
        if (reply) {
                in_reply_to = thread->transaction_stack;
                if (in_reply_to == NULL) {
 -                      binder_user_error("binder: %d:%d got reply transaction "
 -                                        "with no transaction stack\n",
 +                      binder_user_error("%d:%d got reply transaction with no transaction stack\n",
                                          proc->pid, thread->pid);
                        return_error = BR_FAILED_REPLY;
                        goto err_empty_call_stack;
                }
                binder_set_nice(in_reply_to->saved_priority);
                if (in_reply_to->to_thread != thread) {
 -                      binder_user_error("binder: %d:%d got reply transaction "
 -                              "with bad transaction stack,"
 -                              " transaction %d has target %d:%d\n",
 +                      binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n",
                                proc->pid, thread->pid, in_reply_to->debug_id,
                                in_reply_to->to_proc ?
                                in_reply_to->to_proc->pid : 0,
                        goto err_dead_binder;
                }
                if (target_thread->transaction_stack != in_reply_to) {
 -                      binder_user_error("binder: %d:%d got reply transaction "
 -                              "with bad target transaction stack %d, "
 -                              "expected %d\n",
 +                      binder_user_error("%d:%d got reply transaction with bad target transaction stack %d, expected %d\n",
                                proc->pid, thread->pid,
                                target_thread->transaction_stack ?
                                target_thread->transaction_stack->debug_id : 0,
                        struct binder_ref *ref;
                        ref = binder_get_ref(proc, tr->target.handle);
                        if (ref == NULL) {
 -                              binder_user_error("binder: %d:%d got "
 -                                      "transaction to invalid handle\n",
 +                              binder_user_error("%d:%d got transaction to invalid handle\n",
                                        proc->pid, thread->pid);
                                return_error = BR_FAILED_REPLY;
                                goto err_invalid_target_handle;
                        struct binder_transaction *tmp;
                        tmp = thread->transaction_stack;
                        if (tmp->to_thread != thread) {
 -                              binder_user_error("binder: %d:%d got new "
 -                                      "transaction with bad transaction stack"
 -                                      ", transaction %d has target %d:%d\n",
 +                              binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n",
                                        proc->pid, thread->pid, tmp->debug_id,
                                        tmp->to_proc ? tmp->to_proc->pid : 0,
                                        tmp->to_thread ?
  
        if (reply)
                binder_debug(BINDER_DEBUG_TRANSACTION,
 -                           "binder: %d:%d BC_REPLY %d -> %d:%d, "
 -                           "data %p-%p size %zd-%zd\n",
 +                           "%d:%d BC_REPLY %d -> %d:%d, data %p-%p size %zd-%zd\n",
                             proc->pid, thread->pid, t->debug_id,
                             target_proc->pid, target_thread->pid,
                             tr->data.ptr.buffer, tr->data.ptr.offsets,
                             tr->data_size, tr->offsets_size);
        else
                binder_debug(BINDER_DEBUG_TRANSACTION,
 -                           "binder: %d:%d BC_TRANSACTION %d -> "
 -                           "%d - node %d, data %p-%p size %zd-%zd\n",
 +                           "%d:%d BC_TRANSACTION %d -> %d - node %d, data %p-%p size %zd-%zd\n",
                             proc->pid, thread->pid, t->debug_id,
                             target_proc->pid, target_node->debug_id,
                             tr->data.ptr.buffer, tr->data.ptr.offsets,
        t->code = tr->code;
        t->flags = tr->flags;
        t->priority = task_nice(current);
 +
 +      trace_binder_transaction(reply, t, target_node);
 +
        t->buffer = binder_alloc_buf(target_proc, tr->data_size,
                tr->offsets_size, !reply && (t->flags & TF_ONE_WAY));
        if (t->buffer == NULL) {
        t->buffer->debug_id = t->debug_id;
        t->buffer->transaction = t;
        t->buffer->target_node = target_node;
 +      trace_binder_transaction_alloc_buf(t->buffer);
        if (target_node)
                binder_inc_node(target_node, 1, 0, NULL);
  
        offp = (size_t *)(t->buffer->data + ALIGN(tr->data_size, sizeof(void *)));
  
        if (copy_from_user(t->buffer->data, tr->data.ptr.buffer, tr->data_size)) {
 -              binder_user_error("binder: %d:%d got transaction with invalid "
 -                      "data ptr\n", proc->pid, thread->pid);
 +              binder_user_error("%d:%d got transaction with invalid data ptr\n",
 +                              proc->pid, thread->pid);
                return_error = BR_FAILED_REPLY;
                goto err_copy_data_failed;
        }
        if (copy_from_user(offp, tr->data.ptr.offsets, tr->offsets_size)) {
 -              binder_user_error("binder: %d:%d got transaction with invalid "
 -                      "offsets ptr\n", proc->pid, thread->pid);
 +              binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
 +                              proc->pid, thread->pid);
                return_error = BR_FAILED_REPLY;
                goto err_copy_data_failed;
        }
        if (!IS_ALIGNED(tr->offsets_size, sizeof(size_t))) {
 -              binder_user_error("binder: %d:%d got transaction with "
 -                      "invalid offsets size, %zd\n",
 -                      proc->pid, thread->pid, tr->offsets_size);
 +              binder_user_error("%d:%d got transaction with invalid offsets size, %zd\n",
 +                              proc->pid, thread->pid, tr->offsets_size);
                return_error = BR_FAILED_REPLY;
                goto err_bad_offset;
        }
                if (*offp > t->buffer->data_size - sizeof(*fp) ||
                    t->buffer->data_size < sizeof(*fp) ||
                    !IS_ALIGNED(*offp, sizeof(void *))) {
 -                      binder_user_error("binder: %d:%d got transaction with "
 -                              "invalid offset, %zd\n",
 -                              proc->pid, thread->pid, *offp);
 +                      binder_user_error("%d:%d got transaction with invalid offset, %zd\n",
 +                                      proc->pid, thread->pid, *offp);
                        return_error = BR_FAILED_REPLY;
                        goto err_bad_offset;
                }
                                node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
                        }
                        if (fp->cookie != node->cookie) {
 -                              binder_user_error("binder: %d:%d sending u%p "
 -                                      "node %d, cookie mismatch %p != %p\n",
 +                              binder_user_error("%d:%d sending u%p node %d, cookie mismatch %p != %p\n",
                                        proc->pid, thread->pid,
                                        fp->binder, node->debug_id,
                                        fp->cookie, node->cookie);
                        binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE,
                                       &thread->todo);
  
 +                      trace_binder_transaction_node_to_ref(t, node, ref);
                        binder_debug(BINDER_DEBUG_TRANSACTION,
                                     "        node %d u%p -> ref %d desc %d\n",
                                     node->debug_id, node->ptr, ref->debug_id,
                case BINDER_TYPE_WEAK_HANDLE: {
                        struct binder_ref *ref = binder_get_ref(proc, fp->handle);
                        if (ref == NULL) {
 -                              binder_user_error("binder: %d:%d got "
 -                                      "transaction with invalid "
 -                                      "handle, %ld\n", proc->pid,
 -                                      thread->pid, fp->handle);
 +                              binder_user_error("%d:%d got transaction with invalid handle, %ld\n",
 +                                              proc->pid,
 +                                              thread->pid, fp->handle);
                                return_error = BR_FAILED_REPLY;
                                goto err_binder_get_ref_failed;
                        }
                                fp->binder = ref->node->ptr;
                                fp->cookie = ref->node->cookie;
                                binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL);
 +                              trace_binder_transaction_ref_to_node(t, ref);
                                binder_debug(BINDER_DEBUG_TRANSACTION,
                                             "        ref %d desc %d -> node %d u%p\n",
                                             ref->debug_id, ref->desc, ref->node->debug_id,
                                }
                                fp->handle = new_ref->desc;
                                binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL);
 +                              trace_binder_transaction_ref_to_ref(t, ref,
 +                                                                  new_ref);
                                binder_debug(BINDER_DEBUG_TRANSACTION,
                                             "        ref %d desc %d -> ref %d desc %d (node %d)\n",
                                             ref->debug_id, ref->desc, new_ref->debug_id,
  
                        if (reply) {
                                if (!(in_reply_to->flags & TF_ACCEPT_FDS)) {
 -                                      binder_user_error("binder: %d:%d got reply with fd, %ld, but target does not allow fds\n",
 +                                      binder_user_error("%d:%d got reply with fd, %ld, but target does not allow fds\n",
                                                proc->pid, thread->pid, fp->handle);
                                        return_error = BR_FAILED_REPLY;
                                        goto err_fd_not_allowed;
                                }
                        } else if (!target_node->accept_fds) {
 -                              binder_user_error("binder: %d:%d got transaction with fd, %ld, but target does not allow fds\n",
 +                              binder_user_error("%d:%d got transaction with fd, %ld, but target does not allow fds\n",
                                        proc->pid, thread->pid, fp->handle);
                                return_error = BR_FAILED_REPLY;
                                goto err_fd_not_allowed;
  
                        file = fget(fp->handle);
                        if (file == NULL) {
 -                              binder_user_error("binder: %d:%d got transaction with invalid fd, %ld\n",
 +                              binder_user_error("%d:%d got transaction with invalid fd, %ld\n",
                                        proc->pid, thread->pid, fp->handle);
                                return_error = BR_FAILED_REPLY;
                                goto err_fget_failed;
                                goto err_get_unused_fd_failed;
                        }
                        task_fd_install(target_proc, target_fd, file);
 +                      trace_binder_transaction_fd(t, fp->handle, target_fd);
                        binder_debug(BINDER_DEBUG_TRANSACTION,
                                     "        fd %ld -> %d\n", fp->handle, target_fd);
                        /* TODO: fput? */
                } break;
  
                default:
 -                      binder_user_error("binder: %d:%d got transactio"
 -                              "n with invalid object type, %lx\n",
 +                      binder_user_error("%d:%d got transaction with invalid object type, %lx\n",
                                proc->pid, thread->pid, fp->type);
                        return_error = BR_FAILED_REPLY;
                        goto err_bad_object_type;
@@@ -1663,7 -1668,6 +1664,7 @@@ err_binder_new_node_failed
  err_bad_object_type:
  err_bad_offset:
  err_copy_data_failed:
 +      trace_binder_transaction_failed_buffer_release(t->buffer);
        binder_transaction_buffer_release(target_proc, t->buffer, offp);
        t->buffer->transaction = NULL;
        binder_free_buf(target_proc, t->buffer);
@@@ -1680,7 -1684,7 +1681,7 @@@ err_dead_binder
  err_invalid_target_handle:
  err_no_context_mgr_node:
        binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
 -                   "binder: %d:%d transaction failed %d, size %zd-%zd\n",
 +                   "%d:%d transaction failed %d, size %zd-%zd\n",
                     proc->pid, thread->pid, return_error,
                     tr->data_size, tr->offsets_size);
  
@@@ -1709,7 -1713,6 +1710,7 @@@ int binder_thread_write(struct binder_p
                if (get_user(cmd, (uint32_t __user *)ptr))
                        return -EFAULT;
                ptr += sizeof(uint32_t);
 +              trace_binder_command(cmd);
                if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) {
                        binder_stats.bc[_IOC_NR(cmd)]++;
                        proc->stats.bc[_IOC_NR(cmd)]++;
                                ref = binder_get_ref_for_node(proc,
                                               binder_context_mgr_node);
                                if (ref->desc != target) {
 -                                      binder_user_error("binder: %d:"
 -                                              "%d tried to acquire "
 -                                              "reference to desc 0, "
 -                                              "got %d instead\n",
 +                                      binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n",
                                                proc->pid, thread->pid,
                                                ref->desc);
                                }
                        } else
                                ref = binder_get_ref(proc, target);
                        if (ref == NULL) {
 -                              binder_user_error("binder: %d:%d refcou"
 -                                      "nt change on invalid ref %d\n",
 +                              binder_user_error("%d:%d refcount change on invalid ref %d\n",
                                        proc->pid, thread->pid, target);
                                break;
                        }
                                break;
                        }
                        binder_debug(BINDER_DEBUG_USER_REFS,
 -                                   "binder: %d:%d %s ref %d desc %d s %d w %d for node %d\n",
 +                                   "%d:%d %s ref %d desc %d s %d w %d for node %d\n",
                                     proc->pid, thread->pid, debug_string, ref->debug_id,
                                     ref->desc, ref->strong, ref->weak, ref->node->debug_id);
                        break;
                        ptr += sizeof(void *);
                        node = binder_get_node(proc, node_ptr);
                        if (node == NULL) {
 -                              binder_user_error("binder: %d:%d "
 -                                      "%s u%p no match\n",
 +                              binder_user_error("%d:%d %s u%p no match\n",
                                        proc->pid, thread->pid,
                                        cmd == BC_INCREFS_DONE ?
                                        "BC_INCREFS_DONE" :
                                break;
                        }
                        if (cookie != node->cookie) {
 -                              binder_user_error("binder: %d:%d %s u%p node %d"
 -                                      " cookie mismatch %p != %p\n",
 +                              binder_user_error("%d:%d %s u%p node %d cookie mismatch %p != %p\n",
                                        proc->pid, thread->pid,
                                        cmd == BC_INCREFS_DONE ?
                                        "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
                        }
                        if (cmd == BC_ACQUIRE_DONE) {
                                if (node->pending_strong_ref == 0) {
 -                                      binder_user_error("binder: %d:%d "
 -                                              "BC_ACQUIRE_DONE node %d has "
 -                                              "no pending acquire request\n",
 +                                      binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n",
                                                proc->pid, thread->pid,
                                                node->debug_id);
                                        break;
                                node->pending_strong_ref = 0;
                        } else {
                                if (node->pending_weak_ref == 0) {
 -                                      binder_user_error("binder: %d:%d "
 -                                              "BC_INCREFS_DONE node %d has "
 -                                              "no pending increfs request\n",
 +                                      binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n",
                                                proc->pid, thread->pid,
                                                node->debug_id);
                                        break;
                        }
                        binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0);
                        binder_debug(BINDER_DEBUG_USER_REFS,
 -                                   "binder: %d:%d %s node %d ls %d lw %d\n",
 +                                   "%d:%d %s node %d ls %d lw %d\n",
                                     proc->pid, thread->pid,
                                     cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
                                     node->debug_id, node->local_strong_refs, node->local_weak_refs);
                        break;
                }
                case BC_ATTEMPT_ACQUIRE:
 -                      pr_err("binder: BC_ATTEMPT_ACQUIRE not supported\n");
 +                      pr_err("BC_ATTEMPT_ACQUIRE not supported\n");
                        return -EINVAL;
                case BC_ACQUIRE_RESULT:
 -                      pr_err("binder: BC_ACQUIRE_RESULT not supported\n");
 +                      pr_err("BC_ACQUIRE_RESULT not supported\n");
                        return -EINVAL;
  
                case BC_FREE_BUFFER: {
  
                        buffer = binder_buffer_lookup(proc, data_ptr);
                        if (buffer == NULL) {
 -                              binder_user_error("binder: %d:%d "
 -                                      "BC_FREE_BUFFER u%p no match\n",
 +                              binder_user_error("%d:%d BC_FREE_BUFFER u%p no match\n",
                                        proc->pid, thread->pid, data_ptr);
                                break;
                        }
                        if (!buffer->allow_user_free) {
 -                              binder_user_error("binder: %d:%d "
 -                                      "BC_FREE_BUFFER u%p matched "
 -                                      "unreturned buffer\n",
 +                              binder_user_error("%d:%d BC_FREE_BUFFER u%p matched unreturned buffer\n",
                                        proc->pid, thread->pid, data_ptr);
                                break;
                        }
                        binder_debug(BINDER_DEBUG_FREE_BUFFER,
 -                                   "binder: %d:%d BC_FREE_BUFFER u%p found buffer %d for %s transaction\n",
 +                                   "%d:%d BC_FREE_BUFFER u%p found buffer %d for %s transaction\n",
                                     proc->pid, thread->pid, data_ptr, buffer->debug_id,
                                     buffer->transaction ? "active" : "finished");
  
                                else
                                        list_move_tail(buffer->target_node->async_todo.next, &thread->todo);
                        }
 +                      trace_binder_transaction_buffer_release(buffer);
                        binder_transaction_buffer_release(proc, buffer, NULL);
                        binder_free_buf(proc, buffer);
                        break;
  
                case BC_REGISTER_LOOPER:
                        binder_debug(BINDER_DEBUG_THREADS,
 -                                   "binder: %d:%d BC_REGISTER_LOOPER\n",
 +                                   "%d:%d BC_REGISTER_LOOPER\n",
                                     proc->pid, thread->pid);
                        if (thread->looper & BINDER_LOOPER_STATE_ENTERED) {
                                thread->looper |= BINDER_LOOPER_STATE_INVALID;
 -                              binder_user_error("binder: %d:%d ERROR:"
 -                                      " BC_REGISTER_LOOPER called "
 -                                      "after BC_ENTER_LOOPER\n",
 +                              binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n",
                                        proc->pid, thread->pid);
                        } else if (proc->requested_threads == 0) {
                                thread->looper |= BINDER_LOOPER_STATE_INVALID;
 -                              binder_user_error("binder: %d:%d ERROR:"
 -                                      " BC_REGISTER_LOOPER called "
 -                                      "without request\n",
 +                              binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called without request\n",
                                        proc->pid, thread->pid);
                        } else {
                                proc->requested_threads--;
                        break;
                case BC_ENTER_LOOPER:
                        binder_debug(BINDER_DEBUG_THREADS,
 -                                   "binder: %d:%d BC_ENTER_LOOPER\n",
 +                                   "%d:%d BC_ENTER_LOOPER\n",
                                     proc->pid, thread->pid);
                        if (thread->looper & BINDER_LOOPER_STATE_REGISTERED) {
                                thread->looper |= BINDER_LOOPER_STATE_INVALID;
 -                              binder_user_error("binder: %d:%d ERROR:"
 -                                      " BC_ENTER_LOOPER called after "
 -                                      "BC_REGISTER_LOOPER\n",
 +                              binder_user_error("%d:%d ERROR: BC_ENTER_LOOPER called after BC_REGISTER_LOOPER\n",
                                        proc->pid, thread->pid);
                        }
                        thread->looper |= BINDER_LOOPER_STATE_ENTERED;
                        break;
                case BC_EXIT_LOOPER:
                        binder_debug(BINDER_DEBUG_THREADS,
 -                                   "binder: %d:%d BC_EXIT_LOOPER\n",
 +                                   "%d:%d BC_EXIT_LOOPER\n",
                                     proc->pid, thread->pid);
                        thread->looper |= BINDER_LOOPER_STATE_EXITED;
                        break;
                        ptr += sizeof(void *);
                        ref = binder_get_ref(proc, target);
                        if (ref == NULL) {
 -                              binder_user_error("binder: %d:%d %s "
 -                                      "invalid ref %d\n",
 +                              binder_user_error("%d:%d %s invalid ref %d\n",
                                        proc->pid, thread->pid,
                                        cmd == BC_REQUEST_DEATH_NOTIFICATION ?
                                        "BC_REQUEST_DEATH_NOTIFICATION" :
                        }
  
                        binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION,
 -                                   "binder: %d:%d %s %p ref %d desc %d s %d w %d for node %d\n",
 +                                   "%d:%d %s %p ref %d desc %d s %d w %d for node %d\n",
                                     proc->pid, thread->pid,
                                     cmd == BC_REQUEST_DEATH_NOTIFICATION ?
                                     "BC_REQUEST_DEATH_NOTIFICATION" :
  
                        if (cmd == BC_REQUEST_DEATH_NOTIFICATION) {
                                if (ref->death) {
 -                                      binder_user_error("binder: %d:%"
 -                                              "d BC_REQUEST_DEATH_NOTI"
 -                                              "FICATION death notific"
 -                                              "ation already set\n",
 +                                      binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n",
                                                proc->pid, thread->pid);
                                        break;
                                }
                                if (death == NULL) {
                                        thread->return_error = BR_ERROR;
                                        binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
 -                                                   "binder: %d:%d "
 -                                                   "BC_REQUEST_DEATH_NOTIFICATION failed\n",
 +                                                   "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n",
                                                     proc->pid, thread->pid);
                                        break;
                                }
                                }
                        } else {
                                if (ref->death == NULL) {
 -                                      binder_user_error("binder: %d:%"
 -                                              "d BC_CLEAR_DEATH_NOTIFI"
 -                                              "CATION death notificat"
 -                                              "ion not active\n",
 +                                      binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n",
                                                proc->pid, thread->pid);
                                        break;
                                }
                                death = ref->death;
                                if (death->cookie != cookie) {
 -                                      binder_user_error("binder: %d:%"
 -                                              "d BC_CLEAR_DEATH_NOTIFI"
 -                                              "CATION death notificat"
 -                                              "ion cookie mismatch "
 -                                              "%p != %p\n",
 +                                      binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch %p != %p\n",
                                                proc->pid, thread->pid,
                                                death->cookie, cookie);
                                        break;
                                }
                        }
                        binder_debug(BINDER_DEBUG_DEAD_BINDER,
 -                                   "binder: %d:%d BC_DEAD_BINDER_DONE %p found %p\n",
 +                                   "%d:%d BC_DEAD_BINDER_DONE %p found %p\n",
                                     proc->pid, thread->pid, cookie, death);
                        if (death == NULL) {
 -                              binder_user_error("binder: %d:%d BC_DEAD"
 -                                      "_BINDER_DONE %p not found\n",
 +                              binder_user_error("%d:%d BC_DEAD_BINDER_DONE %p not found\n",
                                        proc->pid, thread->pid, cookie);
                                break;
                        }
                } break;
  
                default:
 -                      pr_err("binder: %d:%d unknown command %d\n",
 +                      pr_err("%d:%d unknown command %d\n",
                               proc->pid, thread->pid, cmd);
                        return -EINVAL;
                }
  void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread,
                    uint32_t cmd)
  {
 +      trace_binder_return(cmd);
        if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) {
                binder_stats.br[_IOC_NR(cmd)]++;
                proc->stats.br[_IOC_NR(cmd)]++;
@@@ -2103,7 -2136,6 +2104,7 @@@ retry
                        if (put_user(thread->return_error2, (uint32_t __user *)ptr))
                                return -EFAULT;
                        ptr += sizeof(uint32_t);
 +                      binder_stat_br(proc, thread, thread->return_error2);
                        if (ptr == end)
                                goto done;
                        thread->return_error2 = BR_OK;
                if (put_user(thread->return_error, (uint32_t __user *)ptr))
                        return -EFAULT;
                ptr += sizeof(uint32_t);
 +              binder_stat_br(proc, thread, thread->return_error);
                thread->return_error = BR_OK;
                goto done;
        }
        thread->looper |= BINDER_LOOPER_STATE_WAITING;
        if (wait_for_proc_work)
                proc->ready_threads++;
 -      mutex_unlock(&binder_lock);
 +
 +      binder_unlock(__func__);
 +
 +      trace_binder_wait_for_work(wait_for_proc_work,
 +                                 !!thread->transaction_stack,
 +                                 !list_empty(&thread->todo));
        if (wait_for_proc_work) {
                if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED |
                                        BINDER_LOOPER_STATE_ENTERED))) {
 -                      binder_user_error("binder: %d:%d ERROR: Thread waiting "
 -                              "for process work before calling BC_REGISTER_"
 -                              "LOOPER or BC_ENTER_LOOPER (state %x)\n",
 +                      binder_user_error("%d:%d ERROR: Thread waiting for process work before calling BC_REGISTER_LOOPER or BC_ENTER_LOOPER (state %x)\n",
                                proc->pid, thread->pid, thread->looper);
                        wait_event_interruptible(binder_user_error_wait,
                                                 binder_stop_on_user_error < 2);
                } else
                        ret = wait_event_interruptible(thread->wait, binder_has_thread_work(thread));
        }
 -      mutex_lock(&binder_lock);
 +
 +      binder_lock(__func__);
 +
        if (wait_for_proc_work)
                proc->ready_threads--;
        thread->looper &= ~BINDER_LOOPER_STATE_WAITING;
  
                        binder_stat_br(proc, thread, cmd);
                        binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE,
 -                                   "binder: %d:%d BR_TRANSACTION_COMPLETE\n",
 +                                   "%d:%d BR_TRANSACTION_COMPLETE\n",
                                     proc->pid, thread->pid);
  
                        list_del(&w->entry);
  
                                binder_stat_br(proc, thread, cmd);
                                binder_debug(BINDER_DEBUG_USER_REFS,
 -                                           "binder: %d:%d %s %d u%p c%p\n",
 +                                           "%d:%d %s %d u%p c%p\n",
                                             proc->pid, thread->pid, cmd_name, node->debug_id, node->ptr, node->cookie);
                        } else {
                                list_del_init(&w->entry);
                                if (!weak && !strong) {
                                        binder_debug(BINDER_DEBUG_INTERNAL_REFS,
 -                                                   "binder: %d:%d node %d u%p c%p deleted\n",
 +                                                   "%d:%d node %d u%p c%p deleted\n",
                                                     proc->pid, thread->pid, node->debug_id,
                                                     node->ptr, node->cookie);
                                        rb_erase(&node->rb_node, &proc->nodes);
                                        binder_stats_deleted(BINDER_STAT_NODE);
                                } else {
                                        binder_debug(BINDER_DEBUG_INTERNAL_REFS,
 -                                                   "binder: %d:%d node %d u%p c%p state unchanged\n",
 +                                                   "%d:%d node %d u%p c%p state unchanged\n",
                                                     proc->pid, thread->pid, node->debug_id, node->ptr,
                                                     node->cookie);
                                }
                        if (put_user(death->cookie, (void * __user *)ptr))
                                return -EFAULT;
                        ptr += sizeof(void *);
 +                      binder_stat_br(proc, thread, cmd);
                        binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION,
 -                                   "binder: %d:%d %s %p\n",
 +                                   "%d:%d %s %p\n",
                                      proc->pid, thread->pid,
                                      cmd == BR_DEAD_BINDER ?
                                      "BR_DEAD_BINDER" :
                if (t->from) {
                        struct task_struct *sender = t->from->proc->tsk;
                        tr.sender_pid = task_tgid_nr_ns(sender,
-                                                       current->nsproxy->pid_ns);
+                                                       task_active_pid_ns(current));
                } else {
                        tr.sender_pid = 0;
                }
                        return -EFAULT;
                ptr += sizeof(tr);
  
 +              trace_binder_transaction_received(t);
                binder_stat_br(proc, thread, cmd);
                binder_debug(BINDER_DEBUG_TRANSACTION,
 -                           "binder: %d:%d %s %d %d:%d, cmd %d"
 -                           "size %zd-%zd ptr %p-%p\n",
 +                           "%d:%d %s %d %d:%d, cmd %d size %zd-%zd ptr %p-%p\n",
                             proc->pid, thread->pid,
                             (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" :
                             "BR_REPLY",
@@@ -2376,11 -2401,10 +2377,11 @@@ done
             /*spawn a new thread if we leave this out */) {
                proc->requested_threads++;
                binder_debug(BINDER_DEBUG_THREADS,
 -                           "binder: %d:%d BR_SPAWN_LOOPER\n",
 +                           "%d:%d BR_SPAWN_LOOPER\n",
                             proc->pid, thread->pid);
                if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer))
                        return -EFAULT;
 +              binder_stat_br(proc, thread, BR_SPAWN_LOOPER);
        }
        return 0;
  }
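
The hunks above and below replace the bare mutex_lock(&binder_lock)/mutex_unlock(&binder_lock) pairs with binder_lock(__func__)/binder_unlock(__func__). The wrapper bodies are not part of this excerpt; as a minimal sketch, assuming they keep a single global mutex and merely tag the caller for tracing (the mutex and trace hook names below are assumptions, not taken from this patch):

static void binder_lock(const char *tag)
{
	trace_binder_lock(tag);		/* assumed tracepoint, fires before blocking */
	mutex_lock(&binder_main_lock);	/* assumed name of the global binder mutex */
	trace_binder_locked(tag);	/* assumed tracepoint, fires once held */
}

static void binder_unlock(const char *tag)
{
	trace_binder_unlock(tag);	/* assumed tracepoint */
	mutex_unlock(&binder_main_lock);
}

Passing __func__ at every call site gives the trace (or debug log) the name of the function that took or released the lock at essentially no cost.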
@@@ -2401,7 -2425,7 +2402,7 @@@ static void binder_release_work(struct 
                                binder_send_failed_reply(t, BR_DEAD_REPLY);
                        } else {
                                binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
 -                                      "binder: undelivered transaction %d\n",
 +                                      "undelivered transaction %d\n",
                                        t->debug_id);
                                t->buffer->transaction = NULL;
                                kfree(t);
                } break;
                case BINDER_WORK_TRANSACTION_COMPLETE: {
                        binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
 -                              "binder: undelivered TRANSACTION_COMPLETE\n");
 +                              "undelivered TRANSACTION_COMPLETE\n");
                        kfree(w);
                        binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
                } break;
  
                        death = container_of(w, struct binder_ref_death, work);
                        binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
 -                              "binder: undelivered death notification, %p\n",
 +                              "undelivered death notification, %p\n",
                                death->cookie);
                        kfree(death);
                        binder_stats_deleted(BINDER_STAT_DEATH);
                } break;
                default:
 -                      pr_err("binder: unexpected work type, %d, not freed\n",
 +                      pr_err("unexpected work type, %d, not freed\n",
                               w->type);
                        break;
                }
@@@ -2483,8 -2507,8 +2484,8 @@@ static int binder_free_thread(struct bi
        while (t) {
                active_transactions++;
                binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
 -                           "binder: release %d:%d transaction %d "
 -                           "%s, still active\n", proc->pid, thread->pid,
 +                           "release %d:%d transaction %d %s, still active\n",
 +                            proc->pid, thread->pid,
                             t->debug_id,
                             (t->to_thread == thread) ? "in" : "out");
  
@@@ -2517,14 -2541,12 +2518,14 @@@ static unsigned int binder_poll(struct 
        struct binder_thread *thread = NULL;
        int wait_for_proc_work;
  
 -      mutex_lock(&binder_lock);
 +      binder_lock(__func__);
 +
        thread = binder_get_thread(proc);
  
        wait_for_proc_work = thread->transaction_stack == NULL &&
                list_empty(&thread->todo) && thread->return_error == BR_OK;
 -      mutex_unlock(&binder_lock);
 +
 +      binder_unlock(__func__);
  
        if (wait_for_proc_work) {
                if (binder_has_proc_work(proc, thread))
@@@ -2552,13 -2574,11 +2553,13 @@@ static long binder_ioctl(struct file *f
  
        /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/
  
 +      trace_binder_ioctl(cmd, arg);
 +
        ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
        if (ret)
 -              return ret;
 +              goto err_unlocked;
  
 -      mutex_lock(&binder_lock);
 +      binder_lock(__func__);
        thread = binder_get_thread(proc);
        if (thread == NULL) {
                ret = -ENOMEM;
                        goto err;
                }
                binder_debug(BINDER_DEBUG_READ_WRITE,
 -                           "binder: %d:%d write %ld at %08lx, read %ld at %08lx\n",
 -                           proc->pid, thread->pid, bwr.write_size, bwr.write_buffer,
 -                           bwr.read_size, bwr.read_buffer);
 +                           "%d:%d write %ld at %08lx, read %ld at %08lx\n",
 +                           proc->pid, thread->pid, bwr.write_size,
 +                           bwr.write_buffer, bwr.read_size, bwr.read_buffer);
  
                if (bwr.write_size > 0) {
                        ret = binder_thread_write(proc, thread, (void __user *)bwr.write_buffer, bwr.write_size, &bwr.write_consumed);
 +                      trace_binder_write_done(ret);
                        if (ret < 0) {
                                bwr.read_consumed = 0;
                                if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
                }
                if (bwr.read_size > 0) {
                        ret = binder_thread_read(proc, thread, (void __user *)bwr.read_buffer, bwr.read_size, &bwr.read_consumed, filp->f_flags & O_NONBLOCK);
 +                      trace_binder_read_done(ret);
                        if (!list_empty(&proc->todo))
                                wake_up_interruptible(&proc->wait);
                        if (ret < 0) {
                        }
                }
                binder_debug(BINDER_DEBUG_READ_WRITE,
 -                           "binder: %d:%d wrote %ld of %ld, read return %ld of %ld\n",
 +                           "%d:%d wrote %ld of %ld, read return %ld of %ld\n",
                             proc->pid, thread->pid, bwr.write_consumed, bwr.write_size,
                             bwr.read_consumed, bwr.read_size);
                if (copy_to_user(ubuf, &bwr, sizeof(bwr))) {
                break;
        case BINDER_SET_CONTEXT_MGR:
                if (binder_context_mgr_node != NULL) {
 -                      pr_err("binder: BINDER_SET_CONTEXT_MGR already set\n");
 +                      pr_err("BINDER_SET_CONTEXT_MGR already set\n");
                        ret = -EBUSY;
                        goto err;
                }
                if (uid_valid(binder_context_mgr_uid)) {
                        if (!uid_eq(binder_context_mgr_uid, current->cred->euid)) {
 -                              pr_err("binder: BINDER_SET_"
 -                                     "CONTEXT_MGR bad uid %d != %d\n",
 +                              pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
                                       from_kuid(&init_user_ns, current->cred->euid),
                                       from_kuid(&init_user_ns, binder_context_mgr_uid));
                                ret = -EPERM;
                binder_context_mgr_node->has_weak_ref = 1;
                break;
        case BINDER_THREAD_EXIT:
 -              binder_debug(BINDER_DEBUG_THREADS, "binder: %d:%d exit\n",
 +              binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n",
                             proc->pid, thread->pid);
                binder_free_thread(proc, thread);
                thread = NULL;
  err:
        if (thread)
                thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN;
 -      mutex_unlock(&binder_lock);
 +      binder_unlock(__func__);
        wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
        if (ret && ret != -ERESTARTSYS)
 -              pr_info("binder: %d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
 +              pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
 +err_unlocked:
 +      trace_binder_ioctl_done(ret);
        return ret;
  }
  
@@@ -2681,7 -2698,7 +2682,7 @@@ static void binder_vma_open(struct vm_a
  {
        struct binder_proc *proc = vma->vm_private_data;
        binder_debug(BINDER_DEBUG_OPEN_CLOSE,
 -                   "binder: %d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
 +                   "%d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
                     proc->pid, vma->vm_start, vma->vm_end,
                     (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
                     (unsigned long)pgprot_val(vma->vm_page_prot));
@@@ -2691,7 -2708,7 +2692,7 @@@ static void binder_vma_close(struct vm_
  {
        struct binder_proc *proc = vma->vm_private_data;
        binder_debug(BINDER_DEBUG_OPEN_CLOSE,
 -                   "binder: %d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
 +                   "%d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
                     proc->pid, vma->vm_start, vma->vm_end,
                     (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
                     (unsigned long)pgprot_val(vma->vm_page_prot));
@@@ -2819,16 -2836,13 +2820,16 @@@ static int binder_open(struct inode *no
        INIT_LIST_HEAD(&proc->todo);
        init_waitqueue_head(&proc->wait);
        proc->default_priority = task_nice(current);
 -      mutex_lock(&binder_lock);
 +
 +      binder_lock(__func__);
 +
        binder_stats_created(BINDER_STAT_PROC);
        hlist_add_head(&proc->proc_node, &binder_procs);
        proc->pid = current->group_leader->pid;
        INIT_LIST_HEAD(&proc->delivered_death);
        filp->private_data = proc;
 -      mutex_unlock(&binder_lock);
 +
 +      binder_unlock(__func__);
  
        if (binder_debugfs_dir_entry_proc) {
                char strbuf[11];
@@@ -2936,8 -2950,9 +2937,8 @@@ static void binder_deferred_release(str
                                }
                        }
                        binder_debug(BINDER_DEBUG_DEAD_BINDER,
 -                                   "binder: node %d now dead, "
 -                                   "refs %d, death %d\n", node->debug_id,
 -                                   incoming_refs, death);
 +                                   "node %d now dead, refs %d, death %d\n",
 +                                    node->debug_id, incoming_refs, death);
                }
        }
        outgoing_refs = 0;
                if (t) {
                        t->buffer = NULL;
                        buffer->transaction = NULL;
 -                      pr_err("binder: release proc %d, "
 -                             "transaction %d, not freed\n",
 +                      pr_err("release proc %d, transaction %d, not freed\n",
                               proc->pid, t->debug_id);
                        /*BUG();*/
                }
                        if (proc->pages[i]) {
                                void *page_addr = proc->buffer + i * PAGE_SIZE;
                                binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
 -                                           "binder_release: %d: "
 -                                           "page %d at %p not freed\n",
 +                                           "binder_release: %d: page %d at %p not freed\n",
                                             proc->pid, i,
                                             page_addr);
                                unmap_kernel_range((unsigned long)page_addr,
        put_task_struct(proc->tsk);
  
        binder_debug(BINDER_DEBUG_OPEN_CLOSE,
 -                   "binder_release: %d threads %d, nodes %d (ref %d), "
 -                   "refs %d, active transactions %d, buffers %d, "
 -                   "pages %d\n",
 +                   "binder_release: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d, buffers %d, pages %d\n",
                     proc->pid, threads, nodes, incoming_refs, outgoing_refs,
                     active_transactions, buffers, page_count);
  
@@@ -3005,7 -3024,7 +3006,7 @@@ static void binder_deferred_func(struc
  
        int defer;
        do {
 -              mutex_lock(&binder_lock);
 +              binder_lock(__func__);
                mutex_lock(&binder_deferred_lock);
                if (!hlist_empty(&binder_deferred_list)) {
                        proc = hlist_entry(binder_deferred_list.first,
                if (defer & BINDER_DEFERRED_RELEASE)
                        binder_deferred_release(proc); /* frees proc */
  
 -              mutex_unlock(&binder_lock);
 +              binder_unlock(__func__);
                if (files)
                        put_files_struct(files);
        } while (proc);
@@@ -3373,7 -3392,7 +3374,7 @@@ static int binder_state_show(struct seq
        int do_lock = !binder_debug_no_lock;
  
        if (do_lock)
 -              mutex_lock(&binder_lock);
 +              binder_lock(__func__);
  
        seq_puts(m, "binder state:\n");
  
        hlist_for_each_entry(proc, pos, &binder_procs, proc_node)
                print_binder_proc(m, proc, 1);
        if (do_lock)
 -              mutex_unlock(&binder_lock);
 +              binder_unlock(__func__);
        return 0;
  }
  
@@@ -3396,7 -3415,7 +3397,7 @@@ static int binder_stats_show(struct seq
        int do_lock = !binder_debug_no_lock;
  
        if (do_lock)
 -              mutex_lock(&binder_lock);
 +              binder_lock(__func__);
  
        seq_puts(m, "binder stats:\n");
  
        hlist_for_each_entry(proc, pos, &binder_procs, proc_node)
                print_binder_proc_stats(m, proc);
        if (do_lock)
 -              mutex_unlock(&binder_lock);
 +              binder_unlock(__func__);
        return 0;
  }
  
@@@ -3416,13 -3435,13 +3417,13 @@@ static int binder_transactions_show(str
        int do_lock = !binder_debug_no_lock;
  
        if (do_lock)
 -              mutex_lock(&binder_lock);
 +              binder_lock(__func__);
  
        seq_puts(m, "binder transactions:\n");
        hlist_for_each_entry(proc, pos, &binder_procs, proc_node)
                print_binder_proc(m, proc, 0);
        if (do_lock)
 -              mutex_unlock(&binder_lock);
 +              binder_unlock(__func__);
        return 0;
  }
  
@@@ -3432,11 -3451,11 +3433,11 @@@ static int binder_proc_show(struct seq_
        int do_lock = !binder_debug_no_lock;
  
        if (do_lock)
 -              mutex_lock(&binder_lock);
 +              binder_lock(__func__);
        seq_puts(m, "binder proc state:\n");
        print_binder_proc(m, proc, 1);
        if (do_lock)
 -              mutex_unlock(&binder_lock);
 +              binder_unlock(__func__);
        return 0;
  }
  
@@@ -3531,7 -3550,4 +3532,7 @@@ static int __init binder_init(void
  
  device_initcall(binder_init);
  
 +#define CREATE_TRACE_POINTS
 +#include "binder_trace.h"
 +
  MODULE_LICENSE("GPL v2");
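
The CREATE_TRACE_POINTS/binder_trace.h lines added at the end of the file follow the standard kernel tracepoint pattern: the event definitions live in a self-including trace header, and exactly one .c file defines CREATE_TRACE_POINTS before including that header so the event bodies are emitted only once. A minimal sketch of the pattern with a made-up event (the real definitions are in binder_trace.h, which is not shown in this diff):

/* foo_trace.h -- layout sketch only; "foo_ioctl" is a hypothetical event */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM foo

#if !defined(_FOO_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _FOO_TRACE_H

#include <linux/tracepoint.h>

TRACE_EVENT(foo_ioctl,
	TP_PROTO(unsigned int cmd, unsigned long arg),
	TP_ARGS(cmd, arg),
	TP_STRUCT__entry(
		__field(unsigned int, cmd)
		__field(unsigned long, arg)
	),
	TP_fast_assign(
		__entry->cmd = cmd;
		__entry->arg = arg;
	),
	TP_printk("cmd=0x%x arg=0x%lx", __entry->cmd, __entry->arg)
);

#endif /* _FOO_TRACE_H */

/* must stay outside the include guard */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE foo_trace
#include <trace/define_trace.h>

Every other file that wants to call trace_foo_ioctl() simply includes foo_trace.h without defining CREATE_TRACE_POINTS.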
diff --combined fs/exec.c
index 721a299295117f92d271f17afd224db1787712a1,aef0c2f19750f40a83970749f6aec112fd75baac..b71b08ce71204824c7c66c8d101a7ebecdbb4db0
+++ b/fs/exec.c
@@@ -1266,14 -1266,13 +1266,13 @@@ int prepare_binprm(struct linux_binprm 
        bprm->cred->egid = current_egid();
  
        if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
-           !current->no_new_privs) {
+           !current->no_new_privs &&
+           kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
+           kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
                /* Set-uid? */
                if (mode & S_ISUID) {
-                       if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
-                               return -EPERM;
                        bprm->per_clear |= PER_CLEAR_ON_SETID;
                        bprm->cred->euid = inode->i_uid;
                }
  
                /* Set-gid? */
                 * executable.
                 */
                if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
-                       if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
-                               return -EPERM;
                        bprm->per_clear |= PER_CLEAR_ON_SETID;
                        bprm->cred->egid = inode->i_gid;
                }
@@@ -1349,7 -1346,7 +1346,7 @@@ EXPORT_SYMBOL(remove_arg_zero)
  /*
   * cycle the list of binary formats handler, until one recognizes the image
   */
 -int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 +int search_binary_handler(struct linux_binprm *bprm)
  {
        unsigned int depth = bprm->recursion_depth;
        int try,retval;
        for (try=0; try<2; try++) {
                read_lock(&binfmt_lock);
                list_for_each_entry(fmt, &formats, lh) {
 -                      int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
 +                      int (*fn)(struct linux_binprm *) = fmt->load_binary;
                        if (!fn)
                                continue;
                        if (!try_module_get(fmt->module))
                                continue;
                        read_unlock(&binfmt_lock);
 -                      retval = fn(bprm, regs);
 +                      retval = fn(bprm);
                        /*
                         * Restore the depth counter to its starting value
                         * in this call, so we don't have to rely on every
@@@ -1439,7 -1436,8 +1436,7 @@@ EXPORT_SYMBOL(search_binary_handler)
   */
  static int do_execve_common(const char *filename,
                                struct user_arg_ptr argv,
 -                              struct user_arg_ptr envp,
 -                              struct pt_regs *regs)
 +                              struct user_arg_ptr envp)
  {
        struct linux_binprm *bprm;
        struct file *file;
        if (retval < 0)
                goto out;
  
 -      retval = search_binary_handler(bprm,regs);
 +      retval = search_binary_handler(bprm);
        if (retval < 0)
                goto out;
  
@@@ -1565,17 -1563,19 +1562,17 @@@ out_ret
  
  int do_execve(const char *filename,
        const char __user *const __user *__argv,
 -      const char __user *const __user *__envp,
 -      struct pt_regs *regs)
 +      const char __user *const __user *__envp)
  {
        struct user_arg_ptr argv = { .ptr.native = __argv };
        struct user_arg_ptr envp = { .ptr.native = __envp };
 -      return do_execve_common(filename, argv, envp, regs);
 +      return do_execve_common(filename, argv, envp);
  }
  
  #ifdef CONFIG_COMPAT
 -int compat_do_execve(const char *filename,
 +static int compat_do_execve(const char *filename,
        const compat_uptr_t __user *__argv,
 -      const compat_uptr_t __user *__envp,
 -      struct pt_regs *regs)
 +      const compat_uptr_t __user *__envp)
  {
        struct user_arg_ptr argv = {
                .is_compat = true,
                .is_compat = true,
                .ptr.compat = __envp,
        };
 -      return do_execve_common(filename, argv, envp, regs);
 +      return do_execve_common(filename, argv, envp);
  }
  #endif
  
@@@ -1666,7 -1666,7 +1663,7 @@@ SYSCALL_DEFINE3(execve
        struct filename *path = getname(filename);
        int error = PTR_ERR(path);
        if (!IS_ERR(path)) {
 -              error = do_execve(path->name, argv, envp, current_pt_regs());
 +              error = do_execve(path->name, argv, envp);
                putname(path);
        }
        return error;
@@@ -1679,7 -1679,8 +1676,7 @@@ asmlinkage long compat_sys_execve(cons
        struct filename *path = getname(filename);
        int error = PTR_ERR(path);
        if (!IS_ERR(path)) {
 -              error = compat_do_execve(path->name, argv, envp,
 -                                                      current_pt_regs());
 +              error = compat_do_execve(path->name, argv, envp);
                putname(path);
        }
        return error;
@@@ -1692,9 -1693,12 +1689,9 @@@ int kernel_execve(const char *filename
                  const char *const argv[],
                  const char *const envp[])
  {
 -      struct pt_regs *p = current_pt_regs();
 -      int ret;
 -
 -      ret = do_execve(filename,
 +      int ret = do_execve(filename,
                        (const char __user *const __user *)argv,
 -                      (const char __user *const __user *)envp, p);
 +                      (const char __user *const __user *)envp);
        if (ret < 0)
                return ret;
  
         * We were successful.  We won't be returning to our caller, but
         * instead to user space by manipulating the kernel stack.
         */
 -      ret_from_kernel_execve(p);
 +      ret_from_kernel_execve(current_pt_regs());
  }
  #endif
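
With pt_regs threaded out of do_execve_common() and search_binary_handler() above, each binary-format handler's load_binary hook also loses its pt_regs argument and reaches register state through current_pt_regs() when it needs it. A sketch of a handler registration after this change (a hypothetical "foo" format, not one touched by this patch):

static int load_foo_binary(struct linux_binprm *bprm)
{
	/* inspect bprm->buf / bprm->file and set up the new image;
	 * register state is reached via current_pt_regs() instead of a
	 * passed-in regs pointer */
	return -ENOEXEC;	/* placeholder: not our format */
}

static struct linux_binfmt foo_format = {
	.module      = THIS_MODULE,
	.load_binary = load_foo_binary,
};

static int __init foo_binfmt_init(void)
{
	register_binfmt(&foo_format);
	return 0;
}
core_initcall(foo_binfmt_init);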
diff --combined fs/proc/array.c
index d3696708fc1ae4bff76a1d5a253103e792629ae4,554434265613331c22c87e860c219b83652f3e93..d66248a1919b3a3028d07ae1d4dc59f36ce57aea
@@@ -162,7 -162,7 +162,7 @@@ static inline const char *get_task_stat
  static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *p)
  {
-       struct user_namespace *user_ns = current_user_ns();
+       struct user_namespace *user_ns = seq_user_ns(m);
        struct group_info *group_info;
        int g;
        struct fdtable *fdt = NULL;
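
The switch from current_user_ns() to seq_user_ns(m) above means the Uid:/Gid: values in /proc/<pid>/status are translated into the user namespace of the process that opened the file, not of whichever process happens to read it. A small runnable check from userspace (assumes a kernel with this series and unprivileged user namespace creation enabled):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf)) {
		perror(path);
		exit(1);
	}
	close(fd);
}

int main(void)
{
	char map[64];
	uid_t outer_uid = getuid();

	if (unshare(CLONE_NEWUSER) < 0) {
		perror("unshare(CLONE_NEWUSER)");
		return 1;
	}
	/* map our outer uid to uid 0 inside the new namespace */
	snprintf(map, sizeof(map), "0 %u 1", outer_uid);
	write_file("/proc/self/uid_map", map);

	/* the Uid: line now reads 0: the open() happens in the new user
	 * namespace and task_state() converts with seq_user_ns() */
	execlp("grep", "grep", "^Uid:", "/proc/self/status", (char *)NULL);
	perror("execlp");
	return 1;
}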
@@@ -438,7 -438,7 +438,7 @@@ static int do_task_stat(struct seq_fil
  
                        min_flt += sig->min_flt;
                        maj_flt += sig->maj_flt;
 -                      thread_group_times(task, &utime, &stime);
 +                      thread_group_cputime_adjusted(task, &utime, &stime);
                        gtime += sig->gtime;
                }
  
        if (!whole) {
                min_flt = task->min_flt;
                maj_flt = task->maj_flt;
 -              task_times(task, &utime, &stime);
 +              task_cputime_adjusted(task, &utime, &stime);
                gtime = task->gtime;
        }
  
diff --combined fs/proc/base.c
index aa63d25157b8d396a9a7d0f1728fe673fa577e31,7621dc51cff8c7feac70b5c93e275167bb3e1766..5a5a0be40e405f4693bad85fcc7d04703b362523
@@@ -873,119 -873,12 +873,119 @@@ static const struct file_operations pro
        .release        = mem_release,
  };
  
 +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
 +                          loff_t *ppos)
 +{
 +      struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 +      char buffer[PROC_NUMBUF];
 +      int oom_adj = OOM_ADJUST_MIN;
 +      size_t len;
 +      unsigned long flags;
 +
 +      if (!task)
 +              return -ESRCH;
 +      if (lock_task_sighand(task, &flags)) {
 +              if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
 +                      oom_adj = OOM_ADJUST_MAX;
 +              else
 +                      oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
 +                                OOM_SCORE_ADJ_MAX;
 +              unlock_task_sighand(task, &flags);
 +      }
 +      put_task_struct(task);
 +      len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
 +      return simple_read_from_buffer(buf, count, ppos, buffer, len);
 +}
 +
 +static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 +                           size_t count, loff_t *ppos)
 +{
 +      struct task_struct *task;
 +      char buffer[PROC_NUMBUF];
 +      int oom_adj;
 +      unsigned long flags;
 +      int err;
 +
 +      memset(buffer, 0, sizeof(buffer));
 +      if (count > sizeof(buffer) - 1)
 +              count = sizeof(buffer) - 1;
 +      if (copy_from_user(buffer, buf, count)) {
 +              err = -EFAULT;
 +              goto out;
 +      }
 +
 +      err = kstrtoint(strstrip(buffer), 0, &oom_adj);
 +      if (err)
 +              goto out;
 +      if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
 +           oom_adj != OOM_DISABLE) {
 +              err = -EINVAL;
 +              goto out;
 +      }
 +
 +      task = get_proc_task(file->f_path.dentry->d_inode);
 +      if (!task) {
 +              err = -ESRCH;
 +              goto out;
 +      }
 +
 +      task_lock(task);
 +      if (!task->mm) {
 +              err = -EINVAL;
 +              goto err_task_lock;
 +      }
 +
 +      if (!lock_task_sighand(task, &flags)) {
 +              err = -ESRCH;
 +              goto err_task_lock;
 +      }
 +
 +      /*
 +       * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
 +       * value is always attainable.
 +       */
 +      if (oom_adj == OOM_ADJUST_MAX)
 +              oom_adj = OOM_SCORE_ADJ_MAX;
 +      else
 +              oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
 +
 +      if (oom_adj < task->signal->oom_score_adj &&
 +          !capable(CAP_SYS_RESOURCE)) {
 +              err = -EACCES;
 +              goto err_sighand;
 +      }
 +
 +      /*
 +       * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
 +       * /proc/pid/oom_score_adj instead.
 +       */
 +      printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
 +                current->comm, task_pid_nr(current), task_pid_nr(task),
 +                task_pid_nr(task));
 +
 +      task->signal->oom_score_adj = oom_adj;
 +      trace_oom_score_adj_update(task);
 +err_sighand:
 +      unlock_task_sighand(task, &flags);
 +err_task_lock:
 +      task_unlock(task);
 +      put_task_struct(task);
 +out:
 +      return err < 0 ? err : count;
 +}
 +
 +static const struct file_operations proc_oom_adj_operations = {
 +      .read           = oom_adj_read,
 +      .write          = oom_adj_write,
 +      .llseek         = generic_file_llseek,
 +};
 +
  static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
                                        size_t count, loff_t *ppos)
  {
        struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
        char buffer[PROC_NUMBUF];
 -      int oom_score_adj = OOM_SCORE_ADJ_MIN;
 +      short oom_score_adj = OOM_SCORE_ADJ_MIN;
        unsigned long flags;
        size_t len;
  
                unlock_task_sighand(task, &flags);
        }
        put_task_struct(task);
 -      len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
 +      len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
  }
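
oom_adj_read()/oom_adj_write() above bring back the legacy /proc/<pid>/oom_adj interface as a scaled view of oom_score_adj: the old [-16, 15] range (plus OOM_DISABLE == -17) is mapped onto [-1000, 1000] with the * -OOM_DISABLE / OOM_SCORE_ADJ_MAX conversion shown. A small runnable illustration of the mapping (raising the value is allowed unprivileged; lowering it needs CAP_SYS_RESOURCE, per the -EACCES check above):

#include <stdio.h>
#include <stdlib.h>

static void show(const char *path)
{
	char buf[32] = "";
	FILE *f = fopen(path, "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror(path);
		exit(1);
	}
	fclose(f);
	printf("%-26s %s", path, buf);
}

int main(void)
{
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f || fputs("500\n", f) == EOF || fclose(f) == EOF) {
		perror("/proc/self/oom_score_adj");
		return 1;
	}
	/* with the scaling above, 500 * 17 / 1000 == 8 shows up in oom_adj */
	show("/proc/self/oom_score_adj");
	show("/proc/self/oom_adj");
	return 0;
}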
  
@@@ -1043,15 -936,15 +1043,15 @@@ static ssize_t oom_score_adj_write(stru
                goto err_task_lock;
        }
  
 -      if (oom_score_adj < task->signal->oom_score_adj_min &&
 +      if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
                        !capable(CAP_SYS_RESOURCE)) {
                err = -EACCES;
                goto err_sighand;
        }
  
 -      task->signal->oom_score_adj = oom_score_adj;
 +      task->signal->oom_score_adj = (short)oom_score_adj;
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
 -              task->signal->oom_score_adj_min = oom_score_adj;
 +              task->signal->oom_score_adj_min = (short)oom_score_adj;
        trace_oom_score_adj_update(task);
  
  err_sighand:
@@@ -1877,9 -1770,8 +1877,9 @@@ static struct dentry *proc_map_files_lo
        if (!vma)
                goto out_no_vma;
  
 -      result = proc_map_files_instantiate(dir, dentry, task,
 -                      (void *)(unsigned long)vma->vm_file->f_mode);
 +      if (vma->vm_file)
 +              result = proc_map_files_instantiate(dir, dentry, task,
 +                              (void *)(unsigned long)vma->vm_file->f_mode);
  
  out_no_vma:
        up_read(&mm->mmap_sem);
@@@ -2345,146 -2237,6 +2345,6 @@@ static const struct file_operations pro
  };
  #endif
  
- /*
-  * /proc/self:
-  */
- static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
-                             int buflen)
- {
-       struct pid_namespace *ns = dentry->d_sb->s_fs_info;
-       pid_t tgid = task_tgid_nr_ns(current, ns);
-       char tmp[PROC_NUMBUF];
-       if (!tgid)
-               return -ENOENT;
-       sprintf(tmp, "%d", tgid);
-       return vfs_readlink(dentry,buffer,buflen,tmp);
- }
- static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
- {
-       struct pid_namespace *ns = dentry->d_sb->s_fs_info;
-       pid_t tgid = task_tgid_nr_ns(current, ns);
-       char *name = ERR_PTR(-ENOENT);
-       if (tgid) {
-               /* 11 for max length of signed int in decimal + NULL term */
-               name = kmalloc(12, GFP_KERNEL);
-               if (!name)
-                       name = ERR_PTR(-ENOMEM);
-               else
-                       sprintf(name, "%d", tgid);
-       }
-       nd_set_link(nd, name);
-       return NULL;
- }
- static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
-                               void *cookie)
- {
-       char *s = nd_get_link(nd);
-       if (!IS_ERR(s))
-               kfree(s);
- }
- static const struct inode_operations proc_self_inode_operations = {
-       .readlink       = proc_self_readlink,
-       .follow_link    = proc_self_follow_link,
-       .put_link       = proc_self_put_link,
- };
- /*
-  * proc base
-  *
-  * These are the directory entries in the root directory of /proc
-  * that properly belong to the /proc filesystem, as they describe
-  * describe something that is process related.
-  */
- static const struct pid_entry proc_base_stuff[] = {
-       NOD("self", S_IFLNK|S_IRWXUGO,
-               &proc_self_inode_operations, NULL, {}),
- };
- static struct dentry *proc_base_instantiate(struct inode *dir,
-       struct dentry *dentry, struct task_struct *task, const void *ptr)
- {
-       const struct pid_entry *p = ptr;
-       struct inode *inode;
-       struct proc_inode *ei;
-       struct dentry *error;
-       /* Allocate the inode */
-       error = ERR_PTR(-ENOMEM);
-       inode = new_inode(dir->i_sb);
-       if (!inode)
-               goto out;
-       /* Initialize the inode */
-       ei = PROC_I(inode);
-       inode->i_ino = get_next_ino();
-       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-       /*
-        * grab the reference to the task.
-        */
-       ei->pid = get_task_pid(task, PIDTYPE_PID);
-       if (!ei->pid)
-               goto out_iput;
-       inode->i_mode = p->mode;
-       if (S_ISDIR(inode->i_mode))
-               set_nlink(inode, 2);
-       if (S_ISLNK(inode->i_mode))
-               inode->i_size = 64;
-       if (p->iop)
-               inode->i_op = p->iop;
-       if (p->fop)
-               inode->i_fop = p->fop;
-       ei->op = p->op;
-       d_add(dentry, inode);
-       error = NULL;
- out:
-       return error;
- out_iput:
-       iput(inode);
-       goto out;
- }
- static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
- {
-       struct dentry *error;
-       struct task_struct *task = get_proc_task(dir);
-       const struct pid_entry *p, *last;
-       error = ERR_PTR(-ENOENT);
-       if (!task)
-               goto out_no_task;
-       /* Lookup the directory entry */
-       last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
-       for (p = proc_base_stuff; p <= last; p++) {
-               if (p->len != dentry->d_name.len)
-                       continue;
-               if (!memcmp(dentry->d_name.name, p->name, p->len))
-                       break;
-       }
-       if (p > last)
-               goto out;
-       error = proc_base_instantiate(dir, dentry, task, p);
- out:
-       put_task_struct(task);
- out_no_task:
-       return error;
- }
- static int proc_base_fill_cache(struct file *filp, void *dirent,
-       filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
- {
-       return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
-                               proc_base_instantiate, task, p);
- }
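
The block removed above is the old open-coded /proc/self machinery (proc_self_*_link plus the proc_base_lookup()/proc_base_fill_cache() plumbing keyed off proc_base_stuff). Only the removal is visible in this hunk; the "self" entry itself is provided elsewhere after this change, so the user-visible behaviour stays the same and can be checked with a few lines of userspace code:

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[PATH_MAX];
	ssize_t n = readlink("/proc/self", buf, sizeof(buf) - 1);

	if (n < 0) {
		perror("readlink(/proc/self)");
		return 1;
	}
	buf[n] = '\0';
	/* expected: the symlink target equals our own tgid */
	printf("/proc/self -> %s (getpid() == %d)\n", buf, (int)getpid());
	return 0;
}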
  #ifdef CONFIG_TASK_IO_ACCOUNTING
  static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
  {
@@@ -2706,7 -2458,6 +2566,7 @@@ static const struct pid_entry tgid_base
        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  #endif
        INF("oom_score",  S_IRUGO, proc_oom_score),
 +      REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@@ -2839,10 -2590,6 +2699,6 @@@ void proc_flush_task(struct task_struc
                proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
                                        tgid->numbers[i].nr);
        }
-       upid = &pid->numbers[pid->level];
-       if (upid->nr == 1)
-               pid_ns_release_proc(upid->ns);
  }
  
  static struct dentry *proc_pid_instantiate(struct inode *dir,
  
  struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
  {
-       struct dentry *result;
+       struct dentry *result = NULL;
        struct task_struct *task;
        unsigned tgid;
        struct pid_namespace *ns;
  
-       result = proc_base_lookup(dir, dentry);
-       if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
-               goto out;
        tgid = name_to_int(dentry);
        if (tgid == ~0U)
                goto out;
@@@ -2947,7 -2690,7 +2799,7 @@@ retry
        return iter;
  }
  
- #define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
+ #define TGID_OFFSET (FIRST_PROCESS_ENTRY)
  
  static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
        struct tgid_iter iter)
@@@ -2967,25 -2710,12 +2819,12 @@@ static int fake_filldir(void *buf, cons
  /* for the /proc/ directory itself, after non-process stuff has been done */
  int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
  {
-       unsigned int nr;
-       struct task_struct *reaper;
        struct tgid_iter iter;
        struct pid_namespace *ns;
        filldir_t __filldir;
  
        if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
-               goto out_no_task;
-       nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-       reaper = get_proc_task(filp->f_path.dentry->d_inode);
-       if (!reaper)
-               goto out_no_task;
-       for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
-               const struct pid_entry *p = &proc_base_stuff[nr];
-               if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
-                       goto out;
-       }
+               goto out;
  
        ns = filp->f_dentry->d_sb->s_fs_info;
        iter.task = NULL;
        }
        filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
  out:
-       put_task_struct(reaper);
- out_no_task:
        return 0;
  }
  
@@@ -3073,7 -2801,6 +2910,7 @@@ static const struct pid_entry tid_base_
        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  #endif
        INF("oom_score", S_IRUGO, proc_oom_score),
 +      REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
diff --combined include/linux/cred.h
index 0142aacb70b7049583a1618c735c7410a6827d65,856d2622d832eeab89abde1de6ce5d07d58b69d3..abb2cd50f6b26ace7da7a554527087d4685d3e76
@@@ -76,6 -76,21 +76,6 @@@ extern int groups_search(const struct g
  extern int in_group_p(kgid_t);
  extern int in_egroup_p(kgid_t);
  
 -/*
 - * The common credentials for a thread group
 - * - shared by CLONE_THREAD
 - */
 -#ifdef CONFIG_KEYS
 -struct thread_group_cred {
 -      atomic_t        usage;
 -      pid_t           tgid;                   /* thread group process ID */
 -      spinlock_t      lock;
 -      struct key __rcu *session_keyring;      /* keyring inherited over fork */
 -      struct key      *process_keyring;       /* keyring private to this process */
 -      struct rcu_head rcu;                    /* RCU deletion hook */
 -};
 -#endif
 -
  /*
   * The security context of a task
   *
@@@ -124,8 -139,6 +124,8 @@@ struct cred 
  #ifdef CONFIG_KEYS
        unsigned char   jit_keyring;    /* default keyring to attach requested
                                         * keys to */
 +      struct key __rcu *session_keyring; /* keyring inherited over fork */
 +      struct key      *process_keyring; /* keyring private to this process */
        struct key      *thread_keyring; /* keyring private to this thread */
        struct key      *request_key_auth; /* assumed request_key authority */
        struct thread_group_cred *tgcred; /* thread-group shared credentials */
@@@ -344,10 -357,8 +344,8 @@@ static inline void put_cred(const struc
  extern struct user_namespace init_user_ns;
  #ifdef CONFIG_USER_NS
  #define current_user_ns()     (current_cred_xxx(user_ns))
- #define task_user_ns(task)    (task_cred_xxx((task), user_ns))
  #else
  #define current_user_ns()     (&init_user_ns)
- #define task_user_ns(task)    (&init_user_ns)
  #endif
  
  
diff --combined include/linux/fs.h
index 408fb1e77a0a36804363d49d804e3ea3d3a802ab,5037aa6817fd2878c2da338de6680563d3bfd956..035521b46528ace428c7f07d4aa49ac4880e145e
@@@ -418,7 -418,7 +418,7 @@@ struct address_space 
        struct backing_dev_info *backing_dev_info; /* device readahead, etc */
        spinlock_t              private_lock;   /* for use by the address_space */
        struct list_head        private_list;   /* ditto */
 -      struct address_space    *assoc_mapping; /* ditto */
 +      void                    *private_data;  /* ditto */
  } __attribute__((aligned(sizeof(long))));
        /*
         * On most architectures that alignment is already the case; but
@@@ -462,6 -462,8 +462,6 @@@ struct block_device 
        int                     bd_fsfreeze_count;
        /* Mutex for freeze */
        struct mutex            bd_fsfreeze_mutex;
 -      /* A semaphore that prevents I/O while block size is being changed */
 -      struct percpu_rw_semaphore      bd_block_size_semaphore;
  };
  
  /*
@@@ -1810,6 -1812,8 +1810,8 @@@ struct file_system_type 
  #define FS_REQUIRES_DEV               1 
  #define FS_BINARY_MOUNTDATA   2
  #define FS_HAS_SUBTYPE                4
+ #define FS_USERNS_MOUNT               8       /* Can be mounted by userns root */
+ #define FS_USERNS_DEV_MOUNT   16 /* A userns mount does not imply MNT_NODEV */
  #define FS_REVAL_DOT          16384   /* Check the paths ".", ".." for staleness */
  #define FS_RENAME_DOES_D_MOVE 32768   /* FS will handle d_move() during rename() internally. */
        struct dentry *(*mount) (struct file_system_type *, int,
@@@ -2047,6 -2051,7 +2049,6 @@@ extern void unregister_blkdev(unsigned 
  extern struct block_device *bdget(dev_t);
  extern struct block_device *bdgrab(struct block_device *bdev);
  extern void bd_set_size(struct block_device *, loff_t size);
 -extern sector_t blkdev_max_block(struct block_device *bdev);
  extern void bd_forget(struct inode *inode);
  extern void bdput(struct block_device *);
  extern void invalidate_bdev(struct block_device *);
@@@ -2376,6 -2381,8 +2378,6 @@@ extern int generic_segment_checks(cons
                unsigned long *nr_segs, size_t *count, int access_flags);
  
  /* fs/block_dev.c */
 -extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
 -                             unsigned long nr_segs, loff_t pos);
  extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos);
  extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
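
The FS_USERNS_MOUNT and FS_USERNS_DEV_MOUNT flags added above are per-filesystem opt-ins: a filesystem sets them in file_system_type.fs_flags to state that a user namespace root may mount it (and, for the DEV variant, that such a mount is not forced to MNT_NODEV). Proc and sysfs are the users elsewhere in this series; a sketch of what the declaration looks like for a hypothetical filesystem (foo_fill_super() is assumed to exist):

static struct dentry *foo_mount(struct file_system_type *fs_type, int flags,
				const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, foo_fill_super);
}

static struct file_system_type foo_fs_type = {
	.name     = "foofs",
	.mount    = foo_mount,
	.kill_sb  = kill_anon_super,
	.fs_flags = FS_USERNS_MOUNT,	/* mountable by a userns root */
};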
diff --combined init/Kconfig
index 1a207efca5918d8ba97a8f9abffcc65527f3da2d,38c1a1d0bf3879441d162cfdcae667afea99ac43..675d8a2326cf29fc3c758e6a4533e98d40aa6aa1
@@@ -486,35 -486,35 +486,35 @@@ config PREEMPT_RC
          This option enables preemptible-RCU code that is common between
          the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
  
 +config CONTEXT_TRACKING
 +       bool
 +
  config RCU_USER_QS
        bool "Consider userspace as in RCU extended quiescent state"
 -      depends on HAVE_RCU_USER_QS && SMP
 +      depends on HAVE_CONTEXT_TRACKING && SMP
 +      select CONTEXT_TRACKING
        help
          This option sets hooks on kernel / userspace boundaries and
          puts RCU in extended quiescent state when the CPU runs in
          userspace. It means that when a CPU runs in userspace, it is
          excluded from the global RCU state machine and thus doesn't
 -        to keep the timer tick on for RCU.
 +        try to keep the timer tick on for RCU.
  
          Unless you want to hack and help the development of the full
 -        tickless feature, you shouldn't enable this option. It adds
 -        unnecessary overhead.
 +        dynticks mode, you shouldn't enable this option.  It also
 +        adds unnecessary overhead.
  
          If unsure say N
  
 -config RCU_USER_QS_FORCE
 -      bool "Force userspace extended QS by default"
 -      depends on RCU_USER_QS
 +config CONTEXT_TRACKING_FORCE
 +      bool "Force context tracking"
 +      depends on CONTEXT_TRACKING
        help
 -        Set the hooks in user/kernel boundaries by default in order to
 -        test this feature that treats userspace as an extended quiescent
 -        state until we have a real user like a full adaptive nohz option.
 -
 -        Unless you want to hack and help the development of the full
 -        tickless feature, you shouldn't enable this option. It adds
 -        unnecessary overhead.
 -
 -        If unsure say N
 +        Probe on user/kernel boundaries by default in order to
 +        test the features that rely on it such as userspace RCU extended
 +        quiescent states.
 +        This test is there for debugging until we have a real user like the
 +        full dynticks mode.
  
  config RCU_FANOUT
        int "Tree-based hierarchical RCU fanout value"
@@@ -582,13 -582,14 +582,13 @@@ config RCU_FAST_NO_H
        depends on NO_HZ && SMP
        default n
        help
 -        This option causes RCU to attempt to accelerate grace periods
 -        in order to allow CPUs to enter dynticks-idle state more
 -        quickly.  On the other hand, this option increases the overhead
 -        of the dynticks-idle checking, particularly on systems with
 -        large numbers of CPUs.
 +        This option causes RCU to attempt to accelerate grace periods in
 +        order to allow CPUs to enter dynticks-idle state more quickly.
 +        On the other hand, this option increases the overhead of the
 +        dynticks-idle checking, thus degrading scheduling latency.
  
 -        Say Y if energy efficiency is critically important, particularly
 -              if you have relatively few CPUs.
 +        Say Y if energy efficiency is critically important, and you don't
 +              care about real-time response.
  
          Say N if you are unsure.
  
@@@ -654,28 -655,6 +654,28 @@@ config RCU_BOOST_DELA
  
          Accept the default if unsure.
  
 +config RCU_NOCB_CPU
 +      bool "Offload RCU callback processing from boot-selected CPUs"
 +      depends on TREE_RCU || TREE_PREEMPT_RCU
 +      default n
 +      help
 +        Use this option to reduce OS jitter for aggressive HPC or
 +        real-time workloads.  It can also be used to offload RCU
 +        callback invocation to energy-efficient CPUs in battery-powered
 +        asymmetric multiprocessors.
 +
 +        This option offloads callback invocation from the set of
 +        CPUs specified at boot time by the rcu_nocbs parameter.
 +        For each such CPU, a kthread ("rcuoN") will be created to
 +        invoke callbacks, where the "N" is the CPU being offloaded.
 +        Nothing prevents this kthread from running on the specified
 +        CPUs, but (1) the kthreads may be preempted between each
 +        callback, and (2) affinity or cgroups can be used to force
 +        the kthreads to run on whatever set of CPUs is desired.
 +
 +        Say Y here if you want reduced OS jitter on selected CPUs.
 +        Say N here if you are unsure.
 +
  endmenu # "RCU Subsystem"
  
  config IKCONFIG
@@@ -717,50 -696,6 +717,50 @@@ config LOG_BUF_SHIF
  config HAVE_UNSTABLE_SCHED_CLOCK
        bool
  
 +#
 +# For architectures that want to enable the support for NUMA-affine scheduler
 +# balancing logic:
 +#
 +config ARCH_SUPPORTS_NUMA_BALANCING
 +      bool
 +
 +# For architectures that (ab)use NUMA to represent different memory regions
 +# all cpu-local but of different latencies, such as SuperH.
 +#
 +config ARCH_WANT_NUMA_VARIABLE_LOCALITY
 +      bool
 +
 +#
 +# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
 +config ARCH_WANTS_PROT_NUMA_PROT_NONE
 +      bool
 +
 +config ARCH_USES_NUMA_PROT_NONE
 +      bool
 +      default y
 +      depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
 +      depends on NUMA_BALANCING
 +
 +config NUMA_BALANCING_DEFAULT_ENABLED
 +      bool "Automatically enable NUMA aware memory/task placement"
 +      default y
 +      depends on NUMA_BALANCING
 +      help
 +        If set, automatic NUMA balancing will be enabled if running on a NUMA
 +        machine.
 +
 +config NUMA_BALANCING
 +      bool "Memory placement aware NUMA scheduler"
 +      depends on ARCH_SUPPORTS_NUMA_BALANCING
 +      depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
 +      depends on SMP && NUMA && MIGRATION
 +      help
 +        This option adds support for automatic NUMA aware memory/task placement.
 +        The mechanism is quite primitive and is based on migrating memory, when
 +        it is referenced, to the node the task is running on.
 +
 +        This system will be inactive on UMA systems.
 +
  menuconfig CGROUPS
        boolean "Control Group support"
        depends on EVENTFD
@@@ -1069,11 -1004,9 +1069,9 @@@ config UIDGID_CONVERTE
        # Filesystems
        depends on 9P_FS = n
        depends on AFS_FS = n
-       depends on AUTOFS4_FS = n
        depends on CEPH_FS = n
        depends on CIFS = n
        depends on CODA_FS = n
-       depends on FUSE_FS = n
        depends on GFS2_FS = n
        depends on NCP_FS = n
        depends on NFSD = n
diff --combined init/main.c
index 63ae904a99a8eb3718f6a57ee515c12f0b60b8dc,317750a18f74c87be9023bcc42bdbc4cb58c1177..baf1f0f5c4611eb08b3f0eae7995c5d789f8e741
@@@ -442,11 -442,9 +442,11 @@@ void __init __weak smp_setup_processor_
  {
  }
  
 +# if THREAD_SIZE >= PAGE_SIZE
  void __init __weak thread_info_cache_init(void)
  {
  }
 +#endif
  
  /*
   * Set up kernel memory allocators
@@@ -812,7 -810,6 +812,6 @@@ static int __ref kernel_init(void *unus
        system_state = SYSTEM_RUNNING;
        numa_default_policy();
  
-       current->signal->flags |= SIGNAL_UNKILLABLE;
        flush_delayed_fput();
  
        if (ramdisk_execute_command) {
@@@ -857,7 -854,7 +856,7 @@@ static void __init kernel_init_freeable
        /*
         * init can allocate pages on any node
         */
 -      set_mems_allowed(node_states[N_HIGH_MEMORY]);
 +      set_mems_allowed(node_states[N_MEMORY]);
        /*
         * init can run on any cpu.
         */
diff --combined kernel/cgroup.c
index f34c41bfaa37daa2b399c6387740d397703a277c,0dbfba2efa770be9cbd79e730be501e12db04728..9915ffe013727d68fbe21013288b282154d576de
@@@ -138,9 -138,6 +138,9 @@@ struct cgroupfs_root 
        /* Hierarchy-specific flags */
        unsigned long flags;
  
 +      /* IDs for cgroups in this hierarchy */
 +      struct ida cgroup_ida;
 +
        /* The path to use for release notifications. */
        char release_agent_path[PATH_MAX];
  
@@@ -174,8 -171,8 +174,8 @@@ struct css_id 
         * The css to which this ID points. This pointer is set to valid value
         * after cgroup is populated. If cgroup is removed, this will be NULL.
         * This pointer is expected to be RCU-safe because destroy()
 -       * is called after synchronize_rcu(). But for safe use, css_is_removed()
 -       * css_tryget() should be used for avoiding race.
 +       * is called after synchronize_rcu(). But for safe use, css_tryget()
 +       * should be used for avoiding race.
         */
        struct cgroup_subsys_state __rcu *css;
        /*
@@@ -245,10 -242,6 +245,10 @@@ static DEFINE_SPINLOCK(hierarchy_id_loc
   */
  static int need_forkexit_callback __read_mostly;
  
 +static int cgroup_destroy_locked(struct cgroup *cgrp);
 +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 +                            struct cftype cfts[], bool is_add);
 +
  #ifdef CONFIG_PROVE_LOCKING
  int cgroup_lock_is_held(void)
  {
@@@ -301,6 -294,11 +301,6 @@@ static int notify_on_release(const stru
        return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  }
  
 -static int clone_children(const struct cgroup *cgrp)
 -{
 -      return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
 -}
 -
  /*
   * for_each_subsys() allows you to iterate on each subsystem attached to
   * an active hierarchy
@@@ -784,12 -782,12 +784,12 @@@ static struct cgroup *task_cgroup_from_
   *    The task_lock() exception
   *
   * The need for this exception arises from the action of
 - * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
 + * cgroup_attach_task(), which overwrites one task's cgroup pointer with
   * another.  It does so using cgroup_mutex, however there are
   * several performance critical places that need to reference
   * task->cgroup without the expense of grabbing a system global
   * mutex.  Therefore except as noted below, when dereferencing or, as
 - * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
 + * in cgroup_attach_task(), modifying a task's cgroup pointer we use
   * task_lock(), which acts on a spinlock (task->alloc_lock) already in
   * the task_struct routinely used for such matters.
   *
@@@ -856,6 -854,30 +856,6 @@@ static struct inode *cgroup_new_inode(u
        return inode;
  }
  
 -/*
 - * Call subsys's pre_destroy handler.
 - * This is called before css refcnt check.
 - */
 -static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 -{
 -      struct cgroup_subsys *ss;
 -      int ret = 0;
 -
 -      for_each_subsys(cgrp->root, ss) {
 -              if (!ss->pre_destroy)
 -                      continue;
 -
 -              ret = ss->pre_destroy(cgrp);
 -              if (ret) {
 -                      /* ->pre_destroy() failure is being deprecated */
 -                      WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
 -                      break;
 -              }
 -      }
 -
 -      return ret;
 -}
 -
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
        /* is dentry a directory ? if so, kfree() associated cgroup */
                 * Release the subsystem state objects.
                 */
                for_each_subsys(cgrp->root, ss)
 -                      ss->destroy(cgrp);
 +                      ss->css_free(cgrp);
  
                cgrp->root->number_of_cgroups--;
                mutex_unlock(&cgroup_mutex);
  
                simple_xattrs_free(&cgrp->xattrs);
  
 +              ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
                kfree_rcu(cgrp, rcu_head);
        } else {
                struct cfent *cfe = __d_cfe(dentry);
@@@ -966,7 -987,7 +966,7 @@@ static void cgroup_clear_directory(stru
                if (!test_bit(ss->subsys_id, &subsys_mask))
                        continue;
                list_for_each_entry(set, &ss->cftsets, node)
 -                      cgroup_rm_file(cgrp, set->cfts);
 +                      cgroup_addrm_files(cgrp, NULL, set->cfts, false);
        }
        if (base_files) {
                while (!list_empty(&cgrp->files))
@@@ -993,6 -1014,33 +993,6 @@@ static void cgroup_d_remove_dir(struct 
        remove_dir(dentry);
  }
  
 -/*
 - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
 - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
 - * reference to css->refcnt. In general, this refcnt is expected to goes down
 - * to zero, soon.
 - *
 - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
 - */
 -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
 -
 -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
 -{
 -      if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
 -              wake_up_all(&cgroup_rmdir_waitq);
 -}
 -
 -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
 -{
 -      css_get(css);
 -}
 -
 -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
 -{
 -      cgroup_wakeup_rmdir_waiter(css->cgroup);
 -      css_put(css);
 -}
 -
  /*
   * Call with cgroup_mutex held. Drops reference counts on modules, including
   * any duplicate ones that parse_cgroupfs_options took. If this function
@@@ -1102,7 -1150,7 +1102,7 @@@ static int cgroup_show_options(struct s
                seq_puts(seq, ",xattr");
        if (strlen(root->release_agent_path))
                seq_printf(seq, ",release_agent=%s", root->release_agent_path);
 -      if (clone_children(&root->top_cgroup))
 +      if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
                seq_puts(seq, ",clone_children");
        if (strlen(root->name))
                seq_printf(seq, ",name=%s", root->name);
@@@ -1114,7 -1162,7 +1114,7 @@@ struct cgroup_sb_opts 
        unsigned long subsys_mask;
        unsigned long flags;
        char *release_agent;
 -      bool clone_children;
 +      bool cpuset_clone_children;
        char *name;
        /* User explicitly requested empty subsystem */
        bool none;
@@@ -1165,7 -1213,7 +1165,7 @@@ static int parse_cgroupfs_options(char 
                        continue;
                }
                if (!strcmp(token, "clone_children")) {
 -                      opts->clone_children = true;
 +                      opts->cpuset_clone_children = true;
                        continue;
                }
                if (!strcmp(token, "xattr")) {
@@@ -1349,21 -1397,14 +1349,21 @@@ static int cgroup_remount(struct super_
                goto out_unlock;
        }
  
 +      /*
 +       * Clear out the files of subsystems that should be removed, do
 +       * this before rebind_subsystems, since rebind_subsystems may
 +       * change this hierarchy's subsys_list.
 +       */
 +      cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 +
        ret = rebind_subsystems(root, opts.subsys_mask);
        if (ret) {
 +              /* rebind_subsystems failed, re-populate the removed files */
 +              cgroup_populate_dir(cgrp, false, removed_mask);
                drop_parsed_module_refcounts(opts.subsys_mask);
                goto out_unlock;
        }
  
 -      /* clear out any existing files and repopulate subsystem files */
 -      cgroup_clear_directory(cgrp->dentry, false, removed_mask);
        /* re-populate subsystem files */
        cgroup_populate_dir(cgrp, false, added_mask);
  
@@@ -1391,7 -1432,6 +1391,7 @@@ static void init_cgroup_housekeeping(st
        INIT_LIST_HEAD(&cgrp->children);
        INIT_LIST_HEAD(&cgrp->files);
        INIT_LIST_HEAD(&cgrp->css_sets);
 +      INIT_LIST_HEAD(&cgrp->allcg_node);
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
@@@ -1410,8 -1450,8 +1410,8 @@@ static void init_cgroup_root(struct cgr
        root->number_of_cgroups = 1;
        cgrp->root = root;
        cgrp->top_cgroup = cgrp;
 -      list_add_tail(&cgrp->allcg_node, &root->allcg_list);
        init_cgroup_housekeeping(cgrp);
 +      list_add_tail(&cgrp->allcg_node, &root->allcg_list);
  }
  
  static bool init_root_id(struct cgroupfs_root *root)
@@@ -1478,13 -1518,12 +1478,13 @@@ static struct cgroupfs_root *cgroup_roo
  
        root->subsys_mask = opts->subsys_mask;
        root->flags = opts->flags;
 +      ida_init(&root->cgroup_ida);
        if (opts->release_agent)
                strcpy(root->release_agent_path, opts->release_agent);
        if (opts->name)
                strcpy(root->name, opts->name);
 -      if (opts->clone_children)
 -              set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
 +      if (opts->cpuset_clone_children)
 +              set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
        return root;
  }
  
@@@ -1497,7 -1536,6 +1497,7 @@@ static void cgroup_drop_root(struct cgr
        spin_lock(&hierarchy_id_lock);
        ida_remove(&hierarchy_ida, root->hierarchy_id);
        spin_unlock(&hierarchy_id_lock);
 +      ida_destroy(&root->cgroup_ida);
        kfree(root);
  }
  
@@@ -1663,6 -1701,7 +1663,6 @@@ static struct dentry *cgroup_mount(stru
  
                free_cg_links(&tmp_cg_links);
  
 -              BUG_ON(!list_empty(&root_cgrp->sibling));
                BUG_ON(!list_empty(&root_cgrp->children));
                BUG_ON(root->number_of_cgroups != 1);
  
@@@ -1711,6 -1750,7 +1711,6 @@@ static void cgroup_kill_sb(struct super
  
        BUG_ON(root->number_of_cgroups != 1);
        BUG_ON(!list_empty(&cgrp->children));
 -      BUG_ON(!list_empty(&cgrp->sibling));
  
        mutex_lock(&cgroup_mutex);
        mutex_lock(&cgroup_root_mutex);
@@@ -1768,11 -1808,9 +1768,11 @@@ static struct kobject *cgroup_kobj
   */
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  {
 +      struct dentry *dentry = cgrp->dentry;
        char *start;
 -      struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
 -                                                    cgroup_lock_is_held());
 +
 +      rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
 +                         "cgroup_path() called without proper locking");
  
        if (!dentry || cgrp == dummytop) {
                /*
                return 0;
        }
  
 -      start = buf + buflen;
 +      start = buf + buflen - 1;
  
 -      *--start = '\0';
 +      *start = '\0';
        for (;;) {
                int len = dentry->d_name.len;
  
                if (!cgrp)
                        break;
  
 -              dentry = rcu_dereference_check(cgrp->dentry,
 -                                             cgroup_lock_is_held());
 +              dentry = cgrp->dentry;
                if (!cgrp->parent)
                        continue;
                if (--start < buf)
@@@ -1891,7 -1930,9 +1891,7 @@@ EXPORT_SYMBOL_GPL(cgroup_taskset_size)
  /*
   * cgroup_task_migrate - move a task from one cgroup to another.
   *
 - * 'guarantee' is set if the caller promises that a new css_set for the task
 - * will already exist. If not set, this function might sleep, and can fail with
 - * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
 + * Must be called with cgroup_mutex and threadgroup locked.
   */
  static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
                                struct task_struct *tsk, struct css_set *newcg)
@@@ -1984,6 -2025,12 +1984,6 @@@ int cgroup_attach_task(struct cgroup *c
        }
  
        synchronize_rcu();
 -
 -      /*
 -       * wake up rmdir() waiter. the rmdir should fail since the cgroup
 -       * is no longer empty.
 -       */
 -      cgroup_wakeup_rmdir_waiter(cgrp);
  out:
        if (retval) {
                for_each_subsys(root, ss) {
@@@ -2153,6 -2200,7 +2153,6 @@@ static int cgroup_attach_proc(struct cg
         * step 5: success! and cleanup
         */
        synchronize_rcu();
 -      cgroup_wakeup_rmdir_waiter(cgrp);
        retval = 0;
  out_put_css_set_refs:
        if (retval) {
@@@ -2663,17 -2711,10 +2663,17 @@@ static int cgroup_create_file(struct de
  
                /* start off with i_nlink == 2 (for "." entry) */
                inc_nlink(inode);
 +              inc_nlink(dentry->d_parent->d_inode);
  
 -              /* start with the directory inode held, so that we can
 -               * populate it without racing with another mkdir */
 -              mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
 +              /*
 +               * Control reaches here with cgroup_mutex held.
 +               * @inode->i_mutex should nest outside cgroup_mutex but we
 +               * want to populate it immediately without releasing
 +               * cgroup_mutex.  As @inode isn't visible to anyone else
 +               * yet, trylock will always succeed without affecting
 +               * lockdep checks.
 +               */
 +              WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
        } else if (S_ISREG(mode)) {
                inode->i_size = 0;
                inode->i_fop = &cgroup_file_operations;
        return 0;
  }
  
 -/*
 - * cgroup_create_dir - create a directory for an object.
 - * @cgrp: the cgroup we create the directory for. It must have a valid
 - *        ->parent field. And we are going to fill its ->dentry field.
 - * @dentry: dentry of the new cgroup
 - * @mode: mode to set on new directory.
 - */
 -static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
 -                              umode_t mode)
 -{
 -      struct dentry *parent;
 -      int error = 0;
 -
 -      parent = cgrp->parent->dentry;
 -      error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
 -      if (!error) {
 -              dentry->d_fsdata = cgrp;
 -              inc_nlink(parent->d_inode);
 -              rcu_assign_pointer(cgrp->dentry, dentry);
 -              dget(dentry);
 -      }
 -      dput(dentry);
 -
 -      return error;
 -}
 -
  /**
   * cgroup_file_mode - deduce file mode of a control file
   * @cft: the control file in question
@@@ -2724,6 -2791,12 +2724,6 @@@ static int cgroup_add_file(struct cgrou
  
        simple_xattrs_init(&cft->xattrs);
  
 -      /* does @cft->flags tell us to skip creation on @cgrp? */
 -      if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
 -              return 0;
 -      if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
 -              return 0;
 -
        if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
                strcpy(name, subsys->name);
                strcat(name, ".");
@@@ -2764,12 -2837,6 +2764,12 @@@ static int cgroup_addrm_files(struct cg
        int err, ret = 0;
  
        for (cft = cfts; cft->name[0] != '\0'; cft++) {
 +              /* does cft->flags tell us to skip this file on @cgrp? */
 +              if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
 +                      continue;
 +              if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
 +                      continue;
 +
                if (is_add)
                        err = cgroup_add_file(cgrp, subsys, cft);
                else
@@@ -2977,92 -3044,6 +2977,92 @@@ static void cgroup_enable_task_cg_lists
        write_unlock(&css_set_lock);
  }
  
 +/**
 + * cgroup_next_descendant_pre - find the next descendant for pre-order walk
 + * @pos: the current position (%NULL to initiate traversal)
 + * @cgroup: cgroup whose descendants to walk
 + *
 + * To be used by cgroup_for_each_descendant_pre().  Find the next
 + * descendant to visit for pre-order traversal of @cgroup's descendants.
 + */
 +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 +                                        struct cgroup *cgroup)
 +{
 +      struct cgroup *next;
 +
 +      WARN_ON_ONCE(!rcu_read_lock_held());
 +
 +      /* if first iteration, pretend we just visited @cgroup */
 +      if (!pos) {
 +              if (list_empty(&cgroup->children))
 +                      return NULL;
 +              pos = cgroup;
 +      }
 +
 +      /* visit the first child if one exists */
 +      next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
 +      if (next)
 +              return next;
 +
 +      /* no child, visit its own or the closest ancestor's next sibling */
 +      do {
 +              next = list_entry_rcu(pos->sibling.next, struct cgroup,
 +                                    sibling);
 +              if (&next->sibling != &pos->parent->children)
 +                      return next;
 +
 +              pos = pos->parent;
 +      } while (pos != cgroup);
 +
 +      return NULL;
 +}
 +EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
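
The walk above is normally driven through the cgroup_for_each_descendant_pre() helper referenced in its comment rather than by calling cgroup_next_descendant_pre() directly.  A minimal illustrative sketch, not part of this patch (the caller and its pr_debug() output are hypothetical), of a pre-order walk done under RCU as the WARN_ON_ONCE() above expects:

#include <linux/cgroup.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>

/* Hypothetical caller: log the path of every descendant of @root_cgrp. */
static void example_walk_descendants_pre(struct cgroup *root_cgrp)
{
	struct cgroup *pos;
	char buf[256];

	rcu_read_lock();
	cgroup_for_each_descendant_pre(pos, root_cgrp) {
		/* pre-order guarantees a parent is visited before its children */
		if (!cgroup_path(pos, buf, sizeof(buf)))
			pr_debug("cgroup: visiting %s\n", buf);
	}
	rcu_read_unlock();
}
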
 +
 +static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
 +{
 +      struct cgroup *last;
 +
 +      do {
 +              last = pos;
 +              pos = list_first_or_null_rcu(&pos->children, struct cgroup,
 +                                           sibling);
 +      } while (pos);
 +
 +      return last;
 +}
 +
 +/**
 + * cgroup_next_descendant_post - find the next descendant for post-order walk
 + * @pos: the current position (%NULL to initiate traversal)
 + * @cgroup: cgroup whose descendants to walk
 + *
 + * To be used by cgroup_for_each_descendant_post().  Find the next
 + * descendant to visit for post-order traversal of @cgroup's descendants.
 + */
 +struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
 +                                         struct cgroup *cgroup)
 +{
 +      struct cgroup *next;
 +
 +      WARN_ON_ONCE(!rcu_read_lock_held());
 +
 +      /* if first iteration, visit the leftmost descendant */
 +      if (!pos) {
 +              next = cgroup_leftmost_descendant(cgroup);
 +              return next != cgroup ? next : NULL;
 +      }
 +
 +      /* if there's an unvisited sibling, visit its leftmost descendant */
 +      next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
 +      if (&next->sibling != &pos->parent->children)
 +              return cgroup_leftmost_descendant(next);
 +
 +      /* no sibling left, visit parent */
 +      next = pos->parent;
 +      return next != cgroup ? next : NULL;
 +}
 +EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
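
The post-order variant visits every descendant before its parent, which is the natural order for tearing down per-cgroup state; it is likewise meant to be used through cgroup_for_each_descendant_post() under rcu_read_lock().  A hedged sketch (example_release_state() is a hypothetical per-controller helper, not an existing function):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

/* Hypothetical helper releasing whatever state a controller keeps per cgroup. */
static void example_release_state(struct cgroup *cgrp);

static void example_walk_descendants_post(struct cgroup *root_cgrp)
{
	struct cgroup *pos;

	rcu_read_lock();
	/* children are always visited before their parent here */
	cgroup_for_each_descendant_post(pos, root_cgrp)
		example_release_state(pos);
	rcu_read_unlock();
}
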
 +
  void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
        __acquires(css_set_lock)
  {
@@@ -3409,7 -3390,7 +3409,7 @@@ static struct cgroup_pidlist *cgroup_pi
  {
        struct cgroup_pidlist *l;
        /* don't need task_nsproxy() if we're looking at ourself */
-       struct pid_namespace *ns = current->nsproxy->pid_ns;
+       struct pid_namespace *ns = task_active_pid_ns(current);
  
        /*
         * We can't drop the pidlist_mutex before taking the l->mutex in case
@@@ -3776,7 -3757,7 +3776,7 @@@ static int cgroup_event_wake(wait_queue
        if (flags & POLLHUP) {
                __remove_wait_queue(event->wqh, &event->wait);
                spin_lock(&cgrp->event_list_lock);
 -              list_del(&event->list);
 +              list_del_init(&event->list);
                spin_unlock(&cgrp->event_list_lock);
                /*
                 * We are in atomic context, but cgroup_event_remove() may
@@@ -3913,7 -3894,7 +3913,7 @@@ fail
  static u64 cgroup_clone_children_read(struct cgroup *cgrp,
                                    struct cftype *cft)
  {
 -      return clone_children(cgrp);
 +      return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
  }
  
  static int cgroup_clone_children_write(struct cgroup *cgrp,
                                     u64 val)
  {
        if (val)
 -              set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
 +              set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
        else
 -              clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
 +              clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
        return 0;
  }
  
@@@ -4036,57 -4017,19 +4036,57 @@@ static void init_cgroup_css(struct cgro
        css->flags = 0;
        css->id = NULL;
        if (cgrp == dummytop)
 -              set_bit(CSS_ROOT, &css->flags);
 +              css->flags |= CSS_ROOT;
        BUG_ON(cgrp->subsys[ss->subsys_id]);
        cgrp->subsys[ss->subsys_id] = css;
  
        /*
 -       * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
 -       * which is put on the last css_put().  dput() requires process
 -       * context, which css_put() may be called without.  @css->dput_work
 -       * will be used to invoke dput() asynchronously from css_put().
 +       * css holds an extra ref to @cgrp->dentry which is put on the last
 +       * css_put().  dput() requires process context, which css_put() may
 +       * be called without.  @css->dput_work will be used to invoke
 +       * dput() asynchronously from css_put().
         */
        INIT_WORK(&css->dput_work, css_dput_fn);
 -      if (ss->__DEPRECATED_clear_css_refs)
 -              set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 +}
 +
 +/* invoke ->css_online() on a new CSS and mark it online if successful */
 +static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 +{
 +      int ret = 0;
 +
 +      lockdep_assert_held(&cgroup_mutex);
 +
 +      if (ss->css_online)
 +              ret = ss->css_online(cgrp);
 +      if (!ret)
 +              cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
 +      return ret;
 +}
 +
 +/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
 +static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 +      __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 +{
 +      struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 +
 +      lockdep_assert_held(&cgroup_mutex);
 +
 +      if (!(css->flags & CSS_ONLINE))
 +              return;
 +
 +      /*
 +       * css_offline() should be called with cgroup_mutex unlocked.  See
 +       * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
 +       * details.  This temporary unlocking should go away once
 +       * cgroup_mutex is unexported from controllers.
 +       */
 +      if (ss->css_offline) {
 +              mutex_unlock(&cgroup_mutex);
 +              ss->css_offline(cgrp);
 +              mutex_lock(&cgroup_mutex);
 +      }
 +
 +      cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
  }
  
  /*
@@@ -4106,27 -4049,10 +4106,27 @@@ static long cgroup_create(struct cgrou
        struct cgroup_subsys *ss;
        struct super_block *sb = root->sb;
  
 +      /* allocate the cgroup and its ID, 0 is reserved for the root */
        cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
        if (!cgrp)
                return -ENOMEM;
  
 +      cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
 +      if (cgrp->id < 0)
 +              goto err_free_cgrp;
 +
 +      /*
 +       * Only live parents can have children.  Note that the liveness
 +       * check isn't strictly necessary because cgroup_mkdir() and
 +       * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
 +       * anyway so that locking is contained inside cgroup proper and we
 +       * don't get nasty surprises if we ever grow another caller.
 +       */
 +      if (!cgroup_lock_live_group(parent)) {
 +              err = -ENODEV;
 +              goto err_free_id;
 +      }
 +
        /* Grab a reference on the superblock so the hierarchy doesn't
         * get deleted on unmount if there are child cgroups.  This
         * can be done outside cgroup_mutex, since the sb can't
         * fs */
        atomic_inc(&sb->s_active);
  
 -      mutex_lock(&cgroup_mutex);
 -
        init_cgroup_housekeeping(cgrp);
  
        cgrp->parent = parent;
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  
 -      if (clone_children(parent))
 -              set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
 +      if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
 +              set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
  
        for_each_subsys(root, ss) {
                struct cgroup_subsys_state *css;
  
 -              css = ss->create(cgrp);
 +              css = ss->css_alloc(cgrp);
                if (IS_ERR(css)) {
                        err = PTR_ERR(css);
 -                      goto err_destroy;
 +                      goto err_free_all;
                }
                init_cgroup_css(css, ss, cgrp);
                if (ss->use_id) {
                        err = alloc_css_id(ss, parent, cgrp);
                        if (err)
 -                              goto err_destroy;
 +                              goto err_free_all;
                }
 -              /* At error, ->destroy() callback has to free assigned ID. */
 -              if (clone_children(parent) && ss->post_clone)
 -                      ss->post_clone(cgrp);
 +      }
 +
 +      /*
 +       * Create directory.  cgroup_create_file() returns with the new
 +       * directory locked on success so that it can be populated without
 +       * dropping cgroup_mutex.
 +       */
 +      err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
 +      if (err < 0)
 +              goto err_free_all;
 +      lockdep_assert_held(&dentry->d_inode->i_mutex);
 +
 +      /* allocation complete, commit to creation */
 +      dentry->d_fsdata = cgrp;
 +      cgrp->dentry = dentry;
 +      list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 +      list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 +      root->number_of_cgroups++;
 +
 +      /* each css holds a ref to the cgroup's dentry */
 +      for_each_subsys(root, ss)
 +              dget(dentry);
 +
 +      /* creation succeeded, notify subsystems */
 +      for_each_subsys(root, ss) {
 +              err = online_css(ss, cgrp);
 +              if (err)
 +                      goto err_destroy;
  
                if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
                    parent->parent) {
                }
        }
  
 -      list_add(&cgrp->sibling, &cgrp->parent->children);
 -      root->number_of_cgroups++;
 -
 -      err = cgroup_create_dir(cgrp, dentry, mode);
 -      if (err < 0)
 -              goto err_remove;
 -
 -      /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
 -      for_each_subsys(root, ss)
 -              if (!ss->__DEPRECATED_clear_css_refs)
 -                      dget(dentry);
 -
 -      /* The cgroup directory was pre-locked for us */
 -      BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 -
 -      list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 -
        err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
 -      /* If err < 0, we have a half-filled directory - oh well ;) */
 +      if (err)
 +              goto err_destroy;
  
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
  
        return 0;
  
 - err_remove:
 -
 -      list_del(&cgrp->sibling);
 -      root->number_of_cgroups--;
 -
 - err_destroy:
 -
 +err_free_all:
        for_each_subsys(root, ss) {
                if (cgrp->subsys[ss->subsys_id])
 -                      ss->destroy(cgrp);
 +                      ss->css_free(cgrp);
        }
 -
        mutex_unlock(&cgroup_mutex);
 -
        /* Release the reference count that we took on the superblock */
        deactivate_super(sb);
 -
 +err_free_id:
 +      ida_simple_remove(&root->cgroup_ida, cgrp->id);
 +err_free_cgrp:
        kfree(cgrp);
        return err;
 +
 +err_destroy:
 +      cgroup_destroy_locked(cgrp);
 +      mutex_unlock(&cgroup_mutex);
 +      mutex_unlock(&dentry->d_inode->i_mutex);
 +      return err;
  }
  
  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@@ -4278,60 -4197,153 +4278,60 @@@ static int cgroup_has_css_refs(struct c
        return 0;
  }
  
 -/*
 - * Atomically mark all (or else none) of the cgroup's CSS objects as
 - * CSS_REMOVED. Return true on success, or false if the cgroup has
 - * busy subsystems. Call with cgroup_mutex held
 - *
 - * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
 - * not, cgroup removal behaves differently.
 - *
 - * If clear is set, css refcnt for the subsystem should be zero before
 - * cgroup removal can be committed.  This is implemented by
 - * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
 - * called multiple times until all css refcnts reach zero and is allowed to
 - * veto removal on any invocation.  This behavior is deprecated and will be
 - * removed as soon as the existing user (memcg) is updated.
 - *
 - * If clear is not set, each css holds an extra reference to the cgroup's
 - * dentry and cgroup removal proceeds regardless of css refs.
 - * ->pre_destroy() will be called at least once and is not allowed to fail.
 - * On the last put of each css, whenever that may be, the extra dentry ref
 - * is put so that dentry destruction happens only after all css's are
 - * released.
 - */
 -static int cgroup_clear_css_refs(struct cgroup *cgrp)
 +static int cgroup_destroy_locked(struct cgroup *cgrp)
 +      __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
 +      struct dentry *d = cgrp->dentry;
 +      struct cgroup *parent = cgrp->parent;
 +      DEFINE_WAIT(wait);
 +      struct cgroup_event *event, *tmp;
        struct cgroup_subsys *ss;
 -      unsigned long flags;
 -      bool failed = false;
 +      LIST_HEAD(tmp_list);
 +
 +      lockdep_assert_held(&d->d_inode->i_mutex);
 +      lockdep_assert_held(&cgroup_mutex);
  
 -      local_irq_save(flags);
 +      if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
 +              return -EBUSY;
  
        /*
 -       * Block new css_tryget() by deactivating refcnt.  If all refcnts
 -       * for subsystems w/ clear_css_refs set were 1 at the moment of
 -       * deactivation, we succeeded.
 +       * Block new css_tryget() by deactivating refcnt and mark @cgrp
 +       * removed.  This makes future css_tryget() and child creation
 +       * attempts fail thus maintaining the removal conditions verified
 +       * above.
         */
        for_each_subsys(cgrp->root, ss) {
                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
  
                WARN_ON(atomic_read(&css->refcnt) < 0);
                atomic_add(CSS_DEACT_BIAS, &css->refcnt);
 -
 -              if (ss->__DEPRECATED_clear_css_refs)
 -                      failed |= css_refcnt(css) != 1;
 -      }
 -
 -      /*
 -       * If succeeded, set REMOVED and put all the base refs; otherwise,
 -       * restore refcnts to positive values.  Either way, all in-progress
 -       * css_tryget() will be released.
 -       */
 -      for_each_subsys(cgrp->root, ss) {
 -              struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 -
 -              if (!failed) {
 -                      set_bit(CSS_REMOVED, &css->flags);
 -                      css_put(css);
 -              } else {
 -                      atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
 -              }
        }
 +      set_bit(CGRP_REMOVED, &cgrp->flags);
  
 -      local_irq_restore(flags);
 -      return !failed;
 -}
 -
 -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 -{
 -      struct cgroup *cgrp = dentry->d_fsdata;
 -      struct dentry *d;
 -      struct cgroup *parent;
 -      DEFINE_WAIT(wait);
 -      struct cgroup_event *event, *tmp;
 -      int ret;
 -
 -      /* the vfs holds both inode->i_mutex already */
 -again:
 -      mutex_lock(&cgroup_mutex);
 -      if (atomic_read(&cgrp->count) != 0) {
 -              mutex_unlock(&cgroup_mutex);
 -              return -EBUSY;
 -      }
 -      if (!list_empty(&cgrp->children)) {
 -              mutex_unlock(&cgroup_mutex);
 -              return -EBUSY;
 -      }
 -      mutex_unlock(&cgroup_mutex);
 -
 -      /*
 -       * In general, subsystem has no css->refcnt after pre_destroy(). But
 -       * in racy cases, subsystem may have to get css->refcnt after
 -       * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
 -       * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
 -       * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
 -       * and subsystem's reference count handling. Please see css_get/put
 -       * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
 -       */
 -      set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 +      /* tell subsystems to initiate destruction */
 +      for_each_subsys(cgrp->root, ss)
 +              offline_css(ss, cgrp);
  
        /*
 -       * Call pre_destroy handlers of subsys. Notify subsystems
 -       * that rmdir() request comes.
 +       * Put all the base refs.  Each css holds an extra reference to the
 +       * cgroup's dentry and cgroup removal proceeds regardless of css
 +       * refs.  On the last put of each css, whenever that may be, the
 +       * extra dentry ref is put so that dentry destruction happens only
 +       * after all css's are released.
         */
 -      ret = cgroup_call_pre_destroy(cgrp);
 -      if (ret) {
 -              clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 -              return ret;
 -      }
 -
 -      mutex_lock(&cgroup_mutex);
 -      parent = cgrp->parent;
 -      if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 -              clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 -              mutex_unlock(&cgroup_mutex);
 -              return -EBUSY;
 -      }
 -      prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
 -      if (!cgroup_clear_css_refs(cgrp)) {
 -              mutex_unlock(&cgroup_mutex);
 -              /*
 -               * Because someone may call cgroup_wakeup_rmdir_waiter() before
 -               * prepare_to_wait(), we need to check this flag.
 -               */
 -              if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
 -                      schedule();
 -              finish_wait(&cgroup_rmdir_waitq, &wait);
 -              clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 -              if (signal_pending(current))
 -                      return -EINTR;
 -              goto again;
 -      }
 -      /* NO css_tryget() can success after here. */
 -      finish_wait(&cgroup_rmdir_waitq, &wait);
 -      clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 +      for_each_subsys(cgrp->root, ss)
 +              css_put(cgrp->subsys[ss->subsys_id]);
  
        raw_spin_lock(&release_list_lock);
 -      set_bit(CGRP_REMOVED, &cgrp->flags);
        if (!list_empty(&cgrp->release_list))
                list_del_init(&cgrp->release_list);
        raw_spin_unlock(&release_list_lock);
  
        /* delete this cgroup from parent->children */
 -      list_del_init(&cgrp->sibling);
 -
 +      list_del_rcu(&cgrp->sibling);
        list_del_init(&cgrp->allcg_node);
  
 -      d = dget(cgrp->dentry);
 -
 +      dget(d);
        cgroup_d_remove_dir(d);
        dput(d);
  
        /*
         * Unregister events and notify userspace.
         * Notify userspace about cgroup removing only after rmdir of cgroup
 -       * directory to avoid race between userspace and kernelspace
 +       * directory to avoid race between userspace and kernelspace. Use
 +       * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
 +       * cgroup_event_wake() is called with the wait queue head locked,
 +       * remove_wait_queue() cannot be called while holding event_list_lock.
         */
        spin_lock(&cgrp->event_list_lock);
 -      list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
 -              list_del(&event->list);
 +      list_splice_init(&cgrp->event_list, &tmp_list);
 +      spin_unlock(&cgrp->event_list_lock);
 +      list_for_each_entry_safe(event, tmp, &tmp_list, list) {
 +              list_del_init(&event->list);
                remove_wait_queue(event->wqh, &event->wait);
                eventfd_signal(event->eventfd, 1);
                schedule_work(&event->remove);
        }
 -      spin_unlock(&cgrp->event_list_lock);
  
 -      mutex_unlock(&cgroup_mutex);
        return 0;
  }
  
 +static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 +{
 +      int ret;
 +
 +      mutex_lock(&cgroup_mutex);
 +      ret = cgroup_destroy_locked(dentry->d_fsdata);
 +      mutex_unlock(&cgroup_mutex);
 +
 +      return ret;
 +}
 +
  static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
  {
        INIT_LIST_HEAD(&ss->cftsets);
@@@ -4390,15 -4388,13 +4390,15 @@@ static void __init cgroup_init_subsys(s
  
        printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
  
 +      mutex_lock(&cgroup_mutex);
 +
        /* init base cftset */
        cgroup_init_cftsets(ss);
  
        /* Create the top cgroup state for this subsystem */
        list_add(&ss->sibling, &rootnode.subsys_list);
        ss->root = &rootnode;
 -      css = ss->create(dummytop);
 +      css = ss->css_alloc(dummytop);
        /* We don't handle early failures gracefully */
        BUG_ON(IS_ERR(css));
        init_cgroup_css(css, ss, dummytop);
         * pointer to this state - since the subsystem is
         * newly registered, all tasks and hence the
         * init_css_set is in the subsystem's top cgroup. */
 -      init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
 +      init_css_set.subsys[ss->subsys_id] = css;
  
        need_forkexit_callback |= ss->fork || ss->exit;
  
        BUG_ON(!list_empty(&init_task.tasks));
  
        ss->active = 1;
 +      BUG_ON(online_css(ss, dummytop));
 +
 +      mutex_unlock(&cgroup_mutex);
  
        /* this function shouldn't be used with modular subsystems, since they
         * need to register a subsys_id, among other things */
   */
  int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
  {
 -      int i;
        struct cgroup_subsys_state *css;
 +      int i, ret;
  
        /* check name and function validity */
        if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
 -          ss->create == NULL || ss->destroy == NULL)
 +          ss->css_alloc == NULL || ss->css_free == NULL)
                return -EINVAL;
  
        /*
        subsys[ss->subsys_id] = ss;
  
        /*
 -       * no ss->create seems to need anything important in the ss struct, so
 -       * this can happen first (i.e. before the rootnode attachment).
 +       * no ss->css_alloc seems to need anything important in the ss
 +       * struct, so this can happen first (i.e. before the rootnode
 +       * attachment).
         */
 -      css = ss->create(dummytop);
 +      css = ss->css_alloc(dummytop);
        if (IS_ERR(css)) {
                /* failure case - need to deassign the subsys[] slot. */
                subsys[ss->subsys_id] = NULL;
        init_cgroup_css(css, ss, dummytop);
        /* init_idr must be after init_cgroup_css because it sets css->id. */
        if (ss->use_id) {
 -              int ret = cgroup_init_idr(ss, css);
 -              if (ret) {
 -                      dummytop->subsys[ss->subsys_id] = NULL;
 -                      ss->destroy(dummytop);
 -                      subsys[ss->subsys_id] = NULL;
 -                      mutex_unlock(&cgroup_mutex);
 -                      return ret;
 -              }
 +              ret = cgroup_init_idr(ss, css);
 +              if (ret)
 +                      goto err_unload;
        }
  
        /*
        write_unlock(&css_set_lock);
  
        ss->active = 1;
 +      ret = online_css(ss, dummytop);
 +      if (ret)
 +              goto err_unload;
  
        /* success! */
        mutex_unlock(&cgroup_mutex);
        return 0;
 +
 +err_unload:
 +      mutex_unlock(&cgroup_mutex);
 +      /* @ss can't be mounted here as try_module_get() would fail */
 +      cgroup_unload_subsys(ss);
 +      return ret;
  }
  EXPORT_SYMBOL_GPL(cgroup_load_subsys);
  
@@@ -4564,15 -4552,6 +4564,15 @@@ void cgroup_unload_subsys(struct cgroup
        BUG_ON(ss->root != &rootnode);
  
        mutex_lock(&cgroup_mutex);
 +
 +      offline_css(ss, dummytop);
 +      ss->active = 0;
 +
 +      if (ss->use_id) {
 +              idr_remove_all(&ss->idr);
 +              idr_destroy(&ss->idr);
 +      }
 +
        /* deassign the subsys_id */
        subsys[ss->subsys_id] = NULL;
  
                struct css_set *cg = link->cg;
  
                hlist_del(&cg->hlist);
 -              BUG_ON(!cg->subsys[ss->subsys_id]);
                cg->subsys[ss->subsys_id] = NULL;
                hhead = css_set_hash(cg->subsys);
                hlist_add_head(&cg->hlist, hhead);
        write_unlock(&css_set_lock);
  
        /*
 -       * remove subsystem's css from the dummytop and free it - need to free
 -       * before marking as null because ss->destroy needs the cgrp->subsys
 -       * pointer to find their state. note that this also takes care of
 -       * freeing the css_id.
 +       * remove subsystem's css from the dummytop and free it - need to
 +       * free before marking as null because ss->css_free needs the
 +       * cgrp->subsys pointer to find their state. note that this also
 +       * takes care of freeing the css_id.
         */
 -      ss->destroy(dummytop);
 +      ss->css_free(dummytop);
        dummytop->subsys[ss->subsys_id] = NULL;
  
        mutex_unlock(&cgroup_mutex);
@@@ -4644,8 -4624,8 +4644,8 @@@ int __init cgroup_init_early(void
  
                BUG_ON(!ss->name);
                BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
 -              BUG_ON(!ss->create);
 -              BUG_ON(!ss->destroy);
 +              BUG_ON(!ss->css_alloc);
 +              BUG_ON(!ss->css_free);
                if (ss->subsys_id != i) {
                        printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
                               ss->name, ss->subsys_id);
@@@ -4851,20 -4831,45 +4851,20 @@@ void cgroup_fork(struct task_struct *ch
        INIT_LIST_HEAD(&child->cg_list);
  }
  
 -/**
 - * cgroup_fork_callbacks - run fork callbacks
 - * @child: the new task
 - *
 - * Called on a new task very soon before adding it to the
 - * tasklist. No need to take any locks since no-one can
 - * be operating on this task.
 - */
 -void cgroup_fork_callbacks(struct task_struct *child)
 -{
 -      if (need_forkexit_callback) {
 -              int i;
 -              for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 -                      struct cgroup_subsys *ss = subsys[i];
 -
 -                      /*
 -                       * forkexit callbacks are only supported for
 -                       * builtin subsystems.
 -                       */
 -                      if (!ss || ss->module)
 -                              continue;
 -
 -                      if (ss->fork)
 -                              ss->fork(child);
 -              }
 -      }
 -}
 -
  /**
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
   *
 - * Adds the task to the list running through its css_set if necessary.
 - * Has to be after the task is visible on the task list in case we race
 - * with the first call to cgroup_iter_start() - to guarantee that the
 - * new task ends up on its list.
 + * Adds the task to the list running through its css_set if necessary and
 + * calls the subsystem fork() callbacks.  Has to be after the task is
 + * visible on the task list in case we race with the first call to
 + * cgroup_iter_start() - to guarantee that the new task ends up on its
 + * list.
   */
  void cgroup_post_fork(struct task_struct *child)
  {
 +      int i;
 +
        /*
         * use_task_css_set_links is set to 1 before we walk the tasklist
         * under the tasklist_lock and we read it here after we added the child
                task_unlock(child);
                write_unlock(&css_set_lock);
        }
 +
 +      /*
 +       * Call ss->fork().  This must happen after @child is linked on
 +       * css_set; otherwise, @child might change state between ->fork()
 +       * and addition to css_set.
 +       */
 +      if (need_forkexit_callback) {
 +              for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 +                      struct cgroup_subsys *ss = subsys[i];
 +
 +                      /*
 +                       * fork/exit callbacks are supported only for
 +                       * builtin subsystems and we don't need further
 +                       * synchronization as they never go away.
 +                       */
 +                      if (!ss || ss->module)
 +                              continue;
 +
 +                      if (ss->fork)
 +                              ss->fork(child);
 +              }
 +      }
  }
 +
  /**
   * cgroup_exit - detach cgroup from exiting task
   * @tsk: pointer to task_struct of exiting process
@@@ -5040,17 -5022,15 +5040,17 @@@ static void check_for_release(struct cg
  /* Caller must verify that the css is not for root cgroup */
  bool __css_tryget(struct cgroup_subsys_state *css)
  {
 -      do {
 -              int v = css_refcnt(css);
 +      while (true) {
 +              int t, v;
  
 -              if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
 +              v = css_refcnt(css);
 +              t = atomic_cmpxchg(&css->refcnt, v, v + 1);
 +              if (likely(t == v))
                        return true;
 +              else if (t < 0)
 +                      return false;
                cpu_relax();
 -      } while (!test_bit(CSS_REMOVED, &css->flags));
 -
 -      return false;
 +      }
  }
  EXPORT_SYMBOL_GPL(__css_tryget);
  
@@@ -5069,9 -5049,11 +5069,9 @@@ void __css_put(struct cgroup_subsys_sta
                        set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
 -              cgroup_wakeup_rmdir_waiter(cgrp);
                break;
        case 0:
 -              if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
 -                      schedule_work(&css->dput_work);
 +              schedule_work(&css->dput_work);
                break;
        }
        rcu_read_unlock();
@@@ -5457,7 -5439,7 +5457,7 @@@ struct cgroup_subsys_state *cgroup_css_
  }
  
  #ifdef CONFIG_CGROUP_DEBUG
 -static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
 +static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
  {
        struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
  
        return css;
  }
  
 -static void debug_destroy(struct cgroup *cont)
 +static void debug_css_free(struct cgroup *cont)
  {
        kfree(cont->subsys[debug_subsys_id]);
  }
@@@ -5596,8 -5578,8 +5596,8 @@@ static struct cftype debug_files[] =  
  
  struct cgroup_subsys debug_subsys = {
        .name = "debug",
 -      .create = debug_create,
 -      .destroy = debug_destroy,
 +      .css_alloc = debug_css_alloc,
 +      .css_free = debug_css_free,
        .subsys_id = debug_subsys_id,
        .base_cftypes = debug_files,
  };
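
The debug controller above only needs the allocate/free pair; a hypothetical controller using the full renamed callback set from this series could be wired up as sketched below (the "example" subsystem and its example_subsys_id are made up for illustration, real IDs come from the SUBSYS() list in include/linux/cgroup_subsys.h):

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

/* example_subsys_id is hypothetical; it does not exist in the tree. */
static struct cgroup_subsys_state *example_css_alloc(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;

	css = kzalloc(sizeof(*css), GFP_KERNEL);
	return css ? css : ERR_PTR(-ENOMEM);
}

static int example_css_online(struct cgroup *cgrp)
{
	/* runs under cgroup_mutex; a failure here ends in cgroup_destroy_locked() */
	return 0;
}

static void example_css_offline(struct cgroup *cgrp)
{
	/* invoked with cgroup_mutex temporarily dropped, see offline_css() */
}

static void example_css_free(struct cgroup *cgrp)
{
	kfree(cgrp->subsys[example_subsys_id]);
}

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.subsys_id	= example_subsys_id,
	.css_alloc	= example_css_alloc,
	.css_online	= example_css_online,
	.css_offline	= example_css_offline,
	.css_free	= example_css_free,
};
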
diff --combined kernel/events/core.c
index f9ff5493171d83208b140d19f8276fe3908e670b,738f3564e83bface92dfd487c9187242026e351a..301079d06f24ebe44081a286766436de104a3a91
@@@ -6155,7 -6155,7 +6155,7 @@@ perf_event_alloc(struct perf_event_att
  
        event->parent           = parent_event;
  
-       event->ns               = get_pid_ns(current->nsproxy->pid_ns);
+       event->ns               = get_pid_ns(task_active_pid_ns(current));
        event->id               = atomic64_inc_return(&perf_event_id);
  
        event->state            = PERF_EVENT_STATE_INACTIVE;
@@@ -7434,7 -7434,7 +7434,7 @@@ unlock
  device_initcall(perf_event_sysfs_init);
  
  #ifdef CONFIG_CGROUP_PERF
 -static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
 +static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
  {
        struct perf_cgroup *jc;
  
        return &jc->css;
  }
  
 -static void perf_cgroup_destroy(struct cgroup *cont)
 +static void perf_cgroup_css_free(struct cgroup *cont)
  {
        struct perf_cgroup *jc;
        jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@@ -7492,8 -7492,8 +7492,8 @@@ static void perf_cgroup_exit(struct cgr
  struct cgroup_subsys perf_subsys = {
        .name           = "perf_event",
        .subsys_id      = perf_subsys_id,
 -      .create         = perf_cgroup_create,
 -      .destroy        = perf_cgroup_destroy,
 +      .css_alloc      = perf_cgroup_css_alloc,
 +      .css_free       = perf_cgroup_css_free,
        .exit           = perf_cgroup_exit,
        .attach         = perf_cgroup_attach,
  
diff --combined kernel/exit.c
index 50d2e93c36ea6ff421192e7fb0f92a3cb0df6e63,d7fe58db452709444a42e622c964e604ddd42302..b4df21937216e1704670d89e8ef8fe8aa9aee810
@@@ -72,18 -72,6 +72,6 @@@ static void __unhash_process(struct tas
                list_del_rcu(&p->tasks);
                list_del_init(&p->sibling);
                __this_cpu_dec(process_counts);
-               /*
-                * If we are the last child process in a pid namespace to be
-                * reaped, notify the reaper sleeping zap_pid_ns_processes().
-                */
-               if (IS_ENABLED(CONFIG_PID_NS)) {
-                       struct task_struct *parent = p->real_parent;
-                       if ((task_active_pid_ns(parent)->child_reaper == parent) &&
-                           list_empty(&parent->children) &&
-                           (parent->flags & PF_EXITING))
-                               wake_up_process(parent);
-               }
        }
        list_del_rcu(&p->thread_group);
  }
@@@ -322,6 -310,43 +310,6 @@@ kill_orphaned_pgrp(struct task_struct *
        }
  }
  
 -/**
 - * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
 - *
 - * If a kernel thread is launched as a result of a system call, or if
 - * it ever exits, it should generally reparent itself to kthreadd so it
 - * isn't in the way of other processes and is correctly cleaned up on exit.
 - *
 - * The various task state such as scheduling policy and priority may have
 - * been inherited from a user process, so we reset them to sane values here.
 - *
 - * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
 - */
 -static void reparent_to_kthreadd(void)
 -{
 -      write_lock_irq(&tasklist_lock);
 -
 -      ptrace_unlink(current);
 -      /* Reparent to init */
 -      current->real_parent = current->parent = kthreadd_task;
 -      list_move_tail(&current->sibling, &current->real_parent->children);
 -
 -      /* Set the exit signal to SIGCHLD so we signal init on exit */
 -      current->exit_signal = SIGCHLD;
 -
 -      if (task_nice(current) < 0)
 -              set_user_nice(current, 0);
 -      /* cpus_allowed? */
 -      /* rt_priority? */
 -      /* signals? */
 -      memcpy(current->signal->rlim, init_task.signal->rlim,
 -             sizeof(current->signal->rlim));
 -
 -      atomic_inc(&init_cred.usage);
 -      commit_creds(&init_cred);
 -      write_unlock_irq(&tasklist_lock);
 -}
 -
  void __set_special_pids(struct pid *pid)
  {
        struct task_struct *curr = current->group_leader;
                change_pid(curr, PIDTYPE_PGID, pid);
  }
  
 -static void set_special_pids(struct pid *pid)
 -{
 -      write_lock_irq(&tasklist_lock);
 -      __set_special_pids(pid);
 -      write_unlock_irq(&tasklist_lock);
 -}
 -
  /*
   * Let kernel threads use this to say that they allow a certain signal.
   * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@@ -372,6 -404,54 +360,6 @@@ int disallow_signal(int sig
  
  EXPORT_SYMBOL(disallow_signal);
  
 -/*
 - *    Put all the gunge required to become a kernel thread without
 - *    attached user resources in one place where it belongs.
 - */
 -
 -void daemonize(const char *name, ...)
 -{
 -      va_list args;
 -      sigset_t blocked;
 -
 -      va_start(args, name);
 -      vsnprintf(current->comm, sizeof(current->comm), name, args);
 -      va_end(args);
 -
 -      /*
 -       * If we were started as result of loading a module, close all of the
 -       * user space pages.  We don't need them, and if we didn't close them
 -       * they would be locked into memory.
 -       */
 -      exit_mm(current);
 -      /*
 -       * We don't want to get frozen, in case system-wide hibernation
 -       * or suspend transition begins right now.
 -       */
 -      current->flags |= (PF_NOFREEZE | PF_KTHREAD);
 -
 -      if (current->nsproxy != &init_nsproxy) {
 -              get_nsproxy(&init_nsproxy);
 -              switch_task_namespaces(current, &init_nsproxy);
 -      }
 -      set_special_pids(&init_struct_pid);
 -      proc_clear_tty(current);
 -
 -      /* Block and flush all signals */
 -      sigfillset(&blocked);
 -      sigprocmask(SIG_BLOCK, &blocked, NULL);
 -      flush_signals(current);
 -
 -      /* Become as one with the init task */
 -
 -      daemonize_fs_struct();
 -      daemonize_descriptors();
 -
 -      reparent_to_kthreadd();
 -}
 -
 -EXPORT_SYMBOL(daemonize);
 -
  #ifdef CONFIG_MM_OWNER
  /*
   * A task is exiting.   If it owned this mm, find a new owner for the mm.
@@@ -1094,11 -1174,11 +1082,11 @@@ static int wait_task_zombie(struct wait
                 * as other threads in the parent group can be right
                 * here reaping other children at the same time.
                 *
 -               * We use thread_group_times() to get times for the thread
 +               * We use thread_group_cputime_adjusted() to get times for the thread
                 * group, which consolidates times for all threads in the
                 * group including the group leader.
                 */
 -              thread_group_times(p, &tgutime, &tgstime);
 +              thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                spin_lock_irq(&p->real_parent->sighand->siglock);
                psig = p->real_parent->signal;
                sig = p->signal;
diff --combined kernel/fork.c
index 115d6c2e4cca0dda8601efe7c3b114f3c37859a3,38e53b87402c865e06e9a55e5319c297715b893c..c36c4e301efef7c92a39b35b71a67e72cc0fb365
@@@ -352,7 -352,6 +352,7 @@@ static int dup_mmap(struct mm_struct *m
        unsigned long charge;
        struct mempolicy *pol;
  
 +      uprobe_start_dup_mmap();
        down_write(&oldmm->mmap_sem);
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
@@@ -470,7 -469,6 +470,7 @@@ out
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
 +      uprobe_end_dup_mmap();
        return retval;
  fail_nomem_anon_vma_fork:
        mpol_put(pol);
@@@ -822,9 -820,6 +822,9 @@@ struct mm_struct *dup_mm(struct task_st
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        mm->pmd_huge_pte = NULL;
 +#endif
 +#ifdef CONFIG_NUMA_BALANCING
 +      mm->first_nid = NUMA_PTE_SCAN_INIT;
  #endif
        if (!mm_init(mm, tsk))
                goto fail_nomem;
@@@ -1044,8 -1039,6 +1044,6 @@@ static int copy_signal(unsigned long cl
        atomic_set(&sig->live, 1);
        atomic_set(&sig->sigcnt, 1);
        init_waitqueue_head(&sig->wait_chldexit);
-       if (clone_flags & CLONE_NEWPID)
-               sig->flags |= SIGNAL_UNKILLABLE;
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_LIST_HEAD(&sig->posix_timers);
@@@ -1132,6 -1125,7 +1130,6 @@@ static void posix_cpu_timers_init(struc
   */
  static struct task_struct *copy_process(unsigned long clone_flags,
                                        unsigned long stack_start,
 -                                      struct pt_regs *regs,
                                        unsigned long stack_size,
                                        int __user *child_tidptr,
                                        struct pid *pid,
  {
        int retval;
        struct task_struct *p;
 -      int cgroup_callbacks_done = 0;
  
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);
        p->utime = p->stime = p->gtime = 0;
        p->utimescaled = p->stimescaled = 0;
  #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 -      p->prev_utime = p->prev_stime = 0;
 +      p->prev_cputime.utime = p->prev_cputime.stime = 0;
  #endif
  #if defined(SPLIT_RSS_COUNTING)
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
 -      retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
 +      retval = copy_thread(clone_flags, stack_start, stack_size, p);
        if (retval)
                goto bad_fork_cleanup_io;
  
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
  
 -      /* Now that the task is set up, run cgroup callbacks if
 -       * necessary. We need to run them before the task is visible
 -       * on the tasklist. */
 -      cgroup_fork_callbacks(p);
 -      cgroup_callbacks_done = 1;
 -
        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);
  
                ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
  
                if (thread_group_leader(p)) {
-                       if (is_child_reaper(pid))
-                               p->nsproxy->pid_ns->child_reaper = p;
+                       if (is_child_reaper(pid)) {
+                               ns_of_pid(pid)->child_reaper = p;
+                               p->signal->flags |= SIGNAL_UNKILLABLE;
+                       }
  
                        p->signal->leader_pid = pid;
                        p->signal->tty = tty_kref_get(current->signal->tty);
@@@ -1473,8 -1476,6 +1473,6 @@@ bad_fork_cleanup_io
        if (p->io_context)
                exit_io_context(p);
  bad_fork_cleanup_namespaces:
-       if (unlikely(clone_flags & CLONE_NEWPID))
-               pid_ns_release_proc(p->nsproxy->pid_ns);
        exit_task_namespaces(p);
  bad_fork_cleanup_mm:
        if (p->mm)
@@@ -1500,7 -1501,7 +1498,7 @@@ bad_fork_cleanup_cgroup
  #endif
        if (clone_flags & CLONE_THREAD)
                threadgroup_change_end(current);
 -      cgroup_exit(p, cgroup_callbacks_done);
 +      cgroup_exit(p, 0);
        delayacct_tsk_free(p);
        module_put(task_thread_info(p)->exec_domain->module);
  bad_fork_cleanup_count:
@@@ -1512,6 -1513,12 +1510,6 @@@ fork_out
        return ERR_PTR(retval);
  }
  
 -noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
 -{
 -      memset(regs, 0, sizeof(struct pt_regs));
 -      return regs;
 -}
 -
  static inline void init_idle_pids(struct pid_link *links)
  {
        enum pid_type type;
  struct task_struct * __cpuinit fork_idle(int cpu)
  {
        struct task_struct *task;
 -      struct pt_regs regs;
 -
 -      task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
 -                          &init_struct_pid, 0);
 +      task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
        if (!IS_ERR(task)) {
                init_idle_pids(task->pids);
                init_idle(task, cpu);
   */
  long do_fork(unsigned long clone_flags,
              unsigned long stack_start,
 -            struct pt_regs *regs,
              unsigned long stack_size,
              int __user *parent_tidptr,
              int __user *child_tidptr)
         * Do some preliminary argument and permissions checking before we
         * actually start allocating stuff
         */
-       if (clone_flags & CLONE_NEWUSER) {
-               if (clone_flags & CLONE_THREAD)
+       if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
+               if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
                        return -EINVAL;
-               /* hopefully this check will go away when userns support is
-                * complete
-                */
-               if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
-                               !capable(CAP_SETGID))
-                       return -EPERM;
        }
  
        /*
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
 -      if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) {
 +      if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if ((clone_flags & CSIGNAL) != SIGCHLD)
                        trace = 0;
        }
  
 -      p = copy_process(clone_flags, stack_start, regs, stack_size,
 +      p = copy_process(clone_flags, stack_start, stack_size,
                         child_tidptr, NULL, trace);
        /*
         * Do this prior waking up the new thread - the thread pointer
   */
  pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
  {
 -      return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL,
 +      return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
                (unsigned long)arg, NULL, NULL);
  }
  #endif
  
 +#ifdef __ARCH_WANT_SYS_FORK
 +SYSCALL_DEFINE0(fork)
 +{
 +#ifdef CONFIG_MMU
 +      return do_fork(SIGCHLD, 0, 0, NULL, NULL);
 +#else
 +      /* can not support in nommu mode */
 +      return(-EINVAL);
 +#endif
 +}
 +#endif
 +
 +#ifdef __ARCH_WANT_SYS_VFORK
 +SYSCALL_DEFINE0(vfork)
 +{
 +      return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL);
 +}
 +#endif
 +
 +#ifdef __ARCH_WANT_SYS_CLONE
 +#ifdef CONFIG_CLONE_BACKWARDS
 +SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 +               int __user *, parent_tidptr,
 +               int, tls_val,
 +               int __user *, child_tidptr)
 +#elif defined(CONFIG_CLONE_BACKWARDS2)
 +SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
 +               int __user *, parent_tidptr,
 +               int __user *, child_tidptr,
 +               int, tls_val)
 +#else
 +SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 +               int __user *, parent_tidptr,
 +               int __user *, child_tidptr,
 +               int, tls_val)
 +#endif
 +{
 +      return do_fork(clone_flags, newsp, 0,
 +              parent_tidptr, child_tidptr);
 +}
 +#endif
 +
  #ifndef ARCH_MIN_MMSTRUCT_ALIGN
  #define ARCH_MIN_MMSTRUCT_ALIGN 0
  #endif
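
The generic fork/vfork/clone entry points above, together with the relaxed permission check in do_fork(), mean an unprivileged process can now request CLONE_NEWUSER (optionally combined with CLONE_NEWPID) directly from clone(2). The userspace sketch below is illustration only and not part of the patch; it assumes glibc's clone() wrapper, a downward-growing stack, and a kernel built with CONFIG_USER_NS.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (1024 * 1024)

static int child_fn(void *arg)
{
	/* First task in the new pid namespace: it sees itself as pid 1. */
	printf("child: pid inside new namespace = %d\n", (int)getpid());
	return 0;
}

int main(void)
{
	char *stack = malloc(STACK_SIZE);
	pid_t pid;

	if (!stack)
		return 1;
	/* Unprivileged since this series: the new user ns owns the new pid ns. */
	pid = clone(child_fn, stack + STACK_SIZE,
		    CLONE_NEWUSER | CLONE_NEWPID | SIGCHLD, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}
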
@@@ -1724,7 -1686,8 +1716,8 @@@ static int check_unshare_flags(unsigne
  {
        if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
                                CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-                               CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+                               CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+                               CLONE_NEWUSER|CLONE_NEWPID))
                return -EINVAL;
        /*
         * Not implemented, but pretend it works if there is nothing to
@@@ -1791,19 -1754,40 +1784,40 @@@ SYSCALL_DEFINE1(unshare, unsigned long
  {
        struct fs_struct *fs, *new_fs = NULL;
        struct files_struct *fd, *new_fd = NULL;
+       struct cred *new_cred = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
        int err;
  
-       err = check_unshare_flags(unshare_flags);
-       if (err)
-               goto bad_unshare_out;
+       /*
+        * If unsharing a user namespace, must also unshare the thread.
+        */
+       if (unshare_flags & CLONE_NEWUSER)
+               unshare_flags |= CLONE_THREAD;
+       /*
+        * If unsharing a pid namespace, must also unshare the thread.
+        */
+       if (unshare_flags & CLONE_NEWPID)
+               unshare_flags |= CLONE_THREAD;
+       /*
+        * If unsharing a thread from a thread group, must also unshare vm.
+        */
+       if (unshare_flags & CLONE_THREAD)
+               unshare_flags |= CLONE_VM;
+       /*
+        * If unsharing vm, must also unshare signal handlers.
+        */
+       if (unshare_flags & CLONE_VM)
+               unshare_flags |= CLONE_SIGHAND;
        /*
         * If unsharing namespace, must also unshare filesystem information.
         */
        if (unshare_flags & CLONE_NEWNS)
                unshare_flags |= CLONE_FS;
+       err = check_unshare_flags(unshare_flags);
+       if (err)
+               goto bad_unshare_out;
        /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
        err = unshare_fd(unshare_flags, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
-       err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+       err = unshare_userns(unshare_flags, &new_cred);
        if (err)
                goto bad_unshare_cleanup_fd;
+       err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+                                        new_cred, new_fs);
+       if (err)
+               goto bad_unshare_cleanup_cred;
  
-       if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+       if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
                }
  
                task_unlock(current);
+               if (new_cred) {
+                       /* Install the new user namespace */
+                       commit_creds(new_cred);
+                       new_cred = NULL;
+               }
        }
  
        if (new_nsproxy)
                put_nsproxy(new_nsproxy);
  
+ bad_unshare_cleanup_cred:
+       if (new_cred)
+               put_cred(new_cred);
  bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);
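
With the flag implications and the unshare_userns()/commit_creds() step added above, a single-threaded process can unshare its user namespace and then establish an identity mapping for itself. A minimal userspace sketch follows (illustration only, error handling trimmed); the one-line "0 <uid> 1" mapping relies on the "map your current uid" rule added in this series.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	uid_t outer = getuid();	/* capture our uid before leaving the old ns */
	char map[64];
	int fd, n;

	if (unshare(CLONE_NEWUSER) < 0) {
		perror("unshare");
		return 1;
	}
	/* Map uid 0 inside the new namespace to our uid outside it. */
	n = snprintf(map, sizeof(map), "0 %u 1\n", (unsigned)outer);
	fd = open("/proc/self/uid_map", O_WRONLY);
	if (fd < 0 || write(fd, map, n) != n) {
		perror("uid_map");
		return 1;
	}
	close(fd);
	printf("uid inside the namespace: %u\n", (unsigned)getuid());
	return 0;
}
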
diff --combined kernel/pid.c
index fd996c1ed9f891988607812abb95dc8820ab3751,3026ddae0a348fb644c6f4cfb3bc35057601ac1e..3e2cf8100acc84b23b5741603c44fc908e3d5068
@@@ -1,8 -1,8 +1,8 @@@
  /*
   * Generic pidhash and scalable, time-bounded PID allocator
   *
 - * (C) 2002-2003 William Irwin, IBM
 - * (C) 2004 William Irwin, Oracle
 + * (C) 2002-2003 Nadia Yvette Chambers, IBM
 + * (C) 2004 Nadia Yvette Chambers, Oracle
   * (C) 2002-2004 Ingo Molnar, Red Hat
   *
   * pid-structures are backing objects for tasks sharing a given ID to chain
@@@ -36,6 -36,7 +36,7 @@@
  #include <linux/pid_namespace.h>
  #include <linux/init_task.h>
  #include <linux/syscalls.h>
+ #include <linux/proc_fs.h>
  
  #define pid_hashfn(nr, ns)    \
        hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@@ -78,6 -79,8 +79,8 @@@ struct pid_namespace init_pid_ns = 
        .last_pid = 0,
        .level = 0,
        .child_reaper = &init_task,
+       .user_ns = &init_user_ns,
+       .proc_inum = PROC_PID_INIT_INO,
  };
  EXPORT_SYMBOL_GPL(init_pid_ns);
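
init_pid_ns now carries an owning user namespace and a fixed proc inode number (PROC_PID_INIT_INO). Because each pid namespace exposes a stable inode through /proc/<pid>/ns/pid, userspace can compare namespaces by inode number; a small sketch (assumption: both /proc entries are readable by the caller):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Returns 1 if both tasks share a pid namespace, 0 if not, -1 on error. */
static int same_pid_ns(pid_t a, pid_t b)
{
	char pa[64], pb[64];
	struct stat sa, sb;

	snprintf(pa, sizeof(pa), "/proc/%d/ns/pid", (int)a);
	snprintf(pb, sizeof(pb), "/proc/%d/ns/pid", (int)b);
	if (stat(pa, &sa) || stat(pb, &sb))
		return -1;
	return sa.st_ino == sb.st_ino;
}
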
  
@@@ -269,8 -272,24 +272,24 @@@ void free_pid(struct pid *pid
        unsigned long flags;
  
        spin_lock_irqsave(&pidmap_lock, flags);
-       for (i = 0; i <= pid->level; i++)
-               hlist_del_rcu(&pid->numbers[i].pid_chain);
+       for (i = 0; i <= pid->level; i++) {
+               struct upid *upid = pid->numbers + i;
+               struct pid_namespace *ns = upid->ns;
+               hlist_del_rcu(&upid->pid_chain);
+               switch(--ns->nr_hashed) {
+               case 1:
+                       /* When all that is left in the pid namespace
+                        * is the reaper, wake up the reaper.  The reaper
+                        * may be sleeping in zap_pid_ns_processes().
+                        */
+                       wake_up_process(ns->child_reaper);
+                       break;
+               case 0:
+                       ns->nr_hashed = -1;
+                       schedule_work(&ns->proc_work);
+                       break;
+               }
+       }
        spin_unlock_irqrestore(&pidmap_lock, flags);
  
        for (i = 0; i <= pid->level; i++)
@@@ -292,6 -311,7 +311,7 @@@ struct pid *alloc_pid(struct pid_namesp
                goto out;
  
        tmp = ns;
+       pid->level = ns->level;
        for (i = ns->level; i >= 0; i--) {
                nr = alloc_pidmap(tmp);
                if (nr < 0)
                tmp = tmp->parent;
        }
  
+       if (unlikely(is_child_reaper(pid))) {
+               if (pid_ns_prepare_proc(ns))
+                       goto out_free;
+       }
        get_pid_ns(ns);
-       pid->level = ns->level;
        atomic_set(&pid->count, 1);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);
  
        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
-       for ( ; upid >= pid->numbers; --upid)
+       if (ns->nr_hashed < 0)
+               goto out_unlock;
+       for ( ; upid >= pid->numbers; --upid) {
                hlist_add_head_rcu(&upid->pid_chain,
                                &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+               upid->ns->nr_hashed++;
+       }
        spin_unlock_irq(&pidmap_lock);
  
  out:
        return pid;
  
+ out_unlock:
+       spin_unlock_irq(&pidmap_lock);
  out_free:
        while (++i <= ns->level)
                free_pidmap(pid->numbers + i);
@@@ -344,7 -374,7 +374,7 @@@ EXPORT_SYMBOL_GPL(find_pid_ns)
  
  struct pid *find_vpid(int nr)
  {
-       return find_pid_ns(nr, current->nsproxy->pid_ns);
+       return find_pid_ns(nr, task_active_pid_ns(current));
  }
  EXPORT_SYMBOL_GPL(find_vpid);
  
@@@ -428,7 -458,7 +458,7 @@@ struct task_struct *find_task_by_pid_ns
  
  struct task_struct *find_task_by_vpid(pid_t vnr)
  {
-       return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+       return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
  }
  
  struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@@ -483,7 -513,7 +513,7 @@@ EXPORT_SYMBOL_GPL(pid_nr_ns)
  
  pid_t pid_vnr(struct pid *pid)
  {
-       return pid_nr_ns(pid, current->nsproxy->pid_ns);
+       return pid_nr_ns(pid, task_active_pid_ns(current));
  }
  EXPORT_SYMBOL_GPL(pid_vnr);
  
@@@ -494,7 -524,7 +524,7 @@@ pid_t __task_pid_nr_ns(struct task_stru
  
        rcu_read_lock();
        if (!ns)
-               ns = current->nsproxy->pid_ns;
+               ns = task_active_pid_ns(current);
        if (likely(pid_alive(task))) {
                if (type != PIDTYPE_PID)
                        task = task->group_leader;
@@@ -569,6 -599,7 +599,7 @@@ void __init pidmap_init(void
        /* Reserve PID 0. We never call free_pidmap(0) */
        set_bit(0, init_pid_ns.pidmap[0].page);
        atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+       init_pid_ns.nr_hashed = 1;
  
        init_pid_ns.pid_cachep = KMEM_CACHE(pid,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC);
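
Several lookups in this file switch from current->nsproxy->pid_ns to task_active_pid_ns(current). A sketch of that helper, roughly as it is defined elsewhere in this series, shows why: the namespace is derived from the task's attached struct pid rather than from its nsproxy, so it stays meaningful even while the nsproxy is being swapped or torn down.

/* Sketch of the helper relied on above (approximate, for illustration). */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	return ns_of_pid(task_pid(tsk));
}
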
diff --combined kernel/sched/core.c
index c1fb82104bfbc9405d0c782799173ecd388ceb9a,2f5eb1838b3eb5c2b1e569e8a0b2ac762203bc93..257002c13bb02acad92c74347e3b38ca3bc881b1
@@@ -72,7 -72,6 +72,7 @@@
  #include <linux/slab.h>
  #include <linux/init_task.h>
  #include <linux/binfmts.h>
 +#include <linux/context_tracking.h>
  
  #include <asm/switch_to.h>
  #include <asm/tlb.h>
@@@ -193,10 -192,23 +193,10 @@@ static void sched_feat_disable(int i) 
  static void sched_feat_enable(int i) { };
  #endif /* HAVE_JUMP_LABEL */
  
 -static ssize_t
 -sched_feat_write(struct file *filp, const char __user *ubuf,
 -              size_t cnt, loff_t *ppos)
 +static int sched_feat_set(char *cmp)
  {
 -      char buf[64];
 -      char *cmp;
 -      int neg = 0;
        int i;
 -
 -      if (cnt > 63)
 -              cnt = 63;
 -
 -      if (copy_from_user(&buf, ubuf, cnt))
 -              return -EFAULT;
 -
 -      buf[cnt] = 0;
 -      cmp = strstrip(buf);
 +      int neg = 0;
  
        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                }
        }
  
 +      return i;
 +}
 +
 +static ssize_t
 +sched_feat_write(struct file *filp, const char __user *ubuf,
 +              size_t cnt, loff_t *ppos)
 +{
 +      char buf[64];
 +      char *cmp;
 +      int i;
 +
 +      if (cnt > 63)
 +              cnt = 63;
 +
 +      if (copy_from_user(&buf, ubuf, cnt))
 +              return -EFAULT;
 +
 +      buf[cnt] = 0;
 +      cmp = strstrip(buf);
 +
 +      i = sched_feat_set(cmp);
        if (i == __SCHED_FEAT_NR)
                return -EINVAL;
  
@@@ -931,13 -922,6 +931,13 @@@ void check_preempt_curr(struct rq *rq, 
                rq->skip_clock_update = 1;
  }
  
 +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
 +
 +void register_task_migration_notifier(struct notifier_block *n)
 +{
 +      atomic_notifier_chain_register(&task_migration_notifier, n);
 +}
 +
  #ifdef CONFIG_SMP
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
        trace_sched_migrate_task(p, new_cpu);
  
        if (task_cpu(p) != new_cpu) {
 +              struct task_migration_notifier tmn;
 +
 +              if (p->sched_class->migrate_task_rq)
 +                      p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 +
 +              tmn.task = p;
 +              tmn.from_cpu = task_cpu(p);
 +              tmn.to_cpu = new_cpu;
 +
 +              atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
        }
  
        __set_task_cpu(p, new_cpu);
@@@ -1550,15 -1524,6 +1550,15 @@@ static void __sched_fork(struct task_st
        p->se.vruntime                  = 0;
        INIT_LIST_HEAD(&p->se.group_node);
  
 +/*
 + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 + * removed when useful for applications beyond shares distribution (e.g.
 + * load-balance).
 + */
 +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 +      p->se.avg.runnable_avg_period = 0;
 +      p->se.avg.runnable_avg_sum = 0;
 +#endif
  #ifdef CONFIG_SCHEDSTATS
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
  #endif
  #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
  #endif
 +
 +#ifdef CONFIG_NUMA_BALANCING
 +      if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
 +              p->mm->numa_next_scan = jiffies;
 +              p->mm->numa_next_reset = jiffies;
 +              p->mm->numa_scan_seq = 0;
 +      }
 +
 +      p->node_stamp = 0ULL;
 +      p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 +      p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
 +      p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 +      p->numa_work.next = &p->numa_work;
 +#endif /* CONFIG_NUMA_BALANCING */
  }
  
 +#ifdef CONFIG_NUMA_BALANCING
 +#ifdef CONFIG_SCHED_DEBUG
 +void set_numabalancing_state(bool enabled)
 +{
 +      if (enabled)
 +              sched_feat_set("NUMA");
 +      else
 +              sched_feat_set("NO_NUMA");
 +}
 +#else
 +__read_mostly bool numabalancing_enabled;
 +
 +void set_numabalancing_state(bool enabled)
 +{
 +      numabalancing_enabled = enabled;
 +}
 +#endif /* CONFIG_SCHED_DEBUG */
 +#endif /* CONFIG_NUMA_BALANCING */
 +
  /*
   * fork()/clone()-time setup:
   */
@@@ -1954,8 -1886,8 +1954,8 @@@ context_switch(struct rq *rq, struct ta
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
  #endif
  
 +      context_tracking_task_switch(prev, next);
        /* Here we just switch the register state and the stack. */
 -      rcu_switch(prev, next);
        switch_to(prev, next, prev);
  
        barrier();
@@@ -2979,7 -2911,7 +2979,7 @@@ asmlinkage void __sched schedule(void
  }
  EXPORT_SYMBOL(schedule);
  
 -#ifdef CONFIG_RCU_USER_QS
 +#ifdef CONFIG_CONTEXT_TRACKING
  asmlinkage void __sched schedule_user(void)
  {
        /*
         * we haven't yet exited the RCU idle mode. Do it here manually until
         * we find a better solution.
         */
 -      rcu_user_exit();
 +      user_exit();
        schedule();
 -      rcu_user_enter();
 +      user_enter();
  }
  #endif
  
@@@ -3095,7 -3027,7 +3095,7 @@@ asmlinkage void __sched preempt_schedul
        /* Catch callers which need to be fixed */
        BUG_ON(ti->preempt_count || !irqs_disabled());
  
 -      rcu_user_exit();
 +      user_exit();
        do {
                add_preempt_count(PREEMPT_ACTIVE);
                local_irq_enable();
@@@ -4097,8 -4029,14 +4097,14 @@@ long sched_setaffinity(pid_t pid, cons
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
-       if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
-               goto out_unlock;
+       if (!check_same_owner(p)) {
+               rcu_read_lock();
+               if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+                       rcu_read_unlock();
+                       goto out_unlock;
+               }
+               rcu_read_unlock();
+       }
  
        retval = security_task_setscheduler(p);
        if (retval)
@@@ -4542,7 -4480,6 +4548,7 @@@ static const char stat_nam[] = TASK_STA
  void sched_show_task(struct task_struct *p)
  {
        unsigned long free = 0;
 +      int ppid;
        unsigned state;
  
        state = p->state ? __ffs(p->state) + 1 : 0;
  #ifdef CONFIG_DEBUG_STACK_USAGE
        free = stack_not_used(p);
  #endif
 +      rcu_read_lock();
 +      ppid = task_pid_nr(rcu_dereference(p->real_parent));
 +      rcu_read_unlock();
        printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
 -              task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
 +              task_pid_nr(p), ppid,
                (unsigned long)task_thread_info(p)->flags);
  
        show_stack(p, NULL);
@@@ -7540,7 -7474,7 +7546,7 @@@ static inline struct task_group *cgroup
                            struct task_group, css);
  }
  
 -static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
 +static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
  {
        struct task_group *tg, *parent;
  
        return &tg->css;
  }
  
 -static void cpu_cgroup_destroy(struct cgroup *cgrp)
 +static void cpu_cgroup_css_free(struct cgroup *cgrp)
  {
        struct task_group *tg = cgroup_tg(cgrp);
  
@@@ -7917,8 -7851,8 +7923,8 @@@ static struct cftype cpu_files[] = 
  
  struct cgroup_subsys cpu_cgroup_subsys = {
        .name           = "cpu",
 -      .create         = cpu_cgroup_create,
 -      .destroy        = cpu_cgroup_destroy,
 +      .css_alloc      = cpu_cgroup_css_alloc,
 +      .css_free       = cpu_cgroup_css_free,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
  struct cpuacct root_cpuacct;
  
  /* create a new cpu accounting group */
 -static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
 +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
  {
        struct cpuacct *ca;
  
@@@ -7971,7 -7905,7 +7977,7 @@@ out
  }
  
  /* destroy an existing cpu accounting group */
 -static void cpuacct_destroy(struct cgroup *cgrp)
 +static void cpuacct_css_free(struct cgroup *cgrp)
  {
        struct cpuacct *ca = cgroup_ca(cgrp);
  
@@@ -8142,15 -8076,9 +8148,15 @@@ void cpuacct_charge(struct task_struct 
  
  struct cgroup_subsys cpuacct_subsys = {
        .name = "cpuacct",
 -      .create = cpuacct_create,
 -      .destroy = cpuacct_destroy,
 +      .css_alloc = cpuacct_css_alloc,
 +      .css_free = cpuacct_css_free,
        .subsys_id = cpuacct_subsys_id,
        .base_cftypes = files,
  };
  #endif        /* CONFIG_CGROUP_CPUACCT */
 +
 +void dump_cpu_task(int cpu)
 +{
 +      pr_info("Task dump for CPU %d:\n", cpu);
 +      sched_show_task(cpu_curr(cpu));
 +}
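
The scheduler hunks above also add an atomic task-migration notifier chain. Only register_task_migration_notifier() and the task/from_cpu/to_cpu fields of struct task_migration_notifier appear in the patch, so the consumer below is a hypothetical sketch with illustrative names.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/sched.h>

/* Hypothetical consumer: logs each cross-cpu migration at debug level. */
static int example_migration_notify(struct notifier_block *nb,
				    unsigned long action, void *data)
{
	struct task_migration_notifier *tmn = data;

	pr_debug("task %d migrating from cpu %d to cpu %d\n",
		 task_pid_nr(tmn->task), tmn->from_cpu, tmn->to_cpu);
	return NOTIFY_OK;
}

static struct notifier_block example_migration_nb = {
	.notifier_call = example_migration_notify,
};

static int __init example_migration_init(void)
{
	register_task_migration_notifier(&example_migration_nb);
	return 0;
}
late_initcall(example_migration_init);
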
diff --combined kernel/signal.c
index a49c7f36ceb3e595d98a437f0b3031a51cab305d,b2445d86f22691b34296796ee5f91cfe2c900b8d..580a91e634710b6dbbc75f328c3bbef549b999cb
@@@ -1159,9 -1159,8 +1159,9 @@@ static int send_signal(int sig, struct 
        return __send_signal(sig, info, t, group, from_ancestor_ns);
  }
  
 -static void print_fatal_signal(struct pt_regs *regs, int signr)
 +static void print_fatal_signal(int signr)
  {
 +      struct pt_regs *regs = signal_pt_regs();
        printk("%s/%d: potentially unexpected fatal signal %d.\n",
                current->comm, task_pid_nr(current), signr);
  
@@@ -1753,7 -1752,7 +1753,7 @@@ static void do_notify_parent_cldstop(st
         * see comment in do_notify_parent() about the following 4 lines
         */
        rcu_read_lock();
-       info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+       info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
        info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
        rcu_read_unlock();
  
@@@ -1909,7 -1908,7 +1909,7 @@@ static void ptrace_stop(int exit_code, 
                preempt_disable();
                read_unlock(&tasklist_lock);
                preempt_enable_no_resched();
 -              schedule();
 +              freezable_schedule();
        } else {
                /*
                 * By the time we got the lock, our tracer went away.
                read_unlock(&tasklist_lock);
        }
  
 -      /*
 -       * While in TASK_TRACED, we were considered "frozen enough".
 -       * Now that we woke up, it's crucial if we're supposed to be
 -       * frozen that we freeze now before running anything substantial.
 -       */
 -      try_to_freeze();
 -
        /*
         * We are back.  Now reacquire the siglock before touching
         * last_siginfo, so that we are sure to have synchronized with
@@@ -2086,7 -2092,7 +2086,7 @@@ static bool do_signal_stop(int signr
                }
  
                /* Now we don't run again until woken by SIGCONT or SIGKILL */
 -              schedule();
 +              freezable_schedule();
                return true;
        } else {
                /*
@@@ -2132,9 -2138,10 +2132,9 @@@ static void do_jobctl_trap(void
        }
  }
  
 -static int ptrace_signal(int signr, siginfo_t *info,
 -                       struct pt_regs *regs, void *cookie)
 +static int ptrace_signal(int signr, siginfo_t *info)
  {
 -      ptrace_signal_deliver(regs, cookie);
 +      ptrace_signal_deliver();
        /*
         * We do not check sig_kernel_stop(signr) but set this marker
         * unconditionally because we do not know whether debugger will
@@@ -2193,14 -2200,15 +2193,14 @@@ int get_signal_to_deliver(siginfo_t *in
        if (unlikely(uprobe_deny_signal()))
                return 0;
  
 -relock:
        /*
 -       * We'll jump back here after any time we were stopped in TASK_STOPPED.
 -       * While in TASK_STOPPED, we were considered "frozen enough".
 -       * Now that we woke up, it's crucial if we're supposed to be
 -       * frozen that we freeze now before running anything substantial.
 +       * Do this once, we can't return to user-mode if freezing() == T.
 +       * do_signal_stop() and ptrace_stop() do freezable_schedule() and
 +       * thus do not need another check after return.
         */
        try_to_freeze();
  
 +relock:
        spin_lock_irq(&sighand->siglock);
        /*
         * Every stopped thread goes here after wakeup. Check to see if
                        break; /* will return 0 */
  
                if (unlikely(current->ptrace) && signr != SIGKILL) {
 -                      signr = ptrace_signal(signr, info,
 -                                            regs, cookie);
 +                      signr = ptrace_signal(signr, info);
                        if (!signr)
                                continue;
                }
  
                if (sig_kernel_coredump(signr)) {
                        if (print_fatal_signals)
 -                              print_fatal_signal(regs, info->si_signo);
 +                              print_fatal_signal(info->si_signo);
                        /*
                         * If it was able to dump core, this kills all
                         * other threads in the group and synchronizes with
                         * first and our do_group_exit call below will use
                         * that value and ignore the one we pass it.
                         */
 -                      do_coredump(info, regs);
 +                      do_coredump(info);
                }
  
                /*
diff --combined security/yama/yama_lsm.c
index 2663145d1197a104b71f0e2feca175d21156ee00,0e72239aeb053ddf5b7c589aeeff9563a673321f..23414b93771f30ec82ccf76b6cfb49fbed27edef
@@@ -17,7 -17,6 +17,7 @@@
  #include <linux/ptrace.h>
  #include <linux/prctl.h>
  #include <linux/ratelimit.h>
 +#include <linux/workqueue.h>
  
  #define YAMA_SCOPE_DISABLED   0
  #define YAMA_SCOPE_RELATIONAL 1
@@@ -30,37 -29,12 +30,37 @@@ static int ptrace_scope = YAMA_SCOPE_RE
  struct ptrace_relation {
        struct task_struct *tracer;
        struct task_struct *tracee;
 +      bool invalid;
        struct list_head node;
 +      struct rcu_head rcu;
  };
  
  static LIST_HEAD(ptracer_relations);
  static DEFINE_SPINLOCK(ptracer_relations_lock);
  
 +static void yama_relation_cleanup(struct work_struct *work);
 +static DECLARE_WORK(yama_relation_work, yama_relation_cleanup);
 +
 +/**
 + * yama_relation_cleanup - remove invalid entries from the relation list
 + *
 + */
 +static void yama_relation_cleanup(struct work_struct *work)
 +{
 +      struct ptrace_relation *relation;
 +
 +      spin_lock(&ptracer_relations_lock);
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(relation, &ptracer_relations, node) {
 +              if (relation->invalid) {
 +                      list_del_rcu(&relation->node);
 +                      kfree_rcu(relation, rcu);
 +              }
 +      }
 +      rcu_read_unlock();
 +      spin_unlock(&ptracer_relations_lock);
 +}
 +
  /**
   * yama_ptracer_add - add/replace an exception for this tracer/tracee pair
   * @tracer: the task_struct of the process doing the ptrace
  static int yama_ptracer_add(struct task_struct *tracer,
                            struct task_struct *tracee)
  {
 -      int rc = 0;
 -      struct ptrace_relation *added;
 -      struct ptrace_relation *entry, *relation = NULL;
 +      struct ptrace_relation *relation, *added;
  
        added = kmalloc(sizeof(*added), GFP_KERNEL);
        if (!added)
                return -ENOMEM;
  
 -      spin_lock_bh(&ptracer_relations_lock);
 -      list_for_each_entry(entry, &ptracer_relations, node)
 -              if (entry->tracee == tracee) {
 -                      relation = entry;
 -                      break;
 +      added->tracee = tracee;
 +      added->tracer = tracer;
 +      added->invalid = false;
 +
 +      spin_lock(&ptracer_relations_lock);
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(relation, &ptracer_relations, node) {
 +              if (relation->invalid)
 +                      continue;
 +              if (relation->tracee == tracee) {
 +                      list_replace_rcu(&relation->node, &added->node);
 +                      kfree_rcu(relation, rcu);
 +                      goto out;
                }
 -      if (!relation) {
 -              relation = added;
 -              relation->tracee = tracee;
 -              list_add(&relation->node, &ptracer_relations);
        }
 -      relation->tracer = tracer;
  
 -      spin_unlock_bh(&ptracer_relations_lock);
 -      if (added != relation)
 -              kfree(added);
 +      list_add_rcu(&added->node, &ptracer_relations);
  
 -      return rc;
 +out:
 +      rcu_read_unlock();
 +      spin_unlock(&ptracer_relations_lock);
 +      return 0;
  }
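
The relation entries that yama_ptracer_add() now installs under RCU are created from userspace with prctl(PR_SET_PTRACER). A minimal sketch of the tracee side follows; the debugger_pid value and the crash-handler scenario are just examples, and PR_SET_PTRACER_ANY would allow any process instead.

#include <stdio.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <linux/prctl.h>

/* Called by the tracee: allow debugger_pid (e.g. a crash handler) to attach
 * even under YAMA_SCOPE_RELATIONAL. */
static int allow_ptracer(pid_t debugger_pid)
{
	if (prctl(PR_SET_PTRACER, (unsigned long)debugger_pid, 0, 0, 0) < 0) {
		perror("prctl(PR_SET_PTRACER)");
		return -1;
	}
	return 0;
}
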
  
  /**
  static void yama_ptracer_del(struct task_struct *tracer,
                             struct task_struct *tracee)
  {
 -      struct ptrace_relation *relation, *safe;
 +      struct ptrace_relation *relation;
 +      bool marked = false;
  
 -      spin_lock_bh(&ptracer_relations_lock);
 -      list_for_each_entry_safe(relation, safe, &ptracer_relations, node)
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(relation, &ptracer_relations, node) {
 +              if (relation->invalid)
 +                      continue;
                if (relation->tracee == tracee ||
                    (tracer && relation->tracer == tracer)) {
 -                      list_del(&relation->node);
 -                      kfree(relation);
 +                      relation->invalid = true;
 +                      marked = true;
                }
 -      spin_unlock_bh(&ptracer_relations_lock);
 +      }
 +      rcu_read_unlock();
 +
 +      if (marked)
 +              schedule_work(&yama_relation_work);
  }
  
  /**
@@@ -252,22 -217,21 +252,22 @@@ static int ptracer_exception_found(stru
        struct task_struct *parent = NULL;
        bool found = false;
  
 -      spin_lock_bh(&ptracer_relations_lock);
        rcu_read_lock();
        if (!thread_group_leader(tracee))
                tracee = rcu_dereference(tracee->group_leader);
 -      list_for_each_entry(relation, &ptracer_relations, node)
 +      list_for_each_entry_rcu(relation, &ptracer_relations, node) {
 +              if (relation->invalid)
 +                      continue;
                if (relation->tracee == tracee) {
                        parent = relation->tracer;
                        found = true;
                        break;
                }
 +      }
  
        if (found && (parent == NULL || task_is_descendant(parent, tracer)))
                rc = 1;
        rcu_read_unlock();
 -      spin_unlock_bh(&ptracer_relations_lock);
  
        return rc;
  }
@@@ -298,14 -262,18 +298,18 @@@ int yama_ptrace_access_check(struct tas
                        /* No additional restrictions. */
                        break;
                case YAMA_SCOPE_RELATIONAL:
+                       rcu_read_lock();
                        if (!task_is_descendant(current, child) &&
                            !ptracer_exception_found(current, child) &&
-                           !ns_capable(task_user_ns(child), CAP_SYS_PTRACE))
+                           !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
                                rc = -EPERM;
+                       rcu_read_unlock();
                        break;
                case YAMA_SCOPE_CAPABILITY:
-                       if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE))
+                       rcu_read_lock();
+                       if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
                                rc = -EPERM;
+                       rcu_read_unlock();
                        break;
                case YAMA_SCOPE_NO_ATTACH:
                default:
@@@ -343,8 -311,10 +347,10 @@@ int yama_ptrace_traceme(struct task_str
        /* Only disallow PTRACE_TRACEME on more aggressive settings. */
        switch (ptrace_scope) {
        case YAMA_SCOPE_CAPABILITY:
-               if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE))
+               rcu_read_lock();
+               if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE))
                        rc = -EPERM;
+               rcu_read_unlock();
                break;
        case YAMA_SCOPE_NO_ATTACH:
                rc = -EPERM;