void mconsole_proc(struct mc_request *req)
{
- struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
+ struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
char *buf;
int len;
struct file *file;
struct task_struct *from = current, *to = arg;
to->thread.saved_task = from;
- rcu_switch(from, to);
+ rcu_user_hooks_switch(from, to);
switch_to(from, to, from);
}
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <asm/cacheflush.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+ #include <linux/pid_namespace.h>
#include "binder.h"
+#include "binder_trace.h"
-static DEFINE_MUTEX(binder_lock);
+static DEFINE_MUTEX(binder_main_lock);
static DEFINE_MUTEX(binder_deferred_lock);
static DEFINE_MUTEX(binder_mmap_lock);
return retval;
}
+static inline void binder_lock(const char *tag)
+{
+ trace_binder_lock(tag);
+ mutex_lock(&binder_main_lock);
+ trace_binder_locked(tag);
+}
+
+static inline void binder_unlock(const char *tag)
+{
+ trace_binder_unlock(tag);
+ mutex_unlock(&binder_main_lock);
+}
+
static void binder_set_nice(long nice)
{
long min_nice;
}
min_nice = 20 - current->signal->rlim[RLIMIT_NICE].rlim_cur;
binder_debug(BINDER_DEBUG_PRIORITY_CAP,
- "binder: %d: nice value %ld not allowed use "
- "%ld instead\n", current->pid, nice, min_nice);
+ "%d: nice value %ld not allowed use %ld instead\n",
+ current->pid, nice, min_nice);
set_user_nice(current, min_nice);
if (min_nice < 20)
return;
- binder_user_error("binder: %d RLIMIT_NICE not set\n", current->pid);
+ binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
}
static size_t binder_buffer_size(struct binder_proc *proc,
new_buffer_size = binder_buffer_size(proc, new_buffer);
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: add free buffer, size %zd, "
- "at %p\n", proc->pid, new_buffer_size, new_buffer);
+ "%d: add free buffer, size %zd, at %p\n",
+ proc->pid, new_buffer_size, new_buffer);
while (*p) {
parent = *p;
struct mm_struct *mm;
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: %s pages %p-%p\n", proc->pid,
+ "%d: %s pages %p-%p\n", proc->pid,
allocate ? "allocate" : "free", start, end);
if (end <= start)
return 0;
+ trace_binder_update_page_range(proc, allocate, start, end);
+
if (vma)
mm = NULL;
else
down_write(&mm->mmap_sem);
vma = proc->vma;
if (vma && mm != proc->vma_vm_mm) {
- pr_err("binder: %d: vma mm and task mm mismatch\n",
+ pr_err("%d: vma mm and task mm mismatch\n",
proc->pid);
vma = NULL;
}
goto free_range;
if (vma == NULL) {
- pr_err("binder: %d: binder_alloc_buf failed to "
- "map pages in userspace, no vma\n", proc->pid);
+ pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n",
+ proc->pid);
goto err_no_vma;
}
BUG_ON(*page);
*page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
if (*page == NULL) {
- pr_err("binder: %d: binder_alloc_buf failed "
- "for page at %p\n", proc->pid, page_addr);
+ pr_err("%d: binder_alloc_buf failed for page at %p\n",
+ proc->pid, page_addr);
goto err_alloc_page_failed;
}
tmp_area.addr = page_addr;
page_array_ptr = page;
ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr);
if (ret) {
- pr_err("binder: %d: binder_alloc_buf failed "
- "to map page at %p in kernel\n",
+ pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n",
proc->pid, page_addr);
goto err_map_kernel_failed;
}
(uintptr_t)page_addr + proc->user_buffer_offset;
ret = vm_insert_page(vma, user_page_addr, page[0]);
if (ret) {
- pr_err("binder: %d: binder_alloc_buf failed "
- "to map page at %lx in userspace\n",
+ pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n",
proc->pid, user_page_addr);
goto err_vm_insert_page_failed;
}
size_t size;
if (proc->vma == NULL) {
- pr_err("binder: %d: binder_alloc_buf, no vma\n",
+ pr_err("%d: binder_alloc_buf, no vma\n",
proc->pid);
return NULL;
}
ALIGN(offsets_size, sizeof(void *));
if (size < data_size || size < offsets_size) {
- binder_user_error("binder: %d: got transaction with invalid "
- "size %zd-%zd\n", proc->pid, data_size, offsets_size);
+ binder_user_error("%d: got transaction with invalid size %zd-%zd\n",
+ proc->pid, data_size, offsets_size);
return NULL;
}
if (is_async &&
proc->free_async_space < size + sizeof(struct binder_buffer)) {
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: binder_alloc_buf size %zd"
- "failed, no async space left\n", proc->pid, size);
+ "%d: binder_alloc_buf size %zd failed, no async space left\n",
+ proc->pid, size);
return NULL;
}
}
}
if (best_fit == NULL) {
- pr_err("binder: %d: binder_alloc_buf size %zd failed, "
- "no address space\n", proc->pid, size);
+ pr_err("%d: binder_alloc_buf size %zd failed, no address space\n",
+ proc->pid, size);
return NULL;
}
if (n == NULL) {
}
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: binder_alloc_buf size %zd got buff"
- "er %p size %zd\n", proc->pid, size, buffer, buffer_size);
+ "%d: binder_alloc_buf size %zd got buffer %p size %zd\n",
+ proc->pid, size, buffer, buffer_size);
has_page_addr =
(void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK);
binder_insert_free_buffer(proc, new_buffer);
}
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: binder_alloc_buf size %zd got "
- "%p\n", proc->pid, size, buffer);
+ "%d: binder_alloc_buf size %zd got %p\n",
+ proc->pid, size, buffer);
buffer->data_size = data_size;
buffer->offsets_size = offsets_size;
buffer->async_transaction = is_async;
if (is_async) {
proc->free_async_space -= size + sizeof(struct binder_buffer);
binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
- "binder: %d: binder_alloc_buf size %zd "
- "async free %zd\n", proc->pid, size,
- proc->free_async_space);
+ "%d: binder_alloc_buf size %zd async free %zd\n",
+ proc->pid, size, proc->free_async_space);
}
return buffer;
if (buffer_end_page(prev) == buffer_end_page(buffer))
free_page_end = 0;
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: merge free, buffer %p "
- "share page with %p\n", proc->pid, buffer, prev);
+ "%d: merge free, buffer %p share page with %p\n",
+ proc->pid, buffer, prev);
}
if (!list_is_last(&buffer->entry, &proc->buffers)) {
buffer_start_page(buffer))
free_page_start = 0;
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: merge free, buffer"
- " %p share page with %p\n", proc->pid,
- buffer, prev);
+ "%d: merge free, buffer %p share page with %p\n",
+ proc->pid, buffer, prev);
}
}
list_del(&buffer->entry);
if (free_page_start || free_page_end) {
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: merge free, buffer %p do "
- "not share page%s%s with with %p or %p\n",
+ "%d: merge free, buffer %p do not share page%s%s with with %p or %p\n",
proc->pid, buffer, free_page_start ? "" : " end",
free_page_end ? "" : " start", prev, next);
binder_update_page_range(proc, 0, free_page_start ?
ALIGN(buffer->offsets_size, sizeof(void *));
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: binder_free_buf %p size %zd buffer"
- "_size %zd\n", proc->pid, buffer, size, buffer_size);
+ "%d: binder_free_buf %p size %zd buffer_size %zd\n",
+ proc->pid, buffer, size, buffer_size);
BUG_ON(buffer->free);
BUG_ON(size > buffer_size);
proc->free_async_space += size + sizeof(struct binder_buffer);
binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC,
- "binder: %d: binder_free_buf size %zd "
- "async free %zd\n", proc->pid, size,
- proc->free_async_space);
+ "%d: binder_free_buf size %zd async free %zd\n",
+ proc->pid, size, proc->free_async_space);
}
binder_update_page_range(proc, 0,
INIT_LIST_HEAD(&node->work.entry);
INIT_LIST_HEAD(&node->async_todo);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "binder: %d:%d node %d u%p c%p created\n",
+ "%d:%d node %d u%p c%p created\n",
proc->pid, current->pid, node->debug_id,
node->ptr, node->cookie);
return node;
node->internal_strong_refs == 0 &&
!(node == binder_context_mgr_node &&
node->has_strong_ref)) {
- pr_err("binder: invalid inc strong "
- "node for %d\n", node->debug_id);
+ pr_err("invalid inc strong node for %d\n",
+ node->debug_id);
return -EINVAL;
}
node->internal_strong_refs++;
node->local_weak_refs++;
if (!node->has_weak_ref && list_empty(&node->work.entry)) {
if (target_list == NULL) {
- pr_err("binder: invalid inc weak node "
- "for %d\n", node->debug_id);
+ pr_err("invalid inc weak node for %d\n",
+ node->debug_id);
return -EINVAL;
}
list_add_tail(&node->work.entry, target_list);
if (node->proc) {
rb_erase(&node->rb_node, &node->proc->nodes);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "binder: refless node %d deleted\n",
+ "refless node %d deleted\n",
node->debug_id);
} else {
hlist_del(&node->dead_node);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "binder: dead node %d deleted\n",
+ "dead node %d deleted\n",
node->debug_id);
}
kfree(node);
hlist_add_head(&new_ref->node_entry, &node->refs);
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "binder: %d new ref %d desc %d for "
- "node %d\n", proc->pid, new_ref->debug_id,
- new_ref->desc, node->debug_id);
+ "%d new ref %d desc %d for node %d\n",
+ proc->pid, new_ref->debug_id, new_ref->desc,
+ node->debug_id);
} else {
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "binder: %d new ref %d desc %d for "
- "dead node\n", proc->pid, new_ref->debug_id,
- new_ref->desc);
+ "%d new ref %d desc %d for dead node\n",
+ proc->pid, new_ref->debug_id, new_ref->desc);
}
return new_ref;
}
static void binder_delete_ref(struct binder_ref *ref)
{
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "binder: %d delete ref %d desc %d for "
- "node %d\n", ref->proc->pid, ref->debug_id,
- ref->desc, ref->node->debug_id);
+ "%d delete ref %d desc %d for node %d\n",
+ ref->proc->pid, ref->debug_id, ref->desc,
+ ref->node->debug_id);
rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc);
rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node);
binder_dec_node(ref->node, 0, 1);
if (ref->death) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
- "binder: %d delete ref %d desc %d "
- "has death notification\n", ref->proc->pid,
- ref->debug_id, ref->desc);
+ "%d delete ref %d desc %d has death notification\n",
+ ref->proc->pid, ref->debug_id, ref->desc);
list_del(&ref->death->work.entry);
kfree(ref->death);
binder_stats_deleted(BINDER_STAT_DEATH);
{
if (strong) {
if (ref->strong == 0) {
- binder_user_error("binder: %d invalid dec strong, "
- "ref %d desc %d s %d w %d\n",
+ binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n",
ref->proc->pid, ref->debug_id,
ref->desc, ref->strong, ref->weak);
return -EINVAL;
}
} else {
if (ref->weak == 0) {
- binder_user_error("binder: %d invalid dec weak, "
- "ref %d desc %d s %d w %d\n",
+ binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n",
ref->proc->pid, ref->debug_id,
ref->desc, ref->strong, ref->weak);
return -EINVAL;
}
if (target_thread->return_error == BR_OK) {
binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "binder: send failed reply for "
- "transaction %d to %d:%d\n",
+ "send failed reply for transaction %d to %d:%d\n",
t->debug_id, target_thread->proc->pid,
target_thread->pid);
target_thread->return_error = error_code;
wake_up_interruptible(&target_thread->wait);
} else {
- pr_err("binder: reply failed, target "
- "thread, %d:%d, has error code %d "
- "already\n", target_thread->proc->pid,
+ pr_err("reply failed, target thread, %d:%d, has error code %d already\n",
+ target_thread->proc->pid,
target_thread->pid,
target_thread->return_error);
}
struct binder_transaction *next = t->from_parent;
binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "binder: send failed reply "
- "for transaction %d, target dead\n",
+ "send failed reply for transaction %d, target dead\n",
t->debug_id);
binder_pop_transaction(target_thread, t);
if (next == NULL) {
binder_debug(BINDER_DEBUG_DEAD_BINDER,
- "binder: reply failed,"
- " no target thread at root\n");
+ "reply failed, no target thread at root\n");
return;
}
t = next;
binder_debug(BINDER_DEBUG_DEAD_BINDER,
- "binder: reply failed, no target "
- "thread -- retry %d\n", t->debug_id);
+ "reply failed, no target thread -- retry %d\n",
+ t->debug_id);
}
}
}
int debug_id = buffer->debug_id;
binder_debug(BINDER_DEBUG_TRANSACTION,
- "binder: %d buffer release %d, size %zd-%zd, failed at %p\n",
+ "%d buffer release %d, size %zd-%zd, failed at %p\n",
proc->pid, buffer->debug_id,
buffer->data_size, buffer->offsets_size, failed_at);
if (*offp > buffer->data_size - sizeof(*fp) ||
buffer->data_size < sizeof(*fp) ||
!IS_ALIGNED(*offp, sizeof(void *))) {
- pr_err("binder: transaction release %d bad"
- "offset %zd, size %zd\n", debug_id,
- *offp, buffer->data_size);
+ pr_err("transaction release %d bad offset %zd, size %zd\n",
+ debug_id, *offp, buffer->data_size);
continue;
}
fp = (struct flat_binder_object *)(buffer->data + *offp);
case BINDER_TYPE_WEAK_BINDER: {
struct binder_node *node = binder_get_node(proc, fp->binder);
if (node == NULL) {
- pr_err("binder: transaction release %d"
- " bad node %p\n", debug_id, fp->binder);
+ pr_err("transaction release %d bad node %p\n",
+ debug_id, fp->binder);
break;
}
binder_debug(BINDER_DEBUG_TRANSACTION,
case BINDER_TYPE_WEAK_HANDLE: {
struct binder_ref *ref = binder_get_ref(proc, fp->handle);
if (ref == NULL) {
- pr_err("binder: transaction release %d"
- " bad handle %ld\n", debug_id,
- fp->handle);
+ pr_err("transaction release %d bad handle %ld\n",
+ debug_id, fp->handle);
break;
}
binder_debug(BINDER_DEBUG_TRANSACTION,
break;
default:
- pr_err("binder: transaction release %d bad "
- "object type %lx\n", debug_id, fp->type);
+ pr_err("transaction release %d bad object type %lx\n",
+ debug_id, fp->type);
break;
}
}
if (reply) {
in_reply_to = thread->transaction_stack;
if (in_reply_to == NULL) {
- binder_user_error("binder: %d:%d got reply transaction "
- "with no transaction stack\n",
+ binder_user_error("%d:%d got reply transaction with no transaction stack\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
goto err_empty_call_stack;
}
binder_set_nice(in_reply_to->saved_priority);
if (in_reply_to->to_thread != thread) {
- binder_user_error("binder: %d:%d got reply transaction "
- "with bad transaction stack,"
- " transaction %d has target %d:%d\n",
+ binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n",
proc->pid, thread->pid, in_reply_to->debug_id,
in_reply_to->to_proc ?
in_reply_to->to_proc->pid : 0,
goto err_dead_binder;
}
if (target_thread->transaction_stack != in_reply_to) {
- binder_user_error("binder: %d:%d got reply transaction "
- "with bad target transaction stack %d, "
- "expected %d\n",
+ binder_user_error("%d:%d got reply transaction with bad target transaction stack %d, expected %d\n",
proc->pid, thread->pid,
target_thread->transaction_stack ?
target_thread->transaction_stack->debug_id : 0,
struct binder_ref *ref;
ref = binder_get_ref(proc, tr->target.handle);
if (ref == NULL) {
- binder_user_error("binder: %d:%d got "
- "transaction to invalid handle\n",
+ binder_user_error("%d:%d got transaction to invalid handle\n",
proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
goto err_invalid_target_handle;
struct binder_transaction *tmp;
tmp = thread->transaction_stack;
if (tmp->to_thread != thread) {
- binder_user_error("binder: %d:%d got new "
- "transaction with bad transaction stack"
- ", transaction %d has target %d:%d\n",
+ binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n",
proc->pid, thread->pid, tmp->debug_id,
tmp->to_proc ? tmp->to_proc->pid : 0,
tmp->to_thread ?
if (reply)
binder_debug(BINDER_DEBUG_TRANSACTION,
- "binder: %d:%d BC_REPLY %d -> %d:%d, "
- "data %p-%p size %zd-%zd\n",
+ "%d:%d BC_REPLY %d -> %d:%d, data %p-%p size %zd-%zd\n",
proc->pid, thread->pid, t->debug_id,
target_proc->pid, target_thread->pid,
tr->data.ptr.buffer, tr->data.ptr.offsets,
tr->data_size, tr->offsets_size);
else
binder_debug(BINDER_DEBUG_TRANSACTION,
- "binder: %d:%d BC_TRANSACTION %d -> "
- "%d - node %d, data %p-%p size %zd-%zd\n",
+ "%d:%d BC_TRANSACTION %d -> %d - node %d, data %p-%p size %zd-%zd\n",
proc->pid, thread->pid, t->debug_id,
target_proc->pid, target_node->debug_id,
tr->data.ptr.buffer, tr->data.ptr.offsets,
t->code = tr->code;
t->flags = tr->flags;
t->priority = task_nice(current);
+
+ trace_binder_transaction(reply, t, target_node);
+
t->buffer = binder_alloc_buf(target_proc, tr->data_size,
tr->offsets_size, !reply && (t->flags & TF_ONE_WAY));
if (t->buffer == NULL) {
t->buffer->debug_id = t->debug_id;
t->buffer->transaction = t;
t->buffer->target_node = target_node;
+ trace_binder_transaction_alloc_buf(t->buffer);
if (target_node)
binder_inc_node(target_node, 1, 0, NULL);
offp = (size_t *)(t->buffer->data + ALIGN(tr->data_size, sizeof(void *)));
if (copy_from_user(t->buffer->data, tr->data.ptr.buffer, tr->data_size)) {
- binder_user_error("binder: %d:%d got transaction with invalid "
- "data ptr\n", proc->pid, thread->pid);
+ binder_user_error("%d:%d got transaction with invalid data ptr\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
goto err_copy_data_failed;
}
if (copy_from_user(offp, tr->data.ptr.offsets, tr->offsets_size)) {
- binder_user_error("binder: %d:%d got transaction with invalid "
- "offsets ptr\n", proc->pid, thread->pid);
+ binder_user_error("%d:%d got transaction with invalid offsets ptr\n",
+ proc->pid, thread->pid);
return_error = BR_FAILED_REPLY;
goto err_copy_data_failed;
}
if (!IS_ALIGNED(tr->offsets_size, sizeof(size_t))) {
- binder_user_error("binder: %d:%d got transaction with "
- "invalid offsets size, %zd\n",
- proc->pid, thread->pid, tr->offsets_size);
+ binder_user_error("%d:%d got transaction with invalid offsets size, %zd\n",
+ proc->pid, thread->pid, tr->offsets_size);
return_error = BR_FAILED_REPLY;
goto err_bad_offset;
}
if (*offp > t->buffer->data_size - sizeof(*fp) ||
t->buffer->data_size < sizeof(*fp) ||
!IS_ALIGNED(*offp, sizeof(void *))) {
- binder_user_error("binder: %d:%d got transaction with "
- "invalid offset, %zd\n",
- proc->pid, thread->pid, *offp);
+ binder_user_error("%d:%d got transaction with invalid offset, %zd\n",
+ proc->pid, thread->pid, *offp);
return_error = BR_FAILED_REPLY;
goto err_bad_offset;
}
node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
}
if (fp->cookie != node->cookie) {
- binder_user_error("binder: %d:%d sending u%p "
- "node %d, cookie mismatch %p != %p\n",
+ binder_user_error("%d:%d sending u%p node %d, cookie mismatch %p != %p\n",
proc->pid, thread->pid,
fp->binder, node->debug_id,
fp->cookie, node->cookie);
binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE,
&thread->todo);
+ trace_binder_transaction_node_to_ref(t, node, ref);
binder_debug(BINDER_DEBUG_TRANSACTION,
" node %d u%p -> ref %d desc %d\n",
node->debug_id, node->ptr, ref->debug_id,
case BINDER_TYPE_WEAK_HANDLE: {
struct binder_ref *ref = binder_get_ref(proc, fp->handle);
if (ref == NULL) {
- binder_user_error("binder: %d:%d got "
- "transaction with invalid "
- "handle, %ld\n", proc->pid,
- thread->pid, fp->handle);
+ binder_user_error("%d:%d got transaction with invalid handle, %ld\n",
+ proc->pid,
+ thread->pid, fp->handle);
return_error = BR_FAILED_REPLY;
goto err_binder_get_ref_failed;
}
fp->binder = ref->node->ptr;
fp->cookie = ref->node->cookie;
binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL);
+ trace_binder_transaction_ref_to_node(t, ref);
binder_debug(BINDER_DEBUG_TRANSACTION,
" ref %d desc %d -> node %d u%p\n",
ref->debug_id, ref->desc, ref->node->debug_id,
}
fp->handle = new_ref->desc;
binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL);
+ trace_binder_transaction_ref_to_ref(t, ref,
+ new_ref);
binder_debug(BINDER_DEBUG_TRANSACTION,
" ref %d desc %d -> ref %d desc %d (node %d)\n",
ref->debug_id, ref->desc, new_ref->debug_id,
if (reply) {
if (!(in_reply_to->flags & TF_ACCEPT_FDS)) {
- binder_user_error("binder: %d:%d got reply with fd, %ld, but target does not allow fds\n",
+ binder_user_error("%d:%d got reply with fd, %ld, but target does not allow fds\n",
proc->pid, thread->pid, fp->handle);
return_error = BR_FAILED_REPLY;
goto err_fd_not_allowed;
}
} else if (!target_node->accept_fds) {
- binder_user_error("binder: %d:%d got transaction with fd, %ld, but target does not allow fds\n",
+ binder_user_error("%d:%d got transaction with fd, %ld, but target does not allow fds\n",
proc->pid, thread->pid, fp->handle);
return_error = BR_FAILED_REPLY;
goto err_fd_not_allowed;
file = fget(fp->handle);
if (file == NULL) {
- binder_user_error("binder: %d:%d got transaction with invalid fd, %ld\n",
+ binder_user_error("%d:%d got transaction with invalid fd, %ld\n",
proc->pid, thread->pid, fp->handle);
return_error = BR_FAILED_REPLY;
goto err_fget_failed;
goto err_get_unused_fd_failed;
}
task_fd_install(target_proc, target_fd, file);
+ trace_binder_transaction_fd(t, fp->handle, target_fd);
binder_debug(BINDER_DEBUG_TRANSACTION,
" fd %ld -> %d\n", fp->handle, target_fd);
/* TODO: fput? */
} break;
default:
- binder_user_error("binder: %d:%d got transactio"
- "n with invalid object type, %lx\n",
+ binder_user_error("%d:%d got transaction with invalid object type, %lx\n",
proc->pid, thread->pid, fp->type);
return_error = BR_FAILED_REPLY;
goto err_bad_object_type;
err_bad_object_type:
err_bad_offset:
err_copy_data_failed:
+ trace_binder_transaction_failed_buffer_release(t->buffer);
binder_transaction_buffer_release(target_proc, t->buffer, offp);
t->buffer->transaction = NULL;
binder_free_buf(target_proc, t->buffer);
err_invalid_target_handle:
err_no_context_mgr_node:
binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "binder: %d:%d transaction failed %d, size %zd-%zd\n",
+ "%d:%d transaction failed %d, size %zd-%zd\n",
proc->pid, thread->pid, return_error,
tr->data_size, tr->offsets_size);
if (get_user(cmd, (uint32_t __user *)ptr))
return -EFAULT;
ptr += sizeof(uint32_t);
+ trace_binder_command(cmd);
if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) {
binder_stats.bc[_IOC_NR(cmd)]++;
proc->stats.bc[_IOC_NR(cmd)]++;
ref = binder_get_ref_for_node(proc,
binder_context_mgr_node);
if (ref->desc != target) {
- binder_user_error("binder: %d:"
- "%d tried to acquire "
- "reference to desc 0, "
- "got %d instead\n",
+ binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n",
proc->pid, thread->pid,
ref->desc);
}
} else
ref = binder_get_ref(proc, target);
if (ref == NULL) {
- binder_user_error("binder: %d:%d refcou"
- "nt change on invalid ref %d\n",
+ binder_user_error("%d:%d refcount change on invalid ref %d\n",
proc->pid, thread->pid, target);
break;
}
break;
}
binder_debug(BINDER_DEBUG_USER_REFS,
- "binder: %d:%d %s ref %d desc %d s %d w %d for node %d\n",
+ "%d:%d %s ref %d desc %d s %d w %d for node %d\n",
proc->pid, thread->pid, debug_string, ref->debug_id,
ref->desc, ref->strong, ref->weak, ref->node->debug_id);
break;
ptr += sizeof(void *);
node = binder_get_node(proc, node_ptr);
if (node == NULL) {
- binder_user_error("binder: %d:%d "
- "%s u%p no match\n",
+ binder_user_error("%d:%d %s u%p no match\n",
proc->pid, thread->pid,
cmd == BC_INCREFS_DONE ?
"BC_INCREFS_DONE" :
break;
}
if (cookie != node->cookie) {
- binder_user_error("binder: %d:%d %s u%p node %d"
- " cookie mismatch %p != %p\n",
+ binder_user_error("%d:%d %s u%p node %d cookie mismatch %p != %p\n",
proc->pid, thread->pid,
cmd == BC_INCREFS_DONE ?
"BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
}
if (cmd == BC_ACQUIRE_DONE) {
if (node->pending_strong_ref == 0) {
- binder_user_error("binder: %d:%d "
- "BC_ACQUIRE_DONE node %d has "
- "no pending acquire request\n",
+ binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n",
proc->pid, thread->pid,
node->debug_id);
break;
node->pending_strong_ref = 0;
} else {
if (node->pending_weak_ref == 0) {
- binder_user_error("binder: %d:%d "
- "BC_INCREFS_DONE node %d has "
- "no pending increfs request\n",
+ binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n",
proc->pid, thread->pid,
node->debug_id);
break;
}
binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0);
binder_debug(BINDER_DEBUG_USER_REFS,
- "binder: %d:%d %s node %d ls %d lw %d\n",
+ "%d:%d %s node %d ls %d lw %d\n",
proc->pid, thread->pid,
cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
node->debug_id, node->local_strong_refs, node->local_weak_refs);
break;
}
case BC_ATTEMPT_ACQUIRE:
- pr_err("binder: BC_ATTEMPT_ACQUIRE not supported\n");
+ pr_err("BC_ATTEMPT_ACQUIRE not supported\n");
return -EINVAL;
case BC_ACQUIRE_RESULT:
- pr_err("binder: BC_ACQUIRE_RESULT not supported\n");
+ pr_err("BC_ACQUIRE_RESULT not supported\n");
return -EINVAL;
case BC_FREE_BUFFER: {
buffer = binder_buffer_lookup(proc, data_ptr);
if (buffer == NULL) {
- binder_user_error("binder: %d:%d "
- "BC_FREE_BUFFER u%p no match\n",
+ binder_user_error("%d:%d BC_FREE_BUFFER u%p no match\n",
proc->pid, thread->pid, data_ptr);
break;
}
if (!buffer->allow_user_free) {
- binder_user_error("binder: %d:%d "
- "BC_FREE_BUFFER u%p matched "
- "unreturned buffer\n",
+ binder_user_error("%d:%d BC_FREE_BUFFER u%p matched unreturned buffer\n",
proc->pid, thread->pid, data_ptr);
break;
}
binder_debug(BINDER_DEBUG_FREE_BUFFER,
- "binder: %d:%d BC_FREE_BUFFER u%p found buffer %d for %s transaction\n",
+ "%d:%d BC_FREE_BUFFER u%p found buffer %d for %s transaction\n",
proc->pid, thread->pid, data_ptr, buffer->debug_id,
buffer->transaction ? "active" : "finished");
else
list_move_tail(buffer->target_node->async_todo.next, &thread->todo);
}
+ trace_binder_transaction_buffer_release(buffer);
binder_transaction_buffer_release(proc, buffer, NULL);
binder_free_buf(proc, buffer);
break;
case BC_REGISTER_LOOPER:
binder_debug(BINDER_DEBUG_THREADS,
- "binder: %d:%d BC_REGISTER_LOOPER\n",
+ "%d:%d BC_REGISTER_LOOPER\n",
proc->pid, thread->pid);
if (thread->looper & BINDER_LOOPER_STATE_ENTERED) {
thread->looper |= BINDER_LOOPER_STATE_INVALID;
- binder_user_error("binder: %d:%d ERROR:"
- " BC_REGISTER_LOOPER called "
- "after BC_ENTER_LOOPER\n",
+ binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n",
proc->pid, thread->pid);
} else if (proc->requested_threads == 0) {
thread->looper |= BINDER_LOOPER_STATE_INVALID;
- binder_user_error("binder: %d:%d ERROR:"
- " BC_REGISTER_LOOPER called "
- "without request\n",
+ binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called without request\n",
proc->pid, thread->pid);
} else {
proc->requested_threads--;
break;
case BC_ENTER_LOOPER:
binder_debug(BINDER_DEBUG_THREADS,
- "binder: %d:%d BC_ENTER_LOOPER\n",
+ "%d:%d BC_ENTER_LOOPER\n",
proc->pid, thread->pid);
if (thread->looper & BINDER_LOOPER_STATE_REGISTERED) {
thread->looper |= BINDER_LOOPER_STATE_INVALID;
- binder_user_error("binder: %d:%d ERROR:"
- " BC_ENTER_LOOPER called after "
- "BC_REGISTER_LOOPER\n",
+ binder_user_error("%d:%d ERROR: BC_ENTER_LOOPER called after BC_REGISTER_LOOPER\n",
proc->pid, thread->pid);
}
thread->looper |= BINDER_LOOPER_STATE_ENTERED;
break;
case BC_EXIT_LOOPER:
binder_debug(BINDER_DEBUG_THREADS,
- "binder: %d:%d BC_EXIT_LOOPER\n",
+ "%d:%d BC_EXIT_LOOPER\n",
proc->pid, thread->pid);
thread->looper |= BINDER_LOOPER_STATE_EXITED;
break;
ptr += sizeof(void *);
ref = binder_get_ref(proc, target);
if (ref == NULL) {
- binder_user_error("binder: %d:%d %s "
- "invalid ref %d\n",
+ binder_user_error("%d:%d %s invalid ref %d\n",
proc->pid, thread->pid,
cmd == BC_REQUEST_DEATH_NOTIFICATION ?
"BC_REQUEST_DEATH_NOTIFICATION" :
}
binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION,
- "binder: %d:%d %s %p ref %d desc %d s %d w %d for node %d\n",
+ "%d:%d %s %p ref %d desc %d s %d w %d for node %d\n",
proc->pid, thread->pid,
cmd == BC_REQUEST_DEATH_NOTIFICATION ?
"BC_REQUEST_DEATH_NOTIFICATION" :
if (cmd == BC_REQUEST_DEATH_NOTIFICATION) {
if (ref->death) {
- binder_user_error("binder: %d:%"
- "d BC_REQUEST_DEATH_NOTI"
- "FICATION death notific"
- "ation already set\n",
+ binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n",
proc->pid, thread->pid);
break;
}
if (death == NULL) {
thread->return_error = BR_ERROR;
binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
- "binder: %d:%d "
- "BC_REQUEST_DEATH_NOTIFICATION failed\n",
+ "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n",
proc->pid, thread->pid);
break;
}
}
} else {
if (ref->death == NULL) {
- binder_user_error("binder: %d:%"
- "d BC_CLEAR_DEATH_NOTIFI"
- "CATION death notificat"
- "ion not active\n",
+ binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n",
proc->pid, thread->pid);
break;
}
death = ref->death;
if (death->cookie != cookie) {
- binder_user_error("binder: %d:%"
- "d BC_CLEAR_DEATH_NOTIFI"
- "CATION death notificat"
- "ion cookie mismatch "
- "%p != %p\n",
+ binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch %p != %p\n",
proc->pid, thread->pid,
death->cookie, cookie);
break;
}
}
binder_debug(BINDER_DEBUG_DEAD_BINDER,
- "binder: %d:%d BC_DEAD_BINDER_DONE %p found %p\n",
+ "%d:%d BC_DEAD_BINDER_DONE %p found %p\n",
proc->pid, thread->pid, cookie, death);
if (death == NULL) {
- binder_user_error("binder: %d:%d BC_DEAD"
- "_BINDER_DONE %p not found\n",
+ binder_user_error("%d:%d BC_DEAD_BINDER_DONE %p not found\n",
proc->pid, thread->pid, cookie);
break;
}
} break;
default:
- pr_err("binder: %d:%d unknown command %d\n",
+ pr_err("%d:%d unknown command %d\n",
proc->pid, thread->pid, cmd);
return -EINVAL;
}
void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread,
uint32_t cmd)
{
+ trace_binder_return(cmd);
if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) {
binder_stats.br[_IOC_NR(cmd)]++;
proc->stats.br[_IOC_NR(cmd)]++;
if (put_user(thread->return_error2, (uint32_t __user *)ptr))
return -EFAULT;
ptr += sizeof(uint32_t);
+ binder_stat_br(proc, thread, thread->return_error2);
if (ptr == end)
goto done;
thread->return_error2 = BR_OK;
if (put_user(thread->return_error, (uint32_t __user *)ptr))
return -EFAULT;
ptr += sizeof(uint32_t);
+ binder_stat_br(proc, thread, thread->return_error);
thread->return_error = BR_OK;
goto done;
}
thread->looper |= BINDER_LOOPER_STATE_WAITING;
if (wait_for_proc_work)
proc->ready_threads++;
- mutex_unlock(&binder_lock);
+
+ binder_unlock(__func__);
+
+ trace_binder_wait_for_work(wait_for_proc_work,
+ !!thread->transaction_stack,
+ !list_empty(&thread->todo));
if (wait_for_proc_work) {
if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED |
BINDER_LOOPER_STATE_ENTERED))) {
- binder_user_error("binder: %d:%d ERROR: Thread waiting "
- "for process work before calling BC_REGISTER_"
- "LOOPER or BC_ENTER_LOOPER (state %x)\n",
+ binder_user_error("%d:%d ERROR: Thread waiting for process work before calling BC_REGISTER_LOOPER or BC_ENTER_LOOPER (state %x)\n",
proc->pid, thread->pid, thread->looper);
wait_event_interruptible(binder_user_error_wait,
binder_stop_on_user_error < 2);
} else
ret = wait_event_interruptible(thread->wait, binder_has_thread_work(thread));
}
- mutex_lock(&binder_lock);
+
+ binder_lock(__func__);
+
if (wait_for_proc_work)
proc->ready_threads--;
thread->looper &= ~BINDER_LOOPER_STATE_WAITING;
binder_stat_br(proc, thread, cmd);
binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE,
- "binder: %d:%d BR_TRANSACTION_COMPLETE\n",
+ "%d:%d BR_TRANSACTION_COMPLETE\n",
proc->pid, thread->pid);
list_del(&w->entry);
binder_stat_br(proc, thread, cmd);
binder_debug(BINDER_DEBUG_USER_REFS,
- "binder: %d:%d %s %d u%p c%p\n",
+ "%d:%d %s %d u%p c%p\n",
proc->pid, thread->pid, cmd_name, node->debug_id, node->ptr, node->cookie);
} else {
list_del_init(&w->entry);
if (!weak && !strong) {
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "binder: %d:%d node %d u%p c%p deleted\n",
+ "%d:%d node %d u%p c%p deleted\n",
proc->pid, thread->pid, node->debug_id,
node->ptr, node->cookie);
rb_erase(&node->rb_node, &proc->nodes);
binder_stats_deleted(BINDER_STAT_NODE);
} else {
binder_debug(BINDER_DEBUG_INTERNAL_REFS,
- "binder: %d:%d node %d u%p c%p state unchanged\n",
+ "%d:%d node %d u%p c%p state unchanged\n",
proc->pid, thread->pid, node->debug_id, node->ptr,
node->cookie);
}
if (put_user(death->cookie, (void * __user *)ptr))
return -EFAULT;
ptr += sizeof(void *);
+ binder_stat_br(proc, thread, cmd);
binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION,
- "binder: %d:%d %s %p\n",
+ "%d:%d %s %p\n",
proc->pid, thread->pid,
cmd == BR_DEAD_BINDER ?
"BR_DEAD_BINDER" :
if (t->from) {
struct task_struct *sender = t->from->proc->tsk;
tr.sender_pid = task_tgid_nr_ns(sender,
- current->nsproxy->pid_ns);
+ task_active_pid_ns(current));
} else {
tr.sender_pid = 0;
}
return -EFAULT;
ptr += sizeof(tr);
+ trace_binder_transaction_received(t);
binder_stat_br(proc, thread, cmd);
binder_debug(BINDER_DEBUG_TRANSACTION,
- "binder: %d:%d %s %d %d:%d, cmd %d"
- "size %zd-%zd ptr %p-%p\n",
+ "%d:%d %s %d %d:%d, cmd %d size %zd-%zd ptr %p-%p\n",
proc->pid, thread->pid,
(cmd == BR_TRANSACTION) ? "BR_TRANSACTION" :
"BR_REPLY",
/*spawn a new thread if we leave this out */) {
proc->requested_threads++;
binder_debug(BINDER_DEBUG_THREADS,
- "binder: %d:%d BR_SPAWN_LOOPER\n",
+ "%d:%d BR_SPAWN_LOOPER\n",
proc->pid, thread->pid);
if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer))
return -EFAULT;
+ binder_stat_br(proc, thread, BR_SPAWN_LOOPER);
}
return 0;
}
binder_send_failed_reply(t, BR_DEAD_REPLY);
} else {
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
- "binder: undelivered transaction %d\n",
+ "undelivered transaction %d\n",
t->debug_id);
t->buffer->transaction = NULL;
kfree(t);
} break;
case BINDER_WORK_TRANSACTION_COMPLETE: {
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
- "binder: undelivered TRANSACTION_COMPLETE\n");
+ "undelivered TRANSACTION_COMPLETE\n");
kfree(w);
binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
} break;
death = container_of(w, struct binder_ref_death, work);
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
- "binder: undelivered death notification, %p\n",
+ "undelivered death notification, %p\n",
death->cookie);
kfree(death);
binder_stats_deleted(BINDER_STAT_DEATH);
} break;
default:
- pr_err("binder: unexpected work type, %d, not freed\n",
+ pr_err("unexpected work type, %d, not freed\n",
w->type);
break;
}
while (t) {
active_transactions++;
binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
- "binder: release %d:%d transaction %d "
- "%s, still active\n", proc->pid, thread->pid,
+ "release %d:%d transaction %d %s, still active\n",
+ proc->pid, thread->pid,
t->debug_id,
(t->to_thread == thread) ? "in" : "out");
struct binder_thread *thread = NULL;
int wait_for_proc_work;
- mutex_lock(&binder_lock);
+ binder_lock(__func__);
+
thread = binder_get_thread(proc);
wait_for_proc_work = thread->transaction_stack == NULL &&
list_empty(&thread->todo) && thread->return_error == BR_OK;
- mutex_unlock(&binder_lock);
+
+ binder_unlock(__func__);
if (wait_for_proc_work) {
if (binder_has_proc_work(proc, thread))
/*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/
+ trace_binder_ioctl(cmd, arg);
+
ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
if (ret)
- return ret;
+ goto err_unlocked;
- mutex_lock(&binder_lock);
+ binder_lock(__func__);
thread = binder_get_thread(proc);
if (thread == NULL) {
ret = -ENOMEM;
goto err;
}
binder_debug(BINDER_DEBUG_READ_WRITE,
- "binder: %d:%d write %ld at %08lx, read %ld at %08lx\n",
- proc->pid, thread->pid, bwr.write_size, bwr.write_buffer,
- bwr.read_size, bwr.read_buffer);
+ "%d:%d write %ld at %08lx, read %ld at %08lx\n",
+ proc->pid, thread->pid, bwr.write_size,
+ bwr.write_buffer, bwr.read_size, bwr.read_buffer);
if (bwr.write_size > 0) {
ret = binder_thread_write(proc, thread, (void __user *)bwr.write_buffer, bwr.write_size, &bwr.write_consumed);
+ trace_binder_write_done(ret);
if (ret < 0) {
bwr.read_consumed = 0;
if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
}
if (bwr.read_size > 0) {
ret = binder_thread_read(proc, thread, (void __user *)bwr.read_buffer, bwr.read_size, &bwr.read_consumed, filp->f_flags & O_NONBLOCK);
+ trace_binder_read_done(ret);
if (!list_empty(&proc->todo))
wake_up_interruptible(&proc->wait);
if (ret < 0) {
}
}
binder_debug(BINDER_DEBUG_READ_WRITE,
- "binder: %d:%d wrote %ld of %ld, read return %ld of %ld\n",
+ "%d:%d wrote %ld of %ld, read return %ld of %ld\n",
proc->pid, thread->pid, bwr.write_consumed, bwr.write_size,
bwr.read_consumed, bwr.read_size);
if (copy_to_user(ubuf, &bwr, sizeof(bwr))) {
break;
case BINDER_SET_CONTEXT_MGR:
if (binder_context_mgr_node != NULL) {
- pr_err("binder: BINDER_SET_CONTEXT_MGR already set\n");
+ pr_err("BINDER_SET_CONTEXT_MGR already set\n");
ret = -EBUSY;
goto err;
}
if (uid_valid(binder_context_mgr_uid)) {
if (!uid_eq(binder_context_mgr_uid, current->cred->euid)) {
- pr_err("binder: BINDER_SET_"
- "CONTEXT_MGR bad uid %d != %d\n",
+ pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
from_kuid(&init_user_ns, current->cred->euid),
from_kuid(&init_user_ns, binder_context_mgr_uid));
ret = -EPERM;
binder_context_mgr_node->has_weak_ref = 1;
break;
case BINDER_THREAD_EXIT:
- binder_debug(BINDER_DEBUG_THREADS, "binder: %d:%d exit\n",
+ binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n",
proc->pid, thread->pid);
binder_free_thread(proc, thread);
thread = NULL;
err:
if (thread)
thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN;
- mutex_unlock(&binder_lock);
+ binder_unlock(__func__);
wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2);
if (ret && ret != -ERESTARTSYS)
- pr_info("binder: %d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
+ pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret);
+err_unlocked:
+ trace_binder_ioctl_done(ret);
return ret;
}
{
struct binder_proc *proc = vma->vm_private_data;
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
- "binder: %d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
+ "%d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
proc->pid, vma->vm_start, vma->vm_end,
(vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
(unsigned long)pgprot_val(vma->vm_page_prot));
{
struct binder_proc *proc = vma->vm_private_data;
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
- "binder: %d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
+ "%d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
proc->pid, vma->vm_start, vma->vm_end,
(vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
(unsigned long)pgprot_val(vma->vm_page_prot));
INIT_LIST_HEAD(&proc->todo);
init_waitqueue_head(&proc->wait);
proc->default_priority = task_nice(current);
- mutex_lock(&binder_lock);
+
+ binder_lock(__func__);
+
binder_stats_created(BINDER_STAT_PROC);
hlist_add_head(&proc->proc_node, &binder_procs);
proc->pid = current->group_leader->pid;
INIT_LIST_HEAD(&proc->delivered_death);
filp->private_data = proc;
- mutex_unlock(&binder_lock);
+
+ binder_unlock(__func__);
if (binder_debugfs_dir_entry_proc) {
char strbuf[11];
}
}
binder_debug(BINDER_DEBUG_DEAD_BINDER,
- "binder: node %d now dead, "
- "refs %d, death %d\n", node->debug_id,
- incoming_refs, death);
+ "node %d now dead, refs %d, death %d\n",
+ node->debug_id, incoming_refs, death);
}
}
outgoing_refs = 0;
if (t) {
t->buffer = NULL;
buffer->transaction = NULL;
- pr_err("binder: release proc %d, "
- "transaction %d, not freed\n",
+ pr_err("release proc %d, transaction %d, not freed\n",
proc->pid, t->debug_id);
/*BUG();*/
}
if (proc->pages[i]) {
void *page_addr = proc->buffer + i * PAGE_SIZE;
binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder_release: %d: "
- "page %d at %p not freed\n",
+ "binder_release: %d: page %d at %p not freed\n",
proc->pid, i,
page_addr);
unmap_kernel_range((unsigned long)page_addr,
put_task_struct(proc->tsk);
binder_debug(BINDER_DEBUG_OPEN_CLOSE,
- "binder_release: %d threads %d, nodes %d (ref %d), "
- "refs %d, active transactions %d, buffers %d, "
- "pages %d\n",
+ "binder_release: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d, buffers %d, pages %d\n",
proc->pid, threads, nodes, incoming_refs, outgoing_refs,
active_transactions, buffers, page_count);
int defer;
do {
- mutex_lock(&binder_lock);
+ binder_lock(__func__);
mutex_lock(&binder_deferred_lock);
if (!hlist_empty(&binder_deferred_list)) {
proc = hlist_entry(binder_deferred_list.first,
if (defer & BINDER_DEFERRED_RELEASE)
binder_deferred_release(proc); /* frees proc */
- mutex_unlock(&binder_lock);
+ binder_unlock(__func__);
if (files)
put_files_struct(files);
} while (proc);
int do_lock = !binder_debug_no_lock;
if (do_lock)
- mutex_lock(&binder_lock);
+ binder_lock(__func__);
seq_puts(m, "binder state:\n");
hlist_for_each_entry(proc, pos, &binder_procs, proc_node)
print_binder_proc(m, proc, 1);
if (do_lock)
- mutex_unlock(&binder_lock);
+ binder_unlock(__func__);
return 0;
}
int do_lock = !binder_debug_no_lock;
if (do_lock)
- mutex_lock(&binder_lock);
+ binder_lock(__func__);
seq_puts(m, "binder stats:\n");
hlist_for_each_entry(proc, pos, &binder_procs, proc_node)
print_binder_proc_stats(m, proc);
if (do_lock)
- mutex_unlock(&binder_lock);
+ binder_unlock(__func__);
return 0;
}
int do_lock = !binder_debug_no_lock;
if (do_lock)
- mutex_lock(&binder_lock);
+ binder_lock(__func__);
seq_puts(m, "binder transactions:\n");
hlist_for_each_entry(proc, pos, &binder_procs, proc_node)
print_binder_proc(m, proc, 0);
if (do_lock)
- mutex_unlock(&binder_lock);
+ binder_unlock(__func__);
return 0;
}
int do_lock = !binder_debug_no_lock;
if (do_lock)
- mutex_lock(&binder_lock);
+ binder_lock(__func__);
seq_puts(m, "binder proc state:\n");
print_binder_proc(m, proc, 1);
if (do_lock)
- mutex_unlock(&binder_lock);
+ binder_unlock(__func__);
return 0;
}
device_initcall(binder_init);
+#define CREATE_TRACE_POINTS
+#include "binder_trace.h"
+
MODULE_LICENSE("GPL v2");
bprm->cred->egid = current_egid();
if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
- !current->no_new_privs) {
+ !current->no_new_privs &&
+ kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
+ kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
/* Set-uid? */
if (mode & S_ISUID) {
- if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
- return -EPERM;
bprm->per_clear |= PER_CLEAR_ON_SETID;
bprm->cred->euid = inode->i_uid;
-
}
/* Set-gid? */
* executable.
*/
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
- if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
- return -EPERM;
bprm->per_clear |= PER_CLEAR_ON_SETID;
bprm->cred->egid = inode->i_gid;
}
/*
* cycle the list of binary formats handler, until one recognizes the image
*/
-int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
+int search_binary_handler(struct linux_binprm *bprm)
{
unsigned int depth = bprm->recursion_depth;
int try,retval;
for (try=0; try<2; try++) {
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
- int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
+ int (*fn)(struct linux_binprm *) = fmt->load_binary;
if (!fn)
continue;
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
- retval = fn(bprm, regs);
+ retval = fn(bprm);
/*
* Restore the depth counter to its starting value
* in this call, so we don't have to rely on every
*/
static int do_execve_common(const char *filename,
struct user_arg_ptr argv,
- struct user_arg_ptr envp,
- struct pt_regs *regs)
+ struct user_arg_ptr envp)
{
struct linux_binprm *bprm;
struct file *file;
if (retval < 0)
goto out;
- retval = search_binary_handler(bprm,regs);
+ retval = search_binary_handler(bprm);
if (retval < 0)
goto out;
int do_execve(const char *filename,
const char __user *const __user *__argv,
- const char __user *const __user *__envp,
- struct pt_regs *regs)
+ const char __user *const __user *__envp)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
- return do_execve_common(filename, argv, envp, regs);
+ return do_execve_common(filename, argv, envp);
}
#ifdef CONFIG_COMPAT
-int compat_do_execve(const char *filename,
+static int compat_do_execve(const char *filename,
const compat_uptr_t __user *__argv,
- const compat_uptr_t __user *__envp,
- struct pt_regs *regs)
+ const compat_uptr_t __user *__envp)
{
struct user_arg_ptr argv = {
.is_compat = true,
.is_compat = true,
.ptr.compat = __envp,
};
- return do_execve_common(filename, argv, envp, regs);
+ return do_execve_common(filename, argv, envp);
}
#endif
struct filename *path = getname(filename);
int error = PTR_ERR(path);
if (!IS_ERR(path)) {
- error = do_execve(path->name, argv, envp, current_pt_regs());
+ error = do_execve(path->name, argv, envp);
putname(path);
}
return error;
struct filename *path = getname(filename);
int error = PTR_ERR(path);
if (!IS_ERR(path)) {
- error = compat_do_execve(path->name, argv, envp,
- current_pt_regs());
+ error = compat_do_execve(path->name, argv, envp);
putname(path);
}
return error;
const char *const argv[],
const char *const envp[])
{
- struct pt_regs *p = current_pt_regs();
- int ret;
-
- ret = do_execve(filename,
+ int ret = do_execve(filename,
(const char __user *const __user *)argv,
- (const char __user *const __user *)envp, p);
+ (const char __user *const __user *)envp);
if (ret < 0)
return ret;
* We were successful. We won't be returning to our caller, but
* instead to user space by manipulating the kernel stack.
*/
- ret_from_kernel_execve(p);
+ ret_from_kernel_execve(current_pt_regs());
}
#endif
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
- struct user_namespace *user_ns = current_user_ns();
+ struct user_namespace *user_ns = seq_user_ns(m);
struct group_info *group_info;
int g;
struct fdtable *fdt = NULL;
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- thread_group_times(task, &utime, &stime);
+ thread_group_cputime_adjusted(task, &utime, &stime);
gtime += sig->gtime;
}
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_times(task, &utime, &stime);
+ task_cputime_adjusted(task, &utime, &stime);
gtime = task->gtime;
}
.release = mem_release,
};
+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+ char buffer[PROC_NUMBUF];
+ int oom_adj = OOM_ADJUST_MIN;
+ size_t len;
+ unsigned long flags;
+
+ if (!task)
+ return -ESRCH;
+ if (lock_task_sighand(task, &flags)) {
+ if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+ oom_adj = OOM_ADJUST_MAX;
+ else
+ oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+ OOM_SCORE_ADJ_MAX;
+ unlock_task_sighand(task, &flags);
+ }
+ put_task_struct(task);
+ len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+ return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ char buffer[PROC_NUMBUF];
+ int oom_adj;
+ unsigned long flags;
+ int err;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+ if (err)
+ goto out;
+ if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+ oom_adj != OOM_DISABLE) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ task = get_proc_task(file->f_path.dentry->d_inode);
+ if (!task) {
+ err = -ESRCH;
+ goto out;
+ }
+
+ task_lock(task);
+ if (!task->mm) {
+ err = -EINVAL;
+ goto err_task_lock;
+ }
+
+ if (!lock_task_sighand(task, &flags)) {
+ err = -ESRCH;
+ goto err_task_lock;
+ }
+
+ /*
+ * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+ * value is always attainable.
+ */
+ if (oom_adj == OOM_ADJUST_MAX)
+ oom_adj = OOM_SCORE_ADJ_MAX;
+ else
+ oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+ if (oom_adj < task->signal->oom_score_adj &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_sighand;
+ }
+
+ /*
+ * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+ * /proc/pid/oom_score_adj instead.
+ */
+ printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ current->comm, task_pid_nr(current), task_pid_nr(task),
+ task_pid_nr(task));
+
+ task->signal->oom_score_adj = oom_adj;
+ trace_oom_score_adj_update(task);
+err_sighand:
+ unlock_task_sighand(task, &flags);
+err_task_lock:
+ task_unlock(task);
+ put_task_struct(task);
+out:
+ return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_adj_operations = {
+ .read = oom_adj_read,
+ .write = oom_adj_write,
+ .llseek = generic_file_llseek,
+};
+
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
char buffer[PROC_NUMBUF];
- int oom_score_adj = OOM_SCORE_ADJ_MIN;
+ short oom_score_adj = OOM_SCORE_ADJ_MIN;
unsigned long flags;
size_t len;
unlock_task_sighand(task, &flags);
}
put_task_struct(task);
- len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+ len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
goto err_task_lock;
}
- if (oom_score_adj < task->signal->oom_score_adj_min &&
+ if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
!capable(CAP_SYS_RESOURCE)) {
err = -EACCES;
goto err_sighand;
}
- task->signal->oom_score_adj = oom_score_adj;
+ task->signal->oom_score_adj = (short)oom_score_adj;
if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
- task->signal->oom_score_adj_min = oom_score_adj;
+ task->signal->oom_score_adj_min = (short)oom_score_adj;
trace_oom_score_adj_update(task);
err_sighand:
if (!vma)
goto out_no_vma;
- result = proc_map_files_instantiate(dir, dentry, task,
- (void *)(unsigned long)vma->vm_file->f_mode);
+ if (vma->vm_file)
+ result = proc_map_files_instantiate(dir, dentry, task,
+ (void *)(unsigned long)vma->vm_file->f_mode);
out_no_vma:
up_read(&mm->mmap_sem);
};
#endif
- /*
- * /proc/self:
- */
- static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
- int buflen)
- {
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
- pid_t tgid = task_tgid_nr_ns(current, ns);
- char tmp[PROC_NUMBUF];
- if (!tgid)
- return -ENOENT;
- sprintf(tmp, "%d", tgid);
- return vfs_readlink(dentry,buffer,buflen,tmp);
- }
-
- static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
- {
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
- pid_t tgid = task_tgid_nr_ns(current, ns);
- char *name = ERR_PTR(-ENOENT);
- if (tgid) {
- /* 11 for max length of signed int in decimal + NULL term */
- name = kmalloc(12, GFP_KERNEL);
- if (!name)
- name = ERR_PTR(-ENOMEM);
- else
- sprintf(name, "%d", tgid);
- }
- nd_set_link(nd, name);
- return NULL;
- }
-
- static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
- void *cookie)
- {
- char *s = nd_get_link(nd);
- if (!IS_ERR(s))
- kfree(s);
- }
-
- static const struct inode_operations proc_self_inode_operations = {
- .readlink = proc_self_readlink,
- .follow_link = proc_self_follow_link,
- .put_link = proc_self_put_link,
- };
-
- /*
- * proc base
- *
- * These are the directory entries in the root directory of /proc
- * that properly belong to the /proc filesystem, as they describe
- * describe something that is process related.
- */
- static const struct pid_entry proc_base_stuff[] = {
- NOD("self", S_IFLNK|S_IRWXUGO,
- &proc_self_inode_operations, NULL, {}),
- };
-
- static struct dentry *proc_base_instantiate(struct inode *dir,
- struct dentry *dentry, struct task_struct *task, const void *ptr)
- {
- const struct pid_entry *p = ptr;
- struct inode *inode;
- struct proc_inode *ei;
- struct dentry *error;
-
- /* Allocate the inode */
- error = ERR_PTR(-ENOMEM);
- inode = new_inode(dir->i_sb);
- if (!inode)
- goto out;
-
- /* Initialize the inode */
- ei = PROC_I(inode);
- inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-
- /*
- * grab the reference to the task.
- */
- ei->pid = get_task_pid(task, PIDTYPE_PID);
- if (!ei->pid)
- goto out_iput;
-
- inode->i_mode = p->mode;
- if (S_ISDIR(inode->i_mode))
- set_nlink(inode, 2);
- if (S_ISLNK(inode->i_mode))
- inode->i_size = 64;
- if (p->iop)
- inode->i_op = p->iop;
- if (p->fop)
- inode->i_fop = p->fop;
- ei->op = p->op;
- d_add(dentry, inode);
- error = NULL;
- out:
- return error;
- out_iput:
- iput(inode);
- goto out;
- }
-
- static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
- {
- struct dentry *error;
- struct task_struct *task = get_proc_task(dir);
- const struct pid_entry *p, *last;
-
- error = ERR_PTR(-ENOENT);
-
- if (!task)
- goto out_no_task;
-
- /* Lookup the directory entry */
- last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
- for (p = proc_base_stuff; p <= last; p++) {
- if (p->len != dentry->d_name.len)
- continue;
- if (!memcmp(dentry->d_name.name, p->name, p->len))
- break;
- }
- if (p > last)
- goto out;
-
- error = proc_base_instantiate(dir, dentry, task, p);
-
- out:
- put_task_struct(task);
- out_no_task:
- return error;
- }
-
- static int proc_base_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
- {
- return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
- proc_base_instantiate, task, p);
- }
-
#ifdef CONFIG_TASK_IO_ACCOUNTING
static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
{
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
INF("oom_score", S_IRUGO, proc_oom_score),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}
static struct dentry *proc_pid_instantiate(struct inode *dir,
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
- struct dentry *result;
+ struct dentry *result = NULL;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
- result = proc_base_lookup(dir, dentry);
- if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
- goto out;
-
tgid = name_to_int(dentry);
if (tgid == ~0U)
goto out;
return iter;
}
- #define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
+ #define TGID_OFFSET (FIRST_PROCESS_ENTRY)
static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
struct tgid_iter iter)
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
- unsigned int nr;
- struct task_struct *reaper;
struct tgid_iter iter;
struct pid_namespace *ns;
filldir_t __filldir;
if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
- goto out_no_task;
- nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-
- reaper = get_proc_task(filp->f_path.dentry->d_inode);
- if (!reaper)
- goto out_no_task;
-
- for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
- const struct pid_entry *p = &proc_base_stuff[nr];
- if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
- goto out;
- }
+ goto out;
ns = filp->f_dentry->d_sb->s_fs_info;
iter.task = NULL;
}
filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
out:
- put_task_struct(reaper);
- out_no_task:
return 0;
}
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
INF("oom_score", S_IRUGO, proc_oom_score),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
-/*
- * The common credentials for a thread group
- * - shared by CLONE_THREAD
- */
-#ifdef CONFIG_KEYS
-struct thread_group_cred {
- atomic_t usage;
- pid_t tgid; /* thread group process ID */
- spinlock_t lock;
- struct key __rcu *session_keyring; /* keyring inherited over fork */
- struct key *process_keyring; /* keyring private to this process */
- struct rcu_head rcu; /* RCU deletion hook */
-};
-#endif
-
/*
* The security context of a task
*
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
+ struct key __rcu *session_keyring; /* keyring inherited over fork */
+ struct key *process_keyring; /* keyring private to this process */
struct key *thread_keyring; /* keyring private to this thread */
struct key *request_key_auth; /* assumed request_key authority */
struct thread_group_cred *tgcred; /* thread-group shared credentials */
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
#define current_user_ns() (current_cred_xxx(user_ns))
- #define task_user_ns(task) (task_cred_xxx((task), user_ns))
#else
#define current_user_ns() (&init_user_ns)
- #define task_user_ns(task) (&init_user_ns)
#endif
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
- struct address_space *assoc_mapping; /* ditto */
+ void *private_data; /* ditto */
} __attribute__((aligned(sizeof(long))));
/*
* On most architectures that alignment is already the case; but
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
- /* A semaphore that prevents I/O while block size is being changed */
- struct percpu_rw_semaphore bd_block_size_semaphore;
};
/*
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
+ #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
+ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */
#define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
extern struct block_device *bdget(dev_t);
extern struct block_device *bdgrab(struct block_device *bdev);
extern void bd_set_size(struct block_device *, loff_t size);
-extern sector_t blkdev_max_block(struct block_device *bdev);
extern void bd_forget(struct inode *inode);
extern void bdput(struct block_device *);
extern void invalidate_bdev(struct block_device *);
unsigned long *nr_segs, size_t *count, int access_flags);
/* fs/block_dev.c */
-extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
This option enables preemptible-RCU code that is common between
the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+config CONTEXT_TRACKING
+ bool
+
config RCU_USER_QS
bool "Consider userspace as in RCU extended quiescent state"
- depends on HAVE_RCU_USER_QS && SMP
+ depends on HAVE_CONTEXT_TRACKING && SMP
+ select CONTEXT_TRACKING
help
This option sets hooks on kernel / userspace boundaries and
puts RCU in extended quiescent state when the CPU runs in
userspace. It means that when a CPU runs in userspace, it is
excluded from the global RCU state machine and thus doesn't
- to keep the timer tick on for RCU.
+ try to keep the timer tick on for RCU.
Unless you want to hack and help the development of the full
- tickless feature, you shouldn't enable this option. It adds
- unnecessary overhead.
+ dynticks mode, you shouldn't enable this option. It also
+ adds unnecessary overhead.
If unsure say N
-config RCU_USER_QS_FORCE
- bool "Force userspace extended QS by default"
- depends on RCU_USER_QS
+config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
help
- Set the hooks in user/kernel boundaries by default in order to
- test this feature that treats userspace as an extended quiescent
- state until we have a real user like a full adaptive nohz option.
-
- Unless you want to hack and help the development of the full
- tickless feature, you shouldn't enable this option. It adds
- unnecessary overhead.
-
- If unsure say N
+ Probe on user/kernel boundaries by default in order to
+ test the features that rely on it such as userspace RCU extended
+ quiescent states.
+ This test is there for debugging until we have a real user like the
+ full dynticks mode.
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
depends on NO_HZ && SMP
default n
help
- This option causes RCU to attempt to accelerate grace periods
- in order to allow CPUs to enter dynticks-idle state more
- quickly. On the other hand, this option increases the overhead
- of the dynticks-idle checking, particularly on systems with
- large numbers of CPUs.
+ This option causes RCU to attempt to accelerate grace periods in
+ order to allow CPUs to enter dynticks-idle state more quickly.
+ On the other hand, this option increases the overhead of the
+ dynticks-idle checking, thus degrading scheduling latency.
- Say Y if energy efficiency is critically important, particularly
- if you have relatively few CPUs.
+ Say Y if energy efficiency is critically important, and you don't
+ care about real-time response.
Say N if you are unsure.
Accept the default if unsure.
+config RCU_NOCB_CPU
+ bool "Offload RCU callback processing from boot-selected CPUs"
+ depends on TREE_RCU || TREE_PREEMPT_RCU
+ default n
+ help
+ Use this option to reduce OS jitter for aggressive HPC or
+ real-time workloads. It can also be used to offload RCU
+ callback invocation to energy-efficient CPUs in battery-powered
+ asymmetric multiprocessors.
+
+ This option offloads callback invocation from the set of
+ CPUs specified at boot time by the rcu_nocbs parameter.
+ For each such CPU, a kthread ("rcuoN") will be created to
+ invoke callbacks, where the "N" is the CPU being offloaded.
+ Nothing prevents this kthread from running on the specified
+ CPUs, but (1) the kthreads may be preempted between each
+ callback, and (2) affinity or cgroups can be used to force
+ the kthreads to run on whatever set of CPUs is desired.
+
+ Say Y here if you want reduced OS jitter on selected CPUs.
+ Say N here if you are unsure.
+
endmenu # "RCU Subsystem"
config IKCONFIG
config HAVE_UNSTABLE_SCHED_CLOCK
bool
+#
+# For architectures that want to enable the support for NUMA-affine scheduler
+# balancing logic:
+#
+config ARCH_SUPPORTS_NUMA_BALANCING
+ bool
+
+# For architectures that (ab)use NUMA to represent different memory regions
+# all cpu-local but of different latencies, such as SuperH.
+#
+config ARCH_WANT_NUMA_VARIABLE_LOCALITY
+ bool
+
+#
+# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
+config ARCH_WANTS_PROT_NUMA_PROT_NONE
+ bool
+
+config ARCH_USES_NUMA_PROT_NONE
+ bool
+ default y
+ depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
+ depends on NUMA_BALANCING
+
+config NUMA_BALANCING_DEFAULT_ENABLED
+ bool "Automatically enable NUMA aware memory/task placement"
+ default y
+ depends on NUMA_BALANCING
+ help
+ If set, autonumic NUMA balancing will be enabled if running on a NUMA
+ machine.
+
+config NUMA_BALANCING
+ bool "Memory placement aware NUMA scheduler"
+ depends on ARCH_SUPPORTS_NUMA_BALANCING
+ depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
+ depends on SMP && NUMA && MIGRATION
+ help
+ This option adds support for automatic NUMA aware memory/task placement.
+ The mechanism is quite primitive and is based on migrating memory when
+ it is references to the node the task is running on.
+
+ This system will be inactive on UMA systems.
+
menuconfig CGROUPS
boolean "Control Group support"
depends on EVENTFD
# Filesystems
depends on 9P_FS = n
depends on AFS_FS = n
- depends on AUTOFS4_FS = n
depends on CEPH_FS = n
depends on CIFS = n
depends on CODA_FS = n
- depends on FUSE_FS = n
depends on GFS2_FS = n
depends on NCP_FS = n
depends on NFSD = n
{
}
+# if THREAD_SIZE >= PAGE_SIZE
void __init __weak thread_info_cache_init(void)
{
}
+#endif
/*
* Set up kernel memory allocators
system_state = SYSTEM_RUNNING;
numa_default_policy();
- current->signal->flags |= SIGNAL_UNKILLABLE;
flush_delayed_fput();
if (ramdisk_execute_command) {
/*
* init can allocate pages on any node
*/
- set_mems_allowed(node_states[N_HIGH_MEMORY]);
+ set_mems_allowed(node_states[N_MEMORY]);
/*
* init can run on any cpu.
*/
/* Hierarchy-specific flags */
unsigned long flags;
+ /* IDs for cgroups in this hierarchy */
+ struct ida cgroup_ida;
+
/* The path to use for release notifications. */
char release_agent_path[PATH_MAX];
* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
+ * is called after synchronize_rcu(). But for safe use, css_tryget()
+ * should be used for avoiding race.
*/
struct cgroup_subsys_state __rcu *css;
/*
*/
static int need_forkexit_callback __read_mostly;
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+ struct cftype cfts[], bool is_add);
+
#ifdef CONFIG_PROVE_LOCKING
int cgroup_lock_is_held(void)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-static int clone_children(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
* The task_lock() exception
*
* The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
return inode;
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- int ret = 0;
-
- for_each_subsys(cgrp->root, ss) {
- if (!ss->pre_destroy)
- continue;
-
- ret = ss->pre_destroy(cgrp);
- if (ret) {
- /* ->pre_destroy() failure is being deprecated */
- WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
- break;
- }
- }
-
- return ret;
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
* Release the subsystem state objects.
*/
for_each_subsys(cgrp->root, ss)
- ss->destroy(cgrp);
+ ss->css_free(cgrp);
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
simple_xattrs_free(&cgrp->xattrs);
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
kfree_rcu(cgrp, rcu_head);
} else {
struct cfent *cfe = __d_cfe(dentry);
if (!test_bit(ss->subsys_id, &subsys_mask))
continue;
list_for_each_entry(set, &ss->cftsets, node)
- cgroup_rm_file(cgrp, set->cfts);
+ cgroup_addrm_files(cgrp, NULL, set->cfts, false);
}
if (base_files) {
while (!list_empty(&cgrp->files))
remove_dir(dentry);
}
-/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
seq_puts(seq, ",xattr");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
- if (clone_children(&root->top_cgroup))
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
unsigned long subsys_mask;
unsigned long flags;
char *release_agent;
- bool clone_children;
+ bool cpuset_clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
continue;
}
if (!strcmp(token, "clone_children")) {
- opts->clone_children = true;
+ opts->cpuset_clone_children = true;
continue;
}
if (!strcmp(token, "xattr")) {
goto out_unlock;
}
+ /*
+ * Clear out the files of subsystems that should be removed, do
+ * this before rebind_subsystems, since rebind_subsystems may
+ * change this hierarchy's subsys_list.
+ */
+ cgroup_clear_directory(cgrp->dentry, false, removed_mask);
+
ret = rebind_subsystems(root, opts.subsys_mask);
if (ret) {
+ /* rebind_subsystems failed, re-populate the removed files */
+ cgroup_populate_dir(cgrp, false, removed_mask);
drop_parsed_module_refcounts(opts.subsys_mask);
goto out_unlock;
}
- /* clear out any existing files and repopulate subsystem files */
- cgroup_clear_directory(cgrp->dentry, false, removed_mask);
/* re-populate subsystem files */
cgroup_populate_dir(cgrp, false, added_mask);
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->files);
INIT_LIST_HEAD(&cgrp->css_sets);
+ INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
root->number_of_cgroups = 1;
cgrp->root = root;
cgrp->top_cgroup = cgrp;
- list_add_tail(&cgrp->allcg_node, &root->allcg_list);
init_cgroup_housekeeping(cgrp);
+ list_add_tail(&cgrp->allcg_node, &root->allcg_list);
}
static bool init_root_id(struct cgroupfs_root *root)
root->subsys_mask = opts->subsys_mask;
root->flags = opts->flags;
+ ida_init(&root->cgroup_ida);
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
- if (opts->clone_children)
- set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
+ if (opts->cpuset_clone_children)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
return root;
}
spin_lock(&hierarchy_id_lock);
ida_remove(&hierarchy_ida, root->hierarchy_id);
spin_unlock(&hierarchy_id_lock);
+ ida_destroy(&root->cgroup_ida);
kfree(root);
}
free_cg_links(&tmp_cg_links);
- BUG_ON(!list_empty(&root_cgrp->sibling));
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
BUG_ON(root->number_of_cgroups != 1);
BUG_ON(!list_empty(&cgrp->children));
- BUG_ON(!list_empty(&cgrp->sibling));
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
+ struct dentry *dentry = cgrp->dentry;
char *start;
- struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
+
+ rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
+ "cgroup_path() called without proper locking");
if (!dentry || cgrp == dummytop) {
/*
return 0;
}
- start = buf + buflen;
+ start = buf + buflen - 1;
- *--start = '\0';
+ *start = '\0';
for (;;) {
int len = dentry->d_name.len;
if (!cgrp)
break;
- dentry = rcu_dereference_check(cgrp->dentry,
- cgroup_lock_is_held());
+ dentry = cgrp->dentry;
if (!cgrp->parent)
continue;
if (--start < buf)
/*
* cgroup_task_migrate - move a task from one cgroup to another.
*
- * 'guarantee' is set if the caller promises that a new css_set for the task
- * will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
+ * Must be called with cgroup_mutex and threadgroup locked.
*/
static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
struct task_struct *tsk, struct css_set *newcg)
}
synchronize_rcu();
-
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
out:
if (retval) {
for_each_subsys(root, ss) {
* step 5: success! and cleanup
*/
synchronize_rcu();
- cgroup_wakeup_rmdir_waiter(cgrp);
retval = 0;
out_put_css_set_refs:
if (retval) {
/* start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
+ inc_nlink(dentry->d_parent->d_inode);
- /* start with the directory inode held, so that we can
- * populate it without racing with another mkdir */
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ /*
+ * Control reaches here with cgroup_mutex held.
+ * @inode->i_mutex should nest outside cgroup_mutex but we
+ * want to populate it immediately without releasing
+ * cgroup_mutex. As @inode isn't visible to anyone else
+ * yet, trylock will always succeed without affecting
+ * lockdep checks.
+ */
+ WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
} else if (S_ISREG(mode)) {
inode->i_size = 0;
inode->i_fop = &cgroup_file_operations;
return 0;
}
-/*
- * cgroup_create_dir - create a directory for an object.
- * @cgrp: the cgroup we create the directory for. It must have a valid
- * ->parent field. And we are going to fill its ->dentry field.
- * @dentry: dentry of the new cgroup
- * @mode: mode to set on new directory.
- */
-static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- umode_t mode)
-{
- struct dentry *parent;
- int error = 0;
-
- parent = cgrp->parent->dentry;
- error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
- if (!error) {
- dentry->d_fsdata = cgrp;
- inc_nlink(parent->d_inode);
- rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
- }
- dput(dentry);
-
- return error;
-}
-
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
simple_xattrs_init(&cft->xattrs);
- /* does @cft->flags tell us to skip creation on @cgrp? */
- if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
- return 0;
- if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
- return 0;
-
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
strcpy(name, subsys->name);
strcat(name, ".");
int err, ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
+ /* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+ continue;
+ if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+ continue;
+
if (is_add)
err = cgroup_add_file(cgrp, subsys, cft);
else
write_unlock(&css_set_lock);
}
+/**
+ * cgroup_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_pre(). Find the next
+ * descendant to visit for pre-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+ struct cgroup *cgroup)
+{
+ struct cgroup *next;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* if first iteration, pretend we just visited @cgroup */
+ if (!pos) {
+ if (list_empty(&cgroup->children))
+ return NULL;
+ pos = cgroup;
+ }
+
+ /* visit the first child if exists */
+ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
+ if (next)
+ return next;
+
+ /* no child, visit my or the closest ancestor's next sibling */
+ do {
+ next = list_entry_rcu(pos->sibling.next, struct cgroup,
+ sibling);
+ if (&next->sibling != &pos->parent->children)
+ return next;
+
+ pos = pos->parent;
+ } while (pos != cgroup);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+
+static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last;
+
+ do {
+ last = pos;
+ pos = list_first_or_null_rcu(&pos->children, struct cgroup,
+ sibling);
+ } while (pos);
+
+ return last;
+}
+
+/**
+ * cgroup_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * To be used by cgroup_for_each_descendant_post(). Find the next
+ * descendant to visit for post-order traversal of @cgroup's descendants.
+ */
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+ struct cgroup *cgroup)
+{
+ struct cgroup *next;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* if first iteration, visit the leftmost descendant */
+ if (!pos) {
+ next = cgroup_leftmost_descendant(cgroup);
+ return next != cgroup ? next : NULL;
+ }
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+ if (&next->sibling != &pos->parent->children)
+ return cgroup_leftmost_descendant(next);
+
+ /* no sibling left, visit parent */
+ next = pos->parent;
+ return next != cgroup ? next : NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
+
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
__acquires(css_set_lock)
{
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = current->nsproxy->pid_ns;
+ struct pid_namespace *ns = task_active_pid_ns(current);
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
if (flags & POLLHUP) {
__remove_wait_queue(event->wqh, &event->wait);
spin_lock(&cgrp->event_list_lock);
- list_del(&event->list);
+ list_del_init(&event->list);
spin_unlock(&cgrp->event_list_lock);
/*
* We are in atomic context, but cgroup_event_remove() may
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
struct cftype *cft)
{
- return clone_children(cgrp);
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
}
static int cgroup_clone_children_write(struct cgroup *cgrp,
u64 val)
{
if (val)
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
else
- clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
return 0;
}
css->flags = 0;
css->id = NULL;
if (cgrp == dummytop)
- set_bit(CSS_ROOT, &css->flags);
+ css->flags |= CSS_ROOT;
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;
/*
- * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
- * which is put on the last css_put(). dput() requires process
- * context, which css_put() may be called without. @css->dput_work
- * will be used to invoke dput() asynchronously from css_put().
+ * css holds an extra ref to @cgrp->dentry which is put on the last
+ * css_put(). dput() requires process context, which css_put() may
+ * be called without. @css->dput_work will be used to invoke
+ * dput() asynchronously from css_put().
*/
INIT_WORK(&css->dput_work, css_dput_fn);
- if (ss->__DEPRECATED_clear_css_refs)
- set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
+}
+
+/* invoke ->post_create() on a new CSS and mark it online if successful */
+static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ int ret = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (ss->css_online)
+ ret = ss->css_online(cgrp);
+ if (!ret)
+ cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
+ return ret;
+}
+
+/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */
+static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!(css->flags & CSS_ONLINE))
+ return;
+
+ /*
+ * css_offline() should be called with cgroup_mutex unlocked. See
+ * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
+ * details. This temporary unlocking should go away once
+ * cgroup_mutex is unexported from controllers.
+ */
+ if (ss->css_offline) {
+ mutex_unlock(&cgroup_mutex);
+ ss->css_offline(cgrp);
+ mutex_lock(&cgroup_mutex);
+ }
+
+ cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
}
/*
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
+ /* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
return -ENOMEM;
+ cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
+ if (cgrp->id < 0)
+ goto err_free_cgrp;
+
+ /*
+ * Only live parents can have children. Note that the liveliness
+ * check isn't strictly necessary because cgroup_mkdir() and
+ * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+ * anyway so that locking is contained inside cgroup proper and we
+ * don't get nasty surprises if we ever grow another caller.
+ */
+ if (!cgroup_lock_live_group(parent)) {
+ err = -ENODEV;
+ goto err_free_id;
+ }
+
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* fs */
atomic_inc(&sb->s_active);
- mutex_lock(&cgroup_mutex);
-
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
- if (clone_children(parent))
- set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css;
- css = ss->create(cgrp);
+ css = ss->css_alloc(cgrp);
if (IS_ERR(css)) {
err = PTR_ERR(css);
- goto err_destroy;
+ goto err_free_all;
}
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
- goto err_destroy;
+ goto err_free_all;
}
- /* At error, ->destroy() callback has to free assigned ID. */
- if (clone_children(parent) && ss->post_clone)
- ss->post_clone(cgrp);
+ }
+
+ /*
+ * Create directory. cgroup_create_file() returns with the new
+ * directory locked on success so that it can be populated without
+ * dropping cgroup_mutex.
+ */
+ err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
+ if (err < 0)
+ goto err_free_all;
+ lockdep_assert_held(&dentry->d_inode->i_mutex);
+
+ /* allocation complete, commit to creation */
+ dentry->d_fsdata = cgrp;
+ cgrp->dentry = dentry;
+ list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
+ root->number_of_cgroups++;
+
+ /* each css holds a ref to the cgroup's dentry */
+ for_each_subsys(root, ss)
+ dget(dentry);
+
+ /* creation succeeded, notify subsystems */
+ for_each_subsys(root, ss) {
+ err = online_css(ss, cgrp);
+ if (err)
+ goto err_destroy;
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
}
}
- list_add(&cgrp->sibling, &cgrp->parent->children);
- root->number_of_cgroups++;
-
- err = cgroup_create_dir(cgrp, dentry, mode);
- if (err < 0)
- goto err_remove;
-
- /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
- for_each_subsys(root, ss)
- if (!ss->__DEPRECATED_clear_css_refs)
- dget(dentry);
-
- /* The cgroup directory was pre-locked for us */
- BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
-
- list_add_tail(&cgrp->allcg_node, &root->allcg_list);
-
err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
- /* If err < 0, we have a half-filled directory - oh well ;) */
+ if (err)
+ goto err_destroy;
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
- err_remove:
-
- list_del(&cgrp->sibling);
- root->number_of_cgroups--;
-
- err_destroy:
-
+err_free_all:
for_each_subsys(root, ss) {
if (cgrp->subsys[ss->subsys_id])
- ss->destroy(cgrp);
+ ss->css_free(cgrp);
}
-
mutex_unlock(&cgroup_mutex);
-
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
-
+err_free_id:
+ ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_cgrp:
kfree(cgrp);
return err;
+
+err_destroy:
+ cgroup_destroy_locked(cgrp);
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ return err;
}
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return 0;
}
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
+static int cgroup_destroy_locked(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct dentry *d = cgrp->dentry;
+ struct cgroup *parent = cgrp->parent;
+ DEFINE_WAIT(wait);
+ struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
+ LIST_HEAD(tmp_list);
+
+ lockdep_assert_held(&d->d_inode->i_mutex);
+ lockdep_assert_held(&cgroup_mutex);
- local_irq_save(flags);
+ if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+ return -EBUSY;
/*
- * Block new css_tryget() by deactivating refcnt. If all refcnts
- * for subsystems w/ clear_css_refs set were 1 at the moment of
- * deactivation, we succeeded.
+ * Block new css_tryget() by deactivating refcnt and mark @cgrp
+ * removed. This makes future css_tryget() and child creation
+ * attempts fail thus maintaining the removal conditions verified
+ * above.
*/
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
WARN_ON(atomic_read(&css->refcnt) < 0);
atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
- if (ss->__DEPRECATED_clear_css_refs)
- failed |= css_refcnt(css) != 1;
- }
-
- /*
- * If succeeded, set REMOVED and put all the base refs; otherwise,
- * restore refcnts to positive values. Either way, all in-progress
- * css_tryget() will be released.
- */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
- if (!failed) {
- set_bit(CSS_REMOVED, &css->flags);
- css_put(css);
- } else {
- atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
- }
}
+ set_bit(CGRP_REMOVED, &cgrp->flags);
- local_irq_restore(flags);
- return !failed;
-}
-
-static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-{
- struct cgroup *cgrp = dentry->d_fsdata;
- struct dentry *d;
- struct cgroup *parent;
- DEFINE_WAIT(wait);
- struct cgroup_event *event, *tmp;
- int ret;
-
- /* the vfs holds both inode->i_mutex already */
-again:
- mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- mutex_unlock(&cgroup_mutex);
-
- /*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
- */
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ /* tell subsystems to initate destruction */
+ for_each_subsys(cgrp->root, ss)
+ offline_css(ss, cgrp);
/*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
+ * Put all the base refs. Each css holds an extra reference to the
+ * cgroup's dentry and cgroup removal proceeds regardless of css
+ * refs. On the last put of each css, whenever that may be, the
+ * extra dentry ref is put so that dentry destruction happens only
+ * after all css's are released.
*/
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
- }
-
- mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
- }
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ for_each_subsys(cgrp->root, ss)
+ css_put(cgrp->subsys[ss->subsys_id]);
raw_spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/* delete this cgroup from parent->children */
- list_del_init(&cgrp->sibling);
-
+ list_del_rcu(&cgrp->sibling);
list_del_init(&cgrp->allcg_node);
- d = dget(cgrp->dentry);
-
+ dget(d);
cgroup_d_remove_dir(d);
dput(d);
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace
+ * directory to avoid race between userspace and kernelspace. Use
+ * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
+ * cgroup_event_wake() is called with the wait queue head locked,
+ * remove_wait_queue() cannot be called while holding event_list_lock.
*/
spin_lock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
- list_del(&event->list);
+ list_splice_init(&cgrp->event_list, &tmp_list);
+ spin_unlock(&cgrp->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+ list_del_init(&event->list);
remove_wait_queue(event->wqh, &event->wait);
eventfd_signal(event->eventfd, 1);
schedule_work(&event->remove);
}
- spin_unlock(&cgrp->event_list_lock);
- mutex_unlock(&cgroup_mutex);
return 0;
}
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = cgroup_destroy_locked(dentry->d_fsdata);
+ mutex_unlock(&cgroup_mutex);
+
+ return ret;
+}
+
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
INIT_LIST_HEAD(&ss->cftsets);
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
+ mutex_lock(&cgroup_mutex);
+
/* init base cftset */
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
- css = ss->create(dummytop);
+ css = ss->css_alloc(dummytop);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_cgroup_css(css, ss, dummytop);
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+ init_css_set.subsys[ss->subsys_id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
BUG_ON(!list_empty(&init_task.tasks));
ss->active = 1;
+ BUG_ON(online_css(ss, dummytop));
+
+ mutex_unlock(&cgroup_mutex);
/* this function shouldn't be used with modular subsystems, since they
* need to register a subsys_id, among other things */
*/
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
- int i;
struct cgroup_subsys_state *css;
+ int i, ret;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
- ss->create == NULL || ss->destroy == NULL)
+ ss->css_alloc == NULL || ss->css_free == NULL)
return -EINVAL;
/*
subsys[ss->subsys_id] = ss;
/*
- * no ss->create seems to need anything important in the ss struct, so
- * this can happen first (i.e. before the rootnode attachment).
+ * no ss->css_alloc seems to need anything important in the ss
+ * struct, so this can happen first (i.e. before the rootnode
+ * attachment).
*/
- css = ss->create(dummytop);
+ css = ss->css_alloc(dummytop);
if (IS_ERR(css)) {
/* failure case - need to deassign the subsys[] slot. */
subsys[ss->subsys_id] = NULL;
init_cgroup_css(css, ss, dummytop);
/* init_idr must be after init_cgroup_css because it sets css->id. */
if (ss->use_id) {
- int ret = cgroup_init_idr(ss, css);
- if (ret) {
- dummytop->subsys[ss->subsys_id] = NULL;
- ss->destroy(dummytop);
- subsys[ss->subsys_id] = NULL;
- mutex_unlock(&cgroup_mutex);
- return ret;
- }
+ ret = cgroup_init_idr(ss, css);
+ if (ret)
+ goto err_unload;
}
/*
write_unlock(&css_set_lock);
ss->active = 1;
+ ret = online_css(ss, dummytop);
+ if (ret)
+ goto err_unload;
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
+
+err_unload:
+ mutex_unlock(&cgroup_mutex);
+ /* @ss can't be mounted here as try_module_get() would fail */
+ cgroup_unload_subsys(ss);
+ return ret;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
BUG_ON(ss->root != &rootnode);
mutex_lock(&cgroup_mutex);
+
+ offline_css(ss, dummytop);
+ ss->active = 0;
+
+ if (ss->use_id) {
+ idr_remove_all(&ss->idr);
+ idr_destroy(&ss->idr);
+ }
+
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
struct css_set *cg = link->cg;
hlist_del(&cg->hlist);
- BUG_ON(!cg->subsys[ss->subsys_id]);
cg->subsys[ss->subsys_id] = NULL;
hhead = css_set_hash(cg->subsys);
hlist_add_head(&cg->hlist, hhead);
write_unlock(&css_set_lock);
/*
- * remove subsystem's css from the dummytop and free it - need to free
- * before marking as null because ss->destroy needs the cgrp->subsys
- * pointer to find their state. note that this also takes care of
- * freeing the css_id.
+ * remove subsystem's css from the dummytop and free it - need to
+ * free before marking as null because ss->css_free needs the
+ * cgrp->subsys pointer to find their state. note that this also
+ * takes care of freeing the css_id.
*/
- ss->destroy(dummytop);
+ ss->css_free(dummytop);
dummytop->subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
BUG_ON(!ss->name);
BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
- BUG_ON(!ss->create);
- BUG_ON(!ss->destroy);
+ BUG_ON(!ss->css_alloc);
+ BUG_ON(!ss->css_free);
if (ss->subsys_id != i) {
printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
ss->name, ss->subsys_id);
INIT_LIST_HEAD(&child->cg_list);
}
-/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- /*
- * forkexit callbacks are only supported for
- * builtin subsystems.
- */
- if (!ss || ss->module)
- continue;
-
- if (ss->fork)
- ss->fork(child);
- }
- }
-}
-
/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ int i;
+
/*
* use_task_css_set_links is set to 1 before we walk the tasklist
* under the tasklist_lock and we read it here after we added the child
task_unlock(child);
write_unlock(&css_set_lock);
}
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+
+ /*
+ * fork/exit callbacks are supported only for
+ * builtin subsystems and we don't need further
+ * synchronization as they never go away.
+ */
+ if (!ss || ss->module)
+ continue;
+
+ if (ss->fork)
+ ss->fork(child);
+ }
+ }
}
+
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
/* Caller must verify that the css is not for root cgroup */
bool __css_tryget(struct cgroup_subsys_state *css)
{
- do {
- int v = css_refcnt(css);
+ while (true) {
+ int t, v;
- if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+ v = css_refcnt(css);
+ t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+ if (likely(t == v))
return true;
+ else if (t < 0)
+ return false;
cpu_relax();
- } while (!test_bit(CSS_REMOVED, &css->flags));
-
- return false;
+ }
}
EXPORT_SYMBOL_GPL(__css_tryget);
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiter(cgrp);
break;
case 0:
- if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
- schedule_work(&css->dput_work);
+ schedule_work(&css->dput_work);
break;
}
rcu_read_unlock();
}
#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
return css;
}
-static void debug_destroy(struct cgroup *cont)
+static void debug_css_free(struct cgroup *cont)
{
kfree(cont->subsys[debug_subsys_id]);
}
struct cgroup_subsys debug_subsys = {
.name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
.subsys_id = debug_subsys_id,
.base_cftypes = debug_files,
};
event->parent = parent_event;
- event->ns = get_pid_ns(current->nsproxy->pid_ns);
+ event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);
event->state = PERF_EVENT_STATE_INACTIVE;
device_initcall(perf_event_sysfs_init);
#ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
+static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
{
struct perf_cgroup *jc;
return &jc->css;
}
-static void perf_cgroup_destroy(struct cgroup *cont)
+static void perf_cgroup_css_free(struct cgroup *cont)
{
struct perf_cgroup *jc;
jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
struct cgroup_subsys perf_subsys = {
.name = "perf_event",
.subsys_id = perf_subsys_id,
- .create = perf_cgroup_create,
- .destroy = perf_cgroup_destroy,
+ .css_alloc = perf_cgroup_css_alloc,
+ .css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
- /*
- * If we are the last child process in a pid namespace to be
- * reaped, notify the reaper sleeping zap_pid_ns_processes().
- */
- if (IS_ENABLED(CONFIG_PID_NS)) {
- struct task_struct *parent = p->real_parent;
-
- if ((task_active_pid_ns(parent)->child_reaper == parent) &&
- list_empty(&parent->children) &&
- (parent->flags & PF_EXITING))
- wake_up_process(parent);
- }
}
list_del_rcu(&p->thread_group);
}
}
}
-/**
- * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to kthreadd so it
- * isn't in the way of other processes and is correctly cleaned up on exit.
- *
- * The various task state such as scheduling policy and priority may have
- * been inherited from a user process, so we reset them to sane values here.
- *
- * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
- */
-static void reparent_to_kthreadd(void)
-{
- write_lock_irq(&tasklist_lock);
-
- ptrace_unlink(current);
- /* Reparent to init */
- current->real_parent = current->parent = kthreadd_task;
- list_move_tail(¤t->sibling, ¤t->real_parent->children);
-
- /* Set the exit signal to SIGCHLD so we signal init on exit */
- current->exit_signal = SIGCHLD;
-
- if (task_nice(current) < 0)
- set_user_nice(current, 0);
- /* cpus_allowed? */
- /* rt_priority? */
- /* signals? */
- memcpy(current->signal->rlim, init_task.signal->rlim,
- sizeof(current->signal->rlim));
-
- atomic_inc(&init_cred.usage);
- commit_creds(&init_cred);
- write_unlock_irq(&tasklist_lock);
-}
-
void __set_special_pids(struct pid *pid)
{
struct task_struct *curr = current->group_leader;
change_pid(curr, PIDTYPE_PGID, pid);
}
-static void set_special_pids(struct pid *pid)
-{
- write_lock_irq(&tasklist_lock);
- __set_special_pids(pid);
- write_unlock_irq(&tasklist_lock);
-}
-
/*
* Let kernel threads use this to say that they allow a certain signal.
* Must not be used if kthread was cloned with CLONE_SIGHAND.
EXPORT_SYMBOL(disallow_signal);
-/*
- * Put all the gunge required to become a kernel thread without
- * attached user resources in one place where it belongs.
- */
-
-void daemonize(const char *name, ...)
-{
- va_list args;
- sigset_t blocked;
-
- va_start(args, name);
- vsnprintf(current->comm, sizeof(current->comm), name, args);
- va_end(args);
-
- /*
- * If we were started as result of loading a module, close all of the
- * user space pages. We don't need them, and if we didn't close them
- * they would be locked into memory.
- */
- exit_mm(current);
- /*
- * We don't want to get frozen, in case system-wide hibernation
- * or suspend transition begins right now.
- */
- current->flags |= (PF_NOFREEZE | PF_KTHREAD);
-
- if (current->nsproxy != &init_nsproxy) {
- get_nsproxy(&init_nsproxy);
- switch_task_namespaces(current, &init_nsproxy);
- }
- set_special_pids(&init_struct_pid);
- proc_clear_tty(current);
-
- /* Block and flush all signals */
- sigfillset(&blocked);
- sigprocmask(SIG_BLOCK, &blocked, NULL);
- flush_signals(current);
-
- /* Become as one with the init task */
-
- daemonize_fs_struct();
- daemonize_descriptors();
-
- reparent_to_kthreadd();
-}
-
-EXPORT_SYMBOL(daemonize);
-
#ifdef CONFIG_MM_OWNER
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_times() to get times for the thread
+ * We use thread_group_cputime_adjusted() to get times for the thread
* group, which consolidates times for all threads in the
* group including the group leader.
*/
- thread_group_times(p, &tgutime, &tgstime);
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
unsigned long charge;
struct mempolicy *pol;
+ uprobe_start_dup_mmap();
down_write(&oldmm->mmap_sem);
flush_cache_dup_mm(oldmm);
uprobe_dup_mmap(oldmm, mm);
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+ uprobe_end_dup_mmap();
return retval;
fail_nomem_anon_vma_fork:
mpol_put(pol);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->first_nid = NUMA_PTE_SCAN_INIT;
#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
atomic_set(&sig->live, 1);
atomic_set(&sig->sigcnt, 1);
init_waitqueue_head(&sig->wait_chldexit);
- if (clone_flags & CLONE_NEWPID)
- sig->flags |= SIGNAL_UNKILLABLE;
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
*/
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
- struct pt_regs *regs,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
{
int retval;
struct task_struct *p;
- int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- p->prev_utime = p->prev_stime = 0;
+ p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
+ retval = copy_thread(clone_flags, stack_start, stack_size, p);
if (retval)
goto bad_fork_cleanup_io;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
- /* Now that the task is set up, run cgroup callbacks if
- * necessary. We need to run them before the task is visible
- * on the tasklist. */
- cgroup_fork_callbacks(p);
- cgroup_callbacks_done = 1;
-
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
if (thread_group_leader(p)) {
- if (is_child_reaper(pid))
- p->nsproxy->pid_ns->child_reaper = p;
+ if (is_child_reaper(pid)) {
+ ns_of_pid(pid)->child_reaper = p;
+ p->signal->flags |= SIGNAL_UNKILLABLE;
+ }
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- if (unlikely(clone_flags & CLONE_NEWPID))
- pid_ns_release_proc(p->nsproxy->pid_ns);
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm)
#endif
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
- cgroup_exit(p, cgroup_callbacks_done);
+ cgroup_exit(p, 0);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
return ERR_PTR(retval);
}
-noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
-{
- memset(regs, 0, sizeof(struct pt_regs));
- return regs;
-}
-
static inline void init_idle_pids(struct pid_link *links)
{
enum pid_type type;
struct task_struct * __cpuinit fork_idle(int cpu)
{
struct task_struct *task;
- struct pt_regs regs;
-
- task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL,
- &init_struct_pid, 0);
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
*/
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
- struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
* Do some preliminary argument and permissions checking before we
* actually start allocating stuff
*/
- if (clone_flags & CLONE_NEWUSER) {
- if (clone_flags & CLONE_THREAD)
+ if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
+ if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
return -EINVAL;
- /* hopefully this check will go away when userns support is
- * complete
- */
- if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
- !capable(CAP_SETGID))
- return -EPERM;
}
/*
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
- if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) {
+ if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = 0;
}
- p = copy_process(clone_flags, stack_start, regs, stack_size,
+ p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace);
/*
* Do this prior waking up the new thread - the thread pointer
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
- return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL,
+ return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL);
}
#endif
+#ifdef __ARCH_WANT_SYS_FORK
+SYSCALL_DEFINE0(fork)
+{
+#ifdef CONFIG_MMU
+ return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+#else
+ /* can not support in nommu mode */
+ return(-EINVAL);
+#endif
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_VFORK
+SYSCALL_DEFINE0(vfork)
+{
+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ 0, NULL, NULL);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE
+#ifdef CONFIG_CLONE_BACKWARDS
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int, tls_val,
+ int __user *, child_tidptr)
+#elif defined(CONFIG_CLONE_BACKWARDS2)
+SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#else
+SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
+#endif
+{
+ return do_fork(clone_flags, newsp, 0,
+ parent_tidptr, child_tidptr);
+}
+#endif
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
{
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+ CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
{
struct fs_struct *fs, *new_fs = NULL;
struct files_struct *fd, *new_fd = NULL;
+ struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
int err;
- err = check_unshare_flags(unshare_flags);
- if (err)
- goto bad_unshare_out;
-
+ /*
+ * If unsharing a user namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWUSER)
+ unshare_flags |= CLONE_THREAD;
+ /*
+ * If unsharing a pid namespace must also unshare the thread.
+ */
+ if (unshare_flags & CLONE_NEWPID)
+ unshare_flags |= CLONE_THREAD;
+ /*
+ * If unsharing a thread from a thread group, must also unshare vm.
+ */
+ if (unshare_flags & CLONE_THREAD)
+ unshare_flags |= CLONE_VM;
+ /*
+ * If unsharing vm, must also unshare signal handlers.
+ */
+ if (unshare_flags & CLONE_VM)
+ unshare_flags |= CLONE_SIGHAND;
/*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
+
+ err = check_unshare_flags(unshare_flags);
+ if (err)
+ goto bad_unshare_out;
/*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
err = unshare_fd(unshare_flags, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
- err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
+ err = unshare_userns(unshare_flags, &new_cred);
if (err)
goto bad_unshare_cleanup_fd;
+ err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_cred, new_fs);
+ if (err)
+ goto bad_unshare_cleanup_cred;
- if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
}
task_unlock(current);
+
+ if (new_cred) {
+ /* Install the new user namespace */
+ commit_creds(new_cred);
+ new_cred = NULL;
+ }
}
if (new_nsproxy)
put_nsproxy(new_nsproxy);
+ bad_unshare_cleanup_cred:
+ if (new_cred)
+ put_cred(new_cred);
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
- * (C) 2002-2003 William Irwin, IBM
- * (C) 2004 William Irwin, Oracle
+ * (C) 2002-2003 Nadia Yvette Chambers, IBM
+ * (C) 2004 Nadia Yvette Chambers, Oracle
* (C) 2002-2004 Ingo Molnar, Red Hat
*
* pid-structures are backing objects for tasks sharing a given ID to chain
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
+ #include <linux/proc_fs.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
.last_pid = 0,
.level = 0,
.child_reaper = &init_task,
+ .user_ns = &init_user_ns,
+ .proc_inum = PROC_PID_INIT_INO,
};
EXPORT_SYMBOL_GPL(init_pid_ns);
unsigned long flags;
spin_lock_irqsave(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- hlist_del_rcu(&pid->numbers[i].pid_chain);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ struct pid_namespace *ns = upid->ns;
+ hlist_del_rcu(&upid->pid_chain);
+ switch(--ns->nr_hashed) {
+ case 1:
+ /* When all that is left in the pid namespace
+ * is the reaper wake up the reaper. The reaper
+ * may be sleeping in zap_pid_ns_processes().
+ */
+ wake_up_process(ns->child_reaper);
+ break;
+ case 0:
+ ns->nr_hashed = -1;
+ schedule_work(&ns->proc_work);
+ break;
+ }
+ }
spin_unlock_irqrestore(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++)
goto out;
tmp = ns;
+ pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
if (nr < 0)
tmp = tmp->parent;
}
+ if (unlikely(is_child_reaper(pid))) {
+ if (pid_ns_prepare_proc(ns))
+ goto out_free;
+ }
+
get_pid_ns(ns);
- pid->level = ns->level;
atomic_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- for ( ; upid >= pid->numbers; --upid)
+ if (ns->nr_hashed < 0)
+ goto out_unlock;
+ for ( ; upid >= pid->numbers; --upid) {
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ upid->ns->nr_hashed++;
+ }
spin_unlock_irq(&pidmap_lock);
out:
return pid;
+ out_unlock:
+ spin_unlock(&pidmap_lock);
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
struct pid *find_vpid(int nr)
{
- return find_pid_ns(nr, current->nsproxy->pid_ns);
+ return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);
struct task_struct *find_task_by_vpid(pid_t vnr)
{
- return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+ return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
pid_t pid_vnr(struct pid *pid)
{
- return pid_nr_ns(pid, current->nsproxy->pid_ns);
+ return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);
rcu_read_lock();
if (!ns)
- ns = current->nsproxy->pid_ns;
+ ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ init_pid_ns.nr_hashed = 1;
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int sched_feat_set(char *cmp)
{
- char buf[64];
- char *cmp;
- int neg = 0;
int i;
-
- if (cnt > 63)
- cnt = 63;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
+ int neg = 0;
if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
}
}
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ i = sched_feat_set(cmp);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
rq->skip_clock_update = 1;
}
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
+ struct task_migration_notifier tmn;
+
+ if (p->sched_class->migrate_task_rq)
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+
+ tmn.task = p;
+ tmn.from_cpu = task_cpu(p);
+ tmn.to_cpu = new_cpu;
+
+ atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
}
__set_task_cpu(p, new_cpu);
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ p->se.avg.runnable_avg_period = 0;
+ p->se.avg.runnable_avg_sum = 0;
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies;
+ p->mm->numa_next_reset = jiffies;
+ p->mm->numa_scan_seq = 0;
+ }
+
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
}
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_SCHED_DEBUG
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sched_feat_set("NUMA");
+ else
+ sched_feat_set("NO_NUMA");
+}
+#else
+__read_mostly bool numabalancing_enabled;
+
+void set_numabalancing_state(bool enabled)
+{
+ numabalancing_enabled = enabled;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_NUMA_BALANCING */
+
/*
* fork()/clone()-time setup:
*/
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif
+ context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
- rcu_switch(prev, next);
switch_to(prev, next, prev);
barrier();
}
EXPORT_SYMBOL(schedule);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_CONTEXT_TRACKING
asmlinkage void __sched schedule_user(void)
{
/*
* we haven't yet exited the RCU idle mode. Do it here manually until
* we find a better solution.
*/
- rcu_user_exit();
+ user_exit();
schedule();
- rcu_user_enter();
+ user_enter();
}
#endif
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
- rcu_user_exit();
+ user_exit();
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
- goto out_unlock;
+ if (!check_same_owner(p)) {
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
+ rcu_read_unlock();
+ }
retval = security_task_setscheduler(p);
if (retval)
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
+ int ppid;
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
+ rcu_read_lock();
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ rcu_read_unlock();
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
+ task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
show_stack(p, NULL);
struct task_group, css);
}
-static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
{
struct task_group *tg, *parent;
return &tg->css;
}
-static void cpu_cgroup_destroy(struct cgroup *cgrp)
+static void cpu_cgroup_css_free(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
- .create = cpu_cgroup_create,
- .destroy = cpu_cgroup_destroy,
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_free = cpu_cgroup_css_free,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
struct cpuacct root_cpuacct;
/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
{
struct cpuacct *ca;
}
/* destroy an existing cpu accounting group */
-static void cpuacct_destroy(struct cgroup *cgrp)
+static void cpuacct_css_free(struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
- .create = cpuacct_create,
- .destroy = cpuacct_destroy,
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
.subsys_id = cpuacct_subsys_id,
.base_cftypes = files,
};
#endif /* CONFIG_CGROUP_CPUACCT */
+
+void dump_cpu_task(int cpu)
+{
+ pr_info("Task dump for CPU %d:\n", cpu);
+ sched_show_task(cpu_curr(cpu));
+}
return __send_signal(sig, info, t, group, from_ancestor_ns);
}
-static void print_fatal_signal(struct pt_regs *regs, int signr)
+static void print_fatal_signal(int signr)
{
+ struct pt_regs *regs = signal_pt_regs();
printk("%s/%d: potentially unexpected fatal signal %d.\n",
current->comm, task_pid_nr(current), signr);
* see comment in do_notify_parent() about the following 4 lines
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
rcu_read_unlock();
preempt_disable();
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
- schedule();
+ freezable_schedule();
} else {
/*
* By the time we got the lock, our tracer went away.
read_unlock(&tasklist_lock);
}
- /*
- * While in TASK_TRACED, we were considered "frozen enough".
- * Now that we woke up, it's crucial if we're supposed to be
- * frozen that we freeze now before running anything substantial.
- */
- try_to_freeze();
-
/*
* We are back. Now reacquire the siglock before touching
* last_siginfo, so that we are sure to have synchronized with
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
- schedule();
+ freezable_schedule();
return true;
} else {
/*
}
}
-static int ptrace_signal(int signr, siginfo_t *info,
- struct pt_regs *regs, void *cookie)
+static int ptrace_signal(int signr, siginfo_t *info)
{
- ptrace_signal_deliver(regs, cookie);
+ ptrace_signal_deliver();
/*
* We do not check sig_kernel_stop(signr) but set this marker
* unconditionally because we do not know whether debugger will
if (unlikely(uprobe_deny_signal()))
return 0;
-relock:
/*
- * We'll jump back here after any time we were stopped in TASK_STOPPED.
- * While in TASK_STOPPED, we were considered "frozen enough".
- * Now that we woke up, it's crucial if we're supposed to be
- * frozen that we freeze now before running anything substantial.
+ * Do this once, we can't return to user-mode if freezing() == T.
+ * do_signal_stop() and ptrace_stop() do freezable_schedule() and
+ * thus do not need another check after return.
*/
try_to_freeze();
+relock:
spin_lock_irq(&sighand->siglock);
/*
* Every stopped thread goes here after wakeup. Check to see if
break; /* will return 0 */
if (unlikely(current->ptrace) && signr != SIGKILL) {
- signr = ptrace_signal(signr, info,
- regs, cookie);
+ signr = ptrace_signal(signr, info);
if (!signr)
continue;
}
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(regs, info->si_signo);
+ print_fatal_signal(info->si_signo);
/*
* If it was able to dump core, this kills all
* other threads in the group and synchronizes with
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(info, regs);
+ do_coredump(info);
}
/*
#include <linux/ptrace.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
+#include <linux/workqueue.h>
#define YAMA_SCOPE_DISABLED 0
#define YAMA_SCOPE_RELATIONAL 1
struct ptrace_relation {
struct task_struct *tracer;
struct task_struct *tracee;
+ bool invalid;
struct list_head node;
+ struct rcu_head rcu;
};
static LIST_HEAD(ptracer_relations);
static DEFINE_SPINLOCK(ptracer_relations_lock);
+static void yama_relation_cleanup(struct work_struct *work);
+static DECLARE_WORK(yama_relation_work, yama_relation_cleanup);
+
+/**
+ * yama_relation_cleanup - remove invalid entries from the relation list
+ *
+ */
+static void yama_relation_cleanup(struct work_struct *work)
+{
+ struct ptrace_relation *relation;
+
+ spin_lock(&ptracer_relations_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(relation, &ptracer_relations, node) {
+ if (relation->invalid) {
+ list_del_rcu(&relation->node);
+ kfree_rcu(relation, rcu);
+ }
+ }
+ rcu_read_unlock();
+ spin_unlock(&ptracer_relations_lock);
+}
+
/**
* yama_ptracer_add - add/replace an exception for this tracer/tracee pair
* @tracer: the task_struct of the process doing the ptrace
static int yama_ptracer_add(struct task_struct *tracer,
struct task_struct *tracee)
{
- int rc = 0;
- struct ptrace_relation *added;
- struct ptrace_relation *entry, *relation = NULL;
+ struct ptrace_relation *relation, *added;
added = kmalloc(sizeof(*added), GFP_KERNEL);
if (!added)
return -ENOMEM;
- spin_lock_bh(&ptracer_relations_lock);
- list_for_each_entry(entry, &ptracer_relations, node)
- if (entry->tracee == tracee) {
- relation = entry;
- break;
+ added->tracee = tracee;
+ added->tracer = tracer;
+ added->invalid = false;
+
+ spin_lock(&ptracer_relations_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(relation, &ptracer_relations, node) {
+ if (relation->invalid)
+ continue;
+ if (relation->tracee == tracee) {
+ list_replace_rcu(&relation->node, &added->node);
+ kfree_rcu(relation, rcu);
+ goto out;
}
- if (!relation) {
- relation = added;
- relation->tracee = tracee;
- list_add(&relation->node, &ptracer_relations);
}
- relation->tracer = tracer;
- spin_unlock_bh(&ptracer_relations_lock);
- if (added != relation)
- kfree(added);
+ list_add_rcu(&added->node, &ptracer_relations);
- return rc;
+out:
+ rcu_read_unlock();
+ spin_unlock(&ptracer_relations_lock);
+ return 0;
}
/**
static void yama_ptracer_del(struct task_struct *tracer,
struct task_struct *tracee)
{
- struct ptrace_relation *relation, *safe;
+ struct ptrace_relation *relation;
+ bool marked = false;
- spin_lock_bh(&ptracer_relations_lock);
- list_for_each_entry_safe(relation, safe, &ptracer_relations, node)
+ rcu_read_lock();
+ list_for_each_entry_rcu(relation, &ptracer_relations, node) {
+ if (relation->invalid)
+ continue;
if (relation->tracee == tracee ||
(tracer && relation->tracer == tracer)) {
- list_del(&relation->node);
- kfree(relation);
+ relation->invalid = true;
+ marked = true;
}
- spin_unlock_bh(&ptracer_relations_lock);
+ }
+ rcu_read_unlock();
+
+ if (marked)
+ schedule_work(&yama_relation_work);
}
/**
struct task_struct *parent = NULL;
bool found = false;
- spin_lock_bh(&ptracer_relations_lock);
rcu_read_lock();
if (!thread_group_leader(tracee))
tracee = rcu_dereference(tracee->group_leader);
- list_for_each_entry(relation, &ptracer_relations, node)
+ list_for_each_entry_rcu(relation, &ptracer_relations, node) {
+ if (relation->invalid)
+ continue;
if (relation->tracee == tracee) {
parent = relation->tracer;
found = true;
break;
}
+ }
if (found && (parent == NULL || task_is_descendant(parent, tracer)))
rc = 1;
rcu_read_unlock();
- spin_unlock_bh(&ptracer_relations_lock);
return rc;
}
/* No additional restrictions. */
break;
case YAMA_SCOPE_RELATIONAL:
+ rcu_read_lock();
if (!task_is_descendant(current, child) &&
!ptracer_exception_found(current, child) &&
- !ns_capable(task_user_ns(child), CAP_SYS_PTRACE))
+ !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
rc = -EPERM;
+ rcu_read_unlock();
break;
case YAMA_SCOPE_CAPABILITY:
- if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE))
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
rc = -EPERM;
+ rcu_read_unlock();
break;
case YAMA_SCOPE_NO_ATTACH:
default:
/* Only disallow PTRACE_TRACEME on more aggressive settings. */
switch (ptrace_scope) {
case YAMA_SCOPE_CAPABILITY:
- if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE))
+ rcu_read_lock();
+ if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE))
rc = -EPERM;
+ rcu_read_unlock();
break;
case YAMA_SCOPE_NO_ATTACH:
rc = -EPERM;