/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
+static struct cgroup_name root_cgroup_name = { .name = "/" };
+
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
* extra work in the fork/exit path if none of the subsystems need to
{
int i;
struct cgroupfs_root *root = cgrp->root;
- struct hlist_node *node;
struct css_set *cg;
unsigned long key;
}
key = css_set_hash(template);
- hash_for_each_possible(css_set_table, cg, node, hlist, key) {
+ hash_for_each_possible(css_set_table, cg, hlist, key) {
if (!compare_css_sets(cg, oldcg, cgrp, template))
continue;
return inode;
}
+static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+{
+ struct cgroup_name *name;
+
+ name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
+ if (!name)
+ return NULL;
+ strcpy(name->name, dentry->d_name.name);
+ return name;
+}
+
static void cgroup_free_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
simple_xattrs_free(&cgrp->xattrs);
ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+ kfree(rcu_dereference_raw(cgrp->name));
kfree(cgrp);
}
INIT_LIST_HEAD(&root->allcg_list);
root->number_of_cgroups = 1;
cgrp->root = root;
+ cgrp->name = &root_cgroup_name;
cgrp->top_cgroup = cgrp;
init_cgroup_housekeeping(cgrp);
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
- struct hlist_node *node;
struct css_set *cg;
BUG_ON(sb->s_root != NULL);
/* Link the top cgroup in this hierarchy into all
* the css_set objects */
write_lock(&css_set_lock);
- hash_for_each(css_set_table, i, node, cg, hlist)
+ hash_for_each(css_set_table, i, cg, hlist)
link_css_set(&tmp_cg_links, cg, root_cgrp);
write_unlock(&css_set_lock);
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
- struct dentry *dentry = cgrp->dentry;
+ int ret = -ENAMETOOLONG;
char *start;
- rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
- "cgroup_path() called without proper locking");
-
- if (cgrp == dummytop) {
- /*
- * Inactive subsystems have no dentry for their root
- * cgroup
- */
- strcpy(buf, "/");
- return 0;
- }
-
start = buf + buflen - 1;
-
*start = '\0';
- for (;;) {
- int len = dentry->d_name.len;
+ rcu_read_lock();
+ while (cgrp) {
+ const char *name = cgroup_name(cgrp);
+ int len;
+
+ len = strlen(name);
if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, dentry->d_name.name, len);
- cgrp = cgrp->parent;
- if (!cgrp)
- break;
+ goto out;
+ memcpy(start, name, len);
- dentry = cgrp->dentry;
if (!cgrp->parent)
- continue;
+ break;
+
if (--start < buf)
- return -ENAMETOOLONG;
+ goto out;
*start = '/';
+
+ cgrp = cgrp->parent;
}
+ ret = 0;
memmove(buf, start, buf + buflen - start);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path);
if (!group)
return -ENOMEM;
/* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
if (retval)
goto out_free_group_list;
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ int ret;
+ struct cgroup_name *name, *old_name;
+ struct cgroup *cgrp;
+
+ /*
+ * It's convinient to use parent dir's i_mutex to protected
+ * cgrp->name.
+ */
+ lockdep_assert_held(&old_dir->i_mutex);
+
if (!S_ISDIR(old_dentry->d_inode->i_mode))
return -ENOTDIR;
if (new_dentry->d_inode)
return -EEXIST;
if (old_dir != new_dir)
return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ cgrp = __d_cgrp(old_dentry);
+
+ name = cgroup_alloc_name(new_dentry);
+ if (!name)
+ return -ENOMEM;
+
+ ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ old_name = cgrp->name;
+ rcu_assign_pointer(cgrp->name, name);
+
+ kfree_rcu(old_name, rcu_head);
+ return 0;
}
static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
*/
static inline struct cftype *__file_cft(struct file *file)
{
- if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+ if (file_inode(file)->i_fop != &cgroup_file_operations)
return ERR_PTR(-EINVAL);
return __d_cft(file->f_dentry);
}
else
kfree(p);
}
-static void *pidlist_resize(void *p, int newcount)
-{
- void *newlist;
- /* note: if new alloc fails, old p will still be valid either way */
- if (is_vmalloc_addr(p)) {
- newlist = vmalloc(newcount * sizeof(pid_t));
- if (!newlist)
- return NULL;
- memcpy(newlist, p, newcount * sizeof(pid_t));
- vfree(p);
- } else {
- newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
- }
- return newlist;
-}
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * If the new stripped list is sufficiently smaller and there's enough memory
- * to allocate a new buffer, will let go of the unneeded memory. Returns the
- * number of unique elements.
+ * Returns the number of unique elements.
*/
-/* is the size difference enough that we should re-allocate the array? */
-#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
-static int pidlist_uniq(pid_t **p, int length)
+static int pidlist_uniq(pid_t *list, int length)
{
int src, dest = 1;
- pid_t *list = *p;
- pid_t *newlist;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
dest++;
}
after:
- /*
- * if the length difference is large enough, we want to allocate a
- * smaller buffer to save memory. if this fails due to out of memory,
- * we'll just stay with what we've got.
- */
- if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
- newlist = pidlist_resize(list, dest);
- if (newlist)
- *p = newlist;
- }
return dest;
}
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(&array, length);
+ length = pidlist_uniq(array, length);
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
pidlist_free(array);
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
+ ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto fail;
if (ret)
goto fail;
- if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
- event->cft->unregister_event(cgrp, event->cft, event->eventfd);
- ret = 0;
- goto fail;
- }
-
/*
* Events should be removed after rmdir of cgroup directory, but before
* destroying subsystem state objects. Let's take reference to cgroup
if (!(css->flags & CSS_ONLINE))
return;
- /*
- * css_offline() should be called with cgroup_mutex unlocked. See
- * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
- * details. This temporary unlocking should go away once
- * cgroup_mutex is unexported from controllers.
- */
- if (ss->css_offline) {
- mutex_unlock(&cgroup_mutex);
+ if (ss->css_offline)
ss->css_offline(cgrp);
- mutex_lock(&cgroup_mutex);
- }
cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
}
umode_t mode)
{
struct cgroup *cgrp;
+ struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
int err = 0;
struct cgroup_subsys *ss;
if (!cgrp)
return -ENOMEM;
+ name = cgroup_alloc_name(dentry);
+ if (!name)
+ goto err_free_cgrp;
+ rcu_assign_pointer(cgrp->name, name);
+
cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
if (cgrp->id < 0)
- goto err_free_cgrp;
+ goto err_free_name;
/*
* Only live parents can have children. Note that the liveliness
deactivate_super(sb);
err_free_id:
ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_name:
+ kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp:
kfree(cgrp);
return err;
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
-/*
- * Check the reference count on each subsystem. Since we already
- * established that there are no tasks in the cgroup, if the css refcount
- * is also 1, then there should be no outstanding references, so the
- * subsystem is safe to destroy. We scan across all subsystems rather than
- * using the per-hierarchy linked list of mounted subsystems since we can
- * be called via check_for_release() with no synchronization other than
- * RCU, and the subsystem linked list isn't RCU-safe.
- */
-static int cgroup_has_css_refs(struct cgroup *cgrp)
-{
- int i;
-
- /*
- * We won't need to lock the subsys array, because the subsystems
- * we're concerned about aren't going anywhere since our cgroup root
- * has a reference on them.
- */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- struct cgroup_subsys_state *css;
-
- /* Skip subsystems not present or not in this hierarchy */
- if (ss == NULL || ss->root != cgrp->root)
- continue;
-
- css = cgrp->subsys[ss->subsys_id];
- /*
- * When called from check_for_release() it's possible
- * that by this point the cgroup has been removed
- * and the css deleted. But a false-positive doesn't
- * matter, since it can only happen if the cgroup
- * has been deleted and hence no longer needs the
- * release agent to be called anyway.
- */
- if (css && css_refcnt(css) > 1)
- return 1;
- }
- return 0;
-}
-
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct dentry *d = cgrp->dentry;
struct cgroup *parent = cgrp->parent;
- DEFINE_WAIT(wait);
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
- LIST_HEAD(tmp_list);
lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
{
struct cgroup_subsys_state *css;
int i, ret;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
struct css_set *cg;
unsigned long key;
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
- hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
+ hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
/* skip entries that we already rehashed */
if (cg->subsys[ss->subsys_id])
continue;
cg->subsys[ss->subsys_id] = css;
/* recompute hash and restore entry */
key = css_set_hash(cg->subsys);
- hash_add(css_set_table, node, key);
+ hash_add(css_set_table, &cg->hlist, key);
}
write_unlock(&css_set_lock);
offline_css(ss, dummytop);
ss->active = 0;
- if (ss->use_id) {
- idr_remove_all(&ss->idr);
+ if (ss->use_id)
idr_destroy(&ss->idr);
- }
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
* and addition to css_set.
*/
if (need_forkexit_callback) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /*
+ * fork/exit callbacks are supported only for builtin
+ * subsystems, and the builtin section of the subsys
+ * array is immutable, so we don't need to lock the
+ * subsys array here. On the other hand, modular section
+ * of the array can be freed at module unload, so we
+ * can't touch that.
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- /*
- * fork/exit callbacks are supported only for
- * builtin subsystems and we don't need further
- * synchronization as they never go away.
- */
- if (!ss || ss->module)
- continue;
-
if (ss->fork)
ss->fork(child);
}
tsk->cgroups = &init_css_set;
if (run_callbacks && need_forkexit_callback) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /*
+ * fork/exit callbacks are supported only for builtin
+ * subsystems, see cgroup_post_fork() for details.
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- /* modular subsystems can't use callbacks */
- if (!ss || ss->module)
- continue;
-
if (ss->exit) {
struct cgroup *old_cgrp =
rcu_dereference_raw(cg->subsys[i])->cgroup;
put_css_set_taskexit(cg);
}
-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
- * @cgrp: the cgroup in question
- * @task: the task in question
- *
- * See if @cgrp is a descendant of @task's cgroup in the appropriate
- * hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
-{
- int ret;
- struct cgroup *target;
-
- if (cgrp == dummytop)
- return 1;
-
- target = task_cgroup_from_root(task, cgrp->root);
- while (cgrp != target && cgrp!= cgrp->top_cgroup)
- cgrp = cgrp->parent;
- ret = (cgrp == target);
- return ret;
-}
-
static void check_for_release(struct cgroup *cgrp)
{
/* All of these checks rely on RCU to keep the cgroup
* structure alive */
- if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
- && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
- /* Control Group is currently removeable. If it's not
+ if (cgroup_is_releasable(cgrp) &&
+ !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+ /*
+ * Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
- * it now */
+ * it now
+ */
int need_schedule_work = 0;
+
raw_spin_lock(&release_list_lock);
if (!cgroup_is_removed(cgrp) &&
list_empty(&cgrp->release_list)) {
/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css)
{
- struct cgroup *cgrp = css->cgroup;
int v;
- rcu_read_lock();
v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-
- switch (v) {
- case 1:
- if (notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
- break;
- case 0:
+ if (v == 0)
schedule_work(&css->dput_work);
- break;
- }
- rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(__css_put);
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
struct css_id *newid;
- int myid, error, size;
+ int ret, size;
BUG_ON(!ss->use_id);
newid = kzalloc(size, GFP_KERNEL);
if (!newid)
return ERR_PTR(-ENOMEM);
- /* get id */
- if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
- error = -ENOMEM;
- goto err_out;
- }
+
+ idr_preload(GFP_KERNEL);
spin_lock(&ss->id_lock);
/* Don't use 0. allocates an ID of 1-65535 */
- error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+ ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
spin_unlock(&ss->id_lock);
+ idr_preload_end();
/* Returns error when there are no free spaces for new ID.*/
- if (error) {
- error = -ENOSPC;
+ if (ret < 0)
goto err_out;
- }
- if (myid > CSS_ID_MAX)
- goto remove_idr;
- newid->id = myid;
+ newid->id = ret;
newid->depth = depth;
return newid;
-remove_idr:
- error = -ENOSPC;
- spin_lock(&ss->id_lock);
- idr_remove(&ss->idr, myid);
- spin_unlock(&ss->id_lock);
err_out:
kfree(newid);
- return ERR_PTR(error);
+ return ERR_PTR(ret);
}
struct inode *inode;
struct cgroup_subsys_state *css;
- inode = f->f_dentry->d_inode;
+ inode = file_inode(f);
/* check in cgroup filesystem dir */
if (inode->i_op != &cgroup_dir_inode_operations)
return ERR_PTR(-EBADF);