cgroup: fix an off-by-one bug which may trigger BUG_ON()

[karo-tx-linux.git] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index b5c64327e7127491e93507052ae5a600931ce158..54689fc008f654eb45034773b13949a91a2af126 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -238,6 +238,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
  /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
  #define dummytop (&rootnode.top_cgroup)
  
+static struct cgroup_name root_cgroup_name = { .name = "/" };
+
  /* This flag indicates whether tasks in the fork and exit paths should
   * check for fork/exit handlers to call. This avoids us having to do
   * extra work in the fork/exit path if none of the subsystems need to
@@ -554,7 +556,6 @@ static struct css_set *find_existing_css_set(
  {
         int i;
         struct cgroupfs_root *root = cgrp->root;
-       struct hlist_node *node;
         struct css_set *cg;
         unsigned long key;
  
@@ -577,7 +578,7 @@ static struct css_set *find_existing_css_set(
         }
  
         key = css_set_hash(template);
-       hash_for_each_possible(css_set_table, cg, node, hlist, key) {
+       hash_for_each_possible(css_set_table, cg, hlist, key) {
                 if (!compare_css_sets(cg, oldcg, cgrp, template))
                         continue;
  
@@ -860,6 +861,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
         return inode;
  }
  
+static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+{
+       struct cgroup_name *name;
+
+       name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
+       if (!name)
+               return NULL;
+       strcpy(name->name, dentry->d_name.name);
+       return name;
+}
+
  static void cgroup_free_fn(struct work_struct *work)
  {
         struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -890,6 +902,7 @@ static void cgroup_free_fn(struct work_struct *work)
         simple_xattrs_free(&cgrp->xattrs);
  
         ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+       kfree(rcu_dereference_raw(cgrp->name));
         kfree(cgrp);
  }
  
@@ -1422,6 +1435,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
         INIT_LIST_HEAD(&root->allcg_list);
         root->number_of_cgroups = 1;
         cgrp->root = root;
+       cgrp->name = &root_cgroup_name;
         cgrp->top_cgroup = cgrp;
         init_cgroup_housekeeping(cgrp);
         list_add_tail(&cgrp->allcg_node, &root->allcg_list);
@@ -1611,7 +1625,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 struct cgroupfs_root *existing_root;
                 const struct cred *cred;
                 int i;
-               struct hlist_node *node;
                 struct css_set *cg;
  
                 BUG_ON(sb->s_root != NULL);
@@ -1666,7 +1679,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 /* Link the top cgroup in this hierarchy into all
                  * the css_set objects */
                 write_lock(&css_set_lock);
-               hash_for_each(css_set_table, i, node, cg, hlist)
+               hash_for_each(css_set_table, i, cg, hlist)
                         link_css_set(&tmp_cg_links, cg, root_cgrp);
                 write_unlock(&css_set_lock);
  
@@ -1771,49 +1784,45 @@ static struct kobject *cgroup_kobj;
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference.  Writes path of cgroup into buf.  Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf.  Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->d_name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
   */
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  {
-       struct dentry *dentry = cgrp->dentry;
+       int ret = -ENAMETOOLONG;
         char *start;
  
-       rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
-                          "cgroup_path() called without proper locking");
-
-       if (cgrp == dummytop) {
-               /*
-                * Inactive subsystems have no dentry for their root
-                * cgroup
-                */
-               strcpy(buf, "/");
-               return 0;
-       }
-
         start = buf + buflen - 1;
-
         *start = '\0';
-       for (;;) {
-               int len = dentry->d_name.len;
  
+       rcu_read_lock();
+       while (cgrp) {
+               const char *name = cgroup_name(cgrp);
+               int len;
+
+               len = strlen(name);
                 if ((start -= len) < buf)
-                       return -ENAMETOOLONG;
-               memcpy(start, dentry->d_name.name, len);
-               cgrp = cgrp->parent;
-               if (!cgrp)
-                       break;
+                       goto out;
+               memcpy(start, name, len);
  
-               dentry = cgrp->dentry;
                 if (!cgrp->parent)
-                       continue;
+                       break;
+
                 if (--start < buf)
-                       return -ENAMETOOLONG;
+                       goto out;
                 *start = '/';
+
+               cgrp = cgrp->parent;
         }
+       ret = 0;
         memmove(buf, start, buf + buflen - start);
-       return 0;
+out:
+       rcu_read_unlock();
+       return ret;
  }
  EXPORT_SYMBOL_GPL(cgroup_path);
  
@@ -2067,7 +2076,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
         if (!group)
                 return -ENOMEM;
         /* pre-allocate to guarantee space while iterating in rcu read-side. */
-       retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+       retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
         if (retval)
                 goto out_free_group_list;
  
@@ -2539,13 +2548,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
  static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
  {
+       int ret;
+       struct cgroup_name *name, *old_name;
+       struct cgroup *cgrp;
+
+       /*
+        * It's convenient to use the parent dir's i_mutex to protect
+        * cgrp->name.
+        */
+       lockdep_assert_held(&old_dir->i_mutex);
+
         if (!S_ISDIR(old_dentry->d_inode->i_mode))
                 return -ENOTDIR;
         if (new_dentry->d_inode)
                 return -EEXIST;
         if (old_dir != new_dir)
                 return -EIO;
-       return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+       cgrp = __d_cgrp(old_dentry);
+
+       name = cgroup_alloc_name(new_dentry);
+       if (!name)
+               return -ENOMEM;
+
+       ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+       if (ret) {
+               kfree(name);
+               return ret;
+       }
+
+       old_name = cgrp->name;
+       rcu_assign_pointer(cgrp->name, name);
+
+       kfree_rcu(old_name, rcu_head);
+       return 0;
  }
  
  static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -2645,7 +2681,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
   */
  static inline struct cftype *__file_cft(struct file *file)
  {
-       if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+       if (file_inode(file)->i_fop != &cgroup_file_operations)
                 return ERR_PTR(-EINVAL);
         return __d_cft(file->f_dentry);
  }
@@ -3364,35 +3400,14 @@ static void pidlist_free(void *p)
         else
                 kfree(p);
  }
-static void *pidlist_resize(void *p, int newcount)
-{
-       void *newlist;
-       /* note: if new alloc fails, old p will still be valid either way */
-       if (is_vmalloc_addr(p)) {
-               newlist = vmalloc(newcount * sizeof(pid_t));
-               if (!newlist)
-                       return NULL;
-               memcpy(newlist, p, newcount * sizeof(pid_t));
-               vfree(p);
-       } else {
-               newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
-       }
-       return newlist;
-}
  
  /*
   * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * If the new stripped list is sufficiently smaller and there's enough memory
- * to allocate a new buffer, will let go of the unneeded memory. Returns the
- * number of unique elements.
+ * Returns the number of unique elements.
   */
-/* is the size difference enough that we should re-allocate the array? */
-#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
-static int pidlist_uniq(pid_t **p, int length)
+static int pidlist_uniq(pid_t *list, int length)
  {
         int src, dest = 1;
-       pid_t *list = *p;
-       pid_t *newlist;
  
         /*
          * we presume the 0th element is unique, so i starts at 1. trivial
@@ -3413,16 +3428,6 @@ static int pidlist_uniq(pid_t **p, int length)
                 dest++;
         }
  after:
-       /*
-        * if the length difference is large enough, we want to allocate a
-        * smaller buffer to save memory. if this fails due to out of memory,
-        * we'll just stay with what we've got.
-        */
-       if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
-               newlist = pidlist_resize(list, dest);
-               if (newlist)
-                       *p = newlist;
-       }
         return dest;
  }
  
@@ -3518,7 +3523,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
         /* now sort & (if procs) strip out duplicates */
         sort(array, length, sizeof(pid_t), cmppid, NULL);
         if (type == CGROUP_FILE_PROCS)
-               length = pidlist_uniq(&array, length);
+               length = pidlist_uniq(array, length);
         l = cgroup_pidlist_find(cgrp, type);
         if (!l) {
                 pidlist_free(array);
@@ -3902,7 +3907,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
  
         /* the process need read permission on control file */
         /* AV: shouldn't we check that it's been opened for read instead? */
-       ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
+       ret = inode_permission(file_inode(cfile), MAY_READ);
         if (ret < 0)
                 goto fail;
  
@@ -3932,12 +3937,6 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
         if (ret)
                 goto fail;
  
-       if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
-               event->cft->unregister_event(cgrp, event->cft, event->eventfd);
-               ret = 0;
-               goto fail;
-       }
-
         /*
          * Events should be removed after rmdir of cgroup directory, but before
          * destroying subsystem state objects. Let's take reference to cgroup
@@ -4133,17 +4132,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
         if (!(css->flags & CSS_ONLINE))
                 return;
  
-       /*
-        * css_offline() should be called with cgroup_mutex unlocked.  See
-        * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
-        * details.  This temporary unlocking should go away once
-        * cgroup_mutex is unexported from controllers.
-        */
-       if (ss->css_offline) {
-               mutex_unlock(&cgroup_mutex);
+       if (ss->css_offline)
                 ss->css_offline(cgrp);
-               mutex_lock(&cgroup_mutex);
-       }
  
         cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
  }
@@ -4160,6 +4150,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                              umode_t mode)
  {
         struct cgroup *cgrp;
+       struct cgroup_name *name;
         struct cgroupfs_root *root = parent->root;
         int err = 0;
         struct cgroup_subsys *ss;
@@ -4170,9 +4161,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         if (!cgrp)
                 return -ENOMEM;
  
+       name = cgroup_alloc_name(dentry);
+       if (!name)
+               goto err_free_cgrp;
+       rcu_assign_pointer(cgrp->name, name);
+
         cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
         if (cgrp->id < 0)
-               goto err_free_cgrp;
+               goto err_free_name;
  
         /*
          * Only live parents can have children.  Note that the liveliness
@@ -4278,6 +4274,8 @@ err_free_all:
         deactivate_super(sb);
  err_free_id:
         ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_name:
+       kfree(rcu_dereference_raw(cgrp->name));
  err_free_cgrp:
         kfree(cgrp);
         return err;
@@ -4297,56 +4295,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
         return cgroup_create(c_parent, dentry, mode | S_IFDIR);
  }
  
-/*
- * Check the reference count on each subsystem. Since we already
- * established that there are no tasks in the cgroup, if the css refcount
- * is also 1, then there should be no outstanding references, so the
- * subsystem is safe to destroy. We scan across all subsystems rather than
- * using the per-hierarchy linked list of mounted subsystems since we can
- * be called via check_for_release() with no synchronization other than
- * RCU, and the subsystem linked list isn't RCU-safe.
- */
-static int cgroup_has_css_refs(struct cgroup *cgrp)
-{
-       int i;
-
-       /*
-        * We won't need to lock the subsys array, because the subsystems
-        * we're concerned about aren't going anywhere since our cgroup root
-        * has a reference on them.
-        */
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup_subsys *ss = subsys[i];
-               struct cgroup_subsys_state *css;
-
-               /* Skip subsystems not present or not in this hierarchy */
-               if (ss == NULL || ss->root != cgrp->root)
-                       continue;
-
-               css = cgrp->subsys[ss->subsys_id];
-               /*
-                * When called from check_for_release() it's possible
-                * that by this point the cgroup has been removed
-                * and the css deleted. But a false-positive doesn't
-                * matter, since it can only happen if the cgroup
-                * has been deleted and hence no longer needs the
-                * release agent to be called anyway.
-                */
-               if (css && css_refcnt(css) > 1)
-                       return 1;
-       }
-       return 0;
-}
-
  static int cgroup_destroy_locked(struct cgroup *cgrp)
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
         struct dentry *d = cgrp->dentry;
         struct cgroup *parent = cgrp->parent;
-       DEFINE_WAIT(wait);
         struct cgroup_event *event, *tmp;
         struct cgroup_subsys *ss;
-       LIST_HEAD(tmp_list);
  
         lockdep_assert_held(&d->d_inode->i_mutex);
         lockdep_assert_held(&cgroup_mutex);
@@ -4493,7 +4448,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
  {
         struct cgroup_subsys_state *css;
         int i, ret;
-       struct hlist_node *node, *tmp;
+       struct hlist_node *tmp;
         struct css_set *cg;
         unsigned long key;
  
@@ -4561,7 +4516,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
          * this is all done under the css_set_lock.
          */
         write_lock(&css_set_lock);
-       hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
+       hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
                 /* skip entries that we already rehashed */
                 if (cg->subsys[ss->subsys_id])
                         continue;
@@ -4571,7 +4526,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
                 cg->subsys[ss->subsys_id] = css;
                 /* recompute hash and restore entry */
                 key = css_set_hash(cg->subsys);
-               hash_add(css_set_table, node, key);
+               hash_add(css_set_table, &cg->hlist, key);
         }
         write_unlock(&css_set_lock);
  
@@ -4618,10 +4573,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
         offline_css(ss, dummytop);
         ss->active = 0;
  
-       if (ss->use_id) {
-               idr_remove_all(&ss->idr);
+       if (ss->use_id)
                 idr_destroy(&ss->idr);
-       }
  
         /* deassign the subsys_id */
         subsys[ss->subsys_id] = NULL;
@@ -4939,17 +4892,17 @@ void cgroup_post_fork(struct task_struct *child)
          * and addition to css_set.
          */
         if (need_forkexit_callback) {
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               /*
+                * fork/exit callbacks are supported only for builtin
+                * subsystems, and the builtin section of the subsys
+                * array is immutable, so we don't need to lock the
+                * subsys array here. On the other hand, modular section
+                * of the array can be freed at module unload, so we
+                * can't touch that.
+                */
+               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                         struct cgroup_subsys *ss = subsys[i];
  
-                       /*
-                        * fork/exit callbacks are supported only for
-                        * builtin subsystems and we don't need further
-                        * synchronization as they never go away.
-                        */
-                       if (!ss || ss->module)
-                               continue;
-
                         if (ss->fork)
                                 ss->fork(child);
                 }
@@ -5014,13 +4967,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
         tsk->cgroups = &init_css_set;
  
         if (run_callbacks && need_forkexit_callback) {
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               /*
+                * fork/exit callbacks are supported only for builtin
+                * subsystems, see cgroup_post_fork() for details.
+                */
+               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                         struct cgroup_subsys *ss = subsys[i];
  
-                       /* modular subsystems can't use callbacks */
-                       if (!ss || ss->module)
-                               continue;
-
                         if (ss->exit) {
                                 struct cgroup *old_cgrp =
                                         rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5034,44 +4987,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
         put_css_set_taskexit(cg);
  }
  
-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
- * @cgrp: the cgroup in question
- * @task: the task in question
- *
- * See if @cgrp is a descendant of @task's cgroup in the appropriate
- * hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
-{
-       int ret;
-       struct cgroup *target;
-
-       if (cgrp == dummytop)
-               return 1;
-
-       target = task_cgroup_from_root(task, cgrp->root);
-       while (cgrp != target && cgrp!= cgrp->top_cgroup)
-               cgrp = cgrp->parent;
-       ret = (cgrp == target);
-       return ret;
-}
-
  static void check_for_release(struct cgroup *cgrp)
  {
         /* All of these checks rely on RCU to keep the cgroup
          * structure alive */
-       if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
-           && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
-               /* Control Group is currently removeable. If it's not
+       if (cgroup_is_releasable(cgrp) &&
+           !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+               /*
+                * Control Group is currently removeable. If it's not
                  * already queued for a userspace notification, queue
-                * it now */
+                * it now
+                */
                 int need_schedule_work = 0;
+
                 raw_spin_lock(&release_list_lock);
                 if (!cgroup_is_removed(cgrp) &&
                     list_empty(&cgrp->release_list)) {
@@ -5104,24 +5032,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
  /* Caller must verify that the css is not for root cgroup */
  void __css_put(struct cgroup_subsys_state *css)
  {
-       struct cgroup *cgrp = css->cgroup;
         int v;
  
-       rcu_read_lock();
         v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-
-       switch (v) {
-       case 1:
-               if (notify_on_release(cgrp)) {
-                       set_bit(CGRP_RELEASABLE, &cgrp->flags);
-                       check_for_release(cgrp);
-               }
-               break;
-       case 0:
+       if (v == 0)
                 schedule_work(&css->dput_work);
-               break;
-       }
-       rcu_read_unlock();
  }
  EXPORT_SYMBOL_GPL(__css_put);
  
@@ -5322,7 +5237,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
  static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
  {
         struct css_id *newid;
-       int myid, error, size;
+       int ret, size;
  
         BUG_ON(!ss->use_id);
  
@@ -5330,35 +5245,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
         newid = kzalloc(size, GFP_KERNEL);
         if (!newid)
                 return ERR_PTR(-ENOMEM);
-       /* get id */
-       if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
-               error = -ENOMEM;
-               goto err_out;
-       }
+
+       idr_preload(GFP_KERNEL);
         spin_lock(&ss->id_lock);
         /* Don't use 0. allocates an ID of 1-65535 */
-       error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+       ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
         spin_unlock(&ss->id_lock);
+       idr_preload_end();
  
         /* Returns error when there are no free spaces for new ID.*/
-       if (error) {
-               error = -ENOSPC;
+       if (ret < 0)
                 goto err_out;
-       }
-       if (myid > CSS_ID_MAX)
-               goto remove_idr;
  
-       newid->id = myid;
+       newid->id = ret;
         newid->depth = depth;
         return newid;
-remove_idr:
-       error = -ENOSPC;
-       spin_lock(&ss->id_lock);
-       idr_remove(&ss->idr, myid);
-       spin_unlock(&ss->id_lock);
  err_out:
         kfree(newid);
-       return ERR_PTR(error);
+       return ERR_PTR(ret);
  
  }
  
@@ -5489,7 +5393,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
         struct inode *inode;
         struct cgroup_subsys_state *css;
  
-       inode = f->f_dentry->d_inode;
+       inode = file_inode(f);
         /* check in cgroup filesystem dir */
         if (inode->i_op != &cgroup_dir_inode_operations)
                 return ERR_PTR(-EBADF);