cgroup: introduce cgroup_tree_mutex

author Tejun Heo <tj@kernel.org>

Tue, 11 Feb 2014 16:52:47 +0000 (11:52 -0500)

committer Tejun Heo <tj@kernel.org>

Tue, 11 Feb 2014 16:52:47 +0000 (11:52 -0500)
author Tejun Heo <tj@kernel.org>
Tue, 11 Feb 2014 16:52:47 +0000 (11:52 -0500)
committer Tejun Heo <tj@kernel.org>
Tue, 11 Feb 2014 16:52:47 +0000 (11:52 -0500)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index fc2db071d95e423a3aa8f9f010c1713a8d01c25a..cb20d12cb096a736a8c1aa12af729e7b37897ea3 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -67,6 +67,15 @@
   */
  #define CGROUP_PIDLIST_DESTROY_DELAY   HZ
  
+/*
+ * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
+ * creation/removal and hierarchy changing operations including cgroup
+ * creation, removal, css association and controller rebinding.  This outer
+ * lock is needed mainly to resolve the circular dependency between kernfs
+ * active ref and cgroup_mutex.  cgroup_tree_mutex nests above both.
+ */
+static DEFINE_MUTEX(cgroup_tree_mutex);
+
  /*
   * cgroup_mutex is the master lock.  Any modification to cgroup or its
   * hierarchy must be performed while holding it.
@@ -84,10 +93,11 @@ static DEFINE_MUTEX(cgroup_mutex);
   */
  static DEFINE_SPINLOCK(release_agent_path_lock);
  
-#define cgroup_assert_mutex_or_rcu_locked()                            \
+#define cgroup_assert_mutexes_or_rcu_locked()                          \
         rcu_lockdep_assert(rcu_read_lock_held() ||                      \
+                          lockdep_is_held(&cgroup_tree_mutex) ||       \
                            lockdep_is_held(&cgroup_mutex),              \
-                          "cgroup_mutex or RCU read lock required");
+                          "cgroup_[tree_]mutex or RCU read lock required");
  
  /*
   * cgroup destruction makes heavy use of work items and there can be a lot
@@ -179,7 +189,8 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
  {
         if (ss)
                 return rcu_dereference_check(cgrp->subsys[ss->id],
-                                            lockdep_is_held(&cgroup_mutex));
+                                       lockdep_is_held(&cgroup_tree_mutex) ||
+                                       lockdep_is_held(&cgroup_mutex));
         else
                 return &cgrp->dummy_css;
  }
@@ -235,6 +246,7 @@ static int notify_on_release(const struct cgroup *cgrp)
         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
                 if (!((css) = rcu_dereference_check(                    \
                                 (cgrp)->subsys[(ssid)],                 \
+                               lockdep_is_held(&cgroup_tree_mutex) ||  \
                                 lockdep_is_held(&cgroup_mutex)))) { }   \
                 else
  
@@ -883,7 +895,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
         struct cfent *cfe;
  
         lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
-       lockdep_assert_held(&cgroup_mutex);
+       lockdep_assert_held(&cgroup_tree_mutex);
  
         /*
          * If we're doing cleanup due to failure of cgroup_create(),
@@ -948,7 +960,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
         struct cgroup_subsys *ss;
         int i, ret;
  
-       BUG_ON(!mutex_is_locked(&cgroup_mutex));
+       lockdep_assert_held(&cgroup_tree_mutex);
+       lockdep_assert_held(&cgroup_mutex);
  
         /* Check that any added subsystems are currently free */
         for_each_subsys(ss, i)
@@ -1220,6 +1233,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         }
  
         mutex_lock(&cgrp->dentry->d_inode->i_mutex);
+       mutex_lock(&cgroup_tree_mutex);
         mutex_lock(&cgroup_mutex);
  
         /* See what subsystems are wanted */
@@ -1263,6 +1277,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         kfree(opts.release_agent);
         kfree(opts.name);
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
         mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
         return ret;
  }
@@ -1494,6 +1509,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 inode = sb->s_root->d_inode;
  
                 mutex_lock(&inode->i_mutex);
+               mutex_lock(&cgroup_tree_mutex);
                 mutex_lock(&cgroup_mutex);
  
                 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
@@ -1568,6 +1584,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 BUG_ON(root->number_of_cgroups != 1);
  
                 mutex_unlock(&cgroup_mutex);
+               mutex_unlock(&cgroup_tree_mutex);
                 mutex_unlock(&inode->i_mutex);
         } else {
                 /*
@@ -1598,6 +1615,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
   unlock_drop:
         cgroup_exit_root_id(root);
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
         mutex_unlock(&inode->i_mutex);
   drop_new_super:
         deactivate_locked_super(sb);
@@ -1620,6 +1638,7 @@ static void cgroup_kill_sb(struct super_block *sb)
         BUG_ON(!list_empty(&cgrp->children));
  
         mutex_lock(&cgrp->dentry->d_inode->i_mutex);
+       mutex_lock(&cgroup_tree_mutex);
         mutex_lock(&cgroup_mutex);
  
         /* Rebind all subsystems back to the default hierarchy */
@@ -1650,6 +1669,7 @@ static void cgroup_kill_sb(struct super_block *sb)
         cgroup_exit_root_id(root);
  
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
         mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
  
         simple_xattrs_free(&cgrp->xattrs);
@@ -2625,7 +2645,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
         int ret;
  
         lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
-       lockdep_assert_held(&cgroup_mutex);
+       lockdep_assert_held(&cgroup_tree_mutex);
  
         for (cft = cfts; cft->name[0] != '\0'; cft++) {
                 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2659,6 +2679,7 @@ static void cgroup_cfts_prepare(void)
          * Instead, we use css_for_each_descendant_pre() and drop RCU read
          * lock before calling cgroup_addrm_files().
          */
+       mutex_lock(&cgroup_tree_mutex);
         mutex_lock(&cgroup_mutex);
  }
  
@@ -2679,6 +2700,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
         if (!cfts || ss->root == &cgroup_dummy_root ||
             !atomic_inc_not_zero(&sb->s_active)) {
                 mutex_unlock(&cgroup_mutex);
+               mutex_unlock(&cgroup_tree_mutex);
                 return 0;
         }
  
@@ -2702,7 +2724,9 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
                 prev = cgrp->dentry;
  
                 mutex_unlock(&cgroup_mutex);
+               mutex_unlock(&cgroup_tree_mutex);
                 mutex_lock(&inode->i_mutex);
+               mutex_lock(&cgroup_tree_mutex);
                 mutex_lock(&cgroup_mutex);
                 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
                         ret = cgroup_addrm_files(cgrp, cfts, is_add);
@@ -2711,6 +2735,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
                         break;
         }
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
         dput(prev);
         deactivate_super(sb);
         return ret;
@@ -2856,7 +2881,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
         struct cgroup *cgrp = parent_css->cgroup;
         struct cgroup *next;
  
-       cgroup_assert_mutex_or_rcu_locked();
+       cgroup_assert_mutexes_or_rcu_locked();
  
         /*
          * @pos could already have been removed.  Once a cgroup is removed,
@@ -2914,7 +2939,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
  {
         struct cgroup_subsys_state *next;
  
-       cgroup_assert_mutex_or_rcu_locked();
+       cgroup_assert_mutexes_or_rcu_locked();
  
         /* if first iteration, visit @root */
         if (!pos)
@@ -2955,7 +2980,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
  {
         struct cgroup_subsys_state *last, *tmp;
  
-       cgroup_assert_mutex_or_rcu_locked();
+       cgroup_assert_mutexes_or_rcu_locked();
  
         do {
                 last = pos;
@@ -3003,7 +3028,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
  {
         struct cgroup_subsys_state *next;
  
-       cgroup_assert_mutex_or_rcu_locked();
+       cgroup_assert_mutexes_or_rcu_locked();
  
         /* if first iteration, visit leftmost descendant which may be @root */
         if (!pos)
@@ -3977,6 +4002,7 @@ static int online_css(struct cgroup_subsys_state *css)
         struct cgroup_subsys *ss = css->ss;
         int ret = 0;
  
+       lockdep_assert_held(&cgroup_tree_mutex);
         lockdep_assert_held(&cgroup_mutex);
  
         if (ss->css_online)
@@ -3994,6 +4020,7 @@ static void offline_css(struct cgroup_subsys_state *css)
  {
         struct cgroup_subsys *ss = css->ss;
  
+       lockdep_assert_held(&cgroup_tree_mutex);
         lockdep_assert_held(&cgroup_mutex);
  
         if (!(css->flags & CSS_ONLINE))
@@ -4093,6 +4120,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         }
         rcu_assign_pointer(cgrp->name, name);
  
+       mutex_lock(&cgroup_tree_mutex);
+
         /*
          * Only live parents can have children.  Note that the liveliness
          * check isn't strictly necessary because cgroup_mkdir() and
@@ -4102,7 +4131,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
          */
         if (!cgroup_lock_live_group(parent)) {
                 err = -ENODEV;
-               goto err_free_name;
+               goto err_unlock_tree;
         }
  
         /*
@@ -4176,6 +4205,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
         }
  
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
         mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
  
         return 0;
@@ -4186,7 +4216,8 @@ err_free_id:
         deactivate_super(sb);
  err_unlock:
         mutex_unlock(&cgroup_mutex);
-err_free_name:
+err_unlock_tree:
+       mutex_unlock(&cgroup_tree_mutex);
         kfree(rcu_dereference_raw(cgrp->name));
  err_free_cgrp:
         kfree(cgrp);
@@ -4195,6 +4226,7 @@ err_free_cgrp:
  err_destroy:
         cgroup_destroy_locked(cgrp);
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
         mutex_unlock(&dentry->d_inode->i_mutex);
         return err;
  }
@@ -4217,6 +4249,7 @@ static void css_killed_work_fn(struct work_struct *work)
                 container_of(work, struct cgroup_subsys_state, destroy_work);
         struct cgroup *cgrp = css->cgroup;
  
+       mutex_lock(&cgroup_tree_mutex);
         mutex_lock(&cgroup_mutex);
  
         /*
@@ -4234,6 +4267,7 @@ static void css_killed_work_fn(struct work_struct *work)
                 cgroup_destroy_css_killed(cgrp);
  
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
  
         /*
          * Put the css refs from kill_css().  Each css holds an extra
@@ -4321,6 +4355,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         int ssid;
  
         lockdep_assert_held(&d->d_inode->i_mutex);
+       lockdep_assert_held(&cgroup_tree_mutex);
         lockdep_assert_held(&cgroup_mutex);
  
         /*
@@ -4407,6 +4442,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp)
         struct cgroup *parent = cgrp->parent;
         struct dentry *d = cgrp->dentry;
  
+       lockdep_assert_held(&cgroup_tree_mutex);
         lockdep_assert_held(&cgroup_mutex);
  
         /* delete this cgroup from parent->children */
@@ -4422,9 +4458,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
         int ret;
  
+       mutex_lock(&cgroup_tree_mutex);
         mutex_lock(&cgroup_mutex);
         ret = cgroup_destroy_locked(dentry->d_fsdata);
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
  
         return ret;
  }
@@ -4454,6 +4492,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  
         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
  
+       mutex_lock(&cgroup_tree_mutex);
         mutex_lock(&cgroup_mutex);
  
         /* init base cftset */
@@ -4482,6 +4521,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         BUG_ON(online_css(css));
  
         mutex_unlock(&cgroup_mutex);
+       mutex_unlock(&cgroup_tree_mutex);
  }
  
  /**
@@ -5021,7 +5061,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
  {
         struct cgroup *cgrp;
  
-       cgroup_assert_mutex_or_rcu_locked();
+       cgroup_assert_mutexes_or_rcu_locked();
  
         cgrp = idr_find(&ss->root->cgroup_idr, id);
         if (cgrp)
author	Tejun Heo <tj@kernel.org>
	Tue, 11 Feb 2014 16:52:47 +0000 (11:52 -0500)
committer	Tejun Heo <tj@kernel.org>
	Tue, 11 Feb 2014 16:52:47 +0000 (11:52 -0500)