cgroup: Fix task counter common ancestor logic
author     Frederic Weisbecker <fweisbec@gmail.com>     Thu, 8 Dec 2011 04:42:38 +0000 (15:42 +1100)
committer  Stephen Rothwell <sfr@canb.auug.org.au>      Tue, 13 Dec 2011 06:43:28 +0000 (17:43 +1100)
The task counter subsystem has been written assuming that
can_attach_task/attach_task/cancel_attach_task calls are serialized per
task.  This is true when we attach only one task but not when we attach a
whole thread group, in which case the sequence is:

for each thread
    if (can_attach_task() < 0)
        goto rollback

for each thread
    attach_task()

rollback:
for each thread
    cancel_attach_task()

The common ancestor, looked up when task_counter_can_attach_task() runs and
cached in a static variable for the later calls, can thus change between
these calls for a given task.  This breaks if the tasks in the thread group
don't all originate from the same cgroup: the uncharge made in attach_task()
or the rollback made in cancel_attach_task() would then use the wrong common
ancestor and propagate erroneously.
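
As an illustration only, here is a minimal userspace model of that
interleaving; the toy res_counter hierarchy, the find_common_ancestor()
helper and the printed "uncharge" are hypothetical stand-ins, not the real
kernel API:

        /*
         * Minimal model of the broken flow: the common ancestor is computed
         * in the can_attach_task() pass for every thread but stored in one
         * shared variable, so the attach_task() pass for an earlier thread
         * reads whatever value the last thread left behind.
         */
        #include <stdio.h>

        struct res_counter { const char *name; struct res_counter *parent; };

        /* Toy hierarchy: root <- A, root <- B */
        static struct res_counter root = { "root", NULL };
        static struct res_counter a = { "A", &root };
        static struct res_counter b = { "B", &root };

        /* Models the file-scope static that this patch removes. */
        static struct res_counter *common_ancestor;

        static struct res_counter *find_common_ancestor(struct res_counter *x,
                                                        struct res_counter *y)
        {
                return (x == y) ? x : &root;  /* enough for this two-level hierarchy */
        }

        static void can_attach_task(struct res_counter *dest, struct res_counter *old)
        {
                common_ancestor = find_common_ancestor(dest, old); /* clobbers the previous value */
        }

        static void attach_task(const char *thread, struct res_counter *old)
        {
                /* The old code uncharged "old" up to whatever common_ancestor holds now. */
                printf("%s: uncharge %s up to %s\n",
                       thread, old->name, common_ancestor->name);
        }

        int main(void)
        {
                /* thread1 moves B -> A (ancestor root), thread2 moves A -> A (ancestor A). */
                const char *name[2]             = { "thread1", "thread2" };
                struct res_counter *old_cgrp[2] = { &b, &a };

                for (int i = 0; i < 2; i++)     /* first pass over the whole group */
                        can_attach_task(&a, old_cgrp[i]);
                for (int i = 0; i < 2; i++)     /* second pass over the whole group */
                        attach_task(name[i], old_cgrp[i]);
                return 0;
        }

This prints "thread1: uncharge B up to A": thread1 ends up using the
ancestor computed for thread2 instead of its own (root), which is the
erroneous propagation described above.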

This can even break seriously in some scenarios.  For example, with $PID
being the pid of a multithreaded process:

mkdir /dev/cgroup/cgroup0
echo $PID > /dev/cgroup/cgroup0/cgroup.procs
echo $PID > /dev/cgroup/tasks
echo $PID > /dev/cgroup/cgroup0/cgroup.procs

On the last move, attach_task() is called on the thread leader with
the wrong common_ancestor, leading to a crash because we uncharge
a res_counter that doesn't exist:

[  149.805063] BUG: unable to handle kernel NULL pointer dereference at 0000000000000040
[  149.806013] IP: [<ffffffff810a0172>] __lock_acquire+0x62/0x15d0
[  149.806013] PGD 51d38067 PUD 5119e067 PMD 0
[  149.806013] Oops: 0000 [#1] PREEMPT SMP
[  149.806013] Dumping ftrace buffer:
[  149.806013]    (ftrace buffer empty)
[  149.806013] CPU 3
[  149.806013] Modules linked in:
[  149.806013]
[  149.806013] Pid: 1111, comm: spread_thread_g Not tainted 3.1.0-rc3+ #165 FUJITSU SIEMENS AMD690VM-FMH/AMD690VM-FMH
[  149.806013] RIP: 0010:[<ffffffff810a0172>]  [<ffffffff810a0172>] __lock_acquire+0x62/0x15d0
[  149.806013] RSP: 0018:ffff880051479b38  EFLAGS: 00010046
[  149.806013] RAX: 0000000000000046 RBX: 0000000000000040 RCX: 0000000000000000
[  149.868002] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000040
[  149.868002] RBP: ffff880051479c08 R08: 0000000000000002 R09: 0000000000000001
[  149.868002] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002
[  149.868002] R13: 0000000000000000 R14: 0000000000000000 R15: ffff880051f120a0
[  149.868002] FS:  00007f1e01dd7700(0000) GS:ffff880057d80000(0000) knlGS:0000000000000000
[  149.868002] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  149.868002] CR2: 0000000000000040 CR3: 0000000051c59000 CR4: 00000000000006e0
[  149.868002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  149.868002] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  149.868002] Process spread_thread_g (pid: 1111, threadinfo ffff880051478000, task ffff880051f120a0)
[  149.868002] Stack:
[  149.868002]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  149.868002]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  149.868002]  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  149.868002] Call Trace:
[  149.868002]  [<ffffffff810a1d32>] lock_acquire+0xa2/0x1a0
[  149.868002]  [<ffffffff810c373c>] ? res_counter_uncharge_until+0x4c/0xb0
[  149.868002]  [<ffffffff8180802b>] _raw_spin_lock+0x3b/0x50
[  149.868002]  [<ffffffff810c373c>] ? res_counter_uncharge_until+0x4c/0xb0
[  149.868002]  [<ffffffff810c373c>] res_counter_uncharge_until+0x4c/0xb0
[  149.868002]  [<ffffffff810c26c4>] task_counter_attach_task+0x44/0x50
[  149.868002]  [<ffffffff810bffcd>] cgroup_attach_proc+0x5ad/0x720
[  149.868002]  [<ffffffff810bfa99>] ? cgroup_attach_proc+0x79/0x720
[  149.868002]  [<ffffffff810c01cf>] attach_task_by_pid+0x8f/0x220
[  149.868002]  [<ffffffff810c0230>] ? attach_task_by_pid+0xf0/0x220
[  149.868002]  [<ffffffff810c0230>] ? attach_task_by_pid+0xf0/0x220
[  149.868002]  [<ffffffff810c0388>] cgroup_procs_write+0x28/0x40
[  149.868002]  [<ffffffff810c0bd9>] cgroup_file_write+0x209/0x2f0
[  149.868002]  [<ffffffff812b8d08>] ? apparmor_file_permission+0x18/0x20
[  149.868002]  [<ffffffff8127ef43>] ? security_file_permission+0x23/0x90
[  149.868002]  [<ffffffff81157038>] vfs_write+0xc8/0x190
[  149.868002]  [<ffffffff811571f1>] sys_write+0x51/0x90
[  149.868002]  [<ffffffff818102c2>] system_call_fastpath+0x16/0x1b

To solve this, cache the original cgroup of each thread in the thread group
in the flex array, and pass it to can_attach_task(), attach_task() and
cancel_attach_task() so that the correct common ancestor between the old and
new cgroup can be safely retrieved for each task.
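
Against the same toy model as above, a sketch of the fixed flow: the old
cgroup is cached per task (mirroring the struct task_cgroup added below) and
the ancestor is recomputed inside each callback from the pair that is passed
in, so there is no shared state to go stale.  Again, these are hypothetical
stand-ins, not the actual kernel code, which is in the diff below:

        #include <stdio.h>

        struct res_counter { const char *name; struct res_counter *parent; };

        static struct res_counter root = { "root", NULL };
        static struct res_counter a = { "A", &root };
        static struct res_counter b = { "B", &root };

        struct task_cgroup_model {              /* stand-in for struct task_cgroup */
                const char *thread;
                struct res_counter *oldcgrp;    /* cached once, before any callback runs */
        };

        static struct res_counter *find_common_ancestor(struct res_counter *x,
                                                        struct res_counter *y)
        {
                return (x == y) ? x : &root;    /* enough for this two-level hierarchy */
        }

        static void attach_task(struct res_counter *dest, struct task_cgroup_model *tc)
        {
                /* Recomputed locally from the passed-in pair, as the patched callbacks do. */
                struct res_counter *ancestor = find_common_ancestor(dest, tc->oldcgrp);

                printf("%s: uncharge %s up to %s\n",
                       tc->thread, tc->oldcgrp->name, ancestor->name);
        }

        int main(void)
        {
                struct task_cgroup_model group[2] = {
                        { "thread1", &b },      /* moving B -> A: ancestor is root */
                        { "thread2", &a },      /* moving A -> A: ancestor is A    */
                };

                for (int i = 0; i < 2; i++)
                        attach_task(&a, &group[i]);
                return 0;
        }

With the same two threads as before, each uncharge now uses its own
ancestor, independent of the order in which the group is walked.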

This is inspired by a previous patch from Li Zefan:
"[PATCH] cgroups: don't cache common ancestor in task counter subsys".

Reported-by: Ben Blum <bblum@andrew.cmu.edu>
Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Menage <paul@paulmenage.org>
Cc: Tim Hockin <thockin@hockin.org>
Cc: Tejun Heo <htejun@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/cgroups/cgroups.txt
include/linux/cgroup.h
kernel/cgroup.c
kernel/cgroup_task_counter.c

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 3fa646f6c6d68a3b05f2fa5f2a62e415d6c671fb..7df0e5b6f4cda841b0a8d429bcf66417bd235f76 100644
@@ -623,7 +623,8 @@ function, so that the subsystem can implement a rollback. If not, not necessary.
 This will be called only about subsystems whose can_attach() operation have
 succeeded.
 
-void cancel_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+void cancel_attach_task(struct cgroup *cgrp, struct cgroup *old_cgrp,
+                       struct task_struct *tsk)
 (cgroup_mutex held by caller)
 
 As cancel_attach, but for operations that must be cancelled once per
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ddc13eb76b09eafe813388d35a28dd1bd14157f3..8dada1dbc73e5339887d7bd422b338fd9d2e1b71 100644
@@ -475,7 +475,7 @@ struct cgroup_subsys {
                               struct task_struct *tsk);
        void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
                              struct task_struct *tsk);
-       void (*cancel_attach_task)(struct cgroup *cgrp,
+       void (*cancel_attach_task)(struct cgroup *cgrp, struct cgroup *old_cgrp,
                                   struct task_struct *tsk);
        void (*pre_attach)(struct cgroup *cgrp);
        void (*attach_task)(struct cgroup *cgrp, struct cgroup *old_cgrp,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 893fc3d5c87604d738c9a5cbee461671212d4858..19a4faff23316166ee4bcde12bf71640eda0eae3 100644
@@ -1885,7 +1885,7 @@ out:
                                break;
 
                        if (ss->cancel_attach_task)
-                               ss->cancel_attach_task(cgrp, tsk);
+                               ss->cancel_attach_task(cgrp, oldcgrp, tsk);
                        if (ss->cancel_attach)
                                ss->cancel_attach(ss, cgrp, tsk);
                }
@@ -1983,6 +1983,11 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
        return 0;
 }
 
+struct task_cgroup {
+       struct task_struct *tsk;
+       struct cgroup *oldcgrp;
+};
+
 /**
  * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
  * @cgrp: the cgroup to attach to
@@ -2003,6 +2008,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
        /* threadgroup list cursor and array */
        struct task_struct *tsk;
        struct flex_array *group;
+       struct task_cgroup *tc;
        /*
         * we need to make sure we have css_sets for all the tasks we're
         * going to move -before- we actually start moving them, so that in
@@ -2020,7 +2026,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
         */
        group_size = get_nr_threads(leader);
        /* flex_array supports very large thread-groups better than kmalloc. */
-       group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+       group = flex_array_alloc(sizeof(struct task_cgroup), group_size,
                                 GFP_KERNEL);
        if (!group)
                return -ENOMEM;
@@ -2047,14 +2053,18 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
        tsk = leader;
        i = 0;
        do {
+               struct task_cgroup tsk_cgrp;
+
                /* as per above, nr_threads may decrease, but not increase. */
                BUG_ON(i >= group_size);
                get_task_struct(tsk);
+               tsk_cgrp.tsk = tsk;
+               tsk_cgrp.oldcgrp = task_cgroup_from_root(tsk, root);
                /*
                 * saying GFP_ATOMIC has no effect here because we did prealloc
                 * earlier, but it's good form to communicate our expectations.
                 */
-               retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+               retval = flex_array_put(group, i, &tsk_cgrp, GFP_ATOMIC);
                BUG_ON(retval != 0);
                i++;
        } while_each_thread(leader, tsk);
@@ -2077,14 +2087,13 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
                if (ss->can_attach_task) {
                        /* run on each task in the threadgroup. */
                        for (i = 0; i < group_size; i++) {
-                               tsk = flex_array_get_ptr(group, i);
-                               oldcgrp = task_cgroup_from_root(tsk, root);
-
+                               tc = flex_array_get(group, i);
                                retval = ss->can_attach_task(cgrp,
-                                                            oldcgrp, tsk);
+                                                            tc->oldcgrp,
+                                                            tc->tsk);
                                if (retval) {
                                        failed_ss = ss;
-                                       failed_task = tsk;
+                                       failed_task = tc->tsk;
                                        goto out_cancel_attach;
                                }
                        }
@@ -2097,10 +2106,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
         */
        INIT_LIST_HEAD(&newcg_list);
        for (i = 0; i < group_size; i++) {
-               tsk = flex_array_get_ptr(group, i);
+               tc = flex_array_get(group, i);
+               tsk = tc->tsk;
                /* nothing to do if this task is already in the cgroup */
-               oldcgrp = task_cgroup_from_root(tsk, root);
-               if (cgrp == oldcgrp)
+               if (cgrp == tc->oldcgrp)
                        continue;
                /* get old css_set pointer */
                task_lock(tsk);
@@ -2136,9 +2145,10 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
                        ss->pre_attach(cgrp);
        }
        for (i = 0; i < group_size; i++) {
-               tsk = flex_array_get_ptr(group, i);
+               tc = flex_array_get(group, i);
+               tsk = tc->tsk;
+               oldcgrp = tc->oldcgrp;
                /* leave current thread as it is if it's already there */
-               oldcgrp = task_cgroup_from_root(tsk, root);
                if (cgrp == oldcgrp)
                        continue;
                /* if the thread is PF_EXITING, it can just get skipped. */
@@ -2151,7 +2161,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
                        }
                } else if (retval == -ESRCH) {
                        if (ss->cancel_attach_task)
-                               ss->cancel_attach_task(cgrp, tsk);
+                               ss->cancel_attach_task(cgrp, oldcgrp, tsk);
                } else {
                        BUG_ON(1);
                }
@@ -2188,10 +2198,10 @@ out_cancel_attach:
                        if (ss->cancel_attach_task && (ss != failed_ss ||
                                                       failed_task)) {
                                for (i = 0; i < group_size; i++) {
-                                       tsk = flex_array_get_ptr(group, i);
-                                       if (tsk == failed_task)
+                                       tc = flex_array_get(group, i);
+                                       if (tc->tsk == failed_task)
                                                break;
-                                       ss->cancel_attach_task(cgrp, tsk);
+                                       ss->cancel_attach_task(cgrp, tc->oldcgrp, tc->tsk);
                                }
                        }
 
@@ -2206,8 +2216,8 @@ out_cancel_attach:
        }
        /* clean up the array of referenced threads in the group. */
        for (i = 0; i < group_size; i++) {
-               tsk = flex_array_get_ptr(group, i);
-               put_task_struct(tsk);
+               tc = flex_array_get(group, i);
+               put_task_struct(tc->tsk);
        }
 out_free_group_list:
        flex_array_free(group);
diff --git a/kernel/cgroup_task_counter.c b/kernel/cgroup_task_counter.c
index d3b2a8290fef57c3d9c8b2c5d2fe14e12a38c79b..c04340d0ef9861d71f25ab8fd4986118d6fa0922 100644
@@ -94,12 +94,6 @@ static void task_counter_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
                res_counter_uncharge(cgroup_task_res_counter(old_cgrp), 1);
 }
 
-/*
- * Protected amongst can_attach_task/attach_task/cancel_attach_task by
- * cgroup mutex
- */
-static struct res_counter *common_ancestor;
-
 /*
  * This does more than just probing the ability to attach to the dest cgroup.
  * We can not just _check_ if we can attach to the destination and do the real
@@ -111,9 +105,10 @@ static int task_counter_can_attach_task(struct cgroup *cgrp,
                                        struct cgroup *old_cgrp,
                                        struct task_struct *tsk)
 {
+       int err;
+       struct res_counter *common_ancestor;
        struct res_counter *res = cgroup_task_res_counter(cgrp);
        struct res_counter *old_res = cgroup_task_res_counter(old_cgrp);
-       int err;
 
        /*
         * When moving a task from a cgroup to another, we don't want
@@ -138,10 +133,15 @@ static int task_counter_can_attach_task(struct cgroup *cgrp,
 
 /* Uncharge the dest cgroup that we charged in task_counter_can_attach_task() */
 static void task_counter_cancel_attach_task(struct cgroup *cgrp,
+                                           struct cgroup *old_cgrp,
                                            struct task_struct *tsk)
 {
-       res_counter_uncharge_until(cgroup_task_res_counter(cgrp),
-                                  common_ancestor, 1);
+       struct res_counter *common_ancestor;
+       struct res_counter *res = cgroup_task_res_counter(cgrp);
+       struct res_counter *old_res = cgroup_task_res_counter(old_cgrp);
+
+       common_ancestor = res_counter_common_ancestor(res, old_res);
+       res_counter_uncharge_until(res, common_ancestor, 1);
 }
 
 /*
@@ -155,8 +155,12 @@ static void task_counter_attach_task(struct cgroup *cgrp,
                                     struct cgroup *old_cgrp,
                                     struct task_struct *tsk)
 {
-       res_counter_uncharge_until(cgroup_task_res_counter(old_cgrp),
-                                  common_ancestor, 1);
+       struct res_counter *common_ancestor;
+       struct res_counter *res = cgroup_task_res_counter(cgrp);
+       struct res_counter *old_res = cgroup_task_res_counter(old_cgrp);
+
+       common_ancestor = res_counter_common_ancestor(res, old_res);
+       res_counter_uncharge_until(old_res, common_ancestor, 1);
 }
 
 static u64 task_counter_read_u64(struct cgroup *cgrp, struct cftype *cft)