rcu: Add call_rcu_tasks()
author    Paul E. McKenney <paulmck@linux.vnet.ibm.com>
          Fri, 27 Jun 2014 20:42:20 +0000 (13:42 -0700)
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>
          Sun, 7 Sep 2014 23:27:19 +0000 (16:27 -0700)
This commit adds a new RCU-tasks flavor of RCU, which provides
call_rcu_tasks().  This RCU flavor's quiescent states are voluntary
context switch (not preemption!) and userspace execution (not the idle
loop -- use some sort of schedule_on_each_cpu() if you need to handle the
idle tasks).  Note that unlike other RCU flavors, these quiescent states
occur in tasks, not necessarily CPUs.  Includes fixes from Steven Rostedt.

This RCU flavor is assumed to have very infrequent latency-tolerant
updaters.  This assumption permits significant simplifications, including
a single global callback list protected by a single global lock, along
with a single linked list, private to the RCU-tasks kthread, containing
all tasks that have not yet passed through a quiescent state.  If
experience shows this assumption to be incorrect, the required additional
complexity will be added.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
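
As a usage illustration (not part of this commit), a subsystem that publishes
executable trampoline-like objects which arbitrary tasks might still be
running could use call_rcu_tasks() to defer the final free until every task
has voluntarily blocked, gone idle, or executed in userspace.  The structure
and function names below are hypothetical; a minimal sketch assuming
CONFIG_TASKS_RCU=y:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Hypothetical object whose code page a task might still be executing. */
struct my_trampoline {
        void *code_page;
        struct rcu_head rh;
};

static void my_trampoline_free_cb(struct rcu_head *rhp)
{
        struct my_trampoline *tr = container_of(rhp, struct my_trampoline, rh);

        /*
         * Invoked only after an RCU-tasks grace period, so no task can
         * still be executing in ->code_page.
         */
        vfree(tr->code_page);
        kfree(tr);
}

static void my_trampoline_retire(struct my_trampoline *tr)
{
        /* Defer the free until all tasks have reached a quiescent state. */
        call_rcu_tasks(&tr->rh, my_trampoline_free_cb);
}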
include/linux/init_task.h
include/linux/rcupdate.h
include/linux/sched.h
init/Kconfig
kernel/rcu/tiny.c
kernel/rcu/tree.c
kernel/rcu/update.c

index 2bb4c4f3531ab4b724a1050bf31df9640386daaa..dffd9258ee60b61c25351c6b1d2111fe63c81d09 100644 (file)
@@ -117,6 +117,14 @@ extern struct group_info init_groups;
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
+#ifdef CONFIG_TASKS_RCU
+#define INIT_TASK_RCU_TASKS(tsk)                                       \
+       .rcu_tasks_holdout = false,                                     \
+       .rcu_tasks_holdout_list =                                       \
+               LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list),
+#else
+#define INIT_TASK_RCU_TASKS(tsk)
+#endif
 
 extern struct cred init_cred;
 
@@ -224,6 +232,7 @@ extern struct task_group root_task_group;
        INIT_FTRACE_GRAPH                                               \
        INIT_TRACE_RECURSION                                            \
        INIT_TASK_RCU_PREEMPT(tsk)                                      \
+       INIT_TASK_RCU_TASKS(tsk)                                        \
        INIT_CPUSET_SEQ(tsk)                                            \
        INIT_RT_MUTEXES(tsk)                                            \
        INIT_VTIME(tsk)                                                 \
index d231aa17b1d7490092b1994facbe21c496bead66..3432063f4c873660dcfd1402e4ef1dab94ca680c 100644 (file)
@@ -197,6 +197,26 @@ void call_rcu_sched(struct rcu_head *head,
 
 void synchronize_sched(void);
 
+/**
+ * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), entry into idle, or transition to usermode
+ * execution.  As such, there are no read-side primitives analogous to
+ * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
+ * to determine that all tasks have passed through a safe state, not so
+ * much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head));
+
 #ifdef CONFIG_PREEMPT_RCU
 
 void __rcu_read_lock(void);
@@ -294,6 +314,22 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev,
                rcu_irq_exit(); \
        } while (0)
 
+/*
+ * Note a voluntary context switch for RCU-tasks benefit.  This is a
+ * macro rather than an inline function to avoid #include hell.
+ */
+#ifdef CONFIG_TASKS_RCU
+#define rcu_note_voluntary_context_switch(t) \
+       do { \
+               preempt_disable(); /* Exclude synchronize_sched(); */ \
+               if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
+                       ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
+               preempt_enable(); \
+       } while (0)
+#else /* #ifdef CONFIG_TASKS_RCU */
+#define rcu_note_voluntary_context_switch(t)   do { } while (0)
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP)
 bool __rcu_is_watching(void);
 #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */
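
In this commit the macro is invoked only from rcu_check_callbacks() on
user-mode ticks (see the kernel/rcu/tiny.c and kernel/rcu/tree.c hunks
below).  As an illustration only, a hypothetical voluntary rescheduling
point outside any trampoline-like region could also report an RCU-tasks
quiescent state before blocking:

#include <linux/rcupdate.h>
#include <linux/sched.h>

static inline void my_voluntary_resched(void)
{
        /* Safe point: this task is not inside any trampoline-like region. */
        rcu_note_voluntary_context_switch(current);
        schedule();
}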
index 5c2c885ee52b3996a2665dc3d8c0e21ff9245aaf..eaacac4ae77d58eb3e6f5c237d0b3e1bd1e8f962 100644 (file)
@@ -1270,6 +1270,11 @@ struct task_struct {
 #ifdef CONFIG_TREE_PREEMPT_RCU
        struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+       unsigned long rcu_tasks_nvcsw;
+       bool rcu_tasks_holdout;
+       struct list_head rcu_tasks_holdout_list;
+#endif /* #ifdef CONFIG_TASKS_RCU */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        struct sched_info sched_info;
@@ -2000,28 +2005,24 @@ extern void task_clear_jobctl_pending(struct task_struct *task,
                                      unsigned int mask);
 
 #ifdef CONFIG_PREEMPT_RCU
-
 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
+#ifdef CONFIG_PREEMPT_RCU
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special = 0;
-#ifdef CONFIG_TREE_PREEMPT_RCU
        p->rcu_blocked_node = NULL;
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
        INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+       p->rcu_tasks_holdout = false;
+       INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+#endif /* #ifdef CONFIG_TASKS_RCU */
 }
 
-#else
-
-static inline void rcu_copy_process(struct task_struct *p)
-{
-}
-
-#endif
-
 static inline void tsk_restore_flags(struct task_struct *task,
                                unsigned long orig_flags, unsigned long flags)
 {
index e84c6423a2e5a2dbe80157b13f8b16d17c3e2d06..c4539c4e177f4c3fdadae755cc6a2b2930485cff 100644 (file)
@@ -507,6 +507,16 @@ config PREEMPT_RCU
          This option enables preemptible-RCU code that is common between
          TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
 
+config TASKS_RCU
+       bool "Task-based RCU implementation using voluntary context switch"
+       default n
+       help
+         This option enables a task-based RCU implementation that uses
+         only voluntary context switch (not preemption!), idle, and
+         user-mode execution as quiescent states.
+
+         If unsure, say N.
+
 config RCU_STALL_COMMON
        def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
        help
index d9efcc13008c00201c130f87135348c7238118ff..717f00854fc073fcb04bd9b3dde7c2e509db529e 100644 (file)
@@ -254,6 +254,8 @@ void rcu_check_callbacks(int cpu, int user)
                rcu_sched_qs(cpu);
        else if (!in_softirq())
                rcu_bh_qs(cpu);
+       if (user)
+               rcu_note_voluntary_context_switch(current);
 }
 
 /*
index 1b70cb6fbe3ccda0466f3f0004865d82cdd9399d..8ad91d1e317dca532a30567645d0f192a8e93d3f 100644 (file)
@@ -2410,6 +2410,8 @@ void rcu_check_callbacks(int cpu, int user)
        rcu_preempt_check_callbacks(cpu);
        if (rcu_pending(cpu))
                invoke_rcu_core();
+       if (user)
+               rcu_note_voluntary_context_switch(current);
        trace_rcu_utilization(TPS("End scheduler-tick"));
 }
 
index 4056d7992a6c3d86d7a41478aeb35279cd5cde66..19b3dacb0753cee633a56ae2a6b8ab8fd931d40c 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/hardirq.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 
 #define CREATE_TRACE_POINTS
 
@@ -347,3 +348,173 @@ static int __init check_cpu_stall_init(void)
 early_initcall(check_cpu_stall_init);
 
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Simple variant of RCU whose quiescent states are voluntary context switch,
+ * user-space execution, and idle.  As such, grace periods can take one good
+ * long time.  There are no read-side primitives similar to rcu_read_lock()
+ * and rcu_read_unlock() because this implementation is intended to get
+ * the system into a safe state for some of the manipulations involved in
+ * tracing and the like.  Finally, this implementation does not support
+ * high call_rcu_tasks() rates from multiple CPUs.  If this is required,
+ * per-CPU callback lists will be needed.
+ */
+
+/* Global list of callbacks and associated lock. */
+static struct rcu_head *rcu_tasks_cbs_head;
+static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
+
+/* Post an RCU-tasks callback. */
+void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
+{
+       unsigned long flags;
+
+       rhp->next = NULL;
+       rhp->func = func;
+       raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+       *rcu_tasks_cbs_tail = rhp;
+       rcu_tasks_cbs_tail = &rhp->next;
+       raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
+
+/* See if the specified task has stopped holding out, remove from list if so. */
+static void check_holdout_task(struct task_struct *t)
+{
+       if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
+           t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
+           !ACCESS_ONCE(t->on_rq)) {
+               ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+               list_del_rcu(&t->rcu_tasks_holdout_list);
+               put_task_struct(t);
+       }
+}
+
+/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+       unsigned long flags;
+       struct task_struct *g, *t;
+       struct rcu_head *list;
+       struct rcu_head *next;
+       LIST_HEAD(rcu_tasks_holdouts);
+
+       /* FIXME: Add housekeeping affinity. */
+
+       /*
+        * Each pass through the following loop makes one check for
+        * newly arrived callbacks, and, if there are some, waits for
+        * one RCU-tasks grace period and then invokes the callbacks.
+        * This loop is terminated by the system going down.  ;-)
+        */
+       for (;;) {
+
+               /* Pick up any new callbacks. */
+               raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+               list = rcu_tasks_cbs_head;
+               rcu_tasks_cbs_head = NULL;
+               rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+               raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+
+               /* If there were none, wait a bit and start over. */
+               if (!list) {
+                       schedule_timeout_interruptible(HZ);
+                       WARN_ON(signal_pending(current));
+                       continue;
+               }
+
+               /*
+                * Wait for all pre-existing t->on_rq and t->nvcsw
+                * transitions to complete.  Invoking synchronize_sched()
+                * suffices because all these transitions occur with
+                * interrupts disabled.  Without this synchronize_sched(),
+                * a read-side critical section that started before the
+                * grace period might be incorrectly seen as having started
+                * after the grace period.
+                *
+                * This synchronize_sched() also dispenses with the
+                * need for a memory barrier on the first store to
+                * ->rcu_tasks_holdout, as it forces the store to happen
+                * after the beginning of the grace period.
+                */
+               synchronize_sched();
+
+               /*
+                * There were callbacks, so we need to wait for an
+                * RCU-tasks grace period.  Start off by scanning
+                * the task list for tasks that are not already
+                * voluntarily blocked.  Mark these tasks and make
+                * a list of them in rcu_tasks_holdouts.
+                */
+               rcu_read_lock();
+               for_each_process_thread(g, t) {
+                       if (t != current && ACCESS_ONCE(t->on_rq) &&
+                           !is_idle_task(t)) {
+                               get_task_struct(t);
+                               t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
+                               ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+                               list_add(&t->rcu_tasks_holdout_list,
+                                        &rcu_tasks_holdouts);
+                       }
+               }
+               rcu_read_unlock();
+
+               /*
+                * Each pass through the following loop scans the list
+                * of holdout tasks, removing any that are no longer
+                * holdouts.  When the list is empty, we are done.
+                */
+               while (!list_empty(&rcu_tasks_holdouts)) {
+                       schedule_timeout_interruptible(HZ);
+                       WARN_ON(signal_pending(current));
+                       rcu_read_lock();
+                       list_for_each_entry_rcu(t, &rcu_tasks_holdouts,
+                                               rcu_tasks_holdout_list)
+                               check_holdout_task(t);
+                       rcu_read_unlock();
+               }
+
+               /*
+                * Because ->on_rq and ->nvcsw are not guaranteed
+                * to have full memory barriers prior to them in the
+                * schedule() path, memory reordering on other CPUs could
+                * cause their RCU-tasks read-side critical sections to
+                * extend past the end of the grace period.  However,
+                * because these ->nvcsw updates are carried out with
+                * interrupts disabled, we can use synchronize_sched()
+                * to force the needed ordering on all such CPUs.
+                *
+                * This synchronize_sched() also confines all
+                * ->rcu_tasks_holdout accesses to be within the grace
+                * period, avoiding the need for memory barriers for
+                * ->rcu_tasks_holdout accesses.
+                */
+               synchronize_sched();
+
+               /* Invoke the callbacks. */
+               while (list) {
+                       next = list->next;
+                       local_bh_disable();
+                       list->func(list);
+                       local_bh_enable();
+                       list = next;
+                       cond_resched();
+               }
+       }
+}
+
+/* Spawn rcu_tasks_kthread() at boot time. */
+static int __init rcu_spawn_tasks_kthread(void)
+{
+       struct task_struct __maybe_unused *t;
+
+       t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
+       BUG_ON(IS_ERR(t));
+       return 0;
+}
+early_initcall(rcu_spawn_tasks_kthread);
+
+#endif /* #ifdef CONFIG_TASKS_RCU */
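
For reference, the callback queue above (rcu_tasks_cbs_head and
rcu_tasks_cbs_tail) is a tail-pointer singly linked list: call_rcu_tasks()
appends in O(1) through the tail pointer, and the kthread detaches the whole
pending list in O(1) under the lock.  A standalone sketch of just that
pattern, with a pthread mutex standing in for rcu_tasks_cbs_lock
(illustrative only, not kernel code):

#include <pthread.h>
#include <stddef.h>

struct cb {
        struct cb *next;
        void (*func)(struct cb *cbp);
};

static struct cb *cbs_head;
static struct cb **cbs_tail = &cbs_head;        /* points at the last ->next */
static pthread_mutex_t cbs_lock = PTHREAD_MUTEX_INITIALIZER;

/* Mirrors call_rcu_tasks(): append one callback at the tail. */
static void enqueue_cb(struct cb *cbp, void (*func)(struct cb *cbp))
{
        cbp->next = NULL;
        cbp->func = func;
        pthread_mutex_lock(&cbs_lock);
        *cbs_tail = cbp;
        cbs_tail = &cbp->next;
        pthread_mutex_unlock(&cbs_lock);
}

/* Mirrors the kthread's pickup step: detach the entire pending list. */
static struct cb *detach_all_cbs(void)
{
        struct cb *list;

        pthread_mutex_lock(&cbs_lock);
        list = cbs_head;
        cbs_head = NULL;
        cbs_tail = &cbs_head;
        pthread_mutex_unlock(&cbs_lock);
        return list;
}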