rwsem: Support optimistic spinning

author Davidlohr Bueso <davidlohr@hp.com>

Sat, 17 May 2014 13:19:24 +0000 (23:19 +1000)

committer Stephen Rothwell <sfr@canb.auug.org.au>

Wed, 21 May 2014 07:11:31 +0000 (17:11 +1000)
author Davidlohr Bueso <davidlohr@hp.com>
Sat, 17 May 2014 13:19:24 +0000 (23:19 +1000)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Wed, 21 May 2014 07:11:31 +0000 (17:11 +1000)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h

index 03f3b05e8ec17dda4d7b8dcefcc1e723bc6e606c..3e108f154cb66917ff3b6dd704827d3bc4f9133d 100644 (file)
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,6 +16,7 @@
  
  #include <linux/atomic.h>
  
+struct optimistic_spin_queue;
  struct rw_semaphore;
  
  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -23,9 +24,17 @@ struct rw_semaphore;
  #else
  /* All arch specific implementations share the same struct */
  struct rw_semaphore {
-       long                    count;
-       raw_spinlock_t          wait_lock;
-       struct list_head        wait_list;
+       long count;
+       raw_spinlock_t wait_lock;
+       struct list_head wait_list;
+#ifdef CONFIG_SMP
+       /*
+        * Write owner. Used as a speculative check to see
+        * if the owner is running on the cpu.
+        */
+       struct task_struct *owner;
+       struct optimistic_spin_queue *osq; /* spinner MCS lock */
+#endif
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
         struct lockdep_map      dep_map;
  #endif
@@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
  # define __RWSEM_DEP_MAP_INIT(lockname)
  #endif
  
+#ifdef CONFIG_SMP
+#define __RWSEM_INITIALIZER(name)                      \
+       { RWSEM_UNLOCKED_VALUE,                         \
+         __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),     \
+         LIST_HEAD_INIT((name).wait_list),             \
+         NULL, /* owner */                             \
+         NULL /* mcs lock */                           \
+         __RWSEM_DEP_MAP_INIT(name) }
+#else
  #define __RWSEM_INITIALIZER(name)                      \
         { RWSEM_UNLOCKED_VALUE,                         \
           __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),     \
           LIST_HEAD_INIT((name).wait_list)              \
           __RWSEM_DEP_MAP_INIT(name) }
+#endif
  
  #define DECLARE_RWSEM(name) \
         struct rw_semaphore name = __RWSEM_INITIALIZER(name)
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c

index b4219ff87b8c6dc95ae85bb2a4f38118681c0df6..b562aca654cdcf0e80fed18297ec4da4a3bebfb6 100644 (file)
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,12 +5,18 @@
   *
   * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
   * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
   */
  #include <linux/rwsem.h>
  #include <linux/sched.h>
  #include <linux/init.h>
+#include <linux/sched/rt.h>
  #include <linux/export.h>
  
+#include "mcs_spinlock.h"
+
  /*
   * Guide to the rw_semaphore's count field for common values.
   * (32-bit case illustrated, similar for 64-bit)
@@ -60,9 +66,7 @@
   *
   */
  
-/*
- * Initialize an rwsem:
- */
+/* initialize a rwsem */
  void __init_rwsem(struct rw_semaphore *sem, const char *name,
                   struct lock_class_key *key)
  {
@@ -76,6 +80,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
         sem->count = RWSEM_UNLOCKED_VALUE;
         raw_spin_lock_init(&sem->wait_lock);
         INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_SMP
+       sem->owner = NULL;
+       sem->osq = NULL;
+#endif
  }
  
  EXPORT_SYMBOL(__init_rwsem);
@@ -190,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
  }
  
  /*
- * wait for the read lock to be granted
+ * Wait for the read lock to be granted
   */
  __visible
  struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -237,64 +245,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
         return sem;
  }
  
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+       if (!(count & RWSEM_ACTIVE_MASK)) {
+               /* no active lockers in count: try to take the write lock */
+               if (sem->count == RWSEM_WAITING_BIAS &&
+                   cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+                           RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+                       if (!list_is_singular(&sem->wait_list))
+                               rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+                       return true;    /* write bias installed */
+               }
+       }
+       return false;   /* active lockers, or count was not WAITING_BIAS */
+}
+
+#ifdef CONFIG_SMP
  /*
- * wait until we successfully acquire the write lock
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+       long old, count = ACCESS_ONCE(sem->count);
+
+       while (true) {
+               if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+                       return false;   /* neither 0 nor WAITING_BIAS */
+
+               old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+               if (old == count)
+                       return true;    /* cmpxchg added the write bias */
+
+               count = old;    /* count changed under us: re-check it */
+       }
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+       struct task_struct *owner;
+       bool on_cpu = true;     /* default when no owner is recorded */
+
+       if (need_resched())
+               return false;
+
+       rcu_read_lock();
+       owner = ACCESS_ONCE(sem->owner);
+       if (owner)
+               on_cpu = owner->on_cpu;
+       rcu_read_unlock();
+
+       /*
+        * If sem->owner is not set, the holder may have just acquired
+        * the rwsem and not set the owner yet, or the rwsem has been
+        * released. Either way on_cpu stays true and we allow spinning.
+        */
+       return on_cpu;
+}
+
+static inline bool owner_running(struct rw_semaphore *sem,
+                                struct task_struct *owner)
+{
+       if (sem->owner != owner)
+               return false;
+
+       /*
+        * Ensure we emit the owner->on_cpu dereference _after_ checking
+        * that sem->owner still matches owner. If that check fails, owner
+        * might point to free()d memory; if it still matches, the
+        * rcu_read_lock() ensures the memory stays valid.
+        */
+       barrier();
+
+       return owner->on_cpu;
+}
+
+static noinline
+bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+{
+       rcu_read_lock();
+       while (owner_running(sem, owner)) {
+               if (need_resched())
+                       break;
+
+               arch_mutex_cpu_relax();
+       }
+       rcu_read_unlock();
+
+       /*
+        * We leave the loop above on need_resched(), or once the owner
+        * changed (a sign of heavy contention) or is no longer on a cpu.
+        * Return success only when sem->owner is NULL.
+        */
+       return sem->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+       struct task_struct *owner;
+       bool taken = false;
+
+       preempt_disable();
+
+       /* sem->wait_lock should not be held when doing optimistic spinning */
+       if (!rwsem_can_spin_on_owner(sem))
+               goto done;
+
+       if (!osq_lock(&sem->osq))
+               goto done;
+
+       while (true) {
+               owner = ACCESS_ONCE(sem->owner);
+               if (owner && !rwsem_spin_on_owner(sem, owner))
+                       break;
+
+               /* lockless attempt: wait_lock is not taken on this path */
+               if (rwsem_try_write_lock_unqueued(sem)) {
+                       taken = true;
+                       break;
+               }
+
+               /*
+                * When there's no owner, we might have preempted between the
+                * owner acquiring the lock and setting the owner field. If
+                * we're an RT task, that will live-lock because we won't let
+                * the owner complete.
+                */
+               if (!owner && (need_resched() || rt_task(current)))
+                       break;
+
+               /*
+                * The cpu_relax() call is a compiler barrier which forces
+                * everything in this loop to be re-loaded. We don't need
+                * memory barriers as we'll eventually observe the right
+                * values at the cost of a few extra spins.
+                */
+               arch_mutex_cpu_relax();
+       }
+       osq_unlock(&sem->osq);
+done:
+       preempt_enable();
+       return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+       return false;   /* stub: no spinning, the lock is never taken here */
+}
+#endif
+
+/*
+ * Wait until we successfully acquire the write lock
   */
  __visible
  struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
  {
-       long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
+       long count;
+       bool waiting = true; /* any queued threads before us */
         struct rwsem_waiter waiter;
-       struct task_struct *tsk = current;
  
-       /* set up my own style of waitqueue */
-       waiter.task = tsk;
+       /* undo write bias from down_write operation, stop active locking */
+       count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+       /* do optimistic spinning and steal lock if possible */
+       if (rwsem_optimistic_spin(sem))
+               return sem;
+
+       /*
+        * Optimistic spinning failed, proceed to the slowpath
+        * and block until we can acquire the sem.
+        */
+       waiter.task = current;
         waiter.type = RWSEM_WAITING_FOR_WRITE;
  
         raw_spin_lock_irq(&sem->wait_lock);
+
+       /* account for this before adding a new element to the list */
         if (list_empty(&sem->wait_list))
-               adjustment += RWSEM_WAITING_BIAS;
+               waiting = false;
+
         list_add_tail(&waiter.list, &sem->wait_list);
  
         /* we're now waiting on the lock, but no longer actively locking */
-       count = rwsem_atomic_update(adjustment, sem);
+       if (waiting) {
+               count = ACCESS_ONCE(sem->count);
  
-       /* If there were already threads queued before us and there are no
-        * active writers, the lock must be read owned; so we try to wake
-        * any read locks that were queued ahead of us. */
-       if (count > RWSEM_WAITING_BIAS &&
-           adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
-               sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+               /*
+                * If there were already threads queued before us and there are no
+                * active writers, the lock must be read owned; so we try to wake
+                * any read locks that were queued ahead of us.
+                */
+               if (count > RWSEM_WAITING_BIAS)
+                       sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+
+       } else
+               count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
  
         /* wait until we successfully acquire the lock */
-       set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+       set_current_state(TASK_UNINTERRUPTIBLE);
         while (true) {
-               if (!(count & RWSEM_ACTIVE_MASK)) {
-                       /* Try acquiring the write lock. */
-                       count = RWSEM_ACTIVE_WRITE_BIAS;
-                       if (!list_is_singular(&sem->wait_list))
-                               count += RWSEM_WAITING_BIAS;
-
-                       if (sem->count == RWSEM_WAITING_BIAS &&
-                           cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
-                                                       RWSEM_WAITING_BIAS)
-                               break;
-               }
-
+               if (rwsem_try_write_lock(count, sem))
+                       break;
                 raw_spin_unlock_irq(&sem->wait_lock);
  
                 /* Block until there are no active lockers. */
                 do {
                         schedule();
-                       set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+                       set_current_state(TASK_UNINTERRUPTIBLE);
                 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
  
                 raw_spin_lock_irq(&sem->wait_lock);
         }
+       __set_current_state(TASK_RUNNING);
  
         list_del(&waiter.list);
         raw_spin_unlock_irq(&sem->wait_lock);
-       tsk->state = TASK_RUNNING;
  
         return sem;
  }
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c

index cfff1435bdfb2f1e6d8a88d797c2a205f9145daf..42f806de49d421092a7bd077c8efb4df9546cb94 100644 (file)
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@
  
  #include <linux/atomic.h>
  
+#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+       sem->owner = current;   /* record the calling task as write owner */
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+       sem->owner = NULL;      /* no write owner recorded */
+}
+
+#else  /* owner tracking compiled out: both helpers are no-ops */
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif /* CONFIG_SMP && CONFIG_RWSEM_XCHGADD_ALGORITHM */
+
  /*
   * lock for reading
   */
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
         rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
  
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+       rwsem_set_owner(sem);
  }
  
  EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
  {
         int ret = __down_write_trylock(sem);
  
-       if (ret == 1)
+       if (ret == 1) {
                 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+               rwsem_set_owner(sem);
+       }
+
         return ret;
  }
  
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
  {
         rwsem_release(&sem->dep_map, 1, _RET_IP_);
  
+       rwsem_clear_owner(sem);
         __up_write(sem);
  }
  
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
          * lockdep: a downgraded write will live on as a write
          * dependency.
          */
+       rwsem_clear_owner(sem);
         __downgrade_write(sem);
  }
  
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
         rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
  
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+       rwsem_set_owner(sem);
  }
  
  EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
         rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
  
         LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+       rwsem_set_owner(sem);
  }
  
  EXPORT_SYMBOL(down_write_nested);
author	Davidlohr Bueso <davidlohr@hp.com>
	Sat, 17 May 2014 13:19:24 +0000 (23:19 +1000)
committer	Stephen Rothwell <sfr@canb.auug.org.au>
	Wed, 21 May 2014 07:11:31 +0000 (17:11 +1000)
include/linux/rwsem.h		patch \| blob \| history
kernel/locking/rwsem-xadd.c		patch \| blob \| history
kernel/locking/rwsem.c		patch \| blob \| history