]> git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - kernel/locking/qspinlock_paravirt.h
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[karo-tx-linux.git] / kernel / locking / qspinlock_paravirt.h
index df19ae4debd09c134d438b57e4ead7c71462c2b6..f0450ff4829b6c1308d4768b2ae3a7c575b1cf51 100644 (file)
 
 #define _Q_SLOW_VAL    (3U << _Q_LOCKED_OFFSET)
 
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
 enum vcpu_state {
        vcpu_running = 0,
-       vcpu_halted,
+       vcpu_halted,            /* Set only in pv_wait_node() */
+       vcpu_hashed,            /* Was vcpu_halted; lock hashed by pv_kick_node() */
 };
 
 struct pv_node {
@@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
 
 /*
  * Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() sets _Q_SLOW_VAL and fills in the hash table on behalf of
+ * the halted vcpu, instead of waking it.
  */
 static void pv_wait_node(struct mcs_spinlock *node)
 {
@@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 *
                 * [S] pn->state = vcpu_halted    [S] next->locked = 1
                 *     MB                             MB
-                * [L] pn->locked               [RmW] pn->state = vcpu_running
+                * [L] pn->locked               [RmW] pn->state = vcpu_hashed
                 *
-                * Matches the xchg() from pv_kick_node().
+                * Matches the cmpxchg() from pv_kick_node().
                 */
                smp_store_mb(pn->state, vcpu_halted);
 
@@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
                        pv_wait(&pn->state, vcpu_halted);
 
                /*
-                * Reset the vCPU state to avoid unncessary CPU kicking
+                * If pv_kick_node() changed us to vcpu_hashed, retain that value
+                * so that pv_wait_head() knows to not also try to hash this lock.
                 */
-               WRITE_ONCE(pn->state, vcpu_running);
+               cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
                /*
                 * If the locked flag is still not set after wakeup, it is a
@@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 * MCS lock will be released soon.
                 */
        }
+
        /*
         * By now our node->locked should be 1 and our caller will not actually
         * spin-wait for it. We do however rely on our caller to do a
@@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
 }
 
 /*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking a waiter halted in pv_wait_node(), advance its state as
+ * if it were already waiting in pv_wait_head(); this avoids a wake/sleep cycle.
  */
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
        struct pv_node *pn = (struct pv_node *)node;
+       struct __qspinlock *l = (void *)lock;
 
        /*
-        * Note that because node->locked is already set, this actual
-        * mcs_spinlock entry could be re-used already.
+        * If the vCPU is indeed halted, advance its state to vcpu_hashed so
+        * that pv_wait_head() does not hash the lock again. If OTOH this fails,
+        * the vCPU was running and will see next->locked and advance itself.
         *
-        * This should be fine however, kicking people for no reason is
-        * harmless.
+        * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+        */
+       if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+               return;
+
+       /*
+        * Set _Q_SLOW_VAL and put the lock into the hash table.
         *
-        * See the comment in pv_wait_node().
+        * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+        * the hash table later on at unlock time, no atomic instruction is
+        * needed.
         */
-       if (xchg(&pn->state, vcpu_running) == vcpu_halted)
-               pv_kick(pn->cpu);
+       WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+       (void)pv_hash(lock, pn);
 }
 
 /*
@@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
        struct qspinlock **lp = NULL;
        int loop;
 
+       /*
+        * If pv_kick_node() already advanced our state, we don't need to
+        * insert ourselves into the hash table anymore.
+        */
+       if (READ_ONCE(pn->state) == vcpu_hashed)
+               lp = (struct qspinlock **)1;
+
        for (;;) {
                for (loop = SPIN_THRESHOLD; loop; loop--) {
                        if (!READ_ONCE(l->locked))
@@ -240,17 +266,19 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
                        cpu_relax();
                }
 
-               WRITE_ONCE(pn->state, vcpu_halted);
                if (!lp) { /* ONCE */
                        lp = pv_hash(lock, pn);
+
                        /*
-                        * lp must be set before setting _Q_SLOW_VAL
+                        * We must hash before setting _Q_SLOW_VAL, such that
+                        * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+                        * we'll be sure to be able to observe our hash entry.
                         *
-                        * [S] lp = lock                [RmW] l = l->locked = 0
-                        *     MB                             MB
-                        * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+                        *   [S] <hash>                 [RmW] l->locked == _Q_SLOW_VAL
+                        *       MB                           RMB
+                        * [RmW] l->locked = _Q_SLOW_VAL  [L] <unhash>
                         *
-                        * Matches the cmpxchg() in __pv_queued_spin_unlock().
+                        * Matches the smp_rmb() in __pv_queued_spin_unlock().
                         */
                        if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
                                /*
@@ -287,23 +315,33 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 {
        struct __qspinlock *l = (void *)lock;
        struct pv_node *node;
-       u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+       u8 locked;
 
        /*
         * We must not unlock if SLOW, because in that case we must first
         * unhash. Otherwise it would be possible to have multiple @lock
         * entries, which would be BAD.
         */
-       if (likely(lockval == _Q_LOCKED_VAL))
+       locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+       if (likely(locked == _Q_LOCKED_VAL))
                return;
 
-       if (unlikely(lockval != _Q_SLOW_VAL)) {
-               if (debug_locks_silent)
-                       return;
-               WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val));
+       if (unlikely(locked != _Q_SLOW_VAL)) {
+               WARN(!debug_locks_silent,
+                    "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+                    (unsigned long)lock, atomic_read(&lock->val));
                return;
        }
 
+       /*
+        * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+        * so we need a barrier to order the read of the node data in
+        * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+        *
+        * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+        */
+       smp_rmb();
+
        /*
         * Since the above failed to release, this must be the SLOW path.
         * Therefore start by looking up the blocked node and unhashing it.
@@ -319,9 +357,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
        /*
         * At this point the memory pointed at by lock can be freed/reused,
         * however we can still use the pv_node to kick the CPU.
+        * The other vCPU may not really be halted, but kicking an active
+        * vCPU is harmless other than the additional latency in completing
+        * the unlock.
         */
-       if (READ_ONCE(node->state) == vcpu_halted)
-               pv_kick(node->cpu);
+       pv_kick(node->cpu);
 }
 /*
  * Include the architecture specific callee-save thunk of the