git.kernelconcepts.de Git - karo-tx-linux.git/blobdiff - kernel/events/ring_buffer.c
perf: Simplify the ring-buffer code
[karo-tx-linux.git] / kernel / events / ring_buffer.c
index cd55144270b5401030f4b5ce5576f97b6b976b63..6929c5848d4ff5ad19a20d6804d809d45da82396 100644 (file)
 #include <linux/perf_event.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/circ_buf.h>
 
 #include "internal.h"
 
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
-                             unsigned long offset, unsigned long head)
-{
-       unsigned long sz = perf_data_size(rb);
-       unsigned long mask = sz - 1;
-
-       /*
-        * check if user-writable
-        * overwrite : over-write its own tail
-        * !overwrite: buffer possibly drops events.
-        */
-       if (rb->overwrite)
-               return true;
-
-       /*
-        * verify that payload is not bigger than buffer
-        * otherwise masking logic may fail to detect
-        * the "not enough space" condition
-        */
-       if ((head - offset) > sz)
-               return false;
-
-       offset = (offset - tail) & mask;
-       head   = (head   - tail) & mask;
-
-       if ((int)(head - offset) < 0)
-               return false;
-
-       return true;
-}
-
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
        atomic_set(&handle->rb->poll, POLL_IN);
@@ -87,10 +57,31 @@ again:
                goto out;
 
        /*
-        * Publish the known good head. Rely on the full barrier implied
-        * by atomic_dec_and_test() order the rb->head read and this
-        * write.
+        * Since the mmap() consumer (userspace) can run on a different CPU:
+        *
+        *   kernel                             user
+        *
+        *   READ ->data_tail                   READ ->data_head
+        *   smp_mb()   (A)                     smp_rmb()       (C)
+        *   WRITE $data                        READ $data
+        *   smp_wmb()  (B)                     smp_mb()        (D)
+        *   STORE ->data_head                  WRITE ->data_tail
+        *
+        * Where A pairs with D, and B pairs with C.
+        *
+        * I don't think A needs to be a full barrier because we won't in fact
+        * write data until we see the store from userspace. So we simply don't
+        * issue the data WRITE until we observe it. Be conservative for now.
+        *
+        * OTOH, D needs to be a full barrier since it separates the data READ
+        * from the tail WRITE.
+        *
+        * For B a WMB is sufficient since it separates two WRITEs, and for C
+        * an RMB is sufficient since it separates two READs.
+        *
+        * See perf_output_begin().
         */
+       smp_wmb();
        rb->user_page->data_head = head;
 
        /*
@@ -154,13 +145,16 @@ int perf_output_begin(struct perf_output_handle *handle,
                 * Userspace could choose to issue a mb() before updating the
                 * tail pointer. So that all reads will be completed before the
                 * write is issued.
+                *
+                * See perf_output_put_handle().
                 */
                tail = ACCESS_ONCE(rb->user_page->data_tail);
-               smp_rmb();
+               smp_mb();
                offset = head = local_read(&rb->head);
-               head += size;
-               if (unlikely(!perf_output_space(rb, tail, offset, head)))
+               if (!rb->overwrite &&
+                   unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
                        goto fail;
+               head += size;
        } while (local_cmpxchg(&rb->head, offset, head) != offset);
 
        if (head - local_read(&rb->wakeup) > rb->watermark)