]> git.kernelconcepts.de Git - karo-tx-linux.git/commitdiff
epoll: support for disabling items, and a self-test app
authorPaton J. Lewis <palewis@adobe.com>
Sat, 3 Nov 2012 00:42:52 +0000 (11:42 +1100)
committerStephen Rothwell <sfr@canb.auug.org.au>
Wed, 7 Nov 2012 04:15:27 +0000 (15:15 +1100)
It is not currently possible to reliably delete epoll items when using the
same epoll set from multiple threads.  After calling epoll_ctl with
EPOLL_CTL_DEL, another thread might still be executing code related to an
event for that epoll item (in response to epoll_wait).  Therefore the
deleting thread does not know when it is safe to delete resources
pertaining to the associated epoll item because another thread might be
using those resources.

The deleting thread could wait an arbitrary amount of time after calling
epoll_ctl with EPOLL_CTL_DEL and before deleting the item, but this is
inefficient and could result in the destruction of resources before
another thread is done handling an event returned by epoll_wait.

This patch enhances epoll_ctl to support EPOLL_CTL_DISABLE, which disables
an epoll item.  If epoll_ctl returns -EBUSY in this case, then another
thread may handling a return from epoll_wait for this item.  Otherwise if
epoll_ctl returns 0, then it is safe to delete the epoll item.  This
allows multiple threads to use a mutex to determine when it is safe to
delete an epoll item and its associated resources, which allows epoll
items to be deleted both efficiently and without error in a multi-threaded
environment.  Note that EPOLL_CTL_DISABLE is only useful in conjunction
with EPOLLONESHOT, and using EPOLL_CTL_DISABLE on an epoll item without
EPOLLONESHOT returns -EINVAL.

This patch also adds a new test_epoll self-test program to both
demonstrate the need for this feature and test it.

Signed-off-by: Paton J. Lewis <palewis@adobe.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Paul Holland <pholland@adobe.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/eventpoll.c
include/uapi/linux/eventpoll.h
tools/testing/selftests/Makefile
tools/testing/selftests/epoll/Makefile [new file with mode: 0644]
tools/testing/selftests/epoll/test_epoll.c [new file with mode: 0644]

index cd96649bfe62da9e408dbd629f56f6452c139bc9..fadf5958af4bf6314d36751ebbf210647c1a33ee 100644 (file)
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
 static inline int ep_op_has_event(int op)
 {
-       return op != EPOLL_CTL_DEL;
+       return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
 }
 
 /* Initialize the poll safe wake up structure */
@@ -676,6 +676,36 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
        return 0;
 }
 
+/*
+ * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
+ * had no event flags set, indicating that another thread may be currently
+ * handling that item's events (in the case that EPOLLONESHOT was being
+ * used). Otherwise a zero result indicates that the item has been disabled
+ * from receiving events. A disabled item may be re-enabled via
+ * EPOLL_CTL_MOD. Must be called with "mtx" held.
+ */
+static int ep_disable(struct eventpoll *ep, struct epitem *epi)
+{
+       int result = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ep->lock, flags);
+       if (epi->event.events & EPOLLONESHOT) {
+               if (epi->event.events & ~EP_PRIVATE_BITS) {
+                       if (ep_is_linked(&epi->rdllink))
+                               list_del_init(&epi->rdllink);
+                       /* Ensure ep_poll_callback will not add epi back onto
+                          ready list: */
+                       epi->event.events &= EP_PRIVATE_BITS;
+               } else
+                       result = -EBUSY;
+       } else
+               result = -EINVAL;
+       spin_unlock_irqrestore(&ep->lock, flags);
+
+       return result;
+}
+
 static void ep_free(struct eventpoll *ep)
 {
        struct rb_node *rbp;
@@ -1020,8 +1050,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
        rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
-
-
 #define PATH_ARR_SIZE 5
 /*
  * These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1787,6 +1815,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                } else
                        error = -ENOENT;
                break;
+       case EPOLL_CTL_DISABLE:
+               if (epi)
+                       error = ep_disable(ep, epi);
+               else
+                       error = -ENOENT;
+               break;
        }
        mutex_unlock(&ep->mtx);
 
index 2c267bcbb85c1c4e31b676127db7686d8d027e09..8c99ce7202c53bcd93887c93dd8d8e75ebcd8aa2 100644 (file)
@@ -25,6 +25,7 @@
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
 #define EPOLL_CTL_MOD 3
+#define EPOLL_CTL_DISABLE 4
 
 /*
  * Request the handling of system wakeup events so as to prevent system suspends
index 85baf11e2acd7d11aa4990a0f7f53f8d28689a20..c5aab2bb9fbdb876900b3812dfb29baf5c02c341 100644 (file)
@@ -1,4 +1,4 @@
-TARGETS = breakpoints kcmp mqueue vm cpu-hotplug memory-hotplug
+TARGETS = breakpoints epoll kcmp mqueue vm cpu-hotplug memory-hotplug
 
 all:
        for TARGET in $(TARGETS); do \
diff --git a/tools/testing/selftests/epoll/Makefile b/tools/testing/selftests/epoll/Makefile
new file mode 100644 (file)
index 0000000..19806ed
--- /dev/null
@@ -0,0 +1,11 @@
+# Makefile for epoll selftests
+
+all: test_epoll
+%: %.c
+       gcc -pthread -g -o $@ $^
+
+run_tests: all
+       ./test_epoll
+
+clean:
+       $(RM) test_epoll
diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c
new file mode 100644 (file)
index 0000000..4a14a7f
--- /dev/null
@@ -0,0 +1,364 @@
+/*
+ *  tools/testing/selftests/epoll/test_epoll.c
+ *
+ *  Copyright 2012 Adobe Systems Incorporated
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Paton J. Lewis <palewis@adobe.com>
+ *
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+
+/*
+ * A pointer to an epoll_item_private structure will be stored in the epoll
+ * item's event structure so that we can get access to the epoll_item_private
+ * data after calling epoll_wait:
+ */
+struct epoll_item_private {
+       int index;  /* Position of this struct within the epoll_items array. */
+       int fd;
+       uint32_t events;
+       pthread_mutex_t mutex;  /* Guards the following variables... */
+       int stop;
+       int status;  /* Stores any error encountered while handling item. */
+       /* The following variable allows us to test whether we have encountered
+          a problem while attempting to cancel and delete the associated
+          event. When the test program exits, 'deleted' should be exactly
+          one. If it is greater than one, then the failed test reflects a real
+          world situation where we would have tried to access the epoll item's
+          private data after deleting it: */
+       int deleted;
+};
+
+struct epoll_item_private *epoll_items;
+
+/*
+ * Delete the specified item from the epoll set. In a real-world secneario this
+ * is where we would free the associated data structure, but in this testing
+ * environment we retain the structure so that we can test for double-deletion:
+ */
+void delete_item(int index)
+{
+       __sync_fetch_and_add(&epoll_items[index].deleted, 1);
+}
+
+/*
+ * A pointer to a read_thread_data structure will be passed as the argument to
+ * each read thread:
+ */
+struct read_thread_data {
+       int stop;
+       int status;  /* Indicates any error encountered by the read thread. */
+       int epoll_set;
+};
+
+/*
+ * The function executed by the read threads:
+ */
+void *read_thread_function(void *function_data)
+{
+       struct read_thread_data *thread_data =
+               (struct read_thread_data *)function_data;
+       struct epoll_event event_data;
+       struct epoll_item_private *item_data;
+       char socket_data;
+
+       /* Handle events until we encounter an error or this thread's 'stop'
+          condition is set: */
+       while (1) {
+               int result = epoll_wait(thread_data->epoll_set,
+                                       &event_data,
+                                       1,      /* Number of desired events */
+                                       1000);  /* Timeout in ms */
+               if (result < 0) {
+                       /* Breakpoints signal all threads. Ignore that while
+                          debugging: */
+                       if (errno == EINTR)
+                               continue;
+                       thread_data->status = errno;
+                       return 0;
+               } else if (thread_data->stop)
+                       return 0;
+               else if (result == 0)  /* Timeout */
+                       continue;
+
+               /* We need the mutex here because checking for the stop
+                  condition and re-enabling the epoll item need to be done
+                  together as one atomic operation when EPOLL_CTL_DISABLE is
+                  available: */
+               item_data = (struct epoll_item_private *)event_data.data.ptr;
+               pthread_mutex_lock(&item_data->mutex);
+
+               /* Remove the item from the epoll set if we want to stop
+                  handling that event: */
+               if (item_data->stop)
+                       delete_item(item_data->index);
+               else {
+                       /* Clear the data that was written to the other end of
+                          our non-blocking socket: */
+                       do {
+                               if (read(item_data->fd, &socket_data, 1) < 1) {
+                                       if ((errno == EAGAIN) ||
+                                           (errno == EWOULDBLOCK))
+                                               break;
+                                       else
+                                               goto error_unlock;
+                               }
+                       } while (item_data->events & EPOLLET);
+
+                       /* The item was one-shot, so re-enable it: */
+                       event_data.events = item_data->events;
+                       if (epoll_ctl(thread_data->epoll_set,
+                                                 EPOLL_CTL_MOD,
+                                                 item_data->fd,
+                                                 &event_data) < 0)
+                               goto error_unlock;
+               }
+
+               pthread_mutex_unlock(&item_data->mutex);
+       }
+
+error_unlock:
+       thread_data->status = item_data->status = errno;
+       pthread_mutex_unlock(&item_data->mutex);
+       return 0;
+}
+
+/*
+ * A pointer to a write_thread_data structure will be passed as the argument to
+ * the write thread:
+ */
+struct write_thread_data {
+       int stop;
+       int status;  /* Indicates any error encountered by the write thread. */
+       int n_fds;
+       int *fds;
+};
+
+/*
+ * The function executed by the write thread. It writes a single byte to each
+ * socket in turn until the stop condition for this thread is set. If writing to
+ * a socket would block (i.e. errno was EAGAIN), we leave that socket alone for
+ * the moment and just move on to the next socket in the list. We don't care
+ * about the order in which we deliver events to the epoll set. In fact we don't
+ * care about the data we're writing to the pipes at all; we just want to
+ * trigger epoll events:
+ */
+void *write_thread_function(void *function_data)
+{
+       const char data = 'X';
+       int index;
+       struct write_thread_data *thread_data =
+               (struct write_thread_data *)function_data;
+       while (!thread_data->stop)
+               for (index = 0;
+                    !thread_data->stop && (index < thread_data->n_fds);
+                    ++index)
+                       if ((write(thread_data->fds[index], &data, 1) < 1) &&
+                               (errno != EAGAIN) &&
+                               (errno != EWOULDBLOCK)) {
+                               thread_data->status = errno;
+                               return;
+                       }
+}
+
+/*
+ * Arguments are currently ignored:
+ */
+int main(int argc, char **argv)
+{
+       const int n_read_threads = 100;
+       const int n_epoll_items = 500;
+       int index;
+       int epoll_set = epoll_create1(0);
+       struct write_thread_data write_thread_data = {
+               0, 0, n_epoll_items, malloc(n_epoll_items * sizeof(int))
+       };
+       struct read_thread_data *read_thread_data =
+               malloc(n_read_threads * sizeof(struct read_thread_data));
+       pthread_t *read_threads = malloc(n_read_threads * sizeof(pthread_t));
+       pthread_t write_thread;
+       int socket_pair[2];
+       struct epoll_event event_data;
+
+       printf("-----------------\n");
+       printf("Runing test_epoll\n");
+       printf("-----------------\n");
+
+       epoll_items = malloc(n_epoll_items * sizeof(struct epoll_item_private));
+
+       if (epoll_set < 0 || epoll_items == 0 || write_thread_data.fds == 0 ||
+               read_thread_data == 0 || read_threads == 0)
+               goto error;
+
+       if (sysconf(_SC_NPROCESSORS_ONLN) < 2) {
+               printf("Error: please run this test on a multi-core system.\n");
+               goto error;
+       }
+
+       /* Create the socket pairs and epoll items: */
+       for (index = 0; index < n_epoll_items; ++index) {
+               if (socketpair(AF_UNIX,
+                              SOCK_STREAM | SOCK_NONBLOCK,
+                              0,
+                              socket_pair) < 0)
+                       goto error;
+               write_thread_data.fds[index] = socket_pair[0];
+               epoll_items[index].index = index;
+               epoll_items[index].fd = socket_pair[1];
+               if (pthread_mutex_init(&epoll_items[index].mutex, NULL) != 0)
+                       goto error;
+               /* We always use EPOLLONESHOT because this test is currently
+                  structured to demonstrate the need for EPOLL_CTL_DISABLE,
+                  which only produces useful information in the EPOLLONESHOT
+                  case (without EPOLLONESHOT, calling epoll_ctl with
+                  EPOLL_CTL_DISABLE will never return EBUSY). If support for
+                  testing events without EPOLLONESHOT is desired, it should
+                  probably be implemented in a separate unit test. */
+               epoll_items[index].events = EPOLLIN | EPOLLONESHOT;
+               if (index < n_epoll_items / 2)
+                       epoll_items[index].events |= EPOLLET;
+               epoll_items[index].stop = 0;
+               epoll_items[index].status = 0;
+               epoll_items[index].deleted = 0;
+               event_data.events = epoll_items[index].events;
+               event_data.data.ptr = &epoll_items[index];
+               if (epoll_ctl(epoll_set,
+                             EPOLL_CTL_ADD,
+                             epoll_items[index].fd,
+                             &event_data) < 0)
+                       goto error;
+       }
+
+#ifdef EPOLL_CTL_DISABLE
+       /* Test to make sure that using EPOLL_CTL_DISABLE without EPOLLONESHOT
+          returns a clear error: */
+       if (socketpair(AF_UNIX,
+                      SOCK_STREAM | SOCK_NONBLOCK,
+                      0,
+                      socket_pair) < 0)
+               goto error;
+       event_data.events = EPOLLIN;
+       event_data.data.ptr = NULL;
+       if (epoll_ctl(epoll_set, EPOLL_CTL_ADD,
+                     socket_pair[1], &event_data) < 0)
+               goto error;
+       if ((epoll_ctl(epoll_set, EPOLL_CTL_DISABLE,
+                      socket_pair[1], NULL) == 0) || (errno != EINVAL))
+               goto error;
+       if (epoll_ctl(epoll_set, EPOLL_CTL_DEL, socket_pair[1], NULL) != 0)
+               goto error;
+#endif
+
+       /* Create and start the read threads: */
+       for (index = 0; index < n_read_threads; ++index) {
+               read_thread_data[index].stop = 0;
+               read_thread_data[index].status = 0;
+               read_thread_data[index].epoll_set = epoll_set;
+               if (pthread_create(&read_threads[index],
+                                  NULL,
+                                  read_thread_function,
+                                  &read_thread_data[index]) != 0)
+                       goto error;
+       }
+
+       if (pthread_create(&write_thread,
+                          NULL,
+                          write_thread_function,
+                          &write_thread_data) != 0)
+               goto error;
+
+       /* Cancel all event pollers: */
+#ifdef EPOLL_CTL_DISABLE
+       for (index = 0; index < n_epoll_items; ++index) {
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               ++epoll_items[index].stop;
+               if (epoll_ctl(epoll_set,
+                             EPOLL_CTL_DISABLE,
+                             epoll_items[index].fd,
+                             NULL) == 0)
+                       delete_item(index);
+               else if (errno != EBUSY) {
+                       pthread_mutex_unlock(&epoll_items[index].mutex);
+                       goto error;
+               }
+               /* EBUSY means events were being handled; allow the other thread
+                  to delete the item. */
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+       }
+#else
+       for (index = 0; index < n_epoll_items; ++index) {
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               ++epoll_items[index].stop;
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+               /* Wait in case a thread running read_thread_function is
+                  currently executing code between epoll_wait and
+                  pthread_mutex_lock with this item. Note that a longer delay
+                  would make double-deletion less likely (at the expense of
+                  performance), but there is no guarantee that any delay would
+                  ever be sufficient. Note also that we delete all event
+                  pollers at once for testing purposes, but in a real-world
+                  environment we are likely to want to be able to cancel event
+                  pollers at arbitrary times. Therefore we can't improve this
+                  situation by just splitting this loop into two loops
+                  (i.e. signal 'stop' for all items, sleep, and then delete all
+                  items). We also can't fix the problem via EPOLL_CTL_DEL
+                  because that command can't prevent the case where some other
+                  thread is executing read_thread_function within the region
+                  mentioned above: */
+               usleep(1);
+               pthread_mutex_lock(&epoll_items[index].mutex);
+               if (!epoll_items[index].deleted)
+                       delete_item(index);
+               pthread_mutex_unlock(&epoll_items[index].mutex);
+       }
+#endif
+
+       /* Shut down the read threads: */
+       for (index = 0; index < n_read_threads; ++index)
+               __sync_fetch_and_add(&read_thread_data[index].stop, 1);
+       for (index = 0; index < n_read_threads; ++index) {
+               if (pthread_join(read_threads[index], NULL) != 0)
+                       goto error;
+               if (read_thread_data[index].status)
+                       goto error;
+       }
+
+       /* Shut down the write thread: */
+       __sync_fetch_and_add(&write_thread_data.stop, 1);
+       if ((pthread_join(write_thread, NULL) != 0) || write_thread_data.status)
+               goto error;
+
+       /* Check for final error conditions: */
+       for (index = 0; index < n_epoll_items; ++index) {
+               if (epoll_items[index].status != 0)
+                       goto error;
+               if (pthread_mutex_destroy(&epoll_items[index].mutex) < 0)
+                       goto error;
+       }
+       for (index = 0; index < n_epoll_items; ++index)
+               if (epoll_items[index].deleted != 1) {
+                       printf("Error: item data deleted %1d times.\n",
+                                  epoll_items[index].deleted);
+                       goto error;
+               }
+
+       printf("[PASS]\n");
+       return 0;
+
+ error:
+       printf("[FAIL]\n");
+       return errno;
+}