From 98022748f6c7bce85b9f123fd4d1a621219dd8d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Aug 2012 22:42:36 -0400 Subject: eventpoll: use-after-possible-free in epoll_create1() As soon as we'd installed the file into descriptor table, it can get closed by another thread. Freeing ep in process... Signed-off-by: Al Viro --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1c8b5567080..eedec84c180 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1654,8 +1654,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) error = PTR_ERR(file); goto out_free_fd; } - fd_install(fd, file); ep->file = file; + fd_install(fd, file); return fd; out_free_fd: -- cgit v1.2.3 From 5e196a9cf557dbc2bf808824c6c4998150e18d06 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Aug 2012 21:27:40 -0400 Subject: switch epoll_wait(2) to fget_light() Signed-off-by: Al Viro --- fs/eventpoll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eedec84c180..567ae72e155 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1809,7 +1809,7 @@ error_return: SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { - int error; + int error, fput_needed; struct file *file; struct eventpoll *ep; @@ -1825,7 +1825,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, /* Get the "struct file *" for the eventpoll file */ error = -EBADF; - file = fget(epfd); + file = fget_light(epfd, &fput_needed); if (!file) goto error_return; @@ -1847,7 +1847,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, error = ep_poll(ep, events, maxevents, timeout); error_fput: - fput(file); + fput_light(file, fput_needed); error_return: return error; -- cgit v1.2.3 From 2903ff019b346ab8d36ebbf54853c3aaf6590608 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 28 Aug 2012 12:52:22 -0400 Subject: switch simple cases of fget_light to fdget Signed-off-by: Al Viro --- fs/eventpoll.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 567ae72e155..cd96649bfe6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1809,8 +1809,8 @@ error_return: SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { - int error, fput_needed; - struct file *file; + int error; + struct fd f; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ @@ -1818,38 +1818,33 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, return -EINVAL; /* Verify that the area passed by the user is writeable */ - if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { - error = -EFAULT; - goto error_return; - } + if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) + return -EFAULT; /* Get the "struct file *" for the eventpoll file */ - error = -EBADF; - file = fget_light(epfd, &fput_needed); - if (!file) - goto error_return; + f = fdget(epfd); + if (!f.file) + return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; - if (!is_file_epoll(file)) + if (!is_file_epoll(f.file)) goto error_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ - ep = file->private_data; + ep = f.file->private_data; /* Time to fish for events ... */ error = ep_poll(ep, events, maxevents, timeout); error_fput: - fput_light(file, fput_needed); -error_return: - + fdput(f); return error; } -- cgit v1.2.3 From 03a7beb55b9fad363f0dd33e72ccf2d3e1c2a406 Mon Sep 17 00:00:00 2001 From: "Paton J. Lewis" Date: Thu, 4 Oct 2012 17:13:39 -0700 Subject: epoll: support for disabling items, and a self-test app Enhanced epoll_ctl to support EPOLL_CTL_DISABLE, which disables an epoll item. If epoll_ctl doesn't return -EBUSY in this case, it is then safe to delete the epoll item in a multi-threaded environment. Also added a new test_epoll self- test app to both demonstrate the need for this feature and test it. Signed-off-by: Paton J. Lewis Cc: Alexander Viro Cc: Jason Baron Cc: Paul Holland Cc: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cd96649bfe6..da72250ddc1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op != EPOLL_CTL_DEL; + return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; } /* Initialize the poll safe wake up structure */ @@ -676,6 +676,34 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } +/* + * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item + * had no event flags set, indicating that another thread may be currently + * handling that item's events (in the case that EPOLLONESHOT was being + * used). Otherwise a zero result indicates that the item has been disabled + * from receiving events. A disabled item may be re-enabled via + * EPOLL_CTL_MOD. Must be called with "mtx" held. + */ +static int ep_disable(struct eventpoll *ep, struct epitem *epi) +{ + int result = 0; + unsigned long flags; + + spin_lock_irqsave(&ep->lock, flags); + if (epi->event.events & ~EP_PRIVATE_BITS) { + if (ep_is_linked(&epi->rdllink)) + list_del_init(&epi->rdllink); + /* Ensure ep_poll_callback will not add epi back onto ready + list: */ + epi->event.events &= EP_PRIVATE_BITS; + } + else + result = -EBUSY; + spin_unlock_irqrestore(&ep->lock, flags); + + return result; +} + static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1020,8 +1048,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } - - #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1787,6 +1813,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; + case EPOLL_CTL_DISABLE: + if (epi) + error = ep_disable(ep, epi); + else + error = -ENOENT; + break; } mutex_unlock(&ep->mtx); -- cgit v1.2.3 From a80a6b85b428e6ce12a8363bb1f08d44c50f3252 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 8 Nov 2012 15:53:35 -0800 Subject: revert "epoll: support for disabling items, and a self-test app" Revert commit 03a7beb55b9f ("epoll: support for disabling items, and a self-test app") pending resolution of the issues identified by Michael Kerrisk, copied below. We'll revisit this for 3.8. : I've taken a look at this patch as it currently stands in 3.7-rc1, and : done a bit of testing. (By the way, the test program : tools/testing/selftests/epoll/test_epoll.c does not compile...) : : There are one or two places where the behavior seems a little strange, : so I have a question or two at the end of this mail. But other than : that, I want to check my understanding so that the interface can be : correctly documented. : : Just to go though my understanding, the problem is the following : scenario in a multithreaded application: : : 1. Multiple threads are performing epoll_wait() operations, : and maintaining a user-space cache that contains information : corresponding to each file descriptor being monitored by : epoll_wait(). : : 2. At some point, a thread wants to delete (EPOLL_CTL_DEL) : a file descriptor from the epoll interest list, and : delete the corresponding record from the user-space cache. : : 3. The problem with (2) is that some other thread may have : previously done an epoll_wait() that retrieved information : about the fd in question, and may be in the middle of using : information in the cache that relates to that fd. Thus, : there is a potential race. : : 4. The race can't solved purely in user space, because doing : so would require applying a mutex across the epoll_wait() : call, which would of course blow thread concurrency. : : Right? : : Your solution is the EPOLL_CTL_DISABLE operation. I want to : confirm my understanding about how to use this flag, since : the description that has accompanied the patches so far : has been a bit sparse : : 0. In the scenario you're concerned about, deleting a file : descriptor means (safely) doing the following: : (a) Deleting the file descriptor from the epoll interest list : using EPOLL_CTL_DEL : (b) Deleting the corresponding record in the user-space cache : : 1. It's only meaningful to use this EPOLL_CTL_DISABLE in : conjunction with EPOLLONESHOT. : : 2. Using EPOLL_CTL_DISABLE without using EPOLLONESHOT in : conjunction is a logical error. : : 3. The correct way to code multithreaded applications using : EPOLL_CTL_DISABLE and EPOLLONESHOT is as follows: : : a. All EPOLL_CTL_ADD and EPOLL_CTL_MOD operations should : should EPOLLONESHOT. : : b. When a thread wants to delete a file descriptor, it : should do the following: : : [1] Call epoll_ctl(EPOLL_CTL_DISABLE) : [2] If the return status from epoll_ctl(EPOLL_CTL_DISABLE) : was zero, then the file descriptor can be safely : deleted by the thread that made this call. : [3] If the epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY, : then the descriptor is in use. In this case, the calling : thread should set a flag in the user-space cache to : indicate that the thread that is using the descriptor : should perform the deletion operation. : : Is all of the above correct? : : The implementation depends on checking on whether : (events & ~EP_PRIVATE_BITS) == 0 : This replies on the fact that EPOLL_CTL_AD and EPOLL_CTL_MOD always : set EPOLLHUP and EPOLLERR in the 'events' mask, and EPOLLONESHOT : causes those flags (as well as all others in ~EP_PRIVATE_BITS) to be : cleared. : : A corollary to the previous paragraph is that using EPOLL_CTL_DISABLE : is only useful in conjunction with EPOLLONESHOT. However, as things : stand, one can use EPOLL_CTL_DISABLE on a file descriptor that does : not have EPOLLONESHOT set in 'events' This results in the following : (slightly surprising) behavior: : : (a) The first call to epoll_ctl(EPOLL_CTL_DISABLE) returns 0 : (the indicator that the file descriptor can be safely deleted). : (b) The next call to epoll_ctl(EPOLL_CTL_DISABLE) fails with EBUSY. : : This doesn't seem particularly useful, and in fact is probably an : indication that the user made a logic error: they should only be using : epoll_ctl(EPOLL_CTL_DISABLE) on a file descriptor for which : EPOLLONESHOT was set in 'events'. If that is correct, then would it : not make sense to return an error to user space for this case? Cc: Michael Kerrisk Cc: "Paton J. Lewis" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index da72250ddc1..cd96649bfe6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { - return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD; + return op != EPOLL_CTL_DEL; } /* Initialize the poll safe wake up structure */ @@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) return 0; } -/* - * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item - * had no event flags set, indicating that another thread may be currently - * handling that item's events (in the case that EPOLLONESHOT was being - * used). Otherwise a zero result indicates that the item has been disabled - * from receiving events. A disabled item may be re-enabled via - * EPOLL_CTL_MOD. Must be called with "mtx" held. - */ -static int ep_disable(struct eventpoll *ep, struct epitem *epi) -{ - int result = 0; - unsigned long flags; - - spin_lock_irqsave(&ep->lock, flags); - if (epi->event.events & ~EP_PRIVATE_BITS) { - if (ep_is_linked(&epi->rdllink)) - list_del_init(&epi->rdllink); - /* Ensure ep_poll_callback will not add epi back onto ready - list: */ - epi->event.events &= EP_PRIVATE_BITS; - } - else - result = -EBUSY; - spin_unlock_irqrestore(&ep->lock, flags); - - return result; -} - static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; @@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate @@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else error = -ENOENT; break; - case EPOLL_CTL_DISABLE: - if (epi) - error = ep_disable(ep, epi); - else - error = -ENOENT; - break; } mutex_unlock(&ep->mtx); -- cgit v1.2.3 From 138d22b58696c506799f8de759804083ff9effae Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:05:02 -0800 Subject: fs, epoll: add procfs fdinfo helper This allows us to print out eventpoll target file descriptor, events and data, the /proc/pid/fdinfo/fd consists of | pos: 0 | flags: 02 | tfd: 5 events: 1d data: ffffffffffffffff enabled: 1 [avagin@: fix for unitialized ret variable] Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index cd96649bfe6..be56b21435f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -38,6 +38,8 @@ #include #include #include +#include +#include /* * LOCKING: @@ -783,8 +785,34 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) return pollflags != -1 ? pollflags : 0; } +#ifdef CONFIG_PROC_FS +static int ep_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct eventpoll *ep = f->private_data; + struct rb_node *rbp; + int ret = 0; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + struct epitem *epi = rb_entry(rbp, struct epitem, rbn); + + ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", + epi->ffd.fd, epi->event.events, + (long long)epi->event.data); + if (ret) + break; + } + mutex_unlock(&ep->mtx); + + return ret; +} +#endif + /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = ep_show_fdinfo, +#endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, -- cgit v1.2.3 From 128dd1759d96ad36c379240f8b9463e8acfd37a1 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 1 Jan 2013 21:20:27 +0000 Subject: epoll: prevent missed events on EPOLL_CTL_MOD EPOLL_CTL_MOD sets the interest mask before calling f_op->poll() to ensure events are not missed. Since the modifications to the interest mask are not protected by the same lock as ep_poll_callback, we need to ensure the change is visible to other CPUs calling ep_poll_callback. We also need to ensure f_op->poll() has an up-to-date view of past events which occured before we modified the interest mask. So this barrier also pairs with the barrier in wq_has_sleeper(). This should guarantee either ep_poll_callback or f_op->poll() (or both) will notice the readiness of a recently-ready/modified item. This issue was encountered by Andreas Voellmy and Junchang(Jason) Wang in: http://thread.gmane.org/gmane.linux.kernel/1408782/ Signed-off-by: Eric Wong Cc: Hans Verkuil Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Al Viro Cc: Davide Libenzi Cc: Hans de Goede Cc: Mauro Carvalho Chehab Cc: David Miller Cc: Eric Dumazet Cc: Andrew Morton Cc: Andreas Voellmy Tested-by: "Junchang(Jason) Wang" Cc: netdev@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index be56b21435f..9fec1836057 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1313,7 +1313,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ - epi->event.events = event->events; + epi->event.events = event->events; /* need barrier below */ pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { @@ -1323,6 +1323,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even ep_destroy_wakeup_source(epi); } + /* + * The following barrier has two effects: + * + * 1) Flush epi changes above to other CPUs. This ensures + * we do not miss events from ep_poll_callback if an + * event occurs immediately after we call f_op->poll(). + * We need this because we did not take ep->lock while + * changing epi above (but ep_poll_callback does take + * ep->lock). + * + * 2) We also need to ensure we do not miss _past_ events + * when calling f_op->poll(). This barrier also + * pairs with the barrier in wq_has_sleeper (see + * comments for wq_has_sleeper). + * + * This barrier will now guarantee ep_poll_callback or f_op->poll + * (or both) will notice the readiness of an item. + */ + smp_mb(); + /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. -- cgit v1.2.3