From f3f86e33dc3da437fa4f204588ce7c78ea756982 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 30 Oct 2015 16:53:57 -0700 Subject: [PATCH] vfs: Fix pathological performance case for __alloc_fd() Al Viro points out that: > > * [Linux-specific aside] our __alloc_fd() can degrade quite badly > > with some use patterns. The cacheline pingpong in the bitmap is probably > > inevitable, unless we accept considerably heavier memory footprint, > > but we also have a case when alloc_fd() takes O(n) and it's _not_ hard > > to trigger - close(3);open(...); will have the next open() after that > > scanning the entire in-use bitmap. And Eric Dumazet has a somewhat realistic multithreaded microbenchmark that opens and closes a lot of sockets with minimal work per socket. This patch largely fixes it. We keep a 2nd-level bitmap of the open file bitmaps, showing which words are already full. So then we can traverse that second-level bitmap to efficiently skip already allocated file descriptors. On his benchmark, this improves performance by up to an order of magnitude, by avoiding the excessive open file bitmap scanning. Tested-and-acked-by: Eric Dumazet Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/file.c | 39 +++++++++++++++++++++++++++++++++++---- include/linux/fdtable.h | 2 ++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/fs/file.c b/fs/file.c index 6c672ad329e9..6f6eb2b03af5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -56,6 +56,9 @@ static void free_fdtable_rcu(struct rcu_head *rcu) __free_fdtable(container_of(rcu, struct fdtable, rcu)); } +#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) +#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) + /* * Expand the fdset in the files_struct. Called with the files spinlock * held for write. @@ -77,6 +80,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) memset((char *)(nfdt->open_fds) + cpy, 0, set); memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); memset((char *)(nfdt->close_on_exec) + cpy, 0, set); + + cpy = BITBIT_SIZE(ofdt->max_fds); + set = BITBIT_SIZE(nfdt->max_fds) - cpy; + memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); + memset(cpy+(char *)nfdt->full_fds_bits, 0, set); } static struct fdtable * alloc_fdtable(unsigned int nr) @@ -115,12 +123,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->fd = data; data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; + data += nr / BITS_PER_BYTE; + fdt->full_fds_bits = data; return fdt; @@ -229,14 +239,18 @@ static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) __clear_bit(fd, fdt->close_on_exec); } -static inline void __set_open_fd(int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->open_fds); + fd /= BITS_PER_LONG; + if (!~fdt->open_fds[fd]) + __set_bit(fd, fdt->full_fds_bits); } -static inline void __clear_open_fd(int fd, struct fdtable *fdt) +static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); + __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } static int count_open_files(struct fdtable *fdt) @@ -280,6 +294,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; + new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); @@ -323,6 +338,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); + memcpy(new_fdt->full_fds_bits, old_fdt->full_fds_bits, BITBIT_SIZE(open_files)); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -454,10 +470,25 @@ struct files_struct init_files = { .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, + .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; +static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start) +{ + unsigned long maxfd = fdt->max_fds; + unsigned long maxbit = maxfd / BITS_PER_LONG; + unsigned long bitbit = start / BITS_PER_LONG; + + bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; + if (bitbit > maxfd) + return maxfd; + if (bitbit > start) + start = bitbit; + return find_next_zero_bit(fdt->open_fds, maxfd, start); +} + /* * allocate a file descriptor, mark it busy. */ @@ -476,7 +507,7 @@ repeat: fd = files->next_fd; if (fd < fdt->max_fds) - fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); + fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 674e3e226465..5295535b60c6 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -26,6 +26,7 @@ struct fdtable { struct file __rcu **fd; /* current fd array */ unsigned long *close_on_exec; unsigned long *open_fds; + unsigned long *full_fds_bits; struct rcu_head rcu; }; @@ -59,6 +60,7 @@ struct files_struct { int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; + unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; -- 2.39.2