From: Emil Renner Berthing <erb@one.com>
To: John McCutchan <john@johnmccutchan.com>,
Robert Love <rlove@rlove.org>, Eric Paris <eparis@parisplace.org>
Cc: linux-kernel@vger.kernel.org, Jesper Dahl Nyerup <nyerup@one.com>,
Anders Saaby <as@one.com>
Subject: Inotify scalability issue
Date: Tue, 19 Jun 2012 15:51:10 +0200 [thread overview]
Message-ID: <4FE083CE.20908@one.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 2217 bytes --]
Hi,
We're running Dovecot mailservers and are experiencing problems similar
to what is described here:
http://old.nabble.com/Very-High-Load-on-Dovecot-2-and-Errors-in-mail.err.-tt33856207.html#a33856207
I've written two small programs to expose the problem.
watcher.c:
This program reads a filename from the commandline, creates a new
inotify handle and sets it up to watch IN_CLOSE_WRITE and IN_DELETE on
the file. It then writes a 'z' to stdout, and does a blocking read from
inotify. After receiving an event from inotify the program prints an 'x'
to stdout, closes the inotify handle and then prints a '.' to stdout
before exiting.
test.c:
This program creates 20 files and spawns 20 watchers, one to watch each
of them. For each watcher it waits between 1 and 2 seconds before touching
the file they watch (which should cause it to wake up and exit), and
then spawns a new watcher on the file, again waiting between 1 and 2
seconds before touching the file again etc.
On my dual-core workstation, the test program behaves as you'd
expect. That is, it prints
zzzzzzzzzzzzzzzzzzzzx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx (etc.)
However on a 16-core server it behaves very differently:
zzzzzzzzzzzzzzzzzzzzxzxzxzxz.xzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxz......................................................................................................xzxzxzxzxz.xxzzxzxzxzxzxzxzxzxzxzxzxz.................xzxz.xz
(sorry about the long line)
That is, watchers are spawned to watch their files, they're woken up by
inotify as they should be, but then they pile up in D-state waiting for
the close call to finish. Only at irregular intervals do they all return.
They seem to be sleeping on the synchronize_srcu() call in
fsnotify_destroy_group() of fs/notify/group.c.
We've tested this on various machines running kernels from 3.0 and up,
and the trend is very clear: the more processors, the worse it gets.
However, I also tried it on one 48-core server running an old 2.6.32
Debian kernel, and there the processes don't pile up.
/Emil
[-- Attachment #2: watcher.c --]
[-- Type: text/x-c, Size: 1157 bytes --]
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/inotify.h>
#include <limits.h>
/* Print a printf-style diagnostic message to stderr. */
#define log(...) fprintf(stderr, __VA_ARGS__)
/*
 * Emit a single progress character on stdout and flush immediately so
 * the character becomes visible without waiting for the buffer to fill.
 */
static void
put(int c)
{
	putc(c, stdout);
	fflush(stdout);
}
/*
 * Watch a single file with inotify and report progress on stdout.
 *
 * Usage: watcher <file>
 *
 * Prints 'z' once the watch for IN_CLOSE_WRITE | IN_DELETE is installed,
 * 'x' after the blocking read on the inotify descriptor returns, and '.'
 * after the descriptor has been closed successfully.  If inotify_init()
 * fails with EMFILE, a '!' is printed instead of an error message.
 *
 * Returns EXIT_SUCCESS after the full z/x/. sequence and EXIT_FAILURE
 * on any error.
 */
int
main(int argc, char *argv[])
{
	int ifd = -1;
	int wd;
	/* Room for one event carrying the longest possible file name. */
	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];

	if (argc != 2) {
		log("I need a file..\n");
		goto error;
	}

	ifd = inotify_init();
	if (ifd < 0) {
		if (errno == EMFILE)
			put('!');
		else
			log("Error initializing inotify: %s\n",
			    strerror(errno));
		goto error;
	}

	wd = inotify_add_watch(ifd, argv[1], IN_CLOSE_WRITE | IN_DELETE);
	if (wd < 0) {
		log("Error adding watch on '%s': %s\n",
		    argv[1], strerror(errno));
		goto error;
	}

	put('z');

	if (read(ifd, buf, sizeof(buf)) < 0) {
		log("Error reading inotify event: %s\n",
		    strerror(errno));
		goto error;
	}

	put('x');

	if (close(ifd)) {
		log("Error closing inotify: %s\n",
		    strerror(errno));
		/* Do not try to close the descriptor a second time below. */
		ifd = -1;
		goto error;
	}

	put('.');
	return EXIT_SUCCESS;

error:
	/*
	 * 0 is a valid descriptor (inotify_init() can return it when stdin
	 * was closed), so only negative values mean "nothing to close".
	 */
	if (ifd >= 0)
		close(ifd);
	return EXIT_FAILURE;
}
[-- Attachment #3: test.c --]
[-- Type: text/x-c, Size: 2346 bytes --]
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <signal.h>
/* Print a printf-style diagnostic message to stderr. */
#define log(...) fprintf(stderr, __VA_ARGS__)
/* Path of the watcher executable passed to execv(); also used as its argv[0]. */
#define WATCHER "watcher"
/*
 * Bookkeeping for one watched file: the time (in microseconds, as
 * returned by now()) at which the file is due to be touched next, and
 * the name of the file.  The name buffer is sized so that the two
 * members together occupy 32 bytes.
 */
struct proc {
	uint64_t deadline;
	char filename[32 - sizeof(uint64_t)];
};

/* One entry per watched file; NPROC is supplied on the compiler command line. */
static struct proc procs[NPROC];
/*
 * Fork a child that executes WATCHER with `filename` as its only
 * argument.
 *
 * Returns the value of fork() in the parent: the child's pid on success
 * or a negative value (after logging) if the fork failed.  A child whose
 * execv() fails logs the error and exits with EXIT_FAILURE.
 */
static pid_t
spawn_watcher(char *filename)
{
	char *child_argv[] = { WATCHER, filename, NULL };
	pid_t child = fork();

	if (child < 0) {
		log("Error forking: %s\n", strerror(errno));
	} else if (child == 0 && execv(WATCHER, child_argv) != 0) {
		/* Only reached in the child when the exec itself failed. */
		log("Error spawning '%s': %s\n",
		    WATCHER, strerror(errno));
		exit(EXIT_FAILURE);
	}

	return child;
}
/*
 * Open `filename` with the given open(2) flags (mode 0644 applies when
 * the flags create the file) and close it again straight away.
 *
 * Returns 0 on success; logs the error and returns -1 if either the
 * open or the close fails.
 */
static int
touch(char *filename, int flags)
{
	int fd = open(filename, flags, 0644);

	if (fd >= 0 && close(fd) == 0)
		return 0;

	log("Error touching '%s': %s\n",
	    filename, strerror(errno));
	return -1;
}
/*
 * Return the current time of day from gettimeofday() as a single count
 * of microseconds.  The return value of gettimeofday() is deliberately
 * ignored.
 */
static uint64_t
now(void)
{
	struct timeval tv;
	uint64_t usec;

	(void)gettimeofday(&tv, NULL);

	usec = (uint64_t)tv.tv_sec;
	usec *= 1000000;
	usec += (uint64_t)tv.tv_usec;
	return usec;
}
/*
 * Pick a pseudo-random delay in microseconds in the inclusive range
 * [DELAY_MIN, DELAY_MAX] (both supplied on the compiler command line).
 *
 * The offset is reduced with a modulo instead of dividing random() by
 * RAND_MAX / span: that integer quotient is truncated, which allowed the
 * result to overshoot DELAY_MAX, and it became a division by zero
 * whenever the span was 0 or larger than RAND_MAX.  If the span exceeds
 * the range of random(), only the lower part of the interval is used.
 *
 * Returns DELAY_MIN when DELAY_MAX is not greater than DELAY_MIN.
 */
static uint64_t
random_delay(void)
{
	const uint64_t lo = (uint64_t)DELAY_MIN;
	const uint64_t hi = (uint64_t)DELAY_MAX;

	if (hi <= lo)
		return lo;

	return lo + (uint64_t)random() % (hi - lo + 1);
}
/*
 * Test driver: create NPROC files, start one watcher per file, then
 * forever touch whichever file has the earliest deadline, start a new
 * watcher on it and give it a new random deadline.
 *
 * The loop only ends on error, so the program always returns
 * EXIT_FAILURE.  On the error path every file is touched once more so
 * that watchers still waiting for an event are woken up.
 */
int
main(void)
{
	unsigned int i;
	struct proc *next;

	/*
	 * signal() returns the previous disposition, which is non-NULL when
	 * e.g. SIGCHLD was already ignored by the parent; only SIG_ERR
	 * indicates failure.
	 */
	if (signal(SIGCHLD, SIG_IGN) == SIG_ERR) {
		log("Error setting SIGCHLD handler: %s\n",
		    strerror(errno));
		return EXIT_FAILURE;
	}

	/* Create the files to be watched. */
	for (i = 0; i < NPROC; i++) {
		/* Bounded write: the name buffer is small and NPROC is a build option. */
		(void)snprintf(procs[i].filename, sizeof(procs[i].filename),
		    "%03u.tmp", i);
		if (touch(procs[i].filename, O_WRONLY | O_CREAT)) {
			log("Error creating file '%s': %s\n",
			    procs[i].filename, strerror(errno));
			return EXIT_FAILURE;
		}
	}

	/* Start the initial watchers and remember the earliest deadline. */
	next = &procs[0];
	for (i = 0; i < NPROC; i++) {
		if (spawn_watcher(procs[i].filename) < 0)
			goto error;

		procs[i].deadline = now() + random_delay();
		if (procs[i].deadline < next->deadline)
			next = &procs[i];
	}

	while (1) {
		uint64_t n = now();

		/* Not due yet: sleep until the deadline and re-check the clock. */
		if (next->deadline > n) {
			usleep(next->deadline - n);
			continue;
		}

		/* Wake the current watcher, then replace it with a fresh one. */
		if (touch(next->filename, O_WRONLY))
			goto error;
		if (spawn_watcher(next->filename) < 0)
			goto error;

		next->deadline = now() + random_delay();

		/* Find the entry that is due next. */
		for (i = 0; i < NPROC; i++) {
			if (procs[i].deadline < next->deadline)
				next = &procs[i];
		}
	}

error:
	/* Touch every file so that watchers still blocked on a read get an event. */
	for (i = 0; i < NPROC; i++)
		(void)touch(procs[i].filename, O_WRONLY);
	return EXIT_FAILURE;
}
[-- Attachment #4: Makefile --]
[-- Type: text/plain, Size: 362 bytes --]
# Build-time parameters passed to test.c via -D: the number of watched
# files and the bounds (in microseconds) of the random delay between
# touches of a file.
NPROC = 20
DELAY_MIN = 1000000
DELAY_MAX = 2000000

CC = gcc
CFLAGS ?= -O2 -pipe -g
CFLAGS += -std=gnu99 -Wall -Wextra -pedantic

# all and clean name actions rather than files; declaring them phony keeps
# them working even if a file with one of those names exists.
.PHONY: all clean

all: test watcher

test: test.c
	$(CC) $(CFLAGS) -DNPROC=$(NPROC) -DDELAY_MIN=$(DELAY_MIN) -DDELAY_MAX=$(DELAY_MAX) $(LDFLAGS) $< -o $@

watcher: watcher.c
	$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@

clean:
	rm -rf test watcher *.tmp
reply other threads:[~2012-06-19 14:00 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4FE083CE.20908@one.com \
--to=erb@one.com \
--cc=as@one.com \
--cc=eparis@parisplace.org \
--cc=john@johnmccutchan.com \
--cc=linux-kernel@vger.kernel.org \
--cc=nyerup@one.com \
--cc=rlove@rlove.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.