All of lore.kernel.org
 help / color / mirror / Atom feed
From: Emil Renner Berthing <erb@one.com>
To: John McCutchan <john@johnmccutchan.com>,
	Robert Love <rlove@rlove.org>, Eric Paris <eparis@parisplace.org>
Cc: linux-kernel@vger.kernel.org, Jesper Dahl Nyerup <nyerup@one.com>,
	Anders Saaby <as@one.com>
Subject: Inotify scalability issue
Date: Tue, 19 Jun 2012 15:51:10 +0200	[thread overview]
Message-ID: <4FE083CE.20908@one.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 2217 bytes --]

Hi,

We're running Dovecot mailservers and are experiencing problems similar 
to what is described here:
http://old.nabble.com/Very-High-Load-on-Dovecot-2-and-Errors-in-mail.err.-tt33856207.html#a33856207

I've written two small programs to expose the problem.

watcher.c:
This program reads a filename from the commandline, creates a new 
inotify handle and sets it up to watch IN_CLOSE_WRITE and IN_DELETE on 
the file. It then writes a 'z' to stdout, and does a blocking read from 
inotify. After receiving an event from inotify the program prints an 'x' 
to stdout, closes the inotify handle and then prints a '.' to stdout 
before exiting.

test.c:
This program creates 20 files and spawns 20 watchers, one for each file.
For each watcher it waits between 1 and 2 seconds before touching
the file it watches (which should cause the watcher to wake up and exit),
and then spawns a new watcher on the file, again waiting between 1 and 2
seconds before touching the file again, etc.

On my dualcore workstation running the test program behaves as you'd 
expect. That is it prints
zzzzzzzzzzzzzzzzzzzzx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx.zx (etc.)

However on a 16-core server it behaves very differently:
zzzzzzzzzzzzzzzzzzzzxzxzxzxz.xzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxzxz......................................................................................................xzxzxzxzxz.xxzzxzxzxzxzxzxzxzxzxzxzxz.................xzxz.xz

(sorry about the long line)
That is watchers are spawned to watch their files, they're woken up by 
inotify as they should be, but then they pile up in D-state waiting for 
the close call to finish. Only at irregular intervals do they all return.

They seem to be sleeping on the synchronize_srcu() call in
fsnotify_destroy_group() of fs/notify/group.c.

We've tested this on various machines running kernels from 3.0 and up,
and the trend is very clear: The more processors, the worse it gets.
However, I also tried it on one 48-core server running an old 2.6.32
Debian kernel, and there the processes don't pile up.

/Emil

[-- Attachment #2: watcher.c --]
[-- Type: text/x-c, Size: 1157 bytes --]

#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/inotify.h>
#include <limits.h>

/* printf-style diagnostic helper: formats its arguments onto stderr. */
#define log(...) fprintf(stderr, __VA_ARGS__)

/*
 * Write the single character `c` to stdout and flush immediately, so the
 * progress markers appear as soon as they are produced instead of sitting
 * in the stdio buffer.
 */
static void
put(int c)
{
	(void)putc(c, stdout);
	(void)fflush(stdout);
}

/*
 * Watch one file with inotify and report progress on stdout.
 *
 * argv[1] names the file to watch for IN_CLOSE_WRITE and IN_DELETE.
 * Progress markers written to stdout:
 *   'z'  the watch is installed and we are about to block in read()
 *   'x'  an event was read from the inotify descriptor
 *   '.'  the inotify descriptor was closed successfully
 *   '!'  inotify_init() failed with EMFILE
 *
 * Returns EXIT_SUCCESS after a full z/x/. cycle, EXIT_FAILURE otherwise
 * (all other errors are reported on stderr).
 */
int
main(int argc, char *argv[])
{
	int ifd = -1;
	int wd;
	/* Room for one event carrying the longest possible file name. */
	char buf[sizeof(struct inotify_event) + NAME_MAX + 1];

	if (argc != 2) {
		log("I need a file..\n");
		goto error;
	}

	ifd = inotify_init();
	if (ifd < 0) {
		if (errno == EMFILE)
			put('!');
		else
			log("Error initializing inotify: %s\n",
				strerror(errno));
		goto error;
	}

	wd = inotify_add_watch(ifd, argv[1], IN_CLOSE_WRITE | IN_DELETE);
	if (wd < 0) {
		log("Error adding watch on '%s': %s\n",
				argv[1], strerror(errno));
		goto error;
	}

	put('z');
	/* Block until the first event arrives; its contents are not examined. */
	if (read(ifd, buf, sizeof(buf)) < 0) {
		log("Error reading inotify event: %s\n",
				strerror(errno));
		goto error;
	}

	put('x');
	if (close(ifd)) {
		log("Error closing inotify: %s\n",
				strerror(errno));
		/* Do not attempt a second close() in the error path. */
		ifd = -1;
		goto error;
	}
	put('.');
	return EXIT_SUCCESS;

error:
	/* 0 is a valid descriptor; only a negative value means "not open". */
	if (ifd >= 0)
		close(ifd);
	return EXIT_FAILURE;
}

[-- Attachment #3: test.c --]
[-- Type: text/x-c, Size: 2346 bytes --]

#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <signal.h>

/* printf-style diagnostic helper: formats its arguments onto stderr. */
#define log(...) fprintf(stderr, __VA_ARGS__)
/* Path of the watcher executable; also passed to it as argv[0]. */
#define WATCHER "watcher"

/*
 * Bookkeeping for one watched file.  The name buffer is sized so that the
 * deadline and the file name together take 32 bytes.
 */
struct proc {
	uint64_t deadline;	/* time, on the now() clock, of the next touch */
	char filename[32 - sizeof(uint64_t)];	/* file the watcher observes */
};

/* One entry per watched file; NPROC is supplied on the compiler command line. */
static struct proc procs[NPROC];

/*
 * Fork a child that executes WATCHER with `filename` as its only argument.
 *
 * Returns the child's pid in the parent, or a negative value (after logging
 * the reason) if fork() failed.  A child whose execv() fails logs the error
 * and terminates with EXIT_FAILURE; the parent is not told about that.
 */
static pid_t
spawn_watcher(char *filename)
{
	char *argv[3] = { WATCHER, filename, NULL };
	pid_t pid;

	pid = fork();
	if (pid < 0) {
		log("Error forking: %s\n", strerror(errno));
		return pid;
	}

	if (pid == 0) {
		/* execv() only ever returns on failure. */
		(void)execv(WATCHER, argv);
		log("Error spawning '%s': %s\n",
				WATCHER, strerror(errno));
		/*
		 * Leave with _exit() rather than exit(): the child must not
		 * run atexit handlers or flush stdio state copied from the
		 * parent.
		 */
		_exit(EXIT_FAILURE);
	}

	return pid;
}

/*
 * Open `filename` with the given open() flags (mode 0644 is supplied for
 * the case where the flags create the file) and close it again at once.
 *
 * Returns 0 on success.  If either the open or the close fails, the error
 * is logged and -1 is returned.
 */
static int
touch(char *filename, int flags)
{
	int fd;

	fd = open(filename, flags, 0644);
	if (fd >= 0 && close(fd) == 0)
		return 0;

	log("Error touching '%s': %s\n",
			filename, strerror(errno));
	return -1;
}

/*
 * Current time of day from gettimeofday(), expressed as a single count of
 * microseconds (seconds * 1000000 + microseconds).  The return value of
 * gettimeofday() is deliberately ignored.
 */
static uint64_t
now(void)
{
	struct timeval tv;
	uint64_t usec;

	(void)gettimeofday(&tv, NULL);

	usec = (uint64_t)tv.tv_sec * 1000000;
	usec += (uint64_t)tv.tv_usec;
	return usec;
}

/*
 * Pick a pseudo-random delay in the inclusive range [DELAY_MIN, DELAY_MAX].
 * Both bounds come from the compiler command line and are in the same unit
 * as now(), i.e. microseconds.
 *
 * If DELAY_MAX is not greater than DELAY_MIN the result is simply DELAY_MIN.
 * The previous scaling, random() / (RAND_MAX / span), divided by zero when
 * the span was 0 or larger than RAND_MAX, and could return values slightly
 * above DELAY_MAX because the divisor was truncated.
 */
static uint64_t
random_delay(void)
{
	const uint64_t lo = (uint64_t)DELAY_MIN;
	const uint64_t hi = (uint64_t)DELAY_MAX;

	if (hi <= lo)
		return lo;

	/* hi > lo, so the modulus is at least 2 and never zero. */
	return lo + (uint64_t)random() % (hi - lo + 1);
}

/*
 * Driver for the inotify stress test.
 *
 * Creates NPROC files named "000.tmp", "001.tmp", ... and spawns one watcher
 * per file.  Each file gets a deadline of now() + random_delay(); whenever
 * the earliest deadline expires the file is touched (which should wake its
 * watcher), a fresh watcher is spawned on it and a new deadline is drawn.
 *
 * The loop only ends on error; in that case every file is touched once more
 * so that watchers still blocked get an event, and EXIT_FAILURE is returned.
 */
int
main(void)
{
	unsigned int i;
	struct proc *next;

	/*
	 * Ignore SIGCHLD so that exited watchers need not be waited for.
	 * signal() returns the previous disposition, which may legitimately
	 * be non-null (e.g. an inherited SIG_IGN), so only SIG_ERR is an
	 * error.
	 */
	if (signal(SIGCHLD, SIG_IGN) == SIG_ERR) {
		log("Error setting SIGCHLD handler: %s\n",
				strerror(errno));
		return EXIT_FAILURE;
	}

	for (i = 0; i < NPROC; i++) {
		(void)snprintf(procs[i].filename, sizeof(procs[i].filename),
				"%03u.tmp", i);
		if (touch(procs[i].filename, O_WRONLY | O_CREAT)) {
			log("Error creating file '%s': %s\n",
					procs[i].filename, strerror(errno));
			return EXIT_FAILURE;
		}
	}

	/* Start all watchers and remember which one is due first. */
	next = &procs[0];
	for (i = 0; i < NPROC; i++) {
		if (spawn_watcher(procs[i].filename) < 0)
			goto error;

		procs[i].deadline = now() + random_delay();
		if (procs[i].deadline < next->deadline)
			next = &procs[i];
	}

	while (1) {
		uint64_t n = now();

		/* Not due yet: sleep for the remainder and re-check. */
		if (next->deadline > n) {
			usleep(next->deadline - n);
			continue;
		}

		if (touch(next->filename, O_WRONLY))
			goto error;

		if (spawn_watcher(next->filename) < 0)
			goto error;

		/* Re-arm this entry, then pick the earliest deadline overall. */
		next->deadline = now() + random_delay();
		for (i = 0; i < NPROC; i++) {
			if (procs[i].deadline < next->deadline)
				next = &procs[i];
		}
	}

error:
	/* Best effort: give every remaining watcher an event to exit on. */
	for (i = 0; i < NPROC; i++)
		(void)touch(procs[i].filename, O_WRONLY);

	return EXIT_FAILURE;
}

[-- Attachment #4: Makefile --]
[-- Type: text/plain, Size: 362 bytes --]

# Build-time parameters handed to test.c as preprocessor definitions:
# NPROC is the number of files/watchers; DELAY_MIN and DELAY_MAX bound the
# random delay (in microseconds) before a watched file is touched.
NPROC     = 20
DELAY_MIN = 1000000
DELAY_MAX = 2000000

CC = gcc
CFLAGS ?= -O2 -pipe -g
CFLAGS += -std=gnu99 -Wall -Wextra -pedantic

# "all" and "clean" do not name files; declare them phony so a stray file
# with one of those names can never make the target look up to date.
.PHONY: all clean

all: test watcher

test: test.c
	$(CC) $(CFLAGS) -DNPROC=$(NPROC) -DDELAY_MIN=$(DELAY_MIN) -DDELAY_MAX=$(DELAY_MAX) $(LDFLAGS) $< -o $@

watcher: watcher.c
	$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@

# Also removes the NNN.tmp files created by a run of ./test.
clean:
	rm -rf test watcher *.tmp

                 reply	other threads:[~2012-06-19 14:00 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4FE083CE.20908@one.com \
    --to=erb@one.com \
    --cc=as@one.com \
    --cc=eparis@parisplace.org \
    --cc=john@johnmccutchan.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=nyerup@one.com \
    --cc=rlove@rlove.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.