All of lore.kernel.org
 help / color / mirror / Atom feed
From: Olaf Kirch <okir@suse.de>
To: nfs@lists.sourceforge.net
Cc: Andrea Arcangeli <andrea@suse.de>
Subject: Problems with mmap consistency
Date: Fri, 17 Feb 2006 11:57:56 +0100	[thread overview]
Message-ID: <20060217105756.GE25707@suse.de> (raw)

[-- Attachment #1: Type: text/plain, Size: 1814 bytes --]

Hi,

we are currently investigating a customer problem with NFS mmap
consistency. They have an application (binary only, oh joy :) that
makes heavy use of a shared mmapped file (it's basically used as a work
queue for a local service).

They experienced frequent data corruption on the file. What happens is
that they establish a lock when updating a portion of the file, which
invalidates all cached data. Now until recently, nfs_revalidate_inode did
not make any attempt to flush out dirty mmapped pages. Andrea Arcangeli
recently submitted a fix to mainline with changes __nfs_revalidate_inode
to do this:

/* Do the page cache invalidation */
if (flags & NFS_INO_INVALID_DATA) {
	if (S_ISREG(inode->i_mode)) {
		if (mapping_mapped(inode->i_mapping))
			unmap_mapping_range(inode->i_mapping, 0, -1, 0);
		filemap_write_and_wait(inode->i_mapping);
		nfs_wb_all(inode);
	}
	nfsi->flags &= ~NFS_INO_INVALID_DATA;
	invalidate_inode_pages2(inode->i_mapping);

This improved the customer's situation slightly, but they are still
seeing corruption. I put together a test case that demonstrates this
problem within a few seconds.

It seems the problem is that there is still a fairly big race window
in this code - after unmap_mapping_range returns, we're waiting for
all dirty pages to be flushed to the server. A lot can happen while
we wait - for instance, some other process may modify a page that
has already been written. This modification will be lost during
the subsequent invalidate_inode_pages2 call.

Is there a simple VM mechanism that could be used to prevent this,
or does it need some extra logic to block a process in nfs_readpage
while we're revalidating?

Olaf
-- 
Olaf Kirch   |  --- o --- Nous sommes du soleil we love when we play
okir@suse.de |    / | \   sol.dhoop.naytheet.ah kin.ir.samse.qurax

[-- Attachment #2: mmapper.c --]
[-- Type: text/plain, Size: 6902 bytes --]

/*
 * Test NFS mmap consistency
 *
 * Run as "./mmapper -u /nfsdir/somefile"
 *
 * Copyright (C) 2005 Olaf Kirch <okir@suse.de>
 */

#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/file.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <stdint.h>
#include <signal.h>
#include <stdarg.h>
#include <errno.h>
#include <time.h>

static void	handler(int);
static void	setup(const char *, size_t);
static pid_t	do_master(const char *, size_t);
static pid_t	do_slave(unsigned int, const char *, size_t);
static void	usage(int);

#define MAXPROCS	64
#define PROCSPACE	1024

static int	opt_procs = 10;
static int	opt_timeout = 0;
static int	opt_lock = 1;
static int	opt_sleep = 100; /* ms */
static int	opt_unmap = 0;
static int	child_died = 0;

struct procinfo {
	uint32_t	pid;
	uint32_t	time;
	uint32_t	index;
	uint32_t	done;
};

int
main(int argc, char **argv)
{
	char *	opt_filename = NULL;
	size_t	opt_filesize = MAXPROCS * PROCSPACE;
	pid_t	children[MAXPROCS], pid;
	int	c, n, status, failed = 0;

	while ((c = getopt(argc, argv, "hln:t:u")) != EOF) {
		switch (c) {
		case 'h':
			usage(0);
		case 'l':
			opt_lock = !opt_lock;
			break;
		case 'n':
			opt_procs = atoi(optarg);
			if (opt_procs < 2) {
				fprintf(stderr, "nprocs must be >= 2\n");
				usage(1);
			}
			break;
		case 't':
			opt_timeout = atoi(optarg);
			break;
		case 'u':
			opt_unmap = 1;
			break;
		default:
			usage(1);
		}
	}

	if (optind + 1 != argc)
		usage(1);
	opt_filename = argv[optind++];

	setup(opt_filename, opt_filesize);

	for (n = 1; n < opt_procs; ++n)
		children[n] = do_slave(n, opt_filename, opt_filesize);
	children[0] = do_master(opt_filename, opt_filesize);

	signal(SIGCHLD, handler);
	if (opt_timeout) {
		sleep(opt_timeout);
		pid = 0;
	} else {
		/* Just wait for any status */
		pid = wait(&status);
	}

	for (n = 0; n < opt_procs; ++n)
		kill(children[n], SIGTERM);

	do {
		if (pid && WIFEXITED(status) && WEXITSTATUS(status) != 0) {
			fprintf(stderr,
				"Subprocess exit code %u - test failed\n",
				WEXITSTATUS(status));
			failed++;
		}
		pid = wait(&status);
	} while (pid > 0);

	if (!failed)
		printf("Test completed successfully\n");

	return failed? 1 : 0;
}

void
handler(int dummy)
{
	child_died++;
}

void
usage(int error)
{
	fprintf(error? stderr : stdout,
		"usage: mmapper [-n numprocs] [-h] mapfile\n");
	exit(error);
}

static void
fatal(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	exit(1);
}

static void
setup(const char *filename, size_t size)
{
	int	fd;

	if ((fd = open(filename, O_RDWR|O_CREAT, 0644)) < 0)
		fatal("Unable to open %s: %m\n", filename);

	/* Wipe the file */
	if (ftruncate(fd, 0) < 0
	 || ftruncate(fd, size) < 0)
		fatal("%s: %m\n", filename);

	close(fd);
}

/*
 * Map/unmap the file
 */
static void *
map_it(const char *filename, size_t size, int *fdp)
{
	unsigned char	*addr = NULL;
	int		fd;

	if ((fd = open(filename, O_RDWR, 0644)) < 0)
		fatal("Unable to open %s: %m\n", filename);

	addr = mmap(NULL, size, PROT_WRITE|PROT_READ, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED)
		fatal("Unable to map %s: %m\n", filename);

	*fdp = fd;
	return addr;
}

static void
unmap_it(unsigned char *addr, size_t size, int fd)
{
	munmap(addr, size);
	close(fd);
}

/*
 * Lock/unlock the file
 */
static void
lock_it(int fd, struct flock *flp, off_t start, size_t len)
{
	flp->l_type = F_WRLCK;
	flp->l_whence = SEEK_SET;
	flp->l_start = start;
	flp->l_len = len;
	if (fcntl(fd, F_SETLKW, flp) < 0)
		fatal("fcntl(F_SETLKW): %m\n");
}

static void
unlock_it(int fd, struct flock *flp)
{
	flp->l_type = F_UNLCK;
	if (fcntl(fd, F_SETLKW, flp) < 0)
		fatal("fcntl(F_SETLKW): %m\n");
}

/*
 * Frob the per-process data in the file
 */
#define proc_offset(n)		((n) * PROCSPACE)

static inline struct procinfo *
get_procinfo(unsigned char *addr, unsigned int n)
{
	return (struct procinfo *) (addr + proc_offset(n));
}

static void
init_procinfo(struct procinfo *info)
{
	info->pid = getpid();
	info->time = time(NULL);
	info->index = 0;
	info->done = 0;
}

static void
check_procinfo(struct procinfo *info, const char *name, struct procinfo *old)
{
	uint32_t	now;

	now = time(NULL);
	if (info->time != old->time
	 || info->pid != old->pid
	 || info->index != old->index) {
		fprintf(stderr,
			"\nError: process %s: data is stale\n"
			"----------------------------------\n"
			"                   Got    Expected\n"
			"----------------------------------\n"
			"Index:      %10u  %10u\n"
			"PID:        %10u  %10u\n"
			"Done:       %10u  %10u%s\n"
			"Time:       %10u  %10u\n",
			name,
			info->index, old->index,
			info->pid, old->pid,
			info->done, old->done,
				(info->done != old->done)? " (ignored)" : "",
			info->time, old->time);
		exit(1);
	 }

	info->time = now;
	info->index++;
	info->done = 0;

	*old = *info;
}

pid_t
do_master(const char *filename, size_t size)
{
	int		fd;
	unsigned char	*addr = NULL;
	struct procinfo	*info, previous;
	pid_t		pid;

	pid = fork();
	if (pid < 0)
		fatal("Unable to fork master: %m\n");
	if (pid != 0) {
		close(fd);
		return pid;
	}

	addr = map_it(filename, size, &fd);
	info = get_procinfo(addr, 0);
	init_procinfo(info);
	previous = *info;

	while (1) {
		struct flock	fl;
		unsigned int	n;

		write(1, ".", 1);

		lock_it(fd, &fl, 0, 0);

		check_procinfo(info, "master", &previous);

		/* Loop over all slaves and mark their "jobs"
		 * as done. Doesn't really matter what we do
		 * here as long as we do something to their mapped
		 * data. */
		for (n = 1; n < opt_procs; ++n) {
			struct procinfo *clnt;

			clnt = get_procinfo(addr, n);
			clnt->done = 1;
		}

		msync(addr, size, MS_SYNC);
		unlock_it(fd, &fl);

		usleep(opt_sleep * 1000);
	}

	unmap_it(addr, size, fd);
}

pid_t
do_slave(unsigned int slot, const char *filename, size_t size)
{
	char		procname[16];
	int		fd;
	unsigned char	*addr = NULL;
	struct procinfo	*info, previous;
	pid_t		pid;

	sprintf(procname, "slave%u", slot);

	pid = fork();
	if (pid < 0)
		fatal("Unable to fork %s: %m\n", procname);
	if (pid != 0) {
		close(fd);
		return pid;
	}

	addr = map_it(filename, size, &fd);
	info = get_procinfo(addr, slot);
	init_procinfo(info);
	previous = *info;
	if (opt_unmap)
		unmap_it(addr, size, fd);

	printf("Start %s, offset %u, pid %u\n", procname, proc_offset(slot), getpid());

	while (1) {
		struct flock	fl;

		if (opt_unmap) {
			addr = map_it(filename, size, &fd);
			info = get_procinfo(addr, slot);
		}

		lock_it(fd, &fl, proc_offset(slot), PROCSPACE);

		check_procinfo(info, procname, &previous);

		msync(addr, size, MS_SYNC);
		unlock_it(fd, &fl);

		if (opt_unmap)
			unmap_it(addr, size, fd);

		usleep(opt_sleep * 1000);
	}
	if (!opt_unmap)
		unmap_it(addr, size, fd);
}

             reply	other threads:[~2006-02-17 10:58 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-02-17 10:57 Olaf Kirch [this message]
2006-02-17 15:15 ` Problems with mmap consistency Trond Myklebust
2006-02-24  4:01   ` Andrea Arcangeli
2006-02-24  6:15     ` Neil Brown
2006-02-24 16:08       ` Andrea Arcangeli
2006-02-24 23:39         ` Andrew Morton
2006-02-25  0:33           ` Andrea Arcangeli
2006-02-25  0:59             ` Trond Myklebust
2006-02-25  1:17               ` Andrew Morton
2006-02-25  4:59                 ` Andrea Arcangeli
2006-02-25 14:55                   ` Trond Myklebust
2006-02-25 17:27                     ` Andrea Arcangeli
2006-02-25 18:42                       ` Trond Myklebust
2006-02-27  0:26                         ` Neil Brown
2006-02-28  1:04                         ` Andrea Arcangeli
2006-02-26 22:33         ` Neil Brown
2006-02-24  7:12     ` Olaf Kirch

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060217105756.GE25707@suse.de \
    --to=okir@suse.de \
    --cc=andrea@suse.de \
    --cc=nfs@lists.sourceforge.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.