From: Olaf Kirch <okir@suse.de>
To: nfs@lists.sourceforge.net
Cc: Andrea Arcangeli <andrea@suse.de>
Subject: Problems with mmap consistency
Date: Fri, 17 Feb 2006 11:57:56 +0100 [thread overview]
Message-ID: <20060217105756.GE25707@suse.de> (raw)
[-- Attachment #1: Type: text/plain, Size: 1814 bytes --]
Hi,
we are currently investigating a customer problem with NFS mmap
consistency. They have an application (binary only, oh joy :) that
makes heavy use of a shared mmapped file (it's basically used as a work
queue for a local service).
They experienced frequent data corruption on the file. What happens is
that they establish a lock when updating a portion of the file, which
invalidates all cached data. Now until recently, nfs_revalidate_inode did
not make any attempt to flush out dirty mmapped pages. Andrea Arcangeli
recently submitted a fix to mainline with changes __nfs_revalidate_inode
to do this:
/* Do the page cache invalidation */
if (flags & NFS_INO_INVALID_DATA) {
if (S_ISREG(inode->i_mode)) {
if (mapping_mapped(inode->i_mapping))
unmap_mapping_range(inode->i_mapping, 0, -1, 0);
filemap_write_and_wait(inode->i_mapping);
nfs_wb_all(inode);
}
nfsi->flags &= ~NFS_INO_INVALID_DATA;
invalidate_inode_pages2(inode->i_mapping);
This improved the customer's situation slightly, but they are still
seeing corruption. I put together a test case that demonstrates this
problem within a few seconds.
It seems the problem is that there is still a fairly big race window
in this code - after unmap_mapping_range returns, we're waiting for
all dirty pages to be flushed to the server. A lot can happen while
we wait - for instance, some other process may modify a page that
has already been written. This modification will be lost during
the subsequent invalidate_inode_pages2 call.
Is there a simple VM mechanism that could be used to prevent this,
or does it need some extra logic to block a process in nfs_readpage
while we're revalidating?
Olaf
--
Olaf Kirch | --- o --- Nous sommes du soleil we love when we play
okir@suse.de | / | \ sol.dhoop.naytheet.ah kin.ir.samse.qurax
[-- Attachment #2: mmapper.c --]
[-- Type: text/plain, Size: 6902 bytes --]
/*
* Test NFS mmap consistency
*
* Run as "./mmapper -u /nfsdir/somefile"
*
* Copyright (C) 2005 Olaf Kirch <okir@suse.de>
*/
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/file.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <stdint.h>
#include <signal.h>
#include <stdarg.h>
#include <errno.h>
#include <time.h>
static void handler(int);
static void setup(const char *, size_t);
static pid_t do_master(const char *, size_t);
static pid_t do_slave(unsigned int, const char *, size_t);
static void usage(int);
#define MAXPROCS 64
#define PROCSPACE 1024
static int opt_procs = 10;
static int opt_timeout = 0;
static int opt_lock = 1;
static int opt_sleep = 100; /* ms */
static int opt_unmap = 0;
static int child_died = 0;
struct procinfo {
uint32_t pid;
uint32_t time;
uint32_t index;
uint32_t done;
};
int
main(int argc, char **argv)
{
char * opt_filename = NULL;
size_t opt_filesize = MAXPROCS * PROCSPACE;
pid_t children[MAXPROCS], pid;
int c, n, status, failed = 0;
while ((c = getopt(argc, argv, "hln:t:u")) != EOF) {
switch (c) {
case 'h':
usage(0);
case 'l':
opt_lock = !opt_lock;
break;
case 'n':
opt_procs = atoi(optarg);
if (opt_procs < 2) {
fprintf(stderr, "nprocs must be >= 2\n");
usage(1);
}
break;
case 't':
opt_timeout = atoi(optarg);
break;
case 'u':
opt_unmap = 1;
break;
default:
usage(1);
}
}
if (optind + 1 != argc)
usage(1);
opt_filename = argv[optind++];
setup(opt_filename, opt_filesize);
for (n = 1; n < opt_procs; ++n)
children[n] = do_slave(n, opt_filename, opt_filesize);
children[0] = do_master(opt_filename, opt_filesize);
signal(SIGCHLD, handler);
if (opt_timeout) {
sleep(opt_timeout);
pid = 0;
} else {
/* Just wait for any status */
pid = wait(&status);
}
for (n = 0; n < opt_procs; ++n)
kill(children[n], SIGTERM);
do {
if (pid && WIFEXITED(status) && WEXITSTATUS(status) != 0) {
fprintf(stderr,
"Subprocess exit code %u - test failed\n",
WEXITSTATUS(status));
failed++;
}
pid = wait(&status);
} while (pid > 0);
if (!failed)
printf("Test completed successfully\n");
return failed? 1 : 0;
}
void
handler(int dummy)
{
child_died++;
}
void
usage(int error)
{
fprintf(error? stderr : stdout,
"usage: mmapper [-n numprocs] [-h] mapfile\n");
exit(error);
}
static void
fatal(const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
va_end(ap);
exit(1);
}
static void
setup(const char *filename, size_t size)
{
int fd;
if ((fd = open(filename, O_RDWR|O_CREAT, 0644)) < 0)
fatal("Unable to open %s: %m\n", filename);
/* Wipe the file */
if (ftruncate(fd, 0) < 0
|| ftruncate(fd, size) < 0)
fatal("%s: %m\n", filename);
close(fd);
}
/*
* Map/unmap the file
*/
static void *
map_it(const char *filename, size_t size, int *fdp)
{
unsigned char *addr = NULL;
int fd;
if ((fd = open(filename, O_RDWR, 0644)) < 0)
fatal("Unable to open %s: %m\n", filename);
addr = mmap(NULL, size, PROT_WRITE|PROT_READ, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED)
fatal("Unable to map %s: %m\n", filename);
*fdp = fd;
return addr;
}
static void
unmap_it(unsigned char *addr, size_t size, int fd)
{
munmap(addr, size);
close(fd);
}
/*
* Lock/unlock the file
*/
static void
lock_it(int fd, struct flock *flp, off_t start, size_t len)
{
flp->l_type = F_WRLCK;
flp->l_whence = SEEK_SET;
flp->l_start = start;
flp->l_len = len;
if (fcntl(fd, F_SETLKW, flp) < 0)
fatal("fcntl(F_SETLKW): %m\n");
}
static void
unlock_it(int fd, struct flock *flp)
{
flp->l_type = F_UNLCK;
if (fcntl(fd, F_SETLKW, flp) < 0)
fatal("fcntl(F_SETLKW): %m\n");
}
/*
* Frob the per-process data in the file
*/
#define proc_offset(n) ((n) * PROCSPACE)
static inline struct procinfo *
get_procinfo(unsigned char *addr, unsigned int n)
{
return (struct procinfo *) (addr + proc_offset(n));
}
static void
init_procinfo(struct procinfo *info)
{
info->pid = getpid();
info->time = time(NULL);
info->index = 0;
info->done = 0;
}
static void
check_procinfo(struct procinfo *info, const char *name, struct procinfo *old)
{
uint32_t now;
now = time(NULL);
if (info->time != old->time
|| info->pid != old->pid
|| info->index != old->index) {
fprintf(stderr,
"\nError: process %s: data is stale\n"
"----------------------------------\n"
" Got Expected\n"
"----------------------------------\n"
"Index: %10u %10u\n"
"PID: %10u %10u\n"
"Done: %10u %10u%s\n"
"Time: %10u %10u\n",
name,
info->index, old->index,
info->pid, old->pid,
info->done, old->done,
(info->done != old->done)? " (ignored)" : "",
info->time, old->time);
exit(1);
}
info->time = now;
info->index++;
info->done = 0;
*old = *info;
}
pid_t
do_master(const char *filename, size_t size)
{
int fd;
unsigned char *addr = NULL;
struct procinfo *info, previous;
pid_t pid;
pid = fork();
if (pid < 0)
fatal("Unable to fork master: %m\n");
if (pid != 0) {
close(fd);
return pid;
}
addr = map_it(filename, size, &fd);
info = get_procinfo(addr, 0);
init_procinfo(info);
previous = *info;
while (1) {
struct flock fl;
unsigned int n;
write(1, ".", 1);
lock_it(fd, &fl, 0, 0);
check_procinfo(info, "master", &previous);
/* Loop over all slaves and mark their "jobs"
* as done. Doesn't really matter what we do
* here as long as we do something to their mapped
* data. */
for (n = 1; n < opt_procs; ++n) {
struct procinfo *clnt;
clnt = get_procinfo(addr, n);
clnt->done = 1;
}
msync(addr, size, MS_SYNC);
unlock_it(fd, &fl);
usleep(opt_sleep * 1000);
}
unmap_it(addr, size, fd);
}
pid_t
do_slave(unsigned int slot, const char *filename, size_t size)
{
char procname[16];
int fd;
unsigned char *addr = NULL;
struct procinfo *info, previous;
pid_t pid;
sprintf(procname, "slave%u", slot);
pid = fork();
if (pid < 0)
fatal("Unable to fork %s: %m\n", procname);
if (pid != 0) {
close(fd);
return pid;
}
addr = map_it(filename, size, &fd);
info = get_procinfo(addr, slot);
init_procinfo(info);
previous = *info;
if (opt_unmap)
unmap_it(addr, size, fd);
printf("Start %s, offset %u, pid %u\n", procname, proc_offset(slot), getpid());
while (1) {
struct flock fl;
if (opt_unmap) {
addr = map_it(filename, size, &fd);
info = get_procinfo(addr, slot);
}
lock_it(fd, &fl, proc_offset(slot), PROCSPACE);
check_procinfo(info, procname, &previous);
msync(addr, size, MS_SYNC);
unlock_it(fd, &fl);
if (opt_unmap)
unmap_it(addr, size, fd);
usleep(opt_sleep * 1000);
}
if (!opt_unmap)
unmap_it(addr, size, fd);
}
next reply other threads:[~2006-02-17 10:58 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-02-17 10:57 Olaf Kirch [this message]
2006-02-17 15:15 ` Problems with mmap consistency Trond Myklebust
2006-02-24 4:01 ` Andrea Arcangeli
2006-02-24 6:15 ` Neil Brown
2006-02-24 16:08 ` Andrea Arcangeli
2006-02-24 23:39 ` Andrew Morton
2006-02-25 0:33 ` Andrea Arcangeli
2006-02-25 0:59 ` Trond Myklebust
2006-02-25 1:17 ` Andrew Morton
2006-02-25 4:59 ` Andrea Arcangeli
2006-02-25 14:55 ` Trond Myklebust
2006-02-25 17:27 ` Andrea Arcangeli
2006-02-25 18:42 ` Trond Myklebust
2006-02-27 0:26 ` Neil Brown
2006-02-28 1:04 ` Andrea Arcangeli
2006-02-26 22:33 ` Neil Brown
2006-02-24 7:12 ` Olaf Kirch
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060217105756.GE25707@suse.de \
--to=okir@suse.de \
--cc=andrea@suse.de \
--cc=nfs@lists.sourceforge.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox