From: Chris Mason <clm@fb.com>
To: xfs@oss.sgi.com, Dave Chinner <david@fromorbit.com>,
Eric Sandeen <sandeen@redhat.com>
Subject: [PATCH] xfs: don't zero partial page cache pages during O_DIRECT
Date: Fri, 8 Aug 2014 10:35:38 -0400 [thread overview]
Message-ID: <53E4E03A.7050101@fb.com> (raw)
XFS is using truncate_pagecache_range to invalidate the page cache
during DIO reads. This is different from the other filesystems, which only
invalidate pages during DIO writes.
truncate_pagecache_range is meant to be used when we are freeing the
underlying data structs from disk, so it will zero any partial ranges
in the page. This means a DIO read can zero out part of the page cache
page, and it is possible the page will stay in cache.
Buffered reads will then find an up-to-date page with zeros instead of the
data actually on disk.
This patch fixes things by leaving the page cache alone during DIO
reads.
We discovered this when our buffered IO program for distributing
database indexes was finding zero filled blocks. I think writes
are broken too, but I'll leave that for a separate patch because I don't
fully understand what XFS needs to happen during a DIO write.
Test program:
/*
* gcc -Wall -o read-race read-race.c
* ./read-race filename
*/
#define _XOPEN_SOURCE 600
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/time.h>
/*
 * Fallback definition used only when the headers included above do not
 * provide O_DIRECT.
 * NOTE(review): the numeric value of O_DIRECT is architecture-specific;
 * 00040000 presumably matches x86 Linux -- confirm before building for
 * another target.
 */
#ifndef O_DIRECT
#define O_DIRECT 00040000
#endif
/* Number of bytes requested by each read() in the reader loops. */
#define READ_SIZE 512
/*
 * Size in bytes of the I/O buffer and of the data written to the test file.
 * Parenthesized so the macro expands correctly inside larger expressions
 * (e.g. "x / BUF_SIZE").
 */
#define BUF_SIZE (1024 * 1024)
/*
 * Print the command-line synopsis for this program to stderr and
 * terminate the process with exit status 1.  Does not return.
 *
 * name: program name to show in the synopsis.
 */
static void usage(char *name)
{
	const int failure_status = 1;

	fprintf(stderr, "usage: %s filename\n", name);
	exit(failure_status);
}
/*
 * Check whether the first sz bytes of buf are all zero.
 *
 * Returns 1 when no non-zero byte is found (which includes sz <= 0, since
 * nothing is examined), and 0 as soon as a non-zero byte is seen.
 */
static int all_zeros(char *buf, int sz)
{
	int pos = 0;

	while (pos < sz) {
		if (buf[pos] != 0)
			return 0;
		pos++;
	}
	return 1;
}
/*
 * Reader loop executed by each child process.
 *
 * filename: file to read from.
 * buf:      destination buffer for each read; must hold READ_SIZE bytes.
 * direct:   extra open(2) flag OR-ed into O_RDONLY (0 for buffered reads,
 *           O_DIRECT for direct reads).
 *
 * Never returns.  Exits with 0 once more than 15 seconds have elapsed,
 * 255 if a read returned only zeros, and 1 on any I/O failure.
 */
static void run_test(char *filename, char *buf, int direct)
{
	enum {
		TEST_OFFSET = 5120,	/* file offset read on every pass */
		DROP_LENGTH = 8192,	/* bytes of page cache dropped per pass */
		RUN_SECONDS = 15	/* how long the loop keeps running */
	};
	int fd;
	int ret;
	off_t pos;
	ssize_t nread;
	struct timeval start;
	struct timeval now;

	fd = open(filename, O_RDONLY | direct);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	/*
	 * seek to a 512b aligned offset in the page and then do a
	 * read.  Check the read for zeros, and if we're buffered
	 * use FADV_DONTNEED to drop the page cache.  repeat for 15 seconds
	 */
	gettimeofday(&start, NULL);
	while (1) {
		pos = lseek(fd, TEST_OFFSET, SEEK_SET);
		if (pos < 0) {
			perror("lseek");
			exit(1);
		}

		if (!direct) {
			/*
			 * posix_fadvise() returns the error number directly
			 * and does not set errno, so perror() cannot be used
			 * to report its failure.
			 */
			ret = posix_fadvise(fd, 0, DROP_LENGTH,
					    POSIX_FADV_DONTNEED);
			if (ret) {
				fprintf(stderr, "fadvise: %s\n", strerror(ret));
				exit(1);
			}
		}

		nread = read(fd, buf, READ_SIZE);
		if (nread < 0) {
			/* real I/O error: errno is meaningful here */
			perror("read");
			exit(1);
		}
		if (nread < READ_SIZE) {
			fprintf(stderr, "invalid read\n");
			exit(1);
		}

		if (all_zeros(buf, READ_SIZE)) {
			fprintf(stderr, "error: found zero range direct: %d\n",
				direct ? 1 : 0);
			exit(255);
		}

		gettimeofday(&now, NULL);
		if (now.tv_sec - start.tv_sec > RUN_SECONDS)
			exit(0);
	}
}
/*
 * Test driver.
 *
 * Takes exactly one argument, the path of the test file.  The file is
 * created (if needed) and filled with BUF_SIZE bytes of value 1, then two
 * children are forked: one running run_test() with buffered reads and one
 * with O_DIRECT reads.  The parent waits for both and exits with the
 * combined child exit status (0 on success).  Any internal failure (fork,
 * waitpid, allocation, file setup) yields a non-zero exit status.
 */
int main(int ac, char **av)
{
	int ret;
	int fd;
	ssize_t written;
	char *filename;
	char *buf;
	void *mem = NULL;
	int pagesize = sysconf(_SC_PAGESIZE);
	pid_t buffered_pid = 0;
	pid_t direct_pid = 0;
	pid_t wait_pid;
	int status = 0;
	int test_failure = 0;

	if (ac != 2)
		usage(av[0]);	/* exits, does not return */
	filename = av[1];

	/*
	 * posix_memalign() reports failure through its return value and
	 * does not set errno, so format the message with strerror().
	 */
	ret = posix_memalign(&mem, pagesize, BUF_SIZE);
	if (ret) {
		fprintf(stderr, "posix_memalign: %s\n", strerror(ret));
		exit(1);
	}
	buf = mem;

	/* step one, create our test file and fill with non-zero */
	fd = open(filename, O_WRONLY | O_CREAT, 0700);
	if (fd < 0) {
		perror("open for writing");
		exit(1);
	}
	memset(buf, 1, BUF_SIZE);
	written = write(fd, buf, BUF_SIZE);
	if (written != BUF_SIZE) {
		fprintf(stderr, "failed to fill the test file\n");
		exit(1);
	}
	close(fd);

	/* start the buffered reader */
	buffered_pid = fork();
	if (buffered_pid < 0) {
		perror("fork");
		exit(1);
	} else if (buffered_pid == 0) {
		run_test(filename, buf, 0);
		exit(0);
	}

	/* start the direct reader */
	direct_pid = fork();
	if (direct_pid < 0) {
		perror("fork");
		/* make sure the cleanup path reports the failure */
		test_failure = 1;
		goto cleanup;
	} else if (direct_pid == 0) {
		run_test(filename, buf, O_DIRECT);
		exit(0);
	}

	/* wait for buffered to finish */
	wait_pid = waitpid(buffered_pid, &status, 0);
	if (wait_pid < 0) {
		perror("waitpid buffered");
		test_failure = 1;
		goto cleanup;
	}
	/* the buffered child has been reaped; never signal its pid again */
	buffered_pid = 0;
	if (WIFEXITED(status)) {
		int code = WEXITSTATUS(status);

		printf("buffered exits with %d\n", code);
		if (code) {
			test_failure = code;
			goto cleanup;
		}
	} else {
		test_failure = 1;
	}

	/* wait for direct to finish */
	wait_pid = waitpid(direct_pid, &status, 0);
	if (wait_pid < 0) {
		perror("waitpid direct");
		test_failure = 1;
		goto cleanup;
	}
	if (WIFEXITED(status)) {
		int code = WEXITSTATUS(status);

		printf("direct exits with %d\n", code);
		test_failure |= code;
	} else {
		test_failure |= 1;
	}
	exit(test_failure);

cleanup:
	/* stop whichever children are still running before bailing out */
	if (direct_pid > 0)
		kill(direct_pid, SIGTERM);
	if (buffered_pid > 0)
		kill(buffered_pid, SIGTERM);
	exit(test_failure);
}
Signed-off-by: Chris Mason <clm@fb.com>
cc: stable@vger.kernel.org
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f66779..8d25d98 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -295,7 +295,11 @@ xfs_file_read_iter(
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
- truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+ /* we don't remove any pages here. A direct read
+ * does not invalidate any contents of the page
+ * cache
+ */
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
next reply other threads:[~2014-08-08 14:35 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-08-08 14:35 Chris Mason [this message]
2014-08-08 15:17 ` [PATCH] xfs: don't zero partial page cache pages during O_DIRECT Chris Mason
2014-08-08 16:04 ` [PATCH RFC] xfs: use invalidate_inode_pages2_range for DIO writes Chris Mason
2014-08-09 0:48 ` Dave Chinner
2014-08-09 2:42 ` Chris Mason
2014-08-08 20:39 ` [PATCH] xfs: don't zero partial page cache pages during O_DIRECT Brian Foster
2014-08-09 0:36 ` Dave Chinner
2014-08-09 2:32 ` Chris Mason
2014-08-09 3:19 ` Eric Sandeen
2014-08-09 4:17 ` Dave Chinner
2014-08-09 12:57 ` [PATCH v2] " Chris Mason
2014-08-11 13:29 ` Brian Foster
2014-08-12 1:17 ` Dave Chinner
2014-08-19 19:24 ` Chris Mason
2014-08-19 22:35 ` Dave Chinner
2014-08-20 1:54 ` Chris Mason
2014-08-20 2:19 ` Dave Chinner
2014-08-20 2:36 ` Dave Chinner
2014-08-20 4:41 ` Dave Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=53E4E03A.7050101@fb.com \
--to=clm@fb.com \
--cc=david@fromorbit.com \
--cc=sandeen@redhat.com \
--cc=xfs@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox