From: "Maxim V. Patlasov" <MPatlasov-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
To: miklos-sUDqSbJrdHQHWmgEVkV9KA@public.gmane.org
Cc: dev-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org,
xemul-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org,
fuse-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org,
linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
jbottomley-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
devel-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org
Subject: [PATCH 12/14] fuse: Fix O_DIRECT operations vs cached writeback misorder - v2
Date: Mon, 01 Apr 2013 14:42:37 +0400 [thread overview]
Message-ID: <20130401104233.19027.2301.stgit@maximpc.sw.ru> (raw)
In-Reply-To: <20130401103749.19027.89833.stgit-vWG5eQQidJHciZdyczg/7Q@public.gmane.org>
The problem is:
1. write cached data to a file
2. read directly from the same file (via another fd)
The 2nd operation may read stale data, i.e. the one that was in a file
before the 1st op. Problem is in how fuse manages writeback.
When direct op occurs the core kernel code calls filemap_write_and_wait
to flush all the cached ops in flight. But fuse acks the writeback right
after the ->writepages callback exits w/o waiting for the real write to
happen. Thus the subsequent direct op proceeds while the real writeback
is still in flight. This is a problem for backends that reorder operation.
Fix this by making the fuse direct IO callback explicitly wait on the
in-flight writeback to finish.
Changed in v2:
- do not wait on writeback if fuse_direct_io() call came from
CUSE (because it doesn't use fuse inodes)
Original patch by: Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
Signed-off-by: Maxim Patlasov <MPatlasov-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
---
fs/fuse/cuse.c | 5 +++--
fs/fuse/file.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
fs/fuse/fuse_i.h | 13 ++++++++++++-
3 files changed, 62 insertions(+), 5 deletions(-)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 6f96a8d..fb63185 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -93,7 +93,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
loff_t pos = 0;
struct iovec iov = { .iov_base = buf, .iov_len = count };
- return fuse_direct_io(file, &iov, 1, count, &pos, 0);
+ return fuse_direct_io(file, &iov, 1, count, &pos, FUSE_DIO_CUSE);
}
static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -106,7 +106,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
* No locking or generic_write_checks(), the server is
* responsible for locking and sanity checks.
*/
- return fuse_direct_io(file, &iov, 1, count, &pos, 1);
+ return fuse_direct_io(file, &iov, 1, count, &pos,
+ FUSE_DIO_WRITE | FUSE_DIO_CUSE);
}
static int cuse_open(struct inode *inode, struct file *file)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7c24f6b..14880bb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -338,6 +338,31 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
return (u64) v0 + ((u64) v1 << 32);
}
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+ pgoff_t idx_to)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_req *req;
+ bool found = false;
+
+ spin_lock(&fc->lock);
+ list_for_each_entry(req, &fi->writepages, writepages_entry) {
+ pgoff_t curr_index;
+
+ BUG_ON(req->inode != inode);
+ curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ if (!(idx_from >= curr_index + req->num_pages ||
+ idx_to < curr_index)) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&fc->lock);
+
+ return found;
+}
+
/*
* Check if page is under writeback
*
@@ -382,6 +407,19 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
return 0;
}
+static void fuse_wait_on_writeback(struct inode *inode, pgoff_t start,
+ size_t bytes)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ pgoff_t idx_from, idx_to;
+
+ idx_from = start >> PAGE_CACHE_SHIFT;
+ idx_to = (start + bytes - 1) >> PAGE_CACHE_SHIFT;
+
+ wait_event(fi->page_waitq,
+ !fuse_range_is_writeback(inode, idx_from, idx_to));
+}
+
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file->f_path.dentry->d_inode;
@@ -1248,8 +1286,10 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
unsigned long nr_segs, size_t count, loff_t *ppos,
- int write)
+ int flags)
{
+ int write = flags & FUSE_DIO_WRITE;
+ int cuse = flags & FUSE_DIO_CUSE;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
@@ -1274,6 +1314,10 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
break;
}
+ if (!cuse)
+ fuse_wait_on_writeback(file->f_mapping->host, pos,
+ nbytes);
+
if (write)
nres = fuse_send_write(req, file, pos, nbytes, owner);
else
@@ -1342,7 +1386,8 @@ static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov,
res = generic_write_checks(file, ppos, &count, 0);
if (!res) {
- res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1);
+ res = fuse_direct_io(file, iov, nr_segs, count, ppos,
+ FUSE_DIO_WRITE);
if (res > 0) {
struct fuse_inode *fi = get_fuse_inode(inode);
fuse_write_update_size(inode, *ppos);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ddf482b..f54d669 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -828,9 +828,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE (1 << 1)
+
ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
unsigned long nr_segs, size_t count, loff_t *ppos,
- int write);
+ int flags);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
long fuse_ioctl_common(struct file *file, unsigned int cmd,
------------------------------------------------------------------------------
Own the Future-Intel® Level Up Game Demo Contest 2013
Rise to greatness in Intel's independent game demo contest.
Compete for recognition, cash, and the chance to get your game
on Steam. $5K grand prize plus 10 genre and skill prizes.
Submit your demo by 6/6/13. http://p.sf.net/sfu/intel_levelupd2d
WARNING: multiple messages have this Message-ID (diff)
From: "Maxim V. Patlasov" <MPatlasov@parallels.com>
To: miklos@szeredi.hu
Cc: dev@parallels.com, xemul@parallels.com,
fuse-devel@lists.sourceforge.net, linux-kernel@vger.kernel.org,
jbottomley@parallels.com, viro@zeniv.linux.org.uk,
linux-fsdevel@vger.kernel.org, devel@openvz.org
Subject: [PATCH 12/14] fuse: Fix O_DIRECT operations vs cached writeback misorder - v2
Date: Mon, 01 Apr 2013 14:42:37 +0400 [thread overview]
Message-ID: <20130401104233.19027.2301.stgit@maximpc.sw.ru> (raw)
In-Reply-To: <20130401103749.19027.89833.stgit@maximpc.sw.ru>
The problem is:
1. write cached data to a file
2. read directly from the same file (via another fd)
The 2nd operation may read stale data, i.e. the one that was in a file
before the 1st op. Problem is in how fuse manages writeback.
When direct op occurs the core kernel code calls filemap_write_and_wait
to flush all the cached ops in flight. But fuse acks the writeback right
after the ->writepages callback exits w/o waiting for the real write to
happen. Thus the subsequent direct op proceeds while the real writeback
is still in flight. This is a problem for backends that reorder operation.
Fix this by making the fuse direct IO callback explicitly wait on the
in-flight writeback to finish.
Changed in v2:
- do not wait on writeback if fuse_direct_io() call came from
CUSE (because it doesn't use fuse inodes)
Original patch by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Maxim Patlasov <MPatlasov@parallels.com>
---
fs/fuse/cuse.c | 5 +++--
fs/fuse/file.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
fs/fuse/fuse_i.h | 13 ++++++++++++-
3 files changed, 62 insertions(+), 5 deletions(-)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 6f96a8d..fb63185 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -93,7 +93,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
loff_t pos = 0;
struct iovec iov = { .iov_base = buf, .iov_len = count };
- return fuse_direct_io(file, &iov, 1, count, &pos, 0);
+ return fuse_direct_io(file, &iov, 1, count, &pos, FUSE_DIO_CUSE);
}
static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -106,7 +106,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
* No locking or generic_write_checks(), the server is
* responsible for locking and sanity checks.
*/
- return fuse_direct_io(file, &iov, 1, count, &pos, 1);
+ return fuse_direct_io(file, &iov, 1, count, &pos,
+ FUSE_DIO_WRITE | FUSE_DIO_CUSE);
}
static int cuse_open(struct inode *inode, struct file *file)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7c24f6b..14880bb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -338,6 +338,31 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
return (u64) v0 + ((u64) v1 << 32);
}
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+ pgoff_t idx_to)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_req *req;
+ bool found = false;
+
+ spin_lock(&fc->lock);
+ list_for_each_entry(req, &fi->writepages, writepages_entry) {
+ pgoff_t curr_index;
+
+ BUG_ON(req->inode != inode);
+ curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ if (!(idx_from >= curr_index + req->num_pages ||
+ idx_to < curr_index)) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&fc->lock);
+
+ return found;
+}
+
/*
* Check if page is under writeback
*
@@ -382,6 +407,19 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
return 0;
}
+static void fuse_wait_on_writeback(struct inode *inode, pgoff_t start,
+ size_t bytes)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ pgoff_t idx_from, idx_to;
+
+ idx_from = start >> PAGE_CACHE_SHIFT;
+ idx_to = (start + bytes - 1) >> PAGE_CACHE_SHIFT;
+
+ wait_event(fi->page_waitq,
+ !fuse_range_is_writeback(inode, idx_from, idx_to));
+}
+
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file->f_path.dentry->d_inode;
@@ -1248,8 +1286,10 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
unsigned long nr_segs, size_t count, loff_t *ppos,
- int write)
+ int flags)
{
+ int write = flags & FUSE_DIO_WRITE;
+ int cuse = flags & FUSE_DIO_CUSE;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
@@ -1274,6 +1314,10 @@ ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
break;
}
+ if (!cuse)
+ fuse_wait_on_writeback(file->f_mapping->host, pos,
+ nbytes);
+
if (write)
nres = fuse_send_write(req, file, pos, nbytes, owner);
else
@@ -1342,7 +1386,8 @@ static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov,
res = generic_write_checks(file, ppos, &count, 0);
if (!res) {
- res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1);
+ res = fuse_direct_io(file, iov, nr_segs, count, ppos,
+ FUSE_DIO_WRITE);
if (res > 0) {
struct fuse_inode *fi = get_fuse_inode(inode);
fuse_write_update_size(inode, *ppos);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ddf482b..f54d669 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -828,9 +828,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE (1 << 1)
+
ssize_t fuse_direct_io(struct file *file, const struct iovec *iov,
unsigned long nr_segs, size_t count, loff_t *ppos,
- int write);
+ int flags);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
long fuse_ioctl_common(struct file *file, unsigned int cmd,
next prev parent reply other threads:[~2013-04-01 10:42 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-04-01 10:40 [PATCH v4 00/14] fuse: An attempt to implement a write-back cache policy Maxim V. Patlasov
[not found] ` <20130401103749.19027.89833.stgit-vWG5eQQidJHciZdyczg/7Q@public.gmane.org>
2013-04-01 10:40 ` [PATCH 01/14] fuse: Linking file to inode helper Maxim V. Patlasov
2013-04-01 10:40 ` Maxim V. Patlasov
2013-04-01 10:40 ` [PATCH 02/14] fuse: Getting file for writeback helper Maxim V. Patlasov
2013-04-01 10:40 ` Maxim V. Patlasov
2013-04-01 10:41 ` [PATCH 03/14] fuse: Prepare to handle short reads Maxim V. Patlasov
2013-04-01 10:41 ` Maxim V. Patlasov
2013-04-01 10:41 ` [PATCH 04/14] fuse: Prepare to handle multiple pages in writeback Maxim V. Patlasov
2013-04-01 10:41 ` Maxim V. Patlasov
2013-04-25 10:22 ` Miklos Szeredi
2013-04-01 10:41 ` [PATCH 05/14] fuse: Connection bit for enabling writeback Maxim V. Patlasov
2013-04-01 10:41 ` Maxim V. Patlasov
2013-04-01 10:41 ` [PATCH 06/14] fuse: Trust kernel i_size only - v3 Maxim V. Patlasov
2013-04-01 10:41 ` Maxim V. Patlasov
2013-04-01 10:41 ` [PATCH 07/14] fuse: Trust kernel i_mtime only Maxim V. Patlasov
2013-04-01 10:41 ` Maxim V. Patlasov
2013-04-01 10:42 ` [PATCH 09/14] fuse: Implement writepages and write_begin/write_end callbacks - v3 Maxim V. Patlasov
2013-04-01 10:42 ` Maxim V. Patlasov
2013-04-25 10:35 ` Miklos Szeredi
2013-06-14 14:03 ` Maxim Patlasov
2013-04-01 10:42 ` [PATCH 10/14] fuse: fuse_writepage_locked() should wait on writeback Maxim V. Patlasov
2013-04-01 10:42 ` Maxim V. Patlasov
2013-04-01 10:42 ` [PATCH 11/14] fuse: fuse_flush() " Maxim V. Patlasov
2013-04-01 10:42 ` Maxim V. Patlasov
2013-04-01 10:42 ` Maxim V. Patlasov [this message]
2013-04-01 10:42 ` [PATCH 12/14] fuse: Fix O_DIRECT operations vs cached writeback misorder - v2 Maxim V. Patlasov
2013-04-01 10:42 ` [PATCH 13/14] fuse: Turn writeback cache on Maxim V. Patlasov
2013-04-01 10:42 ` Maxim V. Patlasov
2013-04-01 10:42 ` [PATCH 14/14] mm: Account for WRITEBACK_TEMP in balance_dirty_pages Maxim V. Patlasov
2013-04-01 10:42 ` Maxim V. Patlasov
2013-04-25 14:29 ` [fuse-devel] " Maxim V. Patlasov
2013-04-25 15:49 ` Miklos Szeredi
2013-04-25 16:16 ` Maxim V. Patlasov
2013-04-25 20:43 ` Miklos Szeredi
2013-04-26 8:32 ` Maxim V. Patlasov
2013-04-26 8:32 ` Maxim V. Patlasov
2013-04-26 8:32 ` Maxim V. Patlasov
2013-04-26 14:02 ` Miklos Szeredi
2013-04-26 14:02 ` Miklos Szeredi
2013-04-26 17:44 ` Maxim V. Patlasov
2013-04-26 17:44 ` Maxim V. Patlasov
2013-04-26 17:44 ` Maxim V. Patlasov
2013-05-07 11:39 ` Miklos Szeredi
2013-05-07 11:39 ` Miklos Szeredi
2013-04-01 10:41 ` [PATCH 08/14] fuse: Flush files on wb close Maxim V. Patlasov
2013-04-11 11:18 ` [fuse-devel] [PATCH v4 00/14] fuse: An attempt to implement a write-back cache policy Maxim V. Patlasov
2013-04-11 14:36 ` Miklos Szeredi
-- strict thread matches above, loose matches on Subject: below --
2013-01-25 18:20 [PATCH v3 " Maxim V. Patlasov
2013-01-25 18:27 ` [PATCH 12/14] fuse: Fix O_DIRECT operations vs cached writeback misorder - v2 Maxim V. Patlasov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130401104233.19027.2301.stgit@maximpc.sw.ru \
--to=mpatlasov-bzqdu9zft3wakbo8gow8eq@public.gmane.org \
--cc=dev-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org \
--cc=devel-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org \
--cc=fuse-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org \
--cc=jbottomley-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org \
--cc=linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
--cc=linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
--cc=miklos-sUDqSbJrdHQHWmgEVkV9KA@public.gmane.org \
--cc=viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org \
--cc=xemul-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.