linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO for bdev
@ 2022-05-07  4:10 Liang Chen
  2022-05-07  4:28 ` Dave Chinner
  2022-05-18 13:47 ` Christoph Hellwig
  0 siblings, 2 replies; 5+ messages in thread
From: Liang Chen @ 2022-05-07  4:10 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: hch, jmoyer, jack, lczerner, Liang Chen

From: Liang Chen <liangchen.linux@gmail.com>

As pointed out in commit 332391a, mixing buffered reads and asynchronous
direct writes risks ending up with a situation where stale data is left
in page cache while new data is already written to disk. The same problem
hits block dev fs too. A similar approach needs to be taken here.

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
---
V2: declare blkdev_sb_init_dio_done_wq static
---
 block/fops.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/block/fops.c b/block/fops.c
index 9f2ecec406b0..d3ae5eddc11b 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -136,11 +136,51 @@ struct blkdev_dio {
 	size_t			size;
 	atomic_t		ref;
 	unsigned int		flags;
+	struct work_struct	complete_work;
 	struct bio		bio ____cacheline_aligned_in_smp;
 };
 
 static struct bio_set blkdev_dio_pool;
 
+static void blkdev_aio_complete_work(struct work_struct *work)
+{
+	struct blkdev_dio *dio = container_of(work, struct blkdev_dio, complete_work);
+	struct kiocb *iocb = dio->iocb;
+	int err;
+	struct inode *inode = bdev_file_inode(iocb->ki_filp);
+	loff_t offset = iocb->ki_pos;
+	ssize_t ret;
+
+	WRITE_ONCE(iocb->private, NULL);
+
+	if (likely(!dio->bio.bi_status)) {
+		ret = dio->size;
+		iocb->ki_pos += ret;
+	} else {
+		ret = blk_status_to_errno(dio->bio.bi_status);
+	}
+
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 */
+	if (iocb->ki_flags & IOCB_WRITE && ret > 0 &&
+	    inode->i_mapping->nrpages) {
+		err = invalidate_inode_pages2_range(inode->i_mapping,
+				offset >> PAGE_SHIFT,
+				(offset + ret - 1) >> PAGE_SHIFT);
+		if (err)
+			dio_warn_stale_pagecache(iocb->ki_filp);
+	}
+
+	iocb->ki_complete(iocb, ret);
+
+	bio_put(&dio->bio);
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
 	struct blkdev_dio *dio = bio->bi_private;
@@ -153,6 +193,14 @@ static void blkdev_bio_end_io(struct bio *bio)
 		if (!(dio->flags & DIO_IS_SYNC)) {
 			struct kiocb *iocb = dio->iocb;
 			ssize_t ret;
+			struct inode *inode = bdev_file_inode(iocb->ki_filp);
+
+			if (iocb->ki_flags & IOCB_WRITE){
+				INIT_WORK(&dio->complete_work, blkdev_aio_complete_work);
+				queue_work(inode->i_sb->s_dio_done_wq,
+						&dio->complete_work);
+				goto out;
+			}
 
 			WRITE_ONCE(iocb->private, NULL);
 
@@ -173,6 +221,7 @@ static void blkdev_bio_end_io(struct bio *bio)
 		}
 	}
 
+out:
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
@@ -284,6 +333,20 @@ static void blkdev_bio_end_io_async(struct bio *bio)
 	struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
 	struct kiocb *iocb = dio->iocb;
 	ssize_t ret;
+	struct inode *inode = bdev_file_inode(iocb->ki_filp);
+
+	if (iocb->ki_flags & IOCB_WRITE){
+		INIT_WORK(&dio->complete_work, blkdev_aio_complete_work);
+		/*
+		 * Grab an extra reference to ensure the dio structure
+		 * which the bio embeds in stays around for complete_work
+		 * to access.
+		 */
+		bio_get(bio);
+		queue_work(inode->i_sb->s_dio_done_wq,
+				&dio->complete_work);
+		goto out;
+	}
 
 	WRITE_ONCE(iocb->private, NULL);
 
@@ -296,6 +359,7 @@ static void blkdev_bio_end_io_async(struct bio *bio)
 
 	iocb->ki_complete(iocb, ret);
 
+out:
 	if (dio->flags & DIO_SHOULD_DIRTY) {
 		bio_check_pages_dirty(bio);
 	} else {
@@ -366,14 +430,37 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	return -EIOCBQUEUED;
 }
 
+static int blkdev_sb_init_dio_done_wq(struct super_block *sb)
+{
+	struct workqueue_struct *old;
+	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
+						     WQ_MEM_RECLAIM, 0,
+						     sb->s_id);
+	if (!wq)
+	       return -ENOMEM;
+	/*
+	 * This has to be atomic as more DIOs can race to create the workqueue
+	 */
+	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
+	/* Someone created workqueue before us? Free ours... */
+	if (old)
+		destroy_workqueue(wq);
+       return 0;
+}
+
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	unsigned int nr_pages;
+	struct inode *inode = bdev_file_inode(iocb->ki_filp);
 
 	if (!iov_iter_count(iter))
 		return 0;
 
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
+
+	if(!inode->i_sb->s_dio_done_wq && blkdev_sb_init_dio_done_wq(inode->i_sb))
+		return -ENOMEM;
+
 	if (likely(nr_pages <= BIO_MAX_VECS)) {
 		if (is_sync_kiocb(iocb))
 			return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO for bdev
  2022-05-07  4:10 [PATCH v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO for bdev Liang Chen
@ 2022-05-07  4:28 ` Dave Chinner
  2022-05-07  8:51   ` Liang Chen
  2022-05-18 13:47 ` Christoph Hellwig
  1 sibling, 1 reply; 5+ messages in thread
From: Dave Chinner @ 2022-05-07  4:28 UTC (permalink / raw)
  To: Liang Chen; +Cc: linux-fsdevel, hch, jmoyer, jack, lczerner

On Sat, May 07, 2022 at 12:10:33PM +0800, Liang Chen wrote:
> From: Liang Chen <liangchen.linux@gmail.com>
> 
> As pointed out in commit 332391a, mixing buffered reads and asynchronous
> direct writes risks ending up with a situation where stale data is left
> in page cache while new data is already written to disk. The same problem
> hits block dev fs too. A similar approach needs to be taken here.
> 
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> ---
> V2: declare blkdev_sb_init_dio_done_wq static
> ---
>  block/fops.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 87 insertions(+)

Rather than copying functionality from the two other generic DIO
paths (which we really want to get down to 1!) into this cut down,
less functional DIO path, shouldn't we be spending the effort to
convert the blkdev device to use one of the other generic DIO paths
that already solves this problem and likely gets a lot more test
coverage?

Cheers,

Dave.

-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO for bdev
  2022-05-07  4:28 ` Dave Chinner
@ 2022-05-07  8:51   ` Liang Chen
  0 siblings, 0 replies; 5+ messages in thread
From: Liang Chen @ 2022-05-07  8:51 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, Christoph Hellwig, jmoyer, jack, lczerner

On Sat, May 7, 2022 at 12:28 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Sat, May 07, 2022 at 12:10:33PM +0800, Liang Chen wrote:
> > From: Liang Chen <liangchen.linux@gmail.com>
> >
> > As pointed out in commit 332391a, mixing buffered reads and asynchronous
> > direct writes risks ending up with a situation where stale data is left
> > in page cache while new data is already written to disk. The same problem
> > hits block dev fs too. A similar approach needs to be taken here.
> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > ---
> > V2: declare blkdev_sb_init_dio_done_wq static
> > ---
> >  block/fops.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 87 insertions(+)
>
> Rather than copying functionality from the two other generic DIO
> paths (which we really want to get down to 1!) into this cut down,
> less functional DIO path, shouldn't we be spending the effort to
> convert the blkdev device to use one of the other generic DIO paths
> that already solves this problem and likely gets a lot more test
> coverage?

Yeah, that would be better for sure. In fact I just realized the patch
introduces a performance drawback compared to the two other DIO paths.
Making use of the well-tested code path would avoid such a mistake. I will
take a look at converting the blkdev device to use __blockdev_direct_IO
(which seems to require less effort).

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO for bdev
  2022-05-07  4:10 [PATCH v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO for bdev Liang Chen
  2022-05-07  4:28 ` Dave Chinner
@ 2022-05-18 13:47 ` Christoph Hellwig
  2022-06-08  9:48   ` Liang Chen
  1 sibling, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2022-05-18 13:47 UTC (permalink / raw)
  To: Liang Chen; +Cc: linux-fsdevel, hch, jmoyer, jack, lczerner

On Sat, May 07, 2022 at 12:10:33PM +0800, Liang Chen wrote:
> From: Liang Chen <liangchen.linux@gmail.com>
> 
> As pointed out in commit 332391a, mixing buffered reads and asynchronous
> direct writes risks ending up with a situation where stale data is left
> in page cache while new data is already written to disk. The same problem
> hits block dev fs too. A similar approach needs to be taken here.

What is the real issue here?  If you mix direct and buffered I/O
you generally get what you pay for.  Even more so on block devices that
don't even follow Posix file system semantics.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO for bdev
  2022-05-18 13:47 ` Christoph Hellwig
@ 2022-06-08  9:48   ` Liang Chen
  0 siblings, 0 replies; 5+ messages in thread
From: Liang Chen @ 2022-06-08  9:48 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel, jmoyer, jack, lczerner

On Wed, May 18, 2022 at 9:47 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Sat, May 07, 2022 at 12:10:33PM +0800, Liang Chen wrote:
> > From: Liang Chen <liangchen.linux@gmail.com>
> >
> > As pointed out in commit 332391a, mixing buffered reads and asynchronous
> > direct writes risks ending up with a situation where stale data is left
> > in page cache while new data is already written to disk. The same problem
> > hits block dev fs too. A similar approach needs to be taken here.
>
> What is the real issue here?  If you mix direct and buffered I/O
> you generally get what you pay for.  Even more so on block devices that
> don't even follow Posix file system semantics.

Sorry, missed your reply.
You are right. The problem manifests when mixing direct and
buffered I/O, and can be avoided by not doing so.
I tried to bring block device direct IO onto one of the generic DIO
paths, but it resulted in some noticeable performance degradation.
So I would rather leave it as is, since as you mentioned block devices
don't even follow Posix file system semantics.

Thanks,
Liang

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-06-08 10:06 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-07  4:10 [PATCH v2] fs: Fix page cache inconsistency when mixing buffered and AIO DIO for bdev Liang Chen
2022-05-07  4:28 ` Dave Chinner
2022-05-07  8:51   ` Liang Chen
2022-05-18 13:47 ` Christoph Hellwig
2022-06-08  9:48   ` Liang Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).