All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jens Axboe <axboe@fb.com>
To: <linux-fsdevel@vger.kernel.org>
Cc: <hch@infradead.org>, <viro@zeniv.linux.org.uk>,
	<akpm@linux-foundation.org>, <linux-kernel@vger.kernel.org>
Subject: [PATCH] direct-io: only inc/dec inode->i_dio_count for file systems
Date: Wed, 15 Apr 2015 12:22:56 -0600	[thread overview]
Message-ID: <20150415182256.GA30339@kernel.dk> (raw)

Hi,

This is a reposting of a patch that was originally in the blk-mq series.
It has huge upside on shared access to a multiqueue device doing
O_DIRECT, it's basically the scaling block that ends up killing
performance. A quick test here reveals that we spend 30% of all system
time just incrementing and decremening inode->i_dio_count. For block
devices this isn't useful at all, as we don't need protection against
truncate. For that test case, performance increases about 3.6x (!!) by
getting rid of this inc/dec per IO.

I've cleaned it up a bit since last time, integrating the checks in
inode_dio_done() and adding a inode_dio_begin() so that callers don't
need to know about this.

We've been running a variant of this patch in the FB kernel for a while.
I'd like to finally get this upstream.

Signed-off-by: Jens Axboe <axboe@fb.com>

---

 fs/block_dev.c     |    2 +-
 fs/btrfs/inode.c   |    6 +++---
 fs/dax.c           |    4 ++--
 fs/direct-io.c     |    5 +++--
 fs/ext4/indirect.c |    6 +++---
 fs/ext4/inode.c    |    4 ++--
 fs/inode.c         |   21 +++++++++++++++++++--
 fs/nfs/direct.c    |   10 +++++-----
 include/linux/fs.h |    6 +++++-
 9 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266be67d3..9b290121301a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -155,7 +155,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 
 	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
 				    offset, blkdev_get_block,
-				    NULL, NULL, 0);
+				    NULL, NULL, DIO_IGNORE_TRUNCATE);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d2e732d7af52..6c66719179ba 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8129,7 +8129,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
 		return 0;
 
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, flags);
 	smp_mb__after_atomic();
 
 	/*
@@ -8169,7 +8169,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 		current->journal_info = &outstanding_extents;
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 				     &BTRFS_I(inode)->runtime_flags)) {
-		inode_dio_done(inode);
+		inode_dio_done(inode, flags);
 		flags = DIO_LOCKING | DIO_SKIP_HOLES;
 		wakeup = false;
 	}
@@ -8188,7 +8188,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	}
 out:
 	if (wakeup)
-		inode_dio_done(inode);
+		inode_dio_done(inode, flags);
 	if (relock)
 		mutex_lock(&inode->i_mutex);
 
diff --git a/fs/dax.c b/fs/dax.c
index ed1619ec6537..0f55ee4ed2fe 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -210,7 +210,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
 	}
 
 	/* Protects against truncate */
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, flags);
 
 	retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
 
@@ -220,7 +220,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
 	if ((retval > 0) && end_io)
 		end_io(iocb, pos, retval, bh.b_private);
 
-	inode_dio_done(inode);
+	inode_dio_done(inode, flags);
  out:
 	return retval;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..5f59709a8b80 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -254,7 +254,8 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
 	if (dio->end_io && dio->result)
 		dio->end_io(dio->iocb, offset, transferred, dio->private);
 
-	inode_dio_done(dio->inode);
+	inode_dio_done(dio->inode, dio->flags);
+
 	if (is_async) {
 		if (dio->rw & WRITE) {
 			int err;
@@ -1199,7 +1200,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	/*
 	 * Will be decremented at I/O completion time.
 	 */
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, dio->flags);
 
 	retval = 0;
 	sdio.blkbits = blkbits;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 45fe924f82bc..853a8674b137 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -682,11 +682,11 @@ retry:
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
 		 * while holding extra i_dio_count ref.
 		 */
-		atomic_inc(&inode->i_dio_count);
+		inode_dio_begin(inode, 0);
 		smp_mb();
 		if (unlikely(ext4_test_inode_state(inode,
 						    EXT4_STATE_DIOREAD_LOCK))) {
-			inode_dio_done(inode);
+			inode_dio_done(inode, 0);
 			goto locked;
 		}
 		if (IS_DAX(inode))
@@ -696,7 +696,7 @@ retry:
 			ret = __blockdev_direct_IO(rw, iocb, inode,
 					inode->i_sb->s_bdev, iter, offset,
 					ext4_get_block, NULL, NULL, 0);
-		inode_dio_done(inode);
+		inode_dio_done(inode, 0);
 	} else {
 locked:
 		if (IS_DAX(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cb9a212b86f..5310e31a4327 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2978,7 +2978,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
 	 */
 	if (rw == WRITE)
-		atomic_inc(&inode->i_dio_count);
+		inode_dio_begin(inode, dio_flags);
 
 	/* If we do a overwrite dio, i_mutex locking can be released */
 	overwrite = *((int *)iocb->private);
@@ -3080,7 +3080,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 
 retake_lock:
 	if (rw == WRITE)
-		inode_dio_done(inode);
+		inode_dio_done(inode, dio_flags);
 	/* take i_mutex locking again if we do a ovewrite dio */
 	if (overwrite) {
 		up_read(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/inode.c b/fs/inode.c
index f00b16f45507..7a544bea1566 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1946,15 +1946,32 @@ void inode_dio_wait(struct inode *inode)
 EXPORT_SYMBOL(inode_dio_wait);
 
 /*
+ * inode_dio_begin - signal start of a direct I/O requests
+ * @inode: inode the direct I/O happens on
+ * @flags: DIO_* flags
+ *
+ * This is called once we've finished processing a direct I/O request,
+ * and is used to wake up callers waiting for direct I/O to be quiesced.
+ */
+void inode_dio_begin(struct inode *inode, unsigned int flags)
+{
+	if (!(flags & DIO_IGNORE_TRUNCATE))
+		atomic_inc(&inode->i_dio_count);
+}
+EXPORT_SYMBOL(inode_dio_begin);
+
+/*
  * inode_dio_done - signal finish of a direct I/O requests
  * @inode: inode the direct I/O happens on
+ * @flags: DIO_* flags
  *
  * This is called once we've finished processing a direct I/O request,
  * and is used to wake up callers waiting for direct I/O to be quiesced.
  */
-void inode_dio_done(struct inode *inode)
+void inode_dio_done(struct inode *inode, unsigned int flags)
 {
-	if (atomic_dec_and_test(&inode->i_dio_count))
+	if ((flags & DIO_IGNORE_TRUNCATE) ||
+	    atomic_dec_and_test(&inode->i_dio_count))
 		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
 }
 EXPORT_SYMBOL(inode_dio_done);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e907c8cf732e..a5d764431f11 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 	if (write)
 		nfs_zap_mapping(inode, inode->i_mapping);
 
-	inode_dio_done(inode);
+	inode_dio_done(inode, 0);
 
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
@@ -487,7 +487,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 			     &nfs_direct_read_completion_ops);
 	get_dreq(dreq);
 	desc.pg_dreq = dreq;
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, 0);
 
 	while (iov_iter_count(iter)) {
 		struct page **pagevec;
@@ -539,7 +539,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
-		inode_dio_done(inode);
+		inode_dio_done(inode, 0);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
@@ -873,7 +873,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 			      &nfs_direct_write_completion_ops);
 	desc.pg_dreq = dreq;
 	get_dreq(dreq);
-	atomic_inc(&inode->i_dio_count);
+	inode_dio_begin(inode, 0);
 
 	NFS_I(inode)->write_io += iov_iter_count(iter);
 	while (iov_iter_count(iter)) {
@@ -929,7 +929,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
-		inode_dio_done(inode);
+		inode_dio_done(inode, 0);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 52cc4492cb3a..43aab9f588fa 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2613,6 +2613,9 @@ enum {
 
 	/* filesystem can handle aio writes beyond i_size */
 	DIO_ASYNC_EXTEND = 0x04,
+
+	/* inode/fs/bdev does not need truncate protection */
+	DIO_IGNORE_TRUNCATE = 0x08,
 };
 
 void dio_end_io(struct bio *bio, int error);
@@ -2633,7 +2636,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 #endif
 
 void inode_dio_wait(struct inode *inode);
-void inode_dio_done(struct inode *inode);
+void inode_dio_begin(struct inode *inode, unsigned int flags);
+void inode_dio_done(struct inode *inode, unsigned int flags);
 
 extern void inode_set_flags(struct inode *inode, unsigned int flags,
 			    unsigned int mask);

-- 
Jens Axboe

             reply	other threads:[~2015-04-15 18:22 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-04-15 18:22 Jens Axboe [this message]
2015-04-15 18:56 ` [PATCH] direct-io: only inc/dec inode->i_dio_count for file systems Andrew Morton
2015-04-15 19:26   ` Jens Axboe
2015-04-15 19:46     ` Andrew Morton
2015-04-15 20:08       ` Jens Axboe
2015-04-15 22:25   ` Dave Chinner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150415182256.GA30339@kernel.dk \
    --to=axboe@fb.com \
    --cc=akpm@linux-foundation.org \
    --cc=hch@infradead.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.