linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Benjamin LaHaise <bcrl@kvack.org>
To: Milosz Tanski <milosz@adfin.com>
Cc: linux-kernel@vger.kernel.org,
	Christoph Hellwig <hch@infradead.org>,
	linux-fsdevel@vger.kernel.org, linux-aio@kvack.org,
	Mel Gorman <mgorman@suse.de>,
	Volker Lendecke <Volker.Lendecke@sernet.de>,
	Tejun Heo <tj@kernel.org>, Jeff Moyer <jmoyer@redhat.com>,
	Andreas Dilger <adilger@dilger.ca>,
	Jeff Moyer <jmoyer@redhat.com>
Subject: [RFC 1/2] aio: async readahead
Date: Wed, 17 Sep 2014 10:49:25 -0400	[thread overview]
Message-ID: <20140917144925.GU24821@kvack.org> (raw)
In-Reply-To: <cover.1410810247.git.milosz@adfin.com>

Hi Milosz et al,

This code is probably relevant to the non-blocking read thread.  A 
non-blocking read is pretty useless without some way to trigger and 
become aware of data being read into the page cache, and the attached 
patch is one way to do so.

The changes below introduce an async readahead operation that is based 
on readpage (sorry, I haven't done an mpage version of this code yet).  
Please note that this code was written against an older kernel (3.4) 
and hasn't been extensively tested against recent kernels, so there may 
be a few bugs lingering.  That said, the code has been enabled in our 
internal kernel at Solace Systems for a few months now with no reported 
issues.

There is a companion patch to make ext3's readpage operation use async 
metadata reads that will follow.  A test program that uses the new readahead 
operation can be found at http://www.kvack.org/~bcrl/aio-readahead.c .

		-ben
-- 
"Thought is the essence of where you are now."

 fs/aio.c                     |  220 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/pagemap.h      |    2 
 include/uapi/linux/aio_abi.h |    2 
 mm/filemap.c                 |    2 
 4 files changed, 225 insertions(+), 1 deletion(-)
diff --git a/fs/aio.c b/fs/aio.c
index 7337500..f1c0f74 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -46,6 +46,8 @@
 
 #include "internal.h"
 
+static long aio_readahead(struct kiocb *iocb);
+
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
@@ -1379,6 +1381,12 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 		iter_op	= file->f_op->read_iter;
 		goto rw_common;
 
+	case IOCB_CMD_READAHEAD:
+		ret = -EBADF;
+		if (unlikely(!(file->f_mode & FMODE_READ)))
+			break;
+		return aio_readahead(req);
+
 	case IOCB_CMD_PWRITE:
 	case IOCB_CMD_PWRITEV:
 		mode	= FMODE_WRITE;
@@ -1710,3 +1718,215 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	}
 	return ret;
 }
+
+/* for readahead */
+struct readahead_state;
+/*
+ * Per-page tracking for an async readahead.  wait_bit is queued on the
+ * page's PG_locked waitqueue so that unlock_page() (run at I/O
+ * completion) wakes us; page holds a reference taken in
+ * __do_page_cache_readahead() and dropped in aio_readahead_complete().
+ */
+struct readahead_pginfo {
+	struct wait_bit_queue		wait_bit;
+	struct page			*page;
+};
+
+/*
+ * One instance per IOCB_CMD_READAHEAD request, kzalloc()ed in
+ * aio_readahead() and kfree()d by the final aio_readahead_complete().
+ * nr_pages counts pginfo[] entries actually populated; nr_pages_reading
+ * is a refcount: one for the submitter plus one per page still under
+ * I/O.  pginfo[] is a C99 flexible array member sized at allocation.
+ */
+struct readahead_state {
+	struct kiocb			*iocb;
+	unsigned			nr_pages;
+	atomic_t			nr_pages_reading;
+
+	struct readahead_pginfo	pginfo[];
+};
+
+/*
+ * Drop one reference on state->nr_pages_reading; the caller that drops
+ * the last reference tallies the results, releases every page reference
+ * and completes the iocb.  Result is ki_nbytes if every page became
+ * uptodate, -EIO otherwise.  May be called from waitqueue callback
+ * context (see pginfo_wait_func).
+ */
+static void aio_readahead_complete(struct readahead_state *state)
+{
+	unsigned i, nr_uptodate = 0;
+	struct kiocb *iocb;
+	long res;
+	if (!atomic_dec_and_test(&state->nr_pages_reading))
+		return;
+	/* Last reference: no other context can touch state from here on. */
+	for (i = 0; i < state->nr_pages; i++) {
+		struct page *page = state->pginfo[i].page;
+
+		if (PageUptodate(page))
+			nr_uptodate++;
+		page_cache_release(page);
+	}
+	/* Fetch iocb before kfree(); state is invalid afterwards. */
+	iocb = state->iocb;
+	if (nr_uptodate == state->nr_pages)
+		res = iocb->ki_nbytes;
+	else
+		res = -EIO;
+	kfree(state);
+	aio_complete(iocb, res, 0);
+}
+
+/*
+ * Waitqueue callback invoked (under wq->lock) when the page's waitqueue
+ * is woken.  Returns 1 and dequeues itself if this wakeup is for our
+ * page's PG_locked bit being cleared; returns 0 to stay queued for
+ * unrelated wakeups (the waitqueue is hashed and shared across pages).
+ * On a real wakeup it drops this page's reference on the state machine
+ * via aio_readahead_complete().
+ */
+static int pginfo_wait_func(wait_queue_t *wait, unsigned mode, int flags,
+			    void *arg)
+{
+	struct readahead_state *state = wait->private;
+	struct readahead_pginfo *pginfo;
+	struct wait_bit_key *key = arg;
+	unsigned idx;
+
+	pginfo = container_of(wait, struct readahead_pginfo, wait_bit.wait);
+	idx = pginfo - state->pginfo;
+	BUG_ON(idx >= state->nr_pages);
+
+	/*
+	 * Ignore wakeups for a different page/bit, and spurious wakeups
+	 * where PG_locked is still set.
+	 */
+	if (pginfo->wait_bit.key.flags != key->flags ||
+	    pginfo->wait_bit.key.bit_nr != key->bit_nr ||
+	    test_bit(key->bit_nr, key->flags))
+		return 0;
+	/* Dequeue before completing so we are never woken twice. */
+	list_del_init(&wait->task_list);
+	aio_readahead_complete(state);
+	return 1;
+}
+
+/*
+ * Arm pginfo's waitqueue entry on the page's PG_locked waitqueue so
+ * that pginfo_wait_func() fires when the page is unlocked, and take a
+ * reference on state->nr_pages_reading for the pending I/O.  If the
+ * page is already unlocked by the time we hold wq->lock, invoke the
+ * callback directly so the wakeup is never missed.
+ */
+static void pginfo_wait_on_page(struct readahead_state *state,
+				struct readahead_pginfo *pginfo)
+{
+	struct page *page = pginfo->page;
+	wait_queue_head_t *wq;
+	unsigned long flags;
+
+	pginfo->wait_bit.key.flags = &page->flags;
+	pginfo->wait_bit.key.bit_nr = PG_locked;
+	pginfo->wait_bit.wait.private = state;
+	pginfo->wait_bit.wait.func = pginfo_wait_func;
+
+	wq = page_waitqueue(page);
+	atomic_inc(&state->nr_pages_reading);
+
+	spin_lock_irqsave(&wq->lock, flags);
+	__add_wait_queue(wq, &pginfo->wait_bit.wait);
+	/* Page already unlocked: deliver the wakeup ourselves. */
+	if (!PageLocked(page))
+		pginfo_wait_func(&pginfo->wait_bit.wait, 0, 0,
+				 &pginfo->wait_bit.key);
+	spin_unlock_irqrestore(&wq->lock, flags);
+}
+
+
+/*
+ * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates all
+ * the pages first, then submits them all for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ *
+ * Each page examined gets an entry in state->pginfo[] holding a page
+ * reference; pages not already uptodate are either read via readpage()
+ * (if we can lock them) or waited on via pginfo_wait_on_page().
+ *
+ * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ */
+static int
+__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			pgoff_t offset, unsigned long nr_to_read,
+			unsigned long lookahead_size,
+			struct readahead_state *state)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	unsigned long end_index;	/* The last page we want to read */
+	LIST_HEAD(page_pool);
+	int page_idx;
+	int ret = 0;
+	loff_t isize = i_size_read(inode);
+
+	if (isize == 0)
+		goto out;
+
+	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+	/*
+	 * Preallocate as many pages as we will need.
+	 */
+	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
+		pgoff_t page_offset = offset + page_idx;
+		struct readahead_pginfo *pginfo = &state->pginfo[page_idx];
+		int locked = 0;
+
+		if (page_offset > end_index)
+			break;
+
+		init_waitqueue_func_entry(&pginfo->wait_bit.wait,
+					  pginfo_wait_func);
+find_page:
+		page = find_get_page(mapping, page_offset);
+		if (!page) {
+			int err;
+
+			page = page_cache_alloc_cold(mapping);
+			/* Must check before add_to_page_cache_lru()
+			 * dereferences the page.
+			 */
+			if (!page)
+				break;
+			err = add_to_page_cache_lru(page, mapping,
+						    page_offset,
+						    GFP_KERNEL);
+			if (err)
+				page_cache_release(page);
+			/* Lost a race with another inserter: retry lookup. */
+			if (err == -EEXIST)
+				goto find_page;
+			if (err)
+				break;
+			/* add_to_page_cache_lru() returns the page locked. */
+			locked = 1;
+		}
+
+		ret++;
+		state->nr_pages++;
+		pginfo->page = page;
+		if (!locked && PageUptodate(page))
+			continue;
+		if (locked || trylock_page(page)) {
+			if (PageUptodate(page)) {
+				unlock_page(page);
+				continue;
+			}
+			pginfo_wait_on_page(state, pginfo);
+
+			/* Ignoring the return code from readpage here is
+			 * safe, as the readpage() operation will unlock
+			 * the page and thus kick our state machine.
+			 */
+			mapping->a_ops->readpage(filp, page);
+			continue;
+		}
+		/* Someone else holds the lock; wait for their unlock. */
+		pginfo_wait_on_page(state, pginfo);
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Implement IOCB_CMD_READAHEAD: asynchronously populate the page cache
+ * for [ki_pos, ki_pos + ki_nbytes) and complete the iocb once every
+ * page's read finishes.  Completes with ki_nbytes on full success,
+ * -EIO if any page failed to become uptodate, and 0 when there was
+ * nothing to read.  Returns 0 on submission, or a negative errno if the
+ * request could not be started (no completion event is queued then).
+ */
+static long aio_readahead(struct kiocb *iocb)
+{
+	struct file *filp = iocb->ki_filp;
+	struct readahead_state *state;
+	pgoff_t start, end;
+	unsigned nr_pages;
+	int ret;
+
+	if (!filp->f_mapping || !filp->f_mapping->a_ops ||
+	    !filp->f_mapping->a_ops->readpage)
+		return -EINVAL;
+
+	if (iocb->ki_nbytes == 0) {
+		aio_complete(iocb, 0, 0);
+		return 0;
+	}
+
+	start = iocb->ki_pos >> PAGE_CACHE_SHIFT;
+	end = (iocb->ki_pos + iocb->ki_nbytes - 1) >> PAGE_CACHE_SHIFT;
+	nr_pages = 1 + end - start;
+
+	/*
+	 * Refuse requests whose pginfo[] array size would overflow the
+	 * allocation below; a wrapped size would yield an undersized
+	 * buffer and out-of-bounds writes into pginfo[].
+	 */
+	if (nr_pages > (SIZE_MAX - sizeof(*state)) /
+			sizeof(struct readahead_pginfo))
+		return -EINVAL;
+
+	state = kzalloc(sizeof(*state) +
+			nr_pages * sizeof(struct readahead_pginfo),
+			GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	state->iocb = iocb;
+	/* Submitter's reference, dropped below once submission is done. */
+	atomic_set(&state->nr_pages_reading, 1);
+
+	ret = __do_page_cache_readahead(filp->f_mapping, filp, start, nr_pages,
+					0, state);
+	if (ret <= 0) {
+		/* No pages examined (e.g. read past EOF): nothing pending. */
+		kfree(state);
+		aio_complete(iocb, 0, 0);
+		return 0;
+	}
+
+	/* Drops the submitter's ref of 1 from nr_pages_reading. */
+	aio_readahead_complete(state);
+	return 0;
+}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3df8c7d..afd1f20 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -495,6 +495,8 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
 	return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
 }
 
+wait_queue_head_t *page_waitqueue(struct page *page);
+
 /*
  * This is exported only for wait_on_page_locked/wait_on_page_writeback.
  * Never use this directly!
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index bb2554f..11723c53 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -44,6 +44,8 @@ enum {
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
+
+	IOCB_CMD_READAHEAD = 12,
 };
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 90effcd..3368b73 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,7 +670,7 @@ EXPORT_SYMBOL(__page_cache_alloc);
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-static wait_queue_head_t *page_waitqueue(struct page *page)
+wait_queue_head_t *page_waitqueue(struct page *page)
 {
 	const struct zone *zone = page_zone(page);
 

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href="mailto:aart@kvack.org">aart@kvack.org</a>

  parent reply	other threads:[~2014-09-17 14:49 UTC|newest]

Thread overview: 86+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-09-15 20:20 [RFC PATCH 0/7] Non-blockling buffered fs read (page cache only) Milosz Tanski
2014-09-15 20:20 ` [PATCH 1/7] Prepare for adding a new readv/writev with user flags Milosz Tanski
2014-09-15 20:28   ` Al Viro
2014-09-15 21:15     ` Christoph Hellwig
2014-09-15 21:44       ` Milosz Tanski
2014-09-15 20:20 ` [PATCH 2/7] Define new syscalls readv2,preadv2,writev2,pwritev2 Milosz Tanski
2014-09-16 19:20   ` Jeff Moyer
2014-09-16 19:54     ` Milosz Tanski
2014-09-16 21:03     ` Christoph Hellwig
2014-09-17 15:43   ` Theodore Ts'o
2014-09-17 16:05     ` Milosz Tanski
2014-09-17 16:59       ` Theodore Ts'o
2014-09-17 17:24         ` Zach Brown
2014-09-15 20:20 ` [PATCH 3/7] Export new vector IO (with flags) to userland Milosz Tanski
2014-09-15 20:21 ` [PATCH 4/7] O_NONBLOCK flag for readv2/preadv2 Milosz Tanski
2014-09-16 19:19   ` Jeff Moyer
2014-09-16 19:44     ` Milosz Tanski
2014-09-16 19:53       ` Jeff Moyer
2014-09-15 20:21 ` [PATCH 5/7] documentation updates Christoph Hellwig
2014-09-15 20:21 ` [PATCH 6/7] move flags enforcement to vfs_preadv/vfs_pwritev Christoph Hellwig
2014-09-15 21:15   ` Christoph Hellwig
2014-09-15 21:45     ` Milosz Tanski
2014-09-15 20:22 ` [PATCH 7/7] check for O_NONBLOCK in all read_iter instances Christoph Hellwig
2014-09-16 19:27   ` Jeff Moyer
2014-09-16 19:45     ` Milosz Tanski
2014-09-16 21:42       ` Dave Chinner
2014-09-17 12:24         ` Benjamin LaHaise
2014-09-17 13:47           ` Theodore Ts'o
2014-09-17 13:56             ` Benjamin LaHaise
2014-09-17 15:33               ` Milosz Tanski
2014-09-17 15:49                 ` Theodore Ts'o
2014-09-17 15:52               ` Zach Brown
2014-09-16 21:04     ` Christoph Hellwig
2014-09-16 21:24       ` Jeff Moyer
2014-09-15 20:27 ` [RFC PATCH 0/7] Non-blockling buffered fs read (page cache only) Milosz Tanski
2014-09-15 21:33 ` Andreas Dilger
2014-09-15 22:13   ` Milosz Tanski
2014-09-15 22:36   ` Elliott, Robert (Server Storage)
2014-09-16 18:24     ` Zach Brown
2014-09-19 11:21     ` Christoph Hellwig
2014-09-22 15:48       ` Jeff Moyer
2014-09-22 16:32         ` Milosz Tanski
2014-09-22 16:42           ` Christoph Hellwig
2014-09-22 17:02             ` Milosz Tanski
2014-09-22 16:25       ` Elliott, Robert (Server Storage)
2014-09-15 21:58 ` Jeff Moyer
2014-09-15 22:27   ` Milosz Tanski
2014-09-16 13:44     ` Jeff Moyer
2014-09-19 11:23   ` Christoph Hellwig
2014-09-16 19:30 ` Jeff Moyer
2014-09-16 20:34   ` Milosz Tanski
2014-09-16 20:49     ` Jeff Moyer
2014-09-17 14:49 ` Benjamin LaHaise [this message]
2014-09-17 15:26   ` [RFC 2/2] ext4: async readpage for indirect style inodes Benjamin LaHaise
2014-09-19 11:26   ` [RFC 1/2] aio: async readahead Christoph Hellwig
2014-09-19 16:01     ` Benjamin LaHaise
2014-09-17 22:20 ` [RFC v2 0/5] Non-blockling buffered fs read (page cache only) Milosz Tanski
2014-09-17 22:20   ` [RFC v2 1/5] Prepare for adding a new readv/writev with user flags Milosz Tanski
2014-09-17 22:20   ` [RFC v2 2/5] Define new syscalls readv2,preadv2,writev2,pwritev2 Milosz Tanski
2014-09-18 18:48     ` Darrick J. Wong
2014-09-19 10:52       ` Christoph Hellwig
2014-09-20  0:19         ` Darrick J. Wong
2014-09-17 22:20   ` [RFC v2 3/5] Export new vector IO (with flags) to userland Milosz Tanski
2014-09-17 22:20   ` [RFC v2 4/5] O_NONBLOCK flag for readv2/preadv2 Milosz Tanski
2014-09-19 11:27     ` Christoph Hellwig
2014-09-19 11:59       ` Milosz Tanski
2014-09-22 17:12     ` Jeff Moyer
2014-09-17 22:20   ` [RFC v2 5/5] Check for O_NONBLOCK in all read_iter instances Milosz Tanski
2014-09-19 11:26     ` Christoph Hellwig
2014-09-19 14:42   ` [RFC v2 0/5] Non-blockling buffered fs read (page cache only) Jonathan Corbet
2014-09-19 16:13     ` Volker Lendecke
2014-09-19 17:19     ` Milosz Tanski
2014-09-19 17:33     ` Milosz Tanski
2014-09-22 14:12       ` Jonathan Corbet
2014-09-22 14:24         ` Jeff Moyer
2014-09-22 14:25         ` Christoph Hellwig
2014-09-22 14:30         ` Milosz Tanski
2014-09-24 21:46 ` [RFC v3 0/4] vfs: " Milosz Tanski
2014-09-24 21:46   ` [RFC v3 1/4] vfs: Prepare for adding a new preadv/pwritev with user flags Milosz Tanski
2014-09-24 21:46   ` [RFC v3 2/4] vfs: Define new syscalls preadv2,pwritev2 Milosz Tanski
2014-09-24 21:46   ` [RFC v3 3/4] vfs: Export new vector IO syscalls (with flags) to userland Milosz Tanski
2014-09-24 21:46   ` [RFC v3 4/4] vfs: RWF_NONBLOCK flag for preadv2 Milosz Tanski
     [not found]   ` <cover.1411594644.git.milosz-B5zB6C1i6pkAvxtiuMwx3w@public.gmane.org>
2014-09-25  4:06     ` [RFC v3 0/4] vfs: Non-blockling buffered fs read (page cache only) Michael Kerrisk
2014-09-25 11:16       ` Jan Kara
2014-09-25 15:48       ` Milosz Tanski
2014-10-08  2:53   ` Milosz Tanski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140917144925.GU24821@kvack.org \
    --to=bcrl@kvack.org \
    --cc=Volker.Lendecke@sernet.de \
    --cc=adilger@dilger.ca \
    --cc=hch@infradead.org \
    --cc=jmoyer@redhat.com \
    --cc=linux-aio@kvack.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=milosz@adfin.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).