From: Kent Overstreet <koverstreet@google.com>
To: Zach Brown <zab@redhat.com>
Cc: linux-kernel@vger.kernel.org, linux-aio@kvack.org,
linux-fsdevel@vger.kernel.org, bcrl@kvack.org, jmoyer@redhat.com,
axboe@kernel.dk, viro@zeniv.linux.org.uk
Subject: Re: [PATCH 14/25] aio: Make aio_read_evt() more efficient
Date: Thu, 29 Nov 2012 16:20:52 -0800 [thread overview]
Message-ID: <20121130002052.GM15094@google.com> (raw)
In-Reply-To: <20121129003816.GJ18574@lenny.home.zabbo.net>
On Wed, Nov 28, 2012 at 04:38:16PM -0800, Zach Brown wrote:
> As mentioned offlist: we don't want to be blocking under
> TASK_INTERRUPTIBLE. Is the plan to do a non-blocking check and pop
> outside the wait loop to do a blocking copy?
Here's the latest version that I posted on irc earlier:
commit 913ff32bbd4de15a87b07a87ac196e978bc29e17
Author: Kent Overstreet <koverstreet@google.com>
Date: Thu Nov 29 14:12:40 2012 -0800
aio: Make aio_read_evt() more efficient
Previously, aio_read_evt() pulled a single completion off the
ringbuffer at a time, locking and unlocking each time.
Changed it to pull off as many events as it can at a time, and copy them
directly to userspace.
This also fixes a bug where if copying the event to userspace failed,
we'd lose the event.
Signed-off-by: Kent Overstreet <koverstreet@google.com>
diff --git a/fs/aio.c b/fs/aio.c
index 46e6d30..5eca2a4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -63,7 +63,7 @@ struct aio_ring_info {
unsigned long mmap_size;
struct page **ring_pages;
- spinlock_t ring_lock;
+ struct mutex ring_lock;
long nr_pages;
unsigned nr, tail;
@@ -341,7 +341,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
atomic_set(&ctx->users, 2);
atomic_set(&ctx->dead, 0);
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
+ mutex_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
@@ -746,149 +746,138 @@ put_rq:
}
EXPORT_SYMBOL(aio_complete);
-/* aio_read_evt
- * Pull an event off of the ioctx's event ring. Returns the number of
- * events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
- * TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ * Pull an event off of the ioctx's event ring. Returns the number of
+ * events fetched
*/
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static int aio_read_events(struct kioctx *ctx, struct io_event __user *event,
+ long nr, unsigned *head)
{
- struct aio_ring_info *info = &ioctx->ring_info;
+ struct aio_ring_info *info = &ctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
+ unsigned pos;
+ int ret = 0, copy_ret;
- ring = kmap_atomic(info->ring_pages[0]);
- pr_debug("h%u t%u m%u\n", ring->head, ring->tail, ring->nr);
+ pr_debug("h%u t%u m%u\n", *head, info->tail, info->nr);
- if (ring->head == ring->tail)
- goto out;
+ while (ret < nr) {
+ unsigned i = (*head < info->tail ? info->tail : info->nr) - *head;
+ struct io_event *ev;
+ struct page *page;
+
+ if (*head == info->tail)
+ break;
+
+ i = min_t(int, i, nr - ret);
+ i = min_t(int, i, AIO_EVENTS_PER_PAGE -
+ ((*head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
+
+ pos = *head + AIO_EVENTS_OFFSET;
+ page = info->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+ pos %= AIO_EVENTS_PER_PAGE;
- spin_lock(&info->ring_lock);
-
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head);
- *ent = *evp;
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- put_aio_ring_event(evp);
+ ev = kmap(page);
+ copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * i);
+ kunmap(page);
+
+ if (unlikely(copy_ret))
+ return -EFAULT;
+
+ ret += i;
+ *head += i;
+ *head %= info->nr;
}
- spin_unlock(&info->ring_lock);
-out:
+ smp_mb(); /* finish reading the event before updating the head */
+
+ ring = kmap_atomic(info->ring_pages[0]);
+ ring->head = *head;
kunmap_atomic(ring);
- pr_debug("%d h%u t%u\n", ret, ring->head, ring->tail);
+
+ pr_debug("%d h%u t%u\n", ret, *head, info->tail);
+
return ret;
}
static int read_events(struct kioctx *ctx,
- long min_nr, long nr,
- struct io_event __user *event,
- struct timespec __user *timeout)
+ long min_nr, long nr,
+ struct io_event __user *event,
+ struct timespec __user *timeout)
{
DEFINE_WAIT(wait);
+ struct aio_ring_info *info = &ctx->ring_info;
+ struct aio_ring *ring;
struct hrtimer_sleeper t;
+ unsigned head;
size_t i = 0;
- int ret;
- struct io_event ent;
+ int ret = 0;
- /* needed to zero any padding within an entry (there shouldn't be
- * any, but C is fun!
- */
- memset(&ent, 0, sizeof(ent));
- ret = 0;
- while (likely(i < nr)) {
- ret = aio_read_evt(ctx, &ent);
- if (unlikely(ret <= 0))
- break;
+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init_sleeper(&t, current);
- pr_debug("%Lx %Lx %Lx %Lx\n",
- ent.data, ent.obj, ent.res, ent.res2);
+ mutex_lock(&info->ring_lock);
- /* Could we split the check in two? */
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- pr_debug("lost an event due to EFAULT.\n");
+ while (i < nr) {
+ ring = kmap_atomic(info->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+retry:
+ ret = aio_read_events(ctx, event + i, nr - i, &head);
+ if (ret < 0)
break;
- }
- ret = 0;
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
- }
-
- if (min_nr <= i)
- return i;
- if (ret)
- return ret;
-
- /* End fast path */
+ i += ret;
+ if (i >= min_nr)
+ break;
+ if (unlikely(atomic_read(&ctx->dead))) {
+ ret = -EINVAL;
+ break;
+ }
+ if (!t.task) /* Only check after read evt */
+ break;
- hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer_init_sleeper(&t, current);
+ if (timeout) {
+ struct timespec ts;
- if (timeout) {
- struct timespec ts;
+ if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) {
+ ret = -EFAULT;
+ break;
+ }
- if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) {
- ret = -EFAULT;
- goto out;
+ timeout = NULL;
+ hrtimer_start_range_ns(&t.timer, timespec_to_ktime(ts),
+ current->timer_slack_ns,
+ HRTIMER_MODE_REL);
}
- hrtimer_start_range_ns(&t.timer, timespec_to_ktime(ts),
- current->timer_slack_ns, HRTIMER_MODE_REL);
- }
-
- while (likely(i < nr)) {
prepare_to_wait_exclusive(&ctx->wait, &wait,
TASK_INTERRUPTIBLE);
- do {
- ret = aio_read_evt(ctx, &ent);
- if (ret)
- break;
- if (min_nr <= i)
- break;
- if (unlikely(atomic_read(&ctx->dead))) {
- ret = -EINVAL;
- break;
- }
- if (!t.task) /* Only check after read evt */
- break;
- /* Try to only show up in io wait if there are ops
- * in flight */
- if (atomic_read(&ctx->reqs_active))
- io_schedule();
- else
- schedule();
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
- /*ret = aio_read_evt(ctx, &ent);*/
- } while (1) ;
+ if (head != info->tail) {
+ __set_current_state(TASK_RUNNING);
+ goto retry;
+ }
- finish_wait(&ctx->wait, &wait);
+ mutex_unlock(&info->ring_lock);
- if (unlikely(ret <= 0))
- break;
+ /* Try to only show up in io wait if there are ops in flight */
+ if (atomic_read(&ctx->reqs_active))
+ io_schedule();
+ else
+ schedule();
- ret = -EFAULT;
- if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
- pr_debug("lost an event due to EFAULT.\n");
- break;
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
}
- /* Good, event copied to userland, update counts. */
- event ++;
- i ++;
+ __set_current_state(TASK_RUNNING);
+ mutex_lock(&info->ring_lock);
}
+
+ mutex_unlock(&info->ring_lock);
out:
+ finish_wait(&ctx->wait, &wait);
hrtimer_cancel(&t.timer);
destroy_hrtimer_on_stack(&t.timer);
return i ? i : ret;
next prev parent reply other threads:[~2012-11-30 0:20 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-11-28 16:43 [PATCH 00/25] AIO performance improvements/cleanups Kent Overstreet
2012-11-28 16:43 ` [PATCH 01/25] mm: remove old aio use_mm() comment Kent Overstreet
2012-11-28 16:43 ` [PATCH 02/25] aio: remove dead code from aio.h Kent Overstreet
2012-11-28 16:43 ` [PATCH 03/25] gadget: remove only user of aio retry Kent Overstreet
2012-11-28 16:43 ` [PATCH 04/25] aio: remove retry-based AIO Kent Overstreet
2012-11-28 16:43 ` [PATCH 05/25] char: add aio_{read,write} to /dev/{null,zero} Kent Overstreet
2012-11-28 16:43 ` [PATCH 06/25] aio: Kill return value of aio_complete() Kent Overstreet
2012-11-28 16:43 ` [PATCH 07/25] aio: kiocb_cancel() Kent Overstreet
2012-11-29 0:07 ` Zach Brown
2012-11-29 0:58 ` Kent Overstreet
2012-11-28 16:43 ` [PATCH 08/25] aio: Move private stuff out of aio.h Kent Overstreet
2012-11-28 16:43 ` [PATCH 09/25] aio: dprintk() -> pr_debug() Kent Overstreet
2012-11-28 16:43 ` [PATCH 10/25] aio: do fget() after aio_get_req() Kent Overstreet
2012-11-28 16:43 ` [PATCH 11/25] aio: Make aio_put_req() lockless Kent Overstreet
2012-11-28 16:43 ` [PATCH 12/25] aio: Refcounting cleanup Kent Overstreet
2012-11-29 0:17 ` Zach Brown
2012-11-29 1:12 ` Kent Overstreet
2012-11-29 0:46 ` Benjamin LaHaise
2012-11-29 1:38 ` Kent Overstreet
2012-11-28 16:43 ` [PATCH 13/25] aio: Convert read_events() to hrtimers Kent Overstreet
2012-11-29 0:24 ` Zach Brown
2012-11-29 1:05 ` Kent Overstreet
2012-11-28 16:43 ` [PATCH 14/25] aio: Make aio_read_evt() more efficient Kent Overstreet
2012-11-29 0:38 ` Zach Brown
2012-11-29 19:31 ` Kent Overstreet
2012-11-30 0:20 ` Kent Overstreet [this message]
2012-11-28 16:43 ` [PATCH 15/25] aio: Use cancellation list lazily Kent Overstreet
2012-11-28 16:43 ` [PATCH 16/25] aio: Change reqs_active to include unreaped completions Kent Overstreet
2012-11-28 16:43 ` [PATCH 17/25] aio: Kill batch allocation Kent Overstreet
2012-11-28 16:43 ` [PATCH 18/25] aio: Kill struct aio_ring_info Kent Overstreet
2012-11-28 16:43 ` [PATCH 19/25] aio: Give shared kioctx fields their own cachelines Kent Overstreet
2012-11-28 16:43 ` [PATCH 20/25] aio: reqs_active -> reqs_available Kent Overstreet
2012-11-28 16:43 ` [PATCH 21/25] aio: percpu reqs_available Kent Overstreet
2012-11-28 16:43 ` [PATCH 22/25] Generic dynamic per cpu refcounting Kent Overstreet
2012-11-29 18:45 ` Andi Kleen
2012-11-29 18:57 ` Kent Overstreet
2012-11-29 18:59 ` Andi Kleen
2012-11-29 19:12 ` Kent Overstreet
2012-11-29 19:20 ` Andi Kleen
2012-11-29 19:29 ` Kent Overstreet
2012-11-29 19:34 ` Benjamin LaHaise
2012-11-29 20:22 ` Kent Overstreet
2012-11-29 20:42 ` Andi Kleen
2012-11-29 20:45 ` Kent Overstreet
2012-11-29 20:54 ` Andi Kleen
2012-11-29 20:59 ` Kent Overstreet
2012-11-29 21:57 ` Jamie Lokier
2012-11-28 16:43 ` [PATCH 23/25] aio: Percpu ioctx refcount Kent Overstreet
2012-11-28 16:43 ` [PATCH 24/25] aio: use xchg() instead of completion_lock Kent Overstreet
2012-11-28 16:43 ` [PATCH 25/25] aio: Don't include aio.h in sched.h Kent Overstreet
2012-11-29 0:03 ` [PATCH 00/25] AIO performance improvements/cleanups Zach Brown
2012-11-29 19:01 ` Kent Overstreet
-- strict thread matches above, loose matches on Subject: below --
2012-11-28 3:19 Kent Overstreet
2012-11-28 3:19 ` [PATCH 14/25] aio: Make aio_read_evt() more efficient Kent Overstreet
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20121130002052.GM15094@google.com \
--to=koverstreet@google.com \
--cc=axboe@kernel.dk \
--cc=bcrl@kvack.org \
--cc=jmoyer@redhat.com \
--cc=linux-aio@kvack.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=viro@zeniv.linux.org.uk \
--cc=zab@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).