* [patch] convert aio event reap to use atomic-op instead of spin_lock
@ 2007-04-10 23:53 Ken Chen
2007-04-11 5:18 ` Andrew Morton
2007-04-11 18:00 ` Zach Brown
0 siblings, 2 replies; 20+ messages in thread
From: Ken Chen @ 2007-04-10 23:53 UTC (permalink / raw)
To: akpm, linux-aio, linux-kernel
Resurrect an old patch that uses atomic operation to update ring buffer
index on AIO event queue. This work allows further application/libaio
optimization to run fast path io_getevents in user space.
I've also added one more change on top of old implementation that rounds
ring buffer size to power of 2 to allow fast head/tail calculation. With
the new scheme, there is no more modulo operation on them and we simply
increment either pointer index directly. This scheme also automatically
handles integer wrap nicely without any additional special treatment.
Signed-off-by: Ken Chen <kenchen@google.com>
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6..00ecd14 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx
unsigned long size;
int nr_pages;
- /* Compensate for the ring buffer's head/tail overlap entry */
- nr_events += 2; /* 1 is required, 2 for good luck */
+ /* round nr_event to next power of 2 */
+ nr_events = roundup_pow_of_two(nr_events);
size = sizeof(struct aio_ring);
size += sizeof(struct io_event) * nr_events;
@@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx
if (nr_pages < 0)
return -EINVAL;
- nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
-
info->nr = 0;
info->ring_pages = info->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
@@ -177,8 +175,8 @@ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE /
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr, km) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
+#define aio_ring_event(info, __nr, km) ({ \
+ unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \
struct io_event *__event; \
__event = kmap_atomic( \
(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
@@ -220,7 +218,6 @@ static struct kioctx *ioctx_alloc(unsign
atomic_set(&ctx->users, 1);
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
@@ -927,7 +924,7 @@ int fastcall aio_complete(struct kiocb *
struct aio_ring *ring;
struct io_event *event;
unsigned long flags;
- unsigned long tail;
+ unsigned tail;
int ret;
/*
@@ -969,8 +966,6 @@ int fastcall aio_complete(struct kiocb *
tail = info->tail;
event = aio_ring_event(info, tail, KM_IRQ0);
- if (++tail >= info->nr)
- tail = 0;
event->obj = (u64)(unsigned long)iocb->ki_obj.user;
event->data = iocb->ki_user_data;
@@ -986,13 +981,14 @@ int fastcall aio_complete(struct kiocb *
*/
smp_wmb(); /* make event visible before updating tail */
+ tail++;
info->tail = tail;
ring->tail = tail;
put_aio_ring_event(event, KM_IRQ0);
kunmap_atomic(ring, KM_IRQ1);
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ pr_debug("added to ring %p at [%u]\n", iocb, tail);
put_rq:
/* everything turned out well, dispose of the aiocb. */
ret = __aio_put_req(ctx, iocb);
@@ -1007,42 +1003,36 @@ put_rq:
/* aio_read_evt
* Pull an event off of the ioctx's event ring. Returns the number of
* events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
* TODO: make the ringbuffer user mmap()able (requires FIXME).
*/
static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
{
struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
+ struct io_event *evp;
+ unsigned head;
+ int ret;
ring = kmap_atomic(info->ring_pages[0], KM_USER0);
- dprintk("in aio_read_evt h%lu t%lu m%lu\n",
- (unsigned long)ring->head, (unsigned long)ring->tail,
- (unsigned long)ring->nr);
-
- if (ring->head == ring->tail)
- goto out;
-
- spin_lock(&info->ring_lock);
+ dprintk("in aio_read_evt h%u t%u m%u\n",
+ ring->head, ring->tail, ring->nr);
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+ do {
+ head = ring->head;
+ if (head == ring->tail) {
+ ret = 0;
+ break;
+ }
+ evp = aio_ring_event(info, head, KM_USER1);
*ent = *evp;
- head = (head + 1) % info->nr;
smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
ret = 1;
put_aio_ring_event(evp, KM_USER1);
- }
- spin_unlock(&info->ring_lock);
+ } while (head != cmpxchg(&ring->head, head, head + 1));
-out:
kunmap_atomic(ring, KM_USER0);
- dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
- (unsigned long)ring->head, (unsigned long)ring->tail);
+ dprintk("leaving aio_read_evt: %d h%u t%u\n", ret,
+ ring->head, ring->tail);
return ret;
}
diff --git a/include/linux/aio.h b/include/linux/aio.h
index a30ef13..42ca8d3 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -152,11 +152,10 @@ struct aio_ring {
unsigned incompat_features;
unsigned header_length; /* size of aio_ring */
-
- struct io_event io_events[0];
+ struct io_event io_events[0];
}; /* 128 bytes + ring size */
-#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr)
+#define aio_ring_avail(info, ring) ((info)->nr + (ring)->head - (ring)->tail)
#define AIO_RING_PAGES 8
struct aio_ring_info {
@@ -164,7 +163,6 @@ struct aio_ring_info {
unsigned long mmap_size;
struct page **ring_pages;
- spinlock_t ring_lock;
long nr_pages;
unsigned nr, tail;
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-10 23:53 [patch] convert aio event reap to use atomic-op instead of spin_lock Ken Chen
@ 2007-04-11 5:18 ` Andrew Morton
2007-04-11 16:54 ` Ken Chen
2007-04-11 18:00 ` Zach Brown
1 sibling, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2007-04-11 5:18 UTC (permalink / raw)
To: Ken Chen; +Cc: linux-aio, linux-kernel
On Tue, 10 Apr 2007 16:53:53 -0700 (PDT) kenchen@google.com (Ken Chen) wrote:
> + } while (head != cmpxchg(&ring->head, head, head + 1));
A hasty grep indicates that only 14 out of 23 architectures implement
cmpxchg().
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 5:18 ` Andrew Morton
@ 2007-04-11 16:54 ` Ken Chen
2007-04-11 18:15 ` Zach Brown
0 siblings, 1 reply; 20+ messages in thread
From: Ken Chen @ 2007-04-11 16:54 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-aio, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 564 bytes --]
On 4/10/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Tue, 10 Apr 2007 16:53:53 -0700 (PDT) kenchen@google.com (Ken Chen) wrote:
>
> > + } while (head != cmpxchg(&ring->head, head, head + 1));
>
> A hasty grep indicates that only 14 out of 23 architectures implement
> cmpxchg().
Sorry I wasn't thorough enough. And partially because I was worried
about changing structure type for user space facing struct aio_ring.
Now that I looked through all arches, it looks safe as all arch's
atomic_t has the same size as int.
Here is the updated patch.
[-- Attachment #2: aio-use-atomic.patch --]
[-- Type: text/x-patch, Size: 5718 bytes --]
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6..091bcb4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx)
unsigned long size;
int nr_pages;
- /* Compensate for the ring buffer's head/tail overlap entry */
- nr_events += 2; /* 1 is required, 2 for good luck */
+ /* round nr_event to next power of 2 */
+ nr_events = roundup_pow_of_two(nr_events);
size = sizeof(struct aio_ring);
size += sizeof(struct io_event) * nr_events;
@@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx)
if (nr_pages < 0)
return -EINVAL;
- nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
-
info->nr = 0;
info->ring_pages = info->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
@@ -159,7 +157,8 @@ static int aio_setup_ring(struct kioctx *ctx)
ring = kmap_atomic(info->ring_pages[0], KM_USER0);
ring->nr = nr_events; /* user copy */
ring->id = ctx->user_id;
- ring->head = ring->tail = 0;
+ atomic_set(&ring->head, 0);
+ ring->tail = 0;
ring->magic = AIO_RING_MAGIC;
ring->compat_features = AIO_RING_COMPAT_FEATURES;
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
@@ -177,8 +176,8 @@ static int aio_setup_ring(struct kioctx *ctx)
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr, km) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
+#define aio_ring_event(info, __nr, km) ({ \
+ unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \
struct io_event *__event; \
__event = kmap_atomic( \
(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
@@ -220,7 +219,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
atomic_set(&ctx->users, 1);
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
@@ -927,7 +925,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
struct aio_ring *ring;
struct io_event *event;
unsigned long flags;
- unsigned long tail;
+ unsigned tail;
int ret;
/*
@@ -969,8 +967,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
tail = info->tail;
event = aio_ring_event(info, tail, KM_IRQ0);
- if (++tail >= info->nr)
- tail = 0;
event->obj = (u64)(unsigned long)iocb->ki_obj.user;
event->data = iocb->ki_user_data;
@@ -986,13 +982,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
*/
smp_wmb(); /* make event visible before updating tail */
+ tail++;
info->tail = tail;
ring->tail = tail;
put_aio_ring_event(event, KM_IRQ0);
kunmap_atomic(ring, KM_IRQ1);
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ pr_debug("added to ring %p at [%u]\n", iocb, tail);
put_rq:
/* everything turned out well, dispose of the aiocb. */
ret = __aio_put_req(ctx, iocb);
@@ -1007,42 +1004,36 @@ put_rq:
/* aio_read_evt
* Pull an event off of the ioctx's event ring. Returns the number of
* events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
* TODO: make the ringbuffer user mmap()able (requires FIXME).
*/
static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
{
struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
+ struct io_event *evp;
+ unsigned head;
+ int ret;
ring = kmap_atomic(info->ring_pages[0], KM_USER0);
- dprintk("in aio_read_evt h%lu t%lu m%lu\n",
- (unsigned long)ring->head, (unsigned long)ring->tail,
- (unsigned long)ring->nr);
-
- if (ring->head == ring->tail)
- goto out;
-
- spin_lock(&info->ring_lock);
+ dprintk("in aio_read_evt h%u t%u m%u\n",
+ atomic_read(&ring->head), ring->tail, ring->nr);
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+ do {
+ head = atomic_read(&ring->head);
+ if (head == ring->tail) {
+ ret = 0;
+ break;
+ }
+ evp = aio_ring_event(info, head, KM_USER1);
*ent = *evp;
- head = (head + 1) % info->nr;
smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
ret = 1;
put_aio_ring_event(evp, KM_USER1);
- }
- spin_unlock(&info->ring_lock);
+ } while (head != atomic_cmpxchg(&ring->head, head, head + 1));
-out:
kunmap_atomic(ring, KM_USER0);
- dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
- (unsigned long)ring->head, (unsigned long)ring->tail);
+ dprintk("leaving aio_read_evt: %d h%u t%u\n", ret,
+ atomic_read(&ring->head), ring->tail);
return ret;
}
diff --git a/include/linux/aio.h b/include/linux/aio.h
index a30ef13..aebb135 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -144,7 +144,7 @@ struct kiocb {
struct aio_ring {
unsigned id; /* kernel internal index number */
unsigned nr; /* number of io_events */
- unsigned head;
+ atomic_t head;
unsigned tail;
unsigned magic;
@@ -152,11 +152,11 @@ struct aio_ring {
unsigned incompat_features;
unsigned header_length; /* size of aio_ring */
-
- struct io_event io_events[0];
+ struct io_event io_events[0];
}; /* 128 bytes + ring size */
-#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr)
+#define aio_ring_avail(info, ring) \
+ ((info)->nr + (unsigned) atomic_read(&(ring)->head) - (ring)->tail)
#define AIO_RING_PAGES 8
struct aio_ring_info {
@@ -164,7 +164,6 @@ struct aio_ring_info {
unsigned long mmap_size;
struct page **ring_pages;
- spinlock_t ring_lock;
long nr_pages;
unsigned nr, tail;
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-10 23:53 [patch] convert aio event reap to use atomic-op instead of spin_lock Ken Chen
2007-04-11 5:18 ` Andrew Morton
@ 2007-04-11 18:00 ` Zach Brown
2007-04-11 19:11 ` Andrew Morton
` (2 more replies)
1 sibling, 3 replies; 20+ messages in thread
From: Zach Brown @ 2007-04-11 18:00 UTC (permalink / raw)
To: Ken Chen; +Cc: akpm, linux-aio, linux-kernel
First, I'll NAK this and all AIO patches until the patch description
says that it's been run through the regression tests that we've started
collecting in autotest. They're trivial to run, never fear:
cd /usr/local
svn checkout svn://test.kernel.org/autotest/trunk/client autotest
cd autotest
bin/autotest tests/aio_dio_bugs/control
(We'll be moving this to LTP soon, I think, but that will just change
the commands to cut and paste.)
> Resurrect an old patch that uses atomic operation to update ring buffer
> index on AIO event queue. This work allows futher application/libaio
> optimization to run fast path io_getevents in user space.
I have mixed feelings. I think the userspace getevents support was
poorly designed and the simple fact that we've gone this long without it
says just how desperately the feature isn't needed.
At the same time, it's a small change and the code is already there. So
it's hard to get too worked up about it. We can always remove support
in the future by never changing the indexes if we require that apps fall
back to sys_io_getevents().
In any case, let's see a test which exercises userspace event collection
before we claim to support it. I'd prefer a little stand-alone C app
like the ones in autotest for simple unit testing. A fio patch to
stress test it would probably also be well received.
> I've also added one more change on top of old implementation that rounds
> ring buffer size to power of 2 to allow fast head/tail calculation. With
> the new scheme, there is no more modulo operation on them and we simply
> increment either pointer index directly.
Please quantify the improvement. We're allocating more ring elements
than userspace asked for and than were accounted for in aio_max_nr.
That cost, however teeny, will be measured against the benefit.
> - /* Compensate for the ring buffer's head/tail overlap entry */
> - nr_events += 2; /* 1 is required, 2 for good luck */
> + /* round nr_event to next power of 2 */
> + nr_events = roundup_pow_of_two(nr_events);
Is that buggy? How will the code tell if head == tail means a full ring
or an empty ring? The old code added that extra event to let it tell
the ring was full before head == tail and it would think it's empty
again, I think. I'm guessing roundup(nr_events + 1) is needed.
> -#define aio_ring_event(info, nr, km) ({ \
> - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
> +#define aio_ring_event(info, __nr, km) ({ \
> + unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \
> struct io_event *__event; \
> __event = kmap_atomic( \
> (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
(info) is now evaluated twice. __info = (info), please, just like
__event there.
> put_aio_ring_event(event, KM_IRQ0);
> kunmap_atomic(ring, KM_IRQ1);
The ring page, which is mmaped to userspace at some weird address, was
just written through a kmap. Do we need a flush_dcache_page()? This
isn't this patch's problem, but it'd need to be addressed if we're using
the ring from userspace.
> -
> - struct io_event io_events[0];
> + struct io_event io_events[0];
Try to minimize gratuitous reformatting, please.
- z
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 16:54 ` Ken Chen
@ 2007-04-11 18:15 ` Zach Brown
2007-04-11 18:29 ` Ken Chen
0 siblings, 1 reply; 20+ messages in thread
From: Zach Brown @ 2007-04-11 18:15 UTC (permalink / raw)
To: Ken Chen; +Cc: Andrew Morton, linux-aio, linux-kernel
> Sorry I wasn't thorough enough. And partially because I was worried
> about changing structure type for user space facing struct aio_ring.
> Now that I looked through all arches, it looks safe as all arch's
> atomic_t has the same size as int.
> Here is the updated patch.
> @@ -144,7 +144,7 @@ struct kiocb {
> struct aio_ring {
> unsigned id; /* kernel internal index number */
> unsigned nr; /* number of io_events */
> - unsigned head;
> + atomic_t head;
> unsigned tail;
Embedding an atomic_t in an ABI struct? That makes everyone else
nervous too, right?
It may look safe on i386/x86-64 today, but this doesn't seem like wise
practice. Is there any reason to believe that atomic_t will never
change size? Does anything else do this already?
If nothing else, the "unsigned" (should be __u32, sigh) could be cast to
an atomic_t.
Is being able to do atomic work on a u32 between the kernel and
userspace something that all archs have support for? I mean, take the
fact that userspace and the kernel could both be doing these atomic ops
on different virtual addresses and so conceivably different cachelines.
Is that a problem for anyone?
I do find myself wondering if the notion of userspace ring
synchronization shouldn't be built around futexes. They weren't around
when this mmap()ed ring business was created.
- z
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 18:15 ` Zach Brown
@ 2007-04-11 18:29 ` Ken Chen
0 siblings, 0 replies; 20+ messages in thread
From: Ken Chen @ 2007-04-11 18:29 UTC (permalink / raw)
To: Zach Brown; +Cc: Andrew Morton, linux-aio, linux-kernel
On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote:
> > Sorry I wasn't thorough enough. And partially because I was worried
> > about changing structure type for user space facing struct aio_ring.
> > Now that I looked through all arches, it looks safe as all arch's
> > atomic_t has the same size as int.
>
> > Here is the updated patch.
>
> > @@ -144,7 +144,7 @@ struct kiocb {
> > struct aio_ring {
> > unsigned id; /* kernel internal index number */
> > unsigned nr; /* number of io_events */
> > - unsigned head;
> > + atomic_t head;
> > unsigned tail;
>
> Embedding an atomic_t in an ABI struct? That makes everyone else
> nervous too, right?
sure, make me nervous at least.
> It may look safe on i386/x86-64 today, but this doesn't seem like wise
> practice. Is there any reason to believe that atomic_t will never
> change size? Does anything else do this already?
>
> If nothing else, the "unsigned" (should be __u32, sigh) could be cast to
> an atomic_t.
The atomic_t cast is equally iffy. I don't know what would be a best solution.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 18:00 ` Zach Brown
@ 2007-04-11 19:11 ` Andrew Morton
2007-04-11 19:34 ` Zach Brown
2007-04-11 19:28 ` Ken Chen
2007-04-12 7:50 ` Ken Chen
2 siblings, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2007-04-11 19:11 UTC (permalink / raw)
To: Zach Brown; +Cc: Ken Chen, linux-aio, linux-kernel
On Wed, 11 Apr 2007 11:00:38 -0700
Zach Brown <zach.brown@oracle.com> wrote:
> > - /* Compensate for the ring buffer's head/tail overlap entry */
> > - nr_events += 2; /* 1 is required, 2 for good luck */
> > + /* round nr_event to next power of 2 */
> > + nr_events = roundup_pow_of_two(nr_events);
>
> Is that buggy? How will the code tell if head == tail means a full ring
> or an empty ring? The old code added that extra event to let it tell
> the ring was full before head == tail and it would think it's empty
> again, I think. I'm guessing roundup(nr_events + 1) is needed.
Ken uses the other (superior!) way of implementing ringbuffers: the head
and tail pointers (the naming of which AIO appears to have reversed) are
not constrained to the ringsize - they are simply allowed to wrap through
0xffffffff. Consequently:
ring full: (head-tail == size)
ring empty: head==tail
numer-of-items-in-ring: head-tail
add to ring: ring[head++]=item
remove from ring: item=ring[tail++]
(adjust the above for AIO naming assbackwardness)
(requires that size be a power of 2)
Many net drivers do it this way for their DMA rings.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 18:00 ` Zach Brown
2007-04-11 19:11 ` Andrew Morton
@ 2007-04-11 19:28 ` Ken Chen
2007-04-11 19:44 ` Zach Brown
` (2 more replies)
2007-04-12 7:50 ` Ken Chen
2 siblings, 3 replies; 20+ messages in thread
From: Ken Chen @ 2007-04-11 19:28 UTC (permalink / raw)
To: Zach Brown; +Cc: akpm, linux-aio, linux-kernel
On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote:
> First, I'll NAK this and all AIO patches until the patch description
> says that it's been run through the regression tests that we've started
> collecting in autotest. They're trivial to run, never fear:
OK. I will run those regression tests.
> > Resurrect an old patch that uses atomic operation to update ring buffer
> > index on AIO event queue. This work allows futher application/libaio
> > optimization to run fast path io_getevents in user space.
>
> I have mixed feelings. I think the userspace getevents support was
> poorly designed and the simple fact that we've gone this long without it
> says just how desperately the feature isn't needed.
I kept on getting requests from application developers who want that
feature. My initial patch was dated back May 2004.
> We're allocating more ring elements
> than userspace asked for and than were accounted for in aio_max_nr.
> That cost, however teeny, will be measured against the benefit.
I noticed that while writing the patch. We have the same bug right
now that nr_events is enlarged to consume the entire mmap'ed ring
buffer pages. But yet, we don't account those in aio_max_nr. I
didn't fix that up in this patch because I was going to do a separate
patch on that.
> > - /* Compensate for the ring buffer's head/tail overlap entry */
> > - nr_events += 2; /* 1 is required, 2 for good luck */
> > + /* round nr_event to next power of 2 */
> > + nr_events = roundup_pow_of_two(nr_events);
>
> Is that buggy? How will the code tell if head == tail means a full ring
> or an empty ring? The old code added that extra event to let it tell
> the ring was full before head == tail and it would think it's empty
> again, I think. I'm guessing roundup(nr_events + 1) is needed.
I don't think it's a bug. akpm taught me the trick: we don't wrap the
index around the ring size in this scheme, so the extra space is not
needed.
> The ring page, which is mmaped to userspace at some weird address, was
> just written through a kmap. Do we need a flush_dcache_page()? This
> isn't this patch's problem, but it'd need to be addressed if we're using
> the ring from userspace.
I will look into this aside from this patch.
- Ken
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 19:11 ` Andrew Morton
@ 2007-04-11 19:34 ` Zach Brown
0 siblings, 0 replies; 20+ messages in thread
From: Zach Brown @ 2007-04-11 19:34 UTC (permalink / raw)
To: Andrew Morton; +Cc: Ken Chen, linux-aio, linux-kernel
> Ken uses the other (superior!) way of implementing ringbuffers: the head
> and tail pointers (the naming of which AIO appears to have reversed) are
> not constrained to the ringsize - they are simply allowed to wrap through
> 0xfffffff.
A-ha! That sure sounds great.
I'd be happy to see the kernel side fs/aio.c ring management move in
that direction, sure, with a specific patch to do that.
I get less excited when we talk about changing the index semantics as
part of supporting serialization of the ring with userspace.
- z
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 19:28 ` Ken Chen
@ 2007-04-11 19:44 ` Zach Brown
2007-04-11 19:45 ` Benjamin LaHaise
2007-04-12 7:29 ` Ken Chen
2 siblings, 0 replies; 20+ messages in thread
From: Zach Brown @ 2007-04-11 19:44 UTC (permalink / raw)
To: Ken Chen; +Cc: akpm, linux-aio, linux-kernel
> >I have mixed feelings. I think the userspace getevents support was
> >poorly designed and the simple fact that we've gone this long without it
> >says just how desperately the feature isn't needed.
>
> I kept on getting requests from application developers who want that
> feature. My initial patch was dated back May 2004.
My point is that demand for it doesn't seem to be very high, and your
use of the past tense there seems to support that. And the patches to
support it are not looking great.
Is it worth it, then? What difference are you measuring so far?
- z
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 19:28 ` Ken Chen
2007-04-11 19:44 ` Zach Brown
@ 2007-04-11 19:45 ` Benjamin LaHaise
2007-04-11 19:52 ` Zach Brown
2007-04-12 7:29 ` Ken Chen
2 siblings, 1 reply; 20+ messages in thread
From: Benjamin LaHaise @ 2007-04-11 19:45 UTC (permalink / raw)
To: Ken Chen; +Cc: Zach Brown, akpm, linux-aio, linux-kernel
On Wed, Apr 11, 2007 at 12:28:26PM -0700, Ken Chen wrote:
> >I have mixed feelings. I think the userspace getevents support was
> >poorly designed and the simple fact that we've gone this long without it
> >says just how desperately the feature isn't needed.
>
> I kept on getting requests from application developers who want that
> feature. My initial patch was dated back May 2004.
The right way to do it involves synchronization between the kernel side
io_getevents() and the userspace code pulling events out of the ring.
Alan Cox suggested embedding a futex in the shared memory region, but I
don't think anyone ever implemented that.
> I noticed that while writing the patch. We have the same bug right
> now that nr_events is enlarged to consume the entire mmap'ed ring
> buffer pages. But yet, we don't account those in aio_max_nr. I
> didn't fix that up in this patch because I was going to do a separate
> patch on that.
It's not accounted because it is irrelevant -- the memory used by the
events is already accounted against the user's address space. The actual
number of aios in flight is clamped to the maximum number of requests
that was specified and accounted against the global reservation. The
number of events in the ringbuffer can be larger, and that doesn't hurt
anything.
> >The ring page, which is mmaped to userspace at some weird address, was
> >just written through a kmap. Do we need a flush_dcache_page()? This
> >isn't this patch's problem, but it'd need to be addressed if we're using
> >the ring from userspace.
>
> I will look into this aside from this patch.
That's probably the case. Also, any changes in this area *must* correctly
update the compat/incompat feature flags in the ring buffer header. That
has been missed in the past...
-ben
--
"Time is of no importance, Mr. President, only life is important."
Don't Email: <zyntrop@kvack.org>.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 19:45 ` Benjamin LaHaise
@ 2007-04-11 19:52 ` Zach Brown
2007-04-11 20:03 ` Benjamin LaHaise
0 siblings, 1 reply; 20+ messages in thread
From: Zach Brown @ 2007-04-11 19:52 UTC (permalink / raw)
To: Benjamin LaHaise; +Cc: Ken Chen, akpm, linux-aio, linux-kernel
> > I kept on getting requests from application developers who want that
> > feature. My initial patch was dated back May 2004.
>
> The right way to do it involves synchronization between the kernel side
> io_getevents() and the userspace code pulling events out of the ring.
> Alan Cox suggested embedding a futex in the shared memory region, but I
> don't think anyone ever implemented that.
Yeah, I like the idea of futexes.
I'm worried that virtual aliasing spells doom for the current
home-brewed serialization that fs/aio.c is doing with the shared ring
head/tail accesses. Am I worrying about nothing here?
> > I will look into this aside from this patch.
>
> That's probably the case. Also, any changes in this area *must* correctly
> update the compat/incompat feature flags in the ring buffer header. That
> has been missed in the past...
Do you know of anyone using the current ring info ABI?
The *only* user I know of is the check of ctx->magic in libaio.
- z
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 19:52 ` Zach Brown
@ 2007-04-11 20:03 ` Benjamin LaHaise
0 siblings, 0 replies; 20+ messages in thread
From: Benjamin LaHaise @ 2007-04-11 20:03 UTC (permalink / raw)
To: Zach Brown; +Cc: Ken Chen, akpm, linux-aio, linux-kernel
On Wed, Apr 11, 2007 at 12:52:56PM -0700, Zach Brown wrote:
> I'm worried that virtual aliasing spells doom for the current
> home-brewed serialization that fs/aio.c is doing with the shared ring
> head/tail accesses. Am I worrying about nothing here?
Adding a flush_dcache_page() should fix that, but it might also need a
change in the layout of the ring buffer header to get things completely
correct. What I'm thinking of is that the head and tail bits might need
to be on different cachelines to ensure any aliasing that does occur will
not result in updates colliding. I'm not that much of an expert on virtually
aliased caches, though.
> > > I will look into this aside from this patch.
> >
> > That's probably the case. Also, any changes in this area *must* correctly
> > update the compat/incompat feature flags in the ring buffer header. That
> > has been missed in the past...
>
> Do you know of anyone using the current ring info ABI?
>
> The *only* user I know of is the check of ctx->magic in libaio.
But... any libaio implementing the new sematics must be able to run on old
kernels by falling back to the syscall when it notices that required bits
aren't set in the header. It's easy enough to implement the checks, they
just need to be carefully checked before being shipped.
-ben
--
"Time is of no importance, Mr. President, only life is important."
Don't Email: <zyntrop@kvack.org>.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 19:28 ` Ken Chen
2007-04-11 19:44 ` Zach Brown
2007-04-11 19:45 ` Benjamin LaHaise
@ 2007-04-12 7:29 ` Ken Chen
2007-04-12 12:06 ` Jeff Moyer
2 siblings, 1 reply; 20+ messages in thread
From: Ken Chen @ 2007-04-12 7:29 UTC (permalink / raw)
To: Zach Brown; +Cc: akpm, linux-aio, linux-kernel
On 4/11/07, Ken Chen <kenchen@google.com> wrote:
> On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote:
> > First, I'll NAK this and all AIO patches until the patch description
> > says that it's been run through the regression tests that we've started
> > collecting in autotest. They're trivial to run, never fear:
>
> OK. I will run those regression tests.
Unfortunately, the aio_dio_bugs test in autotest has a bug in it :-(
We need to stress test the "test code".
on stock 2.6.21-rc6 kernel:
[rock-me-baby]$ cd autotest/tests/aio_dio_bugs/src
[rock-me-baby]$ make
[rock-me-baby]$ ./aio-free-ring-with-bogus-nr-pages
aio-free-ring-with-bogus-nr-pages: Error: io_setup returned -22,
expected -ENOMEM
hmm??? The problem is that the test code forgot to initialize the ctx
variable and in the kernel, sys_io_setup returns EINVAL if the user
address contains a non-zero value.
I will submit the following patch to autotest to correct the test code.
--- aio-free-ring-with-bogus-nr-pages.c.orig 2007-04-11 23:57:45 -0700
+++ aio-free-ring-with-bogus-nr-pages.c 2007-04-11 23:57:59 -0700
@@ -39,7 +39,7 @@
int main(int __attribute__((unused)) argc, char **argv)
{
long res;
- io_context_t ctx;
+ io_context_t ctx = 0;
void* map;
while (1) {
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-11 18:00 ` Zach Brown
2007-04-11 19:11 ` Andrew Morton
2007-04-11 19:28 ` Ken Chen
@ 2007-04-12 7:50 ` Ken Chen
2007-04-12 14:31 ` Benjamin LaHaise
2 siblings, 1 reply; 20+ messages in thread
From: Ken Chen @ 2007-04-12 7:50 UTC (permalink / raw)
To: Zach Brown; +Cc: akpm, linux-aio, linux-kernel, Benjamin LaHaise
[-- Attachment #1: Type: text/plain, Size: 1927 bytes --]
On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote:
> First, I'll NAK this and all AIO patches until the patch description
> says that it's been run through the regression tests that we've started
> collecting in autotest. They're trivial to run, never fear:
>
> cd /usr/local
> svn checkout svn://test.kernel.org/autotest/trunk/client autotest
> cd autotest
> bin/autotest tests/aio_dio_bugs/control
I ran through the autotest (with bug fix in the test code). It passes
the regression tests. I made the following change since last rev:
* struct aio_ring stays unchanged. I added a cast to atomic_cmpxchg()
* aio_ring_event() has grown too big, I turned that into a function
* take out white space change from previous rev.
open issue that this patch does not try to fix:
* kmap address alias needs a flush_dcache_page
Zach, does this make you more comfortable? (or maybe less iffy is the
right word?)
From: Ken Chen <kenchen@google.com>
Resurrect an old patch that uses atomic operation to update ring buffer
index on AIO event queue. This work allows further application/libaio
optimization to run fast path io_getevents in user space.
I've also added one more change on top of old implementation that rounds
ring buffer size to power of 2 to allow fast head/tail calculation. With
the new scheme, there is no more modulo operation on them and we simply
increment either pointer index directly. This scheme also automatically
handles integer wrap nicely without any additional special treatment.
Patch was stress tested and in addition, tested on
autotest:aio_dio_bugs. The results were all pass: Output of
autotest:aio_dio_bugs:
ran for 200 seconds without error, passing
AIO read of last block in file succeeded.
aio-free-ring-with-bogus-nr-pages: Success!
aio-io-setup-with-nonwritable-context-pointer: Success!
GOOD aio_dio_bugs completed successfully
Signed-off-by: Ken Chen <kenchen@google.com>
[-- Attachment #2: aio-use-atomic.patch --]
[-- Type: text/x-patch, Size: 5106 bytes --]
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6..f8b8e45 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx)
unsigned long size;
int nr_pages;
- /* Compensate for the ring buffer's head/tail overlap entry */
- nr_events += 2; /* 1 is required, 2 for good luck */
+ /* round nr_event to next power of 2 */
+ nr_events = roundup_pow_of_two(nr_events);
size = sizeof(struct aio_ring);
size += sizeof(struct io_event) * nr_events;
@@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx)
if (nr_pages < 0)
return -EINVAL;
- nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
-
info->nr = 0;
info->ring_pages = info->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
@@ -177,14 +175,16 @@ static int aio_setup_ring(struct kioctx *ctx)
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr, km) ({ \
- unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
- struct io_event *__event; \
- __event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
- __event += pos % AIO_EVENTS_PER_PAGE; \
- __event; \
-})
+static struct io_event *aio_ring_event(struct aio_ring_info *info,
+ unsigned nr, enum km_type km)
+{
+ unsigned pos = (nr & (info->nr - 1)) + AIO_EVENTS_OFFSET;
+ struct io_event *event;
+
+ event = kmap_atomic(info->ring_pages[pos / AIO_EVENTS_PER_PAGE], km);
+ event += pos % AIO_EVENTS_PER_PAGE;
+ return event;
+}
#define put_aio_ring_event(event, km) do { \
struct io_event *__event = (event); \
@@ -220,7 +220,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
atomic_set(&ctx->users, 1);
spin_lock_init(&ctx->ctx_lock);
- spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
@@ -927,7 +926,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
struct aio_ring *ring;
struct io_event *event;
unsigned long flags;
- unsigned long tail;
+ unsigned tail;
int ret;
/*
@@ -969,8 +968,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
tail = info->tail;
event = aio_ring_event(info, tail, KM_IRQ0);
- if (++tail >= info->nr)
- tail = 0;
event->obj = (u64)(unsigned long)iocb->ki_obj.user;
event->data = iocb->ki_user_data;
@@ -986,13 +983,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
*/
smp_wmb(); /* make event visible before updating tail */
+ tail++;
info->tail = tail;
ring->tail = tail;
put_aio_ring_event(event, KM_IRQ0);
kunmap_atomic(ring, KM_IRQ1);
- pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+ pr_debug("added to ring %p at [%u]\n", iocb, tail);
put_rq:
/* everything turned out well, dispose of the aiocb. */
ret = __aio_put_req(ctx, iocb);
@@ -1007,39 +1005,35 @@ put_rq:
/* aio_read_evt
* Pull an event off of the ioctx's event ring. Returns the number of
* events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
* TODO: make the ringbuffer user mmap()able (requires FIXME).
*/
static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
{
struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
- unsigned long head;
- int ret = 0;
+ struct io_event *evp;
+ unsigned head;
+ int ret;
ring = kmap_atomic(info->ring_pages[0], KM_USER0);
dprintk("in aio_read_evt h%lu t%lu m%lu\n",
(unsigned long)ring->head, (unsigned long)ring->tail,
(unsigned long)ring->nr);
- if (ring->head == ring->tail)
- goto out;
-
- spin_lock(&info->ring_lock);
-
- head = ring->head % info->nr;
- if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+ do {
+ head = ring->head;
+ if (head == ring->tail) {
+ ret = 0;
+ break;
+ }
+ evp = aio_ring_event(info, head, KM_USER1);
*ent = *evp;
- head = (head + 1) % info->nr;
smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
ret = 1;
put_aio_ring_event(evp, KM_USER1);
- }
- spin_unlock(&info->ring_lock);
+ } while (head != atomic_cmpxchg((atomic_t *) &ring->head,
+ head, head + 1));
-out:
kunmap_atomic(ring, KM_USER0);
dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
(unsigned long)ring->head, (unsigned long)ring->tail);
diff --git a/include/linux/aio.h b/include/linux/aio.h
index a30ef13..0cd66a3 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -156,7 +156,7 @@ struct aio_ring {
struct io_event io_events[0];
}; /* 128 bytes + ring size */
-#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr)
+#define aio_ring_avail(info, ring) ((info)->nr + (ring)->head - (ring)->tail)
#define AIO_RING_PAGES 8
struct aio_ring_info {
@@ -164,7 +164,6 @@ struct aio_ring_info {
unsigned long mmap_size;
struct page **ring_pages;
- spinlock_t ring_lock;
long nr_pages;
unsigned nr, tail;
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-12 7:29 ` Ken Chen
@ 2007-04-12 12:06 ` Jeff Moyer
2007-04-13 0:57 ` Ken Chen
0 siblings, 1 reply; 20+ messages in thread
From: Jeff Moyer @ 2007-04-12 12:06 UTC (permalink / raw)
To: Ken Chen; +Cc: Zach Brown, akpm, linux-aio, linux-kernel
==> On Thu, 12 Apr 2007 00:29:28 -0700, "Ken Chen" <kenchen@google.com> said:
Ken> On 4/11/07, Ken Chen <kenchen@google.com> wrote:
Ken> > On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote:
Ken> > > First, I'll NAK this and all AIO patches until the patch
Ken> description > > says that it's been run through the regression
Ken> tests that we've started > > collecting in autotest. They're
Ken> trivial to run, never fear:
Ken> >
Ken> > OK. I will run those regression tests.
Ken> Unfortunately, the aio_dio_bugs test in autotest has bug in it
Ken> :-( We need stress test the "test code".
Ken> on stock 2.6.21-rc6 kernel:
Ken> [rock-me-baby]$ cd autotest/tests/aio_dio_bugs/src
Ken> [rock-me-baby]$ make [rock-me-baby]$
Ken> ./aio-free-ring-with-bogus-nr-pages
Ken> aio-free-ring-with-bogus-nr-pages: Error: io_setup returned -22,
Ken> expected -ENOMEM
Ken> hmm??? The problem is that the test code forgot to initialized
Ken> ctx variable and in the kernel, sys_io_setup returns EINVAL if
Ken> user address contain none-zero value.
Zach had already sent a fix for that. I wonder why it didn't go in.
I didn't see any response to Zach's request for code that actually
tests out the shared ring buffer. Do you have such code?
-Jeff
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-12 7:50 ` Ken Chen
@ 2007-04-12 14:31 ` Benjamin LaHaise
2007-04-12 15:13 ` Benjamin LaHaise
0 siblings, 1 reply; 20+ messages in thread
From: Benjamin LaHaise @ 2007-04-12 14:31 UTC (permalink / raw)
To: Ken Chen; +Cc: Zach Brown, akpm, linux-aio, linux-kernel
On Thu, Apr 12, 2007 at 12:50:39AM -0700, Ken Chen wrote:
> I ran through the autotest (with bug fix in the test code). It passes
> the regression tests. I made the following change since last rev:
By removing the spinlock around ring insertion, you've made it possible
for two events being inserted on different CPUs to end up creating
inconsistent state, as there is nothing which guarantees that resulting
event in the ring will be wholly one event or another.
Also, I don't like rounding up of the number of events to the nearest power
of two, as it can round things up significantly more than just going to the
end of the last actual page of memory allocated.
NAK.
-ben
PS base64 encoded patches are incredibly difficult to quote.
--
"Time is of no importance, Mr. President, only life is important."
Don't Email: <zyntrop@kvack.org>.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-12 14:31 ` Benjamin LaHaise
@ 2007-04-12 15:13 ` Benjamin LaHaise
0 siblings, 0 replies; 20+ messages in thread
From: Benjamin LaHaise @ 2007-04-12 15:13 UTC (permalink / raw)
To: Ken Chen; +Cc: Zach Brown, akpm, linux-aio, linux-kernel
On Thu, Apr 12, 2007 at 10:31:31AM -0400, Benjamin LaHaise wrote:
> On Thu, Apr 12, 2007 at 12:50:39AM -0700, Ken Chen wrote:
> > I ran through the autotest (with bug fix in the test code). It passes
> > the regression tests. I made the following change since last rev:
>
> By removing the spinlock around ring insertion, you've made it possible
> for two events being inserted on different CPUs to end up creating
> inconsistent state, as there is nothing which guarantees that resulting
> event in the ring will be wholely one event or another.
Ignore that, I misread the function it was applied to. -ENEEDCOFFEE. Yes,
that spinlock can go if we're doing cmpxchg().
-ben
--
"Time is of no importance, Mr. President, only life is important."
Don't Email: <zyntrop@kvack.org>.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-12 12:06 ` Jeff Moyer
@ 2007-04-13 0:57 ` Ken Chen
2007-04-13 1:08 ` Ken Chen
0 siblings, 1 reply; 20+ messages in thread
From: Ken Chen @ 2007-04-13 0:57 UTC (permalink / raw)
To: Jeff Moyer; +Cc: Zach Brown, akpm, linux-aio, linux-kernel
On 4/12/07, Jeff Moyer <jmoyer@redhat.com> wrote:
> I didn't see any response to Zach's request for code that actually
> tests out the shared ring buffer. Do you have such code?
Yes, I do. I have been stress testing the code since last night. After 20+
hours of stress run with fio and aio-stress, now I'm posting it with
confidence.
I modified libaio's io_getevents to take advantage of new user level
reap function. The feature is exported out via ring->compat_features.
btw, is compat_feature supposed to be a version number or a bit mask?
I think a bitmask makes more sense and is more flexible.
(warning: some lines are extremely long in the patch and my email
client will probably mangle it badly).
diff -Nurp libaio-0.3.104/src/io_getevents.c
libaio-0.3.104-new/src/io_getevents.c
--- libaio-0.3.104/src/io_getevents.c 2003-06-18 12:58:21.000000000 -0700
+++ libaio-0.3.104-new/src/io_getevents.c 2007-04-12 17:35:06.000000000 -0700
@@ -21,10 +21,13 @@
#include <stdlib.h>
#include <time.h>
#include "syscall.h"
+#include <asm/system.h>
io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx,
long, min_nr, long, nr, struct io_event *, events, struct timespec *,
timeout)
#define AIO_RING_MAGIC 0xa10a10a1
+#define AIO_RING_BASE 1
+#define AIO_RING_USER_REAP 2
/* Ben will hate me for this */
struct aio_ring {
@@ -41,7 +44,11 @@ struct aio_ring {
int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct
io_event * events, struct timespec * timeout)
{
+ long i = 0, ret;
+ unsigned head;
+ struct io_event *evt_base;
struct aio_ring *ring;
+
ring = (struct aio_ring*)ctx;
if (ring==NULL || ring->magic != AIO_RING_MAGIC)
goto do_syscall;
@@ -49,9 +56,35 @@ int io_getevents_0_4(io_context_t ctx, l
if (ring->head == ring->tail)
return 0;
}
-
+
+ if (!(ring->compat_features & AIO_RING_USER_REAP))
+ goto do_syscall;
+
+ if (min_nr > nr || min_nr < 0 || nr < 0)
+ return -EINVAL;
+
+ evt_base = (struct io_event *) (ring + 1);
+ while (i < nr) {
+ head = ring->head;
+ if (head == ring->tail)
+ break;
+
+ *events = evt_base[head & (ring->nr - 1)];
+ if (head == cmpxchg(&ring->head, head, head + 1)) {
+ events++;
+ i++;
+ }
+ }
+
+ if (i >= min_nr)
+ return i;
+
do_syscall:
- return __io_getevents_0_4(ctx, min_nr, nr, events, timeout);
+ ret = __io_getevents_0_4(ctx, min_nr - i, nr - i, events, timeout);
+ if (ret >= 0)
+ return i + ret;
+ else
+ return i ? i : ret;
}
DEFSYMVER(io_getevents_0_4, io_getevents, 0.4)
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
2007-04-13 0:57 ` Ken Chen
@ 2007-04-13 1:08 ` Ken Chen
0 siblings, 0 replies; 20+ messages in thread
From: Ken Chen @ 2007-04-13 1:08 UTC (permalink / raw)
To: Jeff Moyer; +Cc: Zach Brown, akpm, linux-aio, linux-kernel
On 4/12/07, Ken Chen <kenchen@google.com> wrote:
> On 4/12/07, Jeff Moyer <jmoyer@redhat.com> wrote:
> > I didn't see any response to Zach's request for code that actually
> > tests out the shared ring buffer. Do you have such code?
>
> Yes, I do. I was stress testing the code since last night. After 20+
> hours of stress run with fio and aio-stress, now I'm posting it with
> confidence.
>
> I modified libaio's io_getevents to take advantage of new user level
> reap function. The feature is exported out via ring->compat_features.
> btw, is compat_feature suppose to be a version number or a bit mask?
> I think bitmask make more sense and more flexible.
Additional patch on the kernel side to export the new features. On
top of patch posted at:
http://marc.info/?l=linux-kernel&m=117636401818057&w=2
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -138,8 +138,11 @@ #define init_sync_kiocb(x, filp) \
init_wait((&(x)->ki_wait)); \
} while (0)
+#define AIO_RING_BASE 1
+#define AIO_RING_USER_REAP 2
+
#define AIO_RING_MAGIC 0xa10a10a1
-#define AIO_RING_COMPAT_FEATURES 1
+#define AIO_RING_COMPAT_FEATURES (AIO_RING_BASE | AIO_RING_USER_REAP)
#define AIO_RING_INCOMPAT_FEATURES 0
struct aio_ring {
unsigned id; /* kernel internal index number */
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2007-04-13 1:09 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-04-10 23:53 [patch] convert aio event reap to use atomic-op instead of spin_lock Ken Chen
2007-04-11 5:18 ` Andrew Morton
2007-04-11 16:54 ` Ken Chen
2007-04-11 18:15 ` Zach Brown
2007-04-11 18:29 ` Ken Chen
2007-04-11 18:00 ` Zach Brown
2007-04-11 19:11 ` Andrew Morton
2007-04-11 19:34 ` Zach Brown
2007-04-11 19:28 ` Ken Chen
2007-04-11 19:44 ` Zach Brown
2007-04-11 19:45 ` Benjamin LaHaise
2007-04-11 19:52 ` Zach Brown
2007-04-11 20:03 ` Benjamin LaHaise
2007-04-12 7:29 ` Ken Chen
2007-04-12 12:06 ` Jeff Moyer
2007-04-13 0:57 ` Ken Chen
2007-04-13 1:08 ` Ken Chen
2007-04-12 7:50 ` Ken Chen
2007-04-12 14:31 ` Benjamin LaHaise
2007-04-12 15:13 ` Benjamin LaHaise
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox