* [patch] convert aio event reap to use atomic-op instead of spin_lock @ 2007-04-10 23:53 Ken Chen 2007-04-11 5:18 ` Andrew Morton 2007-04-11 18:00 ` Zach Brown 0 siblings, 2 replies; 20+ messages in thread From: Ken Chen @ 2007-04-10 23:53 UTC (permalink / raw) To: akpm, linux-aio, linux-kernel Resurrect an old patch that uses atomic operation to update ring buffer index on AIO event queue. This work allows futher application/libaio optimization to run fast path io_getevents in user space. I've also added one more change on top of old implementation that rounds ring buffer size to power of 2 to allow fast head/tail calculation. With the new scheme, there is no more modulo operation on them and we simply increment either pointer index directly. This scheme also automatically handles integer wrap nicely without any additional special treatment. Signed-off-by: Ken Chen <kenchen@google.com> diff --git a/fs/aio.c b/fs/aio.c index e4598d6..00ecd14 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -177,8 +175,8 @@ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ +#define aio_ring_event(info, __nr, 
km) ({ \ + unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ __event = kmap_atomic( \ (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ @@ -220,7 +218,6 @@ static struct kioctx *ioctx_alloc(unsign atomic_set(&ctx->users, 1); spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); @@ -927,7 +924,7 @@ int fastcall aio_complete(struct kiocb * struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsigned tail; int ret; /* @@ -969,8 +966,6 @@ int fastcall aio_complete(struct kiocb * tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail >= info->nr) - tail = 0; event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; @@ -986,13 +981,14 @@ int fastcall aio_complete(struct kiocb * */ smp_wmb(); /* make event visible before updating tail */ + tail++; info->tail = tail; ring->tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("added to ring %p at [%u]\n", iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -1007,42 +1003,36 @@ put_rq: /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). 
*/ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) { struct aio_ring_info *info = &ioctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; + struct io_event *evp; + unsigned head; + int ret; ring = kmap_atomic(info->ring_pages[0], KM_USER0); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; - - spin_lock(&info->ring_lock); + dprintk("in aio_read_evt h%u t%u m%u\n", + ring->head, ring->tail, ring->nr); - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + do { + head = ring->head; + if (head == ring->tail) { + ret = 0; + break; + } + evp = aio_ring_event(info, head, KM_USER1); *ent = *evp; - head = (head + 1) % info->nr; smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; ret = 1; put_aio_ring_event(evp, KM_USER1); - } - spin_unlock(&info->ring_lock); + } while (head != cmpxchg(&ring->head, head, head + 1)); -out: kunmap_atomic(ring, KM_USER0); - dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, - (unsigned long)ring->head, (unsigned long)ring->tail); + dprintk("leaving aio_read_evt: %d h%u t%u\n", ret, + ring->head, ring->tail); return ret; } diff --git a/include/linux/aio.h b/include/linux/aio.h index a30ef13..42ca8d3 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -152,11 +152,10 @@ struct aio_ring { unsigned incompat_features; unsigned header_length; /* size of aio_ring */ - - struct io_event io_events[0]; + struct io_event io_events[0]; }; /* 128 bytes + ring size */ -#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr) +#define aio_ring_avail(info, ring) ((info)->nr + (ring)->head - (ring)->tail) #define AIO_RING_PAGES 8 struct aio_ring_info { @@ -164,7 +163,6 @@ struct aio_ring_info { unsigned long mmap_size; struct page 
**ring_pages; - spinlock_t ring_lock; long nr_pages; unsigned nr, tail; ^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-10 23:53 [patch] convert aio event reap to use atomic-op instead of spin_lock Ken Chen @ 2007-04-11 5:18 ` Andrew Morton 2007-04-11 16:54 ` Ken Chen 2007-04-11 18:00 ` Zach Brown 1 sibling, 1 reply; 20+ messages in thread From: Andrew Morton @ 2007-04-11 5:18 UTC (permalink / raw) To: Ken Chen; +Cc: linux-aio, linux-kernel On Tue, 10 Apr 2007 16:53:53 -0700 (PDT) kenchen@google.com (Ken Chen) wrote: > + } while (head != cmpxchg(&ring->head, head, head + 1)); A hasty grep indicates that only 14 out of 23 architectures implement cmpxchg(). ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 5:18 ` Andrew Morton @ 2007-04-11 16:54 ` Ken Chen 2007-04-11 18:15 ` Zach Brown 0 siblings, 1 reply; 20+ messages in thread From: Ken Chen @ 2007-04-11 16:54 UTC (permalink / raw) To: Andrew Morton; +Cc: linux-aio, linux-kernel [-- Attachment #1: Type: text/plain, Size: 564 bytes --] On 4/10/07, Andrew Morton <akpm@linux-foundation.org> wrote: > On Tue, 10 Apr 2007 16:53:53 -0700 (PDT) kenchen@google.com (Ken Chen) wrote: > > > + } while (head != cmpxchg(&ring->head, head, head + 1)); > > A hasty grep indicates that only 14 out of 23 architectures implement > cmpxchg(). Sorry I wasn't thorough enough. And partially because I was worried about changing structure type for user space facing struct aio_ring. Now that I looked through all arches, it looks safe as all arch's atomic_t has the same size as int. Here is the updated patch. [-- Attachment #2: aio-use-atomic.patch --] [-- Type: text/x-patch, Size: 5718 bytes --] diff --git a/fs/aio.c b/fs/aio.c index e4598d6..091bcb4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx) unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -159,7 +157,8 @@ static int aio_setup_ring(struct kioctx *ctx) ring = kmap_atomic(info->ring_pages[0], KM_USER0); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; - ring->head = ring->tail = 0; + 
atomic_set(&ring->head, 0); + ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; @@ -177,8 +176,8 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ +#define aio_ring_event(info, __nr, km) ({ \ + unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ __event = kmap_atomic( \ (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ @@ -220,7 +219,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(&ctx->users, 1); spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); @@ -927,7 +925,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsigned tail; int ret; /* @@ -969,8 +967,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail >= info->nr) - tail = 0; event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; @@ -986,13 +982,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) */ smp_wmb(); /* make event visible before updating tail */ + tail++; info->tail = tail; ring->tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("added to ring %p at [%u]\n", iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. 
*/ ret = __aio_put_req(ctx, iocb); @@ -1007,42 +1004,36 @@ put_rq: /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). */ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) { struct aio_ring_info *info = &ioctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; + struct io_event *evp; + unsigned head; + int ret; ring = kmap_atomic(info->ring_pages[0], KM_USER0); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; - - spin_lock(&info->ring_lock); + dprintk("in aio_read_evt h%u t%u m%u\n", + atomic_read(&ring->head), ring->tail, ring->nr); - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + do { + head = atomic_read(&ring->head); + if (head == ring->tail) { + ret = 0; + break; + } + evp = aio_ring_event(info, head, KM_USER1); *ent = *evp; - head = (head + 1) % info->nr; smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; ret = 1; put_aio_ring_event(evp, KM_USER1); - } - spin_unlock(&info->ring_lock); + } while (head != atomic_cmpxchg(&ring->head, head, head + 1)); -out: kunmap_atomic(ring, KM_USER0); - dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, - (unsigned long)ring->head, (unsigned long)ring->tail); + dprintk("leaving aio_read_evt: %d h%u t%u\n", ret, + atomic_read(&ring->head), ring->tail); return ret; } diff --git a/include/linux/aio.h b/include/linux/aio.h index a30ef13..aebb135 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -144,7 +144,7 @@ struct kiocb { struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ - unsigned head; + atomic_t head; unsigned tail; unsigned 
magic; @@ -152,11 +152,11 @@ struct aio_ring { unsigned incompat_features; unsigned header_length; /* size of aio_ring */ - - struct io_event io_events[0]; + struct io_event io_events[0]; }; /* 128 bytes + ring size */ -#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr) +#define aio_ring_avail(info, ring) \ + ((info)->nr + (unsigned) atomic_read(&(ring)->head) - (ring)->tail) #define AIO_RING_PAGES 8 struct aio_ring_info { @@ -164,7 +164,6 @@ struct aio_ring_info { unsigned long mmap_size; struct page **ring_pages; - spinlock_t ring_lock; long nr_pages; unsigned nr, tail; ^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 16:54 ` Ken Chen @ 2007-04-11 18:15 ` Zach Brown 2007-04-11 18:29 ` Ken Chen 0 siblings, 1 reply; 20+ messages in thread From: Zach Brown @ 2007-04-11 18:15 UTC (permalink / raw) To: Ken Chen; +Cc: Andrew Morton, linux-aio, linux-kernel > Sorry I wasn't thorough enough. And partially because I was worried > about changing structure type for user space facing struct aio_ring. > Now that I looked through all arches, it looks safe as all arch's > atomic_t has the same size as int. > Here is the updated patch. > @@ -144,7 +144,7 @@ struct kiocb { > struct aio_ring { > unsigned id; /* kernel internal index number */ > unsigned nr; /* number of io_events */ > - unsigned head; > + atomic_t head; > unsigned tail; Embedding an atomic_t in an ABI struct? That makes everyone else nervous too, right? It may look safe on i386/x86-64 today, but this doesn't seem like wise practice. Is there any reason to believe that atomic_t will never change size? Does anything else do this already? If nothing else, the "unsigned" (should be __u32, sigh) could be cast to an atomic_t. Is being able to do atomic work on a u32 between the kernel and userspace something that all archs have support for? I mean, take the fact that userspace and the kernel could both be doing these atomic ops on different virtual addresses and so conceivably different cachelines. Is that a problem for anyone? I do find myself wondering if the notion of userspace ring synchronization shouldn't be built around futexes. They weren't around when this mmap()ed ring business was created. - z ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 18:15 ` Zach Brown @ 2007-04-11 18:29 ` Ken Chen 0 siblings, 0 replies; 20+ messages in thread From: Ken Chen @ 2007-04-11 18:29 UTC (permalink / raw) To: Zach Brown; +Cc: Andrew Morton, linux-aio, linux-kernel On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote: > > Sorry I wasn't thorough enough. And partially because I was worried > > about changing structure type for user space facing struct aio_ring. > > Now that I looked through all arches, it looks safe as all arch's > > atomic_t has the same size as int. > > > Here is the updated patch. > > > @@ -144,7 +144,7 @@ struct kiocb { > > struct aio_ring { > > unsigned id; /* kernel internal index number */ > > unsigned nr; /* number of io_events */ > > - unsigned head; > > + atomic_t head; > > unsigned tail; > > Embedding an atomic_t in an ABI struct? That makes everyone else > nervous too, right? sure, make me nervous at least. > It may look safe on i386/x86-64 today, but this doesn't seem like wise > practice. Is there any reason to believe that atomic_t will never > change size? Does anything else do this already? > > If nothing else, the "unsigned" (should be __u32, sigh) could be cast to > an atomic_t. The atomic_t cast is equally iffy. I don't know what would be a best solution. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-10 23:53 [patch] convert aio event reap to use atomic-op instead of spin_lock Ken Chen 2007-04-11 5:18 ` Andrew Morton @ 2007-04-11 18:00 ` Zach Brown 2007-04-11 19:11 ` Andrew Morton ` (2 more replies) 1 sibling, 3 replies; 20+ messages in thread From: Zach Brown @ 2007-04-11 18:00 UTC (permalink / raw) To: Ken Chen; +Cc: akpm, linux-aio, linux-kernel First, I'll NAK this and all AIO patches until the patch description says that it's been run through the regression tests that we've started collecting in autotest. They're trivial to run, never fear: cd /usr/local svn checkout svn://test.kernel.org/autotest/trunk/client autotest cd autotest bin/autotest tests/aio_dio_bugs/control (We'll be moving this to LTP soon, I think, but that will just change the commands to cut and paste.) > Resurrect an old patch that uses atomic operation to update ring buffer > index on AIO event queue. This work allows futher application/libaio > optimization to run fast path io_getevents in user space. I have mixed feelings. I think the userspace getevents support was poorly designed and the simple fact that we've gone this long without it says just how desperately the feature isn't needed. At the same time, it's a small change and the code is already there. So it's hard to get too worked up about it. We can always remove support in the future by never changing the indexes if we require that apps fall back to sys_io_getevents(). In any case, let's see a test which exercises userspace event collection before we claim to support it. I'd prefer a little stand-alone C app like the ones in autotest for simple unit testing. A fio patch to stress test it would probably also be well received. > I've also added one more change on top of old implementation that rounds > ring buffer size to power of 2 to allow fast head/tail calculation. 
With > the new scheme, there is no more modulo operation on them and we simply > increment either pointer index directly. Please quantify the improvement. We're allocating more ring elements than userspace asked for and than were accounted for in aio_max_nr. That cost, however teeny, will be measured against the benefit. > - /* Compensate for the ring buffer's head/tail overlap entry */ > - nr_events += 2; /* 1 is required, 2 for good luck */ > + /* round nr_event to next power of 2 */ > + nr_events = roundup_pow_of_two(nr_events); Is that buggy? How will the code tell if head == tail means a full ring or an empty ring? The old code added that extra event to let it tell the ring was full before head == tail and it would think it's empty again, I think. I'm guessing roundup(nr_events + 1) is needed. > -#define aio_ring_event(info, nr, km) ({ \ > - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ > +#define aio_ring_event(info, __nr, km) ({ \ > + unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \ > struct io_event *__event; \ > __event = kmap_atomic( \ > (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ (info) is now evaluated twice. __info = (info), please, just like __event there. > put_aio_ring_event(event, KM_IRQ0); > kunmap_atomic(ring, KM_IRQ1); The ring page, which is mmaped to userspace at some weird address, was just written through a kmap. Do we need a flush_dcache_page()? This isn't this patch's problem, but it'd need to be addressed if we're using the ring from userspace. > - > - struct io_event io_events[0]; > + struct io_event io_events[0]; Try to minimize gratuitous reformatting, please. - z ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 18:00 ` Zach Brown @ 2007-04-11 19:11 ` Andrew Morton 2007-04-11 19:34 ` Zach Brown 2007-04-11 19:28 ` Ken Chen 2007-04-12 7:50 ` Ken Chen 2 siblings, 1 reply; 20+ messages in thread From: Andrew Morton @ 2007-04-11 19:11 UTC (permalink / raw) To: Zach Brown; +Cc: Ken Chen, linux-aio, linux-kernel On Wed, 11 Apr 2007 11:00:38 -0700 Zach Brown <zach.brown@oracle.com> wrote: > > - /* Compensate for the ring buffer's head/tail overlap entry */ > > - nr_events += 2; /* 1 is required, 2 for good luck */ > > + /* round nr_event to next power of 2 */ > > + nr_events = roundup_pow_of_two(nr_events); > > Is that buggy? How will the code tell if head == tail means a full ring > or an empty ring? The old code added that extra event to let it tell > the ring was full before head == tail and it would think it's empty > again, I think. I'm guessing roundup(nr_events + 1) is needed. Ken uses the other (superior!) way of implementing ringbuffers: the head and tail pointers (the naming of which AIO appears to have reversed) are not constrained to the ringsize - they are simply allowed to wrap through 0xfffffff. Consequently: ring full: (head-tail == size) ring empty: head==tail numer-of-items-in-ring: head-tail add to ring: ring[head++]=item remove from ring: item=ring[tail++] (adjust the above for AIO naming assbackwardness) (requires that size be a power of 2) Many net drivers do it this way for their DMA rings. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 19:11 ` Andrew Morton @ 2007-04-11 19:34 ` Zach Brown 0 siblings, 0 replies; 20+ messages in thread From: Zach Brown @ 2007-04-11 19:34 UTC (permalink / raw) To: Andrew Morton; +Cc: Ken Chen, linux-aio, linux-kernel > Ken uses the other (superior!) way of implementing ringbuffers: the head > and tail pointers (the naming of which AIO appears to have reversed) are > not constrained to the ringsize - they are simply allowed to wrap through > 0xfffffff. A-ha! That sure sounds great. I'd be happy to see the kernel side fs/aio.c ring management move in that direction, sure, with a specific patch to do that. I get less excited when we talk about changing the index semantics as part of supporting serialization of the ring with userspace. - z ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 18:00 ` Zach Brown 2007-04-11 19:11 ` Andrew Morton @ 2007-04-11 19:28 ` Ken Chen 2007-04-11 19:44 ` Zach Brown ` (2 more replies) 2007-04-12 7:50 ` Ken Chen 2 siblings, 3 replies; 20+ messages in thread From: Ken Chen @ 2007-04-11 19:28 UTC (permalink / raw) To: Zach Brown; +Cc: akpm, linux-aio, linux-kernel On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote: > First, I'll NAK this and all AIO patches until the patch description > says that it's been run through the regression tests that we've started > collecting in autotest. They're trivial to run, never fear: OK. I will run those regression tests. > > Resurrect an old patch that uses atomic operation to update ring buffer > > index on AIO event queue. This work allows futher application/libaio > > optimization to run fast path io_getevents in user space. > > I have mixed feelings. I think the userspace getevents support was > poorly designed and the simple fact that we've gone this long without it > says just how desperately the feature isn't needed. I kept on getting requests from application developers who want that feature. My initial patch was dated back May 2004. > We're allocating more ring elements > than userspace asked for and than were accounted for in aio_max_nr. > That cost, however teeny, will be measured against the benefit. I noticed that while writing the patch. We have the same bug right now that nr_events is enlarged to consume the entire mmap'ed ring buffer pages. But yet, we don't account those in aio_max_nr. I didn't fix that up in this patch because I was going to do a separate patch on that. > > - /* Compensate for the ring buffer's head/tail overlap entry */ > > - nr_events += 2; /* 1 is required, 2 for good luck */ > > + /* round nr_event to next power of 2 */ > > + nr_events = roundup_pow_of_two(nr_events); > > Is that buggy? 
How will the code tell if head == tail means a full ring > or an empty ring? The old code added that extra event to let it tell > the ring was full before head == tail and it would think it's empty > again, I think. I'm guessing roundup(nr_events + 1) is needed. I don't think it's a bug. akpm taught me the trick: we don't wrap the index around the ring size in this scheme, so the extra space is not needed. > The ring page, which is mmaped to userspace at some weird address, was > just written through a kmap. Do we need a flush_dcache_page()? This > isn't this patch's problem, but it'd need to be addressed if we're using > the ring from userspace. I will look into this aside from this patch. - Ken ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 19:28 ` Ken Chen @ 2007-04-11 19:44 ` Zach Brown 2007-04-11 19:45 ` Benjamin LaHaise 2007-04-12 7:29 ` Ken Chen 2 siblings, 0 replies; 20+ messages in thread From: Zach Brown @ 2007-04-11 19:44 UTC (permalink / raw) To: Ken Chen; +Cc: akpm, linux-aio, linux-kernel > >I have mixed feelings. I think the userspace getevents support was > >poorly designed and the simple fact that we've gone this long without it > >says just how desperately the feature isn't needed. > > I kept on getting requests from application developers who want that > feature. My initial patch was dated back May 2004. My point is that demand for it doesn't seem to be very high, and your use of the past tense there seems to support that. And the patches to support it are not looking great. Is it worth it, then? What difference are you measuring so far? - z ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 19:28 ` Ken Chen 2007-04-11 19:44 ` Zach Brown @ 2007-04-11 19:45 ` Benjamin LaHaise 2007-04-11 19:52 ` Zach Brown 2007-04-12 7:29 ` Ken Chen 2 siblings, 1 reply; 20+ messages in thread From: Benjamin LaHaise @ 2007-04-11 19:45 UTC (permalink / raw) To: Ken Chen; +Cc: Zach Brown, akpm, linux-aio, linux-kernel On Wed, Apr 11, 2007 at 12:28:26PM -0700, Ken Chen wrote: > >I have mixed feelings. I think the userspace getevents support was > >poorly designed and the simple fact that we've gone this long without it > >says just how desperately the feature isn't needed. > > I kept on getting requests from application developers who want that > feature. My initial patch was dated back May 2004. The right way to do it involves synchronization between the kernel side io_getevents() and the userspace code pulling events out of the ring. Alan Cox suggested embedding a futex in the shared memory region, but I don't think anyone ever implemented that. > I noticed that while writing the patch. We have the same bug right > now that nr_events is enlarged to consume the entire mmap'ed ring > buffer pages. But yet, we don't account those in aio_max_nr. I > didn't fix that up in this patch because I was going to do a separate > patch on that. It's not accounted because it is irrelevant -- the memory used by the events is already accounted against the user's address space. The actual number of aios in flight is clamped to the maximum number of requests that was specified and accounted against the global reservation. The number of events in the ringbuffer can be larger, and that doesn't hurt anything. > >The ring page, which is mmaped to userspace at some weird address, was > >just written through a kmap. Do we need a flush_dcache_page()? This > >isn't this patch's problem, but it'd need to be addressed if we're using > >the ring from userspace. > > I will look into this aside from this patch. 
That's probably the case. Also, any changes in this area *must* correctly update the compat/incompat feature flags in the ring buffer header. That has been missed in the past... -ben -- "Time is of no importance, Mr. President, only life is important." Don't Email: <zyntrop@kvack.org>. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 19:45 ` Benjamin LaHaise @ 2007-04-11 19:52 ` Zach Brown 2007-04-11 20:03 ` Benjamin LaHaise 0 siblings, 1 reply; 20+ messages in thread From: Zach Brown @ 2007-04-11 19:52 UTC (permalink / raw) To: Benjamin LaHaise; +Cc: Ken Chen, akpm, linux-aio, linux-kernel > > I kept on getting requests from application developers who want that > > feature. My initial patch was dated back May 2004. > > The right way to do it involves synchronization between the kernel side > io_getevents() and the userspace code pulling events out of the ring. > Alan Cox suggested embedding a futex in the shared memory region, but I > don't think anyone ever implemented that. Yeah, I like the idea of futexes. I'm worried that virtual aliasing spells doom for the current home-brewed serialization that fs/aio.c is doing with the shared ring head/tail accesses. Am I worrying about nothing here? > > I will look into this aside from this patch. > > That's probably the case. Also, any changes in this area *must* correctly > update the compat/incompat feature flags in the ring buffer header. That > has been missed in the past... Do you know of anyone using the current ring info ABI? The *only* user I know of is the check of ctx->magic in libaio. - z ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 19:52 ` Zach Brown @ 2007-04-11 20:03 ` Benjamin LaHaise 0 siblings, 0 replies; 20+ messages in thread From: Benjamin LaHaise @ 2007-04-11 20:03 UTC (permalink / raw) To: Zach Brown; +Cc: Ken Chen, akpm, linux-aio, linux-kernel On Wed, Apr 11, 2007 at 12:52:56PM -0700, Zach Brown wrote: > I'm worried that virtual aliasing spells doom for the current > home-brewed serialization that fs/aio.c is doing with the shared ring > head/tail accesses. Am I worrying about nothing here? Adding a flush_dcache_page() should fix that, but it might also need a change in the layout of the ring buffer header to get things completely correct. What I'm thinking of is that the head and tail bits might need to be on different cachelines to ensure any aliasing that does occur will not result in updates colliding. I'm not that much of an expert on virtually aliased caches, though. > > > I will look into this aside from this patch. > > > > That's probably the case. Also, any changes in this area *must* correctly > > update the compat/incompat feature flags in the ring buffer header. That > > has been missed in the past... > > Do you know of anyone using the current ring info ABI? > > The *only* user I know of is the check of ctx->magic in libaio. But... any libaio implementing the new semantics must be able to run on old kernels by falling back to the syscall when it notices that required bits aren't set in the header. It's easy enough to implement the checks, they just need to be carefully checked before being shipped. -ben -- "Time is of no importance, Mr. President, only life is important." Don't Email: <zyntrop@kvack.org>. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 19:28 ` Ken Chen 2007-04-11 19:44 ` Zach Brown 2007-04-11 19:45 ` Benjamin LaHaise @ 2007-04-12 7:29 ` Ken Chen 2007-04-12 12:06 ` Jeff Moyer 2 siblings, 1 reply; 20+ messages in thread From: Ken Chen @ 2007-04-12 7:29 UTC (permalink / raw) To: Zach Brown; +Cc: akpm, linux-aio, linux-kernel On 4/11/07, Ken Chen <kenchen@google.com> wrote: > On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote: > > First, I'll NAK this and all AIO patches until the patch description > > says that it's been run through the regression tests that we've started > > collecting in autotest. They're trivial to run, never fear: > > OK. I will run those regression tests. Unfortunately, the aio_dio_bugs test in autotest has a bug in it :-( We need to stress test the "test code". on stock 2.6.21-rc6 kernel: [rock-me-baby]$ cd autotest/tests/aio_dio_bugs/src [rock-me-baby]$ make [rock-me-baby]$ ./aio-free-ring-with-bogus-nr-pages aio-free-ring-with-bogus-nr-pages: Error: io_setup returned -22, expected -ENOMEM hmm??? The problem is that the test code forgot to initialize the ctx variable and in the kernel, sys_io_setup returns EINVAL if the user address contains a non-zero value. I will submit the following patch to autotest to correct the test code. --- aio-free-ring-with-bogus-nr-pages.c.orig 2007-04-11 23:57:45 -0700 +++ aio-free-ring-with-bogus-nr-pages.c 2007-04-11 23:57:59 -0700 @@ -39,7 +39,7 @@ int main(int __attribute__((unused)) argc, char **argv) { long res; - io_context_t ctx; + io_context_t ctx = 0; void* map; while (1) { ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-12 7:29 ` Ken Chen @ 2007-04-12 12:06 ` Jeff Moyer 2007-04-13 0:57 ` Ken Chen 0 siblings, 1 reply; 20+ messages in thread From: Jeff Moyer @ 2007-04-12 12:06 UTC (permalink / raw) To: Ken Chen; +Cc: Zach Brown, akpm, linux-aio, linux-kernel ==> On Thu, 12 Apr 2007 00:29:28 -0700, "Ken Chen" <kenchen@google.com> said: Ken> On 4/11/07, Ken Chen <kenchen@google.com> wrote: Ken> > On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote: Ken> > > First, I'll NAK this and all AIO patches until the patch Ken> description > > says that it's been run through the regression Ken> tests that we've started > > collecting in autotest. They're Ken> trivial to run, never fear: Ken> > Ken> > OK. I will run those regression tests. Ken> Unfortunately, the aio_dio_bugs test in autotest has bug in it Ken> :-( We need stress test the "test code". Ken> on stock 2.6.21-rc6 kernel: Ken> [rock-me-baby]$ cd autotest/tests/aio_dio_bugs/src Ken> [rock-me-baby]$ make [rock-me-baby]$ Ken> ./aio-free-ring-with-bogus-nr-pages Ken> aio-free-ring-with-bogus-nr-pages: Error: io_setup returned -22, Ken> expected -ENOMEM Ken> hmm??? The problem is that the test code forgot to initialized Ken> ctx variable and in the kernel, sys_io_setup returns EINVAL if Ken> user address contain none-zero value. Zach had already sent a fix for that. I wonder why it didn't go in. I didn't see any response to Zach's request for code that actually tests out the shared ring buffer. Do you have such code? -Jeff ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-12 12:06 ` Jeff Moyer @ 2007-04-13 0:57 ` Ken Chen 2007-04-13 1:08 ` Ken Chen 0 siblings, 1 reply; 20+ messages in thread From: Ken Chen @ 2007-04-13 0:57 UTC (permalink / raw) To: Jeff Moyer; +Cc: Zach Brown, akpm, linux-aio, linux-kernel On 4/12/07, Jeff Moyer <jmoyer@redhat.com> wrote: > I didn't see any response to Zach's request for code that actually > tests out the shared ring buffer. Do you have such code? Yes, I do. I have been stress testing the code since last night. After 20+ hours of stress run with fio and aio-stress, now I'm posting it with confidence. I modified libaio's io_getevents to take advantage of the new user-level reap function. The feature is exported out via ring->compat_features. btw, is compat_features supposed to be a version number or a bit mask? I think a bitmask makes more sense and is more flexible. (warning: some lines are extremely long in the patch and my email client will probably mangle it badly). 
diff -Nurp libaio-0.3.104/src/io_getevents.c libaio-0.3.104-new/src/io_getevents.c --- libaio-0.3.104/src/io_getevents.c 2003-06-18 12:58:21.000000000 -0700 +++ libaio-0.3.104-new/src/io_getevents.c 2007-04-12 17:35:06.000000000 -0700 @@ -21,10 +21,13 @@ #include <stdlib.h> #include <time.h> #include "syscall.h" +#include <asm/system.h> io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx, long, min_nr, long, nr, struct io_event *, events, struct timespec *, timeout) #define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_BASE 1 +#define AIO_RING_USER_REAP 2 /* Ben will hate me for this */ struct aio_ring { @@ -41,7 +44,11 @@ struct aio_ring { int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct io_event * events, struct timespec * timeout) { + long i = 0, ret; + unsigned head; + struct io_event *evt_base; struct aio_ring *ring; + ring = (struct aio_ring*)ctx; if (ring==NULL || ring->magic != AIO_RING_MAGIC) goto do_syscall; @@ -49,9 +56,35 @@ int io_getevents_0_4(io_context_t ctx, l if (ring->head == ring->tail) return 0; } - + + if (!(ring->compat_features & AIO_RING_USER_REAP)) + goto do_syscall; + + if (min_nr > nr || min_nr < 0 || nr < 0) + return -EINVAL; + + evt_base = (struct io_event *) (ring + 1); + while (i < nr) { + head = ring->head; + if (head == ring->tail) + break; + + *events = evt_base[head & (ring->nr - 1)]; + if (head == cmpxchg(&ring->head, head, head + 1)) { + events++; + i++; + } + } + + if (i >= min_nr) + return i; + do_syscall: - return __io_getevents_0_4(ctx, min_nr, nr, events, timeout); + ret = __io_getevents_0_4(ctx, min_nr - i, nr - i, events, timeout); + if (ret >= 0) + return i + ret; + else + return i ? i : ret; } DEFSYMVER(io_getevents_0_4, io_getevents, 0.4) ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-13 0:57 ` Ken Chen @ 2007-04-13 1:08 ` Ken Chen 0 siblings, 0 replies; 20+ messages in thread From: Ken Chen @ 2007-04-13 1:08 UTC (permalink / raw) To: Jeff Moyer; +Cc: Zach Brown, akpm, linux-aio, linux-kernel On 4/12/07, Ken Chen <kenchen@google.com> wrote: > On 4/12/07, Jeff Moyer <jmoyer@redhat.com> wrote: > > I didn't see any response to Zach's request for code that actually > > tests out the shared ring buffer. Do you have such code? > > Yes, I do. I was stress testing the code since last night. After 20+ > hours of stress run with fio and aio-stress, now I'm posting it with > confidence. > > I modified libaio's io_getevents to take advantage of new user level > reap function. The feature is exported out via ring->compat_features. > btw, is compat_feature suppose to be a version number or a bit mask? > I think bitmask make more sense and more flexible. Additional patch on the kernel side to export the new features. On top of patch posted at: http://marc.info/?l=linux-kernel&m=117636401818057&w=2 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -138,8 +138,11 @@ #define init_sync_kiocb(x, filp) \ init_wait((&(x)->ki_wait)); \ } while (0) +#define AIO_RING_BASE 1 +#define AIO_RING_USER_REAP 2 + #define AIO_RING_MAGIC 0xa10a10a1 -#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_COMPAT_FEATURES (AIO_RING_BASE | AIO_RING_USER_REAP) #define AIO_RING_INCOMPAT_FEATURES 0 struct aio_ring { unsigned id; /* kernel internal index number */ ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-11 18:00 ` Zach Brown 2007-04-11 19:11 ` Andrew Morton 2007-04-11 19:28 ` Ken Chen @ 2007-04-12 7:50 ` Ken Chen 2007-04-12 14:31 ` Benjamin LaHaise 2 siblings, 1 reply; 20+ messages in thread From: Ken Chen @ 2007-04-12 7:50 UTC (permalink / raw) To: Zach Brown; +Cc: akpm, linux-aio, linux-kernel, Benjamin LaHaise [-- Attachment #1: Type: text/plain, Size: 1927 bytes --] On 4/11/07, Zach Brown <zach.brown@oracle.com> wrote: > First, I'll NAK this and all AIO patches until the patch description > says that it's been run through the regression tests that we've started > collecting in autotest. They're trivial to run, never fear: > > cd /usr/local > svn checkout svn://test.kernel.org/autotest/trunk/client autotest > cd autotest > bin/autotest tests/aio_dio_bugs/control I ran through the autotest (with bug fix in the test code). It passes the regression tests. I made the following change since last rev: * struct aio_ring stays unchanged. I added a cast to atomic_cmpxchg() * aio_ring_event() has grown too big, I turned that into a function * take out white space change from previous rev. open issue that this patch does not try to fix: * kmap address alias needs a flush_dcache_page Zach, does this make you more comfortable? (or maybe less iffy is the right word?) From: Ken Chen <kenchen@google.com> Resurrect an old patch that uses atomic operation to update ring buffer index on AIO event queue. This work allows further application/libaio optimization to run fast path io_getevents in user space. I've also added one more change on top of old implementation that rounds ring buffer size to power of 2 to allow fast head/tail calculation. With the new scheme, there is no more modulo operation on them and we simply increment either pointer index directly. This scheme also automatically handles integer wrap nicely without any additional special treatment. 
Patch was stress tested and in addition, tested on autotest:aio_dio_bugs. The results were all pass: Output of autotest:aio_dio_bugs: ran for 200 seconds without error, passing AIO read of last block in file succeeded. aio-free-ring-with-bogus-nr-pages: Success! aio-io-setup-with-nonwritable-context-pointer: Success! GOOD aio_dio_bugs completed successfully Signed-off-by: Ken Chen <kenchen@google.com> [-- Attachment #2: aio-use-atomic.patch --] [-- Type: text/x-patch, Size: 5106 bytes --] diff --git a/fs/aio.c b/fs/aio.c index e4598d6..f8b8e45 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx) unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -177,14 +175,16 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ - __event += pos % AIO_EVENTS_PER_PAGE; \ - __event; \ -}) +static struct io_event *aio_ring_event(struct aio_ring_info *info, + unsigned nr, enum km_type km) +{ + unsigned pos = (nr & (info->nr - 1)) + AIO_EVENTS_OFFSET; + struct io_event *event; + + event = kmap_atomic(info->ring_pages[pos / 
AIO_EVENTS_PER_PAGE], km); + event += pos % AIO_EVENTS_PER_PAGE; + return event; +} #define put_aio_ring_event(event, km) do { \ struct io_event *__event = (event); \ @@ -220,7 +220,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(&ctx->users, 1); spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); @@ -927,7 +926,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsigned tail; int ret; /* @@ -969,8 +968,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail >= info->nr) - tail = 0; event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; @@ -986,13 +983,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) */ smp_wmb(); /* make event visible before updating tail */ + tail++; info->tail = tail; ring->tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("added to ring %p at [%u]\n", iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -1007,39 +1005,35 @@ put_rq: /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). 
*/ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) { struct aio_ring_info *info = &ioctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; + struct io_event *evp; + unsigned head; + int ret; ring = kmap_atomic(info->ring_pages[0], KM_USER0); dprintk("in aio_read_evt h%lu t%lu m%lu\n", (unsigned long)ring->head, (unsigned long)ring->tail, (unsigned long)ring->nr); - if (ring->head == ring->tail) - goto out; - - spin_lock(&info->ring_lock); - - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + do { + head = ring->head; + if (head == ring->tail) { + ret = 0; + break; + } + evp = aio_ring_event(info, head, KM_USER1); *ent = *evp; - head = (head + 1) % info->nr; smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; ret = 1; put_aio_ring_event(evp, KM_USER1); - } - spin_unlock(&info->ring_lock); + } while (head != atomic_cmpxchg((atomic_t *) &ring->head, + head, head + 1)); -out: kunmap_atomic(ring, KM_USER0); dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, (unsigned long)ring->head, (unsigned long)ring->tail); diff --git a/include/linux/aio.h b/include/linux/aio.h index a30ef13..0cd66a3 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -156,7 +156,7 @@ struct aio_ring { struct io_event io_events[0]; }; /* 128 bytes + ring size */ -#define aio_ring_avail(info, ring) (((ring)->head + (info)->nr - 1 - (ring)->tail) % (info)->nr) +#define aio_ring_avail(info, ring) ((info)->nr + (ring)->head - (ring)->tail) #define AIO_RING_PAGES 8 struct aio_ring_info { @@ -164,7 +164,6 @@ struct aio_ring_info { unsigned long mmap_size; struct page **ring_pages; - spinlock_t ring_lock; long nr_pages; unsigned nr, tail; ^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-12 7:50 ` Ken Chen @ 2007-04-12 14:31 ` Benjamin LaHaise 2007-04-12 15:13 ` Benjamin LaHaise 0 siblings, 1 reply; 20+ messages in thread From: Benjamin LaHaise @ 2007-04-12 14:31 UTC (permalink / raw) To: Ken Chen; +Cc: Zach Brown, akpm, linux-aio, linux-kernel On Thu, Apr 12, 2007 at 12:50:39AM -0700, Ken Chen wrote: > I ran through the autotest (with bug fix in the test code). It passes > the regression tests. I made the following change since last rev: By removing the spinlock around ring insertion, you've made it possible for two events being inserted on different CPUs to end up creating inconsistent state, as there is nothing which guarantees that resulting event in the ring will be wholely one event or another. Also, I don't like rounding up of the number of events to the nearest power of two, as it can round things up significantly more than just going to the end of the last actual page of memory allocated. NAK. -ben PS base64 encoded patches are incredibly difficult to quote. -- "Time is of no importance, Mr. President, only life is important." Don't Email: <zyntrop@kvack.org>. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [patch] convert aio event reap to use atomic-op instead of spin_lock 2007-04-12 14:31 ` Benjamin LaHaise @ 2007-04-12 15:13 ` Benjamin LaHaise 0 siblings, 0 replies; 20+ messages in thread From: Benjamin LaHaise @ 2007-04-12 15:13 UTC (permalink / raw) To: Ken Chen; +Cc: Zach Brown, akpm, linux-aio, linux-kernel On Thu, Apr 12, 2007 at 10:31:31AM -0400, Benjamin LaHaise wrote: > On Thu, Apr 12, 2007 at 12:50:39AM -0700, Ken Chen wrote: > > I ran through the autotest (with bug fix in the test code). It passes > > the regression tests. I made the following change since last rev: > > By removing the spinlock around ring insertion, you've made it possible > for two events being inserted on different CPUs to end up creating > inconsistent state, as there is nothing which guarantees that resulting > event in the ring will be wholely one event or another. Ignore that, I misread the function it was applied to. -ENEEDCOFFEE. Yes, that spinlock can go if we're doing cmpxchg(). -ben -- "Time is of no importance, Mr. President, only life is important." Don't Email: <zyntrop@kvack.org>. ^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2007-04-13 1:09 UTC | newest] Thread overview: 20+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2007-04-10 23:53 [patch] convert aio event reap to use atomic-op instead of spin_lock Ken Chen 2007-04-11 5:18 ` Andrew Morton 2007-04-11 16:54 ` Ken Chen 2007-04-11 18:15 ` Zach Brown 2007-04-11 18:29 ` Ken Chen 2007-04-11 18:00 ` Zach Brown 2007-04-11 19:11 ` Andrew Morton 2007-04-11 19:34 ` Zach Brown 2007-04-11 19:28 ` Ken Chen 2007-04-11 19:44 ` Zach Brown 2007-04-11 19:45 ` Benjamin LaHaise 2007-04-11 19:52 ` Zach Brown 2007-04-11 20:03 ` Benjamin LaHaise 2007-04-12 7:29 ` Ken Chen 2007-04-12 12:06 ` Jeff Moyer 2007-04-13 0:57 ` Ken Chen 2007-04-13 1:08 ` Ken Chen 2007-04-12 7:50 ` Ken Chen 2007-04-12 14:31 ` Benjamin LaHaise 2007-04-12 15:13 ` Benjamin LaHaise
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox