public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT
@ 2002-11-06  1:03 Badari Pulavarty
  2002-11-06  1:17 ` Andrew Morton
  0 siblings, 1 reply; 4+ messages in thread
From: Badari Pulavarty @ 2002-11-06  1:03 UTC (permalink / raw)
  To: linux-aio, lkml; +Cc: akpm, bcrl, Badari Pulavarty

Hi,

This is (part 2/2) 2.5.46 patch to support AIO for raw/O_DIRECT.

This patch adds AIO support for the DIO code path. This patch also
includes a workaround for the problem of calling set_page_dirty()
from interrupt context.

Andrew, could you please check to see if I did "set_page_dirty()"
hack (you suggested) correctly (in the right place) ?

Ben, could you pick these patches and push to Linus ?

NOTE: You need part 1/2 to use this patch.

Thanks,
Badari

diff -Naur -X dontdiff linux-2.5.46/fs/direct-io.c linux-2.5.46.aio/fs/direct-io.c
--- linux-2.5.46/fs/direct-io.c	Tue Nov  5 16:01:18 2002
+++ linux-2.5.46.aio/fs/direct-io.c	Tue Nov  5 14:57:36 2002
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
@@ -100,6 +101,11 @@
 	spinlock_t bio_list_lock;	/* protects bio_list */
 	struct bio *bio_list;		/* singly linked via bi_private */
 	struct task_struct *waiter;	/* waiting task (NULL if none) */
+
+	/* AIO related stuff */
+	struct kiocb *iocb;		/* kiocb */
+	int is_async;			/* is IO async ? */
+	int result;			/* IO result */
 };
 
 /*
@@ -176,6 +182,43 @@
 	return dio->pages[dio->head++];
 }
 
+static void dio_bio_count(struct dio *dio)
+{
+	if (atomic_dec_and_test(&dio->bio_count)) {
+		if(dio->is_async) {
+			aio_complete(dio->iocb, dio->result, 0);
+			kfree(dio);
+		}
+	}
+}
+
+static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
+{
+	struct dio *dio = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec;
+	int page_no;
+
+	if (bio->bi_size)
+		return 1;
+
+	for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
+		struct page *page = bvec[page_no].bv_page;
+
+		if (dio->rw == READ) {
+			SetPageDirty(page);
+			SetPageWrongList(page);
+		}
+		page_cache_release(page);
+	}
+	if (!uptodate) 
+		dio->result = -EIO;
+
+	dio_bio_count(dio);
+	bio_put(bio);
+	return 0;
+}
+
 /*
  * The BIO completion handler simply queues the BIO up for the process-context
  * handler.
@@ -212,7 +255,10 @@
 
 	bio->bi_bdev = bdev;
 	bio->bi_sector = first_sector;
-	bio->bi_end_io = dio_bio_end_io;
+	if (dio->is_async)
+		bio->bi_end_io = dio_bio_end_aio;
+	else
+		bio->bi_end_io = dio_bio_end_io;
 
 	dio->bio = bio;
 	return 0;
@@ -745,73 +791,84 @@
 }
 
 static int
-direct_io_worker(int rw, struct inode *inode, const struct iovec *iov, 
-	loff_t offset, unsigned long nr_segs, unsigned blkbits,
-	get_blocks_t get_blocks)
+direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
+	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
+	unsigned blkbits, get_blocks_t get_blocks)
 {
 	unsigned long user_addr; 
 	int seg, ret2, ret = 0;
-	struct dio dio;
-	size_t bytes, tot_bytes = 0;
+	struct dio local_dio, *dio;
+	size_t bytes;
 
-	dio.bio = NULL;
-	dio.inode = inode;
-	dio.rw = rw;
-	dio.blkbits = blkbits;
-	dio.blkfactor = inode->i_blkbits - blkbits;
-	dio.start_zero_done = 0;
-	dio.block_in_file = offset >> blkbits;
-	dio.blocks_available = 0;
-
-	dio.cur_page = NULL;
-
-	dio.boundary = 0;
-	dio.reap_counter = 0;
-	dio.get_blocks = get_blocks;
-	dio.final_block_in_bio = -1;
-	dio.next_block_for_io = -1;
+	if (is_sync_kiocb(iocb)) {
+		dio = &local_dio;
+		dio->is_async = 0;
+	} else {
+		dio = (struct dio *)kmalloc(sizeof(struct dio), GFP_KERNEL);
+		if (!dio)
+			return -ENOMEM;
+		dio->is_async = 1;
+	}
+	dio->bio = NULL;
+	dio->inode = inode;
+	dio->rw = rw;
+	dio->blkbits = blkbits;
+	dio->blkfactor = inode->i_blkbits - blkbits;
+	dio->start_zero_done = 0;
+	dio->block_in_file = offset >> blkbits;
+	dio->blocks_available = 0;
 
-	dio.page_errors = 0;
+	dio->cur_page = NULL;
+
+	dio->boundary = 0;
+	dio->reap_counter = 0;
+	dio->get_blocks = get_blocks;
+	dio->final_block_in_bio = -1;
+	dio->next_block_for_io = -1;
+
+	dio->page_errors = 0;
+	dio->result = 0;
+	dio->iocb = iocb;
 
 	/* BIO completion state */
-	atomic_set(&dio.bio_count, 0);
-	spin_lock_init(&dio.bio_list_lock);
-	dio.bio_list = NULL;
-	dio.waiter = NULL;
-	dio.pages_in_io = 0;
+	atomic_set(&dio->bio_count, 1);
+	spin_lock_init(&dio->bio_list_lock);
+	dio->bio_list = NULL;
+	dio->waiter = NULL;
+	dio->pages_in_io = 0;
 
 	for (seg = 0; seg < nr_segs; seg++) 
-		dio.pages_in_io += (iov[seg].iov_len >> blkbits) + 2; 
+		dio->pages_in_io += (iov[seg].iov_len >> blkbits) + 2; 
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
 		bytes = iov[seg].iov_len;
 
 		/* Index into the first page of the first block */
-		dio.first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
-		dio.final_block_in_request = dio.block_in_file + (bytes >> blkbits);
+		dio->first_block_in_page = (user_addr & (PAGE_SIZE - 1)) >> blkbits;
+		dio->final_block_in_request = dio->block_in_file + (bytes >> blkbits);
 		/* Page fetching state */
-		dio.head = 0;
-		dio.tail = 0;
-		dio.curr_page = 0;
+		dio->head = 0;
+		dio->tail = 0;
+		dio->curr_page = 0;
 
-		dio.total_pages = 0;
+		dio->total_pages = 0;
 		if (user_addr & (PAGE_SIZE-1)) {
-			dio.total_pages++;
+			dio->total_pages++;
 			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
 		}
-		dio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		dio.curr_user_address = user_addr;
+		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		dio->curr_user_address = user_addr;
 	
-		ret = do_direct_IO(&dio);
+		ret = do_direct_IO(dio);
 
 		if (ret) {
-			dio_cleanup(&dio);
+			dio_cleanup(dio);
 			break;
 		}
 
-		tot_bytes += iov[seg].iov_len - ((dio.final_block_in_request -
-					dio.block_in_file) << blkbits);
+		dio->result += iov[seg].iov_len - ((dio->final_block_in_request -
+					dio->block_in_file) << blkbits);
 
 	} /* end iovec loop */
 
@@ -819,22 +876,32 @@
 	 * There may be some unwritten disk at the end of a part-written
 	 * fs-block-sized block.  Go zero that now.
 	 */
-	dio_zero_block(&dio, 1);
+	dio_zero_block(dio, 1);
 
-	if (dio.cur_page) {
-		ret2 = dio_send_cur_page(&dio);
-		page_cache_release(dio.cur_page);
+	if (dio->cur_page) {
+		ret2 = dio_send_cur_page(dio);
+		page_cache_release(dio->cur_page);
 		if (ret == 0)
 			ret = ret2;
 	}
-	ret2 = dio_await_completion(&dio);
+
+	if (dio->is_async) {
+		dio_bio_count(dio);
+		if (ret == 0)
+			ret = -EIOCBQUEUED;
+		goto out;
+	}
+
+	dio_bio_count(dio);
+	ret2 = dio_await_completion(dio);
 	if (ret == 0)
 		ret = ret2;
 	if (ret == 0)
-		ret = dio.page_errors;
+		ret = dio->page_errors;
 	if (ret == 0)
-		ret = tot_bytes; 
+		ret = dio->result; 
 
+out:
 	return ret;
 }
 
@@ -878,7 +945,7 @@
 		}
 	}
 
-	retval = direct_io_worker(rw, inode, iov, offset, 
+	retval = direct_io_worker(rw, iocb, inode, iov, offset, 
 				nr_segs, blkbits, get_blocks);
 out:
 	return retval;
diff -Naur -X dontdiff linux-2.5.46/include/linux/page-flags.h linux-2.5.46.aio/include/linux/page-flags.h
--- linux-2.5.46/include/linux/page-flags.h	Mon Nov  4 14:30:37 2002
+++ linux-2.5.46.aio/include/linux/page-flags.h	Tue Nov  5 14:56:44 2002
@@ -70,6 +70,7 @@
 #define PG_chainlock		15	/* lock bit for ->pte_chain */
 
 #define PG_direct		16	/* ->pte_chain points directly at pte */
+#define PG_wronglist		17	/* page is on wrong list */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -233,6 +234,10 @@
 #define ClearPageDirect(page)		clear_bit(PG_direct, &(page)->flags)
 #define TestClearPageDirect(page)	test_and_clear_bit(PG_direct, &(page)->flags)
 
+#define SetPageWrongList(page)	set_bit(PG_wronglist, &(page)->flags)
+#define PageWrongList(page)	test_bit(PG_wronglist, &(page)->flags)
+#define ClearPageWrongList(page)	clear_bit(PG_wronglist, &(page)->flags)
+
 /*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
  * but it may again do so one day.
diff -Naur -X dontdiff linux-2.5.46/mm/vmscan.c linux-2.5.46.aio/mm/vmscan.c
--- linux-2.5.46/mm/vmscan.c	Mon Nov  4 14:30:07 2002
+++ linux-2.5.46.aio/mm/vmscan.c	Tue Nov  5 14:58:56 2002
@@ -378,6 +378,12 @@
 			goto keep_locked;
 		}
 
+		if (PageWrongList(page)) {
+			if (TestClearPageDirty(page))
+				set_page_dirty(page);
+			ClearPageWrongList(page);
+		}
+
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->index };


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT
  2002-11-06  1:03 [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT Badari Pulavarty
@ 2002-11-06  1:17 ` Andrew Morton
  2002-11-06  7:05   ` Benjamin LaHaise
  0 siblings, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2002-11-06  1:17 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: linux-aio, lkml, bcrl

Badari Pulavarty wrote:
> 
> Hi,
> 
> This is (part 2/2) 2.5.46 patch to support AIO for raw/O_DIRECT.
> 
> This patch adds AIO support for DIO code path. This patch also
> has a work around for calling set_page_dirty() from interrupt
> context problem.
> 
> Andrew, could you please check to see if I did "set_page_dirty()"
> hack (you suggested) correctly (in the right place) ?
> 

Looks like it.  It's such a hack, I want to hide ;)

Sigh.  I think I'd prefer to just go and make ->page_lock
and ->private_lock irq-safe.

Or not proceed with this patch at all.  If this is to be the
only code which wishes to perform page list motion at interrupt
time, perhaps it's not justifiable?

I really don't have a feeling for how valuable this is, nor
do I know whether there will be other code which wants to
perform page list manipulation at interrupt time.

In fact I also don't know where the whole AIO thing sits at
present.  Is it all done and finished?  Is there more to come,
and if so, what??

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT
  2002-11-06  1:17 ` Andrew Morton
@ 2002-11-06  7:05   ` Benjamin LaHaise
  2002-11-06  7:23     ` Andrew Morton
  0 siblings, 1 reply; 4+ messages in thread
From: Benjamin LaHaise @ 2002-11-06  7:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Badari Pulavarty, linux-aio, lkml

On Tue, Nov 05, 2002 at 05:17:32PM -0800, Andrew Morton wrote:
> Or not proceed with this patch at all.  If this is to be the
> only code which wishes to perform page list motion at interrupt
> time, perhaps it's not justifiable?
> 
> I really don't have a feeling for how valuable this is, nor
> do I know whether there will be other code which wants to
> perform page list manipulation at interrupt time.

I can think of a few other places that would like to perform page 
motion from irq context: anything else doing zero copy or page 
flipping, and more importantly the O(1) vm code that's being worked 
on.  The latter is actually quite important as we've got a number 
of customers running into problems with some of the algorithms in 
the 2.4 kernel where the kernel does not perform any list motion 
from irq context and this results in excess cpu time spent traversing 
lists to see if io has completed.

> In fact I also don't know where the whole AIO thing sits at
> present.  Is it all done and finished?  Is there more to come,
> and if so, what??

There's more to come.  The bits I'm working on are running in kernel 
context mainly to simplify the copy_*_user case since we don't have 
full zero copy semantics available and coping with pinned pages is 
a challenge in a multiuser system, plus it makes reusing the existing 
networking code a lot easier.  Basically, anything that involves a 
copy of data is likely to be better implemented running in a task to 
get the priority of execution correct, whereas anything involving 
zero copy io is going to want completion from irq or bottom half 
context and hence dirty pages.  Does that make sense?

		-ben
-- 
"Do you seek knowledge in time travel?"

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT
  2002-11-06  7:05   ` Benjamin LaHaise
@ 2002-11-06  7:23     ` Andrew Morton
  0 siblings, 0 replies; 4+ messages in thread
From: Andrew Morton @ 2002-11-06  7:23 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Badari Pulavarty, linux-aio, lkml

Benjamin LaHaise wrote:
> 
> On Tue, Nov 05, 2002 at 05:17:32PM -0800, Andrew Morton wrote:
> > Or not proceed with this patch at all.  If this is to be the
> > only code which wishes to perform page list motion at interrupt
> > time, perhaps it's not justifiable?
> >
> > I really don't have a feeling for how valuable this is, nor
> > do I know whether there will be other code which wants to
> > perform page list manipulation at interrupt time.
> 
> I can think of a few other places that would like to perform page
> motion from irq context: anything else doing zero copy or page
> flipping, and more importantly the O(1) vm code that's being worked
> on.  The latter is actually quite important as we've got a number
> of customers running into problems with some of the algorithms in
> the 2.4 kernel where the kernel does not perform any list motion
> from irq context and this results in excess cpu time spent traversing
> lists to see if io has completed.

Interesting.  Would it be possible to get more details on this
problem?

We can shuffle pages around on the LRU from interrupts at present.
But not the address_space lists and buffer stuff.

We could remove the address_space lists and page->list altogether
with a bit of radix tree work.  I started in on that but got distracted.

mapping->clean_pages is actually obsolete now - we never walk it.  It
could be replaced with list_del_init() everywhere.

> > In fact I also don't know where the whole AIO thing sits at
> > present.  Is it all done and finished?  Is there more to come,
> > and if so, what??
> 
> There's more to come.  The bits I'm working on are running in kernel
> context mainly to simplify the copy_*_user case since we don't have
> full zero copy semantics available and coping with pinned pages is
> a challenge in a multiuser system, plus it makes reusing the existing
> networking code a lot easier.  Basically, anything that involves a
> copy of data is likely to be better implemented running in a task to
> get the priority of execution correct, whereas anything involving
> zero copy io is going to want completion from irq or bottom half
> context and hence dirty pages.  Does that make sense?

OK, thanks.  So I guess we make those locks irq-safe.  I don't
expect that would be feasible in 2.4 because of the hold times in 
truncate - it should be OK in 2.5.  I'll do the patch.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2002-11-06  7:17 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-11-06  1:03 [PATCH 2/2] 2.5.46 AIO support for raw/O_DIRECT Badari Pulavarty
2002-11-06  1:17 ` Andrew Morton
2002-11-06  7:05   ` Benjamin LaHaise
2002-11-06  7:23     ` Andrew Morton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox