public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* patch: aio + bio for raw io
@ 2002-02-08  7:53 Benjamin LaHaise
  2002-02-08  9:40 ` Suparna Bhattacharya
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Benjamin LaHaise @ 2002-02-08  7:53 UTC (permalink / raw)
  To: linux-aio; +Cc: linux-kernel

Quick message: this patch makes aio use bio directly for brw_kvec_async.  
This is against yesterday's patchset.  Comments?

		-ben

===== fs/Makefile 1.15 vs 1.16 =====
--- 1.15/fs/Makefile	Wed Jan 30 02:21:55 2002
+++ 1.16/fs/Makefile	Fri Feb  8 14:57:54 2002
@@ -22,6 +22,7 @@
 obj-y += noquot.o
 endif
 
+export-objs += aio.o
 obj-y += aio.o
 
 subdir-$(CONFIG_PROC_FS)	+= proc
===== fs/buffer.c 1.60 vs 1.61 =====
--- 1.60/fs/buffer.c	Tue Jan 15 05:53:34 2002
+++ 1.61/fs/buffer.c	Fri Feb  8 17:22:14 2002
@@ -54,6 +54,8 @@
 #include <asm/bitops.h>
 #include <asm/mmu_context.h>
 
+extern struct bio *bio_setup_from_kvec(int gfp_mask, struct kvec *kvec);
+
 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
@@ -2764,16 +2766,26 @@
  * It is up to the caller to make sure that there are enough blocks
  * passed in to completely map the iobufs to disk.
  */
+static int brw_kvec_end_io(struct bio *bio, int nr_sectors)
+{
+	kvec_cb_t cb = bio->cb;
+	int res = nr_sectors * 512;
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		res = -EIO;
+	bio_put(bio);
+	cb.fn(cb.data, cb.vec, res);
+	return 0;
+}
 
-int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift)
+int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, sector_t blknr, int sector_shift)
 {
 	struct kvec	*vec = cb.vec;
 	struct kveclet	*veclet;
-	int		err;
 	int		length;
 	unsigned	sector_size = 1 << sector_shift;
 	int		i;
 
+	struct bio	*bio;
 	struct brw_cb	*brw_cb;
 
 	if (!vec->nr)
@@ -2795,125 +2807,21 @@
 	if (length < (blocks << sector_shift))
 		BUG();
 
-	/* 
-	 * OK to walk down the iovec doing page IO on each page we find. 
-	 */
-	err = 0;
-
 	if (!blocks) {
 		printk("brw_kiovec_async: !i\n");
 		return -EINVAL;
 	}
 
-	/* FIXME: tie into userbeans here */
-	brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL);
-	if (!brw_cb)
+	bio = bio_setup_from_kvec(GFP_KERNEL, vec);
+	if (unlikely(!bio))
 		return -ENOMEM;
-
-	brw_cb->cb = cb;
-	brw_cb->nr = 0;
-
-	/* This is ugly.  FIXME. */
-	for (i=0, veclet=vec->veclet; i<vec->nr; i++,veclet++) {
-		struct page *page = veclet->page;
-		unsigned offset = veclet->offset;
-		unsigned length = veclet->length;
-
-		if (!page)
-			BUG();
-
-		while (length > 0) {
-			struct buffer_head *tmp;
-			tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO);
-			err = -ENOMEM;
-			if (!tmp)
-				goto error;
-
-			tmp->b_dev = B_FREE;
-			tmp->b_size = sector_size;
-			set_bh_page(tmp, page, offset);
-			tmp->b_this_page = tmp;
-
-			init_buffer(tmp, end_buffer_io_kiobuf_async, NULL);
-			tmp->b_dev = dev;
-			tmp->b_blocknr = blknr++;
-			tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock)
-					| (1 << BH_Req);
-			tmp->b_private = brw_cb;
-
-			if (rw == WRITE) {
-				set_bit(BH_Uptodate, &tmp->b_state);
-				clear_bit(BH_Dirty, &tmp->b_state);
-			}
-
-			brw_cb->bh[brw_cb->nr++] = tmp;
-			length -= sector_size;
-			offset += sector_size;
-
-			if (offset >= PAGE_SIZE) {
-				offset = 0;
-				break;
-			}
-
-			if (brw_cb->nr >= blocks)
-				goto submit;
-		} /* End of block loop */
-	} /* End of page loop */		
-
-submit:
-	atomic_set(&brw_cb->io_count, brw_cb->nr+1);
-	/* okay, we've setup all our io requests, now fire them off! */
-	for (i=0; i<brw_cb->nr; i++) 
-		submit_bh(rw, brw_cb->bh[i]);
-	brw_cb_put(brw_cb);
+	bio->cb = cb;
+	bio->bi_sector = blknr;
+	bio->bi_dev = dev;
+	bio->bi_vcnt = vec->nr;
+	bio->bi_size = length;
+	bio->bi_end_io = brw_kvec_end_io;
+	submit_bio(rw, bio);
 
 	return 0;
-
-error:
-	/* Walk brw_cb_table freeing all the goop associated with each kiobuf */
-	if (brw_cb) {
-		/* We got an error allocating the bh'es.  Just free the current
-		   buffer_heads and exit. */
-		for (i=0; i<brw_cb->nr; i++)
-			kmem_cache_free(bh_cachep, brw_cb->bh[i]);
-		kfree(brw_cb);
-	}
-
-	return err;
-}
-#if 0
-int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
-		kdev_t dev, int nr_blocks, unsigned long b[], int sector_size)
-{
-	int i;
-	int transferred = 0;
-	int err = 0;
-
-	if (!nr)
-		return 0;
-
-	/* queue up and trigger the io */
-	err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size);
-	if (err)
-		goto out;
-
-	/* wait on the last iovec first -- it's more likely to finish last */
-	for (i=nr; --i >= 0; )
-		kiobuf_wait_for_io(iovec[i]);
-
-	run_task_queue(&tq_disk);
-
-	/* okay, how much data actually got through? */
-	for (i=0; i<nr; i++) {
-		if (iovec[i]->errno) {
-			if (!err)
-				err = iovec[i]->errno;
-			break;
-		}
-		transferred += iovec[i]->length;
-	}
-
-out:
-	return transferred ? transferred : err;
 }
-#endif
===== fs/bio.c 1.14 vs 1.15 =====
--- 1.14/fs/bio.c	Thu Feb  7 16:13:25 2002
+++ 1.15/fs/bio.c	Fri Feb  8 17:22:14 2002
@@ -151,6 +151,22 @@
 	return NULL;
 }
 
+struct bio *bio_setup_from_kvec(int gfp_mask, struct kvec *kvec)
+{
+	struct bio *bio = mempool_alloc(bio_pool, gfp_mask);
+	struct bio_vec *bvl = NULL;
+
+	if (unlikely(!bio))
+		return NULL;
+
+	bio->bi_max = kvec->max_nr;
+	bio_init(bio);
+	bio->bi_destructor = bio_destructor;
+	bio->bi_io_vec = (struct bio_vec *)kvec->veclet;	/* shoot me */
+	bio->bi_flags |= 1 << BIO_CLONED;	/* don't free the vec */
+	return bio;
+}
+
 /**
  * bio_put - release a reference to a bio
  * @bio:   bio to release reference to
===== include/linux/kiovec.h 1.1 vs 1.2 =====
--- 1.1/include/linux/kiovec.h	Sat Jan 12 02:19:10 2002
+++ 1.2/include/linux/kiovec.h	Fri Feb  8 15:00:54 2002
@@ -6,8 +6,8 @@
 
 struct kveclet {
 	struct page	*page;
-	unsigned	offset;
 	unsigned	length;
+	unsigned	offset;
 };
 
 struct kvec {
===== include/linux/bio.h 1.11 vs 1.12 =====
--- 1.11/include/linux/bio.h	Wed Feb  6 01:23:04 2002
+++ 1.12/include/linux/bio.h	Fri Feb  8 17:22:23 2002
@@ -26,6 +26,8 @@
 #define BIO_VMERGE_BOUNDARY	0
 #endif
 
+#include <linux/kiovec.h>
+
 #define BIO_DEBUG
 
 #ifdef BIO_DEBUG
@@ -91,6 +93,7 @@
 	void			*bi_private;
 
 	bio_destructor_t	*bi_destructor;	/* destructor */
+	kvec_cb_t		cb;
 };
 
 /*
===== fs/aio.c 1.1 vs 1.2 =====
--- 1.1/fs/aio.c	Wed Jan 30 13:12:34 2002
+++ 1.2/fs/aio.c	Fri Feb  8 14:58:10 2002
@@ -37,6 +37,7 @@
 #include <linux/smp_lock.h>
 #include <linux/compiler.h>
 #include <linux/poll.h>
+#include <linux/module.h>
 
 #include <asm/uaccess.h>
 
@@ -964,3 +965,8 @@
 }
 
 __initcall(aio_setup);
+EXPORT_SYMBOL_GPL(generic_file_kvec_read);
+EXPORT_SYMBOL_GPL(generic_file_aio_read);
+EXPORT_SYMBOL_GPL(generic_file_kvec_write);
+EXPORT_SYMBOL_GPL(generic_file_aio_write);
+EXPORT_SYMBOL_GPL(generic_file_new_read);

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08  7:53 patch: aio + bio for raw io Benjamin LaHaise
@ 2002-02-08  9:40 ` Suparna Bhattacharya
  2002-02-08 22:02   ` Benjamin LaHaise
  2002-02-08 21:07 ` Badari Pulavarty
  2002-02-11 21:37 ` Badari Pulavarty
  2 siblings, 1 reply; 13+ messages in thread
From: Suparna Bhattacharya @ 2002-02-08  9:40 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-aio, linux-kernel

On Fri, Feb 08, 2002 at 02:53:13AM -0500, Benjamin LaHaise wrote:
> Quick message: this patch makes aio use bio directly for brw_kvec_async.  
> This is against yesterday's patchset.  Comments?
> 

I was looking for this in yesterday's aio patchset for 2.5 and was just 
about to send you a note asking you about this :)

You chose to add a kvec_cb field to the bio structure rather than use
bi_private ?
 
For the raw path, you are OK since you never have to copy data out of 
the kvecs after i/o completion, and unmap_kvec only looks at veclet pages. 
So the fact block can change the offset and len fields in the veclets 
doesn't affect you, but thought I'd mention it as a point of caution
anyhow ...

> 		-ben
> 
> ===== fs/Makefile 1.15 vs 1.16 =====
> --- 1.15/fs/Makefile	Wed Jan 30 02:21:55 2002
> +++ 1.16/fs/Makefile	Fri Feb  8 14:57:54 2002
> @@ -22,6 +22,7 @@
>  obj-y += noquot.o
>  endif
>  
> +export-objs += aio.o
>  obj-y += aio.o
>  
>  subdir-$(CONFIG_PROC_FS)	+= proc
> ===== fs/buffer.c 1.60 vs 1.61 =====
> --- 1.60/fs/buffer.c	Tue Jan 15 05:53:34 2002
> +++ 1.61/fs/buffer.c	Fri Feb  8 17:22:14 2002
> @@ -54,6 +54,8 @@
>  #include <asm/bitops.h>
>  #include <asm/mmu_context.h>
>  
> +extern struct bio *bio_setup_from_kvec(int gfp_mask, struct kvec *kvec);
> +
>  #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
>  #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
>  #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
> @@ -2764,16 +2766,26 @@
>   * It is up to the caller to make sure that there are enough blocks
>   * passed in to completely map the iobufs to disk.
>   */
> +static int brw_kvec_end_io(struct bio *bio, int nr_sectors)
> +{
> +	kvec_cb_t cb = bio->cb;
> +	int res = nr_sectors * 512;
> +	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
> +		res = -EIO;
> +	bio_put(bio);
> +	cb.fn(cb.data, cb.vec, res);
> +	return 0;
> +}
>  
> -int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, unsigned long blknr, int sector_shift)
> +int brw_kvec_async(int rw, kvec_cb_t cb, kdev_t dev, unsigned blocks, sector_t blknr, int sector_shift)
>  {
>  	struct kvec	*vec = cb.vec;
>  	struct kveclet	*veclet;
> -	int		err;
>  	int		length;
>  	unsigned	sector_size = 1 << sector_shift;
>  	int		i;
>  
> +	struct bio	*bio;
>  	struct brw_cb	*brw_cb;
>  
>  	if (!vec->nr)
> @@ -2795,125 +2807,21 @@
>  	if (length < (blocks << sector_shift))
>  		BUG();
>  
> -	/* 
> -	 * OK to walk down the iovec doing page IO on each page we find. 
> -	 */
> -	err = 0;
> -
>  	if (!blocks) {
>  		printk("brw_kiovec_async: !i\n");
>  		return -EINVAL;
>  	}
>  
> -	/* FIXME: tie into userbeans here */
> -	brw_cb = kmalloc(sizeof(*brw_cb) + (blocks * sizeof(struct buffer_head *)), GFP_KERNEL);
> -	if (!brw_cb)
> +	bio = bio_setup_from_kvec(GFP_KERNEL, vec);
> +	if (unlikely(!bio))
>  		return -ENOMEM;
> -
> -	brw_cb->cb = cb;
> -	brw_cb->nr = 0;
> -
> -	/* This is ugly.  FIXME. */
> -	for (i=0, veclet=vec->veclet; i<vec->nr; i++,veclet++) {
> -		struct page *page = veclet->page;
> -		unsigned offset = veclet->offset;
> -		unsigned length = veclet->length;
> -
> -		if (!page)
> -			BUG();
> -
> -		while (length > 0) {
> -			struct buffer_head *tmp;
> -			tmp = kmem_cache_alloc(bh_cachep, GFP_NOIO);
> -			err = -ENOMEM;
> -			if (!tmp)
> -				goto error;
> -
> -			tmp->b_dev = B_FREE;
> -			tmp->b_size = sector_size;
> -			set_bh_page(tmp, page, offset);
> -			tmp->b_this_page = tmp;
> -
> -			init_buffer(tmp, end_buffer_io_kiobuf_async, NULL);
> -			tmp->b_dev = dev;
> -			tmp->b_blocknr = blknr++;
> -			tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock)
> -					| (1 << BH_Req);
> -			tmp->b_private = brw_cb;
> -
> -			if (rw == WRITE) {
> -				set_bit(BH_Uptodate, &tmp->b_state);
> -				clear_bit(BH_Dirty, &tmp->b_state);
> -			}
> -
> -			brw_cb->bh[brw_cb->nr++] = tmp;
> -			length -= sector_size;
> -			offset += sector_size;
> -
> -			if (offset >= PAGE_SIZE) {
> -				offset = 0;
> -				break;
> -			}
> -
> -			if (brw_cb->nr >= blocks)
> -				goto submit;
> -		} /* End of block loop */
> -	} /* End of page loop */		
> -
> -submit:
> -	atomic_set(&brw_cb->io_count, brw_cb->nr+1);
> -	/* okay, we've setup all our io requests, now fire them off! */
> -	for (i=0; i<brw_cb->nr; i++) 
> -		submit_bh(rw, brw_cb->bh[i]);
> -	brw_cb_put(brw_cb);
> +	bio->cb = cb;
> +	bio->bi_sector = blknr;
> +	bio->bi_dev = dev;
> +	bio->bi_vcnt = vec->nr;
> +	bio->bi_size = length;
> +	bio->bi_end_io = brw_kvec_end_io;
> +	submit_bio(rw, bio);
>  
>  	return 0;
> -
> -error:
> -	/* Walk brw_cb_table freeing all the goop associated with each kiobuf */
> -	if (brw_cb) {
> -		/* We got an error allocating the bh'es.  Just free the current
> -		   buffer_heads and exit. */
> -		for (i=0; i<brw_cb->nr; i++)
> -			kmem_cache_free(bh_cachep, brw_cb->bh[i]);
> -		kfree(brw_cb);
> -	}
> -
> -	return err;
> -}
> -#if 0
> -int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
> -		kdev_t dev, int nr_blocks, unsigned long b[], int sector_size)
> -{
> -	int i;
> -	int transferred = 0;
> -	int err = 0;
> -
> -	if (!nr)
> -		return 0;
> -
> -	/* queue up and trigger the io */
> -	err = brw_kiovec_async(rw, nr, iovec, dev, nr_blocks, b, sector_size);
> -	if (err)
> -		goto out;
> -
> -	/* wait on the last iovec first -- it's more likely to finish last */
> -	for (i=nr; --i >= 0; )
> -		kiobuf_wait_for_io(iovec[i]);
> -
> -	run_task_queue(&tq_disk);
> -
> -	/* okay, how much data actually got through? */
> -	for (i=0; i<nr; i++) {
> -		if (iovec[i]->errno) {
> -			if (!err)
> -				err = iovec[i]->errno;
> -			break;
> -		}
> -		transferred += iovec[i]->length;
> -	}
> -
> -out:
> -	return transferred ? transferred : err;
>  }
> -#endif
> ===== fs/bio.c 1.14 vs 1.15 =====
> --- 1.14/fs/bio.c	Thu Feb  7 16:13:25 2002
> +++ 1.15/fs/bio.c	Fri Feb  8 17:22:14 2002
> @@ -151,6 +151,22 @@
>  	return NULL;
>  }
>  
> +struct bio *bio_setup_from_kvec(int gfp_mask, struct kvec *kvec)
> +{
> +	struct bio *bio = mempool_alloc(bio_pool, gfp_mask);
> +	struct bio_vec *bvl = NULL;
> +
> +	if (unlikely(!bio))
> +		return NULL;
> +
> +	bio->bi_max = kvec->max_nr;
> +	bio_init(bio);
> +	bio->bi_destructor = bio_destructor;
> +	bio->bi_io_vec = (struct bio_vec *)kvec->veclet;	/* shoot me */
> +	bio->bi_flags |= 1 << BIO_CLONED;	/* don't free the vec */
> +	return bio;
> +}
> +
>  /**
>   * bio_put - release a reference to a bio
>   * @bio:   bio to release reference to
> ===== include/linux/kiovec.h 1.1 vs 1.2 =====
> --- 1.1/include/linux/kiovec.h	Sat Jan 12 02:19:10 2002
> +++ 1.2/include/linux/kiovec.h	Fri Feb  8 15:00:54 2002
> @@ -6,8 +6,8 @@
>  
>  struct kveclet {
>  	struct page	*page;
> -	unsigned	offset;
>  	unsigned	length;
> +	unsigned	offset;
>  };
>  
>  struct kvec {
> ===== include/linux/bio.h 1.11 vs 1.12 =====
> --- 1.11/include/linux/bio.h	Wed Feb  6 01:23:04 2002
> +++ 1.12/include/linux/bio.h	Fri Feb  8 17:22:23 2002
> @@ -26,6 +26,8 @@
>  #define BIO_VMERGE_BOUNDARY	0
>  #endif
>  
> +#include <linux/kiovec.h>
> +
>  #define BIO_DEBUG
>  
>  #ifdef BIO_DEBUG
> @@ -91,6 +93,7 @@
>  	void			*bi_private;
>  
>  	bio_destructor_t	*bi_destructor;	/* destructor */
> +	kvec_cb_t		cb;
>  };
>  
>  /*
> ===== fs/aio.c 1.1 vs 1.2 =====
> --- 1.1/fs/aio.c	Wed Jan 30 13:12:34 2002
> +++ 1.2/fs/aio.c	Fri Feb  8 14:58:10 2002
> @@ -37,6 +37,7 @@
>  #include <linux/smp_lock.h>
>  #include <linux/compiler.h>
>  #include <linux/poll.h>
> +#include <linux/module.h>
>  
>  #include <asm/uaccess.h>
>  
> @@ -964,3 +965,8 @@
>  }
>  
>  __initcall(aio_setup);
> +EXPORT_SYMBOL_GPL(generic_file_kvec_read);
> +EXPORT_SYMBOL_GPL(generic_file_aio_read);
> +EXPORT_SYMBOL_GPL(generic_file_kvec_write);
> +EXPORT_SYMBOL_GPL(generic_file_aio_write);
> +EXPORT_SYMBOL_GPL(generic_file_new_read);
> --
> To unsubscribe, send a message with 'unsubscribe linux-aio' in
> the body to majordomo@kvack.org.  For more info on Linux AIO,
> see: http://www.kvack.org/aio/

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08  7:53 patch: aio + bio for raw io Benjamin LaHaise
  2002-02-08  9:40 ` Suparna Bhattacharya
@ 2002-02-08 21:07 ` Badari Pulavarty
  2002-02-08 21:18   ` arjan
  2002-02-08 22:13   ` Benjamin LaHaise
  2002-02-11 21:37 ` Badari Pulavarty
  2 siblings, 2 replies; 13+ messages in thread
From: Badari Pulavarty @ 2002-02-08 21:07 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-aio, linux-kernel

> 
> Quick message: this patch makes aio use bio directly for brw_kvec_async.  
> This is against yesterday's patchset.  Comments?
> 
> 		-ben

Hi Ben,

I am looking at the 2.5 patch you sent out. I have few questions/comments:

1) brw_kvec_async() does not seem to split IO at BIO_MAX_SIZE. I thought
   each bio can handle only BIO_MAX_SIZE (ll_rw_kio() is creating one bio
   for each BIO_MAX_SIZE IO). 

   And also, currently BIO_MAX_SIZE is only 64K. In fact, if I try to issue
   64K IO using submit_bio(), I get following BUG() on my QLOGIC controller.

	kernel BUG at ll_rw_blk.c:1336 

	Code is: BUG_ON(bio_sectors(bio) > q->max_sectors); 

        	bio_sectors(bio) is 128 
        	q->max_sectors is 64 (for QLOGIC ISP) 

   Is this going to be fixed ? Should "bio" be able to handle any
   size IO ? Or should we create a new bio for each BIO_MAX_SIZE IO ?


2) Could you please make map_user_kvec() generic enough to handle mapping
   of multiple iovecs to single kvec (to handle readv/writev). I think
   this is a very easy change:

	* Add alloc_kvec() and take out the kmalloc() from map_user_kvec().
	* Change map_user_kvec() to start mapping from kvec->veclet[kvec->nr]
      	  instead of kvec->veclet[0]

  This way allocation for kvec to hold all the iovecs can be done at higher
  level and map_user_kvec() can be called in a loop once for each iovec.

  Then we can add read/writev support for RAW IO very easily. I am looking
  at making use of kvec for RAW IO and also add support for readv/writev.
  (I already sent a patch to do this - but since you ported all your AIO
   work to 2.5, we can work on your infrastructure :)

Please let me know what you think.

Thanks,
Badari

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08 21:07 ` Badari Pulavarty
@ 2002-02-08 21:18   ` arjan
  2002-02-08 22:13   ` Benjamin LaHaise
  1 sibling, 0 replies; 13+ messages in thread
From: arjan @ 2002-02-08 21:18 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: linux-kernel

In article <200202082107.g18L7wx26206@eng2.beaverton.ibm.com> you wrote:

> 1) brw_kvec_async() does not seem to split IO at BIO_MAX_SIZE. I thought
>   each bio can handle only BIO_MAX_SIZE (ll_rw_kio() is creating one bio
>   for each BIO_MAX_SIZE IO). 

>   And also, currently BIO_MAX_SIZE is only 64K. Infact, if I try to issue
>   64K IO using submit_bio(), I get following BUG() on my QLOGIC controller.

>        kernel BUG at ll_rw_blk.c:1336 

>        Code is: BUG_ON(bio_sectors(bio) > q->max_sectors); 

>                bio_sectors(bio) is 128 
>                q->max_sectors is 64 (for QLOGIC ISP) 

this is a bio bug. BIO should split if needed.
(Oh and qlogic hw can easily handle 1024 sector-sized requests)



> 2) Could you please make map_user_kvec() generic enough to handle mapping
>   of mutliple iovecs to single kvec (to handle readv/writev).

I think the "move all readv/writev to one single kvec" is a mistake. The
OPPOSITE should happen. If you submit a huge single vector it should be
split up internally. This would also be the fix for the "submit the entire
vector so far and sync wait on it after 512Kb" performance bug in the normal
rawio code, since it can just submit partial (say 256Kb sized) vectors and
wait for ANY one of them before going over a 512Kb boundary.

Sure, readv/writev are not optimal now. But that is because the kernel waits for
IO complete per vector element instead of submitting them all async and
waiting at the end (or in the aio case, not wait at all).

Greetings,
  Arjan van de Ven.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08  9:40 ` Suparna Bhattacharya
@ 2002-02-08 22:02   ` Benjamin LaHaise
  2002-02-11 18:11     ` Suparna Bhattacharya
  0 siblings, 1 reply; 13+ messages in thread
From: Benjamin LaHaise @ 2002-02-08 22:02 UTC (permalink / raw)
  To: Suparna Bhattacharya; +Cc: linux-aio, linux-kernel

On Fri, Feb 08, 2002 at 03:10:09PM +0530, Suparna Bhattacharya wrote:
> You chose to add a kvec_cb field to the bio structure rather than use
> bi_private ?

Yup.  I'm lazy.  Also, the cb struct actually needs 2 pointers, so just 
bi_private isn't enough.

> For the raw path, you are OK since you never have to copy data out of 
> the kvecs after i/o completion, and unmap_kvec only looks at veclet pages. 
> So the fact block can change the offset and len fields in the veclets 
> doesn't affect you, but thought I'd mention it as a point of caution
> anyhow ...

Ugh.  That sounds like something bio should not be doing.  If someone 
wants to fix it, such a patch would be much appreciated, otherwise it'll 
wait until I'm back in Canada.

		-ben

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08 21:07 ` Badari Pulavarty
  2002-02-08 21:18   ` arjan
@ 2002-02-08 22:13   ` Benjamin LaHaise
  2002-02-08 22:54     ` Linus Torvalds
                       ` (2 more replies)
  1 sibling, 3 replies; 13+ messages in thread
From: Benjamin LaHaise @ 2002-02-08 22:13 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: linux-aio, linux-kernel, axboe

On Fri, Feb 08, 2002 at 01:07:58PM -0800, Badari Pulavarty wrote:
> I am looking at the 2.5 patch you sent out. I have few questions/comments:
> 
> 1) brw_kvec_async() does not seem to split IO at BIO_MAX_SIZE. I thought
>    each bio can handle only BIO_MAX_SIZE (ll_rw_kio() is creating one bio
>    for each BIO_MAX_SIZE IO). 

Sounds like a needless restriction in bio, especially as one of the design 
requirements for the 2.5 block work is that we're able to support large ios 
(think 4MB page support).

>    And also, currently BIO_MAX_SIZE is only 64K. Infact, if I try to issue
>    64K IO using submit_bio(), I get following BUG() on my QLOGIC controller.

Jens?  Also, why does the bio callback return a value?  Nothing could ever 
possibly use it as far as I can see.

> 2) Could you please make map_user_kvec() generic enough to handle mapping
>    of mutliple iovecs to single kvec (to handle readv/writev). I think
>    this is a very easy change:
> 
> 	* Add alloc_kvec() and take out the kmalloc() from map_user_kvec().
> 	* Change map_user_kvec() to start mapping from kvec->veclet[kvec->nr]
>       	  instead of kvec->veclet[0]
> 
>   This way allocation for kvec to hold all the iovecs can be done at higher
>   level and map_user_kvec() can be called in a loop once for each iovec.

Sounds good.

		-ben

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08 22:13   ` Benjamin LaHaise
@ 2002-02-08 22:54     ` Linus Torvalds
       [not found]     ` <200202082254.g18Mspq08299@penguin.transmeta.com>
  2002-02-11 11:09     ` Jens Axboe
  2 siblings, 0 replies; 13+ messages in thread
From: Linus Torvalds @ 2002-02-08 22:54 UTC (permalink / raw)
  To: linux-kernel

In article <20020208171327.B12788@redhat.com>,
Benjamin LaHaise  <bcrl@redhat.com> wrote:
>On Fri, Feb 08, 2002 at 01:07:58PM -0800, Badari Pulavarty wrote:
>> I am looking at the 2.5 patch you sent out. I have few questions/comments:
>> 
>> 1) brw_kvec_async() does not seem to split IO at BIO_MAX_SIZE. I thought
>>    each bio can handle only BIO_MAX_SIZE (ll_rw_kio() is creating one bio
>>    for each BIO_MAX_SIZE IO). 
>
>Sounds like a needless restriction in bio, especially as one of the design 
>requirements for the 2.5 block work is that we're able to support large ios 
>(think 4MB page support).

bio can handle arbitrarily large IO's, BUT it can never split them. 

Basically, IO splitting is NOT the job of the IO layer.  So you can make
any size request you want, but you had better know that the hardware you
send it to can take it.  The bio layer basically guarantees only that
you can send a single contiguous request of PAGE_SIZE, nothing more (in
fact, we might at some point get away from even that, and only guarantee
sectors - with things like loopback remapping etc you might have trouble
even for "contiguous" requests). 

Now, before you say "that's stupid, I don't know what the driver limits
are", ask yourself: 
 - what is it that you want to go fast?
 - what is it that you CAN make fast?

The answer to the "want" question is: the common case. And like it or
not, the common case is never going to be 4MB pages.

The answer to the "can" question is: merging can be fast, splitting
fundamentally cannot.

Splitting a request _fundamentally_ involves memory management (at the
very least you have to allocate a new request), while growing a request
can (and does) mean just adding an entry to the end of a list (until you
cannot grow it any more, of course, but that's the point where you have
to end anyway, so..)

Now, think about that for five minutes, and if you don't come back with
the right answer, you get an F.

In short:

 - the right answer to handling 4MB pages is not to push complexity into
   the low-level drivers and make them try to handle requests that are
   bigger than the hardware can do. 

   In fact, we don't even want to handle it in the mid layers, because
   (a) the mid layers have historically been even more flaky than some
   device drivers and (b) it's a performance loss to even test for the
   common case where the splitting is neither needed nor wanted.

 - the _right_ answer to handling big areas is to build up big bio's
   from smaller ones. And no, you don't have to call the elevator in
   between requests that you know are consecutive on the disk.

   Another way of saying it: if you have 4MB worth of IO, it's YOUR
   responsibility to do the work to make it fit the controller. It is off
   the default case, and _you_ do a bit of extra work instead of asking
   everybody else to do your heavy lifting for you.

Does bio have the interfaces to do this yet? No.  But if you think that
bio's should natively handle any kind of request at all, you're really
barking up the wrong tree. 

If you are in the small small _small_ minority care about 4MB requests,
you should build the infrastructure not to make drivers split them, but
to build up a list of bio's and then submit them all consecutively in
one go.

Remember: checking the limits as you build stuff up is easy, and fast. 

So you should make sure that you never EVER cause anybody to want to
split a bio. 

		Linus

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
       [not found]     ` <200202082254.g18Mspq08299@penguin.transmeta.com>
@ 2002-02-09  0:01       ` Benjamin LaHaise
  2002-02-09  0:25         ` Linus Torvalds
  0 siblings, 1 reply; 13+ messages in thread
From: Benjamin LaHaise @ 2002-02-09  0:01 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, linux-aio

On Fri, Feb 08, 2002 at 02:54:51PM -0800, Linus Torvalds wrote:
> bio can handle arbitrarily large IO's, BUT it can never split them. 

I agree that it should not split ios, but it should not needlessly limit 
the size of them either -- that choice should be left to individual 
drivers.

...
> If you are in the small small _small_ minority care about 4MB requests,
> you should build the infrastructure not to make drivers split them, but
> to build up a list of bio's and then submit them all consecutively in
> one go.
> 
> Remember: checking the limits as you build stuff up is easy, and fast. 
> 
> So you should make sure that you never EVER cause anybody to want to
> split a bio. 

Yup.  What we need is an interface for getting the max size of an io -- 
I can see this being needed for filesystems, char devices, network drivers, 
basically anything that we can do aio/"direct" io on.  Given that, I can 
put the split / pipelining code into the generic layer.

		-ben

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-09  0:01       ` Benjamin LaHaise
@ 2002-02-09  0:25         ` Linus Torvalds
  2002-02-14  4:46           ` Suparna Bhattacharya
  0 siblings, 1 reply; 13+ messages in thread
From: Linus Torvalds @ 2002-02-09  0:25 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-kernel, linux-aio


On Fri, 8 Feb 2002, Benjamin LaHaise wrote:
>
> Yup.  What we need is an interface for getting the max size of an io --

No. There is no such thing.

There is no "max size". There are various different limits, and "size" is
usually the last one on the list. The limitations are things like "I can
have at most N consecutive segments" or "crossing a 64kB border is fine,
but implies a new segment" or "a single segment is limited to X bytes, and
the _sum_ of all segments are limited to Y bytes" or..

And it can depend on the _address_ of the thing you're writing. If the
address is above a bounce limit, the bouncing code ends up having to copy
it to accessible memory, so you can have a device that can do a 4MB
request in one go if it's directly accessible, but if it is not in the low
XXX bits, then it gets split into chunks and copied down, at which point
you may only be able to do N chunks at a time.

And no, I didn't make any of these examples up.

A "max size" does not work. It needs to be a lot more complex than that.
For block devices, you need the whole "struct request_queue" to describe
the default cases, and even then there are function pointers to let
individual drivers limits of their own _outside_ those cases.

So it basically needs to be a "grow_bio()" function that does the choice,
not a size limitation.

(And then most devices will just use one of a few standard "grow()"
functions, of course - you need the flexibility, but at the same time
there is a lot of common cases).

		Linus


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08 22:13   ` Benjamin LaHaise
  2002-02-08 22:54     ` Linus Torvalds
       [not found]     ` <200202082254.g18Mspq08299@penguin.transmeta.com>
@ 2002-02-11 11:09     ` Jens Axboe
  2 siblings, 0 replies; 13+ messages in thread
From: Jens Axboe @ 2002-02-11 11:09 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Badari Pulavarty, linux-aio, linux-kernel, axboe

On Fri, Feb 08 2002, Benjamin LaHaise wrote:
> >    And also, currently BIO_MAX_SIZE is only 64K. Infact, if I try to issue
> >    64K IO using submit_bio(), I get following BUG() on my QLOGIC controller.
> 
> Jens?  Also, why does the bio callback return a value?  Nothing could ever 
> possibly use it as far as I can see.

WRT max size, Linus already replied why this is so. The other thing --
the callback return value was really to have it return 'done or not'
when doing partial completions. It was later decided that the end_io
callback should only be called for final completion, so really the
nr_sectors and return value is not used now. I'll clean that up next
time around.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08 22:02   ` Benjamin LaHaise
@ 2002-02-11 18:11     ` Suparna Bhattacharya
  0 siblings, 0 replies; 13+ messages in thread
From: Suparna Bhattacharya @ 2002-02-11 18:11 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-aio, linux-kernel

On Fri, Feb 08, 2002 at 05:02:57PM -0500, Benjamin LaHaise wrote:
> On Fri, Feb 08, 2002 at 03:10:09PM +0530, Suparna Bhattacharya wrote:
> > You chose to add a kvec_cb field to the bio structure rather than use
> > bi_private ?
> 
> Yup.  I'm lazy.  Also, the cb struct actually needs 2 pointers, so just 
> bi_private isn't enough.
> 
> > For the raw path, you are OK since you never have to copy data out of 
> > the kvecs after i/o completion, and unmap_kvec only looks at veclet pages. 
> > So the fact block can change the offset and len fields in the veclets 
> > doesn't affect you, but thought I'd mention it as a point of caution
> > anyhow ...
> 
> Ugh.  That sounds like something bio should not be doing.  If someone 
> wants to fix it, such a patch would be much appreciated, otherwise it'll 
> wait until I'm back in Canada.

I had posted a suggestion on lkml earlier about this problem and one way 
to fix this - at that time I was thinking of bio->bi_voffset field (the
thought was that it could even be useful for cloning/splitting), and
was waiting for a decision from Jens about it. 
(Your kvec_dst is another way :))

However I'm now wondering if the rq->hard_cur_sectors can be used to 
achieve a similar effect (won't help with splitting, but might suffice
for the request transfer, without requiring new bio fields). 


> 
> 		-ben

------------------------------------------



>From suparna@in.ibm.com Fri Dec 28 15:50:06 2001
Date: Fri, 28 Dec 2001 15:50:06 -0500
From: Suparna Bhattacharya <suparna@in.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: axboe@suse.de
Subject: bio_io_vec and kvecs integration
Message-ID: <20011228155006.A1519@in.ibm.com>
Reply-To: suparna@in.ibm.com
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
User-Agent: Mutt/1.2.5i
Status: RO
Content-Length: 1831
Lines: 36

Since bio, aio and zero-copy networking code all make use of a similar 
representation of a memory vector or i/o currency, I was looking at
what it would take at bio level to make sure that these descriptors can
be passed down as is to the block layer (without having to translate or
copy descriptors). 

Ben has already posted some patches to change bio and skbuff fragments to
his kveclet representation. Once all these subsystems share the same
representation, the next step is to be able to pass the vectors around as
is.

Some of the changes towards this sort of a thing went in during one of
the early updates to bio. The bio structure was modified to maintain a 
pointer to the vector list, so it can point to a vector list sent down
to it by the calling subsystem. Well, this change was also needed for
bio cloning (for lvm etc) and that's all it's been used for so far since 
we don't have kvecs in the kernel yet.

I think just a little more work is needed to get bio layer to take in
a vector directly. This is because currently the block layer could modify
the bv_offset and bv_len fields in the vector directly (as part of
end_that_request_first processing) in case the whole segment/veclet/bvec 
doesn't get transferred/completed in one shot. This could result in 
unexpected effects for the higher layer if it tried to interpret the
vector or say copy data from it after the i/o completes.

So, should we add a bi_voffset field, which can be used in conjunction with
bi_vidx to locate exactly where to start the next i/o transfer from/to, and 
try to leave bv_offset and bv_len fields untouched during request processing ?
Or is there some other way to do this ?

One of these days we're going to have to sort out the kiobuf/kvec decision
for 2.5, but that's probably a discussion for a little later.

Regards
Suparna


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-08  7:53 patch: aio + bio for raw io Benjamin LaHaise
  2002-02-08  9:40 ` Suparna Bhattacharya
  2002-02-08 21:07 ` Badari Pulavarty
@ 2002-02-11 21:37 ` Badari Pulavarty
  2 siblings, 0 replies; 13+ messages in thread
From: Badari Pulavarty @ 2002-02-11 21:37 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: linux-aio, linux-kernel

> 
> Quick message: this patch makes aio use bio directly for brw_kvec_async.  
> This is against yesterday's patchset.  Comments?
> 
Hi Ben,

I have one more question on your latest aio patch.

raw_kvec_rw() seem to handle max upto 512K (max_sectors = 2500). 
But I don't see anywhere that you loop through to satisfy the entire 
IO request.

Infact, generic_aio_read() is mapping the user buffer for the iosize 
and calling file->f_op->kvec_read(file, cb, iosize, pos). 
How does the io > 512K gets handled ?

NOTE: I have not looked at the userlevel code. Is IO split at max 512K
      at userlevel ?

Thanks,
Badari

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: patch: aio + bio for raw io
  2002-02-09  0:25         ` Linus Torvalds
@ 2002-02-14  4:46           ` Suparna Bhattacharya
  0 siblings, 0 replies; 13+ messages in thread
From: Suparna Bhattacharya @ 2002-02-14  4:46 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Benjamin LaHaise, linux-kernel, linux-aio, arjanv

A variation in perspective on this. 

One could look at a bio simply as a completion unit. A caller into 
block would split up its i/os depending on the granularity (latency) 
at which it would like to see partial completion information 
propagated about the i/o as it progresses, in case it is available. 
Of course this would at best be a hint, as the true granularity 
depends on how much the driver handles in one go, and in case of 
merges this would be coarser than what the bio splitup indicates. 
(Arjan, this is also in the light of your concerns about latency 
impacts of making readv/writev vectors mappable to the same kvec; 
the same kvec can be mapped to multiple bios without any difficulty
and the advantage is that it can be split uniformly irrespective
of user space iovec element boundaries)

Alignment requirements might need to be honoured at this level
(this seems inevitable), but when it comes to the size restrictions, 
here are a few aspects to keep in mind.


> bio can handle arbitrarily large IO's, BUT it can never split them. 
> 
> The answer to the "can" question is: merging can be fast, splitting
> fundamentally cannot.
> 
> Splitting a request _fundamentally_ involves memory management (at the
> very least you have to allocate a new request), while growing a request
> can (and does) mean just adding an entry to the end of a list (until you
> cannot grow it any more, of course, but that's the point where you have
> to end anyway, so..)

Splitting of bios by block layer or a driver should only be needed 
in case the i/o needs to be split or striped across multiple 
devices/queues/offsets as in the case of lvm or software raid 
(remappings). If the caller sends down bios with the right limits 
(with the help of grow_bio), then the need for allocating bios for 
splits may be avoided in several such cases, but not always 
(e.g raid5, bad block remapping).

In situations where it is just a matter of the low level driver not 
being able to handle a single request in one shot, it only needs 
a bit of state information to be able to complete it in chunks. 
There is _no_ need for memory allocation in this case. Most of the 
infrastructure for this already exists in bio and is in use for 
a few IDE cases; I have in mind a few enhancements and helpers to 
make this more generic. 

The maximum i/o limits (size + segment limits etc) do have another
implication in terms of how much it makes sense to merge into a 
single request, especially when it comes to more complicated 
merge decisions accounting for latencies/deadlines/priorities. 

So, having a grow_bio or some such function which _combines_ the 
constraints of all the driver layers involved as far as possible, 
could help with minimizing bio splitting due to remapping and 
account for balanced merges. 

It is just that the degree of benefits seems less than it might 
appear at first sight if we are willing to depend on the bio 
infrastructure available to enable drivers to handle partial 
requests (i.e. complete a single request in smaller chunks). 

Also, notice that combining these layered constraints to the most 
conservative sizes could result in more granular splits than might 
be absolutely necessary, which incurs extra costs down the 
completion path (the need to combine completion status from all 
the pieces, even though they might have ultimately been transferred 
to the device in one shot). 

Regards
Suparna

On Fri, Feb 08, 2002 at 04:25:25PM -0800, Linus Torvalds wrote:
> On Fri, 8 Feb 2002, Benjamin LaHaise wrote:
> >
> > Yup.  What we need is an interface for getting the max size of an io --
> 
> No. There is no such thing.
> 
> There is no "max size". There are various different limits, and "size" is
> usually the last one on the list. The limitations are things like "I can
> have at most N consecutive segments" or "crossing a 64kB border is fine,
> but implies a new segment" or "a single segment is limited to X bytes, and
> the _sum_ of all segments are limited to Y bytes" or..
> 
> And it can depend on the _address_ of the thing you're writing. If the
> address is above a bounce limit, the bouncing code ends up having to copy
> it to accessible memory, so you can have a device that can do a 4MB
> request in one go if it's directly accessible, but if it is not in the low
> XXX bits, then it gets split into chunks and copied down, at which point
> you may only be able to do N chunks at a time.
> 
> And no, I didn't make any of these examples up.
> 
> A "max size" does not work. It needs to be a lot more complex than that.
> For block devices, you need the whole "struct request_queue" to describe
> the default cases, and even then there are function pointers to let
> individual drivers impose limits of their own _outside_ those cases.
> 
> So it basically needs to be a "grow_bio()" function that does the choice,
> not a size limitation.
> 
> (And then most devices will just use one of a few standard "grow()"
> functions, of course - you need the flexibility, but at the same time
> there are a lot of common cases).
> 
> 		Linus
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-aio' in
> the body to majordomo@kvack.org.  For more info on Linux AIO,
> see: http://www.kvack.org/aio/

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2002-02-14  4:45 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-02-08  7:53 patch: aio + bio for raw io Benjamin LaHaise
2002-02-08  9:40 ` Suparna Bhattacharya
2002-02-08 22:02   ` Benjamin LaHaise
2002-02-11 18:11     ` Suparna Bhattacharya
2002-02-08 21:07 ` Badari Pulavarty
2002-02-08 21:18   ` arjan
2002-02-08 22:13   ` Benjamin LaHaise
2002-02-08 22:54     ` Linus Torvalds
     [not found]     ` <200202082254.g18Mspq08299@penguin.transmeta.com>
2002-02-09  0:01       ` Benjamin LaHaise
2002-02-09  0:25         ` Linus Torvalds
2002-02-14  4:46           ` Suparna Bhattacharya
2002-02-11 11:09     ` Jens Axboe
2002-02-11 21:37 ` Badari Pulavarty

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox