[PATCH 3/3] Btrfs: add basic DIO read support

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 3/3] Btrfs: add basic DIO read support
@ 2010-05-06 19:01 Josef Bacik
  2010-05-06 21:14 ` Christoph Hellwig
  2010-05-07  9:55 ` Jamie Lokier
  0 siblings, 2 replies; 6+ messages in thread
From: Josef Bacik @ 2010-05-06 19:01 UTC (permalink / raw)
  To: linux-btrfs, linux-kernel, linux-fsdevel, linux-mm

This provides basic DIO support for reads only.  It does not do any of the work
to recover from mismatching checksums, that will come later.  A few design
changes have been made from Jim's code (sorry Jim!)

1) Use the generic direct-io code.  Jim originally re-wrote all the generic DIO
code in order to account for all of BTRFS's oddities, but thanks to that work it
seems like the best bet is to just ignore compression and such and just opt to
fallback on buffered IO.

2) Fallback on buffered IO for compressed or inline extents.  Jim's code did
it's own buffering to make dio with compressed extents work.  Now we just
fallback onto normal buffered IO.

3) Lock the entire range during DIO.  I originally had it so we would lock the
extents as get_block was called, and then unlock them as the endio function was
called, which worked great, but if we ever had an error in the submit_io hook,
we could have locked an extent that would never be submitted for IO, so we
wouldn't be able to unlock it, so this solution fixed that problem and made it a
bit cleaner.

I've tested this with fsx and everything works great.  This patch depends on my
dio and filemap.c patches to work.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h     |    2 +
 fs/btrfs/file-item.c |   25 +++++-
 fs/btrfs/inode.c     |  208 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 230 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a724..36994e8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2257,6 +2257,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 logical_offset, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a2550..34ea718 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+				   struct inode *inode, struct bio *bio,
+				   u64 logical_offset, u32 *dst, int dio)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
-	u64 offset;
+	u64 offset = 0;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);
 
 	disk_bytenr = (u64)bio->bi_sector << 9;
+	if (dio)
+		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		if (!dio)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
@@ -238,6 +242,7 @@ found:
 		else
 			set_state_private(io_tree, offset, sum);
 		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -245,6 +250,18 @@ found:
 	return 0;
 }
 
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 offset, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc64..ce7008a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4875,11 +4875,217 @@ out:
 	return em;
 }
 
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	u64 start = iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+	 * io.  INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety lets just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fallback to buffered.
+	 *
+	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
+	 * to buffered IO.  Don't blame me, this is the price we pay for using
+	 * the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (em->block_start == EXTENT_MAP_HOLE) {
+		free_extent_map(em);
+		return 0;
+	}
+
+	/*
+	 * We use the relative offset in the file so that our own submit bio
+	 * routine will do the mapping to the real blocks.
+	 */
+	bh_result->b_blocknr = start >> inode->i_blkbits;
+	bh_result->b_size = em->len - (start - em->start);
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	set_buffer_boundary(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_USER0);
+
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				      " %llu csum %u private %u\n",
+				      inode->i_ino, (unsigned long long)start,
+				      csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map *em;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int ret = 0;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+	if (!dip->csums) {
+		kfree(dip);
+		bio_endio(bio, -ENOMEM);
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = (u64)bio->bi_sector << 9;
+
+	start = dip->logical_offset;
+	em = btrfs_get_extent(inode, NULL, 0, start, bvec->bv_len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_err;
+	}
+
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		printk(KERN_ERR "dio to inode resulted in a bad extent "
+		       "(%llu) %llu\n", (unsigned long long)em->block_start, start);
+		ret = -EIO;
+		free_extent_map(em);
+		goto out_err;
+	}
+
+	bio->bi_sector =
+		(em->block_start + (dip->logical_offset - em->start)) >> 9;
+	bio->bi_private = dip;
+
+	free_extent_map(em);
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	bio->bi_end_io = btrfs_endio_direct;
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 0);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+	bio_endio(bio, ret);
+}
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_ordered_extent *ordered;
+	u64 len = 0;
+	ssize_t ret;
+	int i;
+
+	if (rw == WRITE)
+		return 0;
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, offset,
+				 offset + iov_length(iov, nr_segs) - 1, 0,
+				 &cached_state, GFP_NOFS);
+		ordered = btrfs_lookup_ordered_extent(inode, offset);
+		if (!ordered)
+			break;
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, offset,
+				     offset + iov_length(iov, nr_segs) - 1,
+				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	ret = blockdev_direct_IO_own_submit(rw, iocb, inode, NULL, iov,
+					    offset, nr_segs,
+					    btrfs_get_blocks_direct,
+					    btrfs_submit_direct);
+
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, offset,
+			     offset + iov_length(iov, nr_segs) - 1,
+			     &cached_state, GFP_NOFS);
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-- 
1.6.6.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] Btrfs: add basic DIO read support
  2010-05-06 19:01 [PATCH 3/3] Btrfs: add basic DIO read support Josef Bacik
@ 2010-05-06 21:14 ` Christoph Hellwig
  2010-05-07  9:55 ` Jamie Lokier
  1 sibling, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2010-05-06 21:14 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, linux-kernel, linux-fsdevel, linux-mm

> +struct btrfs_dio_private {
> +	struct inode *inode;
> +	u64 logical_offset;
> +	u32 *csums;
> +	void *private;
> +};
> +
> +static void btrfs_endio_direct(struct bio *bio, int err)
> +{
> +	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
> +	struct bio_vec *bvec = bio->bi_io_vec;
> +	struct btrfs_dio_private *dip = bio->bi_private;
> +	struct inode *inode = dip->inode;
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	u64 start;
> +	u32 *private = dip->csums;
> +
> +	start = dip->logical_offset;
> +	do {
> +		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
> +			struct page *page = bvec->bv_page;
> +			char *kaddr;
> +			u32 csum = ~(u32)0;
> +
> +			kaddr = kmap_atomic(page, KM_USER0);

KM_USER0 seems wrong given that the bio completion callback can and
usually will be called from some kind of IRQ context.

> +	ret = blockdev_direct_IO_own_submit(rw, iocb, inode, NULL, iov,
> +					    offset, nr_segs,
> +					    btrfs_get_blocks_direct,
> +					    btrfs_submit_direct);

Don't you need to do some alignment checks of your own given that you
don't pass in a block device?

Btw, passing in the bdev here is a really horrible API, I'd much rather
move this to the callers..

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] Btrfs: add basic DIO read support
  2010-05-06 19:01 [PATCH 3/3] Btrfs: add basic DIO read support Josef Bacik
  2010-05-06 21:14 ` Christoph Hellwig
@ 2010-05-07  9:55 ` Jamie Lokier
  2010-05-07 13:40   ` Josef Bacik
  1 sibling, 1 reply; 6+ messages in thread
From: Jamie Lokier @ 2010-05-07  9:55 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs, linux-kernel, linux-fsdevel, linux-mm

Josef Bacik wrote:
> 3) Lock the entire range during DIO.  I originally had it so we would lock the
> extents as get_block was called, and then unlock them as the endio function was
> called, which worked great, but if we ever had an error in the submit_io hook,
> we could have locked an extent that would never be submitted for IO, so we
> wouldn't be able to unlock it, so this solution fixed that problem and made it a
> bit cleaner.

Does this prevent concurrent DIOs to overlapping or nearby ranges?

Thanks,
-- Jamie

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 3/3] Btrfs: add basic DIO read support
  2010-05-07  9:55 ` Jamie Lokier
@ 2010-05-07 13:40   ` Josef Bacik
  0 siblings, 0 replies; 6+ messages in thread
From: Josef Bacik @ 2010-05-07 13:40 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Josef Bacik, linux-btrfs, linux-kernel, linux-fsdevel, linux-mm

On Fri, May 07, 2010 at 10:55:37AM +0100, Jamie Lokier wrote:
> Josef Bacik wrote:
> > 3) Lock the entire range during DIO.  I originally had it so we would lock the
> > extents as get_block was called, and then unlock them as the endio function was
> > called, which worked great, but if we ever had an error in the submit_io hook,
> > we could have locked an extent that would never be submitted for IO, so we
> > wouldn't be able to unlock it, so this solution fixed that problem and made it a
> > bit cleaner.
> 
> Does this prevent concurrent DIOs to overlapping or nearby ranges?
> 

It just prevents them from overlapping areas.  Thanks,

Josef

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 3/3] Btrfs: add basic DIO read support
@ 2010-05-03 17:28 Josef Bacik
  0 siblings, 0 replies; 6+ messages in thread
From: Josef Bacik @ 2010-05-03 17:28 UTC (permalink / raw)
  To: linux-btrfs, linux-fsdevel, linux-kernel, linux-mm

This provides basic DIO support for reads only.  It does not do any of the work
to recover from mismatching checksums, that will come later.  A few design
changes have been made from Jim's code (sorry Jim!)

1) Use the generic direct-io code.  Jim originally re-wrote all the generic DIO
code in order to account for all of BTRFS's oddities, but thanks to that work it
seems like the best bet is to just ignore compression and such and just opt to
fallback on buffered IO.

2) Fallback on buffered IO for compressed or inline extents.  Jim's code did
it's own buffering to make dio with compressed extents work.  Now we just
fallback onto normal buffered IO.

3) Lock the entire range during DIO.  I originally had it so we would lock the
extents as get_block was called, and then unlock them as the endio function was
called, which worked great, but if we ever had an error in the submit_io hook,
we could have locked an extent that would never be submitted for IO, so we
wouldn't be able to unlock it, so this solution fixed that problem and made it a
bit cleaner.

I've tested this with fsx and everything works great.  This patch depends on my
dio and filemap.c patches to work.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h     |    2 +
 fs/btrfs/file-item.c |   25 +++++-
 fs/btrfs/inode.c     |  208 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 230 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a724..36994e8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2257,6 +2257,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 logical_offset, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a2550..34ea718 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+				   struct inode *inode, struct bio *bio,
+				   u64 logical_offset, u32 *dst, int dio)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
-	u64 offset;
+	u64 offset = 0;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);
 
 	disk_bytenr = (u64)bio->bi_sector << 9;
+	if (dio)
+		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		if (!dio)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
@@ -238,6 +242,7 @@ found:
 		else
 			set_state_private(io_tree, offset, sum);
 		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -245,6 +250,18 @@ found:
 	return 0;
 }
 
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 offset, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc64..ce7008a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4875,11 +4875,217 @@ out:
 	return em;
 }
 
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	u64 start = iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+	 * io.  INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety lets just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fallback to buffered.
+	 *
+	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
+	 * to buffered IO.  Don't blame me, this is the price we pay for using
+	 * the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (em->block_start == EXTENT_MAP_HOLE) {
+		free_extent_map(em);
+		return 0;
+	}
+
+	/*
+	 * We use the relative offset in the file so that our own submit bio
+	 * routine will do the mapping to the real blocks.
+	 */
+	bh_result->b_blocknr = start >> inode->i_blkbits;
+	bh_result->b_size = em->len - (start - em->start);
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	set_buffer_boundary(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_USER0);
+
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				      " %llu csum %u private %u\n",
+				      inode->i_ino, (unsigned long long)start,
+				      csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map *em;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int ret = 0;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+	if (!dip->csums) {
+		kfree(dip);
+		bio_endio(bio, -ENOMEM);
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = (u64)bio->bi_sector << 9;
+
+	start = dip->logical_offset;
+	em = btrfs_get_extent(inode, NULL, 0, start, bvec->bv_len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_err;
+	}
+
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		printk(KERN_ERR "dio to inode resulted in a bad extent "
+		       "(%llu) %llu\n", (unsigned long long)em->block_start, start);
+		ret = -EIO;
+		free_extent_map(em);
+		goto out_err;
+	}
+
+	bio->bi_sector =
+		(em->block_start + (dip->logical_offset - em->start)) >> 9;
+	bio->bi_private = dip;
+
+	free_extent_map(em);
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	bio->bi_end_io = btrfs_endio_direct;
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 0);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+	bio_endio(bio, ret);
+}
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_ordered_extent *ordered;
+	u64 len = 0;
+	ssize_t ret;
+	int i;
+
+	if (rw == WRITE)
+		return 0;
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, offset,
+				 offset + iov_length(iov, nr_segs) - 1, 0,
+				 &cached_state, GFP_NOFS);
+		ordered = btrfs_lookup_ordered_extent(inode, offset);
+		if (!ordered)
+			break;
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, offset,
+				     offset + iov_length(iov, nr_segs) - 1,
+				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	ret = blockdev_direct_IO_own_submit(rw, iocb, inode, NULL, iov,
+					    offset, nr_segs,
+					    btrfs_get_blocks_direct,
+					    btrfs_submit_direct);
+
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, offset,
+			     offset + iov_length(iov, nr_segs) - 1,
+			     &cached_state, GFP_NOFS);
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-- 
1.6.6.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 1/3] fs: allow short direct-io reads to be completed via buffered IO
@ 2010-05-03 16:11 Josef Bacik
  2010-05-03 16:11 ` [PATCH 2/3] direct-io: add a hook for the fs to provide its own submit_bio function Josef Bacik
  0 siblings, 1 reply; 6+ messages in thread
From: Josef Bacik @ 2010-05-03 16:11 UTC (permalink / raw)
  To: linux-btrfs, linux-fsdevel, linux-kernel, linux-mm; +Cc: Josef Bacik

This is similar to what already happens in the write case.  If we have a short
read while doing O_DIRECT, instead of just returning, fallthrough and try to
read the rest via buffered IO.  BTRFS needs this because if we encounter a
compressed or inline extent during DIO, we need to fallback on buffered.  If the
extent is compressed we need to read the entire thing into memory and
de-compress it into the users pages.  I have tested this with fsx and everything
works great.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 mm/filemap.c |   19 +++++++++++++++----
 1 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda..cc804d9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1263,7 +1263,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *filp = iocb->ki_filp;
 	ssize_t retval;
-	unsigned long seg;
+	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
 
@@ -1292,19 +1292,30 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			}
 			if (retval > 0)
 				*ppos = pos + retval;
-			if (retval) {
+			if (retval < 0 || !count) {
 				file_accessed(filp);
 				goto out;
 			}
 		}
 	}
 
+	count = retval;
 	for (seg = 0; seg < nr_segs; seg++) {
 		read_descriptor_t desc;
+		loff_t offset = 0;
+
+		if (count) {
+			if (count > iov[seg].iov_len) {
+				count -= iov[seg].iov_len;
+				continue;
+			}
+			offset = count;
+			count = 0;
+		}
 
 		desc.written = 0;
-		desc.arg.buf = iov[seg].iov_base;
-		desc.count = iov[seg].iov_len;
+		desc.arg.buf = iov[seg].iov_base + offset;
+		desc.count = iov[seg].iov_len - offset;
 		if (desc.count == 0)
 			continue;
 		desc.error = 0;
-- 
1.6.6.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/3] direct-io: add a hook for the fs to provide its own submit_bio function
  2010-05-03 16:11 [PATCH 1/3] fs: allow short direct-io reads to be completed via buffered IO Josef Bacik
@ 2010-05-03 16:11 ` Josef Bacik
  2010-05-03 16:11   ` [PATCH 3/3] Btrfs: add basic DIO read support Josef Bacik
  0 siblings, 1 reply; 6+ messages in thread
From: Josef Bacik @ 2010-05-03 16:11 UTC (permalink / raw)
  To: linux-btrfs, linux-fsdevel, linux-kernel, linux-mm; +Cc: Josef Bacik

Because BTRFS can do RAID and such, we need our own submit hook so we can setup
the bio's in the correct fashion, and handle checksum errors properly.  So there
are a few changes here

1) The submit_io hook.  This is straightforward, just call this instead of
submit_bio.

2) Honor the boundary flag a little more specifically.  BTRFS needs to supply
the _logical_ offset to the map_bh in it's get_block function so when we go to
submit the IO we can look up the right checksum information.  In order for this
to work out, we need to make sure each extent we map through get_block gets sent
individually.  The direct IO code tries to merge things together that appear to
be side by side, but since we are using logical offsets, thats always going to
be the case.  So instead of clearing dio->boundary when we create a new bio,
save it in a local variable, and submit the io if boundary was set.

3) Allow the fs to return -ENOTBLK for reads.  Usually this has only worked for
writes, since writes can fallback onto buffered IO.  But BTRFS needs the option
of falling back on buffered IO if it encounters a compressed extent, since we
need to read the entire extent in and decompress it.  So if we get -ENOTBLK back
from get_block we'll return back and fallback on buffered just like the write
case.

I've tested these changes with fsx and everything seems to work.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/direct-io.c     |   32 ++++++++++++++++++++++++++------
 include/linux/fs.h |   19 ++++++++++++++++---
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2..3d174df 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,7 @@ struct dio {
 	int reap_counter;		/* rate limit reaping */
 	get_block_t *get_block;		/* block mapping function */
 	dio_iodone_t *end_io;		/* IO completion function */
+	dio_submit_t *submit_io;	/* IO submition function */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
 					   in dio_blocks units */
@@ -300,6 +301,17 @@ static void dio_bio_end_io(struct bio *bio, int error)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 }
 
+void dio_end_io(struct bio *bio, int error)
+{
+	struct dio *dio = bio->bi_private;
+
+	if (dio->is_async)
+		dio_bio_end_aio(bio, error);
+	else
+		dio_bio_end_io(bio, error);
+}
+EXPORT_SYMBOL(dio_end_io);
+
 static int
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 		sector_t first_sector, int nr_vecs)
@@ -340,7 +352,10 @@ static void dio_bio_submit(struct dio *dio)
 	if (dio->is_async && dio->rw == READ)
 		bio_set_pages_dirty(bio);
 
-	submit_bio(dio->rw, bio);
+	if (!dio->submit_io)
+		submit_bio(dio->rw, bio);
+	else
+		dio->submit_io(dio->rw, bio, dio->inode);
 
 	dio->bio = NULL;
 	dio->boundary = 0;
@@ -600,6 +615,7 @@ static int dio_bio_add_page(struct dio *dio)
  */
 static int dio_send_cur_page(struct dio *dio)
 {
+	int boundary = dio->boundary;
 	int ret = 0;
 
 	if (dio->bio) {
@@ -612,7 +628,7 @@ static int dio_send_cur_page(struct dio *dio)
 		 * Submit now if the underlying fs is about to perform a
 		 * metadata read
 		 */
-		if (dio->boundary)
+		if (boundary)
 			dio_bio_submit(dio);
 	}
 
@@ -629,6 +645,8 @@ static int dio_send_cur_page(struct dio *dio)
 			ret = dio_bio_add_page(dio);
 			BUG_ON(ret != 0);
 		}
+	} else if (boundary) {
+		dio_bio_submit(dio);
 	}
 out:
 	return ret;
@@ -935,7 +953,7 @@ static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
 	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
-	struct dio *dio)
+	dio_submit_t submit_io, struct dio *dio)
 {
 	unsigned long user_addr; 
 	unsigned long flags;
@@ -952,6 +970,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 	dio->get_block = get_block;
 	dio->end_io = end_io;
+	dio->submit_io = submit_io;
 	dio->final_block_in_bio = -1;
 	dio->next_block_for_io = -1;
 
@@ -1008,7 +1027,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && (rw & WRITE)) {
+	if (ret == -ENOTBLK) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1110,7 +1129,7 @@ ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	int flags)
+	dio_submit_t submit_io,	int flags)
 {
 	int seg;
 	size_t size;
@@ -1197,7 +1216,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		(end > i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_block, end_io, dio);
+				nr_segs, blkbits, get_block, end_io,
+				submit_io, dio);
 
 	/*
 	 * In case of error extending write may have instantiated a few
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 44f35ae..d56f0be 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2250,10 +2250,14 @@ static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
 #endif
 
 #ifdef CONFIG_BLOCK
+struct bio;
+typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode);
+void dio_end_io(struct bio *bio, int error);
+
 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	int lock_type);
+	dio_submit_t submit_io,	int lock_type);
 
 enum {
 	/* need locking between buffered and direct access */
@@ -2269,7 +2273,7 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 	dio_iodone_t end_io)
 {
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				    nr_segs, get_block, end_io,
+				    nr_segs, get_block, end_io, NULL,
 				    DIO_LOCKING | DIO_SKIP_HOLES);
 }
 
@@ -2279,7 +2283,16 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
 	dio_iodone_t end_io)
 {
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				nr_segs, get_block, end_io, 0);
+				    nr_segs, get_block, end_io, NULL, 0);
+}
+
+static inline ssize_t blockdev_direct_IO_own_submit(int rw, struct kiocb *iocb,
+	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs, get_block_t get_block,
+	dio_submit_t submit_io)
+{
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				    nr_segs, get_block, NULL, submit_io, 0);
 }
 #endif
 
-- 
1.6.6.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/3] Btrfs: add basic DIO read support
  2010-05-03 16:11 ` [PATCH 2/3] direct-io: add a hook for the fs to provide its own submit_bio function Josef Bacik
@ 2010-05-03 16:11   ` Josef Bacik
  0 siblings, 0 replies; 6+ messages in thread
From: Josef Bacik @ 2010-05-03 16:11 UTC (permalink / raw)
  To: linux-btrfs, linux-fsdevel, linux-kernel, linux-mm; +Cc: Josef Bacik

This provides basic DIO support for reads only.  It does not do any of the work
to recover from mismatching checksums, that will come later.  A few design
changes have been made from Jim's code (sorry Jim!)

1) Use the generic direct-io code.  Jim originally re-wrote all the generic DIO
code in order to account for all of BTRFS's oddities, but thanks to that work it
seems like the best bet is to just ignore compression and such and just opt to
fallback on buffered IO.

2) Fallback on buffered IO for compressed or inline extents.  Jim's code did
it's own buffering to make dio with compressed extents work.  Now we just
fallback onto normal buffered IO.

3) Lock the entire range during DIO.  I originally had it so we would lock the
extents as get_block was called, and then unlock them as the endio function was
called, which worked great, but if we ever had an error in the submit_io hook,
we could have locked an extent that would never be submitted for IO, so we
wouldn't be able to unlock it, so this solution fixed that problem and made it a
bit cleaner.

I've tested this with fsx and everything works great.  This patch depends on my
dio and filemap.c patches to work.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h     |    2 +
 fs/btrfs/file-item.c |   25 +++++-
 fs/btrfs/inode.c     |  208 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 230 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a724..36994e8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2257,6 +2257,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 logical_offset, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a2550..34ea718 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+				   struct inode *inode, struct bio *bio,
+				   u64 logical_offset, u32 *dst, int dio)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
-	u64 offset;
+	u64 offset = 0;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);
 
 	disk_bytenr = (u64)bio->bi_sector << 9;
+	if (dio)
+		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		if (!dio)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
@@ -238,6 +242,7 @@ found:
 		else
 			set_state_private(io_tree, offset, sum);
 		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -245,6 +250,18 @@ found:
 	return 0;
 }
 
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 offset, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc64..ce7008a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4875,11 +4875,217 @@ out:
 	return em;
 }
 
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	u64 start = iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+	 * io.  INLINE is special, and we could probably kludge it in here, but
+	 * it's still buffered so for safety lets just fall back to the generic
+	 * buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what we
+	 * do, so go ahead and fallback to buffered.
+	 *
+	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
+	 * to buffered IO.  Don't blame me, this is the price we pay for using
+	 * the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (em->block_start == EXTENT_MAP_HOLE) {
+		free_extent_map(em);
+		return 0;
+	}
+
+	/*
+	 * We use the relative offset in the file so that our own submit bio
+	 * routine will do the mapping to the real blocks.
+	 */
+	bh_result->b_blocknr = start >> inode->i_blkbits;
+	bh_result->b_size = em->len - (start - em->start);
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	set_buffer_boundary(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_USER0);
+
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				      " %llu csum %u private %u\n",
+				      inode->i_ino, (unsigned long long)start,
+				      csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map *em;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int ret = 0;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+	if (!dip->csums) {
+		kfree(dip);
+		bio_endio(bio, -ENOMEM);
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = (u64)bio->bi_sector << 9;
+
+	start = dip->logical_offset;
+	em = btrfs_get_extent(inode, NULL, 0, start, bvec->bv_len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_err;
+	}
+
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		printk(KERN_ERR "dio to inode resulted in a bad extent "
+		       "(%llu) %llu\n", (unsigned long long)em->block_start, start);
+		ret = -EIO;
+		free_extent_map(em);
+		goto out_err;
+	}
+
+	bio->bi_sector =
+		(em->block_start + (dip->logical_offset - em->start)) >> 9;
+	bio->bi_private = dip;
+
+	free_extent_map(em);
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	bio->bi_end_io = btrfs_endio_direct;
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 0);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+	bio_endio(bio, ret);
+}
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_ordered_extent *ordered;
+	u64 len = 0;
+	ssize_t ret;
+	int i;
+
+	if (rw == WRITE)
+		return 0;
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, offset,
+				 offset + iov_length(iov, nr_segs) - 1, 0,
+				 &cached_state, GFP_NOFS);
+		ordered = btrfs_lookup_ordered_extent(inode, offset);
+		if (!ordered)
+			break;
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, offset,
+				     offset + iov_length(iov, nr_segs) - 1,
+				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	ret = blockdev_direct_IO_own_submit(rw, iocb, inode, NULL, iov,
+					    offset, nr_segs,
+					    btrfs_get_blocks_direct,
+					    btrfs_submit_direct);
+
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, offset,
+			     offset + iov_length(iov, nr_segs) - 1,
+			     &cached_state, GFP_NOFS);
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-- 
1.6.6.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2010-05-07 13:40 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-05-06 19:01 [PATCH 3/3] Btrfs: add basic DIO read support Josef Bacik
2010-05-06 21:14 ` Christoph Hellwig
2010-05-07  9:55 ` Jamie Lokier
2010-05-07 13:40   ` Josef Bacik
  -- strict thread matches above, loose matches on Subject: below --
2010-05-03 17:28 Josef Bacik
2010-05-03 16:11 [PATCH 1/3] fs: allow short direct-io reads to be completed via buffered IO Josef Bacik
2010-05-03 16:11 ` [PATCH 2/3] direct-io: add a hook for the fs to provide its own submit_bio function Josef Bacik
2010-05-03 16:11   ` [PATCH 3/3] Btrfs: add basic DIO read support Josef Bacik

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).