linux-fsdevel.vger.kernel.org archive mirror
* Re: [patch] fsblock preview
       [not found] <20080911165335.GA31244@wotan.suse.de>
@ 2008-09-12  9:48 ` Nick Piggin
       [not found] ` <20080914221500.GH27080@wotan.suse.de>
  1 sibling, 0 replies; 6+ messages in thread
From: Nick Piggin @ 2008-09-12  9:48 UTC (permalink / raw)
  To: linux-fsdevel

On Thu, Sep 11, 2008 at 06:53:35PM +0200, Nick Piggin wrote:
> 
> I've been doing some work on fsblock again lately, so in case anybody might
> find it interesting, here is a "preview" patch. Basically it compiles and
> runs OK for me here, under a few stress tests. I wouldn't say it is close to
> bug free, and it needs a lot of bits and pieces to polish up like error
> handling.

For some testing with a *real* filesystem :)

Here is an incremental patch which switches XFS to fsblock (not quite
complete or well stress tested, but boots and runs with block size == page
size).

This adds incremental support for delayed allocation and unwritten-extent
state to fsblock, which turned out to be pretty easy.
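
In outline (condensed from the hunks below; error handling omitted), a
delalloc block is marked up front in get_blocks, and only allocated for real
at writepage time via the new MAP_BLOCK_ALLOCATE mode:

	/* __xfs_get_blocks(): reserve only, mark the block delalloc */
	if (iomap.iomap_flags & IOMAP_DELAY) {
		BUG_ON(direct);
		if (create)
			fsb_result->flags |= BL_uptodate|BL_mapped|BL_delay;
	}

	/* fsblock_write_page(): convert the reservation before submitting I/O */
	if ((block->flags & (BL_delay|BL_dirty)) == (BL_delay|BL_dirty)) {
		spin_unlock_block_irq(block);
		ret = map_block(mapping, block, off, MAP_BLOCK_ALLOCATE);
		spin_lock_block_irq(block);
	}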

---

Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
@@ -43,25 +43,38 @@
 #include <linux/writeback.h>
 
 STATIC void
+__xfs_count_block_state(
+	struct fsblock		*fsb,
+	int			*delalloc,
+	int			*unmapped,
+	int			*unwritten)
+{
+	if ((fsb->flags & (BL_uptodate|BL_mapped)) == BL_uptodate)
+		(*unmapped) = 1;
+	else if (fsb->flags & BL_unwritten)
+		(*unwritten) = 1;
+	else if (fsb->flags & BL_delay)
+		(*delalloc) = 1;
+}
+STATIC void
 xfs_count_page_state(
 	struct page		*page,
 	int			*delalloc,
 	int			*unmapped,
 	int			*unwritten)
 {
-	struct buffer_head	*bh, *head;
+	struct fsblock	*fsb;
 
 	*delalloc = *unmapped = *unwritten = 0;
 
-	bh = head = page_buffers(page);
-	do {
-		if (buffer_uptodate(bh) && !buffer_mapped(bh))
-			(*unmapped) = 1;
-		else if (buffer_unwritten(bh))
-			(*unwritten) = 1;
-		else if (buffer_delay(bh))
-			(*delalloc) = 1;
-	} while ((bh = bh->b_this_page) != head);
+	fsb = page_blocks(page);
+	if (fsblock_midpage(fsb)) {
+		__xfs_count_block_state(fsb, delalloc, unmapped, unwritten);
+	} else {
+		struct fsblock *b;
+		for_each_block(fsb, b)
+			__xfs_count_block_state(b, delalloc, unmapped, unwritten);
+	}
 }
 
 #if defined(XFS_RW_TRACE)
@@ -77,7 +90,7 @@ xfs_page_trace(
 	loff_t		offset = page_offset(page);
 	int		delalloc = -1, unmapped = -1, unwritten = -1;
 
-	if (page_has_buffers(page))
+	if (PageBlocks(page))
 		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
 
 	ip = XFS_I(inode);
@@ -137,7 +150,7 @@ xfs_finish_ioend(
 
 /*
  * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
+ * Update the page state via the associated fsblocks,
  * release holds on the inode and bio, and finally free
  * up memory.  Do not use the ioend after this.
  */
@@ -145,11 +158,13 @@ STATIC void
 xfs_destroy_ioend(
 	xfs_ioend_t		*ioend)
 {
-	struct buffer_head	*bh, *next;
+	struct fsblock	*fsb, *next;
 
-	for (bh = ioend->io_buffer_head; bh; bh = next) {
-		next = bh->b_private;
-		bh->b_end_io(bh, !ioend->io_error);
+	for (fsb = ioend->io_fsb_head; fsb; fsb = next) {
+		next = fsb->private;
+		fsb->private = NULL;
+		unlock_block(fsb);
+		fsblock_end_io(fsb, !ioend->io_error);
 	}
 	if (unlikely(ioend->io_error)) {
 		vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error,
@@ -291,8 +306,8 @@ xfs_alloc_ioend(
 	ioend->io_list = NULL;
 	ioend->io_type = type;
 	ioend->io_inode = inode;
-	ioend->io_buffer_head = NULL;
-	ioend->io_buffer_tail = NULL;
+	ioend->io_fsb_head = NULL;
+	ioend->io_fsb_tail = NULL;
 	atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
 	ioend->io_offset = 0;
 	ioend->io_size = 0;
@@ -374,10 +389,11 @@ xfs_submit_ioend_bio(
 
 STATIC struct bio *
 xfs_alloc_ioend_bio(
-	struct buffer_head	*bh)
+	struct fsblock	*fsb)
 {
 	struct bio		*bio;
-	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
+	struct block_device	*bdev = fsb->page->mapping->host->i_sb->s_bdev;
+	int			nvecs = bio_get_nr_vecs(bdev);
 
 	do {
 		bio = bio_alloc(GFP_NOIO, nvecs);
@@ -385,24 +401,27 @@ xfs_alloc_ioend_bio(
 	} while (!bio);
 
 	ASSERT(bio->bi_private == NULL);
-	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-	bio->bi_bdev = bh->b_bdev;
+	bio->bi_sector = fsb->block_nr << (fsblock_bits(fsb) - 9);
+	bio->bi_bdev = bdev;
 	bio_get(bio);
 	return bio;
 }
 
 STATIC void
 xfs_start_buffer_writeback(
-	struct buffer_head	*bh)
+	struct fsblock	*fsb)
 {
-	ASSERT(buffer_mapped(bh));
-	ASSERT(buffer_locked(bh));
-	ASSERT(!buffer_delay(bh));
-	ASSERT(!buffer_unwritten(bh));
-
-	mark_buffer_async_write(bh);
-	set_buffer_uptodate(bh);
-	clear_buffer_dirty(bh);
+	ASSERT(fsb->flags & BL_mapped);
+	ASSERT(fsb->flags & BL_locked);
+	ASSERT(!(fsb->flags & BL_delay));
+	ASSERT(!(fsb->flags & BL_unwritten));
+	ASSERT(!(fsb->flags & BL_uptodate));
+
+	spin_lock_block_irq(fsb);
+	fsb->count++;
+	fsb->flags |= BL_writeback;
+	clear_block_dirty(fsb);
+	spin_unlock_block_irq(fsb);
 }
 
 STATIC void
@@ -420,11 +439,15 @@ xfs_start_page_writeback(
 	/* If no buffers on the page are to be written, finish it here */
 	if (!buffers)
 		end_page_writeback(page);
+	else
+		page_cache_get(page);
 }
 
-static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+static inline int bio_add_buffer(struct bio *bio, struct fsblock *fsb)
 {
-	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+	unsigned int size = fsblock_size(fsb);
+	unsigned int offset = block_page_offset(fsb, size);
+	return bio_add_page(bio, fsb->page, size, offset);
 }
 
 /*
@@ -433,16 +456,16 @@ static inline int bio_add_buffer(struct 
  *
  * Because we may have multiple ioends spanning a page, we need to start
  * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
+ * buffers as we got, then we can end up with a page that only has fsblocks
  * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
+ * fsblocks async write.
  *
  * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
+ * we call it twice for the one page as the code in fsblock_end_io()
+ * assumes that all fsblocks on the page are started at the same time.
  *
  * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * fsblocks, and then submit them for I/O on the second pass.
  */
 STATIC void
 xfs_submit_ioend(
@@ -450,15 +473,15 @@ xfs_submit_ioend(
 {
 	xfs_ioend_t		*head = ioend;
 	xfs_ioend_t		*next;
-	struct buffer_head	*bh;
+	struct fsblock		*fsb;
 	struct bio		*bio;
 	sector_t		lastblock = 0;
 
 	/* Pass 1 - start writeback */
 	do {
 		next = ioend->io_list;
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-			xfs_start_buffer_writeback(bh);
+		for (fsb = ioend->io_fsb_head; fsb; fsb = fsb->private) {
+			xfs_start_buffer_writeback(fsb);
 		}
 	} while ((ioend = next) != NULL);
 
@@ -468,22 +491,22 @@ xfs_submit_ioend(
 		next = ioend->io_list;
 		bio = NULL;
 
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+		for (fsb = ioend->io_fsb_head; fsb; fsb = fsb->private) {
 
 			if (!bio) {
  retry:
-				bio = xfs_alloc_ioend_bio(bh);
-			} else if (bh->b_blocknr != lastblock + 1) {
+				bio = xfs_alloc_ioend_bio(fsb);
+			} else if (fsb->block_nr != lastblock + 1) {
 				xfs_submit_ioend_bio(ioend, bio);
 				goto retry;
 			}
 
-			if (bio_add_buffer(bio, bh) != bh->b_size) {
+			if (bio_add_buffer(bio, fsb) != fsblock_size(fsb)) {
 				xfs_submit_ioend_bio(ioend, bio);
 				goto retry;
 			}
 
-			lastblock = bh->b_blocknr;
+			lastblock = fsb->block_nr;
 		}
 		if (bio)
 			xfs_submit_ioend_bio(ioend, bio);
@@ -492,7 +515,7 @@ xfs_submit_ioend(
 }
 
 /*
- * Cancel submission of all buffer_heads so far in this endio.
+ * Cancel submission of all fsblocks so far in this endio.
  * Toss the endio too.  Only ever called for the initial page
  * in a writepage request, so only ever one page.
  */
@@ -501,16 +524,19 @@ xfs_cancel_ioend(
 	xfs_ioend_t		*ioend)
 {
 	xfs_ioend_t		*next;
-	struct buffer_head	*bh, *next_bh;
+	struct fsblock		*fsb, *next_fsb;
 
 	do {
 		next = ioend->io_list;
-		bh = ioend->io_buffer_head;
+		fsb = ioend->io_fsb_head;
 		do {
-			next_bh = bh->b_private;
-			clear_buffer_async_write(bh);
-			unlock_buffer(bh);
-		} while ((bh = next_bh) != NULL);
+			next_fsb = fsb->private;
+			spin_lock_block_irq(fsb);
+			fsb->flags &= ~BL_writeback;
+			fsb->count--;
+			spin_unlock_block_irq(fsb);
+			unlock_block(fsb);
+		} while ((fsb = next_fsb) != NULL);
 
 		vn_iowake(XFS_I(ioend->io_inode));
 		mempool_free(ioend, xfs_ioend_pool);
@@ -519,14 +545,14 @@ xfs_cancel_ioend(
 
 /*
  * Test to see if we've been building up a completion structure for
- * earlier buffers -- if so, we try to append to this ioend if we
+ * earlier fsblocks -- if so, we try to append to this ioend if we
  * can, otherwise we finish off any current ioend and start another.
  * Return true if we've finished the given ioend.
  */
 STATIC void
 xfs_add_to_ioend(
 	struct inode		*inode,
-	struct buffer_head	*bh,
+	struct fsblock		*fsb,
 	xfs_off_t		offset,
 	unsigned int		type,
 	xfs_ioend_t		**result,
@@ -539,23 +565,23 @@ xfs_add_to_ioend(
 
 		ioend = xfs_alloc_ioend(inode, type);
 		ioend->io_offset = offset;
-		ioend->io_buffer_head = bh;
-		ioend->io_buffer_tail = bh;
+		ioend->io_fsb_head = fsb;
+		ioend->io_fsb_tail = fsb;
 		if (previous)
 			previous->io_list = ioend;
 		*result = ioend;
 	} else {
-		ioend->io_buffer_tail->b_private = bh;
-		ioend->io_buffer_tail = bh;
+		ioend->io_fsb_tail->private = fsb;
+		ioend->io_fsb_tail = fsb;
 	}
 
-	bh->b_private = NULL;
-	ioend->io_size += bh->b_size;
+	fsb->private = NULL;
+	ioend->io_size += fsblock_size(fsb);
 }
 
 STATIC void
 xfs_map_buffer(
-	struct buffer_head	*bh,
+	struct fsblock		*fsb,
 	xfs_iomap_t		*mp,
 	xfs_off_t		offset,
 	uint			block_bits)
@@ -569,13 +595,12 @@ xfs_map_buffer(
 
 	ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME));
 
-	bh->b_blocknr = bn;
-	set_buffer_mapped(bh);
+	map_fsblock(fsb, bn);
 }
 
 STATIC void
 xfs_map_at_offset(
-	struct buffer_head	*bh,
+	struct fsblock		*fsb,
 	loff_t			offset,
 	int			block_bits,
 	xfs_iomap_t		*iomapp)
@@ -583,12 +608,16 @@ xfs_map_at_offset(
 	ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
 	ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
 
-	lock_buffer(bh);
-	xfs_map_buffer(bh, iomapp, offset, block_bits);
-	bh->b_bdev = iomapp->iomap_target->bt_bdev;
-	set_buffer_mapped(bh);
-	clear_buffer_delay(bh);
-	clear_buffer_unwritten(bh);
+	spin_lock_block_irq(fsb);
+	fsb->count++; // XXX: hack
+	spin_unlock_block_irq(fsb);
+
+	lock_block(fsb);
+	spin_lock_block_irq(fsb);
+	xfs_map_buffer(fsb, iomapp, offset, block_bits);
+	fsb->count--;
+	spin_unlock_block_irq(fsb);
+//XXX?	bh->b_bdev = iomapp->iomap_target->bt_bdev;
 }
 
 /*
@@ -606,19 +635,28 @@ xfs_probe_page(
 		return 0;
 
 	if (page->mapping && PageDirty(page)) {
-		if (page_has_buffers(page)) {
-			struct buffer_head	*bh, *head;
+		if (PageBlocks(page)) {
+			struct fsblock	*fsb;
 
-			bh = head = page_buffers(page);
-			do {
-				if (!buffer_uptodate(bh))
-					break;
-				if (mapped != buffer_mapped(bh))
-					break;
-				ret += bh->b_size;
-				if (ret >= pg_offset)
-					break;
-			} while ((bh = bh->b_this_page) != head);
+			fsb = page_blocks(page);
+			if (fsblock_midpage(fsb)) {
+				if (!(fsb->flags & BL_uptodate))
+					return 0;
+				if (mapped != (fsb->flags & BL_mapped))
+					return 0;
+				return PAGE_CACHE_SIZE;
+			} else {
+				struct fsblock *b;
+				for_each_block(fsb, b) {
+					if (!(b->flags & BL_uptodate))
+						break;
+					if (mapped != (b->flags & BL_mapped))
+						break;
+					ret += fsblock_size(fsb);
+					if (ret >= pg_offset)
+						break;
+				}
+			}
 		} else
 			ret = mapped ? 0 : PAGE_CACHE_SIZE;
 	}
@@ -630,8 +668,8 @@ STATIC size_t
 xfs_probe_cluster(
 	struct inode		*inode,
 	struct page		*startpage,
-	struct buffer_head	*bh,
-	struct buffer_head	*head,
+	struct fsblock		*fsb,
+	struct fsblock		*head,
 	int			mapped)
 {
 	struct pagevec		pvec;
@@ -640,11 +678,12 @@ xfs_probe_cluster(
 	int			done = 0, i;
 
 	/* First sum forwards in this page */
-	do {
-		if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh)))
+	if (fsblock_midpage(fsb)) {
+		if (!(fsb->flags & BL_uptodate) ||
+				mapped != (fsb->flags & BL_mapped))
 			return total;
-		total += bh->b_size;
-	} while ((bh = bh->b_this_page) != head);
+		total += fsblock_size(fsb);
+	}
 
 	/* if we reached the end of the page, sum forwards in following pages */
 	tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
@@ -707,21 +746,21 @@ xfs_is_delayed_page(
 	if (PageWriteback(page))
 		return 0;
 
-	if (page->mapping && page_has_buffers(page)) {
-		struct buffer_head	*bh, *head;
+	if (page->mapping && PageBlocks(page)) {
+		struct fsblock		*fsb;
 		int			acceptable = 0;
 
-		bh = head = page_buffers(page);
-		do {
-			if (buffer_unwritten(bh))
+		fsb = page_blocks(page);
+		if (fsblock_midpage(fsb)) { /* XXX: midpage! */
+			if (fsb->flags & BL_unwritten)
 				acceptable = (type == IOMAP_UNWRITTEN);
-			else if (buffer_delay(bh))
+			else if (fsb->flags & BL_delay)
 				acceptable = (type == IOMAP_DELAY);
-			else if (buffer_dirty(bh) && buffer_mapped(bh))
+			else if ((fsb->flags & (BL_dirty|BL_mapped)) == (BL_dirty|BL_mapped))
 				acceptable = (type == IOMAP_NEW);
 			else
-				break;
-		} while ((bh = bh->b_this_page) != head);
+				return 0;
+		}
 
 		if (acceptable)
 			return 1;
@@ -747,7 +786,7 @@ xfs_convert_page(
 	int			startio,
 	int			all_bh)
 {
-	struct buffer_head	*bh, *head;
+	struct fsblock		*fsb;
 	xfs_off_t		end_offset;
 	unsigned long		p_offset;
 	unsigned int		type;
@@ -790,19 +829,20 @@ xfs_convert_page(
 	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 	page_dirty = p_offset / len;
 
-	bh = head = page_buffers(page);
+	/* XXX: midpage */
+	fsb = page_blocks(page);
 	do {
 		if (offset >= end_offset)
 			break;
-		if (!buffer_uptodate(bh))
+		if (!(fsb->flags & BL_uptodate))
 			uptodate = 0;
-		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
+		if (!(PageUptodate(page) || (fsb->flags & BL_uptodate))) {
 			done = 1;
 			continue;
 		}
 
-		if (buffer_unwritten(bh) || buffer_delay(bh)) {
-			if (buffer_unwritten(bh))
+		if (fsb->flags & (BL_unwritten|BL_delay)) {
+			if (fsb->flags & BL_unwritten)
 				type = IOMAP_UNWRITTEN;
 			else
 				type = IOMAP_DELAY;
@@ -815,22 +855,21 @@ xfs_convert_page(
 			ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
 			ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
 
-			xfs_map_at_offset(bh, offset, bbits, mp);
+			xfs_map_at_offset(fsb, offset, bbits, mp);
 			if (startio) {
-				xfs_add_to_ioend(inode, bh, offset,
+				xfs_add_to_ioend(inode, fsb, offset,
 						type, ioendp, done);
 			} else {
-				set_buffer_dirty(bh);
-				unlock_buffer(bh);
-				mark_buffer_dirty(bh);
+				mark_mblock_dirty(fsb);
+				unlock_block(fsb);
 			}
 			page_dirty--;
 			count++;
 		} else {
 			type = IOMAP_NEW;
-			if (buffer_mapped(bh) && all_bh && startio) {
-				lock_buffer(bh);
-				xfs_add_to_ioend(inode, bh, offset,
+			if (fsb->flags & BL_mapped && all_bh && startio) {
+				lock_block(fsb);
+				xfs_add_to_ioend(inode, fsb, offset,
 						type, ioendp, done);
 				count++;
 				page_dirty--;
@@ -838,9 +877,9 @@ xfs_convert_page(
 				done = 1;
 			}
 		}
-	} while (offset += len, (bh = bh->b_this_page) != head);
+	} while (offset += len, 1);
 
-	if (uptodate && bh == head)
+	if (uptodate && 1) // fsb == head)
 		SetPageUptodate(page);
 
 	if (startio) {
@@ -930,7 +969,7 @@ xfs_page_state_convert(
 	int		startio,
 	int		unmapped) /* also implies page uptodate */
 {
-	struct buffer_head	*bh, *head;
+	struct fsblock		*fsb;
 	xfs_iomap_t		iomap;
 	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
 	loff_t			offset;
@@ -983,7 +1022,7 @@ xfs_page_state_convert(
 	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 	page_dirty = p_offset / len;
 
-	bh = head = page_buffers(page);
+	fsb = page_blocks(page);
 	offset = page_offset(page);
 	flags = BMAPI_READ;
 	type = IOMAP_NEW;
@@ -993,9 +1032,9 @@ xfs_page_state_convert(
 	do {
 		if (offset >= end_offset)
 			break;
-		if (!buffer_uptodate(bh))
+		if (!(fsb->flags & BL_uptodate))
 			uptodate = 0;
-		if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
+		if (!(PageUptodate(page) || fsb->flags & BL_uptodate) && !startio) {
 			/*
 			 * the iomap is actually still valid, but the ioend
 			 * isn't.  shouldn't happen too often.
@@ -1017,9 +1056,9 @@ xfs_page_state_convert(
 		 * Third case, an unmapped buffer was found, and we are
 		 * in a path where we need to write the whole page out.
 		 */
-		if (buffer_unwritten(bh) || buffer_delay(bh) ||
-		    ((buffer_uptodate(bh) || PageUptodate(page)) &&
-		     !buffer_mapped(bh) && (unmapped || startio))) {
+		if (fsb->flags & (BL_unwritten|BL_delay) ||
+		    ((fsb->flags & BL_uptodate || PageUptodate(page)) &&
+		     !(fsb->flags & BL_mapped) && (unmapped || startio))) {
 			int new_ioend = 0;
 
 			/*
@@ -1028,10 +1067,10 @@ xfs_page_state_convert(
 			if (flags == BMAPI_READ)
 				iomap_valid = 0;
 
-			if (buffer_unwritten(bh)) {
+			if (fsb->flags & BL_unwritten) {
 				type = IOMAP_UNWRITTEN;
 				flags = BMAPI_WRITE | BMAPI_IGNSTATE;
-			} else if (buffer_delay(bh)) {
+			} else if (fsb->flags & BL_delay) {
 				type = IOMAP_DELAY;
 				flags = BMAPI_ALLOCATE | trylock;
 			} else {
@@ -1051,7 +1090,7 @@ xfs_page_state_convert(
 				new_ioend = 1;
 				if (type == IOMAP_NEW) {
 					size = xfs_probe_cluster(inode,
-							page, bh, head, 0);
+							page, fsb, NULL, 0);
 				} else {
 					size = len;
 				}
@@ -1063,21 +1102,20 @@ xfs_page_state_convert(
 				iomap_valid = xfs_iomap_valid(&iomap, offset);
 			}
 			if (iomap_valid) {
-				xfs_map_at_offset(bh, offset,
+				xfs_map_at_offset(fsb, offset,
 						inode->i_blkbits, &iomap);
 				if (startio) {
-					xfs_add_to_ioend(inode, bh, offset,
+					xfs_add_to_ioend(inode, fsb, offset,
 							type, &ioend,
 							new_ioend);
 				} else {
-					set_buffer_dirty(bh);
-					unlock_buffer(bh);
-					mark_buffer_dirty(bh);
+					mark_mblock_dirty(fsb);
+					unlock_block(fsb);
 				}
 				page_dirty--;
 				count++;
 			}
-		} else if (buffer_uptodate(bh) && startio) {
+		} else if (fsb->flags & BL_uptodate && startio) {
 			/*
 			 * we got here because the buffer is already mapped.
 			 * That means it must already have extents allocated
@@ -1085,8 +1123,8 @@ xfs_page_state_convert(
 			 */
 			if (!iomap_valid || flags != BMAPI_READ) {
 				flags = BMAPI_READ;
-				size = xfs_probe_cluster(inode, page, bh,
-								head, 1);
+				size = xfs_probe_cluster(inode, page, fsb,
+								NULL, 1);
 				err = xfs_map_blocks(inode, offset, size,
 						&iomap, flags);
 				if (err)
@@ -1103,18 +1141,18 @@ xfs_page_state_convert(
 			 * that we are writing into for the first time.
 			 */
 			type = IOMAP_NEW;
-			if (trylock_buffer(bh)) {
-				ASSERT(buffer_mapped(bh));
+			if (trylock_block(fsb)) {
+				ASSERT(fsb->flags & BL_mapped);
 				if (iomap_valid)
 					all_bh = 1;
-				xfs_add_to_ioend(inode, bh, offset, type,
+				xfs_add_to_ioend(inode, fsb, offset, type,
 						&ioend, !iomap_valid);
 				page_dirty--;
 				count++;
 			} else {
 				iomap_valid = 0;
 			}
-		} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
+		} else if ((fsb->flags & BL_uptodate || PageUptodate(page)) &&
 			   (unmapped || startio)) {
 			iomap_valid = 0;
 		}
@@ -1122,9 +1160,9 @@ xfs_page_state_convert(
 		if (!iohead)
 			iohead = ioend;
 
-	} while (offset += len, ((bh = bh->b_this_page) != head));
+	} while (offset += len, 1);
 
-	if (uptodate && bh == head)
+	if (uptodate && 1) //bh == head)
 		SetPageUptodate(page);
 
 	if (startio)
@@ -1154,7 +1192,7 @@ error:
 	 */
 	if (err != -EAGAIN) {
 		if (!unmapped)
-			block_invalidatepage(page, 0);
+			fsblock_invalidate_page(page, 0);
 		ClearPageUptodate(page);
 	}
 	return err;
@@ -1200,7 +1238,7 @@ xfs_vm_writepage(
 	 *  4. There are unwritten buffers on the page
 	 */
 
-	if (!page_has_buffers(page)) {
+	if (!PageBlocks(page)) {
 		unmapped = 1;
 		need_trans = 1;
 	} else {
@@ -1223,8 +1261,8 @@ xfs_vm_writepage(
 	 * Delay hooking up buffer heads until we have
 	 * made our go/no-go decision.
 	 */
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+	if (!PageBlocks(page))
+		create_unmapped_blocks(page, GFP_NOIO, 1 << inode->i_blkbits, 0);
 
 	/*
 	 * Convert delayed allocate, unwritten or unmapped space
@@ -1289,7 +1327,7 @@ xfs_vm_releasepage(
 
 	xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0);
 
-	if (!page_has_buffers(page))
+	if (!PageBlocks(page))
 		return 0;
 
 	xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
@@ -1317,14 +1355,14 @@ xfs_vm_releasepage(
 	return 0;
 
 free_buffers:
-	return try_to_free_buffers(page);
+	return try_to_free_blocks(page);
 }
 
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
 	sector_t		iblock,
-	struct buffer_head	*bh_result,
+	struct fsblock		*fsb_result,
 	int			create,
 	int			direct,
 	bmapi_flags_t		flags)
@@ -1336,8 +1374,8 @@ __xfs_get_blocks(
 	int			error;
 
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
-	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
-	size = bh_result->b_size;
+	ASSERT(fsblock_size(fsb_result) >= (1 << inode->i_blkbits));
+	size = fsblock_size(fsb_result);
 	error = xfs_iomap(XFS_I(inode), offset, size,
 			     create ? flags : BMAPI_READ, &iomap, &niomap);
 	if (error)
@@ -1345,19 +1383,20 @@ __xfs_get_blocks(
 	if (niomap == 0)
 		return 0;
 
+	spin_lock_block_irq(fsb_result);
 	if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
 		/*
 		 * For unwritten extents do not report a disk address on
 		 * the read case (treat as if we're reading into a hole).
 		 */
 		if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
-			xfs_map_buffer(bh_result, &iomap, offset,
+			xfs_map_buffer(fsb_result, &iomap, offset,
 				       inode->i_blkbits);
 		}
 		if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
 			if (direct)
-				bh_result->b_private = inode;
-			set_buffer_unwritten(bh_result);
+				fsb_result->private = inode;
+			fsb_result->flags |= BL_unwritten;
 		}
 	}
 
@@ -1365,7 +1404,7 @@ __xfs_get_blocks(
 	 * If this is a realtime file, data may be on a different device.
 	 * to that pointed to from the buffer_head b_bdev currently.
 	 */
-	bh_result->b_bdev = iomap.iomap_target->bt_bdev;
+//XXX	bh_result->b_bdev = iomap.iomap_target->bt_bdev;
 
 	/*
 	 * If we previously allocated a block out beyond eof and we are now
@@ -1377,50 +1416,50 @@ __xfs_get_blocks(
 	 * correctly zeroed.
 	 */
 	if (create &&
-	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+	    ((!(fsb_result->flags & (BL_mapped|BL_uptodate))) ||
 	     (offset >= i_size_read(inode)) ||
 	     (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN))))
-		set_buffer_new(bh_result);
+		fsb_result->flags |= BL_new;
 
 	if (iomap.iomap_flags & IOMAP_DELAY) {
 		BUG_ON(direct);
-		if (create) {
-			set_buffer_uptodate(bh_result);
-			set_buffer_mapped(bh_result);
-			set_buffer_delay(bh_result);
-		}
+		if (create)
+			fsb_result->flags |= BL_uptodate|BL_mapped|BL_delay;
 	}
 
 	if (direct || size > (1 << inode->i_blkbits)) {
 		ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
 		offset = min_t(xfs_off_t,
 				iomap.iomap_bsize - iomap.iomap_delta, size);
-		bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset);
+//XXX: could change fsb size bits		fsb_result->size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset);
 	}
+	spin_unlock_block_irq(fsb_result);
 
 	return 0;
 }
 
 int
 xfs_get_blocks(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	int			create)
+	struct address_space	*mapping,
+	struct fsblock		*fsb_result,
+	loff_t			pos,
+	int			mode)
 {
-	return __xfs_get_blocks(inode, iblock,
-				bh_result, create, 0, BMAPI_WRITE);
+	sector_t iblock;
+	iblock = pos >> fsblock_bits(fsb_result);
+	return __xfs_get_blocks(mapping->host, iblock,
+				fsb_result, mode, 0, BMAPI_WRITE);
 }
 
 STATIC int
 xfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
-	struct buffer_head	*bh_result,
+	struct fsblock		*fsb_result,
 	int			create)
 {
 	return __xfs_get_blocks(inode, iblock,
-				bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
+				fsb_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT);
 }
 
 STATIC void
@@ -1521,7 +1560,7 @@ xfs_vm_write_begin(
 	void			**fsdata)
 {
 	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+	return fsblock_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 								xfs_get_blocks);
 }
 
@@ -1537,7 +1576,7 @@ xfs_vm_bmap(
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return generic_block_bmap(mapping, block, xfs_get_blocks);
+	return fsblock_bmap(mapping, block, xfs_get_blocks);
 }
 
 STATIC int
@@ -1545,17 +1584,7 @@ xfs_vm_readpage(
 	struct file		*unused,
 	struct page		*page)
 {
-	return mpage_readpage(page, xfs_get_blocks);
-}
-
-STATIC int
-xfs_vm_readpages(
-	struct file		*unused,
-	struct address_space	*mapping,
-	struct list_head	*pages,
-	unsigned		nr_pages)
-{
-	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
+	return fsblock_read_page(page, xfs_get_blocks);
 }
 
 STATIC void
@@ -1565,20 +1594,18 @@ xfs_vm_invalidatepage(
 {
 	xfs_page_trace(XFS_INVALIDPAGE_ENTER,
 			page->mapping->host, page, offset);
-	block_invalidatepage(page, offset);
+	fsblock_invalidate_page(page, offset);
 }
 
 const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
-	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
-	.sync_page		= block_sync_page,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
-	.write_end		= generic_write_end,
+	.write_end		= fsblock_write_end,
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= xfs_vm_direct_IO,
-	.migratepage		= buffer_migrate_page,
+	.set_page_dirty		= fsblock_set_page_dirty,
 };
Index: linux-2.6/fs/xfs/linux-2.6/xfs_buf.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_buf.h
+++ linux-2.6/fs/xfs/linux-2.6/xfs_buf.h
@@ -24,7 +24,7 @@
 #include <asm/system.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
-#include <linux/buffer_head.h>
+#include <linux/fsblock.h>
 #include <linux/uio.h>
 
 /*
Index: linux-2.6/fs/fsblock.c
===================================================================
--- linux-2.6.orig/fs/fsblock.c
+++ linux-2.6/fs/fsblock.c
@@ -76,38 +76,6 @@ void __init fsblock_init(void)
 #endif
 }
 
-#ifdef BDFLUSH_FLUSHING
-static void clear_block_dirty(struct fsblock *block);
-
-static int test_and_set_block_dirty(struct fsblock *block);
-
-static void set_block_dirty(struct fsblock *block)
-{
-	test_and_set_block_dirty(block);
-}
-#else
-static inline void clear_block_dirty(struct fsblock *block)
-{
-	FSB_BUG_ON(!spin_is_locked_block(block));
-	block->flags &= ~BL_dirty;
-}
-
-static inline int test_and_set_block_dirty(struct fsblock *block)
-{
-	FSB_BUG_ON(!spin_is_locked_block(block));
-	if (block->flags & BL_dirty)
-		return 1;
-	block->flags |= BL_dirty;
-	return 0;
-}
-
-static inline void set_block_dirty(struct fsblock *block)
-{
-	FSB_BUG_ON(!spin_is_locked_block(block));
-	block->flags |= BL_dirty;
-}
-#endif
-
 static void init_block(struct page *page, struct fsblock *block, unsigned int bits)
 {
 	block->flags = 0;
@@ -257,7 +225,7 @@ static void free_block_check(struct fsbl
 		printk("block count = %u\n", count);
 		printk("block private = %p\n", private);
 		printk("vmap count  = %u\n", vmap_count);
-		BUG();
+		FSB_BUG();
 	}
 }
 #endif
@@ -407,7 +375,7 @@ int trylock_block(struct fsblock *block)
 	unsigned long flags;
 	int ret;
 
-	FSB_BUG_ON(block->count == 0);
+	FSB_BUG_ON(!some_refcounted(block));
 	/* XXX: audit for possible irq uses */
 	spin_lock_block_irqsave(block, flags);
 	ret = !(block->flags & BL_locked);
@@ -431,7 +399,7 @@ void unlock_block(struct fsblock *block)
 {
 	unsigned long flags;
 
-	FSB_BUG_ON(block->count == 0);
+	FSB_BUG_ON(!some_refcounted(block));
 	spin_lock_block_irqsave(block, flags);
 	FSB_BUG_ON(!(block->flags & BL_locked));
 	block->flags &= ~BL_locked;
@@ -1048,7 +1016,7 @@ static int invalidate_aliasing_blocks(st
 }
 
 #define CREATE_METADATA	0x01
-static int create_unmapped_blocks(struct page *page, gfp_t gfp_flags, unsigned int size, unsigned int flags)
+int create_unmapped_blocks(struct page *page, gfp_t gfp_flags, unsigned int size, unsigned int flags)
 {
 	unsigned int bits = ffs(size) - 1;
 	struct fsblock *block;
@@ -1119,6 +1087,7 @@ static int create_unmapped_blocks(struct
 
 	return 0;
 }
+EXPORT_SYMBOL(create_unmapped_blocks);
 
 static int create_unmapped_blocks_oneref(struct page *page, gfp_t gfp_flags, unsigned int size, unsigned int flags)
 {
@@ -1445,6 +1414,15 @@ static void block_end_write(struct fsblo
 	local_irq_restore(flags);
 }
 
+void fsblock_end_io(struct fsblock *block, int uptodate)
+{
+	if (block->flags & BL_readin)
+		block_end_read(block, uptodate);
+	else
+		block_end_write(block, uptodate);
+}
+EXPORT_SYMBOL(fsblock_end_io);
+
 static void block_end_bio_io(struct bio *bio, int err)
 {
 	struct fsblock *block = bio->bi_private;
@@ -1459,10 +1437,7 @@ static void block_end_bio_io(struct bio 
 	FSB_BUG_ON((block->flags & (BL_readin|BL_writeback)) == (BL_readin|BL_writeback));
 	FSB_BUG_ON((block->flags & (BL_readin|BL_writeback)) == 0);
 
-	if (block->flags & BL_readin)
-		block_end_read(block, uptodate);
-	else
-		block_end_write(block, uptodate);
+	fsblock_end_io(block, uptodate);
 
 	bio_put(bio);
 }
@@ -2059,7 +2034,8 @@ sector_t fsblock_bmap(struct address_spa
 			loff_t off;
 			spin_unlock_block_irq(block);
 			off = sector_offset(blocknr, inode->i_blkbits);
-			ret = map_block(mapping, block, off, 0); /* create? */
+			/* create block? */
+			ret = map_block(mapping, block, off, MAP_BLOCK_READ);
 			spin_lock_block_irq(block);
 			if (ret)
 				goto out_unlock;
@@ -2167,7 +2143,7 @@ int fsblock_read_page(struct page *page,
 		for_each_block(block, b) {
 			if (!(b->flags & (BL_mapped|BL_hole))) {
 				spin_unlock_block_irq(block);
-				ret = map_block(mapping, b, off, 0);
+				ret = map_block(mapping, b, off, MAP_BLOCK_READ);
 				spin_lock_block_irq(block);
 				/* XXX: SetPageError on failure? */
 				if (ret)
@@ -2206,7 +2182,7 @@ int fsblock_read_page(struct page *page,
 	} else if (fsblock_midpage(block)) {
 		if (!(block->flags & (BL_mapped|BL_hole))) {
 			spin_unlock_block_irq(block);
-			ret = map_block(mapping, block, off, 0);
+			ret = map_block(mapping, block, off, MAP_BLOCK_READ);
 			/* XXX: SetPageError on failure? */
 			if (ret)
 				goto out_drop;
@@ -2233,7 +2209,7 @@ int fsblock_read_page(struct page *page,
 		FSB_BUG_ON(block->flags & BL_dirty);
 
 		if (!(block->flags & (BL_mapped|BL_hole))) {
-			ret = map_block(mapping, block, off, 0);
+			ret = map_block(mapping, block, off, MAP_BLOCK_READ);
 			if (ret)
 				goto out_unlock;
 		}
@@ -2340,6 +2316,12 @@ int fsblock_write_page(struct page *page
 		struct fsblock *b;
 
 		for_each_block(block, b) {
+
+			if ((b->flags & (BL_delay|BL_dirty)) == (BL_delay|BL_dirty)) {
+				spin_unlock_block_irq(b);
+				ret = map_block(mapping, b, off, MAP_BLOCK_ALLOCATE);
+				spin_lock_block_irq(b);
+			}
 			nr += block_write_helper(page, b);
 
 			off += size;
@@ -2376,6 +2358,11 @@ int fsblock_write_page(struct page *page
 		unlock_page(page);
 
 	} else if (fsblock_midpage(block)) {
+		if ((block->flags & (BL_delay|BL_dirty)) == (BL_delay|BL_dirty)) {
+			spin_unlock_block_irq(block);
+			ret = map_block(mapping, block, off, MAP_BLOCK_ALLOCATE);
+			spin_lock_block_irq(block);
+		}
 		if (block_write_helper(page, block)) {
 			FSB_BUG_ON(PageWriteback(page));
 			set_page_writeback(page);
@@ -2511,7 +2498,7 @@ static int block_dirty_helper(struct pag
 		 */
 		__set_page_dirty_noblocks_nowarn(page);
 		return 0;
-	} else if (block->flags & BL_uptodate) {
+	} else if (block->flags & (BL_uptodate|BL_delay|BL_unwritten)) {
 		return 0;
 	} else {
 		if (from <= offset && to >= offset+size)
@@ -2542,7 +2529,7 @@ static int fsblock_write_begin_super(str
 		return ret;
 
 	if (!(block->flags & BL_mapped)) {
-		ret = map_block(mapping, block, pos & ~(size-1), 1);
+		ret = map_block(mapping, block, pos & ~(size-1), MAP_BLOCK_RESERVE);
 		if (ret)
 			goto out_unlock;
 	}
@@ -2637,7 +2624,7 @@ int fsblock_write_begin(struct file *fil
 		for_each_block(block, b) {
 			if (off < to && off + size > from) {
 				if (!(b->flags & BL_mapped)) {
-					ret = map_block(mapping, b, pos+off, 1);
+					ret = map_block(mapping, b, pos+off, MAP_BLOCK_RESERVE);
 					if (ret)
 						goto out_zero_new;
 				}
@@ -2684,7 +2671,7 @@ int fsblock_write_begin(struct file *fil
 		 */
 		if (!(block->flags & BL_mapped)) {
 			spin_unlock_block_irq(block);
-			ret = map_block(mapping, block, pos, 1);
+			ret = map_block(mapping, block, pos, MAP_BLOCK_RESERVE);
 			if (ret)
 				goto out_zero_new;
 			spin_lock_block_irq(block);
@@ -3111,7 +3098,12 @@ static void invalidate_block(struct fsbl
 	FSB_BUG_ON(block->flags & BL_locked);
 	FSB_BUG_ON(!block->page->mapping);
 
-//	lock_block(block); XXX: why lock?
+#if 0
+	__block_get(block);
+	spin_unlock_block_irq(block);
+	lock_block(block); /* XXX: why lock? For XFS */
+	spin_lock_block_irq(block);
+#endif
 	/*
 	 * XXX
 	 * FSB_BUG_ON(block->flags & BL_new);
@@ -3121,9 +3113,14 @@ static void invalidate_block(struct fsbl
 	clear_block_dirty(block);
 	block->flags &= ~BL_new;
 	/* Don't clear uptodate because if the block essentially turns into a hole and remains uptodate */
-	block->flags &= ~(BL_mapped|BL_hole);
+	block->flags &= ~(BL_mapped|BL_hole|BL_delay|BL_unwritten);
 	block->block_nr = -1;
-//	unlock_block(block);
+#if 0
+	spin_unlock_block_irq(block);
+	unlock_block(block);
+	spin_lock_block_irq(block);
+	block->count--;
+#endif
 	/* XXX: if metadata, then have an fs-private release? */
 }
 
@@ -3549,10 +3546,11 @@ static void fbd_del_dirty_block(struct f
 	spin_unlock(&fbd->lock);
 }
 
-static void clear_block_dirty(struct fsblock *block)
+void clear_block_dirty(struct fsblock *block)
 {
 	struct fsblock_bd *fbd;
 
+	FSB_BUG_ON(!spin_is_locked_block(block));
 	if (!(block->flags & BL_dirty))
 		return;
 
@@ -3563,10 +3561,11 @@ static void clear_block_dirty(struct fsb
 		fbd_del_dirty_block(fbd, block);
 }
 
-static int test_and_set_block_dirty(struct fsblock *block)
+int test_and_set_block_dirty(struct fsblock *block)
 {
 	struct fsblock_bd *fbd;
 
+	FSB_BUG_ON(!spin_is_locked_block(block));
 	if (block->flags & BL_dirty)
 		return 1;
 
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.h
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.h
@@ -33,14 +33,18 @@ typedef struct xfs_ioend {
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
 	struct inode		*io_inode;	/* file being written to */
-	struct buffer_head	*io_buffer_head;/* buffer linked list head */
-	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
+	struct fsblock		*io_fsb_head;	/* fsb linked list head */
+	struct fsblock		*io_fsb_tail;	/* fsb linked list tail */
 	size_t			io_size;	/* size of the extent */
 	xfs_off_t		io_offset;	/* offset in the file */
 	struct work_struct	io_work;	/* xfsdatad work queue */
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+extern int xfs_get_blocks(
+	struct address_space	*mapping,
+	struct fsblock		*fsb_result,
+	loff_t			pos,
+	int			create);
 
 #endif /* __XFS_AOPS_H__ */
Index: linux-2.6/include/linux/fsblock.h
===================================================================
--- linux-2.6.orig/include/linux/fsblock.h
+++ linux-2.6/include/linux/fsblock.h
@@ -15,6 +15,10 @@
 
 #include <linux/kallsyms.h>
 #define MIN_SECTOR_SHIFT	9 /* 512 bytes */
+#define MIN_SECTOR_SIZE		(1UL<<MIN_SECTOR_SHIFT)
+#ifndef MAX_BUF_PER_PAGE
+#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / MIN_SECTOR_SIZE)
+#endif
 
 #define BL_bits_mask	0x000f
 
@@ -39,6 +43,8 @@
 #endif
 
 #define BL_dirty_acct	0x10000
+#define BL_unwritten	0x20000
+#define BL_delay	0x40000
 
 #ifndef FSB_DEBUG
 static inline void assert_block(struct fsblock *block)
@@ -48,6 +54,11 @@ static inline void assert_block(struct f
 void assert_block(struct fsblock *block);
 #endif
 
+
+#define MAP_BLOCK_READ		0
+#define MAP_BLOCK_RESERVE	1
+#define MAP_BLOCK_ALLOCATE	2
+
 /*
  * XXX: should distinguish data buffer and metadata buffer. data buffer
  * attachment (or dirtyment?) could cause the page to *also* be added to
@@ -236,9 +247,10 @@ static inline void clear_page_blocks(str
 static inline void map_fsblock(struct fsblock *block, sector_t blocknr)
 {
 //	FSB_BUG_ON(!spin_is_locked_block(block));
-	FSB_BUG_ON(block->flags & BL_mapped);
+//XXX	FSB_BUG_ON(block->flags & BL_mapped); XFS
 	block->block_nr = blocknr;
 	block->flags |= BL_mapped;
+	block->flags &= ~(BL_delay|BL_unwritten);
 #ifdef FSB_DEBUG
 	/* XXX: test for inside bdev? */
 	if (block->flags & BL_metadata) {
@@ -378,6 +390,7 @@ static inline struct fsblock_meta *sb_mb
 
 void mbforget(struct fsblock_meta *mblock);
 
+int create_unmapped_blocks(struct page *page, gfp_t gfp_flags, unsigned int size, unsigned int flags);
 void mark_mblock_uptodate(struct fsblock_meta *mblock);
 int mark_mblock_dirty(struct fsblock_meta *mblock);
 int mark_mblock_dirty_inode(struct fsblock_meta *mblock, struct inode *inode);
@@ -522,6 +535,38 @@ int trylock_block(struct fsblock *block)
 void lock_block(struct fsblock *block);
 void unlock_block(struct fsblock *block);
 
+#ifdef BDFLUSH_FLUSHING
+void clear_block_dirty(struct fsblock *block);
+
+int test_and_set_block_dirty(struct fsblock *block);
+
+static inline void set_block_dirty(struct fsblock *block)
+{
+	test_and_set_block_dirty(block);
+}
+#else
+static inline void clear_block_dirty(struct fsblock *block)
+{
+	FSB_BUG_ON(!spin_is_locked_block(block));
+	block->flags &= ~BL_dirty;
+}
+
+static inline int test_and_set_block_dirty(struct fsblock *block)
+{
+	FSB_BUG_ON(!spin_is_locked_block(block));
+	if (block->flags & BL_dirty)
+		return 1;
+	block->flags |= BL_dirty;
+	return 0;
+}
+
+static inline void set_block_dirty(struct fsblock *block)
+{
+	FSB_BUG_ON(!spin_is_locked_block(block));
+	block->flags |= BL_dirty;
+}
+#endif
+
 sector_t fsblock_bmap(struct address_space *mapping, sector_t block, map_block_fn *insert_mapping);
 
 int fsblock_read_page(struct page *page, map_block_fn *insert_mapping);
@@ -561,5 +606,6 @@ static inline void writeback_blockdevs_b
 #endif
 
 void fsblock_init(void);
+void fsblock_end_io(struct fsblock *block, int uptodate);
 
 #endif
Index: linux-2.6/fs/xfs/linux-2.6/xfs_buf.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_buf.c
+++ linux-2.6/fs/xfs/linux-2.6/xfs_buf.c
@@ -1501,7 +1501,6 @@ xfs_mapping_buftarg(
 	struct inode		*inode;
 	struct address_space	*mapping;
 	static const struct address_space_operations mapping_aops = {
-		.sync_page = block_sync_page,
 		.migratepage = fail_migrate_page,
 	};
 
Index: linux-2.6/fs/xfs/linux-2.6/xfs_file.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_file.c
+++ linux-2.6/fs/xfs/linux-2.6/xfs_file.c
@@ -429,7 +429,7 @@ xfs_vm_page_mkwrite(
 	struct vm_area_struct	*vma,
 	struct page		*page)
 {
-	return block_page_mkwrite(vma, page, xfs_get_blocks);
+	return fsblock_page_mkwrite(vma, page, xfs_get_blocks);
 }
 
 const struct file_operations xfs_file_operations = {
Index: linux-2.6/fs/xfs/linux-2.6/xfs_iops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_iops.c
+++ linux-2.6/fs/xfs/linux-2.6/xfs_iops.c
@@ -614,8 +614,7 @@ xfs_vn_truncate(
 	struct inode	*inode)
 {
 	int	error;
-	error = block_truncate_page(inode->i_mapping, inode->i_size,
-							xfs_get_blocks);
+	error = fsblock_truncate_page(inode->i_mapping, inode->i_size);
 	WARN_ON(error);
 }
 
Index: linux-2.6/fs/ext2/inode.c
===================================================================
--- linux-2.6.orig/fs/ext2/inode.c
+++ linux-2.6/fs/ext2/inode.c
@@ -591,6 +591,8 @@ static int ext2_get_blocks(struct inode 
 	int count = 0;
 	ext2_fsblk_t first_block = 0;
 
+	FSB_BUG_ON(create == MAP_BLOCK_ALLOCATE);
+
 	*flags = 0;
 
 	depth = ext2_block_to_path(inode, blocknr, offsets,&blocks_to_boundary);
@@ -724,16 +726,17 @@ static int ext2_map_extent(struct addres
 }
 
 static int ext2_map_block(struct address_space *mapping,
-			struct fsblock *block, loff_t pos, int create)
+			struct fsblock *block, loff_t pos, int mode)
 {
 	FSB_BUG_ON(block->flags & BL_mapped);
+	FSB_BUG_ON(mode == MAP_BLOCK_ALLOCATE);
 
-	return fsb_ext_map_fsblock(mapping, pos, block, create, &EXT2_I(mapping->host)->fsb_ext_root, ext2_map_extent);
+	return fsb_ext_map_fsblock(mapping, pos, block, mode, &EXT2_I(mapping->host)->fsb_ext_root, ext2_map_extent);
 }
 #else
 
 static int ext2_map_block(struct address_space *mapping,
-			struct fsblock *b, loff_t pos, int create)
+			struct fsblock *b, loff_t pos, int mode)
 {
 	struct inode *inode = mapping->host;
 	sector_t blocknr;
@@ -743,10 +746,11 @@ static int ext2_map_block(struct address
 	int ret;
 
 	FSB_BUG_ON(b->flags & BL_mapped);
+	FSB_BUG_ON(mode == MAP_BLOCK_ALLOCATE);
 
 	blocknr = pos >> inode->i_blkbits;
 
-	ret = ext2_get_blocks(inode, blocknr, 1, create, &offset, &block, &size, &flags);
+	ret = ext2_get_blocks(inode, blocknr, 1, mode, &offset, &block, &size, &flags);
 	if (ret > 0) {
 		ret = 0;
 	}
Index: linux-2.6/fs/fsb_extentmap.c
===================================================================
--- linux-2.6.orig/fs/fsb_extentmap.c
+++ linux-2.6/fs/fsb_extentmap.c
@@ -99,7 +99,7 @@ static int fsb_ext_merge_before(struct f
 }
 
 int fsb_ext_map_fsblock(struct address_space *mapping, loff_t off,
-			struct fsblock *fsblock, int create,
+			struct fsblock *fsblock, int mode,
 			struct fsb_ext_root *root, map_fsb_extent_fn mapfn)
 {
 	struct inode *inode = mapping->host;
@@ -113,7 +113,7 @@ int fsb_ext_map_fsblock(struct address_s
 	if (!ext)
 		goto get_new;
 
-	if (ext->flags & FE_mapped || (ext->flags & FE_hole && !create)) {
+	if (ext->flags & FE_mapped || (ext->flags & FE_hole && mode == MAP_BLOCK_READ)) {
 		spin_lock_block_irq(fsblock);
 		if (ext->flags & FE_mapped) {
 			blocknr = ext->block + (offset - ext->offset);
@@ -147,7 +147,7 @@ get_new:
 			return -ENOMEM;
 		}
 
-		ret = mapfn(mapping, off, create, &new->offset, &new->block,
+		ret = mapfn(mapping, off, mode, &new->offset, &new->block,
 						&new->size, &new->flags);
 		if (ret) {
 			kmem_cache_free(extent_cache, split);
@@ -268,7 +268,7 @@ try_next:
 				fsblock->flags |= BL_new;
 		} else {
 			FSB_BUG_ON(!(new->flags & FE_hole));
-			FSB_BUG_ON(create);
+			FSB_BUG_ON(mode != MAP_BLOCK_READ);
 			fsblock->flags |= BL_hole;
 		}
 		spin_unlock_block_irq(fsblock);
Index: linux-2.6/include/linux/fsblock_types.h
===================================================================
--- linux-2.6.orig/include/linux/fsblock_types.h
+++ linux-2.6/include/linux/fsblock_types.h
@@ -36,7 +36,7 @@
  */
 //#define VMAP_CACHE	1
 
-#define BDFLUSH_FLUSHING 1
+//#define BDFLUSH_FLUSHING 1
 
 struct address_space;
 


* [patch] fsblock preview
       [not found] ` <20080914221500.GH27080@wotan.suse.de>
@ 2008-09-15  8:30   ` Nick Piggin
  2008-09-16 11:35     ` Neil Brown
  0 siblings, 1 reply; 6+ messages in thread
From: Nick Piggin @ 2008-09-15  8:30 UTC (permalink / raw)
  To: linux-fsdevel

OK, vger doesn't seem to like my patch, so I'll have to give a url to it,
sorry.

http://www.kernel.org/pub/linux/kernel/people/npiggin/patches/fsblock/2.6.27-rc5/fsb-preview.patch

I've been doing some work on fsblock again lately, so in case anybody might
find it interesting, here is a "preview" patch. Basically it compiles and
runs OK for me here, under a few stress tests. I wouldn't say it is close to
bug free, and it needs a lot of bits and pieces to polish up like error
handling.

I've also just stripped out the large block size support in the patch I'm
mailing out... I have been developing with ext2 without large lock support
sizes so those paths have rotted a bit and besides they still really need
a bit more changes to some VM paths.

Since I last posted fsblock, there have been some big changes:

- Using a per block spinlock to protect most access now. This eliminates
  some races I had against dirtying vs cleaning, and with fsblock
  refcounting and reclaim.
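
  For example, the pattern used throughout the patch is (a minimal sketch;
  the dirty helpers now assert spin_is_locked_block()):

	spin_lock_block_irq(block);
	if (!test_and_set_block_dirty(block)) {
		/* first dirtier; all state bits change under the lock */
	}
	spin_unlock_block_irq(block);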

- fsblock_no_cache aka "nobh" mode now works well due to the above. When
  /proc/sys/vm/fsblock_no_cache is 1, you never get fsblocks hanging around
  longer than they have to. You are also never subject to the circular
  "orphan" page references that buffer heads are prone to.

- RCU is gone. This is actually a good thing because in "nobh" mode, some
  workloads will rapidly allocate and free the structures, and that can
  be costly with RCU.

- struct fsblock has shrunk to 32 bytes on 64-bit. Less than 1/3 the size
  of struct buffer_head. Although absolute size doesn't matter so much now
  (because of no_cache mode). I even have an optional feature "bdflush"
  that increases the size... although I do want to keep it within 64 bytes
  (one cacheline).
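
  For reference, the fields visible in the patch (flags, count, block_nr,
  page, private) do add up to 32 bytes on 64-bit; a sketch of such a layout
  (the real definition is in fsblock_types.h and may differ):

	struct fsblock {
		unsigned int	flags;		/* BL_* bits; low bits = size bits */
		unsigned int	count;		/* refcount */
		sector_t	block_nr;
		struct page	*page;
		void		*private;	/* fs / ioend chaining */
	};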

- added an "intermediate" mode which provides a ->data pointer in struct
  fsblock_meta, and means it is trivial to transition filesystems to
  fsblock (although they would not be able to support superpage blocks).
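
  Going by the mblock helpers, presumably something of this shape (a guess,
  not the actual definition):

	struct fsblock_meta {
		struct fsblock	block;	/* must be first, so meta and plain
					 * blocks can be handled uniformly */
		char		*data;	/* block contents, a la bh->b_data */
	};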

- Added ext2 intermediate support.

- Had to modify the VM a little bit in order to close races with freeing a
  page's fsblock before it can be cleaned (or still has a chance to be
  dirtied via mmap). fsblock of course ensures that zero memory allocations
  are required in the writeout path.

- Lockless pagecache has been merged in mainline, which means the largest
  granularity of synchronisation anywhere in the fsblock core code is on a
  per-page basis (buffer heads use the per-inode private_lock). This is one
  of the reasons I am skeptical that keeping pagecache state in extents is
  better: it would be rather impressive if it could match the straight-line
  speed or scalability of fsblock.

- However, I *have* always agreed that it makes sense to keep (some) block
  state in extents, because that is going to change much less frequently, and
  should be represented with fewer extents provided the filesystem layout is
  reasonable. So I've written a (very) basic extent cache for block mappings,
  which can be used by filesystems that don't have good in-memory block
  mapping structures themselves (like ext2, for example). No reclaim for this
  at present, I should just add a simple shrinker.
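
  A filesystem opts in with a single hook, as the ext2 conversion in the
  patch does (the myfs_* names here are illustrative):

	static int myfs_map_block(struct address_space *mapping,
			struct fsblock *block, loff_t pos, int mode)
	{
		/* consult/fill the extent cache; myfs_map_extent does the
		 * real lookup on a cache miss */
		return fsb_ext_map_fsblock(mapping, pos, block, mode,
				&MYFS_I(mapping->host)->fsb_ext_root,
				myfs_map_extent);
	}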

- bdflush... it's commented out so it won't build by default, but basically
  because fsblock properly keeps block dirty state in sync with page dirty
  state, I can keep sorted structure of dirty fsblocks per device, and do
  writeout based on that rather than this fragile walking over inodes that
  pdflush does. Of course it won't work with delayed allocation, so something
  would have to be figured out with that (perhaps allocate all outstanding
  blocks before each writeout pass).

  The thing I like about bdflush is that it can easily do nice submit
  ordering of inter-file as well as file/metadata blocks for writeout. I
  don't know if it will come to anything, but at least it is not tightly
  coupled with the core fsblock stuff. It's a bit hacky at the moment ;)

- Still not using a private bdev for fsblock filesystems... I never got around
  to figuring out how to do this. This means that sometimes funny things will
  happen with the block_dev device if pages and buffers try to use it. It mostly
  works OK but is a hack that I need to fix.

- Finally, for those who weren't listening last time: I'm doing block sizes larger
  than page size (up to 16MB IIRC, but easily expandable to much higher) with
  fsblock using exactly the same data structures. Although I haven't included
  that in the patch here.



* Re: [patch] fsblock preview
  2008-09-15  8:30   ` Nick Piggin
@ 2008-09-16 11:35     ` Neil Brown
  2008-09-23  4:39       ` Nick Piggin
  0 siblings, 1 reply; 6+ messages in thread
From: Neil Brown @ 2008-09-16 11:35 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel

On Monday September 15, npiggin@suse.de wrote:
> OK, vger doesn't seem to like my patch, so I'll have to give a url to it,
> sorry.

Thanks (a URL is lots better than nothing)!

Some comments.

1/ I think you need to include a design document as a patch to
   Documentation/something.  This would help me not to ask the same
   question twice, such as:

2/ Why do you allow pages in the same file to have either blocks or
   buffers?   Surely it should be set-in-concrete on a per-address-space
   (and probably per-filesystem) basis whether buffer_heads or blocks
   or something else is used.

   I know I've asked this before so I found the answer in my mail
   archives.
   Some filesystems store their metadata in the same address-space
   that block_device.c uses for reading/writing the device.  For this
   to work they must both use the same buffering strategy.

   My feeling is that the complexity that you add to switch between
   buffers and block on a per-page basis is too great a cost, and you
   should avoid this.
   My preferred approach would be to require the filesystem to use a
   separate address-space to store the metadata (even if it is still
   physically indexed).  However I suspect that would cause breakage
   for things like tune2fs as the caches would no longer be coherent.
   (Would that actually be a problem for anything other than the
    filesystem superblock?  In which case, just continue to use
   buffer_heads for the superblock and fsblock for the rest).

   The alternative would be for block_device.c to either use buffers
   or blocks depending on how the filesystem is using it.
   So on first open, default to using buffers.  When the filesystem
   bd_claims the device it can switch it to using blocks.  It already
   must potentially flush the whole cache if it is changing block
   size.  This is just a small extra step.  You would probably just
   replace the address_space_operations structure in something similar
   to set_blocksize, between sync_blockdev and kill_bdev.
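
   i.e. hypothetically (def_fsblock_aops invented for the sake of argument):

	sync_blockdev(bdev);	/* flush what was written via buffer_heads */
	bdev->bd_inode->i_mapping->a_ops = &def_fsblock_aops;
	kill_bdev(bdev);	/* drop pages cached under the old scheme */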

   This might end up being more code, but it would remove some real
   ugliness from your current patch.

   At the very least, I think that the difference between using
   buffer_heads and fsblocks should be entirely encoded in the
   PageBlocks flags, and that PagePrivate should be left to mean
   exactly what it currently does:  That the ->private field is in
   use by something.

3/ Why do blocks hold a refcount on the page?  This seems wrong.
   The setting of PagePrivate (or PageBlock) is the equivalent of a
   refcount.  If it is set, then releasepage will be called to when
   the VM wants to drop the page, and invalidatepage will be called
   when it insists on dropping the page.  No need for a refcount.

   I found a hint in the patch which suggests that this is needed for 
   superpage support.  So when you truncate a file at the middle of a
   superpage, fsblock can still hold on to the pages that
   truncate_inode_pages wants to get rid of.  However as you didn't
   include that I cannot yet comment on whether that seems
   justifiable.

   As I think more on this, I'm wondering if I've missed something
   (entirely possible).  If each block does indeed hold a refcount on
   the page, then freeing idle pages which happen to have blocks
   attached is not ever going to happen.  This would be ok if you
   dropped the fsblocks as soon as they weren't in use, but your
   commentary seems to suggest that is an option, not a certainty. 
   Confused.

4/ In truncate_inode_pages_range, you have:

-			if (page_index > end) {
-				next = page_index;
+			next = page_index+1;
+			if (next-1 > end)
 				break;
-			}
 
-			if (page_index > next)
-				next = page_index;
-			next++;

  This reminds me of very similar code in invalidate_mapping_pages
  which has the comment
			/*
			 * We really shouldn't be looking at the ->index of an
			 * unlocked page.  But we're not allowed to lock these
			 * pages.  So we rely upon nobody altering the ->index
			 * of this (pinned-by-us) page.
			 */
  (inserted by a patch attributed to me, but I think the text is from
   Andrew Morton).
  The theory is that until you get the page lock, the ->mapping and
  ->index of a page can change (I'm not sure how.  Maybe splice??).
  Note that truncate_complete_page (Which is called fairly shortly
  after the patch included above) starts with
	if (page->mapping != mapping)
		return;

  which seems to justify the need for caution.

  Now, back to the change that you made:
   The "before" code will only ever increase next based on the
   (possibly random) value retrieved from ->index.  Your "after" code
   always sets it explicitly.   It could be argued that this is not
   as safe.

  I'm not really sure how important this all is, and whether ->index
  really can give a surprise value in this context.  But I thought I
  would mention it, and suggest that if you really want to make this
  change which (along with the reversal of trylock_page and testing
  PageWriteback) doesn't seem to be directly related to fsblock, then
  maybe it should be a separate patch so it could be reviewed more
  effectively.  Either way, it would be good to have the code in
  'truncate' and 'invalidate' as similar as possible, including
  comments.

5/ Still in truncate_inode_pages_range: what is the point of making
   any changes in there at all?  They seem to be just syntactic
   rearrangement with no obvious gain.
   I can see that you add a test for PageWriteback before
   trylock_page, which could avoid taking a page lock in some cases so
   that might be a micro-optimisation.
   I can also see that you have dropped a call to cond_resched(),
   which probably needs some justification.
   In any case, this should all be in a separate patch.


6/ I would love to have more words explaining the new
   address_space_operations.  We even have a file to contain them:
     Documentation/filesystems/vfs.txt.

   I'm guessing that the "any private data" is typically
   buffers/blocks holding metadata (e.g. indirect blocks)

   Hmmm.. looking further it seems that courtesy of struct mb_assoc,
   one fsblock of metadata can be shared by multiple address_spaces.
   I guess that is for block-allocation bitmaps etc.

   I'm beginning to wonder if this belongs in the address_space rather
   than directly in the inode.  Part of my reasoning for this is that
   I didn't think that i_data should normally be used in generic code,
   i_mapping should be used instead.  But all accesses to private_list
   and assoc_mapping use i_data, suggesting that it is really part of
   the inode.

   I realise that you didn't make it this way, but if it is a bad
   design (which I'm beginning to suspect) then we should not perpetuate
   it by putting the 'release' and 'sync' operations in the
   address_space.  They should rather be inode_operations I think.

7/ Those kmallocs in fsblock_init are a worry.  Surely it isn't hard
   to come up with a scheme for static allocation.
   And I think you should remove "if (i < 30)" and just leave it as
   a simple 'else'.

8/ Given that sector_t is unsigned, code like
+	block->block_nr = -1;
   (in init_blocks) bothers me.
+	sector_t last_block_nr = (sector_t)-1ULL;
   Bothers me even more. "-1" as an unsigned long long ??
   I would much prefer you used (~0ULL).  But maybe that's just me. 


9/ I'm a bit confused by fsblock_writeout_data and its interaction
   with writeout_block and the PageWriteback flag.  This is in the
   subpage case.

   fsblock_writeout_data seems to want to write out lots of blocks
   in sequential order.
   It will find a block, lock the page, and (if the block is still
   dirty) start writeout using writeout_block.  This will set
   PageWriteback on the page.

   fsblock_writeout_data will often find the next block which is in
   the same page, but as PageWriteback is now set, it will not try to
   lock and writeout that block.  So I can imagine it writing out
   only the first block of each page.  Surely this cannot be
   intended.  Am I missing something?
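
   To make that concrete, this is roughly the shape I read the loop as
   having -- my paraphrase, and find_next_dirty_block() and friends are
   names I made up, not yours:

	/* paraphrase of the subpage writeout loop, not the actual code */
	while ((fsb = find_next_dirty_block(mapping, &pos)) != NULL) {
		struct page *page = fsblock_page(fsb);

		if (PageWriteback(page))
			continue;	/* later blocks in this page get skipped */
		if (!trylock_page(page))
			continue;
		if (test_and_clear_block_dirty(fsb))
			writeout_block(fsb);	/* sets PageWriteback */
		unlock_page(page);
	}

   Once writeout_block() sets PageWriteback for the first block, every
   sibling block in that page fails the PageWriteback test above.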

10/  list_add_tail(new, head) is better than list_add(new, head->prev)
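
   The two expand to the same __list_add() call (from
   include/linux/list.h):

	static inline void list_add(struct list_head *new, struct list_head *head)
	{
		__list_add(new, head, head->next);
	}

	static inline void list_add_tail(struct list_head *new, struct list_head *head)
	{
		__list_add(new, head->prev, head);
	}

   so list_add(new, head->prev) is __list_add(new, head->prev, head),
   which is exactly list_add_tail(new, head) -- the latter just says
   what it means.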
  
11/ nit.  checkpatch.pl reports:
  total: 67 errors, 141 warnings, 7239 lines checked
  I thought to run it because I noticed white-space-damage in
  blkdev_get_block. 

12/ In 'invalidate_block' you have commented out lock_block, and 
   put in a comment "why lock?".

   The reason to lock is to wait for pending writeout to complete.
   If the writeout was triggered by the VFS/VM, then the page will
   have PageWriteback set and we will already have waited for that to
   clear.  However if the writeout was triggered internally by the
   filesystem (such as for data ordering guarantees) then
   PageWriteback will not have been set, but the block (not the page)
   will be locked.

   At least that is how I understand buffer_heads to work.  However
   for fsblock you set PageWriteback whenever you write out any block
   on the page.  As noted in point 9 above,  I'm not convinced that is
   the correct thing to do.

And finally:

13/ 
+	/*
+	 * XXX: maybe use private alloc funcions so fses can embed block into
+	 * their fs-private block rather than using ->private? Maybe ->private
+	 * is easier though...
+	 */

   Yes, that's just what I was going to say.

   I find 'struct fsblock_meta' slightly confusing.   It extends
   'struct fsblock' in two ways.
   1/ It embeds fsblock in itself, just adding a 'data' pointer.
   2/ It sometimes sets ->private to a 'struct mb_assoc'.

   So it looks like you "have a bob each way".  Both embedding and using
   ->private.

   It seems to me that you have a core 'fsblock' subsystem, and a
   secondary fsblock_meta subsystem which would be useful for some
   filesystems and useless to other filesystems which handle metadata
   in a different way.
   It would be nice if these two were clearly delineated.
   It would also be nice if filesystems could extend fsblock in
   whatever way suited them.
   Using embedding seems to be the most common approach in Linux these
   days, and saves a pointer.  Using ->private has the advantage of
   allowing you to allocate little arrays of fsblocks in common code
   because you "know" how big they should be.  More importantly it
   allows 'for_each_block' to work fairly easily.

   Given that the fs would need to allocate its own 'private' info
   anyway, for_each_block is really the only benefit of using
   ->private.
   It shouldn't be too hard to use (say) 8 bits for fsblock to record
   the size of the full structure (much as you currently record the
   number of blocks-per-page) and then for_each_block can use that.

   You could even save a couple of bits by allocating one int plus
   N filesystem_block structures, and recording 'N' and 'sizeof
   filesystem_block'  in the 'int'.  You could even put the spinlock
   bit in that int rather than (ab)using the lsb of page->private.
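
   For illustration, something like the following is what I have in
   mind for the 8-bit size idea.  This is purely a hypothetical sketch;
   apart from 'struct fsblock' itself, every name below is made up:

	/* hypothetical sketch only -- not from the patch */
	struct fsblock {
		unsigned int	flags;
		unsigned char	struct_size;	/* sizeof the containing struct */
		/* ... */
	};

	struct myfs_block {
		struct fsblock	block;		/* must be the first member */
		unsigned int	myfs_state;	/* fs-private extension */
	};

	static inline struct fsblock *next_block(struct fsblock *b)
	{
		return (struct fsblock *)((char *)b + b->struct_size);
	}

   for_each_block() would then step with next_block() rather than
   assuming a stride of sizeof(struct fsblock), and the filesystem gets
   back to its own structure with container_of().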


I think that will do for now.

NeilBrown

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch] fsblock preview
  2008-09-16 11:35     ` Neil Brown
@ 2008-09-23  4:39       ` Nick Piggin
  2008-09-24  1:31         ` Neil Brown
  0 siblings, 1 reply; 6+ messages in thread
From: Nick Piggin @ 2008-09-23  4:39 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-fsdevel

Thanks for the review so far, and sorry for not getting back to you earlier.
Just starting to get back into the swing of things.

On Tue, Sep 16, 2008 at 09:35:16PM +1000, Neil Brown wrote:
> On Monday September 15, npiggin@suse.de wrote:
> > OK, vger doesn't seem to like my patch, so I'll have to give a url to it,
> > sorry.
> 
> Thanks (A URL is lots better than nothing)!
> 
> Some comments.
> 
> 1/ I think you need to include a design document as a patch to
>    Documentation/something.  This would help me not to ask the same
>    question twice, such as:

Probably, or at least more documentation in the .c files. But the code
really had been in flux up until not long ago -- I switched from using
atomic bitops on the fsblock flags to having a lock to protect them. I
don't anticipate further changes on that scale, but possibly a few
things will be required, e.g. if I look at switching ext3 to fsblocks...
But anyway now that it is settling down I will make a point to start
adding more documentation. Good point.

 
> 2/ Why do you allow pages in the same file to have either blocks or
>    buffers.   Surely it should be set-in-concrete on a per-address-space
>    (and probably per-filesystem) basis whether buffer_heads or blocks
>    or something else is used.

Yeah, this is a hack because I'm using the block device for both types
of filesystems...

 
>    I know I've asked this before so I found the answer in my mail
>    archives.
>    Some filesystems store their metadata in the same address-space
>    that block_device.c uses for reading/writing the device.  For this
>    to work they must both use the same buffering strategy.
> 
>    My feeling is that the complexity that you add to switch between
>    buffers and block on a per-page basis is too great a cost, and you
>    should avoid this.
>    My preferred approach would be to require the filesystem to use a
>    separate address-space to store the metadata (even if it is still
>    physically indexed).  However I suspect that would cause breakage
>    for things like tune2fs as the caches would no longer be coherent.
>    (Would that actually be a problem for anything other than the
>     filesystem superblock?  In which case, just continue to use
>    buffer_heads for the superblock and fsblock for the rest).
> 
>    The alternative would be for block_device.c to either use buffers
>    or blocks depending on how the filesystem is using it.
>    So on first open, default to using buffers.  When the filesystem
>    bd_claims the device it can switch it to using blocks.  It already
>    must potentially flush the whole cache if it is changing block
>    size.  This is just a small extra step.  You would probably just
>    replace the address_space_operations structure in something similar
>    to set_blocksize, between sync_blockdev and kill_bdev.
> 
>    This might end up being more code, but it would remove some real
>    ugliness from your current patch.

Definitely that is the plan, which Christoph also suggested a year
ago. It will make all of that much nicer. Coherency should not be a problem
because for block devices we have the big-hammer coherency of flushing on
close, and I suspect online fs tools would use ioctls or syscalls into
the filesystem rather than raw bdev access.

The issue is that I just never spent enough time figuring out how to do
this. I ended up with code that "works" so there wasn't enough pressure
to change it :P Actually it is now the most ugly part of the patchset, so
I should really fix this too.


>    At the very least, I think that the difference between using
>    buffer_heads and fsblocks should be entirely encoded in the
>    PageBlocks flags, and that PagePrivate should be left to mean
>    exactly what it currently does:  That the ->private field is in
>    use by something.

It's even better than that. When fsblock filesystems use their own linear
address space for metadata, PageBlocks can go away completely. PagePrivate,
to the VM, just means that it must call into the aops in some situations.
Then the aops themselves decide what to do... so if we have no "mixed"
buffer and fsblock aops, then there is no need to distinguish :)

 
> 3/ Why do blocks hold a refcount on the page.  This seems wrong.
>    The setting of PagePrivate (or PageBlock) is the equivalent of a
>    refcount.  If it is set, then releasepage will be called to when
>    the VM wants to drop the page, and invalidatepage will be called
>    when it insists on dropping the page.  No need for a refcount.

Yeah, well... this is just how it has been done. The VM expects a refcount,
but I agree it doesn't really need an extra one.

 
>    I found a hint in the patch which suggests that this is needed for 
>    superpage support.  So when you truncate a file at the middle of a
>    superpage, fsblock can still hold on to the pages that
>    truncate_inode_pages wants to get rid of.  However as you didn't
>    include that I cannot yet comment on whether that seems
>    justifiable.

Yeah, that's one of the tricky places that needs a bit more VM changes to
really get it right.

 
>    As I think more on this, I'm wondering if I've missed something
>    (entirely possible).  If each block does indeed hold a refcount on
>    the page, then freeing idle pages which happen to have blocks
>    attached is not ever going to happen.  This would be ok if you
>    dropped the fsblocks as soon as they weren't in use, but your
>    commentary seems to suggest that is an option, not a certainty. 
>    Confused.

Yeah, it's just that we already expect PagePrivate to elevate the refcount
everywhere... actually if it didn't, I think the VM might not reclaim those
pages ;) Buffer heads do the same thing.

 
> 4/ In truncate_inode_pages_range, you have:
> 
> -			if (page_index > end) {
> -				next = page_index;
> +			next = page_index+1;
> +			if (next-1 > end)
>  				break;
> -			}
>  
> -			if (page_index > next)
> -				next = page_index;
> -			next++;
> 
>   This reminds me of very similar code in invalidate_mapping_pages
>   which has the comment
> 			/*
> 			 * We really shouldn't be looking at the ->index of an
> 			 * unlocked page.  But we're not allowed to lock these
> 			 * pages.  So we rely upon nobody altering the ->index
> 			 * of this (pinned-by-us) page.
> 			 */
>   (inserted by a patch attributed to me, but I think the text is from
>    Andrew Morton).
>   The theory is that until you get the page lock, the ->mapping and
>   ->index of a page can change (I'm not sure how.  Maybe splice??).
>   Note that truncate_complete_page (which is called fairly shortly
>   after the patch included above) starts with
> 	if (page->mapping != mapping)
> 		return;
> 
>   which seems to justify the need for caution.
> 
>   Now, back to the change that you made:
>    The "before" code will only ever increase next based on the
>    (possibly random) value retrieved from ->index.  Your "after" code
>    always sets it explicitly.   It could be argued that this is not
>    as safe.

Well yeah, it's a hack because the radix tree range operations are not
really good for the task at hand. Ideally they would obey ranges and
operate on an iterator or at least return the next range to start looking
at for the next lookup.

Actually, index should not change, and mapping should only ever go to
NULL I think. But people get paranoid about truncate and do funny checks
(which sometimes serve to confuse the issue).

But yeah, most of the changes to generic code need to be put into separate
patches and in some cases (possibly this one), the change is just a relic
and no longer required. Don't worry too much about these for the moment.

 
>   I'm not really sure how important this all is, and whether ->index
>   really can give a surprise value in this context.  But I thought I
>   would mention it, and suggest that if you really want to make this
>   change which (along with the reversal of trylock_page and testing
>   PageWriteback) doesn't seem to be directly related to fsblock, then
>   maybe it should be a separate patch so it could be reviewed more
>   effectively.  Either way, it would be good to have the code in
>   'truncate' and 'invalidate' as similar as possible, including
>   comments.

Agreed.

 
> 5/ Still in truncate_inode_pages_range: what is the point of making
>    any changes in there at all?  They seem to be just syntactic
>    rearrangement with no obvious gain.
>    I can see that you add a test for PageWriteback before
>    trylock_page, which could avoid taking a page lock in some cases so
>    that might be a micro-optimisation.
>    I can also see that you have dropped a call to cond_resched(),
>    which probably needs some justification.
>    In any case, this should all be in a separate patch.

Yep.

 
> 6/ I would love to have more words explaining the new
>    address_space_operations.  We even have a file to contain them:
>      Documentation/filesystems/vfs.txt.

Yes, they would all need to be in individual patches, properly documented
etc.


>    I'm guessing that the "any private data" is typically
>    buffers/blocks holding metadata (e.g. indirect blocks)
> 
>    Hmmm.. looking further it seems that courtesy of struct mb_assoc,
>    one fsblock of metadata can be shared by multiple address_spaces.
>    I guess that is for block-allocation bitmaps etc.

Yes.

 
>    I'm beginning to wonder if this belongs in the address_space rather
>    than directly in the inode.  Part of my reasoning for this is that
>    I didn't think that i_data should normally be used in generic code,
>    i_mapping should be used instead.  But all accesses to private_list
>    and assoc_mapping use i_data, suggesting that it is really part of
>    the inode.
> 
>    I realise that you didn't make it this way, but if it is a bad
>    design (which I'm beginning to suspect) then we should not
>    perpetuate it by putting the 'release' and 'sync' operations in the
>    address_space.  They should rather be inode_operations I think.

Hmm... I don't know the coda code at all, but AFAIK it doesn't really make
sense to use both i_mapping and i_data at the same time if they are
different. At any rate I guess we get away with it because nothing 
that uses buffer heads switches the i_mapping?

 
> 7/ Those kmallocs in fsblock_init are a worry.  Surely it isn't hard
>    to come up with a scheme for static allocation.
>    And I think you should remove "if (i < 30)" and just leave it as
>    a simple 'else'.

Oh? I don't see such a problem except I should panic if they fail I
suppose (rather than oops strangely).

 
> 8/ Given that sector_t is unsigned, code like
> +	block->block_nr = -1;
>    (in init_blocks) bothers me.
> +	sector_t last_block_nr = (sector_t)-1ULL;
>    Bothers me even more. "-1" as an unsigned long long ??
>    I would much prefer you used (~0ULL).  But maybe that's just me. 

I guess (sector_t)ULLONG_MAX ?

 
> 9/ I'm a bit confused by fsblock_writeout_data and its interaction
>    with writeout_block and the PageWriteback flag.  This is in the
>    subpage case.
> 
>    fsblock_writeout_data seems to want to write out lots of blocks
>    in sequential order.
>    It will find a block, lock the page, and (if the block is still
>    dirty) start writeout using writeout_block.  This will set
>    PageWriteback on the page.
> 
>    fsblock_writeout_data will often find the next block which is in
>    the same page, but as PageWriteback is now set, it will not try to
>    lock and writeout that block.  So I can imagine it writing out
>    only the first block of each page.  Surely this cannot be
>    intended.  Am I missing something?

No... it's just been ignored :P It's not a trivial problem, but for the
trivial case at least (blocks linear within the page), I should definitely
be doing page-based writeout.


> 10/  list_add_tail(new, head) is better than list_add(new, head->prev)

OK.

   
> 11/ nit.  checkpatch.pl reports:
>   total: 67 errors, 141 warnings, 7239 lines checked
>   I thought to run it because I noticed white-space-damage in
>   blkdev_get_block. 
> 
> 12/ In 'invalidate_block' you have commented out lock_block, and 
>    put in a comment "why lock?".
> 
>    The reason to lock is to wait for pending writeout to complete.
>    If the writeout was triggered by the VFS/VM, then the page will
>    have PageWriteback set and we will already have waited for that to
>    clear.  However if the writeout was triggered internally by the
>    filesystem (such as for data ordering guarantees) then
>    PageWriteback will not have been set, but the block (not the page)
>    will be locked.
> 
>    At least that is how I understand buffer_heads to work.  However
>    for fsblock you set PageWriteback whenever you write out any block
>    on the page.  As noted in point 9 above,  I'm not convinced that is
>    the correct thing to do.

Yeah, fsblock keeps dirty and writeout coherent, and uses only the page
lock (rather than page and buffer lock that buffer heads use) for fsblock
internal locking. I have not found any cases where the locking is needed,
but I was doing it in case minix/xfs/ext2/etc required it for some reason.

But if I can have a clean break, I'll just require filesystems do their
own locking rather than use the lock here ;) In fsblock I'm never going to
add code with comments that look like /* I think reiserfs does this */,
/* Does this still happen */ etc ;)


 
> And finally:
> 
> 13/ 
> +	/*
> +	 * XXX: maybe use private alloc funcions so fses can embed block into
> +	 * their fs-private block rather than using ->private? Maybe ->private
> +	 * is easier though...
> +	 */
> 
>    Yes, that's just what I was going to say.
> 
>    I find 'struct fsblock_meta' slightly confusing.   It extends
>    'struct fsblock' in two ways.
>    1/ It embeds fsblock in itself, just adding a 'data' pointer.
>    2/ It sometimes sets ->private to a 'struct mb_assoc'.
> 
>    So it looks like you "have a bob each way".  Both embedding and using
>    ->private.

Yep. It's a bigger problem for fsblock core because I'd need to do
mindless duplication of a lot of function names if fsblock was not
in the first part of fsblock_meta.

For filesystems I think they would know far more often whether a given
path is using data or metadata blocks, so I think the separation is
still a reasonable idea.

 
>    It seems to me that you have a core 'fsblock' subsystem, and a
>    secondary fsblock_meta subsystem which would be useful for some
>    filesystems and useless to other filesystems which handle metadata
>    in a different way.
>    It would be nice if these two were clearly delineated.

They somewhat are, but they share so many common helpers that they seem
pretty entangled. Filesystems should definitely be able to use one or
the other set of data/metadata handling functions though...


>    It would also be nice if filesystems could extend fsblock in
>    whatever way suited them.
>    Using embedding seems to be the most common approach in Linux these
>    days, and saves a pointer.  Using ->private has the advantage of
>    allowing you to allocate little arrays of fsblocks in common code
>    because you "know" how big they should be.  More importantly it
>    allows 'for_each_block' to work fairly easily.
> 
>    Given that the fs would need to allocate its own 'private' info
>    anyway, for_each_block is really the only benefit of using
>    ->private.
>    It shouldn't be too hard to use (say) 8 bits for fsblock to record
>    the size of the full structure (much as you currently record the
>    number of blocks-per-page) and then for_each_block can use that.
> 
>    You could even save a couple of bits by allocating one int plus
>    N filesystem_block structures, and recording 'N' and 'sizeof
>    filesystem_block'  in the 'int'.

I have considered approaches like that... It's not a clear win, given
that in other cases the filesystem may not always want its private data
to be attached and allocated, or it may need to attach different types
of private data.

But yes there are pros and cons of each. Basically though, there was no
clear fundamental reason I could see that makes one the best way to go,
so I decided to use the existing scheme so as not to make my life harder
when converting filesystems ;)


>  You could even put the spinlock
>    bit in that int rather than (ab)using the lsb of page->private.

Well, page->private of an fsblock-using filesystem's pages belongs to the
fsblock subsystem, so it's not abuse. It's neat to have the lock there because I
can take it without having any fsblocks allocated.
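
In other words, something along these lines -- modulo the exact bit
number and helpers the patch actually uses:

	/* the lsb of page->private doubles as the per-page block lock */
	bit_spin_lock(0, &page->private);
	/* ... walk or modify the page's fsblocks ... */
	bit_spin_unlock(0, &page->private);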

 
> I think that will do for now.

Thanks, you make some very good points.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch] fsblock preview
  2008-09-23  4:39       ` Nick Piggin
@ 2008-09-24  1:31         ` Neil Brown
  2008-09-25  4:38           ` Nick Piggin
  0 siblings, 1 reply; 6+ messages in thread
From: Neil Brown @ 2008-09-24  1:31 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-fsdevel

On Tuesday September 23, npiggin@suse.de wrote:
> Thanks for the review so far, and sorry for not getting back to you earlier.
> Just starting to get back into the swing of things.

:-)  The conference season does that to you.

> 
> Definitely that is the plan, which Christoph also suggested a year
> ago. It will make all of that much nicer. Coherency should not be a problem
> because for block devices we have the big-hammer coherency of flushing on
> close, and I suspect online fs tools would use ioctls or syscalls into
> the filesystem rather than raw bdev access.

You might suspect that... but you might be surprised.

  tune2fs -c 30 /dev/sda1

changes the max-mount-count (number of mounts before we force a fsck)
to 30, and does it simply by writing to the block device.  No ioctls.
This is expected to work when the filesystem is mounted.  I'm fairly
sure that works exactly because ext3 holds on to the
buffer_head for the superblock in the block device, so a write by
tune2fs updates the memory that ext3 stores the superblock in.  So
when ext3 journals and then writes that superblock, it still has the
updated max-mount-count.

> >    As I think more on this, I'm wondering if I've missed something
> >    (entirely possible).  If each block does indeed hold a refcount on
> >    the page, then freeing idle pages which happen to have blocks
> >    attached is not ever going to happen.  This would be ok if you
> >    dropped the fsblocks as soon as they weren't in use, but your
> >    commentary seems to suggest that is an option, not a certainty. 
> >    Confused.
> 
> Yeah, it's just that we already expect PagePrivate to elevate the refcount
> everywhere... actually if it didn't, I think the VM might not reclaim those
> pages ;) Buffer heads do the same thing.
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

I cannot see that.  I don't think that any code expects PagePrivate to
elevate the refcount.  I cannot see that buffer heads get a reference
on the page for each buffer_head?  There is only one get_page (a
find_get_page) in buffer.c, and it is followed shortly by a
page_cache_release.

????

>  
> > 7/ Those kmallocs in fsblock_init are a worry.  Surely it isn't hard
> >    to come up with a scheme for static allocation.
> >    And I think you should remove "if (i < 30)" and just leave it as
> >    a simple 'else'.
> 
> Oh? I don't see such a problem except I should panic if they fail I
> suppose (rather than oops strangely).

I've been programmed to raise a warning whenever I see a malloc where
the result isn't checked for NULL. :-(

> > 
> > 12/ In 'invalidate_block' you have commented out lock_block, and 
> >    put in a comment "why lock?".
> > 
> >    The reason to lock is to wait for pending writeout to complete.
> >    If the writeout was triggered by the VFS/VM, then the page will
> >    have PageWriteback set and we will already have waited for that to
> >    clear.  However if the writeout was triggered internally by the
> >    filesystem (such as for data ordering guarantees) then
> >    PageWriteback will not have been set, but the block (not the page)
> >    will be locked.
> > 
> >    At least that is how I understand buffer_heads to work.  However
> >    for fsblock you set PageWriteback whenever you write out any block
> >    on the page.  As noted in point 9 above,  I'm not convinced that is
> >    the correct thing to do.
> 
> Yeah, fsblock keeps dirty and writeout coherent, and uses only the page
> lock (rather than page and buffer lock that buffer heads use) for fsblock
> internal locking. I have not found any cases where the locking is needed,
> but I was doing it in case minix/xfs/ext2/etc required it for some reason.

I'm not sure that is a good design decision.
It means that the only way to concurrently write out two blocks that
are in the same page, is to write the whole page.

I notice that you don't even provide an interface for writing a block
asynchronously.  If the filesystem has a random collection of blocks
that it wants to write out, it has to write each one synchronously, or
maybe write out the whole page for each block, which might not be what
is wanted.

> 
> But if I can have a clean break, I'll just require filesystems do their
> own locking rather than use the lock here ;) In fsblock I'm never going to
> add code with comments that look like /* I think reiserfs does this */,
> /* Does this still happen */ etc ;)

Yeah, those comments are pretty ugly.

What I think you want is a well defined public interface that provides
the sort of functionality that filesystems want, and which takes care
of all the tricky details.
Locking is certainly a tricky detail so I'm not sure that leaving it
entirely to the filesystem is a good idea.
And async block writes are something that I expect some filesystems
would want (I have one that does).


NeilBrown

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [patch] fsblock preview
  2008-09-24  1:31         ` Neil Brown
@ 2008-09-25  4:38           ` Nick Piggin
  0 siblings, 0 replies; 6+ messages in thread
From: Nick Piggin @ 2008-09-25  4:38 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-fsdevel

On Wed, Sep 24, 2008 at 11:31:46AM +1000, Neil Brown wrote:
> On Tuesday September 23, npiggin@suse.de wrote:
> > Thanks for the review so far, and sorry for not getting back to you earlier.
> > Just starting to get back into the swing of things.
> 
> :-)  The conference season does that do you.
> 
> > 
> > Definitely that is the plan, which Christoph also suggested a year
> > ago. It will make all of that much nicer. Coherency should not be a problem
> > because for block devices we have the big-hammer coherency of flushing on
> > close, and I suspect online fs tools would use ioctls or syscalls into
> > the filesystem rather than raw bdev access.
> 
> You might suspect that... but you might be surprised.
> 
>   tune2fs -c 30 /dev/sda1
> 
> changes the max-mount-count (number of mounts before we force a fsck)
> to 30, and does it simply by writing to the block device.  No ioctls.
> This is expected to work when the filesystem is mounted.  I'm fairly
> sure that works exactly because ext3 holds on to the
> buffer_head for the superblock in the block device, so a write by
> tune2fs updates the memory that ext3 stores the superblock in.  So
> when ext3 journals and then writes that superblock, it still has the
> updated max-mount-count.

Oh. Hmm, that sucks -- the block may not have been dirty before the write, so
it may require allocations to write it out (not that filesystems people care
about that, but it is wrong and one day in the distant future we won't do it
any more in Linux).

Anyway, I don't think there is a lot that can be done really...

 
> > >    As I think more on this, I'm wondering if I've missed something
> > >    (entirely possible).  If each block does indeed hold a refcount on
> > >    the page, then freeing idle pages which happen to have blocks
> > >    attached is not ever going to happen.  This would be ok if you
> > >    dropped the fsblocks as soon as they weren't in use, but your
> > >    commentary seems to suggest that is an option, not a certainty. 
> > >    Confused.
> > 
> > Yeah, it's just that we already expect PagePrivate to elevate the refcount
> > everywhere... actually if it didn't, I think the VM might not reclaim those
> > pages ;) Buffer heads do the same thing.
>            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> I cannot see that.  I don't think that any code expects PagePrivate to
> elevate the refcount.  I cannot see that buffer heads get a reference
> on the page for each buffer_head?  There is only one get_page (a
> find_get_page) in buffer.c, and it is followed shortly by a
> page_cache_release.
> 
> ????

See attach_page_buffers and __clear_page_buffers. Actually neither buffer
heads nor fsblocks elevate the reference for each buffer/fsblock attached
to the page, but just elevate it once for N items.
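
From include/linux/buffer_head.h (quoting roughly) -- note the single
page_cache_get() regardless of how many buffers end up attached:

	static inline void attach_page_buffers(struct page *page,
			struct buffer_head *head)
	{
		page_cache_get(page);	/* one ref for the whole ring of buffers */
		SetPagePrivate(page);
		set_page_private(page, (unsigned long)head);
	}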

Actually it is not so trivial to get rid of this I think, because we must
handle the case of "orphaned" pagecache pages: low level page refcounting
definitely does not even look at PagePrivate, and it's really only a few
things like reclaim which treat that flag as a pin on the page.
 

> > > 7/ Those kmallocs in fsblock_init are a worry.  Surely it isn't hard
> > >    to come up with a scheme for static allocation.
> > >    And I think you should remove "if (i < 30)" and just leave it as
> > >    a simple 'else'.
> > 
> > Oh? I don't see such a problem except I should panic if they fail I
> > suppose (rather than oops strangely).
> 
> I've been programmed to raise a warning whenever I see a malloc where
> the result isn't checked for NULL. :-(

Done, fixed it (with a panic! :))
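
i.e. along these lines (sketch of the fix, not the exact hunk; 'blocks'
and 'size' are placeholders):

	blocks = kmalloc(size, GFP_KERNEL);
	if (!blocks)
		panic("fsblock_init: failed to allocate block arrays");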

 
> > > 12/ In 'invalidate_block' you have commented out lock_block, and 
> > >    put in a comment "why lock?".
> > > 
> > >    The reason to lock is to wait for pending writeout to complete.
> > >    If the writeout was triggered by the VFS/VM, then the page will
> > >    have PageWriteback set and we will already have waited for that to
> > >    clear.  However if the writeout was triggered internally by the
> > >    filesystem (such as for data ordering guarantees) then
> > >    PageWriteback will not have been set, but the block (not the page)
> > >    will be locked.
> > > 
> > >    At least that is how I understand buffer_heads to work.  However
> > >    for fsblock you set PageWriteback whenever you write out any block
> > >    on the page.  As noted in point 9 above,  I'm not convinced that is
> > >    the correct thing to do.
> > 
> > Yeah, fsblock keeps dirty and writeout coherent, and uses only the page
> > lock (rather than page and buffer lock that buffer heads use) for fsblock
> > internal locking. I have not found any cases where the locking is needed,
> > but I was doing it in case minix/xfs/ext2/etc required it for some reason.
> 
> I'm not sure that is a good design decision.
> It means that the only way to concurrently write out two blocks that
> are in the same page, is to write the whole page.

I should be able to add more blocks to a page under writeout, actually,
thanks to the fsblock locking.


> I notice that you don't even provide an interface for writing a block
> asynchronously.  If the filesystem has a random collection of blocks
> that it wants to write out, it has to write each one synchronously, or
> maybe write out the whole page for each block, which might not be what
> is wanted.

Right. That could be added, but I just have kind of been exporting things
as it turns out they're required. But that's a pretty obvious one.

 
> > But if I can have a clean break, I'll just require filesystems do their
> > own locking rather than use the lock here ;) In fsblock I'm never going to
> > add code with comments that look like /* I think reiserfs does this */,
> > /* Does this still happen */ etc ;)
> 
> Yeah, those comments are pretty ugly.
> 
> What I think you want is a well defined public interface that provides
> the sort of functionality that filesystems want, and which takes care
> of all the tricky details.
> Locking is certainly a tricky detail so I'm not sure that leaving it
> entirely to the filesystem is a good idea.

Well the locking is taken care of if they stay within the page lock (in
that respect it is simpler than buffer heads). But if they then go out and
do things outside that page lock and it turns out they need protection
from invalidation as well, then they'd have to do their own locking...

I just don't want to add a lot of expensive operations even to something as
rare as invalidate, "just in case".


> And async block writes are something that I expect some filesystems
> would want (I have one that does).

OK, that's a good point. I'll look at adding that.


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-09-25  4:38 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20080911165335.GA31244@wotan.suse.de>
2008-09-12  9:48 ` [patch] fsblock preview Nick Piggin
     [not found] ` <20080914221500.GH27080@wotan.suse.de>
2008-09-15  8:30   ` Nick Piggin
2008-09-16 11:35     ` Neil Brown
2008-09-23  4:39       ` Nick Piggin
2008-09-24  1:31         ` Neil Brown
2008-09-25  4:38           ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).