All of lore.kernel.org
 help / color / mirror / Atom feed
From: Con Kolivas <kernel@kolivas.org>
To: linux list <linux-kernel@vger.kernel.org>
Cc: ck list <ck@vds.kolivas.org>
Subject: [ckpatch][28/29] fs-fcache-v2.1.patch
Date: Sun, 18 Jun 2006 17:35:30 +1000	[thread overview]
Message-ID: <200606181735.30466.kernel@kolivas.org> (raw)

A frontend cache for a block device. The purpose is to speedup a
fairly random but repeated read work load, like the boot of a system.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/ll_rw_blk.c       |   11 
 drivers/block/Kconfig   |    6 
 drivers/block/Makefile  |    1 
 drivers/block/fcache.c  | 1475 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext3/super.c         |   81 ++
 include/linux/bio.h     |    9 
 include/linux/ext3_fs.h |   14 
 7 files changed, 1587 insertions(+), 10 deletions(-)

Index: linux-ck-dev/block/ll_rw_blk.c
===================================================================
--- linux-ck-dev.orig/block/ll_rw_blk.c	2006-06-18 15:20:10.000000000 +1000
+++ linux-ck-dev/block/ll_rw_blk.c	2006-06-18 15:25:27.000000000 +1000
@@ -2817,12 +2817,10 @@ static void init_request_from_bio(struct
 	 */
 	if (bio_rw_ahead(bio) || bio_failfast(bio))
 		req->flags |= REQ_FAILFAST;
-
-	/*
-	 * REQ_BARRIER implies no merging, but lets make it explicit
-	 */
 	if (unlikely(bio_barrier(bio)))
-		req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+		req->flags |= REQ_HARDBARRIER;
+	if (!bio_mergeable(bio))
+		req->flags |= REQ_NOMERGE;
 
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
@@ -2870,7 +2868,7 @@ static int __make_request(request_queue_
 
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(barrier) || elv_queue_empty(q))
+	if (!bio_mergeable(bio) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -3109,6 +3107,7 @@ void submit_bio(int rw, struct bio *bio)
 
 	BIO_BUG_ON(!bio->bi_size);
 	BIO_BUG_ON(!bio->bi_io_vec);
+	BIO_BUG_ON(bio->bi_next);
 	bio->bi_rw |= rw;
 	if (rw & WRITE)
 		mod_page_state(pgpgout, count);
Index: linux-ck-dev/drivers/block/Kconfig
===================================================================
--- linux-ck-dev.orig/drivers/block/Kconfig	2006-06-18 15:20:10.000000000 +1000
+++ linux-ck-dev/drivers/block/Kconfig	2006-06-18 15:25:27.000000000 +1000
@@ -456,4 +456,10 @@ config ATA_OVER_ETH
 	This driver provides Support for ATA over Ethernet block
 	devices like the Coraid EtherDrive (R) Storage Blade.
 
+config BLK_FCACHE
+	bool "Boot frontend cache driver"
+	help
+	This driver puts the data needed for a boot sequentially in a
+	defined place, taking all seeks out of the boot process.
+
 endmenu
Index: linux-ck-dev/drivers/block/Makefile
===================================================================
--- linux-ck-dev.orig/drivers/block/Makefile	2006-06-18 15:20:10.000000000 +1000
+++ linux-ck-dev/drivers/block/Makefile	2006-06-18 15:25:27.000000000 +1000
@@ -5,6 +5,7 @@
 # Rewritten to use lists instead of if-statements.
 # 
 
+obj-$(CONFIG_BLK_FCACHE)	+= fcache.o
 obj-$(CONFIG_MAC_FLOPPY)	+= swim3.o
 obj-$(CONFIG_BLK_DEV_FD)	+= floppy.o
 obj-$(CONFIG_AMIGA_FLOPPY)	+= amiflop.o
Index: linux-ck-dev/drivers/block/fcache.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-ck-dev/drivers/block/fcache.c	2006-06-18 15:25:27.000000000 +1000
@@ -0,0 +1,1475 @@
+/*
+ * A frontend cache for a block device. The purpose is to speedup a
+ * fairly random but repeated read work load, like the boot of a system.
+ *
+ * When run in priming mode, fcache allocates and writes data read from
+ * the source drive to our extent cache in the order in which they are
+ * accessed. When later run in non-priming mode, data accessed in the same
+ * order will be linearly available in the cache.
+ *
+ * Performance when priming is slower than non-fcache usage would be. If
+ * the fcache is located on another disk, the hit should be small. If the
+ * the fcache is located on the same disk (another partition), it runs
+ * at about half the speed. Non-priming performance should be fairly
+ * similar on same/other disk.
+ *
+ * On-disk format is as follows:
+ *	Block0:		header
+ *	Block1..X	extent maps
+ *	BlockX+1..Y	extent data
+ *
+ * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
+#include <linux/prio_tree.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+
+#define FCACHE_MAGIC	0x61786663
+#define FCACHE_VERSION	0x02
+
+#define FCACHE_HEADER_BLOCK	0
+#define FCACHE_EXTENT_BLOCK	1
+
+#undef FCACHE_PAGES_PROTECTED
+
+struct fcache_dev {
+	struct block_device *bdev;
+	struct block_device *fs_bdev;
+	make_request_fn *mfn;
+	struct prio_tree_root prio_root;
+	unsigned long next_cache_block;
+	unsigned long nr_extents;
+	unsigned long max_extents;
+	unsigned int old_bs;
+	spinlock_t lock;
+
+	sector_t cache_start_sector;
+	unsigned long cache_blocks;
+	sector_t fs_start_sector;
+	sector_t fs_sectors;
+
+	unsigned long flags;
+	int priming;
+	int serial;
+	int chop_ios;
+
+	struct list_head list;
+	struct work_struct work;
+
+	/*
+	 * stats
+	 */
+	unsigned int ios[2];
+	unsigned int hits;
+	unsigned int misses;
+	unsigned int overwrites;
+};
+
+enum {
+	FDEV_F_DOWN = 0,
+};
+
+static struct fcache_dev fcache_dev;
+
+static int disable;
+module_param(disable, int, 0444);
+
+struct fcache_endio_data {
+	struct fcache_dev *fdev;
+	sector_t fs_sector;
+	unsigned int fs_size;
+	sector_t cache_sector;
+	atomic_t completions;
+	struct bio *bio;
+	int io_error;
+	struct list_head list;
+};
+
+/*
+ * Maps a file system block to the fcache
+ */
+struct fcache_extent {
+	sector_t fs_sector;	/* real device offset */
+	unsigned int fs_size;	/* extent length */
+	sector_t cache_sector;	/* cache device offset */
+
+	struct prio_tree_node prio_node;
+};
+
+/*
+ * Header on fcache device - will take up the first page of data, so
+ * plenty of room to go around.
+ */
+struct fcache_header {
+	u32 magic;		/* fcache magic */
+	u32 version;		/* fcache version */
+	u32 nr_extents;		/* nr of extents in cache */
+	u32 max_extents;	/* max nr of extents available */
+	u32 serial;		/* fs and cache serial */
+	u32 extent_offset;	/* where extents start */
+	u64 fs_start_sector;	/* where fs starts */
+	u64 fs_sectors;		/* how big fs is */
+	char fs_dev[BDEVNAME_SIZE];	/* fs partition */
+	u64 cache_blocks;	/* number of blocks in cache */
+	u64 cache_blocks_used;	/* used blocks in cache */
+	u16 sector_t_size;	/* user space helper */
+	u16 extent_size;	/* user space helper */
+};
+
+#define BLOCK_SHIFT	(PAGE_SHIFT - 9)
+
+static struct kmem_cache *fcache_slab;
+static struct kmem_cache *fcache_fed_slab;
+static mempool_t *fed_pool;
+static struct workqueue_struct *fcache_workqueue;
+
+static int fcache_rw_page_endio(struct bio *bio, unsigned int bytes, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	complete(bio->bi_private);
+	return 0;
+}
+
+/*
+ * Writes out a page of data and waits for it to complete.
+ */
+static int fcache_rw_page(struct fcache_dev *fdev, sector_t index,
+			  struct page *page, int rw)
+{
+	DECLARE_COMPLETION(wait);
+	struct bio *bio;
+	int ret = 0;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+
+	bio->bi_sector = index << BLOCK_SHIFT;
+	bio->bi_bdev = fdev->bdev;
+	bio->bi_rw |= (1 << BIO_RW_SYNC);
+	bio->bi_end_io = fcache_rw_page_endio;
+	bio->bi_private = &wait;
+
+	bio_add_page(bio, page, PAGE_SIZE, 0);
+	submit_bio(rw, bio);
+
+	wait_for_completion(&wait);
+
+	if (!bio_flagged(bio, BIO_UPTODATE))
+		ret = -EIO;
+
+	bio_put(bio);
+	return ret;
+}
+
+static inline void fcache_fill_header(struct fcache_dev *fdev,
+				      struct fcache_header *header,
+				      unsigned int nr_extents)
+{
+	/*
+	 * See how many pages we need for extent headers, then we know where
+	 * to start putting data. Assume worst case of 1 page per extent, and
+	 * reserve the first page for the header.
+	 */
+
+	header->magic = FCACHE_MAGIC;
+	header->version = FCACHE_VERSION;
+	header->nr_extents = nr_extents;
+	header->max_extents = ((fdev->cache_blocks - 1) * PAGE_SIZE) / (PAGE_SIZE - sizeof(struct fcache_extent));
+	header->serial = fdev->serial;
+
+	header->extent_offset = 1 + (header->max_extents * sizeof(struct fcache_extent) / PAGE_SIZE);
+
+	header->fs_start_sector = fdev->fs_start_sector;
+	header->fs_sectors = fdev->fs_sectors;
+	bdevname(fdev->fs_bdev, header->fs_dev);
+	header->cache_blocks = fdev->cache_blocks;
+	header->cache_blocks_used = fdev->next_cache_block;
+	header->sector_t_size = sizeof(sector_t);
+	header->extent_size = sizeof(struct fcache_extent);
+}
+
+static int fcache_write_new_header(struct fcache_dev *fdev)
+{
+	struct fcache_header *header;
+	struct page *page;
+	int ret;
+
+	page = alloc_page(GFP_HIGHUSER);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	header = kmap_atomic(page, KM_USER0);
+	clear_page(header);
+	fcache_fill_header(fdev, header, 0);
+	fdev->next_cache_block = header->extent_offset;
+	fdev->max_extents = header->max_extents;
+	kunmap_atomic(header, KM_USER0);
+
+	printk("fcache: new header: first block %lu, max %lu\n",
+				fdev->next_cache_block, fdev->max_extents);
+	ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
+	__free_page(page);
+	return ret;
+}
+
+static void fcache_free_prio_tree(struct fcache_dev *fdev)
+{
+	struct fcache_extent *fe;
+	struct prio_tree_iter iter;
+	struct prio_tree_node *node;
+
+	/*
+	 * Now prune and free tree, wish there was a better way...
+	 */
+	do {
+		prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
+
+		node = prio_tree_next(&iter);
+		if (!node)
+			break;
+
+		fe = prio_tree_entry(node, struct fcache_extent, prio_node);
+		prio_tree_remove(&fdev->prio_root, node);
+		kmem_cache_free(fcache_slab, fe);
+	} while (1);
+}
+
+/*
+ * First clear the header, write extents, then write real header.
+ */
+static int fcache_write_extents(struct fcache_dev *fdev)
+{
+	struct fcache_header *header;
+	sector_t index, sectors;
+	unsigned int nr_extents, this_extents;
+	struct fcache_extent *fe;
+	struct prio_tree_iter iter;
+	struct prio_tree_node *node;
+	struct page *page;
+	void *p;
+	int ret;
+
+	page = alloc_page(GFP_KERNEL);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	header = page_address(page);
+	clear_page(header);
+	fcache_fill_header(fdev, header, 0);
+	ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
+	if (ret)
+		goto err;
+
+	/*
+	 * Now write the extents in page size chunks.
+	 */
+	p = page_address(page);
+	clear_page(p);
+	index = FCACHE_EXTENT_BLOCK;
+	sectors = 0;
+	this_extents = nr_extents = 0;
+
+	prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
+
+	do {
+		node = prio_tree_next(&iter);
+		if (!node)
+			break;
+
+		fe = prio_tree_entry(node, struct fcache_extent, prio_node);
+		nr_extents++;
+		this_extents++;
+		sectors += fe->fs_size >> 9;
+		memcpy(p, fe, sizeof(*fe));
+		p += sizeof(*fe);
+		if ((this_extents + 1) * sizeof(*fe) > PAGE_SIZE) {
+			ret = fcache_rw_page(fdev, index, page, WRITE);
+			if (ret)
+				break;
+
+			this_extents = 0;
+			index++;
+			p = page_address(page);
+		}
+	} while (1);
+
+	if (this_extents)
+		ret = fcache_rw_page(fdev, index, page, WRITE);
+
+	fdev->nr_extents = nr_extents;
+	printk("fcache: wrote %d extents, holding %llu sectors of data\n",
+				nr_extents, (unsigned long long) sectors);
+err:
+	__free_page(page);
+	return ret;
+}
+
+static int fcache_write_header(struct fcache_dev *fdev)
+{
+	struct page *page;
+	int ret;
+
+	page = alloc_page(GFP_KERNEL);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
+	if (!ret) {
+		struct fcache_header *header = page_address(page);
+
+		fcache_fill_header(fdev, header, fdev->nr_extents);
+		ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
+		printk("fcache: wrote header (extents=%lu,serial=%d)\n",
+						fdev->nr_extents, fdev->serial);
+	}
+
+	__free_page(page);
+	return ret;
+}
+
+static void fcache_tree_link(struct fcache_dev *fdev, struct fcache_extent *fe)
+{
+	struct prio_tree_node *node = &fe->prio_node;
+	unsigned long flags;
+
+	INIT_PRIO_TREE_NODE(node);
+	node->start = fe->fs_sector;
+	node->last = fe->fs_sector + (fe->fs_size >> 9) - 1;
+
+	spin_lock_irqsave(&fdev->lock, flags);
+	prio_tree_insert(&fdev->prio_root, node);
+	spin_unlock_irqrestore(&fdev->lock, flags);
+}
+
+#define MAX_FE	16
+
+/*
+ * Lookup the range of a given request in the prio tree. Used for both
+ * looking up a range covering a read operation to be served from cache,
+ * and to lookup potential conflicts from a new write with an existing
+ * extent.
+ */
+static int fcache_lookup_extent(struct fcache_dev *fdev, sector_t offset,
+				unsigned int bytes, struct fcache_extent **map)
+{
+	sector_t end_sector = offset + (bytes >> 9) - 1;
+	struct prio_tree_node *node;
+	struct prio_tree_iter iter;
+	int i = 0;
+
+	prio_tree_iter_init(&iter, &fdev->prio_root, offset, end_sector);
+
+	/*
+	 * We only need to lock, if we are priming. The prio tree does
+	 * not change when in normal mode.
+	 */
+	if (fdev->priming)
+		spin_lock_irq(&fdev->lock);
+
+	do {
+		node = prio_tree_next(&iter);
+		if (!node)
+			break;
+
+		map[i] = prio_tree_entry(node, struct fcache_extent, prio_node);
+	} while (++i < MAX_FE);
+
+	if (fdev->priming)
+		spin_unlock_irq(&fdev->lock);
+
+	return i;
+}
+
+/*
+ * Our data write is done, now insert the fcache extents into the rbtree.
+ */
+static int fcache_instantiate_extent(struct fcache_dev *fdev,
+				     struct fcache_endio_data *fed)
+{
+	struct fcache_extent *fe;
+
+	fe = kmem_cache_alloc(fcache_slab, GFP_ATOMIC);
+	if (fe) {
+		fe->fs_sector = fed->fs_sector;
+		fe->fs_size = fed->fs_size;
+		fe->cache_sector = fed->cache_sector;
+
+		fcache_tree_link(fdev, fe);
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/*
+ * Hang on to the bio and its pages - ideally we would want to ensure
+ * that the page data doesn't change between calling this function and
+ * fcache_put_bio_pages() as well...
+ */
+static void fcache_get_bio_pages(struct fcache_dev *fdev, struct bio *bio)
+{
+	/*
+	 * Currently stubbed out, as we cannot end the bio read before
+	 * the write completes without also making sure that the pages
+	 * don't get reused for something else in the mean time.
+	 */
+#ifdef FCACHE_PAGES_PROTECTED
+	struct bio_vec *bvec;
+	int i;
+
+	bio_get(bio);
+
+	__bio_for_each_segment(bvec, bio, i, 0)
+		get_page(bvec->bv_page);
+#endif
+}
+
+static void fcache_put_bio_pages(struct fcache_dev *fdev, struct bio *bio)
+{
+#ifdef FCACHE_PAGES_PROTECTED
+	struct bio_vec *bvec;
+	int i;
+
+	__bio_for_each_segment(bvec, bio, i, 0)
+		put_page(bvec->bv_page);
+
+	bio_put(bio);
+#endif
+}
+
+static void fcache_chop_write_done(struct fcache_endio_data *fed)
+{
+	/*
+	 * Last io completes.
+	 */
+	if (atomic_dec_and_test(&fed->completions)) {
+		struct fcache_dev *fdev = fed->fdev;
+		struct bio *bio = fed->bio;
+
+		/*
+		 * Release our reference to the original bio and
+		 * its pages.
+		 */
+		fcache_put_bio_pages(fdev, bio);
+
+		/*
+		 * End the read!
+		 */
+		bio_endio(bio, bio->bi_size, 0);
+
+		/*
+		 * All done, now add extent to our list if io completed ok.
+		 */
+		if (!fed->io_error)
+			fcache_instantiate_extent(fdev, fed);
+
+		mempool_free(fed, fed_pool);
+	}
+}
+
+/*
+ * Our data write to the cache completes, we can free our clone and
+ * instantiate the extent block.
+ */
+static int fcache_extent_write_endio(struct bio *bio, unsigned int bytes,
+				     int err)
+{
+	struct fcache_endio_data *fed;
+
+	if (bio->bi_size)
+		return 1;
+
+	fed = bio->bi_private;
+
+	if (!bio_flagged(bio, BIO_UPTODATE))
+		fed->io_error = -EIO;
+
+	bio_put(bio);
+	fcache_chop_write_done(fed);
+	return 0;
+}
+
+static void fcache_chop_read_done(struct fcache_endio_data *fed)
+{
+	if (atomic_dec_and_test(&fed->completions)) {
+		struct bio *bio = fed->bio;
+
+		bio_endio(bio, bio->bi_size, fed->io_error);
+		mempool_free(fed, fed_pool);
+	}
+}
+
+static int fcache_chop_read_endio(struct bio *bio, unsigned int bytes, int err)
+{
+	struct fcache_endio_data *fed;
+
+	if (bio->bi_size)
+		return 1;
+
+	fed = bio->bi_private;
+
+	if (!bio_flagged(bio, BIO_UPTODATE))
+		fed->io_error = -EIO;
+
+	bio_put(bio);
+	fcache_chop_read_done(fed);
+	return 0;
+}
+
+typedef void (chopper_done_t) (struct fcache_endio_data *);
+
+/*
+ * This is our io chopper - it hacks a bio into smaller pieces, suitable
+ * for the target device. Caller supplies suitable end_io and done functions.
+ */
+static void fcache_io_chopper(struct fcache_dev *fdev,
+			      struct fcache_endio_data *fed,
+			      bio_end_io_t *endio, chopper_done_t *done, int rw)
+{
+	struct bio *bio = NULL;
+	struct bio_vec *bv;
+	unsigned int total_bytes;
+	sector_t sector;
+	int i, vecs;
+
+	/*
+	 * Make sure 'fed' doesn't disappear while we are still issuing
+	 * ios, the artificial reference is dropped at the end.
+	 */
+	atomic_set(&fed->completions, 1);
+
+	sector = fed->cache_sector;
+	total_bytes = fed->fs_size;
+	vecs = fed->bio->bi_vcnt;
+	bio_for_each_segment(bv, fed->bio, i) {
+		unsigned int len;
+
+		if (!total_bytes)
+			break;
+
+		len = bv->bv_len;
+		if (len > total_bytes)
+			len = total_bytes;
+
+		do {
+			unsigned int l;
+
+			if (!bio) {
+				bio = bio_alloc(GFP_NOFS, vecs);
+
+				bio->bi_sector = sector;
+				bio->bi_bdev = fdev->bdev;
+				bio->bi_end_io = endio;
+				bio->bi_private = fed;
+			}
+
+			/*
+			 * If successful, break out of this loop and move on.
+			 */
+			l = bio_add_page(bio, bv->bv_page, len, bv->bv_offset);
+			if (l == len)
+				break;
+
+			BUG_ON(!bio->bi_size);
+
+			/*
+			 * We could not add this page, submit what we have
+			 * and alloc a new bio.
+			 */
+			atomic_inc(&fed->completions);
+			submit_bio(rw, bio);
+			bio = NULL;
+		} while (1);
+
+		total_bytes -= len;
+		sector += len >> 9;
+		vecs--;
+	}
+
+	if (bio) {
+		atomic_inc(&fed->completions);
+		submit_bio(rw, bio);
+	}
+
+	/*
+	 * Drop our reference to fed.
+	 */
+	done(fed);
+}
+
+/*
+ * cache device has similar or higher queue restrictions than the fs
+ * device - in that case, we can resubmit the bio to the device directly.
+ */
+static void fcache_direct_cache_write(struct fcache_dev *fdev,
+				      struct fcache_endio_data *fed)
+{
+	struct bio *bio = bio_clone(fed->bio, GFP_NOFS);
+
+	bio->bi_sector = fed->cache_sector;
+	bio->bi_bdev = fdev->bdev;
+	bio->bi_end_io = fcache_extent_write_endio;
+	bio->bi_private = fed;
+
+	atomic_set(&fed->completions, 1);
+	submit_bio(WRITE, bio);
+}
+
+/*
+ * cache device has more conservative restrictions than the fs device.
+ * The safest approach is to split up the bio and let bio_add_page()
+ * decide when it's time to submit the pieces.
+ */
+static void fcache_submit_cache_write(struct fcache_dev *fdev,
+				      struct fcache_endio_data *fed)
+{
+	if (!fdev->chop_ios)
+		fcache_direct_cache_write(fdev, fed);
+	else
+		fcache_io_chopper(fdev, fed, fcache_extent_write_endio,
+					fcache_chop_write_done, WRITE);
+}
+
+/*
+ * We punt work to fcache_work() whenever we need do work that blocks. The
+ * only thing that this thread handles is submitting the extent write
+ * when the real read has completed. We used to do the extent instantiation
+ * here as well, but fcache_extent_write_endio handles that now.
+ */
+static void fcache_work(void *data)
+{
+	struct fcache_dev *fdev = data;
+
+	do {
+		struct fcache_endio_data *fed = NULL;
+		struct bio *bio;
+
+		spin_lock_irq(&fdev->lock);
+		if (!list_empty(&fdev->list)) {
+			fed = list_entry(fdev->list.next, struct fcache_endio_data,list);
+			list_del_init(&fed->list);
+		}
+		spin_unlock_irq(&fdev->lock);
+
+		if (!fed)
+			break;
+
+		bio = fed->bio;
+
+		if (fed->io_error) {
+			printk(KERN_ERR "fcache: read error from device\n");
+			bio_endio(bio, bio->bi_size, fed->io_error);
+			continue;
+		}
+
+		/*
+		 * Get a ref on the original bio and pages, then
+		 * we should be able to signal completion of the READ
+		 * without waiting for the write to finish first.
+		 */
+		fcache_get_bio_pages(fdev, bio);
+
+		/*
+		 * Submit the read data as cache writes.
+		 */
+		fcache_submit_cache_write(fdev, fed);
+
+		/*
+		 * If fcache_get_bio_pages() could protect the pages from
+		 * being changed, we could end the io here instead of in
+		 * fcache_extent_fed_completes().
+		 */
+	} while (1);
+}
+
+/*
+ * Align bio to start at extent and stop sooner if extent is short. Must
+ * be called cautiously - it's only allowed to modify the bio if this is
+ * a clone and a write request, reads must be fully aligned and only
+ * possibly require a starting offset modification.
+ */
+static void fcache_bio_align(struct bio *bio, struct fcache_extent *fe)
+{
+	struct bio_vec *bvec;
+	sector_t start, end;
+	sector_t org_start, org_end;
+	unsigned int org_size, org_idx;
+	int i;
+
+	start = bio->bi_sector;
+	bio->bi_sector = fe->cache_sector;
+
+	/*
+	 * Nothing to do, perfectly aligned.
+	 */
+	if (start == fe->fs_sector && bio->bi_size == fe->fs_size)
+		return;
+
+	org_start = bio->bi_sector;
+	org_end = bio->bi_sector + (bio->bi_size >> 9);
+	org_size = bio->bi_size;
+	org_idx = bio->bi_idx;
+
+	/*
+	 * Adjust beginning.
+	 */
+	if (start > fe->fs_sector)
+		bio->bi_sector += (start - fe->fs_sector);
+	else if (start < fe->fs_sector) {
+		sector_t diff = fe->fs_sector - start;
+		int idx = 0;
+
+		BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED)));
+		BUG_ON(bio_data_dir(bio) != WRITE);
+
+		/*
+		 * Adjust where bio starts
+		 */
+		__bio_for_each_segment(bvec, bio, i, 0) {
+			unsigned int bsec = bvec->bv_len >> 9;
+			unsigned int this_diff = bsec;
+
+			if (!diff)
+				break;
+			if (this_diff > diff)
+				this_diff = diff;
+
+			bio->bi_sector += this_diff;
+			bio->bi_size -= (this_diff << 9);
+
+			/*
+			 * Bigger than this chunk, skip ahead.
+			 */
+			if (this_diff == bsec) {
+				idx++;
+				diff -= this_diff;
+				continue;
+			}
+
+			/*
+			 * Adjust this bvec
+			 */
+			bvec->bv_offset += (this_diff << 9);
+			bvec->bv_len -= (this_diff << 9);
+			break;
+		}
+		bio->bi_idx += idx;
+	}
+
+	/*
+	 * Goes beyond the end, shrink size.
+	 */
+	end = bio->bi_sector + (bio->bi_size >> 9);
+	if (end > fe->cache_sector + (fe->fs_size >> 9)) {
+		sector_t diff = end - (fe->cache_sector + (fe->fs_size >> 9));
+		int vecs = 0;
+
+		BUG_ON(!(bio->bi_flags & (1 << BIO_CLONED)));
+		BUG_ON(bio_data_dir(bio) != WRITE);
+
+		/*
+		 * This is __bio_for_each_segment_reverse().
+		 */
+		for (i = bio->bi_vcnt - 1; i >= bio->bi_idx; i--) {
+			struct bio_vec *bvec = &bio->bi_io_vec[i];
+			unsigned int bsec = bvec->bv_len >> 9;
+			unsigned int this_diff = bsec;
+
+			if (!diff)
+				break;
+			if (this_diff > diff)
+				this_diff = diff;
+
+			bio->bi_size -= (this_diff << 9);
+
+			/*
+			 * Bigger than this chunk, skip ahead.
+			 */
+			if (this_diff == bsec) {
+				vecs++;
+				diff -= this_diff;
+				continue;
+			}
+
+			/*
+			 * Adjust this bvec
+			 */
+			bvec->bv_len -= (this_diff << 9);
+			break;
+		}
+		bio->bi_vcnt -= vecs;
+	}
+
+	BUG_ON(bio->bi_sector < fe->cache_sector);
+	BUG_ON(bio->bi_sector + (bio->bi_size >> 9) > fe->cache_sector + (fe->fs_size >> 9));
+
+	/*
+	 * Invalidate the segment counts, we changed the bio layout.
+	 */
+	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+	bio->bi_flags |= (1 << BIO_NOMERGE);
+}
+
+static int fcache_overwrite_endio(struct bio *bio, unsigned int bytes, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	if (!bio_flagged(bio, BIO_UPTODATE)) {
+		struct fcache_dev *fdev = bio->bi_private;
+
+		printk(KERN_ERR "fcache: overwrite error, cache off\n");
+		set_bit(FDEV_F_DOWN, &fdev->flags);
+	}
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * Schedule overwrite of some existing block(s).
+ */
+static int fcache_overwrite_extent(struct fcache_dev *fdev,
+				   struct fcache_extent *fe, struct bio *bio)
+{
+	struct bio *clone = bio_clone(bio, GFP_NOFS);
+
+	clone->bi_bdev = fdev->bdev;
+	clone->bi_end_io = fcache_overwrite_endio;
+	clone->bi_private = fdev;
+	fcache_bio_align(clone, fe);
+	submit_bio(WRITE, clone);
+	return 0;
+}
+
+/*
+ * Our real data read is complete. Kick our process context handler so it
+ * can submit the write to our cache.
+ */
+static int fcache_extent_endio(struct bio *bio, unsigned int bytes, int err)
+{
+	struct fcache_dev *fdev;
+	struct fcache_endio_data *fed;
+	unsigned long flags;
+
+	if (bio->bi_size)
+		return 1;
+
+	fed = bio->bi_private;
+
+	if (!bio_flagged(bio, BIO_UPTODATE))
+		fed->io_error = -EIO;
+
+	bio_put(bio);
+
+	fdev = fed->fdev;
+	spin_lock_irqsave(&fdev->lock, flags);
+	list_add_tail(&fed->list, &fdev->list);
+	spin_unlock_irqrestore(&fdev->lock, flags);
+	queue_work(fcache_workqueue, &fdev->work);
+	return 0;
+}
+
+/*
+ * This initiates adding an extent to our list. We do this by cloning the
+ * original bio and submitting that to the real device and when that completes
+ * we write that out to the cache device and instantiate the extent.
+ */
+static int fcache_add_extent(struct fcache_dev *fdev, struct bio *bio)
+{
+	struct fcache_endio_data *fed;
+	struct bio *clone;
+
+	fed = mempool_alloc(fed_pool, GFP_NOIO);
+
+	fed->fdev = fdev;
+	fed->fs_sector = bio->bi_sector;
+	fed->fs_size = bio->bi_size;
+	fed->cache_sector = -1;
+	fed->bio = bio;
+	fed->io_error = 0;
+	INIT_LIST_HEAD(&fed->list);
+
+	/*
+	 * Allocate/assign an extent block for this range
+	 */
+	spin_lock_irq(&fdev->lock);
+	if (fdev->nr_extents < fdev->max_extents) {
+		unsigned long nr = (bio->bi_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+		if (fdev->next_cache_block + nr <= fdev->cache_blocks) {
+			fdev->nr_extents++;
+			fed->cache_sector = fdev->next_cache_block << BLOCK_SHIFT;
+			fdev->next_cache_block += nr;
+		}
+	}
+	spin_unlock_irq(&fdev->lock);
+
+	/*
+	 * Ran out of room
+	 */
+	if (fed->cache_sector == -1) {
+		printk(KERN_ERR "fcache: ran out of space, priming now off\n");
+		fdev->priming = 0;
+		mempool_free(fed, fed_pool);
+		return -ENOENT;
+	}
+
+	clone = bio_clone(bio, GFP_NOFS);
+	clone->bi_private = fed;
+	clone->bi_end_io = fcache_extent_endio;
+	clone->bi_rw |= (1 << BIO_RW_SYNC);
+
+	generic_make_request(clone);
+	return 0;
+}
+
+static int fcache_parse_extents(struct fcache_dev *fdev, void *addr,
+				unsigned int max_extents)
+{
+	int nr_extents = PAGE_SIZE / sizeof(struct fcache_extent);
+	int extents_read;
+
+	if (nr_extents > max_extents)
+		nr_extents = max_extents;
+
+	extents_read = 0;
+	while (nr_extents) {
+		struct fcache_extent *fe, *__fe = addr;
+
+		fe = kmem_cache_alloc(fcache_slab, GFP_KERNEL);
+		if (unlikely(!fe))
+			return -ENOMEM;
+
+		memset(fe, 0, sizeof(*fe));
+		fe->fs_sector = __fe->fs_sector;
+		fe->fs_size = __fe->fs_size;
+		fe->cache_sector = __fe->cache_sector;
+
+		fcache_tree_link(fdev, fe);
+
+		nr_extents--;
+		extents_read++;
+		addr += sizeof(*fe);
+	}
+
+	return extents_read;
+}
+
+static int fcache_read_extents(struct fcache_dev *fdev)
+{
+	unsigned int nr_extents = fdev->nr_extents;
+	int ret, extents, total_extents;
+	struct page *page;
+	sector_t index;
+	void *p;
+
+	page = alloc_page(GFP_KERNEL);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	ret = 0;
+	total_extents = 0;
+	index = FCACHE_EXTENT_BLOCK;
+	while (nr_extents) {
+		ret = fcache_rw_page(fdev, index, page, READ);
+		if (ret)
+			break;
+
+		p = page_address(page);
+		extents = fcache_parse_extents(fdev, p, nr_extents);
+
+		if (extents < 0) {
+			ret = extents;
+			break;
+		}
+
+		index++;
+		nr_extents -= extents;
+		total_extents += extents;
+	}
+
+	__free_page(page);
+
+	if (ret)
+		return ret;
+
+	return total_extents;
+}
+
+/*
+ * Read an existing fcache header from the device, and then proceed to
+ * reading and adding the extents to out prio tree.
+ */
+static int fcache_load_header(struct fcache_dev *fdev, int serial)
+{
+	struct fcache_header *header = NULL;
+	struct page *page;
+	int ret, wrong_serial = 0;
+	char b[BDEVNAME_SIZE];
+
+	page = alloc_page(GFP_HIGHUSER);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
+	if (unlikely(ret))
+		goto err;
+
+	ret = -EINVAL;
+	header = kmap_atomic(page, KM_USER0);
+	if (header->magic != FCACHE_MAGIC) {
+		printk(KERN_ERR "fcache: bad magic %x\n", header->magic);
+		goto err;
+	}
+	if (header->version != FCACHE_VERSION) {
+		printk(KERN_ERR "fcache: bad version %d\n", header->version);
+		goto err;
+	}
+	if (strcmp(bdevname(fdev->fs_bdev, b), header->fs_dev)) {
+		printk(KERN_ERR "fcache: device mismatch (%s/%s\n", b,
+							header->fs_dev);
+		goto err;
+	}
+	if (header->fs_start_sector != fdev->fs_start_sector ||
+	    header->fs_sectors != fdev->fs_sectors) {
+		printk(KERN_ERR "fcache: fs appears to have changed size\n");
+		goto err;
+	}
+
+	fdev->nr_extents = header->nr_extents;
+	fdev->max_extents = header->max_extents;
+
+	/*
+	 * Don't fail on out-of-date serial, just warn that the user needs
+	 * to prime the cache again. Until then we'll just bypass the cache.
+	 */
+	if (header->serial != serial) {
+		printk(KERN_ERR "fcache: found serial %d, expected %d.\n",
+							header->serial, serial);
+		printk(KERN_ERR "fcache: reprime the cache!\n");
+		wrong_serial = 1;
+	}
+
+	fdev->serial = header->serial;
+	kunmap_atomic(header, KM_USER0);
+	__free_page(page);
+
+	if (!wrong_serial) {
+		printk("fcache: header looks valid (extents=%ld extents, serial=%u)\n", fdev->nr_extents, fdev->serial);
+
+		ret = fcache_read_extents(fdev);
+		printk("fcache: loaded %d extents\n", ret);
+
+		/*
+		 * If we don't find all the extents we require, fail.
+		 */
+		if (ret != fdev->nr_extents) {
+			fcache_free_prio_tree(fdev);
+			ret = -EINVAL;
+		} else
+			ret = 0;
+	}
+
+	return ret;
+err:
+	__free_page(page);
+	if (header)
+		kunmap_atomic(header, KM_USER0);
+	return ret;
+}
+
+/*
+ * We use this range to decide when to log an io to the target device.
+ */
+static void fcache_fill_fs_size(struct fcache_dev *fdev)
+{
+	struct block_device *bdev = fdev->fs_bdev;
+
+	/*
+	 * Partition or whole device?
+	 */
+	if (bdev != bdev->bd_contains) {
+		struct hd_struct *p = bdev->bd_part;
+
+		fdev->fs_start_sector = p->start_sect;
+		fdev->fs_sectors = p->nr_sects;
+	} else {
+		fdev->fs_start_sector = 0;
+		fdev->fs_sectors = bdev->bd_inode->i_size >> 9;
+	}
+}
+
+static void fcache_fill_cache_size(struct fcache_dev *fdev)
+{
+	struct block_device *bdev = fdev->bdev;
+
+	/*
+	 * Partition or whole device?
+	 */
+	if (bdev != bdev->bd_contains) {
+		struct hd_struct *p = bdev->bd_part;
+
+		fdev->cache_start_sector = p->start_sect;
+		fdev->cache_blocks = p->nr_sects >> BLOCK_SHIFT;
+	} else {
+		fdev->cache_start_sector = 0;
+		fdev->cache_blocks = bdev->bd_inode->i_size >> PAGE_SHIFT;
+	}
+}
+
+/*
+ * This is a read request, check if we have that block. If we do, then
+ * just redirect. If not, pass it through.
+ */
+static int fcache_read_request(struct fcache_dev *fdev, request_queue_t *q,
+			       struct bio *bio)
+{
+	struct fcache_extent *extents[MAX_FE];
+	struct fcache_extent *fe;
+	int i, nr;
+
+	/*
+	 * Not there, redirect to original but schedule adding this extent
+	 * to our list if we are priming.
+	 */
+	nr = fcache_lookup_extent(fdev, bio->bi_sector, bio->bi_size, extents);
+	if (!nr) {
+		if (fdev->priming && !fcache_add_extent(fdev, bio))
+			return 0;
+
+		fdev->misses++;
+		return fdev->mfn(q, bio);
+	}
+
+	/*
+	 * If range is at least as big, we use our cache. If not, cop out
+	 * and just submit to real device.
+	 */
+	for (i = 0; i < nr; i++) {
+		sector_t end_fe, end_bi;
+		fe = extents[i];
+
+		end_fe = fe->fs_sector + (fe->fs_size >> 9);
+		end_bi = bio->bi_sector + (bio->bi_size >> 9);
+
+		/*
+		 * match!
+		 */
+		if (bio->bi_sector >= fe->fs_sector && end_bi <= end_fe)
+			break;
+
+		fe = NULL;
+	}
+
+	/*
+	 * Nopes, send to real device.
+	 */
+	if (!fe) {
+		fdev->misses++;
+		return fdev->mfn(q, bio);
+	}
+
+	/*
+	 * Perfect, adjust start offset if it isn't aligned.
+	 */
+	fdev->hits++;
+	fcache_bio_align(bio, fe);
+
+	/*
+	 * If we don't have to chop it up, just let generic_make_request()
+	 * handle the stacking. Otherwise, return handled and pass to chopper.
+	 */
+	if (fdev->chop_ios) {
+		struct fcache_endio_data *fed;
+
+		fed = mempool_alloc(fed_pool, GFP_NOIO);
+
+		fed->fdev = fdev;
+		fed->cache_sector = bio->bi_sector;
+		fed->fs_size = bio->bi_size;
+		fed->bio = bio;
+		fed->io_error = 0;
+		fcache_io_chopper(fdev, fed, fcache_chop_read_endio,
+					fcache_chop_read_done, READ);
+		return 0;
+	}
+
+	bio->bi_bdev = fdev->bdev;
+	return 1;
+}
+
+/*
+ * If we are priming the cache, always add this block. If not, then we still
+ * need to overwrite this block if it's in our cache.
+ */
+static int fcache_write_request(struct fcache_dev *fdev, request_queue_t *q,
+				struct bio *bio)
+{
+	struct fcache_extent *extents[MAX_FE];
+	struct fcache_extent *fe;
+	sector_t start = bio->bi_sector;
+	int i, nr;
+
+repeat:
+	nr = fcache_lookup_extent(fdev, bio->bi_sector, bio->bi_size, extents);
+
+	/*
+	 * Find out what to overwrite, if anything.
+	 */
+	for (i = 0; i < nr; i++) {
+		fe = extents[i];
+		fdev->overwrites++;
+		fcache_overwrite_extent(fdev, fe, bio);
+	}
+
+	/*
+	 * If i == MAX_FE, there _may_ be more extents. Repeat lookup, start
+	 * from the end of last request.
+	 */
+	if (i == MAX_FE) {
+		fe = extents[i - 1];
+		start = fe->fs_sector + (fe->fs_size >> 9);
+		goto repeat;
+	}
+
+	return fdev->mfn(q, bio);
+}
+
+/*
+ * This is the only case where we resubmit an io to the device but don't
+ * want to count it as part of io we log.
+ */
+#define fcache_bio_seen(bio)	((bio)->bi_end_io == fcache_extent_endio)
+
+static int fcache_make_request(request_queue_t *q, struct bio *bio)
+{
+	struct fcache_dev *fdev = &fcache_dev;
+
+	/*
+	 * If it's in the sector range we are monitoring and the device isn't
+	 * being shutdown, then pass it on. Assume a bio doesn't span into
+	 * the next partition, so don't bother accounting for size.
+	 */
+	if ((bio->bi_sector >= fdev->fs_start_sector) &&
+	    (bio->bi_sector < (fdev->fs_start_sector + fdev->fs_sectors)) &&
+	    !test_bit(FDEV_F_DOWN, &fdev->flags) &&
+	    !fcache_bio_seen(bio)) {
+
+		fdev->ios[bio_data_dir(bio)]++;
+
+		if (bio_data_dir(bio) == READ)
+			return fcache_read_request(fdev, q, bio);
+
+		return fcache_write_request(fdev, q, bio);
+	}
+
+	/*
+	 * Pass through to original make_request_fn.
+	 */
+	return fdev->mfn(q, bio);
+}
+
+/*
+ * Attach the cache device 'bdev' to 'fdev'.
+ */
+static int fcache_setup_dev(struct fcache_dev *fdev,
+			    struct block_device *fs_bdev,
+			    struct block_device *bdev,
+			    int priming, int serial)
+{
+	request_queue_t *fs_q, *cache_q;
+	char b[BDEVNAME_SIZE];
+	int ret;
+
+	memset(fdev, 0, sizeof(*fdev));
+	INIT_PRIO_TREE_ROOT(&fdev->prio_root);
+	spin_lock_init(&fdev->lock);
+	INIT_LIST_HEAD(&fdev->list);
+	INIT_WORK(&fdev->work, fcache_work, fdev);
+	fdev->priming = priming;
+	fdev->fs_bdev = fs_bdev;
+	fdev->bdev = bdev;
+
+	ret = -EINVAL;
+
+	fs_q = bdev_get_queue(fs_bdev);
+	cache_q = bdev_get_queue(bdev);
+	if (!fs_q || !cache_q)
+		goto out;
+
+	/*
+	 * Chop up outgoing ios, if the target is a different queue. We could
+	 * look closer at limits, but it's fragile and pretty pointless.
+	 */
+	if (fs_q != cache_q)
+		fdev->chop_ios = 1;
+
+	ret = bd_claim(bdev, fcache_setup_dev);
+	if (ret < 0)
+		goto out;
+
+	ret = block_size(bdev);
+	if (ret != PAGE_SIZE) {
+		fdev->old_bs = ret;
+		ret = set_blocksize(bdev, PAGE_SIZE);
+		if (ret < 0)
+			goto out_release;
+	} else
+		ret = 0;
+
+	fcache_fill_cache_size(fdev);
+	fcache_fill_fs_size(fdev);
+
+	if (priming) {
+		fdev->serial = serial;
+		ret = fcache_write_new_header(fdev);
+	} else
+		ret = fcache_load_header(fdev, serial);
+
+	if (!ret) {
+		printk("fcache: %s opened successfully (%spriming)\n",
+						bdevname(bdev, b),
+						priming ? "" : "not ");
+		return 0;
+	}
+
+out_release:
+	bd_release(fdev->bdev);
+out:
+	blkdev_put(fdev->bdev);
+	fdev->bdev = NULL;
+	return ret;
+}
+
+/*
+ * Return fdev->bdev to its original state.
+ */
+static void fcache_shutdown_dev(struct fcache_dev *fdev,
+				struct block_device *bdev)
+{
+	if (fdev->bdev) {
+		if (fdev->mfn) {
+			request_queue_t *q = bdev_get_queue(bdev);
+
+			(void) xchg(&q->make_request_fn, fdev->mfn);
+		}
+		sync_blockdev(fdev->bdev);
+		if (fdev->old_bs)
+			set_blocksize(fdev->bdev, fdev->old_bs);
+
+		bd_release(fdev->bdev);
+		blkdev_put(fdev->bdev);
+		fdev->bdev = NULL;
+		INIT_PRIO_TREE_ROOT(&fdev->prio_root);
+	}
+}
+
+/*
+ * bdev is the file system device, cache_dev is the device we want to store
+ * the cache on.
+ */
+int fcache_dev_open(struct block_device *bdev, unsigned long cache_dev,
+		    int priming, int serial)
+{
+	struct block_device *fcache_bdev;
+	request_queue_t *q;
+	int ret;
+
+	if (disable)
+		return 0;
+	if (fcache_dev.bdev)
+		return -EBUSY;
+
+	fcache_bdev = open_by_devnum(cache_dev, FMODE_READ|FMODE_WRITE);
+	if (IS_ERR(fcache_bdev))
+		return PTR_ERR(fcache_bdev);
+
+	ret = fcache_setup_dev(&fcache_dev, bdev, fcache_bdev, priming, serial);
+	if (ret)
+		return ret;
+
+	q = bdev_get_queue(bdev);
+	fcache_dev.mfn = xchg(&q->make_request_fn, fcache_make_request);
+	return 0;
+}
+
+EXPORT_SYMBOL(fcache_dev_open);
+
+void fcache_dev_close(struct block_device *bdev, int serial)
+{
+	struct fcache_dev *fdev = &fcache_dev;
+
+	if (disable)
+		return;
+
+	if (!fdev->bdev)
+		return;
+
+	printk("fcache: ios r/w %u/%u, hits %u, misses %u, overwrites %u\n",
+					fdev->ios[0], fdev->ios[1], fdev->hits,
+					fdev->misses, fdev->overwrites);
+	fdev->serial = serial;
+
+	sync_blockdev(bdev);
+	set_bit(FDEV_F_DOWN, &fdev->flags);
+
+	if (fdev->priming)
+		fcache_write_extents(fdev);
+
+	fcache_write_header(fdev);
+	fcache_free_prio_tree(fdev);
+	fcache_shutdown_dev(fdev, bdev);
+}
+
+EXPORT_SYMBOL(fcache_dev_close);
+
+static int fcache_init(void)
+{
+	fcache_slab = kmem_cache_create("fcache", sizeof(struct fcache_extent),
+					0, 0, NULL, NULL);
+	if (!fcache_slab)
+		return -ENOMEM;
+
+	fcache_fed_slab = kmem_cache_create("fcache_fed",
+					sizeof(struct fcache_endio_data), 0, 0,
+					NULL, NULL);
+	if (!fcache_fed_slab) {
+		kmem_cache_destroy(fcache_slab);
+		return -ENOMEM;
+	}
+
+	fed_pool = mempool_create_slab_pool(1, fcache_fed_slab);
+	if (!fed_pool) {
+		kmem_cache_destroy(fcache_slab);
+		kmem_cache_destroy(fcache_fed_slab);
+		return -ENOMEM;
+	}
+
+	fcache_workqueue = create_singlethread_workqueue("fcached");
+	if (!fcache_workqueue)
+		panic("fcache: failed to create fcached\n");
+
+	return 0;
+}
+
+static void fcache_exit(void)
+{
+	destroy_workqueue(fcache_workqueue);
+	kmem_cache_destroy(fcache_slab);
+	kmem_cache_destroy(fcache_fed_slab);
+	mempool_destroy(fed_pool);
+}
+
+MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
+MODULE_LICENSE("GPL");
+
+module_init(fcache_init);
+module_exit(fcache_exit);
Index: linux-ck-dev/fs/ext3/super.c
===================================================================
--- linux-ck-dev.orig/fs/ext3/super.c	2006-06-18 15:20:10.000000000 +1000
+++ linux-ck-dev/fs/ext3/super.c	2006-06-18 15:25:27.000000000 +1000
@@ -384,11 +384,43 @@ static void dump_orphan_list(struct supe
 	}
 }
 
+extern int fcache_dev_open(struct block_device *, unsigned long, int, int);
+extern int fcache_dev_close(struct block_device *, int);
+
+static void ext3_close_fcache(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_super_block *es = sbi->s_es;
+	int serial = le16_to_cpu(es->s_mnt_count);
+
+	fcache_dev_close(sb->s_bdev, serial);
+}
+
+static int ext3_open_fcache(struct super_block *sb, unsigned long cachedev)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_super_block *es = sbi->s_es;
+	int priming = test_opt(sb, FCACHEPRIME);
+	int serial = le16_to_cpu(es->s_mnt_count);
+	int ret;
+
+	ret = fcache_dev_open(sb->s_bdev, cachedev, priming, serial);
+	if (!ret) {
+		set_opt(sbi->s_mount_opt, FCACHE);
+		return 0;
+	}
+
+	printk(KERN_ERR "ext3: failed to open fcache (err=%d)\n", ret);
+	return ret;
+}
+
 static void ext3_put_super (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	int i;
+	int i, has_fcache;
+
+	has_fcache = test_opt(sb, FCACHE);
 
 	ext3_xattr_put_super(sb);
 	journal_destroy(sbi->s_journal);
@@ -431,6 +463,8 @@ static void ext3_put_super (struct super
 		invalidate_bdev(sbi->journal_bdev, 0);
 		ext3_blkdev_remove(sbi);
 	}
+	if (has_fcache)
+		ext3_close_fcache(sb);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 	return;
@@ -635,7 +669,7 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota
+	Opt_grpquota, Opt_fcache_dev, Opt_fcache_prime,
 };
 
 static match_table_t tokens = {
@@ -684,6 +718,8 @@ static match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
+	{Opt_fcache_dev, "fcache_dev=%s"},
+	{Opt_fcache_prime, "fcache_prime=%u"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
 };
@@ -710,6 +746,7 @@ static unsigned long get_sb_block(void *
 
 static int parse_options (char *options, struct super_block *sb,
 			  unsigned long *inum, unsigned long *journal_devnum,
+			  unsigned long *fcache_devnum,
 			  unsigned long *n_blocks_count, int is_remount)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
@@ -1012,6 +1049,29 @@ clear_qf_name:
 		case Opt_nobh:
 			set_opt(sbi->s_mount_opt, NOBH);
 			break;
+		case Opt_fcache_dev: {
+			int maj, min;
+			char *p, *pm;
+
+			if (!fcache_devnum)
+				break;
+			p = match_strdup(&args[0]);
+			if (!p)
+				return 0;
+			maj = simple_strtol(p, &pm, 10);
+			min = simple_strtol(pm + 1, NULL, 10);
+			*fcache_devnum = maj << MINORBITS | min;
+			kfree(p);
+			break;
+			}
+		case Opt_fcache_prime:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option)
+				set_opt(sbi->s_mount_opt, FCACHEPRIME);
+			else
+				clear_opt(sbi->s_mount_opt, FCACHEPRIME);
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT3-fs: Unrecognized mount option \"%s\" "
@@ -1346,6 +1406,7 @@ static int ext3_fill_super (struct super
 	unsigned long offset = 0;
 	unsigned long journal_inum = 0;
 	unsigned long journal_devnum = 0;
+	unsigned long fcache_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
 	int blocksize;
@@ -1353,6 +1414,7 @@ static int ext3_fill_super (struct super
 	int db_count;
 	int i;
 	int needs_recovery;
+	int fcache = 0;
 	__le32 features;
 
 	sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
@@ -1427,7 +1489,7 @@ static int ext3_fill_super (struct super
 	set_opt(sbi->s_mount_opt, RESERVATION);
 
 	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
-			    NULL, 0))
+			    &fcache_devnum, NULL, 0))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -1651,6 +1713,9 @@ static int ext3_fill_super (struct super
 		goto failed_mount2;
 	}
 
+	if (fcache_devnum)
+		fcache = ext3_open_fcache(sb, fcache_devnum);
+
 	/* We have now updated the journal if required, so we can
 	 * validate the data journaling mode. */
 	switch (test_opt(sb, DATA_FLAGS)) {
@@ -1740,6 +1805,8 @@ cantfind_ext3:
 	goto failed_mount;
 
 failed_mount3:
+	if (!fcache)
+		ext3_close_fcache(sb);
 	journal_destroy(sbi->s_journal);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
@@ -2205,6 +2272,7 @@ static int ext3_remount (struct super_bl
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	unsigned long n_blocks_count = 0;
 	unsigned long old_sb_flags;
+	unsigned long fcache_devnum = 0;
 	struct ext3_mount_options old_opts;
 	int err;
 #ifdef CONFIG_QUOTA
@@ -2226,7 +2294,7 @@ static int ext3_remount (struct super_bl
 	/*
 	 * Allow the "check" option to be passed as a remount option.
 	 */
-	if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+	if (!parse_options(data, sb, NULL, NULL, &fcache_devnum, &n_blocks_count, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
@@ -2241,6 +2309,11 @@ static int ext3_remount (struct super_bl
 
 	ext3_init_journal_params(sb, sbi->s_journal);
 
+	if (fcache_devnum) {
+		ext3_close_fcache(sb);
+		ext3_open_fcache(sb, fcache_devnum);
+	}
+
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 		n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
 		if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
Index: linux-ck-dev/include/linux/bio.h
===================================================================
--- linux-ck-dev.orig/include/linux/bio.h	2006-06-18 15:20:10.000000000 +1000
+++ linux-ck-dev/include/linux/bio.h	2006-06-18 15:25:27.000000000 +1000
@@ -124,6 +124,7 @@ struct bio {
 #define BIO_BOUNCED	5	/* bio is a bounce bio */
 #define BIO_USER_MAPPED 6	/* contains user pages */
 #define BIO_EOPNOTSUPP	7	/* not supported */
+#define BIO_NOMERGE	8	/* bio not mergeable */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
@@ -179,6 +180,14 @@ struct bio {
 #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 
+static inline int bio_mergeable(struct bio *bio)
+{
+	if (!bio_barrier(bio) && !bio->bi_idx && !bio_flagged(bio, BIO_NOMERGE))
+		return 1;
+
+	return 0;
+}
+
 /*
  * will die
  */
Index: linux-ck-dev/include/linux/ext3_fs.h
===================================================================
--- linux-ck-dev.orig/include/linux/ext3_fs.h	2006-06-18 15:20:10.000000000 +1000
+++ linux-ck-dev/include/linux/ext3_fs.h	2006-06-18 15:25:27.000000000 +1000
@@ -376,6 +376,8 @@ struct ext3_inode {
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT3_MOUNT_FCACHE		0x400000 /* using fcache */
+#define EXT3_MOUNT_FCACHEPRIME		0x800000 /* priming fcache */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -847,6 +849,18 @@ extern struct inode_operations ext3_spec
 extern struct inode_operations ext3_symlink_inode_operations;
 extern struct inode_operations ext3_fast_symlink_inode_operations;
 
+#ifndef CONFIG_BLK_FCACHE
+static inline int fcache_dev_open(struct block_device *bdev,
+			unsigned long cache_dev, int priming, int serial)
+{
+	return -ENODEV;
+}
+
+static inline int fcache_dev_close(struct block_device *bdev, int serial)
+{
+	return 0;
+}
+#endif	/* CONFIG_BLK_FCACHE */
 
 #endif	/* __KERNEL__ */
 

-- 
-ck

             reply	other threads:[~2006-06-18  7:36 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-06-18  7:35 Con Kolivas [this message]
2006-06-18 10:13 ` [ck] [ckpatch][28/29] fs-fcache-v2.1.patch André Goddard Rosa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200606181735.30466.kernel@kolivas.org \
    --to=kernel@kolivas.org \
    --cc=ck@vds.kolivas.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.