* [PATCH 1/2] HACK: ext3: mount fast even when recovering
2009-07-14 14:05 [PATCH 0/2] ext3 HACKs Adrian Hunter
@ 2009-07-14 14:05 ` Adrian Hunter
2009-07-14 21:34 ` Andrew Morton
2009-07-14 14:06 ` [PATCH 2/2] HACK: do I/O read requests while ext3 journal recovers Adrian Hunter
From: Adrian Hunter @ 2009-07-14 14:05 UTC
To: Stephen Tweedie, Andreas Dilger, Andrew Morton
Cc: Adrian Hunter, linux-ext4, Artem Bityutskiy
From 40c3dac03ac40d03d987b2b1385ab3e68277067b Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Fri, 3 Jul 2009 15:25:13 +0300
Subject: [PATCH] HACK: ext3: mount fast even when recovering
Speed up ext3 recovery mount time by not sync'ing the
block device. Instead, place all dirty buffers into the
I/O queue and add a write barrier. This ensures that no
subsequent write reaches the disk before the recovery
writes, while sparing us the wait for the I/O to complete.
Note that ext3 reads sectors the correct way, through the
buffer cache, so there is no risk of reading old metadata.
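The core of the change boils down to the following pattern
(a minimal sketch of the idea only; the helper name is
illustrative, the real code is in ext3_commit_super() and
journal_update_superblock() in the diff below):

        /*
         * Sketch of the "barrier instead of sync" idea; not part of
         * the patch, the real code lives in the diff below.
         */
        static void submit_sb_ordered_nowait(struct buffer_head *bh)
        {
                mark_buffer_dirty(bh);
                /*
                 * Mark the buffer ordered so that submit_bh() issues the
                 * write as WRITE_BARRIER: it cannot overtake any earlier
                 * queued write.
                 */
                set_buffer_ordered(bh);
                /* Queue the write, but return without waiting for it. */
                ll_rw_block(SWRITE, 1, &bh);
                clear_buffer_ordered(bh);
        }

On the recovery side, sync_blockdev() is replaced by
filemap_fdatawrite() on the block device's mapping, so the dirty
recovery buffers are queued but not waited for; the ordered
superblock write above then fences them.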
Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
---
fs/ext3/super.c | 66 ++++++++++++++++++++++++++++++++++++++++++----
fs/jbd/journal.c | 23 ++++++++++++----
fs/jbd/recovery.c | 19 +++++++++++++-
include/linux/ext3_fs.h | 1 +
include/linux/jbd.h | 1 +
5 files changed, 97 insertions(+), 13 deletions(-)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f4be66e..59efefb 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1263,7 +1263,13 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
ext3_update_dynamic_rev(sb);
EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
- ext3_commit_super(sb, es, 1);
+ /*
+ * If we are in a hurry, we do not need to wait for the super block to
+ * reach the disk. We just need to make sure that all previous writes
+ * arrive before it. Setting the sync parameter to 2 will cause a
+ * write barrier to be added but will not wait for the I/O to complete.
+ */
+ ext3_commit_super(sb, es, test_opt(sb, FAST) ? 2 : 1);
if (test_opt(sb, DEBUG))
printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, "
"bpg=%lu, ipg=%lu, mo=%04lx]\n",
@@ -1622,6 +1628,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, RESERVATION);
+ /*
+ * Set an option to indicate that we want to mount fast even
+ * when recovering. That is achieved by not sync'ing the
+ * block device, but instead placing all dirty buffers into
+ * the I/O queue and adding a write barrier.
+ */
+ set_opt(sbi->s_mount_opt, FAST);
+
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
@@ -2007,6 +2021,12 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
else
journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
+ /*
+ * Tell the journal about our fast mounting scheme, so it can
+ * play its part.
+ */
+ if (test_opt(sb, FAST))
+ journal->j_flags |= JFS_LOAD_FAST;
spin_unlock(&journal->j_state_lock);
}
@@ -2224,7 +2244,13 @@ static int ext3_load_journal(struct super_block *sb,
mark_sb_dirty(sb);
/* Make sure we flush the recovery flag to disk. */
- ext3_commit_super(sb, es, 1);
+ /*
+ * The super gets committed later by 'ext3_setup_super()'
+ * or 'ext3_mark_recovery_complete()' anyway, so if we are
+ * in a hurry we can skip it here.
+ */
+ if (!test_opt(sb, FAST))
+ ext3_commit_super(sb, es, 1);
}
return 0;
@@ -2285,7 +2311,16 @@ static void ext3_commit_super (struct super_block * sb,
es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
- if (sync)
+ if (sync == 2) {
+ /*
+ * Caller has requested that a barrier be used, so that this
+ * write will not reach the disk before any previous ones,
+ * and we will not have to wait for it either.
+ */
+ set_buffer_ordered(sbh);
+ ll_rw_block(SWRITE, 1, &sbh);
+ clear_buffer_ordered(sbh);
+ } else if (sync)
sync_dirty_buffer(sbh);
}
@@ -2301,15 +2336,29 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
journal_t *journal = EXT3_SB(sb)->s_journal;
journal_lock_updates(journal);
- if (journal_flush(journal) < 0)
+
+ /*
+ * There is no need to flush the journal so skip it if we are in a
+ * hurry.
+ */
+ if (!test_opt(sb, FAST) && journal_flush(journal) < 0)
goto out;
lock_super(sb);
if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
sb->s_flags & MS_RDONLY) {
+ /*
+ * If we are in a hurry, we do not need to wait for the super
+ * block to reach the disk. We just need to make sure that
+ * all previous writes arrive before it. Setting the sync
+ * parameter to 2 will cause a write barrier to be added but
+ * will not wait for the I/O to complete.
+ */
+ int sync = test_opt(sb, FAST) ? 2 : 1;
+
EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
mark_sb_clean(sb);
- ext3_commit_super(sb, es, 1);
+ ext3_commit_super(sb, es, sync);
}
unlock_super(sb);
@@ -2348,7 +2397,12 @@ static void ext3_clear_journal_err(struct super_block * sb,
EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
- ext3_commit_super (sb, es, 1);
+ /*
+ * The super gets committed later by 'ext3_setup_super()'
+ * anyway, so if we are in a hurry we can skip it here.
+ */
+ if (!test_opt(sb, FAST))
+ ext3_commit_super (sb, es, 1);
journal_clear_err(journal);
}
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9e4fa52..3fd14ef 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -822,7 +822,7 @@ static void journal_fail_superblock (journal_t *journal)
* subsequent use.
*/
-static int journal_reset(journal_t *journal)
+static int journal_reset(journal_t *journal, int wait)
{
journal_superblock_t *sb = journal->j_superblock;
unsigned long first, last;
@@ -844,7 +844,7 @@ static int journal_reset(journal_t *journal)
journal->j_max_transaction_buffers = journal->j_maxlen / 4;
/* Add the dynamic fields and write it to disk. */
- journal_update_superblock(journal, 1);
+ journal_update_superblock(journal, wait);
return journal_start_thread(journal);
}
@@ -916,13 +916,14 @@ int journal_create(journal_t *journal)
journal->j_flags &= ~JFS_ABORT;
journal->j_format_version = 2;
- return journal_reset(journal);
+ return journal_reset(journal, 1);
}
/**
* void journal_update_superblock() - Update journal sb on disk.
* @journal: The journal to update.
* @wait: Set to '0' if you don't want to wait for IO completion.
+ * Note that a write barrier is used in that case.
*
* Update a journal's dynamic superblock fields and write it to disk,
* optionally waiting for the IO to complete.
@@ -961,8 +962,11 @@ void journal_update_superblock(journal_t *journal, int wait)
mark_buffer_dirty(bh);
if (wait)
sync_dirty_buffer(bh);
- else
+ else {
+ set_buffer_ordered(bh);
ll_rw_block(SWRITE, 1, &bh);
+ clear_buffer_ordered(bh);
+ }
out:
/* If we have just flushed the log (by marking s_start==0), then
@@ -1073,7 +1077,7 @@ static int load_superblock(journal_t *journal)
*/
int journal_load(journal_t *journal)
{
- int err;
+ int err, wait;
journal_superblock_t *sb;
err = load_superblock(journal);
@@ -1103,7 +1107,14 @@ int journal_load(journal_t *journal)
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
- if (journal_reset(journal))
+ /*
+ * If we are in a hurry, tell the reset not to wait, which will
+ * cause the journal superblock buffer to be placed into the I/O
+ * queue with a barrier, but we will not wait for the I/O to
+ * complete.
+ */
+ wait = journal->j_flags & JFS_LOAD_FAST ? 0 : 1;
+ if (journal_reset(journal, wait))
goto recovery_error;
journal->j_flags &= ~JFS_ABORT;
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982..a245c36 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -261,7 +261,24 @@ int journal_recover(journal_t *journal)
journal->j_transaction_sequence = ++info.end_transaction;
journal_clear_revoke(journal);
- err2 = sync_blockdev(journal->j_fs_dev);
+ /*
+ * We can massively speed up the recovery mount time by avoiding
+ * synchronizing the block device. Instead, we just throw all the
+ * dirty buffers into the I/O queue, and rely on callers to add
+ * a write barrier.
+ */
+ if (journal->j_flags & JFS_LOAD_FAST) {
+ struct block_device *bdev = journal->j_fs_dev;
+
+ err2 = 0;
+ if (bdev) {
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages)
+ err2 = filemap_fdatawrite(mapping);
+ }
+ } else
+ err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index d14f029..117e7a1 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -382,6 +382,7 @@ struct ext3_inode {
#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
* error in ordered mode */
+#define EXT3_MOUNT_FAST 0x800000 /* Do not sync during recovery */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 346e2b8..06459ca 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -819,6 +819,7 @@ struct journal_s
#define JFS_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file
* data write error in ordered
* mode */
+#define JFS_LOAD_FAST 0x080 /* Do not sync during recovery */
/*
* Function declarations for the journaling transaction and buffer
--
1.5.6.3
* [PATCH 2/2] HACK: do I/O read requests while ext3 journal recovers
2009-07-14 14:05 [PATCH 0/2] ext3 HACKs Adrian Hunter
2009-07-14 14:05 ` [PATCH 1/2] HACK: ext3: mount fast even when recovering Adrian Hunter
@ 2009-07-14 14:06 ` Adrian Hunter
From: Adrian Hunter @ 2009-07-14 14:06 UTC
To: Stephen Tweedie, Andreas Dilger, Andrew Morton
Cc: Artem Bityutskiy, linux-ext4, Adrian Hunter
From c034a8b69ecc13ef924edd342ff945f890ebac61 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@nokia.com>
Date: Tue, 14 Jul 2009 12:58:34 +0300
Subject: [PATCH] HACK: do I/O read requests while ext3 journal recovers
The ext3 journal can take a long time to recover at mount
time. That was partially fixed by placing a barrier into
the I/O queue and then not waiting for the actual I/O to
complete.
However, the barrier stops all other I/O, making the file
system unresponsive until the journal I/O completes
anyway.
This hack allows I/O read requests to jump the barrier
to the front of the I/O queue.
Note that the hack only takes effect while the ext3 journal
is recovering.
Note also that, in the normal situation, the I/O scheduler
is entitled to reorder I/O requests however it pleases,
so jumping read requests to the front is quite valid.
Where the normal rules are broken is that a barrier is
being jumped over. That is safe for two reasons:
- barriers are not otherwise used by ext3, vfat or swap
- all ext3 I/O goes through buffers, so any attempt to
read sectors that have not yet been written will be
satisfied from the buffers instead.
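For reference, a minimal sketch of the submit side as it ends up
after this patch (the helper name is illustrative; the real code is
in ext3_commit_super() and journal_update_superblock()):

        /*
         * Sketch only: how a superblock write starts leapfrog mode.
         * The flag travels buffer_head (BH_Leapfrog) -> bio
         * (BIO_RW_LEAPFROG) -> request (REQ_LEAPFROG) -> request_queue
         * (the q->leapfrog counter).
         */
        static void submit_sb_leapfrog(struct buffer_head *sbh)
        {
                mark_buffer_dirty(sbh);
                /* Barrier: must not overtake earlier queued writes. */
                set_buffer_ordered(sbh);
                /*
                 * Leapfrog: until the block driver actually starts this
                 * request, new READ requests may be inserted or merged
                 * ahead of it.
                 */
                set_buffer_leapfrog(sbh);
                ll_rw_block(SWRITE, 1, &sbh);
                clear_buffer_leapfrog(sbh);
                clear_buffer_ordered(sbh);
        }

While q->leapfrog is non-zero, __make_request() either back-merges an
incoming READ bio into an unstarted READ request at the head of the
dispatch queue, or inserts a new request there using
ELEVATOR_INSERT_FRONT_BACK.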
Signed-off-by: Adrian Hunter <adrian.hunter@nokia.com>
---
block/blk-core.c | 121 ++++++++++++++++++++++++++++++++++++++++++-
block/elevator.c | 37 +++++++++++++
fs/buffer.c | 9 +++-
fs/ext3/super.c | 8 +++
fs/jbd/journal.c | 8 +++
include/linux/bio.h | 3 +
include/linux/blkdev.h | 12 ++++
include/linux/buffer_head.h | 2 +
include/linux/elevator.h | 1 +
include/linux/fs.h | 1 +
10 files changed, 199 insertions(+), 3 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index c36aa98..66ac9b5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1003,6 +1003,23 @@ static inline void add_request(struct request_queue *q, struct request *req)
__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
}
+/*
+ * Leapfrog requests are inserted with a special 'where' code:
+ * ELEVATOR_INSERT_FRONT_BACK which means the back of the READ requests that
+ * are at the front of the dispatch queue.
+ */
+static inline void request_leapfrog(struct request_queue *q,
+ struct request *req)
+{
+ drive_stat_acct(req, 1);
+
+ /*
+ * elevator indicated where it wants this request to be
+ * inserted at elevator_merge time
+ */
+ __elv_add_request(q, req, ELEVATOR_INSERT_FRONT_BACK, 0);
+}
+
static void part_round_stats_single(int cpu, struct hd_struct *part,
unsigned long now)
{
@@ -1117,6 +1134,13 @@ void init_request_from_bio(struct request *req, struct bio *bio)
if (bio_rw_meta(bio))
req->cmd_flags |= REQ_RW_META;
+ /*
+ * The bio says to start leapfrog mode, so set the request
+ * to say the same.
+ */
+ if (bio_leapfrog(bio))
+ req->cmd_flags |= REQ_LEAPFROG;
+
req->errors = 0;
req->hard_sector = req->sector = bio->bi_sector;
req->ioprio = bio_prio(bio);
@@ -1124,13 +1148,68 @@ void init_request_from_bio(struct request *req, struct bio *bio)
blk_rq_bio_prep(req->q, req, bio);
}
+/*
+ * This is the same as elv_rq_merge_ok, but in leapfrog mode we are
+ * merging into the dispatch queue and do not want to involve the
+ * I/O scheduler in any way.
+ */
+static int elv_rq_leapfrog_merge_ok(struct request *rq, struct bio *bio)
+{
+ if (!rq_mergeable(rq))
+ return 0;
+
+ /*
+ * Don't merge file system requests and discard requests
+ */
+ if (bio_discard(bio) != bio_discard(rq->bio))
+ return 0;
+
+ /*
+ * different data direction or already started, don't merge
+ */
+ if (bio_data_dir(bio) != rq_data_dir(rq))
+ return 0;
+
+ /*
+ * must be same device and not a special request
+ */
+ if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
+ return 0;
+
+ /*
+ * only merge integrity protected bio into ditto rq
+ */
+ if (bio_integrity(bio) != blk_integrity_rq(rq))
+ return 0;
+
+ return 1;
+}
+
+/* This is the same as elv_try_merge but calls elv_rq_leapfrog_merge_ok */
+static inline int elv_try_leapfrog_merge(struct request *__rq, struct bio *bio)
+{
+ int ret = ELEVATOR_NO_MERGE;
+
+ /*
+ * we can merge and sequence is ok, check if it's possible
+ */
+ if (elv_rq_leapfrog_merge_ok(__rq, bio)) {
+ if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
+ ret = ELEVATOR_BACK_MERGE;
+ else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
+ ret = ELEVATOR_FRONT_MERGE;
+ }
+
+ return ret;
+}
+
static int __make_request(struct request_queue *q, struct bio *bio)
{
struct request *req;
int el_ret, nr_sectors, barrier, discard, err;
const unsigned short prio = bio_prio(bio);
const int sync = bio_sync(bio);
- int rw_flags;
+ int rw_flags, leapfrog = 0;
nr_sectors = bio_sectors(bio);
@@ -1159,6 +1238,40 @@ static int __make_request(struct request_queue *q, struct bio *bio)
if (unlikely(barrier) || elv_queue_empty(q))
goto get_rq;
+ /*
+ * If the request queue is in leapfrog mode, leapfrog READs to the
+ * front of the queue.
+ */
+ if (unlikely(q->leapfrog) && !discard && (bio->bi_rw & (1 << BIO_RW)) == READ) {
+ /* Look in the dispatch queue for a request to merge with */
+ list_for_each_entry(req, &q->queue_head, queuelist) {
+ if (req->cmd_flags & REQ_STARTED)
+ continue;
+ if (rq_data_dir(req) == READ) {
+ /* Try to merge bio into request */
+ el_ret = elv_try_leapfrog_merge(req, bio);
+ /* Front merges are uncommon, so just do back merges */
+ if (el_ret == ELEVATOR_BACK_MERGE && ll_back_merge_fn(q, req, bio)) {
+ /* Merge is OK so plonk bio into this request and we are done */
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+ req->biotail->bi_next = bio;
+ req->biotail = bio;
+ req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+ req->ioprio = ioprio_best(req->ioprio, prio);
+ if (!blk_rq_cpu_valid(req))
+ req->cpu = bio->bi_comp_cpu;
+ drive_stat_acct(req, 0);
+ goto out;
+ }
+ continue;
+ }
+ break;
+ }
+ /* Was not able to merge so create a new request */
+ leapfrog = 1;
+ goto get_rq;
+ }
+
el_ret = elv_merge(q, &req, bio);
switch (el_ret) {
case ELEVATOR_BACK_MERGE:
@@ -1244,7 +1357,11 @@ get_rq:
req->cpu = blk_cpu_to_group(smp_processor_id());
if (elv_queue_empty(q))
blk_plug_device(q);
- add_request(q, req);
+ /* Leapfrogging requests are added specially */
+ if (unlikely(leapfrog))
+ request_leapfrog(q, req);
+ else
+ add_request(q, req);
out:
if (sync)
__generic_unplug_device(q);
diff --git a/block/elevator.c b/block/elevator.c
index a6951f7..80dbd18 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -663,6 +663,31 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
list_add_tail(&rq->queuelist, pos);
break;
+ case ELEVATOR_INSERT_FRONT_BACK:
+ /*
+ * New 'where' code for leapfrog mode. Put the request at the
+ * front of the queue but after any requests that have already
+ * started, and after other READ requests.
+ */
+ {
+ struct request *r;
+ struct list_head *p = &q->queue_head;
+
+ list_for_each_entry(r, &q->queue_head, queuelist) {
+ if (r->cmd_flags & REQ_STARTED) {
+ p = &r->queuelist;
+ continue;
+ }
+ if (rq_data_dir(r) == READ) {
+ p = &r->queuelist;
+ continue;
+ }
+ break;
+ }
+ list_add(&rq->queuelist, p);
+ break;
+ }
+
default:
printk(KERN_ERR "%s: bad insertion point %d\n",
__func__, where);
@@ -691,6 +716,10 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
if (blk_barrier_rq(rq))
q->ordcolor ^= 1;
+ /* A request marked as 'leapfrog' causes leapfrog mode to start */
+ if (blk_leapfrog_rq(rq))
+ q->leapfrog += 1;
+
/*
* barriers implicitly indicate back insertion
*/
@@ -773,6 +802,14 @@ struct request *elv_next_request(struct request_queue *q)
*/
rq->cmd_flags |= REQ_STARTED;
blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+
+ /*
+ * If this request started leapfrog mode, then
+ * leapfrog mode stops now that this request is
+ * starting.
+ */
+ if (blk_leapfrog_rq(rq))
+ q->leapfrog -= 1;
}
if (!q->boundary_rq || q->boundary_rq == rq) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 10179cf..b4f3b92 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2930,8 +2930,15 @@ int submit_bh(int rw, struct buffer_head * bh)
* Mask in barrier bit for a write (could be either a WRITE or a
* WRITE_SYNC
*/
- if (buffer_ordered(bh) && (rw & WRITE))
+ if (buffer_ordered(bh) && (rw & WRITE)) {
rw |= WRITE_BARRIER;
+ /*
+ * If the buffer says to start leapfrog mode, then flag it
+ * on the bio too.
+ */
+ if (buffer_leapfrog(bh))
+ rw |= LEAPFROG;
+ }
/*
* Only clear out a write error when rewriting
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 59efefb..b75a825 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2317,8 +2317,16 @@ static void ext3_commit_super (struct super_block * sb,
* write will not reach the disk before any previous ones,
* and we will not have to wait for it either.
*/
+ /*
+ * Start leapfrog mode. Leapfrog mode continues until the
+ * associated I/O request is started by the underlying
+ * block driver. Note that the request is also a barrier
+ * so it is never merged with another request.
+ */
set_buffer_ordered(sbh);
+ set_buffer_leapfrog(sbh);
ll_rw_block(SWRITE, 1, &sbh);
+ clear_buffer_leapfrog(sbh);
clear_buffer_ordered(sbh);
} else if (sync)
sync_dirty_buffer(sbh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 3fd14ef..5e3628c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -963,8 +963,16 @@ void journal_update_superblock(journal_t *journal, int wait)
if (wait)
sync_dirty_buffer(bh);
else {
+ /*
+ * Start leapfrog mode. Leapfrog mode continues until the
+ * associated I/O request is started by the underlying
+ * block driver. Note that the request is also a barrier
+ * so it is never merged with another request.
+ */
set_buffer_ordered(bh);
+ set_buffer_leapfrog(bh);
ll_rw_block(SWRITE, 1, &bh);
+ clear_buffer_leapfrog(bh);
clear_buffer_ordered(bh);
}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6a64209..43bd58d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -150,6 +150,7 @@ struct bio {
* bit 7 -- fail fast transport errors
* bit 8 -- fail fast driver errors
* Don't want driver retries for any fast fail whatever the reason.
+ * bit 9 -- start leapfrog mode
*/
#define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */
#define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */
@@ -160,6 +161,7 @@ struct bio {
#define BIO_RW_FAILFAST_DEV 6
#define BIO_RW_FAILFAST_TRANSPORT 7
#define BIO_RW_FAILFAST_DRIVER 8
+#define BIO_RW_LEAPFROG 9
/*
* upper 16 bits of bi_rw define the io priority of this bio
@@ -194,6 +196,7 @@ struct bio {
#define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META))
#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD))
#define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
+#define bio_leapfrog(bio) ((bio)->bi_rw & (1 << BIO_RW_LEAPFROG))
static inline unsigned int bio_cur_sectors(struct bio *bio)
{
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 031a315..3ed0639 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -109,6 +109,7 @@ enum rq_flag_bits {
__REQ_RW_META, /* metadata io request */
__REQ_COPY_USER, /* contains copies of user pages */
__REQ_INTEGRITY, /* integrity metadata has been remapped */
+ __REQ_LEAPFROG, /* start leapfrog mode */
__REQ_NR_BITS, /* stops here */
};
@@ -135,6 +136,7 @@ enum rq_flag_bits {
#define REQ_RW_META (1 << __REQ_RW_META)
#define REQ_COPY_USER (1 << __REQ_COPY_USER)
#define REQ_INTEGRITY (1 << __REQ_INTEGRITY)
+#define REQ_LEAPFROG (1 << __REQ_LEAPFROG)
#define BLK_MAX_CDB 16
@@ -399,6 +401,15 @@ struct request_queue
unsigned int dma_pad_mask;
unsigned int dma_alignment;
+ /*
+ * Flag indicating leapfrog mode. When a request also
+ * has a leapfrog flag, then the request queue starts
+ * leapfrog mode. When that request is finally started,
+ * leapfrog mode ends. Here 'leapfrog' is a counter, so
+ * if 2 requests start leapfrog mode, then the value is 2.
+ */
+ unsigned int leapfrog;
+
struct blk_queue_tag *queue_tags;
struct list_head tag_busy_list;
@@ -584,6 +595,7 @@ enum {
#define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER)
#define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA)
#define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD)
+#define blk_leapfrog_rq(rq) ((rq)->cmd_flags & REQ_LEAPFROG)
#define blk_bidi_rq(rq) ((rq)->next_rq != NULL)
#define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
/* rq->queuelist of dequeued request must be list_empty() */
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3ce64b9..2b73a1f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -35,6 +35,7 @@ enum bh_state_bits {
BH_Ordered, /* ordered write */
BH_Eopnotsupp, /* operation not supported (barrier) */
BH_Unwritten, /* Buffer is allocated on disk but not written */
+ BH_Leapfrog, /* Start leapfrog mode */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -127,6 +128,7 @@ BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Ordered, ordered)
BUFFER_FNS(Eopnotsupp, eopnotsupp)
BUFFER_FNS(Unwritten, unwritten)
+BUFFER_FNS(Leapfrog, leapfrog)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
#define touch_buffer(bh) mark_page_accessed(bh->b_page)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 92f6f63..e5112c4 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -160,6 +160,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define ELEVATOR_INSERT_BACK 2
#define ELEVATOR_INSERT_SORT 3
#define ELEVATOR_INSERT_REQUEUE 4
+#define ELEVATOR_INSERT_FRONT_BACK 5
/*
* return values from elevator_may_queue_fn
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aaa6291..1635a41 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -95,6 +95,7 @@ extern int dir_notify_enable;
#define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER))
#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
+#define LEAPFROG (1 << BIO_RW_LEAPFROG)
#define SEL_IN 1
#define SEL_OUT 2
--
1.5.6.3