* [patch 1/4] [md] Add SKIP_RESYNC ioctl
From: scjody @ 2009-10-01 22:39 UTC
To: linux-ext4, linux-raid; +Cc: linux-kernel, Andreas Dilger
Add a SKIP_RESYNC ioctl to md, allowing resync to be skipped on an MD device
or partition.

Design note: I expect there to be one skip_list entry (unpartitioned MD
device) or just a few (partitioned MD device), so searching a linked list is
not a performance concern.
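
For context, a minimal userspace sketch of invoking the new ioctl (not part
of this patch; /dev/md0 is illustrative, and the patched md_u.h must be
installed):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>        /* MD_MAJOR, used by the _IO() macro */
#include <linux/raid/md_u.h>    /* SKIP_RESYNC */

int main(void)
{
	/* /dev/md0 is illustrative; any MD device or partition works. */
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* SKIP_RESYNC takes no argument: md derives the sector range
	 * from the device or partition behind the file descriptor. */
	if (ioctl(fd, SKIP_RESYNC, 0) < 0)
		perror("SKIP_RESYNC");
	close(fd);
	return 0;
}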
Index: linux-2.6.18-128.1.6/drivers/md/md.c
===================================================================
--- linux-2.6.18-128.1.6.orig/drivers/md/md.c
+++ linux-2.6.18-128.1.6/drivers/md/md.c
@@ -314,12 +314,13 @@ static inline int mddev_trylock(mddev_t
return mutex_trylock(&mddev->reconfig_mutex);
}
-static inline void mddev_unlock(mddev_t * mddev)
+inline void mddev_unlock(mddev_t * mddev)
{
mutex_unlock(&mddev->reconfig_mutex);
md_wakeup_thread(mddev->thread);
}
+EXPORT_SYMBOL_GPL(mddev_unlock);
static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
@@ -4484,6 +4485,33 @@ static int md_ioctl(struct inode *inode,
err = set_bitmap_file(mddev, (int)arg);
goto done_unlock;
+ case SKIP_RESYNC:
+ {
+ struct hd_struct *part = inode->i_bdev->bd_part;
+ sector_t start, end;
+
+ if (mddev->pers == NULL) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ if (mddev->pers->skip_resync == NULL) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+ if (part) {
+ start = part->start_sect;
+ end = part->start_sect + part->nr_sects - 1;
+ } else {
+ start = 0;
+ end = (mddev->array_size<<1) - 1;
+ }
+
+ err = mddev->pers->skip_resync(mddev, start, end);
+ goto done_unlock;
+ }
+
default:
err = -EINVAL;
goto abort_unlock;
Index: linux-2.6.18-128.1.6/include/linux/raid/md_u.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/md_u.h
+++ linux-2.6.18-128.1.6/include/linux/raid/md_u.h
@@ -45,6 +45,7 @@
#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
+#define SKIP_RESYNC _IO (MD_MAJOR, 0x40)
typedef struct mdu_version_s {
int major;
Index: linux-2.6.18-128.1.6/include/linux/raid/md_k.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/md_k.h
+++ linux-2.6.18-128.1.6/include/linux/raid/md_k.h
@@ -283,6 +283,7 @@ struct mdk_personality
* others - reserved
*/
void (*quiesce) (mddev_t *mddev, int state);
+ int (*skip_resync) (mddev_t *mddev, sector_t start, sector_t end);
};
Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
===================================================================
--- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c
+++ linux-2.6.18-128.1.6/drivers/md/raid5.c
@@ -2827,6 +2827,72 @@ static inline int raid5_redo_bio(raid5_c
return redo;
}
+/*
+ * Mark the range of sectors start-end to be skipped during the current
+ * resync. If no resync is in progress, this will be ignored.
+ */
+static int skip_resync(mddev_t *mddev, sector_t start, sector_t end)
+{
+ struct skip_entry *new;
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ unsigned int dd_idx, pd_idx, disks, data_disks;
+
+ if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ return 0;
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (new == NULL)
+ return -ENOMEM;
+
+ disks = conf->raid_disks;
+ data_disks = disks - conf->max_degraded;
+
+ new->start = raid5_compute_sector(start, disks, data_disks,
+ &dd_idx, &pd_idx, conf);
+ new->end = raid5_compute_sector(end, disks, data_disks,
+ &dd_idx, &pd_idx, conf);
+ spin_lock_irq(&conf->device_lock);
+ list_add(&new->skip_list, &conf->skip_list);
+ spin_unlock_irq(&conf->device_lock);
+
+ return 0;
+}
+
+/*
+ * Check to see if this sector should be skipped. If so, return the number
+ * of sectors to skip.
+ */
+static sector_t check_skip_list(raid5_conf_t *conf, sector_t sector_nr)
+{
+ struct skip_entry *e;
+
+ list_for_each_entry(e, &conf->skip_list, skip_list) {
+ if (sector_nr >= e->start && sector_nr <= e->end)
+ return (e->end - sector_nr + 1);
+ }
+
+ return 0;
+}
+
+/* Clear the skip list and free associated memory. */
+static void clear_skip_list(raid5_conf_t *conf)
+{
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+ spin_lock_irq(&conf->device_lock);
+ list_splice_init(&conf->skip_list, &free_list);
+ spin_unlock_irq(&conf->device_lock);
+
+ while (!list_empty(&free_list)) {
+ struct list_head *l = free_list.next;
+ struct skip_entry *e = list_entry(l, struct skip_entry,
+ skip_list);
+ list_del_init(l);
+ kfree(e);
+ }
+}
+
static int make_request(request_queue_t *q, struct bio * bi)
{
mddev_t *mddev = q->queuedata;
@@ -3154,6 +3220,7 @@ static inline sector_t sync_request(mdde
int sync_blocks;
int still_degraded = 0;
int i;
+ sector_t skip_sectors;
if (sector_nr >= max_sector) {
/* just being told to finish up .. nothing much to do */
@@ -3169,6 +3236,7 @@ static inline sector_t sync_request(mdde
else /* completed sync */
conf->fullsync = 0;
bitmap_close_sync(mddev->bitmap);
+ clear_skip_list(conf);
return 0;
}
@@ -3194,6 +3262,13 @@ static inline sector_t sync_request(mdde
*skipped = 1;
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
+ spin_lock_irq(&conf->device_lock);
+ skip_sectors = check_skip_list(conf, sector_nr);
+ spin_unlock_irq(&conf->device_lock);
+ if (skip_sectors) {
+ *skipped = 1;
+ return skip_sectors;
+ }
pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
@@ -3449,6 +3524,7 @@ static int run(mddev_t *mddev)
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
INIT_LIST_HEAD(&conf->inactive_list);
+ INIT_LIST_HEAD(&conf->skip_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
@@ -4029,6 +4105,7 @@ static struct mdk_personality raid6_pers
.sync_request = sync_request,
.resize = raid5_resize,
.quiesce = raid5_quiesce,
+ .skip_resync = skip_resync,
};
static struct mdk_personality raid5_personality =
{
@@ -4050,6 +4127,7 @@ static struct mdk_personality raid5_pers
.start_reshape = raid5_start_reshape,
#endif
.quiesce = raid5_quiesce,
+ .skip_resync = skip_resync,
};
static struct mdk_personality raid4_personality =
@@ -4068,6 +4146,7 @@ static struct mdk_personality raid4_pers
.sync_request = sync_request,
.resize = raid5_resize,
.quiesce = raid5_quiesce,
+ .skip_resync = skip_resync,
};
static int __init raid5_init(void)
Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h
+++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h
@@ -260,6 +260,7 @@ struct raid5_private_data {
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
+ struct list_head skip_list; /* used to skip resync on certain blocks */
/*
* Stats
@@ -294,4 +295,11 @@ typedef struct raid5_private_data raid5_
#define ALGORITHM_LEFT_SYMMETRIC 2
#define ALGORITHM_RIGHT_SYMMETRIC 3
+struct skip_entry {
+ struct list_head skip_list;
+
+ sector_t start;
+ sector_t end;
+};
+
#endif
Index: linux-2.6.18-128.1.6/include/linux/raid/md.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/md.h
+++ linux-2.6.18-128.1.6/include/linux/raid/md.h
@@ -95,5 +95,7 @@ extern void md_new_event(mddev_t *mddev)
extern void md_update_sb(mddev_t * mddev);
+extern void mddev_unlock(mddev_t * mddev);
+
#endif
--
* [patch 2/4] [md] Add RESYNC_RANGE ioctl
From: scjody @ 2009-10-01 22:39 UTC
To: linux-ext4, linux-raid; +Cc: linux-kernel, Andreas Dilger
Add the RESYNC_RANGE ioctl and implement it for RAID 4/5/6. This causes an
immediate resync of the requested sectors if the device is undergoing resync.

TODO: In raid456 (and probably in any other personality that implements
this), there should be some record of the last blocks that were resynced,
since the personality can resync more data than requested; with this
implementation that can result in multiple resyncs of the same data.
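
For illustration, a userspace caller fills in an mdu_range_t and passes it by
pointer; a hedged sketch, with /dev/md0 illustrative and the patched md_u.h
assumed to be installed:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>        /* MD_MAJOR, used by the _IO() macro */
#include <linux/raid/md_u.h>    /* RESYNC_RANGE, mdu_range_t */

int main(void)
{
	/* Sectors are inclusive and, when the fd refers to a partition,
	 * relative to the start of that partition. */
	mdu_range_t range = { .start = 0, .end = 2047 };
	int fd = open("/dev/md0", O_RDONLY);   /* illustrative device */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Returns 0 if the array is already in sync, -EBUSY if some
	 * recovery other than a resync is running. */
	if (ioctl(fd, RESYNC_RANGE, &range) < 0)
		perror("RESYNC_RANGE");
	close(fd);
	return 0;
}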
Index: linux-2.6.18-128.1.6/drivers/md/md.c
===================================================================
--- linux-2.6.18-128.1.6.orig/drivers/md/md.c
+++ linux-2.6.18-128.1.6/drivers/md/md.c
@@ -4512,6 +4512,71 @@ static int md_ioctl(struct inode *inode,
goto done_unlock;
}
+ case RESYNC_RANGE:
+ {
+ mdu_range_t range;
+ struct hd_struct *part = inode->i_bdev->bd_part;
+ int ret;
+
+ if (!arg) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+ ret = copy_from_user(&range, argp, sizeof(range));
+ if (ret) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+
+ if (range.start > range.end) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+ if (part) {
+ sector_t part_end;
+
+ range.start += part->start_sect;
+ range.end += part->start_sect;
+
+ part_end = part->start_sect + part->nr_sects - 1;
+
+ if (range.end > part_end) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ }
+
+ if (range.end >= mddev->array_size<<1) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ /* We are already in sync; return success */
+ err = 0;
+ goto abort_unlock;
+ }
+
+ if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /* Something is running but not a resync. */
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+
+ if (mddev->pers->resync_range == NULL) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+
+ err = mddev->pers->resync_range(mddev, range.start,
+ range.end);
+
+ goto done_unlock;
+ }
+
default:
err = -EINVAL;
goto abort_unlock;
@@ -4865,6 +4930,7 @@ static int md_seq_show(struct seq_file *
mdk_rdev_t *rdev;
struct mdstat_info *mi = seq->private;
struct bitmap *bitmap;
+ unsigned long resync;
if (v == (void*)1) {
struct mdk_personality *pers;
@@ -4883,6 +4949,8 @@ static int md_seq_show(struct seq_file *
return 0;
}
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+
if (mddev_lock(mddev) < 0)
return -EINTR;
Index: linux-2.6.18-128.1.6/include/linux/raid/md_u.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/md_u.h
+++ linux-2.6.18-128.1.6/include/linux/raid/md_u.h
@@ -46,6 +46,7 @@
#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
#define SKIP_RESYNC _IO (MD_MAJOR, 0x40)
+#define RESYNC_RANGE _IO (MD_MAJOR, 0x41)
typedef struct mdu_version_s {
int major;
@@ -121,5 +122,11 @@ typedef struct mdu_param_s
int max_fault; /* unused for now */
} mdu_param_t;
+typedef struct mdu_range_s
+{
+ __u64 start; /* starting sector */
+ __u64 end; /* ending sector */
+} mdu_range_t;
+
#endif
Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
===================================================================
--- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c
+++ linux-2.6.18-128.1.6/drivers/md/raid5.c
@@ -1698,8 +1698,10 @@ static void handle_stripe5(struct stripe
}
}
if (failed > 1 && syncing) {
- md_done_sync(conf->mddev, STRIPE_SECTORS,0);
+ if (!test_bit(STRIPE_RESYNC_RANGE, &sh->state))
+ md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_RESYNC_RANGE, &sh->state);
syncing = 0;
}
@@ -1932,8 +1934,10 @@ static void handle_stripe5(struct stripe
}
}
if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
- md_done_sync(conf->mddev, STRIPE_SECTORS,1);
+ if (!test_bit(STRIPE_RESYNC_RANGE, &sh->state))
+ md_done_sync(conf->mddev, STRIPE_SECTORS,1);
clear_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_RESYNC_RANGE, &sh->state);
}
/* If the failed drive is just a ReadError, then we might need to progress
@@ -2275,8 +2279,10 @@ static void handle_stripe6(struct stripe
}
}
if (failed > 2 && syncing) {
- md_done_sync(conf->mddev, STRIPE_SECTORS,0);
+ if (!test_bit(STRIPE_RESYNC_RANGE, &sh->state))
+ md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_RESYNC_RANGE, &sh->state);
syncing = 0;
}
@@ -2571,8 +2577,10 @@ static void handle_stripe6(struct stripe
}
if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
- md_done_sync(conf->mddev, STRIPE_SECTORS,1);
+ if (!test_bit(STRIPE_RESYNC_RANGE, &sh->state))
+ md_done_sync(conf->mddev, STRIPE_SECTORS,1);
clear_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_RESYNC_RANGE, &sh->state);
}
/* If the failed drives are just a ReadError, then we might need
@@ -3300,6 +3308,52 @@ static inline sector_t sync_request(mdde
return STRIPE_SECTORS;
}
+/* Perform an immediate resync of the requested range. */
+static int resync_range(mddev_t *mddev, sector_t start, sector_t end)
+{
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ sector_t j, sync_end;
+ unsigned int dd_idx, pd_idx, disks, data_disks;
+
+ printk("resync_range, sectors %llu - %llu\n", (unsigned long long)start,
+ (unsigned long long)end);
+
+ disks = conf->raid_disks;
+ data_disks = disks - conf->max_degraded;
+
+ j = raid5_compute_sector(start, disks, data_disks,
+ &dd_idx, &pd_idx, conf);
+ sync_end = raid5_compute_sector(end, disks, data_disks,
+ &dd_idx, &pd_idx, conf);
+
+ while (j <= sync_end) {
+ struct stripe_head *sh;
+
+ pd_idx = stripe_to_pdidx(j, conf, disks);
+ sh = get_active_stripe(conf, j, disks, pd_idx, 1);
+ if (sh == NULL) {
+ sh = get_active_stripe(conf, j, disks, pd_idx, 0);
+ /* make sure we don't swamp the stripe cache if someone
+ * else is trying to get access
+ */
+ schedule_timeout_uninterruptible(1);
+ }
+
+ spin_lock(&sh->lock);
+ set_bit(STRIPE_SYNCING, &sh->state);
+ set_bit(STRIPE_RESYNC_RANGE, &sh->state);
+ clear_bit(STRIPE_INSYNC, &sh->state);
+ spin_unlock(&sh->lock);
+
+ handle_stripe(sh, NULL, NULL);
+ release_stripe(sh);
+
+ j += STRIPE_SECTORS;
+ }
+
+ return 0;
+}
+
/*
* This is our raid5 kernel thread.
*
@@ -4106,6 +4160,7 @@ static struct mdk_personality raid6_pers
.resize = raid5_resize,
.quiesce = raid5_quiesce,
.skip_resync = skip_resync,
+ .resync_range = resync_range,
};
static struct mdk_personality raid5_personality =
{
@@ -4128,6 +4183,7 @@ static struct mdk_personality raid5_pers
#endif
.quiesce = raid5_quiesce,
.skip_resync = skip_resync,
+ .resync_range = resync_range,
};
static struct mdk_personality raid4_personality =
@@ -4147,6 +4203,7 @@ static struct mdk_personality raid4_pers
.resize = raid5_resize,
.quiesce = raid5_quiesce,
.skip_resync = skip_resync,
+ .resync_range = resync_range,
};
static int __init raid5_init(void)
Index: linux-2.6.18-128.1.6/include/linux/raid/md_k.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/md_k.h
+++ linux-2.6.18-128.1.6/include/linux/raid/md_k.h
@@ -284,6 +284,7 @@ struct mdk_personality
*/
void (*quiesce) (mddev_t *mddev, int state);
int (*skip_resync) (mddev_t *mddev, sector_t start, sector_t end);
+ int (*resync_range) (mddev_t *mddev, sector_t start, sector_t end);
};
Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h
+++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h
@@ -180,6 +180,8 @@ struct stripe_head {
#define STRIPE_EXPANDING 9
#define STRIPE_EXPAND_SOURCE 10
#define STRIPE_EXPAND_READY 11
+#define STRIPE_RESYNC_RANGE 12
+
/*
* Plugging:
*
--
* [patch 3/4] [jbd] Add support for journal guided resync
From: scjody @ 2009-10-01 22:39 UTC
To: linux-ext4, linux-raid; +Cc: linux-kernel, Andreas Dilger
Adds support for declare blocks, used by ext3's journal guided resync
(declared mode). A declare block is added to the journal to list the blocks
that will be written during the current transaction. During journal replay,
we perform a RAID resync of only those blocks and skip the rest of the resync.
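
For orientation, a declare block reuses the descriptor-block layout: a
journal_header_t (h_blocktype JFS_DECLARE_BLOCK) followed by
journal_block_tag_t entries, with a 16-byte UUID after the first tag and
JFS_FLAG_LAST_TAG marking the last entry. A sketch of walking one declare
block, mirroring the JFS_DECLARE_BLOCK replay case added to do_one_pass()
below (minimal local struct copies stand in for the jbd definitions):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>  /* ntohl(): on-disk fields are big-endian */

/* Local copies of the on-disk jbd structures and flags (see jbd.h). */
typedef struct { uint32_t h_magic, h_blocktype, h_sequence; } journal_header_t;
typedef struct { uint32_t t_blocknr, t_flags; } journal_block_tag_t;
#define JFS_FLAG_SAME_UUID 2
#define JFS_FLAG_LAST_TAG  8

/* Walk the tags of one declare block, as the replay loop does. */
static void walk_declare_block(const char *block, size_t blocksize)
{
	const char *tagp = block + sizeof(journal_header_t);

	while (tagp - block + sizeof(journal_block_tag_t) <= blocksize) {
		const journal_block_tag_t *tag = (const void *)tagp;
		uint32_t flags = ntohl(tag->t_flags);

		/* Each tag names a filesystem block that was about to be
		 * written when the crash hit; replay resyncs exactly
		 * these blocks via RESYNC_RANGE. */
		printf("declared block %u\n", ntohl(tag->t_blocknr));

		tagp += sizeof(journal_block_tag_t);
		if (!(flags & JFS_FLAG_SAME_UUID))
			tagp += 16;	/* 16-byte UUID after the first tag */
		if (flags & JFS_FLAG_LAST_TAG)
			break;
	}
}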
Index: linux-2.6.18-128.1.6/fs/jbd/checkpoint.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/checkpoint.c
+++ linux-2.6.18-128.1.6/fs/jbd/checkpoint.c
@@ -712,6 +712,8 @@ void __journal_drop_transaction(journal_
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
+ J_ASSERT(transaction->t_declare_root.rnode == NULL);
+ J_ASSERT(transaction->t_declare_done_root.rnode == NULL);
J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
Index: linux-2.6.18-128.1.6/fs/jbd/commit.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/commit.c
+++ linux-2.6.18-128.1.6/fs/jbd/commit.c
@@ -373,6 +373,262 @@ static inline __u32 jbd_checksum_data(__
return checksum;
}
+int wait_for_descriptors(journal_t *journal, transaction_t *trans) {
+ struct journal_head *jh;
+ struct buffer_head *bh;
+ int err = 0;
+
+wait_for_ctlbuf:
+
+ while (trans->t_log_list != NULL) {
+
+ jh = trans->t_log_list->b_tprev;
+ bh = jh2bh(jh);
+ if (buffer_locked(bh)) {
+ wait_on_buffer(bh);
+ goto wait_for_ctlbuf;
+ }
+ if (cond_resched())
+ goto wait_for_ctlbuf;
+
+ if (unlikely(!buffer_uptodate(bh)))
+ err = -EIO;
+
+ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+ clear_buffer_jwrite(bh);
+ journal_unfile_buffer(journal, jh);
+ journal_put_journal_head(jh);
+ __brelse(bh); /* One for getblk */
+ }
+
+ return err;
+}
+
+struct journal_head *get_descriptor(journal_t *journal, transaction_t *trans,
+ int blocktype, char **tagp, int *space_left) {
+ struct journal_head *descriptor;
+ struct buffer_head *dbh;
+ journal_header_t *header;
+
+ jbd_debug(4, "JBD: get descriptor\n");
+
+ descriptor = journal_get_descriptor_buffer(journal);
+ if (!descriptor)
+ return NULL;
+
+ dbh = jh2bh(descriptor);
+ jbd_debug(4, "JBD: got buffer %llu (%p)\n",
+ (unsigned long long)dbh->b_blocknr, dbh->b_data);
+ header = (journal_header_t *)&dbh->b_data[0];
+ header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+ header->h_blocktype = cpu_to_be32(blocktype);
+ header->h_sequence = cpu_to_be32(trans->t_tid);
+
+ *tagp = &dbh->b_data[sizeof(journal_header_t)];
+ *space_left = dbh->b_size - sizeof(journal_header_t);
+
+ set_buffer_jwrite(dbh);
+ set_buffer_dirty(dbh);
+
+ /* Record it so that we can wait for it later */
+ BUFFER_TRACE(dbh, "ph3: file as descriptor");
+ journal_file_buffer(descriptor, trans, BJ_LogCtl);
+
+ return descriptor;
+}
+
+/*
+ * Write declare blocks containing a list of the data blocks that will be
+ * written out
+ */
+void write_declare_blocks(journal_t *journal, transaction_t *transaction,
+ int committing)
+{
+ struct journal_head *jh, *descriptor = NULL;
+ struct buffer_head *bh;
+ int i, bufs = 0, err;
+ unsigned int n, count = 0, to_write;
+ unsigned long nextblock = 0;
+ char *tagp = NULL;
+ journal_block_tag_t *tag = NULL;
+ int space_left = 0, first_tag = 0, tag_flag;
+ struct radix_tree_root *root;
+
+ root = &transaction->t_declare_root;
+
+ spin_lock(&journal->j_list_lock);
+ to_write = transaction->t_declare_request;
+ transaction->t_declare_request = 0;
+ spin_unlock(&journal->j_list_lock);
+
+ if (to_write == UINT_MAX)
+ jbd_debug (1, "jbd: tid %d write declare request for ALL "
+ "blocks\n", transaction->t_tid);
+ else
+ jbd_debug (1, "jbd: tid %d write declare request for %u "
+ "blocks\n", transaction->t_tid, to_write);
+write_declare:
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ n = radix_tree_gang_lookup(root, journal->j_declare_jhs, nextblock, 1);
+ while (n) {
+ if (!descriptor) {
+ J_ASSERT(bufs == 0);
+
+ spin_unlock(&journal->j_list_lock);
+
+ descriptor = get_descriptor(journal, transaction,
+ JFS_DECLARE_BLOCK,
+ &tagp, &space_left);
+
+ if (!descriptor) {
+ journal_abort(journal, -EIO);
+ return;
+ }
+
+ first_tag = 1;
+ journal->j_declare_bhs[bufs++] = jh2bh(descriptor);
+
+ goto write_declare;
+ }
+
+ jh = (struct journal_head *)journal->j_declare_jhs[0];
+ bh = jh2bh(jh);
+
+ /* refile the buffer as having been declared */
+ if (!inverted_lock(journal, bh))
+ goto write_declare;
+ __journal_unfile_buffer(jh);
+ __journal_file_buffer(jh, transaction, BJ_DeclareDone);
+
+ jbd_unlock_bh_state(bh);
+
+ /* record the block's tag in the current descriptor buffer */
+ tag_flag = 0;
+ if (!first_tag)
+ tag_flag |= JFS_FLAG_SAME_UUID;
+
+ tag = (journal_block_tag_t *)tagp;
+ tag->t_blocknr = cpu_to_be32(bh->b_blocknr);
+ tag->t_flags = cpu_to_be32(tag_flag);
+ tagp += sizeof(journal_block_tag_t);
+ space_left -= sizeof(journal_block_tag_t);
+
+ if (first_tag) {
+ memcpy (tagp, journal->j_uuid, 16);
+ tagp += 16;
+ space_left -= 16;
+ first_tag = 0;
+ }
+
+ count++;
+
+ /* advance to the next journal head and buffer */
+ nextblock = bh->b_blocknr + 1;
+ n = radix_tree_gang_lookup(root, journal->j_declare_jhs,
+ nextblock, 1);
+
+ /* If there's no more to do, or if the descriptor is full,
+ let the IO rip! */
+
+ if (bufs == ARRAY_SIZE(journal->j_declare_bhs) || n == 0 ||
+ count == to_write ||
+ space_left < sizeof(journal_block_tag_t) + 16) {
+
+ jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+
+ /* Write an end-of-descriptor marker before
+ * submitting the IOs. "tag" still points to
+ * the last tag we set up.
+ */
+
+ tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
+
+ spin_unlock(&journal->j_list_lock);
+
+ for (i = 0; i < bufs; i++) {
+ struct buffer_head *bh = journal->j_declare_bhs[i];
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ bh->b_end_io = journal_end_buffer_io_sync;
+ submit_bh(WRITE, bh);
+ }
+
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ /* force a new descriptor to be generated next time */
+ descriptor = NULL;
+ bufs = 0;
+
+ /* need to redo tree lookup since we lost the lock,
+ but that will happen after we get a new descriptor */
+ }
+
+ if (count == to_write) break;
+ }
+ spin_unlock(&journal->j_list_lock);
+
+ jbd_debug(2, "jbd: tid %d wrote declarations for %u blocks\n",
+ transaction->t_tid, count);
+ if (to_write == UINT_MAX)
+ J_ASSERT(transaction->t_declare_root.rnode == NULL);
+
+ /* wait for the declare blocks to be written */
+ err = wait_for_descriptors(journal, transaction);
+
+ /* move the declared buffers to the sync data list */
+
+ root = &transaction->t_declare_done_root;
+ count = 0;
+ nextblock = 0;
+
+move_declare:
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ while ((n = radix_tree_gang_lookup(root, journal->j_declare_jhs,
+ nextblock,
+ ARRAY_SIZE(journal->j_declare_jhs)))) {
+ /* loop and move the journal heads */
+ for (i = 0; i < n; i++) {
+ jh = journal->j_declare_jhs[i];
+ bh = jh2bh(jh);
+
+ if (!inverted_lock(journal, bh)) {
+ goto move_declare;
+ }
+ __journal_unfile_buffer(jh);
+
+ if (committing)
+ /* set buffer dirty for writing below */
+ set_buffer_dirty(bh);
+ else
+ /* set page dirty for virtual memory */
+ mark_buffer_dirty(bh);
+
+ __journal_file_buffer(jh, transaction, BJ_SyncData);
+
+ count++;
+
+ nextblock = bh->b_blocknr + 1;
+
+ jbd_unlock_bh_state(bh);
+
+ if (lock_need_resched(&journal->j_list_lock)) {
+ spin_unlock(&journal->j_list_lock);
+ goto move_declare;
+ }
+ }
+ }
+ spin_unlock(&journal->j_list_lock);
+
+ jbd_debug(2, "jbd: tid %d moved %u declare blocks\n",
+ transaction->t_tid, count);
+}
+
/*
* journal_commit_transaction
*
@@ -390,7 +646,6 @@ void journal_commit_transaction(journal_
int err;
unsigned long blocknr;
char *tagp = NULL;
- journal_header_t *header;
journal_block_tag_t *tag = NULL;
int space_left = 0;
int first_tag = 0;
@@ -517,6 +772,11 @@ void journal_commit_transaction(journal_
jbd_debug (3, "JBD: commit phase 2\n");
+ if (journal->j_flags & JFS_DECLARE) {
+ commit_transaction->t_declare_request = UINT_MAX;
+ write_declare_blocks(journal, commit_transaction, 1);
+ }
+
/*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
@@ -545,9 +805,13 @@ void journal_commit_transaction(journal_
* If we found any dirty or locked buffers, then we should have
* looped back up to the write_out_data label. If there weren't
* any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
+ * clean by now, so check that it is in fact empty. Also check
+ * declared mode trees - write_declare_blocks() should have left
+ * them empty.
*/
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(commit_transaction->t_declare_root.rnode == NULL);
+ J_ASSERT(commit_transaction->t_declare_done_root.rnode == NULL);
jbd_debug (3, "JBD: commit phase 3\n");
@@ -596,38 +860,20 @@ void journal_commit_transaction(journal_
record the metadata buffer. */
if (!descriptor) {
- struct buffer_head *bh;
-
J_ASSERT (bufs == 0);
- jbd_debug(4, "JBD: get descriptor\n");
+ descriptor = get_descriptor(journal,
+ commit_transaction,
+ JFS_DESCRIPTOR_BLOCK,
+ &tagp, &space_left);
- descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor) {
journal_abort(journal, -EIO);
continue;
}
- bh = jh2bh(descriptor);
- jbd_debug(4, "JBD: got buffer %llu (%p)\n",
- (unsigned long long)bh->b_blocknr, bh->b_data);
- header = (journal_header_t *)&bh->b_data[0];
- header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
- header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
- space_left = bh->b_size - sizeof(journal_header_t);
first_tag = 1;
- set_buffer_jwrite(bh);
- set_buffer_dirty(bh);
- wbuf[bufs++] = bh;
-
- /* Record it so that we can wait for IO
- completion later */
- BUFFER_TRACE(bh, "ph3: file as descriptor");
- journal_file_buffer(descriptor, commit_transaction,
- BJ_LogCtl);
+ wbuf[bufs++] = jh2bh(descriptor);
}
/* Where is the buffer to be written? */
@@ -826,29 +1072,7 @@ wait_for_iobuf:
jbd_debug(3, "JBD: commit phase 5\n");
/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
- while (commit_transaction->t_log_list != NULL) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_log_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_ctlbuf;
- }
- if (cond_resched())
- goto wait_for_ctlbuf;
-
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
-
- BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
- clear_buffer_jwrite(bh);
- journal_unfile_buffer(journal, jh);
- journal_put_journal_head(jh);
- __brelse(bh); /* One for getblk */
- /* AKPM: bforget here */
- }
+ err = wait_for_descriptors(journal, commit_transaction);
if (err)
journal_abort(journal, err);
@@ -904,6 +1128,8 @@ wait_for_iobuf:
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
J_ASSERT(commit_transaction->t_log_list == NULL);
+ J_ASSERT(commit_transaction->t_declare_root.rnode == NULL);
+ J_ASSERT(commit_transaction->t_declare_done_root.rnode == NULL);
restart_loop:
/*
Index: linux-2.6.18-128.1.6/fs/jbd/journal.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/journal.c
+++ linux-2.6.18-128.1.6/fs/jbd/journal.c
@@ -86,6 +86,10 @@ EXPORT_SYMBOL(journal_invalidatepage);
EXPORT_SYMBOL(journal_try_to_free_buffers);
EXPORT_SYMBOL(journal_bmap);
EXPORT_SYMBOL(journal_force_commit);
+EXPORT_SYMBOL(journal_write_declare);
+
+extern void write_declare_blocks(journal_t *journal,
+ transaction_t *commit_transaction, int committing);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -156,6 +160,16 @@ loop:
journal_commit_transaction(journal);
spin_lock(&journal->j_state_lock);
goto loop;
+ } else if (journal->j_flags & JFS_DECLARE &&
+ (transaction = journal->j_running_transaction) &&
+ transaction->t_declare_request) {
+ jbd_debug(2, "early declare\n");
+ spin_unlock(&journal->j_state_lock);
+ write_declare_blocks(journal, transaction, 0);
+ spin_lock(&journal->j_state_lock);
+
+ wake_up(&journal->j_wait_declare);
+ goto loop;
}
wake_up(&journal->j_wait_done_commit);
@@ -494,6 +508,38 @@ int journal_force_commit_nested(journal_
}
/*
+ * For ext3_fsync: start a request to declare the file's data and wait
+ * for the declarations to complete.
+ */
+int journal_write_declare(journal_t *journal)
+{
+ transaction_t *transaction = journal->j_running_transaction;
+ DEFINE_WAIT(wait);
+
+ if (transaction == NULL)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (transaction->t_declare_root.rnode == NULL) {
+ spin_unlock(&journal->j_list_lock);
+ return 0;
+ }
+
+ transaction->t_declare_request = UINT_MAX;
+
+ jbd_debug(1, "waking commit thread for fsync declare\n");
+ wake_up(&journal->j_wait_commit);
+
+ prepare_to_wait(&journal->j_wait_declare, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(&journal->j_wait_declare, &wait);
+
+ return 0;
+}
+
+/*
* Start a commit of the current running transaction (if any). Returns true
* if a transaction was started, and fills its tid in at *ptid
*/
@@ -959,6 +1005,7 @@ static journal_t * journal_init_common (
init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
+ init_waitqueue_head(&journal->j_wait_declare);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -1292,6 +1339,8 @@ static int journal_get_superblock(journa
J_ASSERT(bh != NULL);
if (!buffer_uptodate(bh)) {
+ /* TODO: resync the superblock */
+
ll_rw_block(READ, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
Index: linux-2.6.18-128.1.6/fs/jbd/recovery.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/recovery.c
+++ linux-2.6.18-128.1.6/fs/jbd/recovery.c
@@ -22,6 +22,7 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/crc32.h>
+#include <linux/raid/md.h>
#endif
/*
@@ -36,6 +37,9 @@ struct recovery_info
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
+ int nr_declared;
+
+ int resync_errors;
};
enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
@@ -43,6 +47,7 @@ static int do_one_pass(journal_t *journa
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
tid_t, struct recovery_info *);
+static int journal_syncraid(journal_t *, unsigned long);
#ifdef __KERNEL__
@@ -53,6 +58,37 @@ void journal_brelse_array(struct buffer_
brelse (b[n]);
}
+static int resync_range(journal_t *j, unsigned long start,
+ unsigned long end)
+{
+ int err;
+ struct inode *fake_inode = kmalloc(sizeof(*fake_inode), GFP_KERNEL);
+ mdu_range_t range;
+ sector_t sectors_per_block = j->j_blocksize >> 9;
+ mm_segment_t old_fs;
+
+ if (fake_inode == NULL) {
+ printk(KERN_ERR "JBD: Out of memory during recovery.\n");
+ return -ENOMEM;
+ }
+
+ fake_inode->i_bdev = j->j_fs_dev;
+ range.start = start * sectors_per_block;
+ range.end = end * sectors_per_block + sectors_per_block - 1;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = blkdev_driver_ioctl(fake_inode, NULL, j->j_fs_dev->bd_disk,
+ RESYNC_RANGE, (long)&range);
+ set_fs(old_fs);
+
+ jbd_debug(3, "RESYNC_RANGE of sectors %llu - %llu returned %d\n",
+ range.start, range.end, err);
+
+ kfree(fake_inode);
+
+ return err;
+}
/*
* When reading from the journal, we are going through the block device
@@ -67,7 +103,7 @@ void journal_brelse_array(struct buffer_
*/
#define MAXBUF 8
-static int do_readahead(journal_t *journal, unsigned int start)
+static int do_readahead(journal_t *journal, unsigned int start, int raid_sync)
{
int err;
unsigned int max, nbufs, next;
@@ -95,6 +131,14 @@ static int do_readahead(journal_t *journ
goto failed;
}
+ /* For declared mode: perform a raid synchronization for the
+ * journal blocks; this will resync all of the journal blocks
+ * read, which is more than strictly necessary.
+ */
+
+ if (raid_sync)
+ resync_range(journal, blocknr, blocknr);
+
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
err = -ENOMEM;
@@ -103,6 +147,7 @@ static int do_readahead(journal_t *journ
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
+
if (nbufs == MAXBUF) {
ll_rw_block(READ, nbufs, bufs);
journal_brelse_array(bufs, nbufs);
@@ -130,7 +175,7 @@ failed:
*/
static int jread(struct buffer_head **bhp, journal_t *journal,
- unsigned int offset)
+ unsigned int offset, int sync_raid)
{
int err;
unsigned long blocknr;
@@ -159,7 +204,7 @@ static int jread(struct buffer_head **bh
/* If this is a brand new buffer, start readahead.
Otherwise, we assume we are already reading it. */
if (!buffer_req(bh))
- do_readahead(journal, offset);
+ do_readahead(journal, offset, sync_raid);
wait_on_buffer(bh);
}
@@ -257,6 +302,30 @@ int journal_recover(journal_t *journal)
jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+ if (!err && !info.resync_errors && JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) {
+ /* Successful declared mode resync: instruct the block device
+ * to skip its resync */
+ struct inode *fake_inode;
+
+ jbd_debug(0, "JBD: Resynced %d declared blocks\n",
+ info.nr_declared);
+
+ fake_inode = kmalloc(sizeof(*fake_inode), GFP_KERNEL);
+ if (fake_inode) {
+ fake_inode->i_bdev = journal->j_fs_dev;
+ jbd_debug(1, "Sending SKIP_RESYNC ioctl\n");
+
+ blkdev_driver_ioctl(fake_inode, NULL,
+ journal->j_fs_dev->bd_disk,
+ SKIP_RESYNC, 0);
+ }
+ kfree(fake_inode);
+ }
+
+ journal_clear_features(journal, 0, 0,
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS);
+
/* Restart the log at the next transaction ID, thus invalidating
* any existing commit records in the log. */
journal->j_transaction_sequence = ++info.end_transaction;
@@ -329,7 +398,7 @@ static int calc_chksums(journal_t *journ
for (i = 0; i < num_blks; i++) {
io_block = (*next_log_block)++;
wrap(journal, *next_log_block);
- err = jread(&obh, journal, io_block);
+ err = jread(&obh, journal, io_block, 0);
if (err) {
printk(KERN_ERR "JBD: IO error %d recovering block "
"%lu in log\n", err, io_block);
@@ -355,6 +424,7 @@ static int do_one_pass(journal_t *journa
unsigned int sequence;
int blocktype;
__u32 crc32_sum = ~0; /* Transactional Checksums */
+ int raid_sync_journal = 0, raid_sync_data = 0;
/* Precompute the maximum metadata descriptors in a descriptor block */
int MAX_BLOCKS_PER_DESC;
@@ -397,9 +467,30 @@ static int do_one_pass(journal_t *journa
* check right now that we haven't gone past the end of
* the log. */
- if (pass != PASS_SCAN)
- if (tid_geq(next_commit_ID, info->end_transaction))
- break;
+ if (pass != PASS_SCAN) {
+ if (tid_geq(next_commit_ID, info->end_transaction)) {
+ /* For declared mode resync, move ahead past
+ * the last commmitted transaction to deal with
+ * raid sync for declare blocks and the head
+ * of the journal.
+ */
+ if (pass == PASS_REPLAY &&
+ JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) {
+ if (journal->j_fs_dev == journal->j_dev)
+ raid_sync_journal = 1;
+ if (!raid_sync_data)
+ jbd_debug(1, "Declared mode was used; "
+ "performing raid sync %s\n",
+ raid_sync_journal ?
+ "of journal and data" :
+ "of data");
+ raid_sync_data = 1;
+ }
+ else
+ break;
+ }
+ }
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block, journal->j_last);
@@ -409,7 +500,7 @@ static int do_one_pass(journal_t *journa
* record. */
jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
- err = jread(&bh, journal, next_log_block);
+ err = jread(&bh, journal, next_log_block, raid_sync_journal);
if (err)
goto failed;
@@ -426,6 +517,12 @@ static int do_one_pass(journal_t *journa
if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
brelse(bh);
+
+ /* raid sync the head of the journal */
+ if (raid_sync_journal) {
+ if (journal_syncraid(journal, next_log_block))
+ info->resync_errors++;
+ }
break;
}
@@ -436,6 +533,12 @@ static int do_one_pass(journal_t *journa
if (sequence != next_commit_ID) {
brelse(bh);
+
+ /* raid sync the head of the journal */
+ if (raid_sync_journal) {
+ if (journal_syncraid(journal, next_log_block))
+ info->resync_errors++;
+ }
break;
}
@@ -485,7 +588,8 @@ static int do_one_pass(journal_t *journa
io_block = next_log_block++;
wrap(journal, next_log_block);
- err = jread(&obh, journal, io_block);
+ err = jread(&obh, journal, io_block,
+ raid_sync_journal);
if (err) {
/* Recover what we can, but
* report failure at the end. */
@@ -668,6 +772,42 @@ static int do_one_pass(journal_t *journa
goto failed;
continue;
+ case JFS_DECLARE_BLOCK:
+ if (!raid_sync_data) {
+ brelse(bh);
+ continue;
+ }
+
+ /* this is a declare block for an uncommitted
+ * transaction, so raid sync all of the blocks it
+ * describes
+ */
+
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+ while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
+ <= journal->j_blocksize) {
+
+ unsigned long blocknr;
+
+ tag = (journal_block_tag_t *) tagp;
+ flags = be32_to_cpu(tag->t_flags);
+ blocknr = be32_to_cpu(tag->t_blocknr);
+
+ if (resync_range(journal, blocknr, blocknr))
+ ++info->resync_errors;
+ ++info->nr_declared;
+
+ tagp += sizeof(journal_block_tag_t);
+ if (!(flags & JFS_FLAG_SAME_UUID))
+ tagp += 16;
+
+ if (flags & JFS_FLAG_LAST_TAG)
+ break;
+ }
+
+ brelse(bh);
+ continue;
+
default:
jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
@@ -705,6 +845,38 @@ static int do_one_pass(journal_t *journa
return err;
}
+/* RAID sync the next one quarter of the journal. This is called once at the
+ * end of recovery if declare blocks are present since that part of the journal
+ * was likely undergoing writes before the crash.
+ */
+static int
+journal_syncraid(journal_t *journal, unsigned long next_log_block)
+{
+ int i, err;
+ unsigned long blocknr;
+
+ jbd_debug(2, "RAID resync of 1/4 of the journal starting at %lu\n",
+ next_log_block);
+
+ for (i = 0; i < journal->j_maxlen / 4; i++) {
+ err = journal_bmap(journal, next_log_block, &blocknr);
+
+ if (err) {
+ printk(KERN_ERR "JBD: bad block at offset %lu\n",
+ next_log_block);
+ return err;
+ }
+
+ err = resync_range(journal, blocknr, blocknr);
+ if (err)
+ return err;
+
+ next_log_block++;
+ wrap(journal, next_log_block);
+ }
+
+ return 0;
+}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
Index: linux-2.6.18-128.1.6/fs/jbd/transaction.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/jbd/transaction.c
+++ linux-2.6.18-128.1.6/fs/jbd/transaction.c
@@ -58,6 +58,10 @@ get_transaction(journal_t *journal, tran
journal->j_commit_timer.expires = transaction->t_expires;
add_timer(&journal->j_commit_timer);
+ /* Initialize the declare radix tree */
+ INIT_RADIX_TREE(&transaction->t_declare_root, GFP_ATOMIC);
+ INIT_RADIX_TREE(&transaction->t_declare_done_root, GFP_ATOMIC);
+
J_ASSERT(journal->j_running_transaction == NULL);
journal->j_running_transaction = transaction;
transaction->t_max_wait = 0;
@@ -956,6 +960,7 @@ int journal_dirty_data(handle_t *handle,
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
struct journal_head *jh;
+ int jdatalist;
if (is_handle_aborted(handle))
return 0;
@@ -999,6 +1004,8 @@ int journal_dirty_data(handle_t *handle,
goto no_journal;
}
+ jdatalist = journal->j_flags & JFS_DECLARE ? BJ_Declare : BJ_SyncData;
+
if (jh->b_transaction) {
JBUFFER_TRACE(jh, "has transaction");
if (jh->b_transaction != handle->h_transaction) {
@@ -1041,6 +1048,8 @@ int journal_dirty_data(handle_t *handle,
*/
if (jh->b_jlist != BJ_None &&
jh->b_jlist != BJ_SyncData &&
+ jh->b_jlist != BJ_Declare &&
+ jh->b_jlist != BJ_DeclareDone &&
jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "Not stealing");
goto no_journal;
@@ -1088,18 +1097,19 @@ int journal_dirty_data(handle_t *handle,
* committing transaction, so might still be left on that
* transaction's metadata lists.
*/
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
+ if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Declare &&
+ jh->b_jlist != BJ_DeclareDone && jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "not on correct data list: unfile");
J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
__journal_temp_unlink_buffer(jh);
jh->b_transaction = handle->h_transaction;
JBUFFER_TRACE(jh, "file as data");
__journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
+ jdatalist);
}
} else {
JBUFFER_TRACE(jh, "not on a transaction");
- __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
+ __journal_file_buffer(jh, handle->h_transaction, jdatalist);
}
no_journal:
spin_unlock(&journal->j_list_lock);
@@ -1578,6 +1588,7 @@ void __journal_temp_unlink_buffer(struct
struct journal_head **list = NULL;
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
+ struct radix_tree_root *root = NULL;
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
transaction = jh->b_transaction;
@@ -1617,9 +1628,25 @@ void __journal_temp_unlink_buffer(struct
case BJ_Locked:
list = &transaction->t_locked_list;
break;
+ case BJ_Declare:
+ root = &transaction->t_declare_root;
+ transaction->t_declare_count--;
+ break;
+ case BJ_DeclareDone:
+ root = &transaction->t_declare_done_root;
+ break;
+ }
+
+ if (jh->b_jlist == BJ_Declare || jh->b_jlist == BJ_DeclareDone) {
+ if ((radix_tree_delete(root, bh->b_blocknr)) != jh) {
+ printk(KERN_ERR
+ "jbd: ERROR radix tree delete block %8llu\n",
+ (unsigned long long)bh->b_blocknr);
+ }
}
+ else
+ __blist_del_buffer(list, jh);
- __blist_del_buffer(list, jh);
jh->b_jlist = BJ_None;
if (test_clear_buffer_jbddirty(bh))
mark_buffer_dirty(bh); /* Expose it to the VM */
@@ -1660,7 +1687,8 @@ __journal_try_to_free_buffer(journal_t *
spin_lock(&journal->j_list_lock);
if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
+ if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Declare ||
+ jh->b_jlist == BJ_DeclareDone || jh->b_jlist == BJ_Locked) {
/* A written-back ordered data buffer */
JBUFFER_TRACE(jh, "release data");
__journal_unfile_buffer(jh);
@@ -2072,6 +2100,8 @@ void __journal_file_buffer(struct journa
struct journal_head **list = NULL;
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
+ struct radix_tree_root *root = NULL;
+ int declare_per_block;
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -2126,15 +2156,44 @@ void __journal_file_buffer(struct journa
list = &transaction->t_reserved_list;
break;
case BJ_Locked:
- list = &transaction->t_locked_list;
+ list = &transaction->t_locked_list;
+ break;
+ case BJ_Declare:
+ root = &transaction->t_declare_root;
+ transaction->t_declare_count++;
break;
+ case BJ_DeclareDone:
+ root = &transaction->t_declare_done_root;
+ break;
+ }
+
+ if (jlist == BJ_Declare || jlist == BJ_DeclareDone) {
+ if ((radix_tree_insert(root, bh->b_blocknr, jh)) != 0) {
+ printk(KERN_ERR
+ "jbd: ERROR radix tree insert block %8lu\n",
+ (long unsigned)bh->b_blocknr);
+ }
+ } else {
+ __blist_add_buffer(list, jh);
}
- __blist_add_buffer(list, jh);
jh->b_jlist = jlist;
if (was_dirty)
set_buffer_jbddirty(bh);
+
+ declare_per_block = (bh->b_size - (sizeof(journal_header_t) + 32)) /
+ sizeof(journal_block_tag_t);
+
+ /* wake up the commit thread to perform early declarations */
+ assert_spin_locked(&transaction->t_journal->j_list_lock);
+ if (transaction->t_journal->j_flags & JFS_DECLARE &&
+ jlist == BJ_Declare &&
+ transaction->t_declare_count >= declare_per_block) {
+ transaction->t_declare_request = transaction->t_declare_count /
+ declare_per_block * declare_per_block;
+ wake_up(&transaction->t_journal->j_wait_commit);
+ }
}
void journal_file_buffer(struct journal_head *jh,
Index: linux-2.6.18-128.1.6/include/linux/jbd.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/jbd.h
+++ linux-2.6.18-128.1.6/include/linux/jbd.h
@@ -26,6 +26,7 @@
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/journal-head.h>
+#include <linux/radix-tree.h>
#include <linux/stddef.h>
#include <linux/bit_spinlock.h>
#include <linux/mutex.h>
@@ -137,6 +138,7 @@ typedef struct journal_s journal_t; /* J
#define JFS_SUPERBLOCK_V1 3
#define JFS_SUPERBLOCK_V2 4
#define JFS_REVOKE_BLOCK 5
+#define JFS_DECLARE_BLOCK 6
/*
* Standard header for all descriptor blocks:
@@ -261,12 +263,14 @@ typedef struct journal_superblock_s
#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
+#define JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS 0x00000008
/* Features known to this kernel version: */
#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
#define JFS_KNOWN_ROCOMPAT_FEATURES 0
#define JFS_KNOWN_INCOMPAT_FEATURES (JFS_FEATURE_INCOMPAT_REVOKE | \
- JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT | \
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)
#ifdef __KERNEL__
@@ -559,6 +563,15 @@ struct transaction_s
struct journal_head *t_sync_datalist;
/*
+ * Radix tree of all data buffers that must be declared before being
+ * written, declare mode counters [j_list_lock]
+ */
+ struct radix_tree_root t_declare_root;
+ struct radix_tree_root t_declare_done_root;
+ unsigned int t_declare_count;
+ unsigned int t_declare_request;
+
+ /*
* Doubly-linked circular list of all forget buffers (superseded
* buffers which we can un-checkpoint once this transaction commits)
* [j_list_lock]
@@ -730,6 +743,7 @@ jbd_time_diff(unsigned int start, unsign
* @j_wait_checkpoint: Wait queue to trigger checkpointing
* @j_wait_commit: Wait queue to trigger commit
* @j_wait_updates: Wait queue to wait for updates to complete
+ * @j_wait_declare: Wait queue to wait for declarations to complete
* @j_checkpoint_mutex: Mutex for locking against concurrent checkpoints
* @j_head: Journal head - identifies the first unused block in the journal
* @j_tail: Journal tail - identifies the oldest still-used block in the
@@ -768,6 +782,8 @@ jbd_time_diff(unsigned int start, unsign
* @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
* number that will fit in j_blocksize
* @j_last_sync_writer: most recent pid which did a synchronous write
+ * @j_declare_jhs: array of journal_heads for write_declare_blocks
+ * @j_declare_bhs: array of buffer_heads for write_declare_blocks
* @j_private: An opaque pointer to fs-private information.
*/
@@ -841,6 +857,9 @@ struct journal_s
/* Wait queue to wait for updates to complete */
wait_queue_head_t j_wait_updates;
+ /* Wait queue to wait for declarations to complete */
+ wait_queue_head_t j_wait_declare;
+
/* Semaphore for locking against concurrent checkpoints */
struct mutex j_checkpoint_mutex;
@@ -970,6 +989,13 @@ struct journal_s
struct transaction_stats_s j_stats;
/*
+ * Arrays of jhs and bhs for write_declare_blocks, to avoid
+ * having to allocate them each time.
+ */
+ void *j_declare_jhs[64];
+ struct buffer_head *j_declare_bhs[64];
+
+ /*
* An opaque pointer to fs-private information. ext3 puts its
* superblock pointer here
*/
@@ -985,6 +1011,7 @@ struct journal_s
#define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */
#define JFS_LOADED 0x010 /* The journal superblock has been loaded */
#define JFS_BARRIER 0x020 /* Use IDE barriers */
+#define JFS_DECLARE 0x040 /* Declare data blocks before writing */
/*
* Function declarations for the journaling transaction and buffer
@@ -1100,6 +1127,7 @@ extern void journal_ack_err (journ
extern int journal_clear_err (journal_t *);
extern int journal_bmap(journal_t *, unsigned long, unsigned long *);
extern int journal_force_commit(journal_t *);
+extern int journal_write_declare(journal_t *);
/*
* journal_head management
@@ -1244,7 +1272,9 @@ static inline int jbd_space_needed(journ
#define BJ_LogCtl 6 /* Buffer contains log descriptors */
#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
#define BJ_Locked 8 /* Locked for I/O during commit */
-#define BJ_Types 9
+#define BJ_Declare 9 /* Needs to be declared first */
+#define BJ_DeclareDone 10 /* Has been declared */
+#define BJ_Types 11
extern int jbd_blocks_per_page(struct inode *inode);
--
* [patch 4/4] [ext3] Add journal guided resync (data=declared mode)
From: scjody @ 2009-10-01 22:39 UTC
To: linux-ext4, linux-raid; +Cc: linux-kernel, Andreas Dilger
We introduce a new data write mode known as declared mode. It is based on
ordered mode, except that a list of the blocks to be written during the
current transaction is added to the journal before the blocks themselves are
written to disk. Then, if the system crashes, we can resync only those blocks
during journal replay and skip the rest of the RAID array's resync.
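
For completeness, the mode is selected at mount time; a minimal sketch using
mount(2), with the device and mount point illustrative:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* /dev/md0 and /mnt are illustrative.  The journal must support
	 * the DECLARE_BLOCKS feature or the mount fails (see the
	 * ext3_fill_super() hunk below). */
	if (mount("/dev/md0", "/mnt", "ext3", 0, "data=declared") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}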
TODO: Add support to e2fsck.
TODO: The following sequence of events could cause resync to be skipped
incorrectly:
- An MD array that supports RESYNC_RANGE is undergoing resync.
- A filesystem on that array is mounted with data=declared.
- The machine crashes before the resync completes.
- The array is restarted and the filesystem is remounted.
- Recovery resyncs only the blocks that were undergoing writes during
the crash and skips the rest.
Addressing this requires even more communication between MD and ext, and
I need to think more about how to do this.
Index: linux-2.6.18-128.1.6/fs/ext3/file.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/ext3/file.c
+++ linux-2.6.18-128.1.6/fs/ext3/file.c
@@ -78,7 +78,8 @@ ext3_file_write(struct kiocb *iocb, cons
* Open question --- do we care about flushing timestamps too
* if the inode is IS_SYNC?
*/
- if (!ext3_should_journal_data(inode))
+ if (!ext3_should_journal_data(inode) &&
+ !ext3_should_declare_data(inode))
return ret;
goto force_commit;
Index: linux-2.6.18-128.1.6/fs/ext3/fsync.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/ext3/fsync.c
+++ linux-2.6.18-128.1.6/fs/ext3/fsync.c
@@ -66,8 +66,13 @@ int ext3_sync_file(struct file * file, s
* filemap_fdatawait() will encounter a ton of newly-dirtied pages
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
+ *
+ * data=declared:
+ * Declare blocks are written before data blocks, then the
+ * sync proceeds as for data=ordered.
*/
- if (ext3_should_journal_data(inode)) {
+ if (ext3_should_journal_data(inode) ||
+ ext3_should_declare_data(inode)) {
ret = ext3_force_commit(inode->i_sb);
goto out;
}
Index: linux-2.6.18-128.1.6/fs/ext3/inode.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/ext3/inode.c
+++ linux-2.6.18-128.1.6/fs/ext3/inode.c
@@ -1190,6 +1190,15 @@ static int commit_write_fn(handle_t *han
return ext3_journal_dirty_metadata(handle, bh);
}
+/* For commit_write() in data=declared mode */
+static int declared_commit_write_fn(handle_t *handle, struct buffer_head *bh)
+{
+ if (!buffer_mapped(bh) || buffer_freed(bh))
+ return 0;
+ set_buffer_uptodate(bh);
+ return ext3_journal_dirty_data(handle, bh);
+}
+
/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
@@ -1220,6 +1229,37 @@ static int ext3_ordered_commit_write(str
EXT3_I(inode)->i_disksize = new_i_size;
ret = generic_commit_write(file, page, from, to);
}
+
+ ret2 = ext3_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+ return ret;
+}
+
+static int ext3_declared_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ handle_t *handle = ext3_journal_current_handle();
+ struct inode *inode = page->mapping->host;
+ int ret = 0, ret2;
+ int partial = 0;
+ loff_t pos;
+
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial, declared_commit_write_fn);
+
+ if (ret == 0) {
+ pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ if (pos > EXT3_I(inode)->i_disksize)
+ EXT3_I(inode)->i_disksize = pos;
+ if (!partial)
+ SetPageUptodate(page);
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ }
+
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1741,14 +1781,30 @@ static const struct address_space_operat
.releasepage = ext3_releasepage,
};
+static const struct address_space_operations ext3_declared_aops = {
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_ordered_writepage,
+ .sync_page = block_sync_page,
+ .prepare_write = ext3_prepare_write,
+ .commit_write = ext3_declared_commit_write,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .direct_IO = ext3_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext3_set_aops(struct inode *inode)
{
if (ext3_should_order_data(inode))
inode->i_mapping->a_ops = &ext3_ordered_aops;
else if (ext3_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext3_writeback_aops;
- else
+ else if (ext3_should_journal_data(inode))
inode->i_mapping->a_ops = &ext3_journalled_aops;
+ else
+ inode->i_mapping->a_ops = &ext3_declared_aops;
}
/*
@@ -1845,9 +1901,12 @@ static int ext3_block_truncate_page(hand
if (ext3_should_journal_data(inode)) {
err = ext3_journal_dirty_metadata(handle, bh);
} else {
- if (ext3_should_order_data(inode))
+ if (ext3_should_order_data(inode) ||
+ ext3_should_declare_data(inode))
err = ext3_journal_dirty_data(handle, bh);
- mark_buffer_dirty(bh);
+
+ if (!ext3_should_declare_data(inode))
+ mark_buffer_dirty(bh);
}
unlock:
Index: linux-2.6.18-128.1.6/fs/ext3/super.c
===================================================================
--- linux-2.6.18-128.1.6.orig/fs/ext3/super.c
+++ linux-2.6.18-128.1.6/fs/ext3/super.c
@@ -391,6 +391,9 @@ static void ext3_put_super (struct super
int i, err;
ext3_xattr_put_super(sb);
+ journal_clear_features(sbi->s_journal, 0, 0,
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS);
+ journal_update_superblock(sbi->s_journal, 1);
err = journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
if (err < 0)
@@ -553,6 +556,8 @@ static int ext3_show_options(struct seq_
seq_puts(seq, ",data=ordered");
else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
seq_puts(seq, ",data=writeback");
+ else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_DECLARED_DATA)
+ seq_puts(seq, ",data=declared");
ext3_show_quota_options(seq, sb);
@@ -682,7 +687,7 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota
+ Opt_grpquota, Opt_data_declared
};
static match_table_t tokens = {
@@ -721,6 +726,7 @@ static match_table_t tokens = {
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
+ {Opt_data_declared, "data=declared"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -922,6 +928,9 @@ static int parse_options (char *options,
goto datacheck;
case Opt_data_writeback:
data_opt = EXT3_MOUNT_WRITEBACK_DATA;
+ goto datacheck;
+ case Opt_data_declared:
+ data_opt = EXT3_MOUNT_DECLARED_DATA;
datacheck:
if (is_remount) {
if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
@@ -1740,7 +1749,21 @@ static int ext3_fill_super (struct super
else
set_opt(sbi->s_mount_opt, JOURNAL_DATA);
break;
-
+ case EXT3_MOUNT_DECLARED_DATA:
+ if (!journal_check_available_features
+ (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) {
+ printk(KERN_ERR "EXT3-fs: Journal does not support "
+ "declared data journaling mode\n");
+ goto failed_mount4;
+ }
+ spin_lock(&sbi->s_journal->j_state_lock);
+ sbi->s_journal->j_flags |= JFS_DECLARE;
+ spin_unlock(&sbi->s_journal->j_state_lock);
+ if (!journal_set_features(sbi->s_journal, 0, 0,
+ JFS_FEATURE_INCOMPAT_DECLARE_BLOCKS)) {
+ printk(KERN_ERR "EXT3-fs: Cannot set declared mode.\n");
+ goto failed_mount4;
+ }
case EXT3_MOUNT_ORDERED_DATA:
case EXT3_MOUNT_WRITEBACK_DATA:
if (!journal_check_available_features
@@ -1797,6 +1820,7 @@ static int ext3_fill_super (struct super
printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_DECLARED_DATA ? "declared":
"writeback");
lock_kernel();
Index: linux-2.6.18-128.1.6/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/ext3_fs.h
+++ linux-2.6.18-128.1.6/include/linux/ext3_fs.h
@@ -357,11 +357,11 @@ struct ext3_inode {
#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
-#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
+#define EXT3_MOUNT_DATA_FLAGS 0x01C00 /* Mode for data writes: */
#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
-#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
+#define EXT3_MOUNT_DECLARED_DATA 0x01000 /* Declare data blocks before writing */
#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
@@ -383,6 +383,7 @@ struct ext3_inode {
#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT
#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS
#endif
+#define EXT3_MOUNT_UPDATE_JOURNAL 0x40000000 /* Update the journal format */
#define ext3_set_bit ext2_set_bit
#define ext3_set_bit_atomic ext2_set_bit_atomic
Index: linux-2.6.18-128.1.6/include/linux/ext3_jbd.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/ext3_jbd.h
+++ linux-2.6.18-128.1.6/include/linux/ext3_jbd.h
@@ -265,4 +265,15 @@ static inline int ext3_should_writeback_
return 0;
}
+static inline int ext3_should_declare_data(struct inode *inode)
+{
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_DECLARED_DATA)
+ return 1;
+ return 0;
+}
+
#endif /* _LINUX_EXT3_JBD_H */