From: Jonathan E Brassow <jbrassow@redhat.com>
To: device-mapper development <dm-devel@redhat.com>
Subject: mirroring: device failure tolerance
Date: Fri, 24 Jun 2005 15:16:16 -0500 [thread overview]
Message-ID: <12249dad6f7ea11612dfb0a87d22bbf3@redhat.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 188 bytes --]
Attached is a patch to provide device failure tolerance/detection for
mirroring.
brassow
P.S. A basic write-up of how things work is at
http://www.brassow.com/mirroring/index.html
[-- Attachment #2: 005.patch --]
[-- Type: application/octet-stream, Size: 38570 bytes --]
--- linux-2.6.12/drivers/md/dm-log.h-patch 2005-06-21 14:40:48.000000000 -0500
+++ linux-2.6.12/drivers/md/dm-log.h 2005-06-23 22:39:04.687203685 -0500
@@ -9,6 +9,11 @@
#include "dm.h"
+#define LOG_DIRTY 0
+#define LOG_CLEAN 1 /* if a region is clean, it is also in sync */
+#define LOG_NOSYNC 2
+#define LOG_REMOTE_RECOVERING 3
+
typedef sector_t region_t;
struct dirty_log_type;
@@ -23,6 +28,7 @@
const char *name;
struct module *module;
unsigned int use_count;
+ unsigned int multi_node;
int (*ctr)(struct dirty_log *log, struct dm_target *ti,
unsigned int argc, char **argv);
@@ -128,3 +134,13 @@
void dm_dirty_log_exit(void);
#endif
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only. This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
--- linux-2.6.12/drivers/md/dm.c-patch 2005-06-21 14:40:48.000000000 -0500
+++ linux-2.6.12/drivers/md/dm.c 2005-06-23 15:31:03.000000000 -0500
@@ -1055,14 +1055,14 @@
if (test_bit(DMF_BLOCK_IO, &md->flags))
goto out_read_unlock;
- error = __lock_fs(md);
- if (error)
- goto out_read_unlock;
-
map = dm_get_table(md);
if (map)
dm_table_presuspend_targets(map);
+ error = __lock_fs(md);
+ if (error)
+ goto out_read_unlock;
+
up_read(&md->lock);
/*
--- linux-2.6.12/drivers/md/dm-raid1.c-patch 2005-06-21 14:40:48.000000000 -0500
+++ linux-2.6.12/drivers/md/dm-raid1.c 2005-06-24 15:11:53.150233021 -0500
@@ -1,11 +1,13 @@
/*
* Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2005 Red Hat Inc.
*
* This file is released under the GPL.
*/
#include "dm.h"
#include "dm-bio-list.h"
+#include "dm-bio-record.h"
#include "dm-io.h"
#include "dm-log.h"
#include "kcopyd.h"
@@ -28,6 +30,8 @@
queue_work(_kmirrord_wq, &_kmirrord_work);
}
+static struct workqueue_struct *_mir_mond_wq;
+
/*-----------------------------------------------------------------
* Region hash
*
@@ -91,7 +95,8 @@
RH_CLEAN,
RH_DIRTY,
RH_NOSYNC,
- RH_RECOVERING
+ RH_RECOVERING,
+ RH_REMOTE_RECOVERING
};
struct region {
@@ -120,7 +125,7 @@
}
/* FIXME move this */
-static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
+static int queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
static void *region_alloc(unsigned int __nocast gfp_mask, void *pool_data)
{
@@ -234,7 +239,7 @@
read_unlock(&rh->hash_lock);
nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
- nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+ nreg->state = (rh->log->type->in_sync(rh->log, region, 1) == LOG_CLEAN) ?
RH_CLEAN : RH_NOSYNC;
nreg->rh = rh;
nreg->key = region;
@@ -252,15 +257,15 @@
else {
__rh_insert(rh, nreg);
- if (nreg->state == RH_CLEAN) {
- spin_lock(&rh->region_lock);
- list_add(&nreg->list, &rh->clean_regions);
- spin_unlock(&rh->region_lock);
- }
reg = nreg;
}
write_unlock_irq(&rh->hash_lock);
read_lock(&rh->hash_lock);
+ if (reg->state == RH_CLEAN) {
+ spin_lock(&rh->region_lock);
+ list_add(&reg->list, &rh->clean_regions);
+ spin_unlock(&rh->region_lock);
+ }
return reg;
}
@@ -278,33 +283,47 @@
static int rh_state(struct region_hash *rh, region_t region, int may_block)
{
- int r;
+ int r = 0;
struct region *reg;
read_lock(&rh->hash_lock);
reg = __rh_lookup(rh, region);
+ if (reg)
+ r = reg->state;
read_unlock(&rh->hash_lock);
- if (reg)
- return reg->state;
+ if (r)
+ return r;
/*
- * The region wasn't in the hash, so we fall back to the
- * dirty log.
+ * The region wasn't in the hash, so we fall back to the dirty log.
*/
- r = rh->log->type->in_sync(rh->log, region, may_block);
+ switch(rh->log->type->in_sync(rh->log, region, may_block)){
+ case LOG_CLEAN:
+ r = RH_CLEAN;
+ break;
+ case LOG_DIRTY:
+ r = RH_DIRTY;
+ break;
+ case LOG_REMOTE_RECOVERING:
+ r = RH_REMOTE_RECOVERING;
+ break;
+ default:
+ r = RH_NOSYNC;
+ break;
+ }
/*
* Any error from the dirty log (eg. -EWOULDBLOCK) gets
* taken as a RH_NOSYNC
*/
- return r == 1 ? RH_CLEAN : RH_NOSYNC;
+ return r;
}
-static inline int rh_in_sync(struct region_hash *rh,
- region_t region, int may_block)
+static inline int rh_in_sync(struct region_hash *rh, region_t region)
{
- int state = rh_state(rh, region, may_block);
+ int state = rh_state(rh, region, 0);
+
return state == RH_CLEAN || state == RH_DIRTY;
}
@@ -312,9 +331,8 @@
{
struct bio *bio;
- while ((bio = bio_list_pop(bio_list))) {
+ while ((bio = bio_list_pop(bio_list)))
queue_bio(ms, bio, WRITE);
- }
}
static void rh_update_states(struct region_hash *rh)
@@ -333,7 +351,7 @@
list_splice(&rh->clean_regions, &clean);
INIT_LIST_HEAD(&rh->clean_regions);
- list_for_each_entry (reg, &clean, list) {
+ list_for_each_entry(reg, &clean, list) {
rh->log->type->clear_region(rh->log, reg->key);
list_del(&reg->hash_list);
}
@@ -343,9 +361,10 @@
list_splice(&rh->recovered_regions, &recovered);
INIT_LIST_HEAD(&rh->recovered_regions);
- list_for_each_entry (reg, &recovered, list)
+ list_for_each_entry(reg, &recovered, list)
list_del(&reg->hash_list);
}
+
spin_unlock(&rh->region_lock);
write_unlock_irq(&rh->hash_lock);
@@ -365,7 +384,7 @@
if (!list_empty(&recovered))
rh->log->type->flush(rh->log);
- list_for_each_entry_safe (reg, next, &clean, list)
+ list_for_each_entry_safe(reg, next, &clean, list)
mempool_free(reg, rh->region_pool);
}
@@ -375,16 +394,24 @@
read_lock(&rh->hash_lock);
reg = __rh_find(rh, region);
+
+ /*
+ * We lock around this to prevent a race with rh_dec.
+ * We unlock because the mark can block - holding things up
+ */
+ spin_lock_irq(&rh->region_lock);
+ atomic_inc(&reg->pending);
+ spin_unlock_irq(&rh->region_lock);
+
if (reg->state == RH_CLEAN) {
rh->log->type->mark_region(rh->log, reg->key);
spin_lock_irq(&rh->region_lock);
reg->state = RH_DIRTY;
- list_del_init(&reg->list); /* take off the clean list */
+ list_del_init(&reg->list); /* Take off the clean list. */
spin_unlock_irq(&rh->region_lock);
}
- atomic_inc(&reg->pending);
read_unlock(&rh->hash_lock);
}
@@ -406,17 +433,17 @@
reg = __rh_lookup(rh, region);
read_unlock(&rh->hash_lock);
+ spin_lock_irqsave(&rh->region_lock, flags);
if (atomic_dec_and_test(&reg->pending)) {
- spin_lock_irqsave(&rh->region_lock, flags);
if (reg->state == RH_RECOVERING) {
list_add_tail(&reg->list, &rh->quiesced_regions);
} else {
reg->state = RH_CLEAN;
list_add(&reg->list, &rh->clean_regions);
}
- spin_unlock_irqrestore(&rh->region_lock, flags);
should_wake = 1;
}
+ spin_unlock_irqrestore(&rh->region_lock, flags);
if (should_wake)
wake();
@@ -452,7 +479,6 @@
/* Already quiesced ? */
if (atomic_read(&reg->pending))
list_del_init(&reg->list);
-
else {
list_del_init(&reg->list);
list_add(&reg->list, &rh->quiesced_regions);
@@ -482,7 +508,7 @@
if (!list_empty(&rh->quiesced_regions)) {
reg = list_entry(rh->quiesced_regions.next,
struct region, list);
- list_del_init(&reg->list); /* remove from the quiesced list */
+ list_del_init(&reg->list); /* Remove from the quiesced list. */
}
spin_unlock_irq(&rh->region_lock);
@@ -538,8 +564,10 @@
/*-----------------------------------------------------------------
* Mirror set structures.
*---------------------------------------------------------------*/
+
struct mirror {
- atomic_t error_count;
+ atomic_t error_count; /* Error counter to flag mirror failure. */
+ struct mirror_set *ms;
struct dm_dev *dev;
sector_t offset;
};
@@ -550,36 +578,59 @@
struct region_hash rh;
struct kcopyd_client *kcopyd_client;
- spinlock_t lock; /* protects the next two lists */
+ spinlock_t lock; /* protects the lists */
struct bio_list reads;
struct bio_list writes;
+ struct bio_list failures;
+ struct work_struct failure_work;
+ struct completion failure_completion;
/* recovery */
+ atomic_t suspended;
region_t nr_regions;
int in_sync;
- unsigned int nr_mirrors;
+ spinlock_t choose_lock; /* protects select in choose_mirror(). */
+ atomic_t read_count; /* Read counter for read balancing. */
+ unsigned int nr_mirrors; /* # of mirrors in this set. */
+ unsigned int read_mirror; /* Last mirror read. */
+ struct mirror *default_mirror; /* Default mirror. */
struct mirror mirror[0];
};
+struct bio_map_info {
+ struct mirror *bmi_m;
+ struct dm_bio_details bmi_bd;
+};
+
+static mempool_t *bio_map_info_pool = NULL;
+
+static void *bio_map_info_alloc(unsigned int gfp_mask, void *pool_data){
+ return kmalloc(sizeof(struct bio_map_info), gfp_mask);
+}
+
+static void bio_map_info_free(void *element, void *pool_data){
+ kfree(element);
+}
+
/*
* Every mirror should look like this one.
*/
#define DEFAULT_MIRROR 0
/*
- * This is yucky. We squirrel the mirror_set struct away inside
- * bi_next for write buffers. This is safe since the bh
+ * This is yucky. We squirrel the mirror struct away inside
+ * bi_next for read+write buffers. This is safe since the bh
* doesn't get submitted to the lower levels of block layer.
*/
-static struct mirror_set *bio_get_ms(struct bio *bio)
+static struct mirror *bio_get_m(struct bio *bio)
{
- return (struct mirror_set *) bio->bi_next;
+ return (struct mirror *) bio->bi_next;
}
-static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
+static void bio_set_m(struct bio *bio, struct mirror *m)
{
- bio->bi_next = (struct bio *) ms;
+ bio->bi_next = (struct bio *) m;
}
/*-----------------------------------------------------------------
@@ -602,12 +653,12 @@
{
int r;
unsigned int i;
- struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
+ struct io_region from, to[ms->nr_mirrors - 1], *dest;
struct mirror *m;
unsigned long flags = 0;
- /* fill in the source */
- m = ms->mirror + DEFAULT_MIRROR;
+ /* Fill in the source. */
+ m = ms->default_mirror;
from.bdev = m->dev->bdev;
from.sector = m->offset + region_to_sector(reg->rh, reg->key);
if (reg->key == (ms->nr_regions - 1)) {
@@ -623,7 +674,7 @@
/* fill in the destinations */
for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
- if (i == DEFAULT_MIRROR)
+ if (&ms->mirror[i] == ms->default_mirror)
continue;
m = ms->mirror + i;
@@ -666,49 +717,208 @@
*/
if (!ms->in_sync &&
(log->type->get_sync_count(log) == ms->nr_regions)) {
- /* the sync is complete */
+ /* The sync is complete. */
dm_table_event(ms->ti->table);
ms->in_sync = 1;
}
}
+/*
+ * Remap a buffer to a particular mirror.
+ */
+static sector_t map_sector(struct mirror *m, struct bio *bio)
+{
+ return m->offset + (bio->bi_sector - m->ms->ti->begin);
+}
+
+static void map_bio(struct mirror *m, struct bio *bio)
+{
+ bio->bi_bdev = m->dev->bdev;
+ bio->bi_sector = map_sector(m, bio);
+}
+
+static void map_region(struct io_region *io, struct mirror *m,
+ struct bio *bio)
+{
+ io->bdev = m->dev->bdev;
+ io->sector = map_sector(m, bio);
+ io->count = bio->bi_size >> 9;
+}
+
/*-----------------------------------------------------------------
* Reads
*---------------------------------------------------------------*/
-static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
+/* FIXME: do something smarter for read balancing. */
+
+/*
+ * Select a mirror to queue the read to (read balancing).
+ *
+ * The selection process must be locked, because the daemon
+ * and the mapping function can access it concurrently.
+ */
+#define MIN_READS 128
+static struct mirror *choose_mirror(struct mirror_set *ms, struct mirror *m)
{
- /* FIXME: add read balancing */
- return ms->mirror + DEFAULT_MIRROR;
+ int i, retry;
+ unsigned long flags;
+ struct mirror *ret = NULL;
+
+ spin_lock_irqsave(&ms->choose_lock, flags);
+
+ if (unlikely(m == ms->default_mirror)) {
+ i = DEFAULT_MIRROR;
+ atomic_set(&ms->read_count, MIN_READS);
+ } else {
+ i = ms->read_mirror;
+ }
+
+ for (retry = 0; retry < ms->nr_mirrors; ) {
+ i %= ms->nr_mirrors;
+ ret = ms->mirror + i;
+
+ if (unlikely(atomic_read(&ret->error_count))) {
+ retry++;
+ i++;
+ } else {
+ /*
+ * Guarantee that a number of read IOs
+ * get queued to the same mirror.
+ */
+ if (atomic_dec_and_test(&ms->read_count)) {
+ atomic_set(&ms->read_count, MIN_READS);
+ i++;
+ }
+
+ ms->read_mirror = i;
+ break;
+ }
+ }
+
+ if(unlikely(m == ms->default_mirror)){
+ ms->default_mirror = ret;
+ }
+
+ spin_unlock_irqrestore(&ms->choose_lock, flags);
+
+ if (unlikely(atomic_read(&ret->error_count))){
+ DMERR("All mirror devices are dead. Unable to choose_mirror.");
+ return NULL;
+ }
+
+ return ret;
}
/*
- * remap a buffer to a particular mirror.
+ * Fail a mirror and optionally select another one as the default.
*/
-static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
+static void fail_mirror(struct mirror *m)
{
- bio->bi_bdev = m->dev->bdev;
- bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
+ DMINFO("incrementing error_count on %s", m->dev->name);
+ atomic_inc(&m->error_count);
+
+ choose_mirror(m->ms, m);
+}
+
+static int default_mirror(struct mirror *m)
+{
+ return !atomic_read(&m->ms->default_mirror->error_count);
+}
+
+static void read_callback(unsigned long error, void *context)
+{
+ struct bio *bio = (struct bio *)context;
+ struct mirror *m;
+
+ m = bio_get_m(bio);
+ bio_set_m(bio, NULL);
+
+ if (unlikely(error)) {
+ DMWARN("A read failure occurred on a mirror device.");
+ fail_mirror(m);
+ if (likely(default_mirror(m))) {
+ DMWARN("Trying different device.");
+ queue_bio(m->ms, bio, bio_rw(bio));
+ } else {
+ DMERR("No other device available, failing I/O.");
+ bio_endio(bio, 0, -EIO);
+ }
+ } else
+ bio_endio(bio, bio->bi_size, 0);
+}
+
+/* Asynchronous read. */
+static void read_async_bio(struct mirror *m, struct bio *bio)
+{
+ struct io_region io;
+
+ map_region(&io, m, bio);
+ bio_set_m(bio, m);
+ dm_io_async_bvec(1, &io, READ,
+ bio->bi_io_vec + bio->bi_idx,
+ read_callback, bio);
}
static void do_reads(struct mirror_set *ms, struct bio_list *reads)
{
- region_t region;
struct bio *bio;
struct mirror *m;
while ((bio = bio_list_pop(reads))) {
- region = bio_to_region(&ms->rh, bio);
-
/*
* We can only read balance if the region is in sync.
*/
- if (rh_in_sync(&ms->rh, region, 0))
- m = choose_mirror(ms, bio->bi_sector);
- else
- m = ms->mirror + DEFAULT_MIRROR;
+ if (likely(rh_in_sync(&ms->rh, bio_to_region(&ms->rh, bio))))
+ m = choose_mirror(ms, NULL);
+ else {
+ m = ms->default_mirror;
+
+ /* If the default fails, we give up. */
+ if (unlikely(m && atomic_read(&m->error_count)))
+ m = NULL;
+ }
- map_bio(ms, m, bio);
- generic_make_request(bio);
+ if (likely(m)){
+ read_async_bio(m, bio);
+ }else{
+ bio_endio(bio, 0, -EIO);
+ }
+ }
+}
+
+static void write_failure_handler(void *data)
+{
+ int i = 0;
+ struct bio *bio;
+ struct bio_list failed_writes;
+ struct mirror_set *ms = (struct mirror_set *)data;
+ struct dirty_log *log = ms->rh.log;
+
+
+ dm_table_event(ms->ti->table);
+
+ if(log->type->multi_node){
+ DMERR("Event signaled. Waiting to start failure handling.");
+ wait_for_completion(&ms->failure_completion);
+ DMINFO("Wait complete");
+ }
+
+ /*
+ * Device must be suspended to prevent corruption in
+ * cluster context.
+ */
+
+ /* Take list out to handle endios. */
+ spin_lock(&ms->lock);
+ failed_writes = ms->failures;
+ bio_list_init(&ms->failures);
+ spin_unlock(&ms->lock);
+
+ while ((bio = bio_list_pop(&failed_writes))) {
+ DMINFO("Completing I/O : %d", i++);
+ bio_endio(bio, bio->bi_size, 0);
+ }
+ if(log->type->multi_node){
+ DMERR("Failure handling complete.");
}
}
@@ -724,13 +934,12 @@
*---------------------------------------------------------------*/
static void write_callback(unsigned long error, void *context)
{
- unsigned int i;
- int uptodate = 1;
+ unsigned int i, ret = 0;
struct bio *bio = (struct bio *) context;
struct mirror_set *ms;
- ms = bio_get_ms(bio);
- bio_set_ms(bio, NULL);
+ ms = (bio_get_m(bio))->ms;
+ bio_set_m(bio, NULL);
/*
* NOTE: We don't decrement the pending count here,
@@ -738,48 +947,98 @@
* This way we handle both writes to SYNC and NOSYNC
* regions with the same code.
*/
+ if (unlikely(error)) {
+ int uptodate = 0, run;
+
+ DMERR("Error during write occurred.");
- if (error) {
/*
- * only error the io if all mirrors failed.
- * FIXME: bogus
+ * Test all bits - if all failed, fail io.
+ * Otherwise, go through hassle of failing a device...
*/
- uptodate = 0;
- for (i = 0; i < ms->nr_mirrors; i++)
- if (!test_bit(i, &error)) {
+ for (i = 0; i < ms->nr_mirrors; i++){
+ if (test_bit(i, &error))
+ fail_mirror(ms->mirror + i);
+ else
uptodate = 1;
- break;
+
+ }
+
+ if (likely(uptodate)) {
+ spin_lock(&ms->lock);
+ if (atomic_read(&ms->suspended)) {
+ /*
+ * The device is suspended, it is
+ * safe to complete I/O.
+ */
+ spin_unlock(&ms->lock);
+ } else {
+ /*
+ * Failed writes on the list ->
+ * process is scheduled.
+ *
+ * None on the list ->
+ * process must block for the
+ * suspend, then complete the I/O.
+ */
+ run = !ms->failures.head;
+ bio_list_add(&ms->failures, bio);
+ spin_unlock(&ms->lock);
+
+ if (run){
+ queue_work(_mir_mond_wq,
+ &ms->failure_work);
+ }
+
+ /*
+ * DO NOT SIGNAL COMPLETION, work thread will call
+ * bio_endio()
+ */
+ return;
}
+ } else {
+ DMERR("All replicated volumes dead, failing I/O");
+ /* None of the writes succeeded, fail the I/O. */
+ ret = -EIO;
+ }
}
- bio_endio(bio, bio->bi_size, 0);
+
+ bio_endio(bio, bio->bi_size, ret);
}
static void do_write(struct mirror_set *ms, struct bio *bio)
{
unsigned int i;
- struct io_region io[KCOPYD_MAX_REGIONS+1];
+ struct io_region io[ms->nr_mirrors], *dest = io;
struct mirror *m;
+ struct dirty_log *log = ms->rh.log;
- for (i = 0; i < ms->nr_mirrors; i++) {
- m = ms->mirror + i;
-
- io[i].bdev = m->dev->bdev;
- io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
- io[i].count = bio->bi_size >> 9;
+ for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) {
+ if (likely(!atomic_read(&m->error_count) || log->type->multi_node))
+ map_region(dest++, m, bio);
}
- bio_set_ms(bio, ms);
- dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
- bio->bi_io_vec + bio->bi_idx,
- write_callback, bio);
+ if (likely(dest - io)) {
+ /*
+ * We can use the default mirror here, because we
+ * only need it in order to retrieve the reference
+ * to the mirror set in write_callback().
+ */
+ bio_set_m(bio, ms->default_mirror);
+ dm_io_async_bvec(dest - io, io, WRITE,
+ bio->bi_io_vec + bio->bi_idx,
+ write_callback, bio);
+ } else
+ bio_endio(bio, bio->bi_size, -EIO);
}
static void do_writes(struct mirror_set *ms, struct bio_list *writes)
{
- int state;
struct bio *bio;
struct bio_list sync, nosync, recover, *this_list = NULL;
+ struct bio_list tmp;
+ /* Nothing to do... */
if (!writes->head)
return;
@@ -789,10 +1048,10 @@
bio_list_init(&sync);
bio_list_init(&nosync);
bio_list_init(&recover);
+ bio_list_init(&tmp);
while ((bio = bio_list_pop(writes))) {
- state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
- switch (state) {
+ switch (rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1)) {
case RH_CLEAN:
case RH_DIRTY:
this_list = &sync;
@@ -805,15 +1064,20 @@
case RH_RECOVERING:
this_list = &recover;
break;
+
+ case RH_REMOTE_RECOVERING:
+ this_list = &tmp;
+ break;
}
bio_list_add(this_list, bio);
}
+ bio_list_merge(writes, &tmp);
/*
* Increment the pending counts for any regions that will
* be written to (writes to recover regions are going to
- * be delayed).
+ * be delayed) and flush the dirty log.
*/
rh_inc_pending(&ms->rh, &sync);
rh_inc_pending(&ms->rh, &nosync);
@@ -825,13 +1089,13 @@
while ((bio = bio_list_pop(&sync)))
do_write(ms, bio);
- while ((bio = bio_list_pop(&recover)))
- rh_delay(&ms->rh, bio);
-
while ((bio = bio_list_pop(&nosync))) {
- map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
+ map_bio(ms->default_mirror, bio);
generic_make_request(bio);
}
+
+ while ((bio = bio_list_pop(&recover)))
+ rh_delay(&ms->rh, bio);
}
/*-----------------------------------------------------------------
@@ -861,8 +1125,9 @@
{
struct mirror_set *ms;
+ /* FIXME: adding/deleting sets can take forever in busy situations. */
down_read(&_mirror_sets_lock);
- list_for_each_entry (ms, &_mirror_sets, list)
+ list_for_each_entry(ms, &_mirror_sets, list)
do_mirror(ms);
up_read(&_mirror_sets_lock);
}
@@ -891,17 +1156,27 @@
memset(ms, 0, len);
spin_lock_init(&ms->lock);
+ spin_lock_init(&ms->choose_lock);
ms->ti = ti;
ms->nr_mirrors = nr_mirrors;
ms->nr_regions = dm_sector_div_up(ti->len, region_size);
ms->in_sync = 0;
+ ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
+
+ atomic_set(&ms->suspended, 0);
if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
ti->error = "dm-mirror: Error creating dirty region hash";
kfree(ms);
return NULL;
}
+
+ atomic_set(&ms->read_count, MIN_READS);
+
+ bio_list_init(&ms->failures);
+ INIT_WORK(&ms->failure_work, write_failure_handler, ms);
+ init_completion(&ms->failure_completion);
return ms;
}
@@ -926,6 +1201,7 @@
unsigned int mirror, char **argv)
{
sector_t offset;
+ struct mirror *m = ms->mirror + mirror;
if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
ti->error = "dm-mirror: Invalid offset";
@@ -933,13 +1209,14 @@
}
if (dm_get_device(ti, argv[0], offset, ti->len,
- dm_table_get_mode(ti->table),
- &ms->mirror[mirror].dev)) {
+ dm_table_get_mode(ti->table), &m->dev)) {
ti->error = "dm-mirror: Device lookup failure";
return -ENXIO;
}
- ms->mirror[mirror].offset = offset;
+ atomic_set(&m->error_count, 0);
+ m->offset = offset;
+ m->ms = ms;
return 0;
}
@@ -1028,7 +1305,7 @@
argc -= args_used;
if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
- nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
+ nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS) {
ti->error = "dm-mirror: Invalid number of mirrors";
dm_destroy_dirty_log(dl);
return -EINVAL;
@@ -1059,7 +1336,7 @@
argc -= 2;
}
- ti->private = ms;
+ ti->private = ms->mirror;
r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
if (r) {
@@ -1067,100 +1344,185 @@
return r;
}
+ ms->read_mirror = 1;
+
add_mirror_set(ms);
return 0;
}
static void mirror_dtr(struct dm_target *ti)
{
- struct mirror_set *ms = (struct mirror_set *) ti->private;
+ struct mirror_set *ms = ((struct mirror *) ti->private)->ms;
del_mirror_set(ms);
kcopyd_client_destroy(ms->kcopyd_client);
free_context(ms, ti, ms->nr_mirrors);
}
-static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
+static int queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
{
- int should_wake = 0;
- struct bio_list *bl;
+ int should_wake;
+ struct bio_list *bl = rw == WRITE ? &ms->writes : &ms->reads;
- bl = (rw == WRITE) ? &ms->writes : &ms->reads;
spin_lock(&ms->lock);
- should_wake = !(bl->head);
+ should_wake = !bl->head;
bio_list_add(bl, bio);
spin_unlock(&ms->lock);
if (should_wake)
wake();
+
+ return 0;
}
/*
- * Mirror mapping function
+ * Mirror mapping function.
*/
static int mirror_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
int r, rw = bio_rw(bio);
- struct mirror *m;
- struct mirror_set *ms = ti->private;
-
- map_context->ll = bio->bi_sector >> ms->rh.region_shift;
-
- if (rw == WRITE) {
- queue_bio(ms, bio, rw);
- return 0;
+ struct mirror *m = (struct mirror *) ti->private;
+ struct mirror_set *ms = m->ms;
+ struct dm_bio_details *bd;
+ struct bio_map_info *bmi;
+
+ /* Queue writes to daemon to duplicate them to all mirrors. */
+ if (rw == WRITE){
+ /* Save region for mirror_end_io() handler. */
+ map_context->ll = bio_to_region(&ms->rh, bio);
+
+ return queue_bio(ms, bio, rw);
+ }
+
+ /* From here down, it's about READS */
+
+ bmi = mempool_alloc(bio_map_info_pool, GFP_KERNEL);
+
+ if(bmi){
+ /* without this, a read is not retryable */
+ bd = &bmi->bmi_bd;
+ dm_bio_record(bd, bio);
+ map_context->ptr = bmi;
+ } else {
+ /* We could fail now, but we can at least give it a shot.
+ * The bd is only used to retry in the event of a failure
+ * anyway. If the read fails, we can fail the I/O then. */
+ map_context->ptr = NULL;
+ }
+
+ /* Ask dirty log non-blocking, if region's in sync. */
+ r = ms->rh.log->type->in_sync(ms->rh.log, bio_to_region(&ms->rh, bio), 0);
+ if (unlikely(r < 0)) {
+ if (likely(r == -EWOULDBLOCK)) /* FIXME: ugly */
+ r = 0;
+ else
+ return r; /* Can't carry on w/o dirty log. */
}
- r = ms->rh.log->type->in_sync(ms->rh.log,
- bio_to_region(&ms->rh, bio), 0);
- if (r < 0 && r != -EWOULDBLOCK)
- return r;
-
- if (r == -EWOULDBLOCK) /* FIXME: ugly */
- r = 0;
-
- /*
- * We don't want to fast track a recovery just for a read
- * ahead. So we just let it silently fail.
- * FIXME: get rid of this.
- */
- if (!r && rw == READA)
- return -EIO;
+ /* Region in sync. */
+ if (likely(r == LOG_CLEAN)) {
+ /*
+ * Optimize reads by not handing them to the daemon.
+ *
+ * In case they fail, queue them for another shot
+ * in the mirror_end_io() function.
+ */
+ m = choose_mirror(ms, NULL);
+ if (likely(m)) {
+ bmi->bmi_m = m;
+ map_bio(m, bio);
+ return 1; /* Mapped -> queue request. */
+ } else{
+ mempool_free(bmi, bio_map_info_pool);
+ return -EIO;
+ }
+ } else {
+ /*
+ * We don't want to fast track a recovery just for
+ * a read ahead. So we just let it silently fail.
+ *
+ * FIXME: get rid of this.
+ */
+ if (rw == READA)
+ return -EIO;
- if (!r) {
- /* Pass this io over to the daemon */
+ /* Queue reads to out of sync regions to the daemon. */
queue_bio(ms, bio, rw);
- return 0;
}
- m = choose_mirror(ms, bio->bi_sector);
- if (!m)
- return -EIO;
-
- map_bio(ms, m, bio);
- return 1;
+ return 0;
}
+/*
+ * End io handler.
+ *
+ * Decrements write pending count on regions
+ * and fails mirrors on error.
+ */
static int mirror_end_io(struct dm_target *ti, struct bio *bio,
int error, union map_info *map_context)
{
int rw = bio_rw(bio);
- struct mirror_set *ms = (struct mirror_set *) ti->private;
- region_t region = map_context->ll;
+ struct mirror *m = NULL;
/*
* We need to dec pending if this was a write.
*/
- if (rw == WRITE)
- rh_dec(&ms->rh, region);
+ if (rw == WRITE){
+ m = (struct mirror *)ti->private;
+ rh_dec(&m->ms->rh, map_context->ll); /* Region squirreled. */
+ return error;
+ }
- return 0;
+ if (unlikely(error)) {
+ DMERR("A read failure occurred on a mirror device.");
+ if(!map_context->ptr){
+ /*
+ * There wasn't enough memory to record necessary
+ * information for a retry.
+ */
+ DMERR("Out of memory causing inability to retry read.");
+ return -EIO;
+ }
+ m = ((struct bio_map_info *)map_context->ptr)->bmi_m;
+ fail_mirror(m); /* Flag error on mirror. */
+
+ /*
+ * A failed read needs to get queued
+ * to the daemon for another shot at
+ * one (if any) of the intact mirrors.
+ */
+ if (rw == READ && default_mirror(m)) {
+ struct dm_bio_details *bd = &(((struct bio_map_info *)map_context->ptr)->bmi_bd);
+
+ DMWARN("Trying different device.");
+ dm_bio_restore(bd, bio);
+ mempool_free(map_context->ptr, bio_map_info_pool);
+ map_context->ptr = NULL;
+ queue_bio(m->ms, bio, rw);
+ return 1; /* We want another shot on the bio. */
+ }
+ DMERR("All replicated volumes dead, failing I/O");
+ }
+ if(map_context->ptr)
+ mempool_free(map_context->ptr, bio_map_info_pool);
+
+ /* ATTENTION -- we want to return the error, right? */
+ return error;
+}
+
+static void mirror_presuspend(struct dm_target *ti)
+{
+ struct mirror_set *ms = ((struct mirror *) ti->private)->ms;
+
+ atomic_set(&ms->suspended, 1);
+ complete(&ms->failure_completion);
}
static void mirror_postsuspend(struct dm_target *ti)
{
- struct mirror_set *ms = (struct mirror_set *) ti->private;
+ struct mirror_set *ms = ((struct mirror *) ti->private)->ms;
struct dirty_log *log = ms->rh.log;
rh_stop_recovery(&ms->rh);
@@ -1171,27 +1533,35 @@
static void mirror_resume(struct dm_target *ti)
{
- struct mirror_set *ms = (struct mirror_set *) ti->private;
+ struct mirror_set *ms = ((struct mirror *) ti->private)->ms;
struct dirty_log *log = ms->rh.log;
+
if (log->type->resume && log->type->resume(log))
/* FIXME: need better error handling */
DMWARN("log resume failed");
+
rh_start_recovery(&ms->rh);
+ atomic_set(&ms->suspended, 0);
}
static int mirror_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen)
{
- unsigned int m, sz;
- struct mirror_set *ms = (struct mirror_set *) ti->private;
+ char buffer[32];
+ unsigned int sz;
+ struct mirror *m = (struct mirror *) ti->private;
+ struct mirror_set *ms = m->ms;
sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
switch (type) {
case STATUSTYPE_INFO:
DMEMIT("%d ", ms->nr_mirrors);
- for (m = 0; m < ms->nr_mirrors; m++)
- DMEMIT("%s ", ms->mirror[m].dev->name);
+ for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) {
+ format_dev_t(buffer, m->dev->bdev->bd_dev);
+ DMEMIT("%s/%s ", buffer,
+ atomic_read(&m->error_count) ? "D" : "A");
+ }
DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
ms->rh.log->type->get_sync_count(ms->rh.log),
@@ -1200,14 +1570,16 @@
case STATUSTYPE_TABLE:
DMEMIT("%d ", ms->nr_mirrors);
- for (m = 0; m < ms->nr_mirrors; m++)
- DMEMIT("%s " SECTOR_FORMAT " ",
- ms->mirror[m].dev->name, ms->mirror[m].offset);
+ for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) {
+ format_dev_t(buffer, m->dev->bdev->bd_dev);
+ DMEMIT("%s " SECTOR_FORMAT " ", buffer, m->offset);
+ }
}
return 0;
}
+
static struct target_type mirror_target = {
.name = "mirror",
.version = {1, 0, 1},
@@ -1216,6 +1588,7 @@
.dtr = mirror_dtr,
.map = mirror_map,
.end_io = mirror_end_io,
+ .presuspend = mirror_presuspend,
.postsuspend = mirror_postsuspend,
.resume = mirror_resume,
.status = mirror_status,
@@ -1225,6 +1598,11 @@
{
int r;
+ bio_map_info_pool = mempool_create(100, bio_map_info_alloc, bio_map_info_free, NULL);
+ if(!bio_map_info_pool){
+ return -ENOMEM;
+ }
+
r = dm_dirty_log_init();
if (r)
return r;
@@ -1233,16 +1611,25 @@
if (!_kmirrord_wq) {
DMERR("couldn't start kmirrord");
dm_dirty_log_exit();
- return r;
+ return -ENOMEM;
}
INIT_WORK(&_kmirrord_work, do_work, NULL);
+ _mir_mond_wq = create_workqueue("mir_mond");
+ if (!_mir_mond_wq) {
+ DMERR("couldn't start mir_mond");
+ dm_dirty_log_exit();
+ destroy_workqueue(_kmirrord_wq);
+ return -ENOMEM;
+ }
+
r = dm_register_target(&mirror_target);
if (r < 0) {
DMERR("%s: Failed to register mirror target",
mirror_target.name);
dm_dirty_log_exit();
destroy_workqueue(_kmirrord_wq);
+ destroy_workqueue(_mir_mond_wq);
}
return r;
@@ -1265,5 +1652,15 @@
module_exit(dm_mirror_exit);
MODULE_DESCRIPTION(DM_NAME " mirror target");
-MODULE_AUTHOR("Joe Thornber");
+MODULE_AUTHOR("Joe Thornber / Jon Brassow / Heinz Mauelshagen");
MODULE_LICENSE("GPL");
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only. This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
--- linux-2.6.12/drivers/md/dm-log.c-patch 2005-06-21 14:40:48.000000000 -0500
+++ linux-2.6.12/drivers/md/dm-log.c 2005-06-24 14:28:52.649687339 -0500
@@ -15,6 +15,7 @@
static LIST_HEAD(_log_types);
static DEFINE_SPINLOCK(_lock);
+
int dm_register_dirty_log_type(struct dirty_log_type *type)
{
spin_lock(&_lock);
@@ -150,6 +151,7 @@
/*
* Disk log fields
*/
+ int log_dev_failed;
struct dm_dev *log_dev;
struct log_header header;
@@ -276,8 +278,7 @@
unsigned long ebits;
bits_to_disk(log->clean_bits, log->disk_bits,
log->bitset_uint32_count);
- return dm_io_sync_vm(1, &log->bits_location, WRITE,
- log->disk_bits, &ebits);
+ return dm_io_sync_vm(1, &log->bits_location, WRITE, log->disk_bits, &ebits);
}
/*----------------------------------------------------------------
@@ -412,6 +413,7 @@
lc = (struct log_c *) log->context;
lc->log_dev = dev;
+ lc->log_dev_failed = 0;
/* setup the disk header fields */
lc->header_location.bdev = lc->log_dev->bdev;
@@ -474,13 +476,19 @@
/* read the disk header */
r = read_header(lc);
- if (r)
- return r;
-
- /* read the bits */
- r = read_bits(lc);
- if (r)
- return r;
+ if (r){
+ DMERR("A read failure has occurred on a mirror log device.");
+ dm_table_event(lc->ti->table);
+ lc->header.nr_regions = 0;
+ } else {
+ /* read the bits */
+ r = read_bits(lc);
+ if (r){
+ DMERR("A read failure has occurred on a mirror log device.");
+ dm_table_event(lc->ti->table);
+ lc->header.nr_regions = 0;
+ }
+ }
/* set or clear any new bits */
if (lc->sync == NOSYNC)
@@ -496,16 +504,24 @@
memcpy(lc->sync_bits, lc->clean_bits, size);
lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
+ /* set the correct number of regions in the header */
+ lc->header.nr_regions = lc->region_count;
+
/* write the bits */
r = write_bits(lc);
- if (r)
+ if (r){
+ DMERR("A write failure has occurred on a mirror log device.");
+ dm_table_event(lc->ti->table);
return r;
-
- /* set the correct number of regions in the header */
- lc->header.nr_regions = lc->region_count;
+ }
/* write the new header */
- return write_header(lc);
+ r = write_header(lc);
+ if(r){
+ DMERR("A write failure has occurred on a mirror log device.");
+ dm_table_event(lc->ti->table);
+ }
+ return r;
}
static uint32_t core_get_region_size(struct dirty_log *log)
@@ -517,13 +533,13 @@
static int core_is_clean(struct dirty_log *log, region_t region)
{
struct log_c *lc = (struct log_c *) log->context;
- return log_test_bit(lc->clean_bits, region);
+ return log_test_bit(lc->clean_bits, region)? LOG_CLEAN: LOG_DIRTY;
}
static int core_in_sync(struct dirty_log *log, region_t region, int block)
{
struct log_c *lc = (struct log_c *) log->context;
- return log_test_bit(lc->sync_bits, region);
+ return log_test_bit(lc->sync_bits, region) ? LOG_CLEAN: LOG_NOSYNC;
}
static int core_flush(struct dirty_log *log)
@@ -541,10 +557,28 @@
if (!lc->touched)
return 0;
+ /*
+ * Could be dangerous if the write fails.
+ * If the machine dies while the on-disk log is different from the core,
+ * and the device is readable when the machine comes back, it may be
+ * possible that not all regions will be recovered.
+ *
+ * The event is raised so that dmeventd can suspend the device for a
+ * moment while it removes the log device.
+ *
+ * So, not running dmeventd and having a machine fail after a log has
+ * failed and having the device available when the machine reboots is
+ * a bad thing.
+ */
r = write_bits(lc);
if (!r)
lc->touched = 0;
-
+ else {
+ DMERR("A write failure has occurred on a mirror log device.");
+ DMERR("Log device is now not in-sync with the core.");
+ dm_table_event(lc->ti->table);
+ }
+
return r;
}
@@ -613,11 +647,18 @@
switch(status) {
case STATUSTYPE_INFO:
+ DMEMIT("%s %u %u ",
+ log->type->name, /* type name */
+ lc->sync == DEFAULTSYNC ? 1 : 2, /* # of args */
+ lc->region_size); /* region size */
+ DMEMIT_SYNC;
break;
case STATUSTYPE_TABLE:
- DMEMIT("%s %u %u ", log->type->name,
- lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size);
+ DMEMIT("%s %u %u ",
+ log->type->name, /* type name */
+ lc->sync == DEFAULTSYNC ? 1 : 2, /* # of args */
+ lc->region_size); /* region size */
DMEMIT_SYNC;
}
@@ -633,13 +674,23 @@
switch(status) {
case STATUSTYPE_INFO:
+ format_dev_t(buffer, lc->log_dev->bdev->bd_dev);
+ DMEMIT("%s %u %s%s %u ",
+ log->type->name, /* type name */
+ lc->sync == DEFAULTSYNC ? 2 : 3, /* # of args */
+ buffer, /* The log device */
+ (lc->log_dev_failed)? "/D":"/A", /* log device liveness */
+ lc->region_size); /* Region size */
+ DMEMIT_SYNC;
break;
case STATUSTYPE_TABLE:
format_dev_t(buffer, lc->log_dev->bdev->bd_dev);
- DMEMIT("%s %u %s %u ", log->type->name,
- lc->sync == DEFAULTSYNC ? 2 : 3, buffer,
- lc->region_size);
+ DMEMIT("%s %u %s %u ",
+ log->type->name, /* type name */
+ lc->sync == DEFAULTSYNC ? 2 : 3, /* # of args */
+ buffer, /* The log device */
+ lc->region_size); /* Region size */
DMEMIT_SYNC;
}
@@ -649,6 +700,7 @@
static struct dirty_log_type _core_type = {
.name = "core",
.module = THIS_MODULE,
+ .multi_node = 0,
.ctr = core_ctr,
.dtr = core_dtr,
.get_region_size = core_get_region_size,
@@ -666,6 +718,7 @@
static struct dirty_log_type _disk_type = {
.name = "disk",
.module = THIS_MODULE,
+ .multi_node = 0,
.ctr = disk_ctr,
.dtr = disk_dtr,
.suspend = disk_flush,
@@ -709,3 +762,13 @@
EXPORT_SYMBOL(dm_unregister_dirty_log_type);
EXPORT_SYMBOL(dm_create_dirty_log);
EXPORT_SYMBOL(dm_destroy_dirty_log);
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only. This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
[-- Attachment #3: Type: text/plain, Size: 0 bytes --]
next reply other threads:[~2005-06-24 20:16 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-06-24 20:16 Jonathan E Brassow [this message]
2005-06-24 21:06 ` mirroring: device failure tolerance Jonathan E Brassow
-- strict thread matches above, loose matches on Subject: below --
2005-06-24 20:18 Jonathan E Brassow
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=12249dad6f7ea11612dfb0a87d22bbf3@redhat.com \
--to=jbrassow@redhat.com \
--cc=dm-devel@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.