From: Jan Schmidt <list.btrfs@jan-o-sch.net>
To: Ian Kent <raven@themaw.net>
Cc: chris.mason@oracle.com, linux-btrfs@vger.kernel.org
Subject: Re: [RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c
Date: Mon, 25 Jul 2011 10:59:19 +0200 [thread overview]
Message-ID: <4E2D3067.70007@jan-o-sch.net> (raw)
In-Reply-To: <1311566280.3401.15.camel@perseus.themaw.net>
On 25.07.2011 05:58, Ian Kent wrote:
> On Fri, 2011-07-22 at 16:58 +0200, Jan Schmidt wrote:
>> +static int bio_readpage_error(struct bio *failed_bio, struct page *page,
>> + u64 start, u64 end, int failed_mirror,
>> + struct extent_state *state)
>> +{
>> + struct io_failure_record *failrec = NULL;
>> + u64 private;
>> + struct extent_map *em;
>> + struct inode *inode = page->mapping->host;
>> + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
>> + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
>> + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
>> + struct bio *bio;
>> + int num_copies;
>> + int ret;
>> + int read_mode;
>> + u64 logical;
>> +
>> + BUG_ON(failed_bio->bi_rw & REQ_WRITE);
>> +
>> + ret = get_state_private(failure_tree, start, &private);
>> + if (ret) {
>> + failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
>> + if (!failrec)
>> + return -ENOMEM;
>> + failrec->start = start;
>> + failrec->len = end - start + 1;
>> + failrec->this_mirror = 0;
>> + failrec->bio_flags = 0;
>> + failrec->in_validation = 0;
>> +
>> + read_lock(&em_tree->lock);
>> + em = lookup_extent_mapping(em_tree, start, failrec->len);
>> + if (!em) {
>
> Looks like a missing "read_unlock(&em_tree->lock);" here to me?
Thanks, will be fixed in next version.
-Jan
>> + kfree(failrec);
>> + return -EIO;
>> + }
>> +
>> + if (em->start > start || em->start + em->len < start) {
>> + free_extent_map(em);
>> + em = NULL;
>> + }
>> + read_unlock(&em_tree->lock);
>> +
>> + if (!em || IS_ERR(em)) {
>> + kfree(failrec);
>> + return -EIO;
>> + }
>> + logical = start - em->start;
>> + logical = em->block_start + logical;
>> + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
>> + logical = em->block_start;
>> + failrec->bio_flags = EXTENT_BIO_COMPRESSED;
>> + extent_set_compress_type(&failrec->bio_flags,
>> + em->compress_type);
>> + }
>> + pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
>> + "len=%llu\n", logical, start, failrec->len);
>> + failrec->logical = logical;
>> + free_extent_map(em);
>> +
>> + /* set the bits in the private failure tree */
>> + ret = set_extent_bits(failure_tree, start, end,
>> + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
>> + if (ret >= 0)
>> + ret = set_state_private(failure_tree, start,
>> + (u64)(unsigned long)failrec);
>> + /* set the bits in the inode's tree */
>> + if (ret >= 0)
>> + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
>> + GFP_NOFS);
>> + if (ret < 0) {
>> + kfree(failrec);
>> + return ret;
>> + }
>> + } else {
>> + failrec = (struct io_failure_record *)(unsigned long)private;
>> + pr_debug("bio_readpage_error: (found) logical=%llu, "
>> + "start=%llu, len=%llu, validation=%d\n",
>> + failrec->logical, failrec->start, failrec->len,
>> + failrec->in_validation);
>> + /*
>> + * when data can be on disk more than twice, add to failrec here
>> + * (e.g. with a list for failed_mirror) to make
>> + * clean_io_failure() clean all those errors at once.
>> + */
>> + }
>> + num_copies = btrfs_num_copies(
>> + &BTRFS_I(inode)->root->fs_info->mapping_tree,
>> + failrec->logical, failrec->len);
>> + if (num_copies == 1) {
>> + /*
>> + * we only have a single copy of the data, so don't bother with
>> + * all the retry and error correction code that follows. no
>> + * matter what the error is, it is very likely to persist.
>> + */
>> + pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
>> + "state=%p, num_copies=%d, next_mirror %d, "
>> + "failed_mirror %d\n", state, num_copies,
>> + failrec->this_mirror, failed_mirror);
>> + free_io_failure(inode, failrec, 0);
>> + return -EIO;
>> + }
>> +
>> + if (!state) {
>> + spin_lock(&tree->lock);
>> + state = find_first_extent_bit_state(tree, failrec->start,
>> + EXTENT_LOCKED);
>> + if (state && state->start != failrec->start)
>> + state = NULL;
>> + spin_unlock(&tree->lock);
>> + }
>> +
>> + /*
>> + * there are two premises:
>> + * a) deliver good data to the caller
>> + * b) correct the bad sectors on disk
>> + */
>> + if (failed_bio->bi_vcnt > 1) {
>> + /*
>> + * to fulfill b), we need to know the exact failing sectors, as
>> + * we don't want to rewrite any more than the failed ones. thus,
>> + * we need separate read requests for the failed bio
>> + *
>> + * if the following BUG_ON triggers, our validation request got
>> + * merged. we need separate requests for our algorithm to work.
>> + */
>> + BUG_ON(failrec->in_validation);
>> + failrec->in_validation = 1;
>> + failrec->this_mirror = failed_mirror;
>> + read_mode = READ_SYNC | REQ_FAILFAST_DEV;
>> + } else {
>> + /*
>> + * we're ready to fulfill a) and b) alongside. get a good copy
>> + * of the failed sector and if we succeed, we have setup
>> + * everything for repair_io_failure to do the rest for us.
>> + */
>> + if (failrec->in_validation) {
>> + BUG_ON(failrec->this_mirror != failed_mirror);
>> + failrec->in_validation = 0;
>> + failrec->this_mirror = 0;
>> + }
>> + failrec->failed_mirror = failed_mirror;
>> + failrec->this_mirror++;
>> + if (failrec->this_mirror == failed_mirror)
>> + failrec->this_mirror++;
>> + read_mode = READ_SYNC;
>> + }
>> +
>> + if (!state || failrec->this_mirror > num_copies) {
>> + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
>> + "next_mirror %d, failed_mirror %d\n", state,
>> + num_copies, failrec->this_mirror, failed_mirror);
>> + free_io_failure(inode, failrec, 0);
>> + return -EIO;
>> + }
>> +
>> + bio = bio_alloc(GFP_NOFS, 1);
>> + bio->bi_private = state;
>> + bio->bi_end_io = failed_bio->bi_end_io;
>> + bio->bi_sector = failrec->logical >> 9;
>> + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
>> + bio->bi_size = 0;
>> +
>> + bio_add_page(bio, page, failrec->len, start - page_offset(page));
>> +
>> + pr_debug("bio_readpage_error: submitting new read[%#x] to "
>> + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
>> + failrec->this_mirror, num_copies, failrec->in_validation);
>> +
>> + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
>> + failrec->bio_flags, 0);
>> + return 0;
>> +}
prev parent reply other threads:[~2011-07-25 8:59 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-07-22 14:58 [RFC PATCH 0/4] btrfs: Suggestion for raid auto-repair Jan Schmidt
2011-07-22 14:58 ` [RFC PATCH 1/4] btrfs: btrfs_multi_bio replaced with btrfs_bio Jan Schmidt
2011-07-22 14:58 ` [RFC PATCH 2/4] btrfs: Do not use bio->bi_bdev after submission Jan Schmidt
2011-07-22 14:58 ` [RFC PATCH 3/4] btrfs: Put mirror_num in bi_bdev Jan Schmidt
2011-07-22 14:58 ` [RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c Jan Schmidt
2011-07-24 16:24 ` Andi Kleen
2011-07-24 17:28 ` Jan Schmidt
2011-07-24 23:01 ` Andi Kleen
2011-07-25 8:52 ` Jan Schmidt
2011-07-25 3:58 ` Ian Kent
2011-07-25 8:59 ` Jan Schmidt [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4E2D3067.70007@jan-o-sch.net \
--to=list.btrfs@jan-o-sch.net \
--cc=chris.mason@oracle.com \
--cc=linux-btrfs@vger.kernel.org \
--cc=raven@themaw.net \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).