From: "Vladimir 'φ-coder/phcoder' Serbinenko" <phcoder@gmail.com>
To: The development of GNU GRUB <grub-devel@gnu.org>
Subject: Re: [RFD] diskfilter stale RAID member detection vs. lazy scanning
Date: Wed, 15 Jul 2015 23:47:38 +0200 [thread overview]
Message-ID: <55A6D4FA.7000400@gmail.com> (raw)
In-Reply-To: <55A6A104.6060407@gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 2026 bytes --]
On 15.07.2015 20:05, Vladimir 'φ-coder/phcoder' Serbinenko wrote:
> On 28.06.2015 20:06, Andrei Borzenkov wrote:
>> I was looking at implementing detection of outdated RAID members.
>> Unfortunately it appears to be fundamentally incompatible with lazy
>> scanning as implemented currently by GRUB. We simply cannot stop
>> scanning for other copies of metadata once "enough" was seen. Because
>> any other disk may contain more actual copy which invalidates
>> everything seen up to this point.
>>
>> So basically either we officially admit that GRUB is not able to detect
>> stale members or we drop lazy scanning.
>>
>> Comments, ideas?
>>
> We don't need to see all disks to decide that there is no staleness. If
> you have an array with N devices and you can lose at most K of them,
> then you can check for staleness after you have seen max(K+1, N-K)
> drives. Why?
>
> Let those disks have generation numbers g_0,...,g_{N-1}. Our goal is to
> find the largest number G s.t. number of indices with
> g_i >= G is at least N-K.
> In most common case when you have seen K+1 disks all of them will have
> the same generation number
> g_0=g_1=...=g_{K}
> Then we know that
> G<=g_0
> Suppose not then all of 0,...,K are stale and we have lost K+1 drives
> which contradicts our goal.
> On the other hand when we have seen N-K devices we know that
> G>=min(g_0,...,g_{N-K-1})
> as with G=min(g_0,...,g_{N-K-1}) we already have N-K disks.
>
> In cases other than mirror usually K+1<=N-K and so we don't even need to
> scan for more disks to detect staleness.
> The code will be slightly tricky as it has to handle tolerating
> staleness if there are too little disks but it's totally feasible. Let
> me figure out the rest of math and write a prototype.
Untested patch implementing these ideas, just to illustrate
>> _______________________________________________
>> Grub-devel mailing list
>> Grub-devel@gnu.org
>> https://lists.gnu.org/mailman/listinfo/grub-devel
>>
>
>
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1.2: diskfilter_gen.diff --]
[-- Type: text/x-diff; name="diskfilter_gen.diff", Size: 17154 bytes --]
diff --git a/grub-core/disk/diskfilter.c b/grub-core/disk/diskfilter.c
index c4f6678..850046e 100644
--- a/grub-core/disk/diskfilter.c
+++ b/grub-core/disk/diskfilter.c
@@ -41,77 +41,201 @@ static int lv_num = 0;
static struct grub_diskfilter_lv *
find_lv (const char *name);
-static int is_lv_readable (struct grub_diskfilter_lv *lv, int easily);
+static void
+recompute_lv_readable (struct grub_diskfilter_lv *lv);
\f
-static grub_err_t
-is_node_readable (const struct grub_diskfilter_node *node, int easily)
+static int
+get_number_needed_nodes(struct grub_diskfilter_segment *seg, int easily)
+{
+ int need = seg->node_count;
+
+ switch (seg->type)
+ {
+ case GRUB_DISKFILTER_RAID6:
+ if (!easily)
+ need--;
+ /* Fallthrough. */
+ case GRUB_DISKFILTER_RAID4:
+ case GRUB_DISKFILTER_RAID5:
+ if (!easily)
+ need--;
+ /* Fallthrough. */
+ case GRUB_DISKFILTER_STRIPED:
+ break;
+
+ case GRUB_DISKFILTER_MIRROR:
+ need = 1;
+ break;
+
+ case GRUB_DISKFILTER_RAID10:
+ {
+ unsigned int n;
+ n = seg->layout & 0xFF;
+ if (n == 1)
+ n = (seg->layout >> 8) & 0xFF;
+ need = seg->node_count - n + 1;
+ }
+ break;
+ }
+ return need;
+}
+
+/* Return whether node is readable and if so update node->generation. */
+static void
+recompute_node_readable (struct grub_diskfilter_node *node)
{
/* Check whether we actually know the physical volume we want to
read from. */
if (node->pv)
- return !!(node->pv->disk);
+ {
+ node->readability.is_readable = !!(node->pv->disk);
+ node->readability.is_easily_readable = !!(node->pv->disk);
+ node->readability.generation = node->pv->generation;
+ return;
+ }
if (node->lv)
- return is_lv_readable (node->lv, easily);
+ {
+ recompute_lv_readable (node->lv);
+ node->readability = node->lv->readability;
+ return;
+ }
+ node->readability.is_readable = 0;
+ node->readability.is_easily_readable = 0;
+ node->readability.generation = 0;
+ return;
+}
+
+static grub_uint64_t
+compute_generation (struct grub_diskfilter_segment *seg, int need)
+{
+ unsigned i;
+ for (i = 0; i < seg->node_count; i++)
+ if (seg->nodes[i].readability.is_readable) {
+ unsigned j;
+ grub_uint64_t candidate = seg->nodes[i].readability.generation;
+ int higher = 0, strictly_higher = 0;
+ for (j = 0; j < seg->node_count; j++)
+ if (seg->nodes[j].readability.is_readable) {
+ strictly_higher += (seg->nodes[j].readability.generation > candidate);
+ higher += (seg->nodes[j].readability.generation >= candidate);
+ }
+ if (higher >= need && strictly_higher < need)
+ return candidate;
+ }
return 0;
}
static int
-is_lv_readable (struct grub_diskfilter_lv *lv, int easily)
+still_could_increase (struct grub_diskfilter_segment *seg, int need, grub_uint64_t generation)
{
- unsigned i, j;
- if (!lv)
- return 0;
- for (i = 0; i < lv->segment_count; i++)
+ unsigned j;
+ int strictly_higher = 0;
+ for (j = 0; j < seg->node_count; j++)
+ {
+ strictly_higher += (seg->nodes[j].readability.generation > generation) || !(seg->nodes[j].readability.is_readable);
+ }
+ return (strictly_higher >= need);
+}
+
+static void
+recompute_segment_readable (struct grub_diskfilter_segment *seg)
+{
+ int need = get_number_needed_nodes(seg, 0), have = 0;
+ int easily_readable;
+ grub_uint64_t generation;
+ unsigned j;
+ for (j = 0; j < seg->node_count; j++)
+ recompute_node_readable (seg->nodes + j);
+ for (j = 0; j < seg->node_count; j++)
+ {
+ if (seg->nodes[j].readability.is_readable)
+ have++;
+ }
+ /* Not enough disks. */
+ if (have < need)
+ {
+ seg->readability.is_easily_readable = 0;
+ seg->readability.is_readable = 0;
+ seg->readability.generation = 0;
+ return;
+ }
+
+ generation = compute_generation (seg, need);
+ /* Do not tolerate any staleness on easily readable level. */
+ if (still_could_increase (seg, need, generation))
+ {
+ easily_readable = 0;
+ }
+ else
{
- int need = lv->segments[i].node_count, have = 0;
- switch (lv->segments[i].type)
+ int have_easily = 0, need_easily = get_number_needed_nodes(seg, 1);
+ for (j = 0; j < seg->node_count; j++)
{
- case GRUB_DISKFILTER_RAID6:
- if (!easily)
- need--;
- /* Fallthrough. */
- case GRUB_DISKFILTER_RAID4:
- case GRUB_DISKFILTER_RAID5:
- if (!easily)
- need--;
- /* Fallthrough. */
- case GRUB_DISKFILTER_STRIPED:
- break;
-
- case GRUB_DISKFILTER_MIRROR:
- need = 1;
- break;
-
- case GRUB_DISKFILTER_RAID10:
- {
- unsigned int n;
- n = lv->segments[i].layout & 0xFF;
- if (n == 1)
- n = (lv->segments[i].layout >> 8) & 0xFF;
- need = lv->segments[i].node_count - n + 1;
- }
- break;
+ if (seg->nodes[j].readability.is_easily_readable && seg->nodes[j].readability.generation >= generation)
+ have_easily++;
}
- for (j = 0; j < lv->segments[i].node_count; j++)
- {
- if (is_node_readable (lv->segments[i].nodes + j, easily))
- have++;
- if (have >= need)
- break;
- }
- if (have < need)
- return 0;
+ easily_readable = have_easily >= need_easily;
}
+ seg->readability.generation = generation;
+ seg->readability.is_readable = 1;
+ seg->readability.is_easily_readable = easily_readable;
+}
- return 1;
+static void
+recompute_lv_readable (struct grub_diskfilter_lv *lv)
+{
+ unsigned i;
+ struct grub_diskfilter_readability ret =
+ {
+ .is_readable = 1,
+ .is_easily_readable = 1,
+ .generation = -1
+ };
+ if (!lv)
+ return;
+ for (i = 0; i < lv->segment_count; i++)
+ {
+ recompute_segment_readable (&lv->segments[i]);
+ if (!lv->segments[i].readability.is_easily_readable)
+ ret.is_easily_readable = 0;
+ if (!lv->segments[i].readability.is_readable)
+ ret.is_readable = 0;
+ if (ret.generation > lv->segments[i].readability.generation)
+ ret.generation = lv->segments[i].readability.generation;
+ }
+ lv->readability = ret;
+}
+
+static int
+is_lv_readable (struct grub_diskfilter_lv *lv)
+{
+ if (!lv)
+ return 0;
+
+ if (lv->readability.is_readable)
+ return 1;
+ recompute_lv_readable(lv);
+ return lv->readability.is_readable;
+}
+
+static int
+is_lv_easily_readable (struct grub_diskfilter_lv *lv)
+{
+ if (!lv)
+ return 0;
+ if (lv->readability.is_easily_readable)
+ return 1;
+ recompute_lv_readable(lv);
+ return lv->readability.is_easily_readable;
}
static grub_err_t
insert_array (grub_disk_t disk, const struct grub_diskfilter_pv_id *id,
struct grub_diskfilter_vg *array,
grub_disk_addr_t start_sector,
+ grub_uint64_t generation,
grub_diskfilter_t diskfilter __attribute__ ((unused)));
static int
@@ -154,15 +278,16 @@ scan_disk_partition_iter (grub_disk_t disk, grub_partition_t p, void *data)
for (diskfilter = grub_diskfilter_list; diskfilter; diskfilter = diskfilter->next)
{
+ grub_uint64_t generation = 0;
#ifdef GRUB_UTIL
grub_util_info ("Scanning for %s devices on disk %s",
diskfilter->name, name);
#endif
id.uuid = 0;
id.uuidlen = 0;
- arr = diskfilter->detect (disk, &id, &start_sector);
+ arr = diskfilter->detect (disk, &id, &start_sector, &generation);
if (arr &&
- (! insert_array (disk, &id, arr, start_sector, diskfilter)))
+ (! insert_array (disk, &id, arr, start_sector, generation, diskfilter)))
{
if (id.uuidlen)
grub_free (id.uuid);
@@ -232,7 +357,7 @@ scan_devices (const char *arname)
{
if ((p->iterate) (scan_disk_hook, NULL, pull))
return;
- if (arname && is_lv_readable (find_lv (arname), 1))
+ if (arname && is_lv_easily_readable (find_lv (arname)))
return;
}
@@ -412,7 +537,7 @@ find_lv (const char *name)
for (lv = vg->lvs; lv; lv = lv->next)
if (((lv->fullname && grub_strcmp (lv->fullname, name) == 0)
|| (lv->idname && grub_strcmp (lv->idname, name) == 0))
- && is_lv_readable (lv, 0))
+ && is_lv_readable (lv))
return lv;
}
return NULL;
@@ -429,7 +554,7 @@ grub_diskfilter_open (const char *name, grub_disk_t disk)
lv = find_lv (name);
- if (! lv)
+ if (! lv || !lv->readability.is_easily_readable)
{
scan_devices (name);
if (grub_errno)
@@ -644,16 +769,19 @@ read_segment (struct grub_diskfilter_segment *seg, grub_disk_addr_t sector,
|| grub_errno == GRUB_ERR_UNKNOWN_DEVICE)
grub_errno = GRUB_ERR_NONE;
- err = grub_diskfilter_read_node (&seg->nodes[k],
- read_sector
- + j * far_ofs + b,
- read_size,
- buf);
- if (! err)
- break;
- else if (err != GRUB_ERR_READ_ERROR
- && err != GRUB_ERR_UNKNOWN_DEVICE)
- return err;
+ if (seg->nodes[k].readability.generation >= seg->readability.generation)
+ {
+ err = grub_diskfilter_read_node (&seg->nodes[k],
+ read_sector
+ + j * far_ofs + b,
+ read_size,
+ buf);
+ if (! err)
+ break;
+ else if (err != GRUB_ERR_READ_ERROR
+ && err != GRUB_ERR_UNKNOWN_DEVICE)
+ return err;
+ }
k++;
if (k == seg->node_count)
k = 0;
@@ -749,10 +877,13 @@ read_segment (struct grub_diskfilter_segment *seg, grub_disk_addr_t sector,
|| grub_errno == GRUB_ERR_UNKNOWN_DEVICE)
grub_errno = GRUB_ERR_NONE;
- err = grub_diskfilter_read_node (&seg->nodes[disknr],
- read_sector + b,
- read_size,
- buf);
+ if (seg->nodes[disknr].readability.generation >= seg->readability.generation)
+ err = grub_diskfilter_read_node (&seg->nodes[disknr],
+ read_sector + b,
+ read_size,
+ buf);
+ else
+ err = GRUB_ERR_READ_ERROR;
if ((err) && (err != GRUB_ERR_READ_ERROR
&& err != GRUB_ERR_UNKNOWN_DEVICE))
@@ -1183,6 +1314,7 @@ static grub_err_t
insert_array (grub_disk_t disk, const struct grub_diskfilter_pv_id *id,
struct grub_diskfilter_vg *array,
grub_disk_addr_t start_sector,
+ grub_uint64_t generation,
grub_diskfilter_t diskfilter __attribute__ ((unused)))
{
struct grub_diskfilter_pv *pv;
@@ -1219,6 +1351,7 @@ insert_array (grub_disk_t disk, const struct grub_diskfilter_pv_id *id,
pv->start_sector -= pv->part_start;
pv->part_start = grub_partition_get_start (disk->partition);
pv->part_size = grub_disk_get_size (disk);
+ pv->generation = generation;
#ifdef GRUB_UTIL
{
@@ -1238,7 +1371,7 @@ insert_array (grub_disk_t disk, const struct grub_diskfilter_pv_id *id,
pv->start_sector += pv->part_start;
/* Add the device to the array. */
for (lv = array->lvs; lv; lv = lv->next)
- if (!lv->became_readable_at && lv->fullname && is_lv_readable (lv, 0))
+ if (!lv->became_readable_at && lv->fullname && is_lv_readable (lv))
lv->became_readable_at = ++inscnt;
break;
}
diff --git a/grub-core/disk/dmraid_nvidia.c b/grub-core/disk/dmraid_nvidia.c
index 881508c..73895ad 100644
--- a/grub-core/disk/dmraid_nvidia.c
+++ b/grub-core/disk/dmraid_nvidia.c
@@ -92,8 +92,9 @@ struct grub_nv_super
static struct grub_diskfilter_vg *
grub_dmraid_nv_detect (grub_disk_t disk,
- struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ struct grub_diskfilter_pv_id *id,
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_disk_addr_t sector;
struct grub_nv_super sb;
@@ -103,6 +104,8 @@ grub_dmraid_nv_detect (grub_disk_t disk,
grub_uint8_t total_volumes;
char *uuid;
+ *generation = 0;
+
if (disk->partition)
/* Skip partition. */
return NULL;
diff --git a/grub-core/disk/ldm.c b/grub-core/disk/ldm.c
index 8075f2a..f8f3be0 100644
--- a/grub-core/disk/ldm.c
+++ b/grub-core/disk/ldm.c
@@ -780,12 +780,15 @@ make_vg (grub_disk_t disk,
static struct grub_diskfilter_vg *
grub_ldm_detect (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_err_t err;
struct grub_ldm_label label;
struct grub_diskfilter_vg *vg;
+ *generation = 0;
+
#ifdef GRUB_UTIL
grub_util_info ("scanning %s for LDM", disk->name);
#endif
diff --git a/grub-core/disk/lvm.c b/grub-core/disk/lvm.c
index 7b265c7..58b0704 100644
--- a/grub-core/disk/lvm.c
+++ b/grub-core/disk/lvm.c
@@ -98,7 +98,8 @@ grub_lvm_check_flag (char *p, const char *str, const char *flag)
static struct grub_diskfilter_vg *
grub_lvm_detect (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_err_t err;
grub_uint64_t mda_offset, mda_size;
@@ -116,6 +117,8 @@ grub_lvm_detect (grub_disk_t disk,
struct grub_diskfilter_vg *vg;
struct grub_diskfilter_pv *pv;
+ *generation = 0;
+
/* Search for label. */
for (i = 0; i < GRUB_LVM_LABEL_SCAN_SECTORS; i++)
{
diff --git a/grub-core/disk/mdraid1x_linux.c b/grub-core/disk/mdraid1x_linux.c
index 7cc80d3..40a5ab7 100644
--- a/grub-core/disk/mdraid1x_linux.c
+++ b/grub-core/disk/mdraid1x_linux.c
@@ -106,7 +106,8 @@ struct grub_raid_super_1x
static struct grub_diskfilter_vg *
grub_mdraid_detect (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_uint64_t size;
grub_uint8_t minor_version;
@@ -198,6 +199,7 @@ grub_mdraid_detect (grub_disk_t disk,
grub_memcpy (uuid, sb.set_uuid, 16);
*start_sector = grub_le_to_cpu64 (sb.data_offset);
+ *generation = grub_le_to_cpu64 (sb.events);
array = grub_diskfilter_make_raid (16, uuid,
grub_le_to_cpu32 (sb.raid_disks),
diff --git a/grub-core/disk/mdraid_linux.c b/grub-core/disk/mdraid_linux.c
index 11024ae..7b9768c 100644
--- a/grub-core/disk/mdraid_linux.c
+++ b/grub-core/disk/mdraid_linux.c
@@ -180,7 +180,8 @@ struct grub_raid_super_09
static struct grub_diskfilter_vg *
grub_mdraid_detect (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_disk_addr_t sector;
grub_uint64_t size;
@@ -247,6 +248,7 @@ grub_mdraid_detect (grub_disk_t disk,
uuid[3] = grub_swap_bytes32 (sb->set_uuid3);
*start_sector = 0;
+ *generation = grub_md_to_cpu64 (sb->cp_events);
id->uuidlen = 0;
id->id = grub_md_to_cpu32 (sb->this_disk.number);
diff --git a/include/grub/diskfilter.h b/include/grub/diskfilter.h
index 1aedcd3..947500b 100644
--- a/include/grub/diskfilter.h
+++ b/include/grub/diskfilter.h
@@ -69,6 +69,7 @@ struct grub_diskfilter_pv {
grub_disk_addr_t part_start;
grub_disk_addr_t part_size;
grub_disk_addr_t start_sector; /* Sector number where the data area starts. */
+ grub_uint64_t generation;
struct grub_diskfilter_pv *next;
/* Optional. */
grub_uint8_t *internal_id;
@@ -77,6 +78,15 @@ struct grub_diskfilter_pv {
#endif
};
+struct grub_diskfilter_readability
+{
+ /* Readable without reading any stale disks or doing any recovery. */
+ int is_easily_readable;
+ /* Readable at all: even reding stale disks and doing recovery. */
+ int is_readable;
+ grub_uint64_t generation;
+};
+
struct grub_diskfilter_lv {
/* Name used for disk. */
char *fullname;
@@ -90,6 +100,7 @@ struct grub_diskfilter_lv {
int became_readable_at;
int scanned;
int visible;
+ struct grub_diskfilter_readability readability;
/* Pointer to segment_count segments. */
struct grub_diskfilter_segment *segments;
@@ -119,6 +130,7 @@ struct grub_diskfilter_segment {
unsigned int node_count;
unsigned int node_alloc;
struct grub_diskfilter_node *nodes;
+ struct grub_diskfilter_readability readability;
unsigned int stripe_size;
};
@@ -127,6 +139,7 @@ struct grub_diskfilter_node {
grub_disk_addr_t start;
/* Optional. */
char *name;
+ struct grub_diskfilter_readability readability;
struct grub_diskfilter_pv *pv;
struct grub_diskfilter_lv *lv;
};
@@ -143,7 +156,8 @@ struct grub_diskfilter
struct grub_diskfilter_vg * (*detect) (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector);
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation);
};
typedef struct grub_diskfilter *grub_diskfilter_t;
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 213 bytes --]
next prev parent reply other threads:[~2015-07-15 21:47 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-06-28 18:06 [RFD] diskfilter stale RAID member detection vs. lazy scanning Andrei Borzenkov
2015-07-15 18:05 ` Vladimir 'φ-coder/phcoder' Serbinenko
2015-07-15 21:47 ` Vladimir 'φ-coder/phcoder' Serbinenko [this message]
2015-11-30 3:34 ` Andrei Borzenkov
2015-07-16 3:42 ` Andrei Borzenkov
2015-07-16 8:01 ` Vladimir 'φ-coder/phcoder' Serbinenko
2016-01-11 21:35 ` Andrei Borzenkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=55A6D4FA.7000400@gmail.com \
--to=phcoder@gmail.com \
--cc=grub-devel@gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).