From: "Vladimir 'φ-coder/phcoder' Serbinenko" <phcoder@gmail.com>
To: The development of GNU GRUB <grub-devel@gnu.org>
Subject: Re: [RFD] diskfilter stale RAID member detection vs. lazy scanning
Date: Wed, 15 Jul 2015 23:47:38 +0200 [thread overview]
Message-ID: <55A6D4FA.7000400@gmail.com> (raw)
In-Reply-To: <55A6A104.6060407@gmail.com>
[-- Attachment #1.1: Type: text/plain, Size: 2026 bytes --]
On 15.07.2015 20:05, Vladimir 'φ-coder/phcoder' Serbinenko wrote:
> On 28.06.2015 20:06, Andrei Borzenkov wrote:
>> I was looking at implementing detection of outdated RAID members.
>> Unfortunately it appears to be fundamentally incompatible with lazy
>> scanning as implemented currently by GRUB. We simply cannot stop
>> scanning for other copies of metadata once "enough" was seen. Because
>> any other disk may contain more actual copy which invalidates
>> everything seen up to this point.
>>
>> So basically either we officially admit that GRUB is not able to detect
>> stale members or we drop lazy scanning.
>>
>> Comments, ideas?
>>
> We don't need to see all disks to decide that there is no staleness. If
> you have an array with N devices and you can lose at most K of them,
> then you can check for staleness after you have seen max(K+1, N-K)
> drives. Why?
>
> Let those disks have generation numbers g_0,...,g_{N-1}. Our goal is to
> find the largest number G s.t. number of indices with
> g_i >= G is at least N-K.
> In most common case when you have seen K+1 disks all of them will have
> the same generation number
> g_0=g_1=...=g_{K}
> Then we know that
> G<=g_0
> Suppose not then all of 0,...,K are stale and we have lost K+1 drives
> which contradicts our goal.
> On the other hand when we have seen N-K devices we know that
> G>=min(g_0,...,g_{N-K-1})
> as with G=min(g_0,...,g_{N-K-1}) we already have N-K disks.
>
> In cases other than mirror usually K+1<=N-K and so we don't even need to
> scan for more disks to detect staleness.
> The code will be slightly tricky as it has to handle tolerating
> staleness if there are too little disks but it's totally feasible. Let
> me figure out the rest of math and write a prototype.
Untested patch implementing these ideas, just to illustrate
>> _______________________________________________
>> Grub-devel mailing list
>> Grub-devel@gnu.org
>> https://lists.gnu.org/mailman/listinfo/grub-devel
>>
>
>
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1.2: diskfilter_gen.diff --]
[-- Type: text/x-diff; name="diskfilter_gen.diff", Size: 17154 bytes --]
diff --git a/grub-core/disk/diskfilter.c b/grub-core/disk/diskfilter.c
index c4f6678..850046e 100644
--- a/grub-core/disk/diskfilter.c
+++ b/grub-core/disk/diskfilter.c
@@ -41,77 +41,201 @@ static int lv_num = 0;
static struct grub_diskfilter_lv *
find_lv (const char *name);
-static int is_lv_readable (struct grub_diskfilter_lv *lv, int easily);
+static void
+recompute_lv_readable (struct grub_diskfilter_lv *lv);
\f
-static grub_err_t
-is_node_readable (const struct grub_diskfilter_node *node, int easily)
+static int
+get_number_needed_nodes(struct grub_diskfilter_segment *seg, int easily)
+{
+ int need = seg->node_count;
+
+ switch (seg->type)
+ {
+ case GRUB_DISKFILTER_RAID6:
+ if (!easily)
+ need--;
+ /* Fallthrough. */
+ case GRUB_DISKFILTER_RAID4:
+ case GRUB_DISKFILTER_RAID5:
+ if (!easily)
+ need--;
+ /* Fallthrough. */
+ case GRUB_DISKFILTER_STRIPED:
+ break;
+
+ case GRUB_DISKFILTER_MIRROR:
+ need = 1;
+ break;
+
+ case GRUB_DISKFILTER_RAID10:
+ {
+ unsigned int n;
+ n = seg->layout & 0xFF;
+ if (n == 1)
+ n = (seg->layout >> 8) & 0xFF;
+ need = seg->node_count - n + 1;
+ }
+ break;
+ }
+ return need;
+}
+
+/* Return whether node is readable and if so update node->generation. */
+static void
+recompute_node_readable (struct grub_diskfilter_node *node)
{
/* Check whether we actually know the physical volume we want to
read from. */
if (node->pv)
- return !!(node->pv->disk);
+ {
+ node->readability.is_readable = !!(node->pv->disk);
+ node->readability.is_easily_readable = !!(node->pv->disk);
+ node->readability.generation = node->pv->generation;
+ return;
+ }
if (node->lv)
- return is_lv_readable (node->lv, easily);
+ {
+ recompute_lv_readable (node->lv);
+ node->readability = node->lv->readability;
+ return;
+ }
+ node->readability.is_readable = 0;
+ node->readability.is_easily_readable = 0;
+ node->readability.generation = 0;
+ return;
+}
+
+static grub_uint64_t
+compute_generation (struct grub_diskfilter_segment *seg, int need)
+{
+ unsigned i;
+ for (i = 0; i < seg->node_count; i++)
+ if (seg->nodes[i].readability.is_readable) {
+ unsigned j;
+ grub_uint64_t candidate = seg->nodes[i].readability.generation;
+ int higher = 0, strictly_higher = 0;
+ for (j = 0; j < seg->node_count; j++)
+ if (seg->nodes[j].readability.is_readable) {
+ strictly_higher += (seg->nodes[j].readability.generation > candidate);
+ higher += (seg->nodes[j].readability.generation >= candidate);
+ }
+ if (higher >= need && strictly_higher < need)
+ return candidate;
+ }
return 0;
}
static int
-is_lv_readable (struct grub_diskfilter_lv *lv, int easily)
+still_could_increase (struct grub_diskfilter_segment *seg, int need, grub_uint64_t generation)
{
- unsigned i, j;
- if (!lv)
- return 0;
- for (i = 0; i < lv->segment_count; i++)
+ unsigned j;
+ int strictly_higher = 0;
+ for (j = 0; j < seg->node_count; j++)
+ {
+ strictly_higher += (seg->nodes[j].readability.generation > generation) || !(seg->nodes[j].readability.is_readable);
+ }
+ return (strictly_higher >= need);
+}
+
+static void
+recompute_segment_readable (struct grub_diskfilter_segment *seg)
+{
+ int need = get_number_needed_nodes(seg, 0), have = 0;
+ int easily_readable;
+ grub_uint64_t generation;
+ unsigned j;
+ for (j = 0; j < seg->node_count; j++)
+ recompute_node_readable (seg->nodes + j);
+ for (j = 0; j < seg->node_count; j++)
+ {
+ if (seg->nodes[j].readability.is_readable)
+ have++;
+ }
+ /* Not enough disks. */
+ if (have < need)
+ {
+ seg->readability.is_easily_readable = 0;
+ seg->readability.is_readable = 0;
+ seg->readability.generation = 0;
+ return;
+ }
+
+ generation = compute_generation (seg, need);
+ /* Do not tolerate any staleness on easily readable level. */
+ if (still_could_increase (seg, need, generation))
+ {
+ easily_readable = 0;
+ }
+ else
{
- int need = lv->segments[i].node_count, have = 0;
- switch (lv->segments[i].type)
+ int have_easily = 0, need_easily = get_number_needed_nodes(seg, 1);
+ for (j = 0; j < seg->node_count; j++)
{
- case GRUB_DISKFILTER_RAID6:
- if (!easily)
- need--;
- /* Fallthrough. */
- case GRUB_DISKFILTER_RAID4:
- case GRUB_DISKFILTER_RAID5:
- if (!easily)
- need--;
- /* Fallthrough. */
- case GRUB_DISKFILTER_STRIPED:
- break;
-
- case GRUB_DISKFILTER_MIRROR:
- need = 1;
- break;
-
- case GRUB_DISKFILTER_RAID10:
- {
- unsigned int n;
- n = lv->segments[i].layout & 0xFF;
- if (n == 1)
- n = (lv->segments[i].layout >> 8) & 0xFF;
- need = lv->segments[i].node_count - n + 1;
- }
- break;
+ if (seg->nodes[j].readability.is_easily_readable && seg->nodes[j].readability.generation >= generation)
+ have_easily++;
}
- for (j = 0; j < lv->segments[i].node_count; j++)
- {
- if (is_node_readable (lv->segments[i].nodes + j, easily))
- have++;
- if (have >= need)
- break;
- }
- if (have < need)
- return 0;
+ easily_readable = have_easily >= need_easily;
}
+ seg->readability.generation = generation;
+ seg->readability.is_readable = 1;
+ seg->readability.is_easily_readable = easily_readable;
+}
- return 1;
+static void
+recompute_lv_readable (struct grub_diskfilter_lv *lv)
+{
+ unsigned i;
+ struct grub_diskfilter_readability ret =
+ {
+ .is_readable = 1,
+ .is_easily_readable = 1,
+ .generation = -1
+ };
+ if (!lv)
+ return;
+ for (i = 0; i < lv->segment_count; i++)
+ {
+ recompute_segment_readable (&lv->segments[i]);
+ if (!lv->segments[i].readability.is_easily_readable)
+ ret.is_easily_readable = 0;
+ if (!lv->segments[i].readability.is_readable)
+ ret.is_readable = 0;
+ if (ret.generation > lv->segments[i].readability.generation)
+ ret.generation = lv->segments[i].readability.generation;
+ }
+ lv->readability = ret;
+}
+
+static int
+is_lv_readable (struct grub_diskfilter_lv *lv)
+{
+ if (!lv)
+ return 0;
+
+ if (lv->readability.is_readable)
+ return 1;
+ recompute_lv_readable(lv);
+ return lv->readability.is_readable;
+}
+
+static int
+is_lv_easily_readable (struct grub_diskfilter_lv *lv)
+{
+ if (!lv)
+ return 0;
+ if (lv->readability.is_easily_readable)
+ return 1;
+ recompute_lv_readable(lv);
+ return lv->readability.is_easily_readable;
}
static grub_err_t
insert_array (grub_disk_t disk, const struct grub_diskfilter_pv_id *id,
struct grub_diskfilter_vg *array,
grub_disk_addr_t start_sector,
+ grub_uint64_t generation,
grub_diskfilter_t diskfilter __attribute__ ((unused)));
static int
@@ -154,15 +278,16 @@ scan_disk_partition_iter (grub_disk_t disk, grub_partition_t p, void *data)
for (diskfilter = grub_diskfilter_list; diskfilter; diskfilter = diskfilter->next)
{
+ grub_uint64_t generation = 0;
#ifdef GRUB_UTIL
grub_util_info ("Scanning for %s devices on disk %s",
diskfilter->name, name);
#endif
id.uuid = 0;
id.uuidlen = 0;
- arr = diskfilter->detect (disk, &id, &start_sector);
+ arr = diskfilter->detect (disk, &id, &start_sector, &generation);
if (arr &&
- (! insert_array (disk, &id, arr, start_sector, diskfilter)))
+ (! insert_array (disk, &id, arr, start_sector, generation, diskfilter)))
{
if (id.uuidlen)
grub_free (id.uuid);
@@ -232,7 +357,7 @@ scan_devices (const char *arname)
{
if ((p->iterate) (scan_disk_hook, NULL, pull))
return;
- if (arname && is_lv_readable (find_lv (arname), 1))
+ if (arname && is_lv_easily_readable (find_lv (arname)))
return;
}
@@ -412,7 +537,7 @@ find_lv (const char *name)
for (lv = vg->lvs; lv; lv = lv->next)
if (((lv->fullname && grub_strcmp (lv->fullname, name) == 0)
|| (lv->idname && grub_strcmp (lv->idname, name) == 0))
- && is_lv_readable (lv, 0))
+ && is_lv_readable (lv))
return lv;
}
return NULL;
@@ -429,7 +554,7 @@ grub_diskfilter_open (const char *name, grub_disk_t disk)
lv = find_lv (name);
- if (! lv)
+ if (! lv || !lv->readability.is_easily_readable)
{
scan_devices (name);
if (grub_errno)
@@ -644,16 +769,19 @@ read_segment (struct grub_diskfilter_segment *seg, grub_disk_addr_t sector,
|| grub_errno == GRUB_ERR_UNKNOWN_DEVICE)
grub_errno = GRUB_ERR_NONE;
- err = grub_diskfilter_read_node (&seg->nodes[k],
- read_sector
- + j * far_ofs + b,
- read_size,
- buf);
- if (! err)
- break;
- else if (err != GRUB_ERR_READ_ERROR
- && err != GRUB_ERR_UNKNOWN_DEVICE)
- return err;
+ if (seg->nodes[k].readability.generation >= seg->readability.generation)
+ {
+ err = grub_diskfilter_read_node (&seg->nodes[k],
+ read_sector
+ + j * far_ofs + b,
+ read_size,
+ buf);
+ if (! err)
+ break;
+ else if (err != GRUB_ERR_READ_ERROR
+ && err != GRUB_ERR_UNKNOWN_DEVICE)
+ return err;
+ }
k++;
if (k == seg->node_count)
k = 0;
@@ -749,10 +877,13 @@ read_segment (struct grub_diskfilter_segment *seg, grub_disk_addr_t sector,
|| grub_errno == GRUB_ERR_UNKNOWN_DEVICE)
grub_errno = GRUB_ERR_NONE;
- err = grub_diskfilter_read_node (&seg->nodes[disknr],
- read_sector + b,
- read_size,
- buf);
+ if (seg->nodes[disknr].readability.generation >= seg->readability.generation)
+ err = grub_diskfilter_read_node (&seg->nodes[disknr],
+ read_sector + b,
+ read_size,
+ buf);
+ else
+ err = GRUB_ERR_READ_ERROR;
if ((err) && (err != GRUB_ERR_READ_ERROR
&& err != GRUB_ERR_UNKNOWN_DEVICE))
@@ -1183,6 +1314,7 @@ static grub_err_t
insert_array (grub_disk_t disk, const struct grub_diskfilter_pv_id *id,
struct grub_diskfilter_vg *array,
grub_disk_addr_t start_sector,
+ grub_uint64_t generation,
grub_diskfilter_t diskfilter __attribute__ ((unused)))
{
struct grub_diskfilter_pv *pv;
@@ -1219,6 +1351,7 @@ insert_array (grub_disk_t disk, const struct grub_diskfilter_pv_id *id,
pv->start_sector -= pv->part_start;
pv->part_start = grub_partition_get_start (disk->partition);
pv->part_size = grub_disk_get_size (disk);
+ pv->generation = generation;
#ifdef GRUB_UTIL
{
@@ -1238,7 +1371,7 @@ insert_array (grub_disk_t disk, const struct grub_diskfilter_pv_id *id,
pv->start_sector += pv->part_start;
/* Add the device to the array. */
for (lv = array->lvs; lv; lv = lv->next)
- if (!lv->became_readable_at && lv->fullname && is_lv_readable (lv, 0))
+ if (!lv->became_readable_at && lv->fullname && is_lv_readable (lv))
lv->became_readable_at = ++inscnt;
break;
}
diff --git a/grub-core/disk/dmraid_nvidia.c b/grub-core/disk/dmraid_nvidia.c
index 881508c..73895ad 100644
--- a/grub-core/disk/dmraid_nvidia.c
+++ b/grub-core/disk/dmraid_nvidia.c
@@ -92,8 +92,9 @@ struct grub_nv_super
static struct grub_diskfilter_vg *
grub_dmraid_nv_detect (grub_disk_t disk,
- struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ struct grub_diskfilter_pv_id *id,
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_disk_addr_t sector;
struct grub_nv_super sb;
@@ -103,6 +104,8 @@ grub_dmraid_nv_detect (grub_disk_t disk,
grub_uint8_t total_volumes;
char *uuid;
+ *generation = 0;
+
if (disk->partition)
/* Skip partition. */
return NULL;
diff --git a/grub-core/disk/ldm.c b/grub-core/disk/ldm.c
index 8075f2a..f8f3be0 100644
--- a/grub-core/disk/ldm.c
+++ b/grub-core/disk/ldm.c
@@ -780,12 +780,15 @@ make_vg (grub_disk_t disk,
static struct grub_diskfilter_vg *
grub_ldm_detect (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_err_t err;
struct grub_ldm_label label;
struct grub_diskfilter_vg *vg;
+ *generation = 0;
+
#ifdef GRUB_UTIL
grub_util_info ("scanning %s for LDM", disk->name);
#endif
diff --git a/grub-core/disk/lvm.c b/grub-core/disk/lvm.c
index 7b265c7..58b0704 100644
--- a/grub-core/disk/lvm.c
+++ b/grub-core/disk/lvm.c
@@ -98,7 +98,8 @@ grub_lvm_check_flag (char *p, const char *str, const char *flag)
static struct grub_diskfilter_vg *
grub_lvm_detect (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_err_t err;
grub_uint64_t mda_offset, mda_size;
@@ -116,6 +117,8 @@ grub_lvm_detect (grub_disk_t disk,
struct grub_diskfilter_vg *vg;
struct grub_diskfilter_pv *pv;
+ *generation = 0;
+
/* Search for label. */
for (i = 0; i < GRUB_LVM_LABEL_SCAN_SECTORS; i++)
{
diff --git a/grub-core/disk/mdraid1x_linux.c b/grub-core/disk/mdraid1x_linux.c
index 7cc80d3..40a5ab7 100644
--- a/grub-core/disk/mdraid1x_linux.c
+++ b/grub-core/disk/mdraid1x_linux.c
@@ -106,7 +106,8 @@ struct grub_raid_super_1x
static struct grub_diskfilter_vg *
grub_mdraid_detect (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_uint64_t size;
grub_uint8_t minor_version;
@@ -198,6 +199,7 @@ grub_mdraid_detect (grub_disk_t disk,
grub_memcpy (uuid, sb.set_uuid, 16);
*start_sector = grub_le_to_cpu64 (sb.data_offset);
+ *generation = grub_le_to_cpu64 (sb.events);
array = grub_diskfilter_make_raid (16, uuid,
grub_le_to_cpu32 (sb.raid_disks),
diff --git a/grub-core/disk/mdraid_linux.c b/grub-core/disk/mdraid_linux.c
index 11024ae..7b9768c 100644
--- a/grub-core/disk/mdraid_linux.c
+++ b/grub-core/disk/mdraid_linux.c
@@ -180,7 +180,8 @@ struct grub_raid_super_09
static struct grub_diskfilter_vg *
grub_mdraid_detect (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector)
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation)
{
grub_disk_addr_t sector;
grub_uint64_t size;
@@ -247,6 +248,7 @@ grub_mdraid_detect (grub_disk_t disk,
uuid[3] = grub_swap_bytes32 (sb->set_uuid3);
*start_sector = 0;
+ *generation = grub_md_to_cpu64 (sb->cp_events);
id->uuidlen = 0;
id->id = grub_md_to_cpu32 (sb->this_disk.number);
diff --git a/include/grub/diskfilter.h b/include/grub/diskfilter.h
index 1aedcd3..947500b 100644
--- a/include/grub/diskfilter.h
+++ b/include/grub/diskfilter.h
@@ -69,6 +69,7 @@ struct grub_diskfilter_pv {
grub_disk_addr_t part_start;
grub_disk_addr_t part_size;
grub_disk_addr_t start_sector; /* Sector number where the data area starts. */
+ grub_uint64_t generation;
struct grub_diskfilter_pv *next;
/* Optional. */
grub_uint8_t *internal_id;
@@ -77,6 +78,15 @@ struct grub_diskfilter_pv {
#endif
};
+struct grub_diskfilter_readability
+{
+ /* Readable without reading any stale disks or doing any recovery. */
+ int is_easily_readable;
+ /* Readable at all: even reding stale disks and doing recovery. */
+ int is_readable;
+ grub_uint64_t generation;
+};
+
struct grub_diskfilter_lv {
/* Name used for disk. */
char *fullname;
@@ -90,6 +100,7 @@ struct grub_diskfilter_lv {
int became_readable_at;
int scanned;
int visible;
+ struct grub_diskfilter_readability readability;
/* Pointer to segment_count segments. */
struct grub_diskfilter_segment *segments;
@@ -119,6 +130,7 @@ struct grub_diskfilter_segment {
unsigned int node_count;
unsigned int node_alloc;
struct grub_diskfilter_node *nodes;
+ struct grub_diskfilter_readability readability;
unsigned int stripe_size;
};
@@ -127,6 +139,7 @@ struct grub_diskfilter_node {
grub_disk_addr_t start;
/* Optional. */
char *name;
+ struct grub_diskfilter_readability readability;
struct grub_diskfilter_pv *pv;
struct grub_diskfilter_lv *lv;
};
@@ -143,7 +156,8 @@ struct grub_diskfilter
struct grub_diskfilter_vg * (*detect) (grub_disk_t disk,
struct grub_diskfilter_pv_id *id,
- grub_disk_addr_t *start_sector);
+ grub_disk_addr_t *start_sector,
+ grub_uint64_t *generation);
};
typedef struct grub_diskfilter *grub_diskfilter_t;
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 213 bytes --]
next prev parent reply other threads:[~2015-07-15 21:47 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-06-28 18:06 [RFD] diskfilter stale RAID member detection vs. lazy scanning Andrei Borzenkov
2015-07-15 18:05 ` Vladimir 'φ-coder/phcoder' Serbinenko
2015-07-15 21:47 ` Vladimir 'φ-coder/phcoder' Serbinenko [this message]
2015-11-30 3:34 ` Andrei Borzenkov
2015-07-16 3:42 ` Andrei Borzenkov
2015-07-16 8:01 ` Vladimir 'φ-coder/phcoder' Serbinenko
2016-01-11 21:35 ` Andrei Borzenkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=55A6D4FA.7000400@gmail.com \
--to=phcoder@gmail.com \
--cc=grub-devel@gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.