[Cluster-devel] cluster/group/gfs_controld recover.c

All of lore.kernel.org
 help / color / mirror / Atom feed

From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/gfs_controld recover.c
Date: 19 Dec 2006 22:19:59 -0000	[thread overview]
Message-ID: <20061219221959.22687.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	teigland at sourceware.org	2006-12-19 22:19:59

Modified files:
	group/gfs_controld: recover.c 

Log message:
	Fixes related to the needs_recovery state and first-mounter recovery.
	Probably not perfect yet, but working in the tests I'm able to contrive.
	bz 218551

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.3&r2=1.23.2.4

--- cluster/group/gfs_controld/recover.c	2006/12/19 17:07:12	1.23.2.3
+++ cluster/group/gfs_controld/recover.c	2006/12/19 22:19:59	1.23.2.4
@@ -29,6 +29,7 @@
 void start_spectator_2(struct mountgroup *mg);
 void notify_mount_client(struct mountgroup *mg);
 
+
 int set_sysfs(struct mountgroup *mg, char *field, int val)
 {
 	char fname[512];
@@ -325,6 +326,9 @@
 			continue;
 		if (memb->jid == -9)
 			continue;
+		if (memb->spectator || memb->readonly || memb->withdrawing ||
+		    memb->ms_kernel_mount_done)
+			continue;
 		if (low == -1 || memb->nodeid < low) {
 			next = memb;
 			low = memb->nodeid;
@@ -641,12 +645,11 @@
 		goto out;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "receive_remount from %d needs_recovery", from);
-		msg = "error: needs recovery";
-		error = -1;
-		goto out;
-	}
+	/* FIXME: check if we've even fully completed our normal mount yet
+	   (received our own mount-status?)  if not, then disallow remount */
+
+	/* FIXME: going ro->rw may mean we can now do journal or first-mounter
+	   recovery that we couldn't do before. */
 
 	memb->readonly = ro;
 	memb->rw = !ro;
@@ -746,30 +749,19 @@
 		else if (memb->readonly)
 			ro_count++;
 
-		if (memb->opts & MEMB_OPT_RECOVER)
+		if (memb->opts & MEMB_OPT_RECOVER) {
 			memb_recover = memb;
+			log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+				  memb->nodeid);
+		}
 
 		if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
 			memb_mounted = memb;
 	}
 
-	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
-		  total, invalid_count, rw_count, ro_count, spect_count);
-
-	/* do we let the new member mount? jid=-2 means no.
-	   - we only allow an rw mount when the fs needs recovery
-	   - we only allow a single rw mount when the fs needs recovery */
-
-	if (mg->needs_recovery) {
-		if (!new->rw || rw_count)
-			new->jid = -2;
-	}
-
-	if (new->jid == -2) {
-		log_group(mg, "assign_journal: fail - needs_recovery %d",
-			  mg->needs_recovery);
-		goto out;
-	}
+	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+		  "needs_recovery %d", total, invalid_count, rw_count,
+		  ro_count, spect_count, mg->needs_recovery);
 
 	if (new->spectator) {
 		log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
 		}
 	}
 
-	/* Currently the fs needs recovery, i.e. none of the current
-	   mounters (ro/spectators) can recover journals.  So, this new rw
-	   mounter is told to do first-mounter recovery of all the journals. */
-
+	/* Repeat first-mounter recovery: the fs has been mounted and in-use,
+	   but nodes have failed and none of the current mounters has been able
+	   to do recovery (all remaining nodes may be ro/spect for example).
+	   This puts us into the special "needs_recovery" state where new
+	   mounters are asked to do first-mounter recovery of the fs while
+	   the current mounters sit in a blocked state. */
+	   
 	if (mg->needs_recovery) {
-		log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
-			  "needs_recovery", new->nodeid);
-		new->opts |= MEMB_OPT_RECOVER;
+		if (!memb_recover) {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d gets OPT_RECOVER",
+				  new->nodeid);
+			new->opts |= MEMB_OPT_RECOVER;
+		} else {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d memb %d has OPT_RECOVER",
+				  new->nodeid, memb_recover->nodeid);
+		}
 		goto out;
 	}
 
+	/* Initial first-mounter recovery: the fs is coming online, the first
+	   mg member assumes first-mounter role and other nodes join the mg
+	   while the first-mounter is working.  These non-first mounters wait
+	   for the first-mounter to finish before notifying mount.gfs.  If the
+	   first-mounter fails, one of them will become the first-mounter. */
+
 	/* it shouldn't be possible to have someone doing first mounter
 	   recovery and also have someone with the fs fully mounted */
 
@@ -839,7 +847,8 @@
 		      mg->kernel_mount_done, mg->kernel_mount_error,
 		      mg->first_mounter, mg->first_mounter_done);
 
-	log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+	log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+		  "fs not mounted", new->nodeid);
 	new->opts |= MEMB_OPT_RECOVER;
 
  out:
@@ -1006,7 +1015,7 @@
 		/* delay notifying mount client until we get a successful
 		   mount status from the first mounter */
 		log_group(mg, "other node doing first mounter recovery, "
-			  "delay notify_mount_client");
+			  "set mount_client_delay");
 		mg->mount_client_delay = 1;
 		mg->save_plocks = 0;
 		return;
@@ -1402,7 +1411,6 @@
 	if (memb_gone_recover) {
 		log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
 			  memb_gone_recover->nodeid);
-		ASSERT(!mg->mount_client_notified);
 		memb_gone_recover->tell_gfs_to_recover = 0;
 	}
 
@@ -2168,14 +2176,39 @@
 	return 0;
 }
 
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
-   and it crashes?  It shouldn't need journal recovery since the kernel umount
-   happens before leaving the group. */
+/*  After a start that initiated a recovery, everyone will go and see if they
+    can do recovery and try if they can.  If a node can't, it does start_done,
+    if it tries and fails, it does start_done, if it tries and succeeds it
+    sends a message and then does start_done once it receives it back.  So,
+    when we get a finish we know that we have all the results from the recovery
+    cycle and can judge if everything is recovered properly or not.  If so, we
+    can unblock locks (in the finish), if not, we leave them blocked (in the
+    finish).
+
+    If we leave locks blocked in the finish, then they can only be unblocked
+    after someone is able to do the recovery that's needed.  So, leaving locks
+    blocked in a finish because recovery hasn't worked puts us into a special
+    state: the fs needs recovery, none of the current mounters has been able to
+    recover it, all current mounters have locks blocked in gfs, new mounters
+    are allowed, nodes can unmount, new mounters are asked to do first-mounter
+    recovery, if one of them succeeds then we can all clear this special state
+    and unblock locks (the unblock would happen upon receiving the success
+    message from the new pseudo-first mounter, not as part of a finish), future
+    finishes would then go back to being able to unblock locks.
+
+    While in this special state, a new node has been added and asked to do
+    first-mounter recovery, other nodes can also be added while the new
+    first-mounter is active.  These other nodes don't notify mount.gfs.
+    They'll receive the result of the first mounter and if it succeeded they'll
+    notify mount.gfs, otherwise one of them will become the next first-mounter
+    and notify mount.gfs. */
 
 int do_finish(struct mountgroup *mg)
 {
 	struct mg_member *memb, *safe;
-	int leave_blocked = 0;
+
+	log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+		  mg->needs_recovery);
 
 	/* members_gone list are the members that were removed from the
 	   members list when processing a start.  members are removed
@@ -2192,11 +2225,10 @@
 			list_del(&memb->list);
 			free(memb);
 		} else {
+			log_error("%s finish: needs recovery jid %d nodeid %d "
+				  "status %d", mg->name, memb->jid,
+				  memb->nodeid, memb->recovery_status);
 			mg->needs_recovery = 1;
-			log_group(mg, "finish: needs recovery "
-				  "jid %d nodeid %d status %d",
-				  memb->jid, memb->nodeid,
-				  memb->recovery_status);
 		}
 	}
 
@@ -2210,12 +2242,7 @@
 		return 0;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "finish: leave locks blocked for needs_recovery");
-		leave_blocked = 1;
-	}
-
-	if (!leave_blocked) {
+	if (!mg->needs_recovery) {
 		set_sysfs(mg, "block", 0);
 
 		/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
 			mg->mount_client_delay = 0;
 			notify_mount_client(mg);
 		}
-	}
+	} else
+		log_group(mg, "finish: leave locks blocked for needs_recovery");
 
 	return 0;
 }

next             reply	other threads:[~2006-12-19 22:19 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-12-19 22:19 teigland [this message]
  -- strict thread matches above, loose matches on Subject: below --
2007-06-06 15:47 [Cluster-devel] cluster/group/gfs_controld recover.c teigland
2007-06-06 15:44 teigland
2006-12-19 22:20 teigland
2006-12-19 22:19 teigland
2006-10-16 15:09 teigland
2006-08-31 18:56 teigland
2006-08-21 19:38 teigland
2006-08-15 21:38 teigland
2006-08-10 19:40 teigland

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20061219221959.22687.qmail@sourceware.org \
    --to=teigland@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.