cluster-devel.redhat.com archive mirror
* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2007-06-06 15:47 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	teigland at sourceware.org	2007-06-06 15:47:42

Modified files:
	group/gfs_controld: recover.c 

Log message:
	return a different error number to mount.gfs for each specific failure
	case, so mount can translate that into a helpful error message

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.6&r2=1.23.2.7

--- cluster/group/gfs_controld/recover.c	2007/04/26 19:01:42	1.23.2.6
+++ cluster/group/gfs_controld/recover.c	2007/06/06 15:47:42	1.23.2.7
@@ -1537,7 +1537,7 @@
 
 	if (strcmp(mg->dev, dev)) {
 		log_error("different fs dev %s with same name", mg->dev);
-		return -EINVAL;
+		return -EADDRINUSE;
 	}
 
 	if (find_mountpoint(mg, dir)) {
@@ -1575,14 +1575,14 @@
 	struct mountgroup *mg = NULL;
 	char table2[MAXLINE];
 	char *cluster = NULL, *name = NULL;
-	int rv;
+	int rv, new_mg = 0;
 
 	log_debug("mount: %s %s %s %s %s %s",
 		  dir, type, proto, table, options, dev);
 
 	if (strcmp(proto, "lock_dlm")) {
 		log_error("mount: lockproto %s not supported", proto);
-		rv = -EINVAL;
+		rv = -EPROTONOSUPPORT;
 		goto out;
 	}
 
@@ -1590,7 +1590,7 @@
 	    strstr(options, "first=") ||
 	    strstr(options, "id=")) {
 		log_error("mount: jid, first and id are reserved options");
-		rv = -EINVAL;
+		rv = -EOPNOTSUPP;
 		goto out;
 	}
 
@@ -1601,7 +1601,7 @@
 
 	name = strstr(table2, ":");
 	if (!name) {
-		rv = -EINVAL;
+		rv = -EBADFD;
 		goto out;
 	}
 
@@ -1625,6 +1625,7 @@
 		rv = -ENOMEM;
 		goto out;
 	}
+	new_mg = 1;
 
 	mg->mount_client = ci;
 	strncpy(mg->type, type, sizeof(mg->type));
@@ -1634,7 +1635,7 @@
 
 	if (strlen(cluster) != strlen(clustername) ||
 	    strlen(cluster) == 0 || strcmp(cluster, clustername)) {
-		rv = -1;
+		rv = -EBADR;
 		log_error("mount: fs requires cluster=\"%s\" current=\"%s\"",
 			  cluster, clustername);
 		goto out;
@@ -1646,7 +1647,7 @@
 		mg->spectator = 1;
 	} else {
 		if (!we_are_in_fence_domain()) {
-			rv = -EINVAL;
+			rv = -ENOANO;
 			log_error("mount: not in default fence domain");
 			goto out;
 		}
@@ -1656,7 +1657,7 @@
 		mg->rw = 1;
 	else if (strstr(options, "ro")) {
 		if (mg->spectator) {
-			rv = -EINVAL;
+			rv = -EROFS;
 			log_error("mount: readonly invalid with spectator");
 			goto out;
 		}
@@ -1664,7 +1665,7 @@
 	}
 
 	if (strlen(options) > MAX_OPTIONS_LEN-1) {
-		rv = -EINVAL;
+		rv = -EMLINK;
 		log_error("mount: options too long %d", strlen(options));
 		goto out;
 	}
@@ -1673,8 +1674,12 @@
 	group_join(gh, name);
 	rv = 0;
  out:
-	*mg_ret = mg;
-	log_group(mg, "do_mount: rv %d", rv);
+	if (mg) {
+		*mg_ret = mg;
+		log_group(mg, "do_mount: rv %d", rv);
+	}
+	if (rv && new_mg)
+		free(mg);
 	return rv;
 }
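
Each new errno above pairs with the log_error() text beside it, so
mount.gfs can print a targeted hint instead of a generic
invalid-argument report.  A minimal sketch of the translation table
this enables on the mount.gfs side; the function name and message
wording are illustrative assumptions, not taken from the mount.gfs
source.

#include <errno.h>
#include <stdio.h>

/* Hypothetical mount.gfs-side helper: map the negative errno returned
   by gfs_controld's do_mount() to a human-readable hint.  Wording
   mirrors the log_error() messages in the patch above. */
static const char *mount_error_hint(int rv)
{
	switch (-rv) {
	case EADDRINUSE:
		return "a different fs dev is already mounted under this name";
	case EPROTONOSUPPORT:
		return "lockproto not supported (only lock_dlm is handled)";
	case EOPNOTSUPP:
		return "jid, first and id are reserved mount options";
	case EBADFD:
		return "malformed locktable (expected clustername:fsname)";
	case EBADR:
		return "fs cluster name does not match the running cluster";
	case ENOANO:
		return "this node is not in the default fence domain";
	case EROFS:
		return "ro cannot be combined with spectator";
	case EMLINK:
		return "mount option string too long";
	case ENOMEM:
		return "out of memory in gfs_controld";
	default:
		return "unknown mount error";
	}
}

int main(void)
{
	printf("mount.gfs: %s\n", mount_error_hint(-ENOANO));
	return 0;
}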
 



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2007-06-06 15:44 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2007-06-06 15:44:49

Modified files:
	group/gfs_controld: recover.c 

Log message:
	return a different error number to mount.gfs for each specific failure
	case, so mount can translate that into a helpful error message

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.29&r2=1.30

--- cluster/group/gfs_controld/recover.c	2007/04/26 19:01:34	1.29
+++ cluster/group/gfs_controld/recover.c	2007/06/06 15:44:49	1.30
@@ -1537,7 +1537,7 @@
 
 	if (strcmp(mg->dev, dev)) {
 		log_error("different fs dev %s with same name", mg->dev);
-		return -EINVAL;
+		return -EADDRINUSE;
 	}
 
 	if (find_mountpoint(mg, dir)) {
@@ -1575,14 +1575,14 @@
 	struct mountgroup *mg = NULL;
 	char table2[MAXLINE];
 	char *cluster = NULL, *name = NULL;
-	int rv;
+	int rv, new_mg = 0;
 
 	log_debug("mount: %s %s %s %s %s %s",
 		  dir, type, proto, table, options, dev);
 
 	if (strcmp(proto, "lock_dlm")) {
 		log_error("mount: lockproto %s not supported", proto);
-		rv = -EINVAL;
+		rv = -EPROTONOSUPPORT;
 		goto out;
 	}
 
@@ -1590,7 +1590,7 @@
 	    strstr(options, "first=") ||
 	    strstr(options, "id=")) {
 		log_error("mount: jid, first and id are reserved options");
-		rv = -EINVAL;
+		rv = -EOPNOTSUPP;
 		goto out;
 	}
 
@@ -1601,7 +1601,7 @@
 
 	name = strstr(table2, ":");
 	if (!name) {
-		rv = -EINVAL;
+		rv = -EBADFD;
 		goto out;
 	}
 
@@ -1625,6 +1625,7 @@
 		rv = -ENOMEM;
 		goto out;
 	}
+	new_mg = 1;
 
 	mg->mount_client = ci;
 	strncpy(mg->type, type, sizeof(mg->type));
@@ -1634,7 +1635,7 @@
 
 	if (strlen(cluster) != strlen(clustername) ||
 	    strlen(cluster) == 0 || strcmp(cluster, clustername)) {
-		rv = -1;
+		rv = -EBADR;
 		log_error("mount: fs requires cluster=\"%s\" current=\"%s\"",
 			  cluster, clustername);
 		goto out;
@@ -1646,7 +1647,7 @@
 		mg->spectator = 1;
 	} else {
 		if (!we_are_in_fence_domain()) {
-			rv = -EINVAL;
+			rv = -ENOANO;
 			log_error("mount: not in default fence domain");
 			goto out;
 		}
@@ -1656,7 +1657,7 @@
 		mg->rw = 1;
 	else if (strstr(options, "ro")) {
 		if (mg->spectator) {
-			rv = -EINVAL;
+			rv = -EROFS;
 			log_error("mount: readonly invalid with spectator");
 			goto out;
 		}
@@ -1664,7 +1665,7 @@
 	}
 
 	if (strlen(options) > MAX_OPTIONS_LEN-1) {
-		rv = -EINVAL;
+		rv = -EMLINK;
 		log_error("mount: options too long %d", strlen(options));
 		goto out;
 	}
@@ -1673,8 +1674,12 @@
 	group_join(gh, name);
 	rv = 0;
  out:
-	*mg_ret = mg;
-	log_group(mg, "do_mount: rv %d", rv);
+	if (mg) {
+		*mg_ret = mg;
+		log_group(mg, "do_mount: rv %d", rv);
+	}
+	if (rv && new_mg)
+		free(mg);
 	return rv;
 }
 



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2006-12-19 22:20 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL50
Changes by:	teigland at sourceware.org	2006-12-19 22:20:08

Modified files:
	group/gfs_controld: recover.c 

Log message:
	Fixes related to the needs_recovery state and first-mounter recovery.
	Probably not perfect yet, but working in the tests I'm able to contrive.
	bz 218551

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL50&r1=1.23.4.3&r2=1.23.4.4

--- cluster/group/gfs_controld/recover.c	2006/12/19 17:07:22	1.23.4.3
+++ cluster/group/gfs_controld/recover.c	2006/12/19 22:20:08	1.23.4.4
@@ -29,6 +29,7 @@
 void start_spectator_2(struct mountgroup *mg);
 void notify_mount_client(struct mountgroup *mg);
 
+
 int set_sysfs(struct mountgroup *mg, char *field, int val)
 {
 	char fname[512];
@@ -325,6 +326,9 @@
 			continue;
 		if (memb->jid == -9)
 			continue;
+		if (memb->spectator || memb->readonly || memb->withdrawing ||
+		    memb->ms_kernel_mount_done)
+			continue;
 		if (low == -1 || memb->nodeid < low) {
 			next = memb;
 			low = memb->nodeid;
@@ -641,12 +645,11 @@
 		goto out;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "receive_remount from %d needs_recovery", from);
-		msg = "error: needs recovery";
-		error = -1;
-		goto out;
-	}
+	/* FIXME: check if we've even fully completed our normal mount yet
+	   (received our own mount-status?)  if not, then disallow remount */
+
+	/* FIXME: going ro->rw may mean we can now do journal or first-mounter
+	   recovery that we couldn't do before. */
 
 	memb->readonly = ro;
 	memb->rw = !ro;
@@ -746,30 +749,19 @@
 		else if (memb->readonly)
 			ro_count++;
 
-		if (memb->opts & MEMB_OPT_RECOVER)
+		if (memb->opts & MEMB_OPT_RECOVER) {
 			memb_recover = memb;
+			log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+				  memb->nodeid);
+		}
 
 		if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
 			memb_mounted = memb;
 	}
 
-	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
-		  total, invalid_count, rw_count, ro_count, spect_count);
-
-	/* do we let the new member mount? jid=-2 means no.
-	   - we only allow an rw mount when the fs needs recovery
-	   - we only allow a single rw mount when the fs needs recovery */
-
-	if (mg->needs_recovery) {
-		if (!new->rw || rw_count)
-			new->jid = -2;
-	}
-
-	if (new->jid == -2) {
-		log_group(mg, "assign_journal: fail - needs_recovery %d",
-			  mg->needs_recovery);
-		goto out;
-	}
+	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+		  "needs_recovery %d", total, invalid_count, rw_count,
+		  ro_count, spect_count, mg->needs_recovery);
 
 	if (new->spectator) {
 		log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
 		}
 	}
 
-	/* Currently the fs needs recovery, i.e. none of the current
-	   mounters (ro/spectators) can recover journals.  So, this new rw
-	   mounter is told to do first-mounter recovery of all the journals. */
-
+	/* Repeat first-mounter recovery: the fs has been mounted and in-use,
+	   but nodes have failed and none of the current mounters has been able
+	   to do recovery (all remaining nodes may be ro/spect for example).
+	   This puts us into the special "needs_recovery" state where new
+	   mounters are asked to do first-mounter recovery of the fs while
+	   the current mounters sit in a blocked state. */
+	   
 	if (mg->needs_recovery) {
-		log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
-			  "needs_recovery", new->nodeid);
-		new->opts |= MEMB_OPT_RECOVER;
+		if (!memb_recover) {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d gets OPT_RECOVER",
+				  new->nodeid);
+			new->opts |= MEMB_OPT_RECOVER;
+		} else {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d memb %d has OPT_RECOVER",
+				  new->nodeid, memb_recover->nodeid);
+		}
 		goto out;
 	}
 
+	/* Initial first-mounter recovery: the fs is coming online, the first
+	   mg member assumes first-mounter role and other nodes join the mg
+	   while the first-mounter is working.  These non-first mounters wait
+	   for the first-mounter to finish before notifying mount.gfs.  If the
+	   first-mounter fails, one of them will become the first-mounter. */
+
 	/* it shouldn't be possible to have someone doing first mounter
 	   recovery and also have someone with the fs fully mounted */
 
@@ -839,7 +847,8 @@
 		      mg->kernel_mount_done, mg->kernel_mount_error,
 		      mg->first_mounter, mg->first_mounter_done);
 
-	log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+	log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+		  "fs not mounted", new->nodeid);
 	new->opts |= MEMB_OPT_RECOVER;
 
  out:
@@ -1006,7 +1015,7 @@
 		/* delay notifying mount client until we get a successful
 		   mount status from the first mounter */
 		log_group(mg, "other node doing first mounter recovery, "
-			  "delay notify_mount_client");
+			  "set mount_client_delay");
 		mg->mount_client_delay = 1;
 		mg->save_plocks = 0;
 		return;
@@ -1402,7 +1411,6 @@
 	if (memb_gone_recover) {
 		log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
 			  memb_gone_recover->nodeid);
-		ASSERT(!mg->mount_client_notified);
 		memb_gone_recover->tell_gfs_to_recover = 0;
 	}
 
@@ -2168,14 +2176,39 @@
 	return 0;
 }
 
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
-   and it crashes?  It shouldn't need journal recovery since the kernel umount
-   happens before leaving the group. */
+/*  After a start that initiated a recovery, everyone will go and see if they
+    can do recovery and try if they can.  If a node can't, it does start_done,
+    if it tries and fails, it does start_done, if it tries and succeeds it
+    sends a message and then does start_done once it receives's it back.  So,
+    when we get a finish we know that we have all the results from the recovery
+    cycle and can judge if everything is recovered properly or not.  If so, we
+    can unblock locks (in the finish), if not, we leave them blocked (in the
+    finish).
+
+    If we leave locks blocked in the finish, then they can only be unblocked
+    after someone is able to do the recovery that's needed.  So, leaving locks
+    blocked in a finish because recovery hasn't worked puts us into a special
+    state: the fs needs recovery, none of the current mounters has been able to
+    recover it, all current mounters have locks blocked in gfs, new mounters
+    are allowed, nodes can unmount, new mounters are asked to do first-mounter
+    recovery, if one of them succeeds then we can all clear this special state
+    and unblock locks (the unblock would happen upon recving the success
+    message from the new pseudo-first mounter, not as part of a finish), future
+    finishes would then go back to being able to unblock locks.
+
+    While in this special state, a new node has been added and asked to do
+    first-mounter recovery, other nodes can also be added while the new
+    first-mounter is active.  These other nodes don't notify mount.gfs.
+    They'll receive the result of the first mounter and if it succeeded they'll
+    notify mount.gfs, otherwise one of them will become the next first-mounter
+    and notify mount.gfs. */
 
 int do_finish(struct mountgroup *mg)
 {
 	struct mg_member *memb, *safe;
-	int leave_blocked = 0;
+
+	log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+		  mg->needs_recovery);
 
 	/* members_gone list are the members that were removed from the
 	   members list when processing a start.  members are removed
@@ -2192,11 +2225,10 @@
 			list_del(&memb->list);
 			free(memb);
 		} else {
+			log_error("%s finish: needs recovery jid %d nodeid %d "
+				  "status %d", mg->name, memb->jid,
+				  memb->nodeid, memb->recovery_status);
 			mg->needs_recovery = 1;
-			log_group(mg, "finish: needs recovery "
-				  "jid %d nodeid %d status %d",
-				  memb->jid, memb->nodeid,
-				  memb->recovery_status);
 		}
 	}
 
@@ -2210,12 +2242,7 @@
 		return 0;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "finish: leave locks blocked for needs_recovery");
-		leave_blocked = 1;
-	}
-
-	if (!leave_blocked) {
+	if (!mg->needs_recovery) {
 		set_sysfs(mg, "block", 0);
 
 		/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
 			mg->mount_client_delay = 0;
 			notify_mount_client(mg);
 		}
-	}
+	} else
+		log_group(mg, "finish: leave locks blocked for needs_recovery");
 
 	return 0;
 }
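
The behavioral core of the assign_journal() change: while
needs_recovery is set, at most one mounter holds MEMB_OPT_RECOVER at a
time, and a new mounter is drafted as pseudo-first-mounter only when
no candidate already exists.  A minimal standalone sketch of that
rule; the struct layout and flag value are stand-ins, not the daemon's
real definitions.

#include <stdio.h>

#define MEMB_OPT_RECOVER 0x2	/* flag value assumed for illustration */

struct mg_member {
	int nodeid;
	int opts;
};

/* While needs_recovery is set, only one mounter at a time is asked to
   attempt first-mounter recovery; later arrivals wait for its result. */
static void pick_first_mounter(struct mg_member *new_memb,
			       struct mg_member *memb_recover)
{
	if (memb_recover) {
		printf("memb %d already has OPT_RECOVER; memb %d waits\n",
		       memb_recover->nodeid, new_memb->nodeid);
		return;
	}
	new_memb->opts |= MEMB_OPT_RECOVER;
	printf("new memb %d gets OPT_RECOVER\n", new_memb->nodeid);
}

int main(void)
{
	struct mg_member a = { 1, 0 }, b = { 2, 0 };

	pick_first_mounter(&a, NULL);	/* no candidate yet: a is drafted */
	pick_first_mounter(&b, &a);	/* a still trying: b waits */
	return 0;
}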



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2006-12-19 22:19 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	teigland at sourceware.org	2006-12-19 22:19:59

Modified files:
	group/gfs_controld: recover.c 

Log message:
	Fixes related to the needs_recovery state and first-mounter recovery.
	Probably not perfect yet, but working in the tests I'm able to contrive.
	bz 218551

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.23.2.3&r2=1.23.2.4

--- cluster/group/gfs_controld/recover.c	2006/12/19 17:07:12	1.23.2.3
+++ cluster/group/gfs_controld/recover.c	2006/12/19 22:19:59	1.23.2.4
@@ -29,6 +29,7 @@
 void start_spectator_2(struct mountgroup *mg);
 void notify_mount_client(struct mountgroup *mg);
 
+
 int set_sysfs(struct mountgroup *mg, char *field, int val)
 {
 	char fname[512];
@@ -325,6 +326,9 @@
 			continue;
 		if (memb->jid == -9)
 			continue;
+		if (memb->spectator || memb->readonly || memb->withdrawing ||
+		    memb->ms_kernel_mount_done)
+			continue;
 		if (low == -1 || memb->nodeid < low) {
 			next = memb;
 			low = memb->nodeid;
@@ -641,12 +645,11 @@
 		goto out;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "receive_remount from %d needs_recovery", from);
-		msg = "error: needs recovery";
-		error = -1;
-		goto out;
-	}
+	/* FIXME: check if we've even fully completed our normal mount yet
+	   (received our own mount-status?)  if not, then disallow remount */
+
+	/* FIXME: going ro->rw may mean we can now do journal or first-mounter
+	   recovery that we couldn't do before. */
 
 	memb->readonly = ro;
 	memb->rw = !ro;
@@ -746,30 +749,19 @@
 		else if (memb->readonly)
 			ro_count++;
 
-		if (memb->opts & MEMB_OPT_RECOVER)
+		if (memb->opts & MEMB_OPT_RECOVER) {
 			memb_recover = memb;
+			log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+				  memb->nodeid);
+		}
 
 		if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
 			memb_mounted = memb;
 	}
 
-	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
-		  total, invalid_count, rw_count, ro_count, spect_count);
-
-	/* do we let the new member mount? jid=-2 means no.
-	   - we only allow an rw mount when the fs needs recovery
-	   - we only allow a single rw mount when the fs needs recovery */
-
-	if (mg->needs_recovery) {
-		if (!new->rw || rw_count)
-			new->jid = -2;
-	}
-
-	if (new->jid == -2) {
-		log_group(mg, "assign_journal: fail - needs_recovery %d",
-			  mg->needs_recovery);
-		goto out;
-	}
+	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+		  "needs_recovery %d", total, invalid_count, rw_count,
+		  ro_count, spect_count, mg->needs_recovery);
 
 	if (new->spectator) {
 		log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
 		}
 	}
 
-	/* Currently the fs needs recovery, i.e. none of the current
-	   mounters (ro/spectators) can recover journals.  So, this new rw
-	   mounter is told to do first-mounter recovery of all the journals. */
-
+	/* Repeat first-mounter recovery: the fs has been mounted and in-use,
+	   but nodes have failed and none of the current mounters has been able
+	   to do recovery (all remaining nodes may be ro/spect for example).
+	   This puts us into the special "needs_recovery" state where new
+	   mounters are asked to do first-mounter recovery of the fs while
+	   the current mounters sit in a blocked state. */
+	   
 	if (mg->needs_recovery) {
-		log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
-			  "needs_recovery", new->nodeid);
-		new->opts |= MEMB_OPT_RECOVER;
+		if (!memb_recover) {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d gets OPT_RECOVER",
+				  new->nodeid);
+			new->opts |= MEMB_OPT_RECOVER;
+		} else {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d memb %d has OPT_RECOVER",
+				  new->nodeid, memb_recover->nodeid);
+		}
 		goto out;
 	}
 
+	/* Initial first-mounter recovery: the fs is coming online, the first
+	   mg member assumes first-mounter role and other nodes join the mg
+	   while the first-mounter is working.  These non-first mounters wait
+	   for the first-mounter to finish before notifying mount.gfs.  If the
+	   first-mounter fails, one of them will become the first-mounter. */
+
 	/* it shouldn't be possible to have someone doing first mounter
 	   recovery and also have someone with the fs fully mounted */
 
@@ -839,7 +847,8 @@
 		      mg->kernel_mount_done, mg->kernel_mount_error,
 		      mg->first_mounter, mg->first_mounter_done);
 
-	log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+	log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+		  "fs not mounted", new->nodeid);
 	new->opts |= MEMB_OPT_RECOVER;
 
  out:
@@ -1006,7 +1015,7 @@
 		/* delay notifying mount client until we get a successful
 		   mount status from the first mounter */
 		log_group(mg, "other node doing first mounter recovery, "
-			  "delay notify_mount_client");
+			  "set mount_client_delay");
 		mg->mount_client_delay = 1;
 		mg->save_plocks = 0;
 		return;
@@ -1402,7 +1411,6 @@
 	if (memb_gone_recover) {
 		log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
 			  memb_gone_recover->nodeid);
-		ASSERT(!mg->mount_client_notified);
 		memb_gone_recover->tell_gfs_to_recover = 0;
 	}
 
@@ -2168,14 +2176,39 @@
 	return 0;
 }
 
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
-   and it crashes?  It shouldn't need journal recovery since the kernel umount
-   happens before leaving the group. */
+/*  After a start that initiated a recovery, everyone will go and see if they
+    can do recovery and try if they can.  If a node can't, it does start_done,
+    if it tries and fails, it does start_done, if it tries and succeeds it
+    sends a message and then does start_done once it receives it back.  So,
+    when we get a finish we know that we have all the results from the recovery
+    cycle and can judge if everything is recovered properly or not.  If so, we
+    can unblock locks (in the finish), if not, we leave them blocked (in the
+    finish).
+
+    If we leave locks blocked in the finish, then they can only be unblocked
+    after someone is able to do the recovery that's needed.  So, leaving locks
+    blocked in a finish because recovery hasn't worked puts us into a special
+    state: the fs needs recovery, none of the current mounters has been able to
+    recover it, all current mounters have locks blocked in gfs, new mounters
+    are allowed, nodes can unmount, new mounters are asked to do first-mounter
+    recovery, if one of them succeeds then we can all clear this special state
+    and unblock locks (the unblock would happen upon receiving the success
+    message from the new pseudo-first mounter, not as part of a finish), future
+    finishes would then go back to being able to unblock locks.
+
+    While in this special state, a new node has been added and asked to do
+    first-mounter recovery, other nodes can also be added while the new
+    first-mounter is active.  These other nodes don't notify mount.gfs.
+    They'll receive the result of the first mounter and if it succeeded they'll
+    notify mount.gfs, otherwise one of them will become the next first-mounter
+    and notify mount.gfs. */
 
 int do_finish(struct mountgroup *mg)
 {
 	struct mg_member *memb, *safe;
-	int leave_blocked = 0;
+
+	log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+		  mg->needs_recovery);
 
 	/* members_gone list are the members that were removed from the
 	   members list when processing a start.  members are removed
@@ -2192,11 +2225,10 @@
 			list_del(&memb->list);
 			free(memb);
 		} else {
+			log_error("%s finish: needs recovery jid %d nodeid %d "
+				  "status %d", mg->name, memb->jid,
+				  memb->nodeid, memb->recovery_status);
 			mg->needs_recovery = 1;
-			log_group(mg, "finish: needs recovery "
-				  "jid %d nodeid %d status %d",
-				  memb->jid, memb->nodeid,
-				  memb->recovery_status);
 		}
 	}
 
@@ -2210,12 +2242,7 @@
 		return 0;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "finish: leave locks blocked for needs_recovery");
-		leave_blocked = 1;
-	}
-
-	if (!leave_blocked) {
+	if (!mg->needs_recovery) {
 		set_sysfs(mg, "block", 0);
 
 		/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
 			mg->mount_client_delay = 0;
 			notify_mount_client(mg);
 		}
-	}
+	} else
+		log_group(mg, "finish: leave locks blocked for needs_recovery");
 
 	return 0;
 }



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2006-12-19 22:19 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-12-19 22:19:02

Modified files:
	group/gfs_controld: recover.c 

Log message:
	Fixes related to the needs_recovery state and first-mounter recovery.
	Probably not perfect yet, but working in the tests I'm able to contrive.
	bz 218551

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.26&r2=1.27

--- cluster/group/gfs_controld/recover.c	2006/12/19 17:05:59	1.26
+++ cluster/group/gfs_controld/recover.c	2006/12/19 22:19:01	1.27
@@ -29,6 +29,7 @@
 void start_spectator_2(struct mountgroup *mg);
 void notify_mount_client(struct mountgroup *mg);
 
+
 int set_sysfs(struct mountgroup *mg, char *field, int val)
 {
 	char fname[512];
@@ -325,6 +326,9 @@
 			continue;
 		if (memb->jid == -9)
 			continue;
+		if (memb->spectator || memb->readonly || memb->withdrawing ||
+		    memb->ms_kernel_mount_done)
+			continue;
 		if (low == -1 || memb->nodeid < low) {
 			next = memb;
 			low = memb->nodeid;
@@ -641,12 +645,11 @@
 		goto out;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "receive_remount from %d needs_recovery", from);
-		msg = "error: needs recovery";
-		error = -1;
-		goto out;
-	}
+	/* FIXME: check if we've even fully completed our normal mount yet
+	   (received our own mount-status?)  if not, then disallow remount */
+
+	/* FIXME: going ro->rw may mean we can now do journal or first-mounter
+	   recovery that we couldn't do before. */
 
 	memb->readonly = ro;
 	memb->rw = !ro;
@@ -746,30 +749,19 @@
 		else if (memb->readonly)
 			ro_count++;
 
-		if (memb->opts & MEMB_OPT_RECOVER)
+		if (memb->opts & MEMB_OPT_RECOVER) {
 			memb_recover = memb;
+			log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
+				  memb->nodeid);
+		}
 
 		if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
 			memb_mounted = memb;
 	}
 
-	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
-		  total, invalid_count, rw_count, ro_count, spect_count);
-
-	/* do we let the new member mount? jid=-2 means no.
-	   - we only allow an rw mount when the fs needs recovery
-	   - we only allow a single rw mount when the fs needs recovery */
-
-	if (mg->needs_recovery) {
-		if (!new->rw || rw_count)
-			new->jid = -2;
-	}
-
-	if (new->jid == -2) {
-		log_group(mg, "assign_journal: fail - needs_recovery %d",
-			  mg->needs_recovery);
-		goto out;
-	}
+	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
+		  "needs_recovery %d", total, invalid_count, rw_count,
+		  ro_count, spect_count, mg->needs_recovery);
 
 	if (new->spectator) {
 		log_group(mg, "assign_journal: new spectator allowed");
@@ -785,17 +777,33 @@
 		}
 	}
 
-	/* Currently the fs needs recovery, i.e. none of the current
-	   mounters (ro/spectators) can recover journals.  So, this new rw
-	   mounter is told to do first-mounter recovery of all the journals. */
-
+	/* Repeat first-mounter recovery: the fs has been mounted and in-use,
+	   but nodes have failed and none of the current mounters has been able
+	   to do recovery (all remaining nodes may be ro/spect for example).
+	   This puts us into the special "needs_recovery" state where new
+	   mounters are asked to do first-mounter recovery of the fs while
+	   the current mounters sit in a blocked state. */
+	   
 	if (mg->needs_recovery) {
-		log_group(mg, "assign_journal: memb %d gets OPT_RECOVER, "
-			  "needs_recovery", new->nodeid);
-		new->opts |= MEMB_OPT_RECOVER;
+		if (!memb_recover) {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d gets OPT_RECOVER",
+				  new->nodeid);
+			new->opts |= MEMB_OPT_RECOVER;
+		} else {
+			log_group(mg, "assign_journal: needs_recovery: "
+				  "new memb %d memb %d has OPT_RECOVER",
+				  new->nodeid, memb_recover->nodeid);
+		}
 		goto out;
 	}
 
+	/* Initial first-mounter recovery: the fs is coming online, the first
+	   mg member assumes first-mounter role and other nodes join the mg
+	   while the first-mounter is working.  These non-first mounters wait
+	   for the first-mounter to finish before notifying mount.gfs.  If the
+	   first-mounter fails, one of them will become the first-mounter. */
+
 	/* it shouldn't be possible to have someone doing first mounter
 	   recovery and also have someone with the fs fully mounted */
 
@@ -839,7 +847,8 @@
 		      mg->kernel_mount_done, mg->kernel_mount_error,
 		      mg->first_mounter, mg->first_mounter_done);
 
-	log_group(mg, "assign_journal: memb %d gets OPT_RECOVER", new->nodeid);
+	log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
+		  "fs not mounted", new->nodeid);
 	new->opts |= MEMB_OPT_RECOVER;
 
  out:
@@ -1006,7 +1015,7 @@
 		/* delay notifying mount client until we get a successful
 		   mount status from the first mounter */
 		log_group(mg, "other node doing first mounter recovery, "
-			  "delay notify_mount_client");
+			  "set mount_client_delay");
 		mg->mount_client_delay = 1;
 		mg->save_plocks = 0;
 		return;
@@ -1402,7 +1411,6 @@
 	if (memb_gone_recover) {
 		log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
 			  memb_gone_recover->nodeid);
-		ASSERT(!mg->mount_client_notified);
 		memb_gone_recover->tell_gfs_to_recover = 0;
 	}
 
@@ -2168,14 +2176,39 @@
 	return 0;
 }
 
-/* FIXME: what happens if a node is unmounting, others have it in members_gone,
-   and it crashes?  It shouldn't need journal recovery since the kernel umount
-   happens before leaving the group. */
+/*  After a start that initiated a recovery, everyone will go and see if they
+    can do recovery and try if they can.  If a node can't, it does start_done,
+    if it tries and fails, it does start_done, if it tries and succeeds it
+    sends a message and then does start_done once it receives it back.  So,
+    when we get a finish we know that we have all the results from the recovery
+    cycle and can judge if everything is recovered properly or not.  If so, we
+    can unblock locks (in the finish), if not, we leave them blocked (in the
+    finish).
+
+    If we leave locks blocked in the finish, then they can only be unblocked
+    after someone is able to do the recovery that's needed.  So, leaving locks
+    blocked in a finish because recovery hasn't worked puts us into a special
+    state: the fs needs recovery, none of the current mounters has been able to
+    recover it, all current mounters have locks blocked in gfs, new mounters
+    are allowed, nodes can unmount, new mounters are asked to do first-mounter
+    recovery, if one of them succeeds then we can all clear this special state
+    and unblock locks (the unblock would happen upon receiving the success
+    message from the new pseudo-first mounter, not as part of a finish), future
+    finishes would then go back to being able to unblock locks.
+
+    While in this special state, a new node has been added and asked to do
+    first-mounter recovery, other nodes can also be added while the new
+    first-mounter is active.  These other nodes don't notify mount.gfs.
+    They'll receive the result of the first mounter and if it succeeded they'll
+    notify mount.gfs, otherwise one of them will become the next first-mounter
+    and notify mount.gfs. */
 
 int do_finish(struct mountgroup *mg)
 {
 	struct mg_member *memb, *safe;
-	int leave_blocked = 0;
+
+	log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
+		  mg->needs_recovery);
 
 	/* members_gone list are the members that were removed from the
 	   members list when processing a start.  members are removed
@@ -2192,11 +2225,10 @@
 			list_del(&memb->list);
 			free(memb);
 		} else {
+			log_error("%s finish: needs recovery jid %d nodeid %d "
+				  "status %d", mg->name, memb->jid,
+				  memb->nodeid, memb->recovery_status);
 			mg->needs_recovery = 1;
-			log_group(mg, "finish: needs recovery "
-				  "jid %d nodeid %d status %d",
-				  memb->jid, memb->nodeid,
-				  memb->recovery_status);
 		}
 	}
 
@@ -2210,12 +2242,7 @@
 		return 0;
 	}
 
-	if (mg->needs_recovery) {
-		log_group(mg, "finish: leave locks blocked for needs_recovery");
-		leave_blocked = 1;
-	}
-
-	if (!leave_blocked) {
+	if (!mg->needs_recovery) {
 		set_sysfs(mg, "block", 0);
 
 		/* we may have been holding back our local mount due to
@@ -2224,7 +2251,8 @@
 			mg->mount_client_delay = 0;
 			notify_mount_client(mg);
 		}
-	}
+	} else
+		log_group(mg, "finish: leave locks blocked for needs_recovery");
 
 	return 0;
 }



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2006-10-16 15:09 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-10-16 15:09:26

Modified files:
	group/gfs_controld: recover.c 

Log message:
	fix typo in debug message

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.20&r2=1.21

--- cluster/group/gfs_controld/recover.c	2006/10/16 14:44:02	1.20
+++ cluster/group/gfs_controld/recover.c	2006/10/16 15:09:25	1.21
@@ -826,7 +826,7 @@
 	   which means no first mounter recovery is needed or is current */
 
 	if (mg->global_first_recover_done) {
-		log_group(mg, "assign_journal: global_firsts_recover_done");
+		log_group(mg, "assign_journal: global_first_recover_done");
 		goto out;
 	}
 



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2006-08-31 18:56 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-31 18:56:25

Modified files:
	group/gfs_controld: recover.c 

Log message:
	When deciding whether we need to unlink the checkpoint and resend journals
	for a failed master node, we weren't distinguishing between the master
	failing (where we need to do this) and the master just leaving
	(where we don't).

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.16&r2=1.17

--- cluster/group/gfs_controld/recover.c	2006/08/21 19:38:53	1.16
+++ cluster/group/gfs_controld/recover.c	2006/08/31 18:56:25	1.17
@@ -967,6 +967,7 @@
 {
 	struct mg_member *memb, *safe;
 	int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
+	int master_failed = 0;
 
 	/* move departed nodes from members list to members_gone */
 
@@ -1017,6 +1018,10 @@
 				  memb->wait_gfs_recover_done);
 
 			purge_plocks(mg, memb->nodeid, 0);
+
+			if (mg->master_nodeid == memb->nodeid &&
+			    memb->gone_type == GROUP_NODE_FAILED)
+				master_failed = 1;
 		}
 	}	
 
@@ -1048,7 +1053,7 @@
 	   - store plocks in ckpt for the new mounters to read when they
 	     get the journals msg from us */
 
-	if (neg &&
+	if (neg && master_failed &&
 	    (prev_master_nodeid != -1) &&
 	    (prev_master_nodeid != mg->master_nodeid) &&
 	    (our_nodeid == mg->master_nodeid)) {
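
The fix reduces to one predicate: take over the old master's duties
(unlink the checkpoint, resend journals) only when that master failed,
since a node that left cleanly already ran the kernel umount before
leaving the group.  A standalone sketch of the check; the types and
gone_type values are assumptions for illustration.

#include <stdio.h>

/* gone_type values assumed for illustration */
enum { GROUP_NODE_FAILED = 1, GROUP_NODE_LEFT = 2 };

struct gone_member {
	int nodeid;
	int gone_type;
};

/* Takeover work is needed only if the departed node was the master
   AND it failed rather than left cleanly. */
static int master_failed_takeover(int master_nodeid,
				  const struct gone_member *memb)
{
	return memb->nodeid == master_nodeid &&
	       memb->gone_type == GROUP_NODE_FAILED;
}

int main(void)
{
	struct gone_member crashed = { 3, GROUP_NODE_FAILED };
	struct gone_member left = { 3, GROUP_NODE_LEFT };

	printf("crashed master -> takeover: %d\n",
	       master_failed_takeover(3, &crashed));
	printf("departed master -> takeover: %d\n",
	       master_failed_takeover(3, &left));
	return 0;
}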



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2006-08-21 19:38 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-21 19:38:53

Modified files:
	group/gfs_controld: recover.c 

Log message:
	expand the number of cases where we don't tell gfs-kernel to do recovery
	because it won't be able to -- especially cases where a mount is in
	progress but not yet far enough along for gfs to do journal recovery

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.15&r2=1.16

--- cluster/group/gfs_controld/recover.c	2006/08/21 17:46:19	1.15
+++ cluster/group/gfs_controld/recover.c	2006/08/21 19:38:53	1.16
@@ -1286,25 +1286,49 @@
    and moves the memb structs for those nodes into members_gone
    and sets memb->tell_gfs_to_recover on them */
 
+/* we don't want to tell gfs-kernel to do journal recovery for a failed
+   node in a number of cases:
+   - we're a spectator or readonly mount
+   - gfs-kernel is currently withdrawing
+   - we're mounting and haven't received a journals message yet
+   - we're mounting and got a kernel mount error back from mount.gfs
+   - we're mounting and haven't notified mount.gfs yet (to do mount(2))
+   - we're mounting and got_kernel_mount is 0, i.e. we've not seen a uevent
+     related to the kernel mount yet
+   (some of the mounting checks should be obviated by others)
+
+   the problem we're trying to avoid here is telling gfs-kernel to do
+   recovery when it can't for some reason and then waiting forever for
+   a recovery_done signal that will never arrive. */
+
 void recover_journals(struct mountgroup *mg)
 {
 	struct mg_member *memb;
 	int rv;
 
-	/* we can't do journal recovery if: we're a spectator or readonly
-	   mount, gfs is currently withdrawing, or we're mounting and haven't
-	   received a journals message yet */
+	if (mg->spectator ||
+	    mg->readonly ||
+	    mg->withdraw ||
+	    mg->our_jid == JID_INIT ||
+	    mg->kernel_mount_error ||
+	    !mg->mount_client_notified ||
+	    !mg->got_kernel_mount) {
 
-	if (mg->spectator || mg->readonly || mg->withdraw ||
-	    mg->our_jid == JID_INIT) {
 		list_for_each_entry(memb, &mg->members_gone, list) {
 			if (!memb->tell_gfs_to_recover)
 				continue;
 
-			log_group(mg, "recover journal %d nodeid %d skip, "
-				  "spect %d ro %d our_jid %d",
+			log_group(mg, "recover journal %d nodeid %d skip: "
+				  "%d %d %d %d %d %d %d",
 				  memb->jid, memb->nodeid,
-				  mg->spectator, mg->readonly, mg->our_jid);
+				  mg->spectator,
+				  mg->readonly,
+				  mg->withdraw,
+				  mg->our_jid,
+				  mg->kernel_mount_error,
+				  mg->mount_client_notified,
+				  mg->got_kernel_mount);
+
 			memb->tell_gfs_to_recover = 0;
 			memb->local_recovery_status = RS_READONLY;
 		}
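
The expanded guard is a single predicate over mountgroup state: if any
of these flags shows gfs-kernel could not act on a recovery request,
skip the request rather than wait forever for a recovery_done signal.
A standalone sketch with stand-in types; the JID_INIT value is an
assumption (the diff shows only the name).

#include <stdio.h>

#define JID_INIT -9	/* sentinel value assumed for illustration */

struct mg_state {
	int spectator, readonly, withdraw, our_jid;
	int kernel_mount_error, mount_client_notified, got_kernel_mount;
};

/* Any one of these means gfs-kernel cannot perform journal recovery
   right now, so asking it to would block the group forever. */
static int cannot_recover_journals(const struct mg_state *mg)
{
	return mg->spectator || mg->readonly || mg->withdraw ||
	       mg->our_jid == JID_INIT || mg->kernel_mount_error ||
	       !mg->mount_client_notified || !mg->got_kernel_mount;
}

int main(void)
{
	/* mid-mount: no jid assigned yet, mount.gfs not yet notified */
	struct mg_state mounting = { 0, 0, 0, JID_INIT, 0, 0, 0 };

	printf("skip recovery request: %d\n",
	       cannot_recover_journals(&mounting));
	return 0;
}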



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2006-08-15 21:38 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-15 21:38:00

Modified files:
	group/gfs_controld: recover.c 

Log message:
	errors opening sysfs files are normal/expected in many cases, so
	don't complain about them in syslog

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.12&r2=1.13

--- cluster/group/gfs_controld/recover.c	2006/08/14 17:22:53	1.12
+++ cluster/group/gfs_controld/recover.c	2006/08/15 21:38:00	1.13
@@ -41,7 +41,7 @@
 
 	fd = open(fname, O_RDWR);
 	if (fd < 0) {
-		log_error("open %s error %d %d", fname, fd, errno);
+		log_group(mg, "set open %s error %d %d", fname, fd, errno);
 		return -1;
 	}
 
@@ -71,7 +71,7 @@
 
 	fd = open(fname, O_RDONLY);
 	if (fd < 0) {
-		log_error("open %s error %d %d", fname, fd, errno);
+		log_group(mg, "get open %s error %d %d", fname, fd, errno);
 		return -1;
 	}
 



* [Cluster-devel] cluster/group/gfs_controld recover.c
From: teigland @ 2006-08-10 19:40 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-10 19:40:50

Modified files:
	group/gfs_controld: recover.c 

Log message:
	log_debug() when we receive a withdraw message

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.10&r2=1.11

--- cluster/group/gfs_controld/recover.c	2006/08/08 21:19:17	1.10
+++ cluster/group/gfs_controld/recover.c	2006/08/10 19:40:50	1.11
@@ -170,6 +170,7 @@
 		log_group(mg, "receive_withdraw no member %d", from);
 		return;
 	}
+	log_group(mg, "receive_withdraw from %d", from);
 	memb->withdrawing = 1;
 
 	if (from == our_nodeid)



