cluster-devel.redhat.com archive mirror
* [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c  ...
@ 2006-08-07 16:57 teigland
From: teigland @ 2006-08-07 16:57 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-07 16:57:50

Modified files:
	group/gfs_controld: lock_dlm.h plock.c recover.c 

Log message:
	free all plock state for an fs when it's unmounted
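
For orientation, a minimal standalone sketch of the two call modes this
patch gives purge_plocks(): purging one failed node's locks versus
purging everything at unmount.  The struct and flat array below are
simplified stand-ins for the real resource/lock/waiter lists, not
gfs_controld code:

#include <stdio.h>

struct plock { int nodeid; int freed; };

static void purge(struct plock *locks, int n, int nodeid, int unmount)
{
	int i, purged = 0;

	for (i = 0; i < n; i++) {
		if (locks[i].freed)
			continue;
		if (locks[i].nodeid == nodeid || unmount) {
			locks[i].freed = 1;	/* stands in for list_del() + free() */
			purged++;
		}
	}
	printf("purged %d plocks for %d\n", purged, nodeid);
}

int main(void)
{
	struct plock locks[] = { {1, 0}, {2, 0}, {2, 0}, {3, 0} };

	purge(locks, 4, 2, 0);	/* node 2 failed: only its state is freed */
	purge(locks, 4, 0, 1);	/* unmount: all remaining state is freed */
	return 0;
}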

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.10&r2=1.11
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.7&r2=1.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.8&r2=1.9

--- cluster/group/gfs_controld/lock_dlm.h	2006/08/04 21:56:10	1.10
+++ cluster/group/gfs_controld/lock_dlm.h	2006/08/07 16:57:50	1.11
@@ -263,6 +263,6 @@
 void retrieve_plocks(struct mountgroup *mg);
 int dump_plocks(char *name, int fd);
 void process_saved_plocks(struct mountgroup *mg);
-void purge_plocks(struct mountgroup *mg, int nodeid);
+void purge_plocks(struct mountgroup *mg, int nodeid, int unmount);
 
 #endif
--- cluster/group/gfs_controld/plock.c	2006/08/04 21:56:10	1.7
+++ cluster/group/gfs_controld/plock.c	2006/08/07 16:57:50	1.8
@@ -942,7 +942,7 @@
 	}
 }
 
-void purge_plocks(struct mountgroup *mg, int nodeid)
+void purge_plocks(struct mountgroup *mg, int nodeid, int unmount)
 {
 	struct posix_lock *po, *po2;
 	struct lock_waiter *w, *w2;
@@ -951,7 +951,7 @@
 
 	list_for_each_entry_safe(r, r2, &mg->resources, list) {
 		list_for_each_entry_safe(po, po2, &r->locks, list) {
-			if (po->nodeid == nodeid) {
+			if (po->nodeid == nodeid || unmount) {
 				list_del(&po->list);
 				free(po);
 				purged++;
@@ -959,7 +959,7 @@
 		}
 
 		list_for_each_entry_safe(w, w2, &r->waiters, list) {
-			if (w->info.nodeid == nodeid) {
+			if (w->info.nodeid == nodeid || unmount) {
 				list_del(&w->list);
 				free(w);
 				purged++;
--- cluster/group/gfs_controld/recover.c	2006/08/04 21:56:10	1.8
+++ cluster/group/gfs_controld/recover.c	2006/08/07 16:57:50	1.9
@@ -966,7 +966,7 @@
 				  memb->spectator,
 				  memb->wait_gfs_recover_done);
 
-			purge_plocks(mg, memb->nodeid);
+			purge_plocks(mg, memb->nodeid, 0);
 		}
 	}	
 
@@ -2011,9 +2011,7 @@
 
 int do_terminate(struct mountgroup *mg)
 {
-	/* FIXME: all group members aren't guaranteed to be stopped for
-	   our leave yet when we get terminate.  We need that guarantee
-	   before we tell a withdrawing gfs to drop locks. */
+	purge_plocks(mg, 0, 1);
 
 	if (mg->withdraw) {
 		log_group(mg, "termination of our withdraw leave");



* [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c  ...
@ 2006-08-21 17:46 teigland
From: teigland @ 2006-08-21 17:46 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-21 17:46:20

Modified files:
	group/gfs_controld: lock_dlm.h plock.c recover.c 

Log message:
	- the check for us becoming the new low nodeid after the previous
	one failed, and then unlinking the ckpt, wasn't adequately
	verifying that the old low node had actually failed
	- rename low_finished_nodeid to master_nodeid and clarify some of
	the code using it, since it was confusing and misleading
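
As a worked example of the rule in the patch below: the master is the
lowest nodeid among members that were in the last "finished" group,
while low_nodeid is simply the lowest nodeid.  A standalone sketch
with toy member values, not the real mountgroup structures:

#include <stdio.h>

struct member { int nodeid; int finished; };

int main(void)
{
	/* node 1 just joined (not finished); nodes 2 and 3 are established */
	struct member m[] = { {1, 0}, {2, 1}, {3, 1} };
	int i, low = -1, master = -1;

	for (i = 0; i < 3; i++) {
		if (low == -1 || m[i].nodeid < low)
			low = m[i].nodeid;
		if (!m[i].finished)
			continue;
		if (master == -1 || m[i].nodeid < master)
			master = m[i].nodeid;
	}
	/* prints "low 1 master 2": the new node can't be master because
	   it has no plock state to send to later mounters */
	printf("low %d master %d\n", low, master);
	return 0;
}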

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.15&r2=1.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.18&r2=1.19
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.14&r2=1.15

--- cluster/group/gfs_controld/lock_dlm.h	2006/08/18 16:33:08	1.15
+++ cluster/group/gfs_controld/lock_dlm.h	2006/08/21 17:46:19	1.16
@@ -148,8 +148,8 @@
 	int			first_mounter_done;
 	int			emulate_first_mounter;
 	int			wait_first_done;
-	int			low_finished_nodeid;
 	int			low_nodeid;
+	int			master_nodeid;
 	int			save_plocks;
 
 	uint64_t		cp_handle;
--- cluster/group/gfs_controld/plock.c	2006/08/18 16:33:08	1.18
+++ cluster/group/gfs_controld/plock.c	2006/08/21 17:46:19	1.19
@@ -1374,7 +1374,8 @@
 	saCkptSectionIterationFinalize(itr);
  out:
 	if (mg->low_nodeid == our_nodeid) {
-		log_group(mg, "retrieve_plocks: unlink ckpt from old low node");
+		/* we're the new low nodeid, will be master */
+		log_group(mg, "retrieve_plocks: unlink ckpt from old master");
 		_unlink_checkpoint(mg, &name);
 	} else
 		saCkptCheckpointClose(h);
--- cluster/group/gfs_controld/recover.c	2006/08/18 16:33:08	1.14
+++ cluster/group/gfs_controld/recover.c	2006/08/21 17:46:19	1.15
@@ -514,9 +514,8 @@
 	free(buf);
 }
 
-/* We set the new member's jid to the lowest unused jid.
-   If we're the lowest existing member (by nodeid), then
-   send jid info to the new node. */
+/* We set the new member's jid to the lowest unused jid.  If we're the lowest
+   existing member (by nodeid), then send jid info to the new node. */
 
 /* Look at rw/ro/spectator status of all existing mounters and whether
    we need to do recovery.  Based on that, decide if the current mount
@@ -590,14 +589,14 @@
 	log_group(mg, "assign_journal: new member %d got jid %d",
 		  new->nodeid, new->jid);
 
-	if (mg->low_finished_nodeid == our_nodeid)
+	if (mg->master_nodeid == our_nodeid) {
 		store_plocks(mg, new->nodeid);
 
-	/* if we're the first mounter and haven't gotten others_may_mount
-	   yet, then don't send journals until kernel_recovery_done_first
-	   so the second node won't mount the fs until omm. */
+		/* if we're the first mounter and haven't gotten
+		   others_may_mount yet, then don't send journals until
+		   kernel_recovery_done_first so the second node won't mount
+		   the fs until omm. */
 
-	if (mg->low_finished_nodeid == our_nodeid) {
 		if (mg->first_mounter && !mg->first_mounter_done) {
 			log_group(mg, "delay sending journals to %d",
 				  new->nodeid);
@@ -911,13 +910,63 @@
 	clear_memb_list(&mg->members_gone);
 }
 
+/* New mounters may be waiting for a journals message that a failed node (as
+   master) would have sent.  If the master failed and we're the new master,
+   then send a journals message to any nodes for whom we've not seen a journals
+   message.  We also need to checkpoint the plock state for the new nodes to
+   read after they get their journals message. */
+
+void resend_journals(struct mountgroup *mg)
+{
+	struct mg_member *memb;
+	int stored_plocks = 0;
+
+	list_for_each_entry(memb, &mg->members, list) {
+		if (!memb->needs_journals)
+			continue;
+
+		if (!stored_plocks) {
+			store_plocks(mg, memb->nodeid);
+			stored_plocks = 1;
+		}
+
+		log_group(mg, "resend_journals to %d", memb->nodeid);
+		send_journals(mg, memb->nodeid);
+	}
+}
+
+/* The master node is the member of the group with the lowest nodeid who
+   was also a member of the last "finished" group, i.e. a member of the
+   group the last time it got a finish callback.  The job of the master
+   is to send state info to new nodes joining the group, and doing that
+   requires that the master has all the state to send -- a new joining
+   node that has the lowest nodeid doesn't have any state, which is why
+   we add the "finished" requirement. */
+
+void update_master_nodeid(struct mountgroup *mg)
+{
+	struct mg_member *memb;
+	int new = -1, low = -1;
+
+	list_for_each_entry(memb, &mg->members, list) {
+		if (low == -1 || memb->nodeid < low)
+			low = memb->nodeid;
+		if (!memb->finished)
+			continue;
+		if (new == -1 || memb->nodeid < new)
+			new = memb->nodeid;
+	}
+	mg->master_nodeid = new;
+	mg->low_nodeid = low;
+}
+
 /* This can happen before we receive a journals message for our mount. */
 
 void recover_members(struct mountgroup *mg, int num_nodes,
  		     int *nodeids, int *pos_out, int *neg_out)
 {
 	struct mg_member *memb, *safe;
-	int i, found, id, pos = 0, neg = 0, low = -1, old_low_finished_nodeid;
+	int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
 
 	/* move departed nodes from members list to members_gone */
 
@@ -982,30 +1031,31 @@
 		log_group(mg, "add member %d", id);
 	}
 
-	list_for_each_entry(memb, &mg->members, list) {
-		if (mg->low_nodeid == -1 || memb->nodeid < mg->low_nodeid)
-			mg->low_nodeid = memb->nodeid;
-		if (!memb->finished)
-			continue;
-		if (low == -1 || memb->nodeid < low)
-			low = memb->nodeid;
-	}
-	old_low_finished_nodeid = mg->low_finished_nodeid;
-	mg->low_finished_nodeid = low;
+	prev_master_nodeid = mg->master_nodeid;
+	update_master_nodeid(mg);
 
 	*pos_out = pos;
 	*neg_out = neg;
 
-	log_group(mg, "total members %d low_finished_nodeid %d",
-		  mg->memb_count, low);
+	log_group(mg, "total members %d master_nodeid %d prev %d",
+		  mg->memb_count, mg->master_nodeid, prev_master_nodeid);
 
-	/* the low nodeid failed and we're the new low nodeid, we need
-	   to unlink the ckpt that the failed node had open so new ckpts
-	   can be created down the road */
-	if ((old_low_finished_nodeid != low) && (our_nodeid == low)) {
-		log_group(mg, "unlink ckpt for failed low node %d",
-			  old_low_finished_nodeid);
+	/* the master failed and we're the new master, we need to:
+	   - unlink the ckpt that the failed master had open so new ckpts
+	     can be created down the road
+	   - resend journals msg to any nodes that needed one from the
+	     failed master
+	   - store plocks in ckpt for the new mounters to read when they
+	     get the journals msg from us */
+
+	if (neg &&
+	    (prev_master_nodeid != -1) &&
+	    (prev_master_nodeid != mg->master_nodeid) &&
+	    (our_nodeid == mg->master_nodeid)) {
+		log_group(mg, "unlink ckpt for failed master %d",
+			  prev_master_nodeid);
 		unlink_checkpoint(mg);
+		resend_journals(mg);
 	}
 }
 
@@ -1021,6 +1071,7 @@
 	INIT_LIST_HEAD(&mg->resources);
 	INIT_LIST_HEAD(&mg->saved_messages);
 	mg->init = 1;
+	mg->master_nodeid = -1;
 	mg->low_nodeid = -1;
 
 	strncpy(mg->name, name, MAXNAME);
@@ -1925,31 +1976,6 @@
 	}
 }
 
-/* New mounters may be waiting for a journals message that a failed node (as
-   low nodeid) would have sent.  If the low nodeid failed and we're the new low
-   nodeid, then send a journals message to any nodes for whom we've not seen a
-   journals message.  We also need to checkpoint the plock state for the new
-   nodes to read after they get their journals message. */
-
-void resend_journals(struct mountgroup *mg)
-{
-	struct mg_member *memb;
-	int stored_plocks = 0;
-
-	list_for_each_entry(memb, &mg->members, list) {
-		if (!memb->needs_journals)
-			continue;
-
-		if (!stored_plocks) {
-			store_plocks(mg, memb->nodeid);
-			stored_plocks = 1;
-		}
-
-		log_group(mg, "resend_journals to %d", memb->nodeid);
-		send_journals(mg, memb->nodeid);
-	}
-}
-
 /*
    old method:
    A is rw mount, B mounts rw
@@ -1987,7 +2013,7 @@
 
 void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids)
 {
-	int pos = 0, neg = 0, low;
+	int pos = 0, neg = 0;
 
 	mg->start_event_nr = mg->last_start;
 	mg->start_type = type;
@@ -1995,18 +2021,9 @@
 	log_group(mg, "start %d init %d type %d member_count %d",
 		  mg->last_start, mg->init, type, member_count);
 
-	low = mg->low_finished_nodeid;
-
 	recover_members(mg, member_count, nodeids, &pos, &neg);
-
 	reset_unfinished_recoveries(mg);
 
-	if (neg && low != mg->low_finished_nodeid && low == our_nodeid) {
-		log_group(mg, "low nodeid failed old %d new %d",
-			  low, mg->low_finished_nodeid);
-		resend_journals(mg);
-	}
-
 	if (mg->init) {
 		if (member_count == 1)
 			start_first_mounter(mg);



* [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c  ...
@ 2006-08-18 16:33 teigland
From: teigland @ 2006-08-18 16:33 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-18 16:33:08

Modified files:
	group/gfs_controld: lock_dlm.h plock.c recover.c 

Log message:
	when the low nodeid fails, the checkpoint needs to be unlinked;
	otherwise creating the ckpt will fail down the road when another
	node mounts
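
A minimal sketch of the check added at the end of recover_members() in
the patch below, with a stub in place of the real unlink: if the old
low (finished) nodeid is gone and we are now the low nodeid, we unlink
the checkpoint the failed node left behind.

#include <stdio.h>

static void unlink_checkpoint_stub(const char *fsname)
{
	/* stands in for the new unlink_checkpoint(mg) wrapper */
	printf("unlink ckpt gfsplock.%s\n", fsname);
}

static void check_low_node(int old_low, int new_low, int our_nodeid,
			   const char *fsname)
{
	if (old_low != new_low && our_nodeid == new_low) {
		printf("unlink ckpt for failed low node %d\n", old_low);
		unlink_checkpoint_stub(fsname);
	}
}

int main(void)
{
	/* node 1 (old low) failed; we are node 2 and are now lowest */
	check_low_node(1, 2, 2, "testfs");
	return 0;
}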

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.14&r2=1.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.17&r2=1.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.13&r2=1.14

--- cluster/group/gfs_controld/lock_dlm.h	2006/08/14 17:22:53	1.14
+++ cluster/group/gfs_controld/lock_dlm.h	2006/08/18 16:33:08	1.15
@@ -276,5 +276,6 @@
 int dump_plocks(char *name, int fd);
 void process_saved_plocks(struct mountgroup *mg);
 void purge_plocks(struct mountgroup *mg, int nodeid, int unmount);
+int unlink_checkpoint(struct mountgroup *mg);
 
 #endif
--- cluster/group/gfs_controld/plock.c	2006/08/17 19:39:17	1.17
+++ cluster/group/gfs_controld/plock.c	2006/08/18 16:33:08	1.18
@@ -1034,7 +1034,7 @@
 	return 0;
 }
 
-int unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
+int _unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
 {
 	SaCkptCheckpointHandleT h;
 	SaCkptCheckpointDescriptorT s;
@@ -1097,6 +1097,16 @@
 	return ret;
 }
 
+int unlink_checkpoint(struct mountgroup *mg)
+{
+	SaNameT name;
+	int len;
+
+	len = snprintf(name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", mg->name);
+	name.length = len;
+	return _unlink_checkpoint(mg, &name);
+}
+
 /* Copy all plock state into a checkpoint so new node can retrieve it.  The
    node creating the ckpt for the mounter needs to be the same node that's
    sending the mounter its journals message (i.e. the low nodeid).  The new
@@ -1139,7 +1149,7 @@
 
 	/* unlink an old checkpoint before we create a new one */
 	if (mg->cp_handle) {
-		if (unlink_checkpoint(mg, &name))
+		if (_unlink_checkpoint(mg, &name))
 			return;
 	}
 
@@ -1231,7 +1241,7 @@
 			/* this shouldn't happen in general */
 			log_group(mg, "store_plocks: clearing old ckpt");
 			saCkptCheckpointClose(h);
-			unlink_checkpoint(mg, &name);
+			_unlink_checkpoint(mg, &name);
 			goto open_retry;
 		}
 		if (rv != SA_AIS_OK) {
@@ -1318,6 +1328,9 @@
 			goto out_it;
 		}
 
+		if (!desc.sectionSize)
+			continue;
+
 		iov.sectionId = desc.sectionId;
 		iov.dataBuffer = &section_buf;
 		iov.dataSize = desc.sectionSize;
@@ -1362,7 +1375,7 @@
  out:
 	if (mg->low_nodeid == our_nodeid) {
 		log_group(mg, "retrieve_plocks: unlink ckpt from old low node");
-		unlink_checkpoint(mg, &name);
+		_unlink_checkpoint(mg, &name);
 	} else
 		saCkptCheckpointClose(h);
 }
@@ -1372,8 +1385,7 @@
 	struct posix_lock *po, *po2;
 	struct lock_waiter *w, *w2;
 	struct resource *r, *r2;
-	int len, purged = 0;
-	SaNameT name;
+	int purged = 0;
 
 	list_for_each_entry_safe(r, r2, &mg->resources, list) {
 		list_for_each_entry_safe(po, po2, &r->locks, list) {
@@ -1408,12 +1420,8 @@
 	   we need to unlink it so another node can create a new ckpt for
 	   the next mounter after we leave */
 
-	if (unmount && mg->cp_handle) {
-		len = snprintf(name.value, SA_MAX_NAME_LENGTH,
-			       "gfsplock.%s", mg->name);
-		name.length = len;
-		unlink_checkpoint(mg, &name);
-	}
+	if (unmount && mg->cp_handle)
+		unlink_checkpoint(mg);
 }
 
 int dump_plocks(char *name, int fd)
--- cluster/group/gfs_controld/recover.c	2006/08/15 21:38:00	1.13
+++ cluster/group/gfs_controld/recover.c	2006/08/18 16:33:08	1.14
@@ -917,7 +917,7 @@
  		     int *nodeids, int *pos_out, int *neg_out)
 {
 	struct mg_member *memb, *safe;
-	int i, found, id, pos = 0, neg = 0, low = -1;
+	int i, found, id, pos = 0, neg = 0, low = -1, old_low_finished_nodeid;
 
 	/* move departed nodes from members list to members_gone */
 
@@ -990,6 +990,7 @@
 		if (low == -1 || memb->nodeid < low)
 			low = memb->nodeid;
 	}
+	old_low_finished_nodeid = mg->low_finished_nodeid;
 	mg->low_finished_nodeid = low;
 
 	*pos_out = pos;
@@ -997,6 +998,15 @@
 
 	log_group(mg, "total members %d low_finished_nodeid %d",
 		  mg->memb_count, low);
+
+	/* the low nodeid failed and we're the new low nodeid, we need
+	   to unlink the ckpt that the failed node had open so new ckpts
+	   can be created down the road */
+	if ((old_low_finished_nodeid != low) && (our_nodeid == low)) {
+		log_group(mg, "unlink ckpt for failed low node %d",
+			  old_low_finished_nodeid);
+		unlink_checkpoint(mg);
+	}
 }
 
 struct mountgroup *create_mg(char *name)



* [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c  ...
@ 2006-08-08 21:19 teigland
From: teigland @ 2006-08-08 21:19 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-08 21:19:18

Modified files:
	group/gfs_controld: lock_dlm.h plock.c recover.c 

Log message:
	The idea of having the last node that did the checkpoint try to
	reuse it, even if it was no longer the low nodeid, doesn't work:
	the new mounter tries to read the ckpt when it gets the journals
	message from the low nodeid, before the ckpt has been written by
	the other node.  Now the low nodeid is always the one to create a
	ckpt for a new mounter, which means the node holding the last
	ckpt needs to unlink it when it sees a new low nodeid join the
	group.
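
A small sketch of the resulting handoff on the storing side, as the
log message describes: after storing a ckpt for a new mounter whose
nodeid is lower than ours, the mounter becomes the new low node and
ckpt-creation duty moves to it, so we close our handle.  The stub and
typedef below stand in for saCkptCheckpointClose() and
SaCkptCheckpointHandleT; the handle value is arbitrary:

#include <stdio.h>

typedef unsigned long long handle_t;

static void ckpt_close(handle_t h)
{
	printf("close ckpt handle %llx\n", h);
}

static void store_plocks_tail(int new_nodeid, int our_nodeid, handle_t *cp)
{
	if (new_nodeid < our_nodeid) {
		printf("closing ckpt for new low node %d\n", new_nodeid);
		ckpt_close(*cp);
		*cp = 0;	/* we no longer hold the open ckpt */
	}
}

int main(void)
{
	handle_t cp_handle = 0xabc;

	store_plocks_tail(1, 2, &cp_handle);	/* node 1 mounts below us */
	return 0;
}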

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.11&r2=1.12
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.10&r2=1.11
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.9&r2=1.10

--- cluster/group/gfs_controld/lock_dlm.h	2006/08/07 16:57:50	1.11
+++ cluster/group/gfs_controld/lock_dlm.h	2006/08/08 21:19:17	1.12
@@ -140,6 +140,7 @@
 	int			emulate_first_mounter;
 	int			wait_first_done;
 	int			low_finished_nodeid;
+	int			low_nodeid;
 	int			save_plocks;
 
 	uint64_t		cp_handle;
@@ -259,7 +260,7 @@
 
 int send_group_message(struct mountgroup *mg, int len, char *buf);
 
-void store_plocks(struct mountgroup *mg);
+void store_plocks(struct mountgroup *mg, int nodeid);
 void retrieve_plocks(struct mountgroup *mg);
 int dump_plocks(char *name, int fd);
 void process_saved_plocks(struct mountgroup *mg);
--- cluster/group/gfs_controld/plock.c	2006/08/08 19:37:33	1.10
+++ cluster/group/gfs_controld/plock.c	2006/08/08 21:19:17	1.11
@@ -1094,20 +1094,18 @@
 	return ret;
 }
 
-/* Copy all plock state into a checkpoint so new node can retrieve it.
+/* Copy all plock state into a checkpoint so new node can retrieve it.  The
+   node creating the ckpt for the mounter needs to be the same node that's
+   sending the mounter its journals message (i.e. the low nodeid).  The new
+   mounter knows the ckpt is ready to read only after it gets its journals
+   message.
+ 
+   If the mounter is becoming the new low nodeid in the group, the node doing
+   the store closes the ckpt and the new node unlinks the ckpt after reading
+   it.  The ckpt should then disappear and the new node can create a new ckpt
+   for the next mounter. */
 
-   The low node in the group and the previous node to create the ckpt (with
-   non-zero cp_handle) may be different if a new node joins with a lower nodeid
-   than the previous low node that created the ckpt.  In this case, the prev
-   node has the old ckpt open and will reuse it if no plock state has changed,
-   or will unlink it and create a new one.  The low node will also attempt to
-   create a new ckpt.  That open-create will either fail due to the prev node
-   reusing the old ckpt, or it will race with the open-create on the prev node
-   after the prev node unlinks the old ckpt.  Either way, when there are two
-   different nodes in the group calling store_plocks(), one of them will fail
-   at the Open(CREATE) step with ERR_EXIST due to the other. */
-
-void store_plocks(struct mountgroup *mg)
+void store_plocks(struct mountgroup *mg, int nodeid)
 {
 	SaCkptCheckpointCreationAttributesT attr;
 	SaCkptCheckpointHandleT h;
@@ -1128,8 +1126,8 @@
 
 	/* no change to plock state since we created the last checkpoint */
 	if (mg->last_checkpoint_time > mg->last_plock_time) {
-		log_group(mg, "store_plocks: ckpt uptodate");
-		return;
+		log_group(mg, "store_plocks: saved ckpt uptodate");
+		goto out;
 	}
 	mg->last_checkpoint_time = time(NULL);
 
@@ -1236,6 +1234,17 @@
 			break;
 		}
 	}
+
+ out:
+	/* If the new nodeid is becoming the low nodeid it will now be in
+	   charge of creating ckpt's for mounters instead of us. */
+
+	if (nodeid < our_nodeid) {
+		log_group(mg, "store_plocks: closing ckpt for new low node %d",
+			  nodeid);
+		saCkptCheckpointClose(h);
+		mg->cp_handle = 0;
+	}
 }
 
 /* called by a node that's just been added to the group to get existing plock
@@ -1336,7 +1345,11 @@
  out_it:
 	saCkptSectionIterationFinalize(itr);
  out:
-	saCkptCheckpointClose(h);
+	if (mg->low_nodeid == our_nodeid) {
+		log_group(mg, "retrieve_plocks: unlink ckpt from old low node");
+		unlink_checkpoint(mg, &name);
+	} else
+		saCkptCheckpointClose(h);
 }
 
 void purge_plocks(struct mountgroup *mg, int nodeid, int unmount)
--- cluster/group/gfs_controld/recover.c	2006/08/07 16:57:50	1.9
+++ cluster/group/gfs_controld/recover.c	2006/08/08 21:19:17	1.10
@@ -589,8 +589,8 @@
 	log_group(mg, "assign_journal: new member %d got jid %d",
 		  new->nodeid, new->jid);
 
-	if (mg->low_finished_nodeid == our_nodeid || mg->cp_handle)
-		store_plocks(mg);
+	if (mg->low_finished_nodeid == our_nodeid)
+		store_plocks(mg, new->nodeid);
 
 	/* if we're the first mounter and haven't gotten others_may_mount
 	   yet, then don't send journals until kernel_recovery_done_first
@@ -982,6 +982,8 @@
 	}
 
 	list_for_each_entry(memb, &mg->members, list) {
+		if (mg->low_nodeid == -1 || memb->nodeid < mg->low_nodeid)
+			mg->low_nodeid = memb->nodeid;
 		if (!memb->finished)
 			continue;
 		if (low == -1 || memb->nodeid < low)
@@ -1008,6 +1010,7 @@
 	INIT_LIST_HEAD(&mg->resources);
 	INIT_LIST_HEAD(&mg->saved_messages);
 	mg->init = 1;
+	mg->low_nodeid = -1;
 
 	strncpy(mg->name, name, MAXNAME);
 
@@ -1902,7 +1905,7 @@
 			continue;
 
 		if (!stored_plocks) {
-			store_plocks(mg);
+			store_plocks(mg, memb->nodeid);
 			stored_plocks = 1;
 		}
 



* [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c  ...
@ 2006-08-04 21:56 teigland
From: teigland @ 2006-08-04 21:56 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-04 21:56:10

Modified files:
	group/gfs_controld: lock_dlm.h plock.c recover.c 

Log message:
	Some basic stuff that I hadn't realized I'd not done back when
	first writing this:
	- purge plocks of failed nodes
	- implement get
	- write results back to processes waiting in the kernel
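
For the "write results back" part, a self-contained sketch of the
pattern this patch introduces as write_result(): the operation's
return value is stored in the request and the whole struct is written
back through the control fd so the process waiting in the kernel can
complete.  struct plock_info and /dev/null below are simplified
stand-ins for gdlm_plock_info and the real control_fd:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

struct plock_info { long long number; int rv; };

static void write_result(int control_fd, struct plock_info *in, int rv)
{
	ssize_t err;

	in->rv = rv;
	err = write(control_fd, in, sizeof(*in));
	if (err != (ssize_t)sizeof(*in))
		fprintf(stderr, "plock result write err %zd errno %d\n",
			err, errno);
}

int main(void)
{
	struct plock_info info = { 0x42, 0 };
	int fd = open("/dev/null", O_WRONLY);	/* stand-in for control_fd */

	write_result(fd, &info, -EAGAIN);	/* e.g. conflict, no wait */
	close(fd);
	return 0;
}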

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.6&r2=1.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.7&r2=1.8

--- cluster/group/gfs_controld/lock_dlm.h	2006/08/02 18:27:57	1.9
+++ cluster/group/gfs_controld/lock_dlm.h	2006/08/04 21:56:10	1.10
@@ -263,5 +263,6 @@
 void retrieve_plocks(struct mountgroup *mg);
 int dump_plocks(char *name, int fd);
 void process_saved_plocks(struct mountgroup *mg);
+void purge_plocks(struct mountgroup *mg, int nodeid);
 
 #endif
--- cluster/group/gfs_controld/plock.c	2006/08/02 20:50:40	1.6
+++ cluster/group/gfs_controld/plock.c	2006/08/04 21:56:10	1.7
@@ -117,6 +117,30 @@
 	i->owner	= le64_to_cpu(i->owner);
 }
 
+static char *op_str(int optype)
+{
+	switch (optype) {
+	case GDLM_PLOCK_OP_LOCK:
+		return "LK";
+	case GDLM_PLOCK_OP_UNLOCK:
+		return "UN";
+	case GDLM_PLOCK_OP_GET:
+		return "GET";
+	default:
+		return "??";
+	}
+}
+
+static char *ex_str(int optype, int ex)
+{
+	if (optype == GDLM_PLOCK_OP_UNLOCK || optype == GDLM_PLOCK_OP_GET)
+		return "-";
+	if (ex)
+		return "WR";
+	else
+		return "RD";
+}
+
 static int get_proc_number(const char *file, const char *name, uint32_t *number)
 {
 	FILE *fl;
@@ -277,9 +301,6 @@
 
 	rv = read(control_fd, &info, sizeof(info));
 
-	log_debug("process_plocks %d op %d fs %x num %llx ex %d wait %d", rv,
-		  info.optype, info.fsid, info.number, info.ex, info.wait);
-
 	mg = find_mg_id(info.fsid);
 	if (!mg) {
 		log_debug("process_plocks: no mg id %x", info.fsid);
@@ -287,6 +308,14 @@
 		goto fail;
 	}
 
+	log_group(mg, "read plock %llx %s %s %llx-%llx %d/%u/%llx w %d",
+		  info.number,
+		  op_str(info.optype),
+		  ex_str(info.optype, info.ex),
+		  info.start, info.end,
+		  info.nodeid, info.pid, info.owner,
+		  info.wait);
+
 	len = sizeof(struct gdlm_header) + sizeof(struct gdlm_plock_info);
 	buf = malloc(len);
 	if (!buf) {
@@ -478,7 +507,7 @@
 	return shrink_range2(&po->start, &po->end, start, end);
 }
 
-static int is_conflict(struct resource *r, struct gdlm_plock_info *in)
+static int is_conflict(struct resource *r, struct gdlm_plock_info *in, int get)
 {
 	struct posix_lock *po;
 
@@ -488,8 +517,15 @@
 		if (!ranges_overlap(po->start, po->end, in->start, in->end))
 			continue;
 
-		if (in->ex || po->ex)
+		if (in->ex || po->ex) {
+			if (get) {
+				in->ex = po->ex;
+				in->pid = po->pid;
+				in->start = po->start;
+				in->end = po->end;
+			}
 			return 1;
+		}
 	}
 	return 0;
 }
@@ -523,19 +559,22 @@
 		      struct gdlm_plock_info *in)
 {
 	uint64_t start2, end2;
+	int rv;
 
 	/* non-overlapping area start2:end2 */
 	start2 = po->start;
 	end2 = po->end;
-	shrink_range2(&start2, &end2, in->start, in->end);
+	rv = shrink_range2(&start2, &end2, in->start, in->end);
+	if (rv)
+		goto out;
 
 	po->start = in->start;
 	po->end = in->end;
 	po->ex = in->ex;
 
-	add_lock(r, in->nodeid, in->owner, in->pid, !in->ex, start2, end2);
-
-	return 0;
+	rv = add_lock(r, in->nodeid, in->owner, in->pid, !in->ex, start2, end2);
+ out:
+	return rv;
 }
 
 /* RN within RE (RE overlaps RN on both sides)
@@ -547,17 +586,23 @@
 		      struct gdlm_plock_info *in)
 
 {
-	add_lock(r, in->nodeid, in->owner, in->pid,
-		 !in->ex, po->start, in->start - 1);
+	int rv;
 
-	add_lock(r, in->nodeid, in->owner, in->pid,
-		 !in->ex, in->end + 1, po->end);
+	rv = add_lock(r, in->nodeid, in->owner, in->pid,
+		      !in->ex, po->start, in->start - 1);
+	if (rv)
+		goto out;
+
+	rv = add_lock(r, in->nodeid, in->owner, in->pid,
+		      !in->ex, in->end + 1, po->end);
+	if (rv)
+		goto out;
 
 	po->start = in->start;
 	po->end = in->end;
 	po->ex = in->ex;
-
-	return 0;
+ out:
+	return rv;
 }
 
 static int lock_internal(struct mountgroup *mg, struct resource *r,
@@ -618,7 +663,6 @@
 
 	rv = add_lock(r, in->nodeid, in->owner, in->pid,
 		      in->ex, in->start, in->end);
-
  out:
 	return rv;
 
@@ -638,7 +682,7 @@
 
 		/* existing range (RE) overlaps new range (RN) */
 
-		switch(overlap_type(in->start, in->end, po->start, po->end)) {
+		switch (overlap_type(in->start, in->end, po->start, po->end)) {
 
 		case 0:
 			/* ranges the same - just remove the existing lock */
@@ -651,15 +695,15 @@
 			/* RN within RE and starts or ends on RE boundary -
 			 * shrink and update RE */
 
-			shrink_range(po, in->start, in->end);
+			rv = shrink_range(po, in->start, in->end);
 			goto out;
 
 		case 2:
 			/* RN within RE - shrink and update RE to be front
 			 * fragment, and add a new lock for back fragment */
 
-			add_lock(r, in->nodeid, in->owner, in->pid,
-				 po->ex, in->end + 1, po->end);
+			rv = add_lock(r, in->nodeid, in->owner, in->pid,
+				      po->ex, in->end + 1, po->end);
 			po->end = in->start - 1;
 			goto out;
 
@@ -676,7 +720,7 @@
 			 * update RE, then continue because RN could cover
 			 * other locks */
 
-			shrink_range(po, in->start, in->end);
+			rv = shrink_range(po, in->start, in->end);
 			continue;
 
 		default:
@@ -684,7 +728,6 @@
 			goto out;
 		}
 	}
-
  out:
 	return rv;
 }
@@ -702,6 +745,17 @@
 	return 0;
 }
 
+static void write_result(struct mountgroup *mg, struct gdlm_plock_info *in,
+			 int rv)
+{
+	int err;
+
+	in->rv = rv;
+	err = write(control_fd, in, sizeof(struct gdlm_plock_info));
+	if (err != sizeof(struct gdlm_plock_info))
+		log_error("plock result write err %d errno %d", err, errno);
+}
+
 static void do_waiters(struct mountgroup *mg, struct resource *r)
 {
 	struct lock_waiter *w, *safe;
@@ -711,61 +765,86 @@
 	list_for_each_entry_safe(w, safe, &r->waiters, list) {
 		in = &w->info;
 
-		if (is_conflict(r, in))
+		if (is_conflict(r, in, 0))
 			continue;
 
 		list_del(&w->list);
 
+		/*
+		log_group(mg, "take waiter %llx %llx-%llx %d/%u/%llx",
+			  in->number, in->start, in->end,
+			  in->nodeid, in->pid, in->owner);
+		*/
+
 		rv = lock_internal(mg, r, in);
 
+		if (in->nodeid == our_nodeid)
+			write_result(mg, in, rv);
+
 		free(w);
 	}
 }
 
-static int do_lock(struct mountgroup *mg, struct gdlm_plock_info *in)
+static void do_lock(struct mountgroup *mg, struct gdlm_plock_info *in)
 {
 	struct resource *r = NULL;
 	int rv;
 
 	rv = find_resource(mg, in->number, 1, &r);
-	if (rv || !r)
+	if (rv)
 		goto out;
 
-	if (is_conflict(r, in)) {
+	if (is_conflict(r, in, 0)) {
 		if (!in->wait)
 			rv = -EAGAIN;
-		else
+		else {
 			rv = add_waiter(mg, r, in);
-		goto out;
-	}
+			if (rv)
+				goto out;
+			rv = -EINPROGRESS;
+		}
+	} else
+		rv = lock_internal(mg, r, in);
 
-	rv = lock_internal(mg, r, in);
-	if (rv)
-		goto out;
+ out:
+	if (in->nodeid == our_nodeid && rv != -EINPROGRESS)
+		write_result(mg, in, rv);
 
 	do_waiters(mg, r);
 	put_resource(r);
- out:
-	return rv;
 }
 
-static int do_unlock(struct mountgroup *mg, struct gdlm_plock_info *in)
+static void do_unlock(struct mountgroup *mg, struct gdlm_plock_info *in)
 {
 	struct resource *r = NULL;
 	int rv;
 
 	rv = find_resource(mg, in->number, 0, &r);
-	if (rv || !r)
-		goto out;
+	if (!rv)
+		rv = unlock_internal(mg, r, in);
 
-	rv = unlock_internal(mg, r, in);
-	if (rv)
-		goto out;
+	if (in->nodeid == our_nodeid)
+		write_result(mg, in, rv);
 
 	do_waiters(mg, r);
 	put_resource(r);
+}
+
+static void do_get(struct mountgroup *mg, struct gdlm_plock_info *in)
+{
+	struct resource *r = NULL;
+	int rv;
+
+	rv = find_resource(mg, in->number, 0, &r);
+	if (rv)
+		goto out;
+
+	if (is_conflict(r, in, 1))
+		in->rv = 1;
+	else
+		in->rv = 0;
  out:
-	return rv;
+	write_result(mg, in, rv);
 }
 
 /* When mg members receive our options message (for our mount), one of them
@@ -788,8 +867,12 @@
 
 	info_bswap_in(&info);
 
-	log_group(mg, "receive_plock from %d op %d fs %x num %llx ex %d w %d",
-		  from, info.optype, info.fsid, info.number, info.ex,
+	log_group(mg, "receive plock %llx %s %s %llx-%llx %d/%u/%llx w %d",
+		  info.number,
+		  op_str(info.optype),
+		  ex_str(info.optype, info.ex),
+		  info.start, info.end,
+		  info.nodeid, info.pid, info.owner,
 		  info.wait);
 
 	if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
@@ -805,24 +888,23 @@
 	switch (info.optype) {
 	case GDLM_PLOCK_OP_LOCK:
 		mg->last_plock_time = time(NULL);
-		rv = do_lock(mg, &info);
+		do_lock(mg, &info);
 		break;
 	case GDLM_PLOCK_OP_UNLOCK:
 		mg->last_plock_time = time(NULL);
-		rv = do_unlock(mg, &info);
+		do_unlock(mg, &info);
 		break;
 	case GDLM_PLOCK_OP_GET:
-		/* rv = do_get(mg, &info); */
+		do_get(mg, &info);
 		break;
 	default:
+		log_error("receive_plock from %d optype %d", from, info.optype);
 		rv = -EINVAL;
 	}
 
  out:
-	if (from == our_nodeid) {
-		info.rv = rv;
-		rv = write(control_fd, &info, sizeof(info));
-	}
+	if (from == our_nodeid && rv)
+		write_result(mg, &info, rv);
 }
 
 void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
@@ -860,6 +942,43 @@
 	}
 }
 
+void purge_plocks(struct mountgroup *mg, int nodeid)
+{
+	struct posix_lock *po, *po2;
+	struct lock_waiter *w, *w2;
+	struct resource *r, *r2;
+	int purged = 0;
+
+	list_for_each_entry_safe(r, r2, &mg->resources, list) {
+		list_for_each_entry_safe(po, po2, &r->locks, list) {
+			if (po->nodeid == nodeid) {
+				list_del(&po->list);
+				free(po);
+				purged++;
+			}
+		}
+
+		list_for_each_entry_safe(w, w2, &r->waiters, list) {
+			if (w->info.nodeid == nodeid) {
+				list_del(&w->list);
+				free(w);
+				purged++;
+			}
+		}
+
+		if (list_empty(&r->locks) && list_empty(&r->waiters)) {
+			list_del(&r->list);
+			free(r);
+		} else
+			do_waiters(mg, r);
+	}
+	
+	if (purged)
+		mg->last_plock_time = time(NULL);
+
+	log_group(mg, "purged %d plocks for %d", purged, nodeid);
+}
+
 void plock_exit(void)
 {
 	if (plocks_online)
@@ -1283,9 +1402,9 @@
 			snprintf(line, MAXLINE,
 			      "%llu WAITING %s %llu-%llu nodeid %d pid %u owner %llx\n",
 			      r->number,
-			      po->ex ? "WR" : "RD",
-			      po->start, po->end,
-			      po->nodeid, po->pid, po->owner);
+			      w->info.ex ? "WR" : "RD",
+			      w->info.start, w->info.end,
+			      w->info.nodeid, w->info.pid, w->info.owner);
 
 			rv = write(fd, line, strlen(line));
 		}
--- cluster/group/gfs_controld/recover.c	2006/08/02 20:50:40	1.7
+++ cluster/group/gfs_controld/recover.c	2006/08/04 21:56:10	1.8
@@ -965,6 +965,8 @@
 				  memb->jid,
 				  memb->spectator,
 				  memb->wait_gfs_recover_done);
+
+			purge_plocks(mg, memb->nodeid);
 		}
 	}	
 



* [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c  ...
@ 2006-08-02 18:27 teigland
From: teigland @ 2006-08-02 18:27 UTC
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-02 18:27:58

Modified files:
	group/gfs_controld: lock_dlm.h plock.c recover.c 

Log message:
	- checkpoint usage for plocks is getting closer: basic writing and
	reading of plock state to/from ckpts works, but unlinking ckpts
	and clearing open ckpts from processes that exit don't appear to
	be working right in openais
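
The save/replay window this patch builds can be sketched standalone as
follows; the fixed-size queue stands in for the real saved_messages
list, and save_plocks corresponds to the flag set when we see our own
options message:

#include <stdio.h>

#define MAXSAVE 16

struct saved { int from; int payload; };

static struct saved queue[MAXSAVE];
static int nsaved, save_plocks;

static void apply_plock(int from, int payload)
{
	printf("apply plock %d from %d\n", payload, from);
}

static void receive_plock(int from, int payload)
{
	if (save_plocks && nsaved < MAXSAVE) {
		queue[nsaved].from = from;
		queue[nsaved].payload = payload;
		nsaved++;
		return;
	}
	apply_plock(from, payload);
}

static void process_saved_plocks(void)
{
	int i;

	save_plocks = 0;
	for (i = 0; i < nsaved; i++)
		apply_plock(queue[i].from, queue[i].payload);
	nsaved = 0;
}

int main(void)
{
	save_plocks = 1;		/* saw our options message */
	receive_plock(3, 100);		/* queued */
	receive_plock(2, 101);		/* queued */
	process_saved_plocks();		/* got journals msg, read ckpt */
	receive_plock(3, 102);		/* applied directly now */
	return 0;
}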

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.8&r2=1.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.3&r2=1.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.5&r2=1.6

--- cluster/group/gfs_controld/lock_dlm.h	2006/07/31 18:37:07	1.8
+++ cluster/group/gfs_controld/lock_dlm.h	2006/08/02 18:27:57	1.9
@@ -140,6 +140,7 @@
 	int			emulate_first_mounter;
 	int			wait_first_done;
 	int			low_finished_nodeid;
+	int			save_plocks;
 
 	uint64_t		cp_handle;
 	time_t			last_checkpoint_time;
@@ -224,6 +225,13 @@
 	char			name[MAXNAME];
 };
 
+struct save_msg {
+	struct list_head list;
+	int nodeid;
+	int len;
+	int type;
+	char buf[0];
+};
 
 struct mountgroup *find_mg(char *name);
 struct mountgroup *find_mg_id(uint32_t id);
@@ -245,6 +253,7 @@
 int do_withdraw(char *name);
 int kernel_recovery_done(char *name);
 void ping_kernel_mount(char *table);
+void save_message(struct mountgroup *mg, char *buf, int len, int from, int type);
 
 int client_send(int ci, char *buf, int len);
 
@@ -253,5 +262,6 @@
 void store_plocks(struct mountgroup *mg);
 void retrieve_plocks(struct mountgroup *mg);
 int dump_plocks(char *name, int fd);
+void process_saved_plocks(struct mountgroup *mg);
 
 #endif
--- cluster/group/gfs_controld/plock.c	2006/07/31 18:37:07	1.3
+++ cluster/group/gfs_controld/plock.c	2006/08/02 18:27:57	1.4
@@ -226,9 +226,6 @@
 	else
 		log_error("ckpt init error %d - plocks unavailable", err);
 
-	/* REMOVEME: disable actual use of checkpoints for now */
-	plocks_online = 0;
-
 	rv = open_control();
 	if (rv)
 		return rv;
@@ -740,7 +737,17 @@
 	return rv;
 }
 
-void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
+/* When mg members receive our options message (for our mount), one of them
+   saves all plock state received to that point in a checkpoint and then sends
+   us our journals message.  We know to retrieve the plock state from the
+   checkpoint when we receive our journals message.  Any plock messages that
+   arrive between seeing our options message and our journals message need to
+   be saved and processed after we synchronize our plock state from the
+   checkpoint.  Any plock message received while we're mounting but before we
+   set save_plocks (when we see our options message) can be ignored because it
+   should be reflected in the checkpointed state. */
+
+void _receive_plock(struct mountgroup *mg, char *buf, int len, int from)
 {
 	struct gdlm_plock_info info;
 	struct gdlm_header *hd = (struct gdlm_header *) buf;
@@ -754,6 +761,9 @@
 		  from, info.optype, info.fsid, info.number, info.ex,
 		  info.wait);
 
+	if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
+		return;
+
 	if (from != hd->nodeid || from != info.nodeid) {
 		log_error("receive_plock from %d header %d info %d",
 			  from, hd->nodeid, info.nodeid);
@@ -761,9 +771,6 @@
 		goto out;
 	}
 
-	if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
-		return;
-
 	switch (info.optype) {
 	case GDLM_PLOCK_OP_LOCK:
 		mg->last_plock_time = time(NULL);
@@ -787,6 +794,41 @@
 	}
 }
 
+void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
+{
+	if (mg->save_plocks) {
+		save_message(mg, buf, len, from, MSG_PLOCK);
+		return;
+	}
+
+	if (!mg->got_our_journals) {
+		log_group(mg, "not saving plock messages yet");
+		return;
+	}
+
+	_receive_plock(mg, buf, len, from);
+}
+
+void process_saved_plocks(struct mountgroup *mg)
+{
+	struct save_msg *sm, *sm2;
+
+	mg->save_plocks = 0;
+
+	if (list_empty(&mg->saved_messages))
+		return;
+
+	log_group(mg, "process_saved_plocks");
+
+	list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
+		if (sm->type != MSG_PLOCK)
+			continue;
+		_receive_plock(mg, sm->buf, sm->len, sm->nodeid);
+		list_del(&sm->list);
+		free(sm);
+	}
+}
+
 void plock_exit(void)
 {
 	if (plocks_online)
@@ -807,6 +849,7 @@
 	list_for_each_entry(po, &r->locks, list) {
 		pp->start	= po->start;
 		pp->end		= po->end;
+		pp->owner	= po->owner;
 		pp->pid		= po->pid;
 		pp->nodeid	= po->nodeid;
 		pp->ex		= po->ex;
@@ -818,6 +861,7 @@
 	list_for_each_entry(w, &r->waiters, list) {
 		pp->start	= w->info.start;
 		pp->end		= w->info.end;
+		pp->owner	= w->info.owner;
 		pp->pid		= w->info.pid;
 		pp->nodeid	= w->info.nodeid;
 		pp->ex		= w->info.ex;
@@ -844,8 +888,9 @@
 	if (!r)
 		return -ENOMEM;
 	memset(r, 0, sizeof(struct resource));
-
-	sscanf(numbuf, "%llu", &r->number);
+	INIT_LIST_HEAD(&r->locks);
+	INIT_LIST_HEAD(&r->waiters);
+	sscanf(numbuf, "r%llu", &r->number);
 
 	log_group(mg, "unpack %llx count %d", r->number, count);
 
@@ -856,13 +901,16 @@
 			po = malloc(sizeof(struct posix_lock));
 			po->start	= pp->start;
 			po->end		= pp->end;
+			po->owner	= pp->owner;
 			po->pid		= pp->pid;
+			po->nodeid	= pp->nodeid;
 			po->ex		= pp->ex;
 			list_add_tail(&po->list, &r->locks);
 		} else {
 			w = malloc(sizeof(struct lock_waiter));
 			w->info.start	= pp->start;
 			w->info.end	= pp->end;
+			w->info.owner	= pp->owner;
 			w->info.pid	= pp->pid;
 			w->info.nodeid	= pp->nodeid;
 			w->info.ex	= pp->ex;
@@ -875,7 +923,76 @@
 	return 0;
 }
 
-/* copy all plock state into a checkpoint so new node can retrieve it */
+int unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
+{
+	SaCkptCheckpointHandleT h;
+	SaCkptCheckpointDescriptorT s;
+	SaAisErrorT rv;
+	int ret = 0;
+
+	h = (SaCkptCheckpointHandleT) mg->cp_handle;
+	log_group(mg, "unlink ckpt %llx", h);
+
+ unlink_retry:
+	rv = saCkptCheckpointUnlink(h, name);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt retry");
+		sleep(1);
+		goto unlink_retry;
+	}
+	if (rv == SA_AIS_OK)
+		goto out_close;
+
+	log_error("unlink ckpt error %d %s", rv, mg->name);
+	ret = -1;
+
+ status_retry:
+	rv = saCkptCheckpointStatusGet(h, &s);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt status retry");
+		sleep(1);
+		goto status_retry;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("unlink ckpt status error %d %s", rv, mg->name);
+		goto out_close;
+	}
+
+	log_group(mg, "unlink ckpt status: size %llu, max sections %u, "
+		      "max section size %llu, section count %u, mem %u",
+		 s.checkpointCreationAttributes.checkpointSize,
+		 s.checkpointCreationAttributes.maxSections,
+		 s.checkpointCreationAttributes.maxSectionSize,
+		 s.numberOfSections, s.memoryUsed);
+
+ out_close:
+	rv = saCkptCheckpointClose(h);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt close retry");
+		sleep(1);
+		goto out_close;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("unlink ckpt close error %d %s", rv, mg->name);
+		ret = -1;
+	}
+
+	mg->cp_handle = 0;
+	return ret;
+}
+
+/* Copy all plock state into a checkpoint so new node can retrieve it.
+
+   The low node in the group and the previous node to create the ckpt (with
+   non-zero cp_handle) may be different if a new node joins with a lower nodeid
+   than the previous low node that created the ckpt.  In this case, the prev
+   node has the old ckpt open and will reuse it if no plock state has changed,
+   or will unlink it and create a new one.  The low node will also attempt to
+   create a new ckpt.  That open-create will either fail due to the prev node
+   reusing the old ckpt, or it will race with the open-create on the prev node
+   after the prev node unlinks the old ckpt.  Either way, when there are two
+   different nodes in the group calling store_plocks(), one of them will fail
+   at the Open(CREATE) step with ERR_EXIST due to the other. */
 
 void store_plocks(struct mountgroup *mg)
 {
@@ -883,13 +1000,15 @@
 	SaCkptCheckpointHandleT h;
 	SaCkptSectionIdT section_id;
 	SaCkptSectionCreationAttributesT section_attr;
+	SaCkptCheckpointOpenFlagsT flags;
 	SaNameT name;
 	SaAisErrorT rv;
 	char buf[32];
 	struct resource *r;
 	struct posix_lock *po;
 	struct lock_waiter *w;
-	int len, r_count, total_size, section_size, max_section_size;
+	int r_count, lock_count, total_size, section_size, max_section_size;
+	int len;
 
 	if (!plocks_online)
 		return;
@@ -906,65 +1025,75 @@
 
 	/* unlink an old checkpoint before we create a new one */
 	if (mg->cp_handle) {
-		log_group(mg, "store_plocks: unlink ckpt");
-		h = (SaCkptCheckpointHandleT) mg->cp_handle;
-		rv = saCkptCheckpointUnlink(h, &name);
-		if (rv != SA_AIS_OK)
-			log_error("ckpt unlink error %d %s", rv, mg->name);
-		h = 0;
-		mg->cp_handle = 0;
+		if (unlink_checkpoint(mg, &name))
+			return;
 	}
 
 	/* loop through all plocks to figure out sizes to set in
 	   the attr fields */
 
 	r_count = 0;
+	lock_count = 0;
 	total_size = 0;
 	max_section_size = 0;
 
 	list_for_each_entry(r, &mg->resources, list) {
 		r_count++;
 		section_size = 0;
-		list_for_each_entry(po, &r->locks, list)
+		list_for_each_entry(po, &r->locks, list) {
 			section_size += sizeof(struct pack_plock);
-		list_for_each_entry(w, &r->waiters, list)
+			lock_count++;
+		}
+		list_for_each_entry(w, &r->waiters, list) {
 			section_size += sizeof(struct pack_plock);
+			lock_count++;
+		}
 		total_size += section_size;
 		if (section_size > max_section_size)
 			max_section_size = section_size;
 	}
 
-	log_group(mg, "store_plocks: r_count %d total %d max_section %d",
-		  r_count, total_size, max_section_size);
+	log_group(mg, "store_plocks: r_count %d, lock_count %d, pp %d bytes",
+		  r_count, lock_count, sizeof(struct pack_plock));
+
+	log_group(mg, "store_plocks: total %d bytes, max_section %d bytes",
+		  total_size, max_section_size);
 
 	attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
 	attr.checkpointSize = total_size;
 	attr.retentionDuration = SA_TIME_MAX;
-	attr.maxSections = r_count;
+	attr.maxSections = r_count + 1;      /* don't know why we need +1 */
 	attr.maxSectionSize = max_section_size;
-	attr.maxSectionIdSize = 21;             /* 20 digits in max uint64 */
+	attr.maxSectionIdSize = 22;
+	
+	/* 22 = 20 digits in max uint64 + "r" prefix + \0 suffix */
+
+	flags = SA_CKPT_CHECKPOINT_READ |
+		SA_CKPT_CHECKPOINT_WRITE |
+		SA_CKPT_CHECKPOINT_CREATE;
 
  open_retry:
-	rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr,
-				  SA_CKPT_CHECKPOINT_CREATE |
-				  SA_CKPT_CHECKPOINT_READ |
-				  SA_CKPT_CHECKPOINT_WRITE,
-				  0, &h);
+	rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr, flags, 0, &h);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
 		log_group(mg, "store_plocks: ckpt open retry");
 		sleep(1);
 		goto open_retry;
 	}
+	if (rv == SA_AIS_ERR_EXIST) {
+		log_group(mg, "store_plocks: ckpt already exists");
+		return;
+	}
 	if (rv != SA_AIS_OK) {
 		log_error("store_plocks: ckpt open error %d %s", rv, mg->name);
 		return;
 	}
 
+	log_group(mg, "store_plocks: open ckpt handle %llx", h);
 	mg->cp_handle = (uint64_t) h;
 
 	list_for_each_entry(r, &mg->resources, list) {
 		memset(&buf, 0, 32);
-		len = snprintf(buf, 32, "%llu", r->number);
+		len = snprintf(buf, 32, "r%llu", r->number);
 
 		section_id.id = buf;
 		section_id.idLen = len + 1;
@@ -973,7 +1102,7 @@
 
 		pack_section_buf(mg, r);
 
- create_retry:
+	 create_retry:
 		rv = saCkptSectionCreate(h, &section_attr, &section_buf,
 					 section_len);
 		if (rv == SA_AIS_ERR_TRY_AGAIN) {
@@ -982,7 +1111,7 @@
 			goto create_retry;
 		}
 		if (rv != SA_AIS_OK) {
-			log_error("store_plocks: ckpt create error %d %s",
+			log_error("store_plocks: ckpt section create err %d %s",
 				  rv, mg->name);
 			break;
 		}
@@ -1005,6 +1134,8 @@
 	if (!plocks_online)
 		return;
 
+	log_group(mg, "retrieve_plocks");
+
 	len = snprintf(name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", mg->name);
 	name.length = len;
 
@@ -1032,11 +1163,11 @@
 	if (rv != SA_AIS_OK) {
 		log_error("retrieve_plocks: ckpt iterinit error %d %s",
 			  rv, mg->name);
-		return;
+		goto out;
 	}
 
 	while (1) {
- next_retry:
+	 next_retry:
 		rv = saCkptSectionIterationNext(itr, &desc);
 		if (rv == SA_AIS_ERR_NO_SECTIONS)
 			break;
@@ -1048,7 +1179,7 @@
 		if (rv != SA_AIS_OK) {
 			log_error("retrieve_plocks: ckpt iternext error %d %s",
 				  rv, mg->name);
-			break;
+			goto out_it;
 		}
 
 		iov.sectionId = desc.sectionId;
@@ -1056,7 +1187,7 @@
 		iov.dataSize = desc.sectionSize;
 		iov.dataOffset = 0;
 
- read_retry:
+	 read_retry:
 		rv = saCkptCheckpointRead(h, &iov, 1, NULL);
 		if (rv == SA_AIS_ERR_TRY_AGAIN) {
 			log_group(mg, "retrieve_plocks: ckpt read retry");
@@ -1066,13 +1197,19 @@
 		if (rv != SA_AIS_OK) {
 			log_error("retrieve_plocks: ckpt read error %d %s",
 				  rv, mg->name);
-			break;
+			goto out_it;
 		}
 
+		log_group(mg, "retrieve_plocks: ckpt read %llu bytes",
+			  iov.readSize);
+		section_len = iov.readSize;
+
 		unpack_section_buf(mg, desc.sectionId.id, desc.sectionId.idLen);
 	}
 
+ out_it:
 	saCkptSectionIterationFinalize(itr);
+ out:
 	saCkptCheckpointClose(h);
 }
 
--- cluster/group/gfs_controld/recover.c	2006/07/31 18:37:07	1.5
+++ cluster/group/gfs_controld/recover.c	2006/08/02 18:27:57	1.6
@@ -12,14 +12,6 @@
 
 #include "lock_dlm.h"
 
-struct save_msg {
-	struct list_head list;
-	int nodeid;
-	int len;
-	int type;
-	char buf[0];
-};
-
 #define SYSFS_DIR	"/sys/fs"
 #define JID_INIT	-9
 
@@ -597,12 +589,14 @@
 	log_group(mg, "assign_journal: new member %d got jid %d",
 		  new->nodeid, new->jid);
 
+	if (mg->low_finished_nodeid == our_nodeid || mg->cp_handle)
+		store_plocks(mg);
+
 	/* if we're the first mounter and haven't gotten others_may_mount
 	   yet, then don't send journals until kernel_recovery_done_first
 	   so the second node won't mount the fs until omm. */
 
 	if (mg->low_finished_nodeid == our_nodeid) {
-		store_plocks(mg);
 		if (mg->first_mounter && !mg->first_mounter_done) {
 			log_group(mg, "delay sending journals to %d",
 				  new->nodeid);
@@ -655,6 +649,7 @@
 
 	if (hd->nodeid == our_nodeid) {
 		mg->got_our_options = 1;
+		mg->save_plocks = 1;
 		return;
 	}
 
@@ -1784,7 +1779,7 @@
 	}
 
 	retrieve_plocks(mg);
-	/* process_saved_plocks(mg); */
+	process_saved_plocks(mg);
  out:
 	notify_mount_client(mg);
 }


