[Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c ...

All of lore.kernel.org
 help / color / mirror / Atom feed

From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c  ...
Date: 2 Aug 2006 18:27:58 -0000	[thread overview]
Message-ID: <20060802182758.29785.qmail@sourceware.org> (raw)

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-02 18:27:58

Modified files:
	group/gfs_controld: lock_dlm.h plock.c recover.c 

Log message:
	- checkpoint usage for plocks is getting closer, basic writing/reading
	of plock state to/from ckpt's works, but unlinking ckpt's and clearing
	open ckpt's from processes that exit don't appear to be working right
	in openais

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.8&r2=1.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.3&r2=1.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.5&r2=1.6

--- cluster/group/gfs_controld/lock_dlm.h	2006/07/31 18:37:07	1.8
+++ cluster/group/gfs_controld/lock_dlm.h	2006/08/02 18:27:57	1.9
@@ -140,6 +140,7 @@
 	int			emulate_first_mounter;
 	int			wait_first_done;
 	int			low_finished_nodeid;
+	int			save_plocks;
 
 	uint64_t		cp_handle;
 	time_t			last_checkpoint_time;
@@ -224,6 +225,13 @@
 	char			name[MAXNAME];
 };
 
+struct save_msg {
+	struct list_head list;
+	int nodeid;
+	int len;
+	int type;
+	char buf[0];
+};
 
 struct mountgroup *find_mg(char *name);
 struct mountgroup *find_mg_id(uint32_t id);
@@ -245,6 +253,7 @@
 int do_withdraw(char *name);
 int kernel_recovery_done(char *name);
 void ping_kernel_mount(char *table);
+void save_message(struct mountgroup *mg, char *buf, int len, int from, int type);
 
 int client_send(int ci, char *buf, int len);
 
@@ -253,5 +262,6 @@
 void store_plocks(struct mountgroup *mg);
 void retrieve_plocks(struct mountgroup *mg);
 int dump_plocks(char *name, int fd);
+void process_saved_plocks(struct mountgroup *mg);
 
 #endif
--- cluster/group/gfs_controld/plock.c	2006/07/31 18:37:07	1.3
+++ cluster/group/gfs_controld/plock.c	2006/08/02 18:27:57	1.4
@@ -226,9 +226,6 @@
 	else
 		log_error("ckpt init error %d - plocks unavailable", err);
 
-	/* REMOVEME: disable actual use of checkpoints for now */
-	plocks_online = 0;
-
 	rv = open_control();
 	if (rv)
 		return rv;
@@ -740,7 +737,17 @@
 	return rv;
 }
 
-void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
+/* When mg members receive our options message (for our mount), one of them
+   saves all plock state received to that point in a checkpoint and then sends
+   us our journals message.  We know to retrieve the plock state from the
+   checkpoint when we receive our journals message.  Any plock messages that
+   arrive between seeing our options message and our journals message need to
+   be saved and processed after we synchronize our plock state from the
+   checkpoint.  Any plock message received while we're mounting but before we
+   set save_plocks (when we see our options message) can be ignored because it
+   should be reflected in the checkpointed state. */
+
+void _receive_plock(struct mountgroup *mg, char *buf, int len, int from)
 {
 	struct gdlm_plock_info info;
 	struct gdlm_header *hd = (struct gdlm_header *) buf;
@@ -754,6 +761,9 @@
 		  from, info.optype, info.fsid, info.number, info.ex,
 		  info.wait);
 
+	if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
+		return;
+
 	if (from != hd->nodeid || from != info.nodeid) {
 		log_error("receive_plock from %d header %d info %d",
 			  from, hd->nodeid, info.nodeid);
@@ -761,9 +771,6 @@
 		goto out;
 	}
 
-	if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
-		return;
-
 	switch (info.optype) {
 	case GDLM_PLOCK_OP_LOCK:
 		mg->last_plock_time = time(NULL);
@@ -787,6 +794,41 @@
 	}
 }
 
+void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
+{
+	if (mg->save_plocks) {
+		save_message(mg, buf, len, from, MSG_PLOCK);
+		return;
+	}
+
+	if (!mg->got_our_journals) {
+		log_group(mg, "not saving plock messages yet");
+		return;
+	}
+
+	_receive_plock(mg, buf, len, from);
+}
+
+void process_saved_plocks(struct mountgroup *mg)
+{
+	struct save_msg *sm, *sm2;
+
+	mg->save_plocks = 0;
+
+	if (list_empty(&mg->saved_messages))
+		return;
+
+	log_group(mg, "process_saved_plocks");
+
+	list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
+		if (sm->type != MSG_PLOCK)
+			continue;
+		_receive_plock(mg, sm->buf, sm->len, sm->nodeid);
+		list_del(&sm->list);
+		free(sm);
+	}
+}
+
 void plock_exit(void)
 {
 	if (plocks_online)
@@ -807,6 +849,7 @@
 	list_for_each_entry(po, &r->locks, list) {
 		pp->start	= po->start;
 		pp->end		= po->end;
+		pp->owner	= po->owner;
 		pp->pid		= po->pid;
 		pp->nodeid	= po->nodeid;
 		pp->ex		= po->ex;
@@ -818,6 +861,7 @@
 	list_for_each_entry(w, &r->waiters, list) {
 		pp->start	= w->info.start;
 		pp->end		= w->info.end;
+		pp->owner	= w->info.owner;
 		pp->pid		= w->info.pid;
 		pp->nodeid	= w->info.nodeid;
 		pp->ex		= w->info.ex;
@@ -844,8 +888,9 @@
 	if (!r)
 		return -ENOMEM;
 	memset(r, 0, sizeof(struct resource));
-
-	sscanf(numbuf, "%llu", &r->number);
+	INIT_LIST_HEAD(&r->locks);
+	INIT_LIST_HEAD(&r->waiters);
+	sscanf(numbuf, "r%llu", &r->number);
 
 	log_group(mg, "unpack %llx count %d", r->number, count);
 
@@ -856,13 +901,16 @@
 			po = malloc(sizeof(struct posix_lock));
 			po->start	= pp->start;
 			po->end		= pp->end;
+			po->owner	= pp->owner;
 			po->pid		= pp->pid;
+			po->nodeid	= pp->nodeid;
 			po->ex		= pp->ex;
 			list_add_tail(&po->list, &r->locks);
 		} else {
 			w = malloc(sizeof(struct lock_waiter));
 			w->info.start	= pp->start;
 			w->info.end	= pp->end;
+			w->info.owner	= pp->owner;
 			w->info.pid	= pp->pid;
 			w->info.nodeid	= pp->nodeid;
 			w->info.ex	= pp->ex;
@@ -875,7 +923,76 @@
 	return 0;
 }
 
-/* copy all plock state into a checkpoint so new node can retrieve it */
+int unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
+{
+	SaCkptCheckpointHandleT h;
+	SaCkptCheckpointDescriptorT s;
+	SaAisErrorT rv;
+	int ret = 0;
+
+	h = (SaCkptCheckpointHandleT) mg->cp_handle;
+	log_group(mg, "unlink ckpt %llx", h);
+
+ unlink_retry:
+	rv = saCkptCheckpointUnlink(h, name);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt retry");
+		sleep(1);
+		goto unlink_retry;
+	}
+	if (rv == SA_AIS_OK)
+		goto out_close;
+
+	log_error("unlink ckpt error %d %s", rv, mg->name);
+	ret = -1;
+
+ status_retry:
+	rv = saCkptCheckpointStatusGet(h, &s);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt status retry");
+		sleep(1);
+		goto status_retry;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("unlink ckpt status error %d %s", rv, mg->name);
+		goto out_close;
+	}
+
+	log_group(mg, "unlink ckpt status: size %llu, max sections %u, "
+		      "max section size %llu, section count %u, mem %u",
+		 s.checkpointCreationAttributes.checkpointSize,
+		 s.checkpointCreationAttributes.maxSections,
+		 s.checkpointCreationAttributes.maxSectionSize,
+		 s.numberOfSections, s.memoryUsed);
+
+ out_close:
+	rv = saCkptCheckpointClose(h);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt close retry");
+		sleep(1);
+		goto out_close;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("unlink ckpt close error %d %s", rv, mg->name);
+		ret = -1;
+	}
+
+	mg->cp_handle = 0;
+	return ret;
+}
+
+/* Copy all plock state into a checkpoint so new node can retrieve it.
+
+   The low node in the group and the previous node to create the ckpt (with
+   non-zero cp_handle) may be different if a new node joins with a lower nodeid
+   than the previous low node that created the ckpt.  In this case, the prev
+   node has the old ckpt open and will reuse it if no plock state has changed,
+   or will unlink it and create a new one.  The low node will also attempt to
+   create a new ckpt.  That open-create will either fail due to the prev node
+   reusing the old ckpt, or it will race with the open-create on the prev node
+   after the prev node unlinks the old ckpt.  Either way, when there are two
+   different nodes in the group calling store_plocks(), one of them will fail
+   at the Open(CREATE) step with ERR_EXIST due to the other. */
 
 void store_plocks(struct mountgroup *mg)
 {
@@ -883,13 +1000,15 @@
 	SaCkptCheckpointHandleT h;
 	SaCkptSectionIdT section_id;
 	SaCkptSectionCreationAttributesT section_attr;
+	SaCkptCheckpointOpenFlagsT flags;
 	SaNameT name;
 	SaAisErrorT rv;
 	char buf[32];
 	struct resource *r;
 	struct posix_lock *po;
 	struct lock_waiter *w;
-	int len, r_count, total_size, section_size, max_section_size;
+	int r_count, lock_count, total_size, section_size, max_section_size;
+	int len;
 
 	if (!plocks_online)
 		return;
@@ -906,65 +1025,75 @@
 
 	/* unlink an old checkpoint before we create a new one */
 	if (mg->cp_handle) {
-		log_group(mg, "store_plocks: unlink ckpt");
-		h = (SaCkptCheckpointHandleT) mg->cp_handle;
-		rv = saCkptCheckpointUnlink(h, &name);
-		if (rv != SA_AIS_OK)
-			log_error("ckpt unlink error %d %s", rv, mg->name);
-		h = 0;
-		mg->cp_handle = 0;
+		if (unlink_checkpoint(mg, &name))
+			return;
 	}
 
 	/* loop through all plocks to figure out sizes to set in
 	   the attr fields */
 
 	r_count = 0;
+	lock_count = 0;
 	total_size = 0;
 	max_section_size = 0;
 
 	list_for_each_entry(r, &mg->resources, list) {
 		r_count++;
 		section_size = 0;
-		list_for_each_entry(po, &r->locks, list)
+		list_for_each_entry(po, &r->locks, list) {
 			section_size += sizeof(struct pack_plock);
-		list_for_each_entry(w, &r->waiters, list)
+			lock_count++;
+		}
+		list_for_each_entry(w, &r->waiters, list) {
 			section_size += sizeof(struct pack_plock);
+			lock_count++;
+		}
 		total_size += section_size;
 		if (section_size > max_section_size)
 			max_section_size = section_size;
 	}
 
-	log_group(mg, "store_plocks: r_count %d total %d max_section %d",
-		  r_count, total_size, max_section_size);
+	log_group(mg, "store_plocks: r_count %d, lock_count %d, pp %d bytes",
+		  r_count, lock_count, sizeof(struct pack_plock));
+
+	log_group(mg, "store_plocks: total %d bytes, max_section %d bytes",
+		  total_size, max_section_size);
 
 	attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
 	attr.checkpointSize = total_size;
 	attr.retentionDuration = SA_TIME_MAX;
-	attr.maxSections = r_count;
+	attr.maxSections = r_count + 1;      /* don't know why we need +1 */
 	attr.maxSectionSize = max_section_size;
-	attr.maxSectionIdSize = 21;             /* 20 digits in max uint64 */
+	attr.maxSectionIdSize = 22;
+	
+	/* 22 = 20 digits in max uint64 + "r" prefix + \0 suffix */
+
+	flags = SA_CKPT_CHECKPOINT_READ |
+		SA_CKPT_CHECKPOINT_WRITE |
+		SA_CKPT_CHECKPOINT_CREATE;
 
  open_retry:
-	rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr,
-				  SA_CKPT_CHECKPOINT_CREATE |
-				  SA_CKPT_CHECKPOINT_READ |
-				  SA_CKPT_CHECKPOINT_WRITE,
-				  0, &h);
+	rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr, flags, 0, &h);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
 		log_group(mg, "store_plocks: ckpt open retry");
 		sleep(1);
 		goto open_retry;
 	}
+	if (rv == SA_AIS_ERR_EXIST) {
+		log_group(mg, "store_plocks: ckpt already exists");
+		return;
+	}
 	if (rv != SA_AIS_OK) {
 		log_error("store_plocks: ckpt open error %d %s", rv, mg->name);
 		return;
 	}
 
+	log_group(mg, "store_plocks: open ckpt handle %llx", h);
 	mg->cp_handle = (uint64_t) h;
 
 	list_for_each_entry(r, &mg->resources, list) {
 		memset(&buf, 0, 32);
-		len = snprintf(buf, 32, "%llu", r->number);
+		len = snprintf(buf, 32, "r%llu", r->number);
 
 		section_id.id = buf;
 		section_id.idLen = len + 1;
@@ -973,7 +1102,7 @@
 
 		pack_section_buf(mg, r);
 
- create_retry:
+	 create_retry:
 		rv = saCkptSectionCreate(h, &section_attr, &section_buf,
 					 section_len);
 		if (rv == SA_AIS_ERR_TRY_AGAIN) {
@@ -982,7 +1111,7 @@
 			goto create_retry;
 		}
 		if (rv != SA_AIS_OK) {
-			log_error("store_plocks: ckpt create error %d %s",
+			log_error("store_plocks: ckpt section create err %d %s",
 				  rv, mg->name);
 			break;
 		}
@@ -1005,6 +1134,8 @@
 	if (!plocks_online)
 		return;
 
+	log_group(mg, "retrieve_plocks");
+
 	len = snprintf(name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", mg->name);
 	name.length = len;
 
@@ -1032,11 +1163,11 @@
 	if (rv != SA_AIS_OK) {
 		log_error("retrieve_plocks: ckpt iterinit error %d %s",
 			  rv, mg->name);
-		return;
+		goto out;
 	}
 
 	while (1) {
- next_retry:
+	 next_retry:
 		rv = saCkptSectionIterationNext(itr, &desc);
 		if (rv == SA_AIS_ERR_NO_SECTIONS)
 			break;
@@ -1048,7 +1179,7 @@
 		if (rv != SA_AIS_OK) {
 			log_error("retrieve_plocks: ckpt iternext error %d %s",
 				  rv, mg->name);
-			break;
+			goto out_it;
 		}
 
 		iov.sectionId = desc.sectionId;
@@ -1056,7 +1187,7 @@
 		iov.dataSize = desc.sectionSize;
 		iov.dataOffset = 0;
 
- read_retry:
+	 read_retry:
 		rv = saCkptCheckpointRead(h, &iov, 1, NULL);
 		if (rv == SA_AIS_ERR_TRY_AGAIN) {
 			log_group(mg, "retrieve_plocks: ckpt read retry");
@@ -1066,13 +1197,19 @@
 		if (rv != SA_AIS_OK) {
 			log_error("retrieve_plocks: ckpt read error %d %s",
 				  rv, mg->name);
-			break;
+			goto out_it;
 		}
 
+		log_group(mg, "retrieve_plocks: ckpt read %llu bytes",
+			  iov.readSize);
+		section_len = iov.readSize;
+
 		unpack_section_buf(mg, desc.sectionId.id, desc.sectionId.idLen);
 	}
 
+ out_it:
 	saCkptSectionIterationFinalize(itr);
+ out:
 	saCkptCheckpointClose(h);
 }
 
--- cluster/group/gfs_controld/recover.c	2006/07/31 18:37:07	1.5
+++ cluster/group/gfs_controld/recover.c	2006/08/02 18:27:57	1.6
@@ -12,14 +12,6 @@
 
 #include "lock_dlm.h"
 
-struct save_msg {
-	struct list_head list;
-	int nodeid;
-	int len;
-	int type;
-	char buf[0];
-};
-
 #define SYSFS_DIR	"/sys/fs"
 #define JID_INIT	-9
 
@@ -597,12 +589,14 @@
 	log_group(mg, "assign_journal: new member %d got jid %d",
 		  new->nodeid, new->jid);
 
+	if (mg->low_finished_nodeid == our_nodeid || mg->cp_handle)
+		store_plocks(mg);
+
 	/* if we're the first mounter and haven't gotten others_may_mount
 	   yet, then don't send journals until kernel_recovery_done_first
 	   so the second node won't mount the fs until omm. */
 
 	if (mg->low_finished_nodeid == our_nodeid) {
-		store_plocks(mg);
 		if (mg->first_mounter && !mg->first_mounter_done) {
 			log_group(mg, "delay sending journals to %d",
 				  new->nodeid);
@@ -655,6 +649,7 @@
 
 	if (hd->nodeid == our_nodeid) {
 		mg->got_our_options = 1;
+		mg->save_plocks = 1;
 		return;
 	}
 
@@ -1784,7 +1779,7 @@
 	}
 
 	retrieve_plocks(mg);
-	/* process_saved_plocks(mg); */
+	process_saved_plocks(mg);
  out:
 	notify_mount_client(mg);
 }

next             reply	other threads:[~2006-08-02 18:27 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-08-02 18:27 teigland [this message]
  -- strict thread matches above, loose matches on Subject: below --
2006-08-04 21:56 [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c teigland
2006-08-07 16:57 teigland
2006-08-08 21:19 teigland
2006-08-18 16:33 teigland
2006-08-21 17:46 teigland

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060802182758.29785.qmail@sourceware.org \
    --to=teigland@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.