From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 18 Aug 2006 16:33:10 -0000 Subject: [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c ... Message-ID: <20060818163310.9752.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2006-08-18 16:33:08 Modified files: group/gfs_controld: lock_dlm.h plock.c recover.c Log message: when the low nodeid fails, the checkpoint needs to be unlinked, otherwise creating the ckpt will fail down the road when another node mounts Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.14&r2=1.15 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.17&r2=1.18 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.13&r2=1.14 --- cluster/group/gfs_controld/lock_dlm.h 2006/08/14 17:22:53 1.14 +++ cluster/group/gfs_controld/lock_dlm.h 2006/08/18 16:33:08 1.15 @@ -276,5 +276,6 @@ int dump_plocks(char *name, int fd); void process_saved_plocks(struct mountgroup *mg); void purge_plocks(struct mountgroup *mg, int nodeid, int unmount); +int unlink_checkpoint(struct mountgroup *mg); #endif --- cluster/group/gfs_controld/plock.c 2006/08/17 19:39:17 1.17 +++ cluster/group/gfs_controld/plock.c 2006/08/18 16:33:08 1.18 @@ -1034,7 +1034,7 @@ return 0; } -int unlink_checkpoint(struct mountgroup *mg, SaNameT *name) +int _unlink_checkpoint(struct mountgroup *mg, SaNameT *name) { SaCkptCheckpointHandleT h; SaCkptCheckpointDescriptorT s; @@ -1097,6 +1097,16 @@ return ret; } +int unlink_checkpoint(struct mountgroup *mg) +{ + SaNameT name; + int len; + + len = snprintf(name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", mg->name); + name.length = len; + return _unlink_checkpoint(mg, &name); +} 
+ /* Copy all plock state into a checkpoint so new node can retrieve it. The node creating the ckpt for the mounter needs to be the same node that's sending the mounter its journals message (i.e. the low nodeid). The new @@ -1139,7 +1149,7 @@ /* unlink an old checkpoint before we create a new one */ if (mg->cp_handle) { - if (unlink_checkpoint(mg, &name)) + if (_unlink_checkpoint(mg, &name)) return; } @@ -1231,7 +1241,7 @@ /* this shouldn't happen in general */ log_group(mg, "store_plocks: clearing old ckpt"); saCkptCheckpointClose(h); - unlink_checkpoint(mg, &name); + _unlink_checkpoint(mg, &name); goto open_retry; } if (rv != SA_AIS_OK) { @@ -1318,6 +1328,9 @@ goto out_it; } + if (!desc.sectionSize) + continue; + iov.sectionId = desc.sectionId; iov.dataBuffer = &section_buf; iov.dataSize = desc.sectionSize; @@ -1362,7 +1375,7 @@ out: if (mg->low_nodeid == our_nodeid) { log_group(mg, "retrieve_plocks: unlink ckpt from old low node"); - unlink_checkpoint(mg, &name); + _unlink_checkpoint(mg, &name); } else saCkptCheckpointClose(h); } @@ -1372,8 +1385,7 @@ struct posix_lock *po, *po2; struct lock_waiter *w, *w2; struct resource *r, *r2; - int len, purged = 0; - SaNameT name; + int purged = 0; list_for_each_entry_safe(r, r2, &mg->resources, list) { list_for_each_entry_safe(po, po2, &r->locks, list) { @@ -1408,12 +1420,8 @@ we need to unlink it so another node can create a new ckpt for the next mounter after we leave */ - if (unmount && mg->cp_handle) { - len = snprintf(name.value, SA_MAX_NAME_LENGTH, - "gfsplock.%s", mg->name); - name.length = len; - unlink_checkpoint(mg, &name); - } + if (unmount && mg->cp_handle) + unlink_checkpoint(mg); } int dump_plocks(char *name, int fd) --- cluster/group/gfs_controld/recover.c 2006/08/15 21:38:00 1.13 +++ cluster/group/gfs_controld/recover.c 2006/08/18 16:33:08 1.14 @@ -917,7 +917,7 @@ int *nodeids, int *pos_out, int *neg_out) { struct mg_member *memb, *safe; - int i, found, id, pos = 0, neg = 0, low = -1; + int i, found, id, pos = 
0, neg = 0, low = -1, old_low_finished_nodeid; /* move departed nodes from members list to members_gone */ @@ -990,6 +990,7 @@ if (low == -1 || memb->nodeid < low) low = memb->nodeid; } + old_low_finished_nodeid = mg->low_finished_nodeid; mg->low_finished_nodeid = low; *pos_out = pos; @@ -997,6 +998,15 @@ log_group(mg, "total members %d low_finished_nodeid %d", mg->memb_count, low); + + /* the low nodeid failed and we're the new low nodeid, we need + to unlink the ckpt that the failed node had open so new ckpts + can be created down the road */ + if ((old_low_finished_nodeid != low) && (our_nodeid == low)) { + log_group(mg, "unlink ckpt for failed low node %d", + old_low_finished_nodeid); + unlink_checkpoint(mg); + } } struct mountgroup *create_mg(char *name)