From mboxrd@z Thu Jan 1 00:00:00 1970
From: teigland@sourceware.org
Date: 2 Aug 2006 18:27:58 -0000
Subject: [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c ...
Message-ID: <20060802182758.29785.qmail@sourceware.org>
List-Id:
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-08-02 18:27:58

Modified files:
	group/gfs_controld: lock_dlm.h plock.c recover.c

Log message:
	- checkpoint usage for plocks is getting closer, basic
	writing/reading of plock state to/from ckpt's works, but
	unlinking ckpt's and clearing open ckpt's from processes
	that exit don't appear to be working right in openais

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.8&r2=1.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.3&r2=1.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.5&r2=1.6

--- cluster/group/gfs_controld/lock_dlm.h	2006/07/31 18:37:07	1.8
+++ cluster/group/gfs_controld/lock_dlm.h	2006/08/02 18:27:57	1.9
@@ -140,6 +140,7 @@
 	int emulate_first_mounter;
 	int wait_first_done;
 	int low_finished_nodeid;
+	int save_plocks;
 
 	uint64_t cp_handle;
 	time_t last_checkpoint_time;
@@ -224,6 +225,13 @@
 	char name[MAXNAME];
 };
 
+struct save_msg {
+	struct list_head list;
+	int nodeid;
+	int len;
+	int type;
+	char buf[0];
+};
 
 struct mountgroup *find_mg(char *name);
 struct mountgroup *find_mg_id(uint32_t id);
@@ -245,6 +253,7 @@
 int do_withdraw(char *name);
 int kernel_recovery_done(char *name);
 void ping_kernel_mount(char *table);
+void save_message(struct mountgroup *mg, char *buf, int len, int from, int type);
 
 int client_send(int ci, char *buf, int len);
 
@@ -253,5 +262,6 @@
 void store_plocks(struct mountgroup *mg);
 void retrieve_plocks(struct mountgroup *mg);
 int dump_plocks(char *name, int fd);
+void process_saved_plocks(struct mountgroup *mg);
 
 #endif

--- cluster/group/gfs_controld/plock.c	2006/07/31 18:37:07	1.3
+++ cluster/group/gfs_controld/plock.c	2006/08/02 18:27:57	1.4
@@ -226,9 +226,6 @@
 	else
 		log_error("ckpt init error %d - plocks unavailable", err);
 
-	/* REMOVEME: disable actual use of checkpoints for now */
-	plocks_online = 0;
-
 	rv = open_control();
 	if (rv)
 		return rv;
@@ -740,7 +737,17 @@
 	return rv;
 }
 
-void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
+/* When mg members receive our options message (for our mount), one of them
+   saves all plock state received to that point in a checkpoint and then
+   sends us our journals message.  We know to retrieve the plock state from
+   the checkpoint when we receive our journals message.  Any plock messages
+   that arrive between seeing our options message and our journals message
+   need to be saved and processed after we synchronize our plock state from
+   the checkpoint.  Any plock message received while we're mounting but
+   before we set save_plocks (when we see our options message) can be
+   ignored because it should be reflected in the checkpointed state. */
+
+void _receive_plock(struct mountgroup *mg, char *buf, int len, int from)
 {
 	struct gdlm_plock_info info;
 	struct gdlm_header *hd = (struct gdlm_header *) buf;
@@ -754,6 +761,9 @@
 		  from, info.optype, info.fsid, info.number, info.ex,
 		  info.wait);
 
+	if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
+		return;
+
 	if (from != hd->nodeid || from != info.nodeid) {
 		log_error("receive_plock from %d header %d info %d",
 			  from, hd->nodeid, info.nodeid);
@@ -761,9 +771,6 @@
 		goto out;
 	}
 
-	if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
-		return;
-
 	switch (info.optype) {
 	case GDLM_PLOCK_OP_LOCK:
 		mg->last_plock_time = time(NULL);
@@ -787,6 +794,41 @@
 	}
 }
 
+void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
+{
+	if (mg->save_plocks) {
+		save_message(mg, buf, len, from, MSG_PLOCK);
+		return;
+	}
+
+	if (!mg->got_our_journals) {
+		log_group(mg, "not saving plock messages yet");
+		return;
+	}
+
+	_receive_plock(mg, buf, len, from);
+}
+
+void process_saved_plocks(struct mountgroup *mg)
+{
+	struct save_msg *sm, *sm2;
+
+	mg->save_plocks = 0;
+
+	if (list_empty(&mg->saved_messages))
+		return;
+
+	log_group(mg, "process_saved_plocks");
+
+	list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
+		if (sm->type != MSG_PLOCK)
+			continue;
+		_receive_plock(mg, sm->buf, sm->len, sm->nodeid);
+		list_del(&sm->list);
+		free(sm);
+	}
+}
+
 void plock_exit(void)
 {
 	if (plocks_online)
@@ -807,6 +849,7 @@
 	list_for_each_entry(po, &r->locks, list) {
 		pp->start = po->start;
 		pp->end = po->end;
+		pp->owner = po->owner;
 		pp->pid = po->pid;
 		pp->nodeid = po->nodeid;
 		pp->ex = po->ex;
@@ -818,6 +861,7 @@
 	list_for_each_entry(w, &r->waiters, list) {
 		pp->start = w->info.start;
 		pp->end = w->info.end;
+		pp->owner = w->info.owner;
 		pp->pid = w->info.pid;
 		pp->nodeid = w->info.nodeid;
 		pp->ex = w->info.ex;
@@ -844,8 +888,9 @@
 	if (!r)
 		return -ENOMEM;
 	memset(r, 0, sizeof(struct resource));
-
-	sscanf(numbuf, "%llu", &r->number);
+	INIT_LIST_HEAD(&r->locks);
+	INIT_LIST_HEAD(&r->waiters);
+	sscanf(numbuf, "r%llu", &r->number);
 
 	log_group(mg, "unpack %llx count %d", r->number, count);
 
@@ -856,13 +901,16 @@
 			po = malloc(sizeof(struct posix_lock));
 			po->start = pp->start;
 			po->end = pp->end;
+			po->owner = pp->owner;
 			po->pid = pp->pid;
+			po->nodeid = pp->nodeid;
 			po->ex = pp->ex;
 			list_add_tail(&po->list, &r->locks);
 		} else {
 			w = malloc(sizeof(struct lock_waiter));
 			w->info.start = pp->start;
 			w->info.end = pp->end;
+			w->info.owner = pp->owner;
 			w->info.pid = pp->pid;
 			w->info.nodeid = pp->nodeid;
 			w->info.ex = pp->ex;
@@ -875,7 +923,76 @@
 	return 0;
 }
 
-/* copy all plock state into a checkpoint so new node can retrieve it */
+int unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
+{
+	SaCkptCheckpointHandleT h;
+	SaCkptCheckpointDescriptorT s;
+	SaAisErrorT rv;
+	int ret = 0;
+
+	h = (SaCkptCheckpointHandleT) mg->cp_handle;
+	log_group(mg, "unlink ckpt %llx", h);
+
+ unlink_retry:
+	rv = saCkptCheckpointUnlink(h, name);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt retry");
+		sleep(1);
+		goto unlink_retry;
+	}
+	if (rv == SA_AIS_OK)
+		goto out_close;
+
+	log_error("unlink ckpt error %d %s", rv, mg->name);
+	ret = -1;
+
+ status_retry:
+	rv = saCkptCheckpointStatusGet(h, &s);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt status retry");
+		sleep(1);
+		goto status_retry;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("unlink ckpt status error %d %s", rv, mg->name);
+		goto out_close;
+	}
+
+	log_group(mg, "unlink ckpt status: size %llu, max sections %u, "
+		  "max section size %llu, section count %u, mem %u",
+		  s.checkpointCreationAttributes.checkpointSize,
+		  s.checkpointCreationAttributes.maxSections,
+		  s.checkpointCreationAttributes.maxSectionSize,
+		  s.numberOfSections, s.memoryUsed);
+
+ out_close:
+	rv = saCkptCheckpointClose(h);
+	if (rv == SA_AIS_ERR_TRY_AGAIN) {
+		log_group(mg, "unlink ckpt close retry");
+		sleep(1);
+		goto out_close;
+	}
+	if (rv != SA_AIS_OK) {
+		log_error("unlink ckpt close error %d %s", rv, mg->name);
+		ret = -1;
+	}
+
+	mg->cp_handle = 0;
+	return ret;
+}
+
+/* Copy all plock state into a checkpoint so a new node can retrieve it.
+
+   The low node in the group and the previous node to create the ckpt (with
+   non-zero cp_handle) may be different if a new node joins with a lower
+   nodeid than the previous low node that created the ckpt.  In this case,
+   the prev node has the old ckpt open and will reuse it if no plock state
+   has changed, or will unlink it and create a new one.  The low node will
+   also attempt to create a new ckpt.  That open-create will either fail
+   due to the prev node reusing the old ckpt, or it will race with the
+   open-create on the prev node after the prev node unlinks the old ckpt.
+   Either way, when there are two different nodes in the group calling
+   store_plocks(), one of them will fail at the Open(CREATE) step with
+   ERR_EXIST due to the other. */
 
 void store_plocks(struct mountgroup *mg)
 {
@@ -883,13 +1000,15 @@
 	SaCkptCheckpointHandleT h;
 	SaCkptSectionIdT section_id;
 	SaCkptSectionCreationAttributesT section_attr;
+	SaCkptCheckpointOpenFlagsT flags;
 	SaNameT name;
 	SaAisErrorT rv;
 	char buf[32];
 	struct resource *r;
 	struct posix_lock *po;
 	struct lock_waiter *w;
-	int len, r_count, total_size, section_size, max_section_size;
+	int r_count, lock_count, total_size, section_size, max_section_size;
+	int len;
 
 	if (!plocks_online)
 		return;
@@ -906,65 +1025,75 @@
 
 	/* unlink an old checkpoint before we create a new one */
 	if (mg->cp_handle) {
-		log_group(mg, "store_plocks: unlink ckpt");
-		h = (SaCkptCheckpointHandleT) mg->cp_handle;
-		rv = saCkptCheckpointUnlink(h, &name);
-		if (rv != SA_AIS_OK)
-			log_error("ckpt unlink error %d %s", rv, mg->name);
-		h = 0;
-		mg->cp_handle = 0;
+		if (unlink_checkpoint(mg, &name))
+			return;
 	}
 
 	/* loop through all plocks to figure out sizes to set in
 	   the attr fields */
 
 	r_count = 0;
+	lock_count = 0;
 	total_size = 0;
 	max_section_size = 0;
 
 	list_for_each_entry(r, &mg->resources, list) {
 		r_count++;
 		section_size = 0;
-		list_for_each_entry(po, &r->locks, list)
+		list_for_each_entry(po, &r->locks, list) {
 			section_size += sizeof(struct pack_plock);
-		list_for_each_entry(w, &r->waiters, list)
+			lock_count++;
+		}
+		list_for_each_entry(w, &r->waiters, list) {
 			section_size += sizeof(struct pack_plock);
+			lock_count++;
+		}
 		total_size += section_size;
 		if (section_size > max_section_size)
 			max_section_size = section_size;
 	}
 
-	log_group(mg, "store_plocks: r_count %d total %d max_section %d",
-		  r_count, total_size, max_section_size);
+	log_group(mg, "store_plocks: r_count %d, lock_count %d, pp %d bytes",
+		  r_count, lock_count, sizeof(struct pack_plock));
+
+	log_group(mg, "store_plocks: total %d bytes, max_section %d bytes",
+		  total_size, max_section_size);
 
 	attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
 	attr.checkpointSize = total_size;
 	attr.retentionDuration = SA_TIME_MAX;
-	attr.maxSections = r_count;
+	attr.maxSections = r_count + 1;	/* don't know why we need +1 */
 	attr.maxSectionSize = max_section_size;
-	attr.maxSectionIdSize = 21;	/* 20 digits in max uint64 */
+	attr.maxSectionIdSize = 22;
+
+	/* 22 = 20 digits in max uint64 + "r" prefix + \0 suffix */
+
+	flags = SA_CKPT_CHECKPOINT_READ |
+		SA_CKPT_CHECKPOINT_WRITE |
+		SA_CKPT_CHECKPOINT_CREATE;
 
  open_retry:
-	rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr,
-				  SA_CKPT_CHECKPOINT_CREATE |
-				  SA_CKPT_CHECKPOINT_READ |
-				  SA_CKPT_CHECKPOINT_WRITE,
-				  0, &h);
+	rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr, flags, 0, &h);
 	if (rv == SA_AIS_ERR_TRY_AGAIN) {
 		log_group(mg, "store_plocks: ckpt open retry");
 		sleep(1);
 		goto open_retry;
 	}
+	if (rv == SA_AIS_ERR_EXIST) {
+		log_group(mg, "store_plocks: ckpt already exists");
+		return;
+	}
 	if (rv != SA_AIS_OK) {
 		log_error("store_plocks: ckpt open error %d %s", rv, mg->name);
 		return;
 	}
+	log_group(mg, "store_plocks: open ckpt handle %llx", h);
 	mg->cp_handle = (uint64_t) h;
 
 	list_for_each_entry(r, &mg->resources, list) {
 		memset(&buf, 0, 32);
-		len = snprintf(buf, 32, "%llu", r->number);
+		len = snprintf(buf, 32, "r%llu", r->number);
 
 		section_id.id = buf;
 		section_id.idLen = len + 1;
@@ -973,7 +1102,7 @@
 
 		pack_section_buf(mg, r);
 
-	create_retry:
+ create_retry:
 		rv = saCkptSectionCreate(h, &section_attr, &section_buf,
 					 section_len);
 		if (rv == SA_AIS_ERR_TRY_AGAIN) {
@@ -982,7 +1111,7 @@
 			log_group(mg, "store_plocks: ckpt create retry");
 			sleep(1);
 			goto create_retry;
 		}
 		if (rv != SA_AIS_OK) {
-			log_error("store_plocks: ckpt create error %d %s",
+			log_error("store_plocks: ckpt section create err %d %s",
 				  rv, mg->name);
 			break;
 		}
@@ -1005,6 +1134,8 @@
 	if (!plocks_online)
 		return;
 
+	log_group(mg, "retrieve_plocks");
+
 	len = snprintf(name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s",
 		       mg->name);
 	name.length = len;
@@ -1032,11 +1163,11 @@
 	if (rv != SA_AIS_OK) {
 		log_error("retrieve_plocks: ckpt iterinit error %d %s",
 			  rv, mg->name);
-		return;
+		goto out;
 	}
 
 	while (1) {
-	next_retry:
+ next_retry:
 		rv = saCkptSectionIterationNext(itr, &desc);
 		if (rv == SA_AIS_ERR_NO_SECTIONS)
 			break;
@@ -1048,7 +1179,7 @@
 		if (rv != SA_AIS_OK) {
 			log_error("retrieve_plocks: ckpt iternext error %d %s",
 				  rv, mg->name);
-			break;
+			goto out_it;
 		}
 
 		iov.sectionId = desc.sectionId;
@@ -1056,7 +1187,7 @@
 		iov.dataBuffer = &section_buf;
 		iov.dataSize = desc.sectionSize;
 		iov.dataOffset = 0;
 
-	read_retry:
+ read_retry:
 		rv = saCkptCheckpointRead(h, &iov, 1, NULL);
 		if (rv == SA_AIS_ERR_TRY_AGAIN) {
 			log_group(mg, "retrieve_plocks: ckpt read retry");
 			sleep(1);
 			goto read_retry;
 		}
@@ -1066,13 +1197,19 @@
 		if (rv != SA_AIS_OK) {
 			log_error("retrieve_plocks: ckpt read error %d %s",
 				  rv, mg->name);
-			break;
+			goto out_it;
 		}
 
+		log_group(mg, "retrieve_plocks: ckpt read %llu bytes",
+			  iov.readSize);
+		section_len = iov.readSize;
+
 		unpack_section_buf(mg, desc.sectionId.id,
 				   desc.sectionId.idLen);
 	}
 
+ out_it:
 	saCkptSectionIterationFinalize(itr);
+ out:
 	saCkptCheckpointClose(h);
 }

--- cluster/group/gfs_controld/recover.c	2006/07/31 18:37:07	1.5
+++ cluster/group/gfs_controld/recover.c	2006/08/02 18:27:57	1.6
@@ -12,14 +12,6 @@
 
 #include "lock_dlm.h"
 
-struct save_msg {
-	struct list_head list;
-	int nodeid;
-	int len;
-	int type;
-	char buf[0];
-};
-
 #define SYSFS_DIR "/sys/fs"
 #define JID_INIT -9
 
@@ -597,12 +589,14 @@
 	log_group(mg, "assign_journal: new member %d got jid %d",
 		  new->nodeid, new->jid);
 
+	if (mg->low_finished_nodeid == our_nodeid || mg->cp_handle)
+		store_plocks(mg);
+
 	/* if we're the first mounter and haven't gotten others_may_mount
 	   yet, then don't send journals until kernel_recovery_done_first
 	   so the second node won't mount the fs until omm. */
 
 	if (mg->low_finished_nodeid == our_nodeid) {
-		store_plocks(mg);
 		if (mg->first_mounter && !mg->first_mounter_done) {
 			log_group(mg, "delay sending journals to %d",
 				  new->nodeid);
@@ -655,6 +649,7 @@
 
 	if (hd->nodeid == our_nodeid) {
 		mg->got_our_options = 1;
+		mg->save_plocks = 1;
 		return;
 	}
 
@@ -1784,7 +1779,7 @@
 	}
 
 	retrieve_plocks(mg);
-	/* process_saved_plocks(mg); */
+	process_saved_plocks(mg);
  out:
 	notify_mount_client(mg);
 }
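
A note on the receive path above: receive_plock() now queues incoming
messages while save_plocks is set, and process_saved_plocks() replays the
queue once the checkpoint has been read.  The standalone sketch below shows
just that save-and-replay pattern; the mountgroup fields and list plumbing
are simplified stand-ins, not the gfs_controld code (the real version chains
save_msg through a list_head on mg->saved_messages and leaves non-plock
messages on the queue):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MSG_PLOCK 1	/* stand-in for the real message type constant */

/* same fields as the save_msg struct moved into lock_dlm.h above, but
   chained with a plain singly-linked list for this sketch */
struct save_msg {
	struct save_msg *next;
	int nodeid;
	int len;
	int type;
	char buf[];
};

struct mountgroup {
	int save_plocks;	/* set when we see our own options message */
	struct save_msg *saved_head, *saved_tail;
};

/* copy a message that arrived before our plock state is in sync */
static void save_message(struct mountgroup *mg, const char *buf, int len,
			 int from, int type)
{
	struct save_msg *sm;

	sm = malloc(sizeof(*sm) + len);
	if (!sm)
		return;
	sm->next = NULL;
	sm->nodeid = from;
	sm->len = len;
	sm->type = type;
	memcpy(sm->buf, buf, len);

	if (mg->saved_tail)
		mg->saved_tail->next = sm;
	else
		mg->saved_head = sm;
	mg->saved_tail = sm;
}

/* stand-in for the real _receive_plock() */
static void _receive_plock(struct mountgroup *mg, const char *buf, int len,
			   int from)
{
	(void)mg;	/* unused in this sketch */
	printf("apply %d-byte plock message from nodeid %d\n", len, from);
}

/* after the checkpoint is read: replay everything queued, in arrival order
   (the real version frees only the MSG_PLOCK entries) */
static void process_saved_plocks(struct mountgroup *mg)
{
	struct save_msg *sm, *next;

	mg->save_plocks = 0;

	for (sm = mg->saved_head; sm; sm = next) {
		next = sm->next;
		if (sm->type == MSG_PLOCK)
			_receive_plock(mg, sm->buf, sm->len, sm->nodeid);
		free(sm);
	}
	mg->saved_head = mg->saved_tail = NULL;
}

int main(void)
{
	struct mountgroup mg = { .save_plocks = 1 };

	/* messages arriving between our options and journals messages */
	save_message(&mg, "plock-1", 8, 2, MSG_PLOCK);
	save_message(&mg, "plock-2", 8, 3, MSG_PLOCK);

	/* ...retrieve_plocks() would read the checkpoint here... */

	process_saved_plocks(&mg);
	return 0;
}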
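
The unlink_retry/status_retry/out_close/open_retry/create_retry/read_retry
loops all repeat the same idiom: make the call, check for
SA_AIS_ERR_TRY_AGAIN, sleep(1), goto the label.  One way that idiom could be
factored out is sketched below; CKPT_RETRY is a hypothetical helper, not
something this patch or openais provides, and it gives up the per-site retry
log messages (and, like the open-coded loops, it retries forever):

#include <unistd.h>	/* sleep() */
#include <saAis.h>	/* SaAisErrorT, SA_AIS_ERR_TRY_AGAIN */

/* hypothetical helper: evaluate an AIS call, sleeping one second and
   retrying for as long as it returns SA_AIS_ERR_TRY_AGAIN */
#define CKPT_RETRY(rv, call)				\
	do {						\
		(rv) = (call);				\
		if ((rv) != SA_AIS_ERR_TRY_AGAIN)	\
			break;				\
		sleep(1);				\
	} while (1)

/* e.g. the top of unlink_checkpoint() above would reduce to:
 *
 *	SaAisErrorT rv;
 *
 *	CKPT_RETRY(rv, saCkptCheckpointUnlink(h, name));
 *	if (rv == SA_AIS_OK)
 *		goto out_close;
 */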
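
On the maxSectionIdSize change from 21 to 22: the largest uint64,
18446744073709551615, prints as 20 digits, so an "r"-prefixed section name
is at most 21 characters, and idLen = len + 1 counts the terminating \0,
giving 22.  A quick standalone check of that arithmetic (this sketch uses
PRIu64 where the daemon code uses %llu):

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	char buf[32];
	int len;

	/* worst-case section name: "r" + 20-digit max uint64 */
	len = snprintf(buf, sizeof(buf), "r%" PRIu64, UINT64_MAX);

	printf("%s: %d chars, idLen %d\n", buf, len, len + 1);
	assert(len == 21);	/* so idLen = 22, matching the attr */
	return 0;
}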