From mboxrd@z Thu Jan 1 00:00:00 1970 From: teigland@sourceware.org Date: 31 Jul 2006 18:37:08 -0000 Subject: [Cluster-devel] cluster/group/gfs_controld Makefile cpg.c lock ... Message-ID: <20060731183708.14450.qmail@sourceware.org> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit CVSROOT: /cvs/cluster Module name: cluster Changes by: teigland at sourceware.org 2006-07-31 18:37:07 Modified files: group/gfs_controld: Makefile cpg.c lock_dlm.h main.c plock.c recover.c Log message: - use nodeid and owner when checking the owner of a plock instead of just pid - this requires the recent addition of an owner field to the struct in the lock_dlm_plock.h kernel header - add ability to dump all the plocks to a client (group_tool) to display - add new code that uses the SA CKPT service to synchronize all the plock state for the group to a new node that joins the group, this is currently disabled until it's been tested and debugged Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/Makefile.diff?cvsroot=cluster&r1=1.5&r2=1.6 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/cpg.c.diff?cvsroot=cluster&r1=1.4&r2=1.5 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.7&r2=1.8 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/main.c.diff?cvsroot=cluster&r1=1.6&r2=1.7 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.2&r2=1.3 http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.4&r2=1.5 --- cluster/group/gfs_controld/Makefile 2006/07/07 16:53:26 1.5 +++ cluster/group/gfs_controld/Makefile 2006/07/31 18:37:07 1.6 @@ -38,7 +38,7 @@ plock.o \ recover.o \ ../lib/libgroup.a - $(CC) $(LDFLAGS) -o $@ $^ -lcman -lcpg + $(CC) $(LDFLAGS) -o $@ $^ -lcman -lcpg -lSaCkpt main.o: main.c --- cluster/group/gfs_controld/cpg.c 2006/06/30 15:35:23 1.4 +++ cluster/group/gfs_controld/cpg.c 2006/07/31 18:37:07 1.5 @@ -38,8 +38,11 @@ hd = (struct gdlm_header *) data; mg = find_mg(hd->name); - if (!mg) + if (!mg) { + log_error("cpg message from %d len %d no group %s", + nodeid, len, hd->name); return; + } hd->version[0] = le16_to_cpu(hd->version[0]); hd->version[1] = le16_to_cpu(hd->version[1]); @@ -152,8 +155,10 @@ } cpg_fd_get(daemon_handle, &fd); - if (fd < 0) + if (fd < 0) { + log_error("cpg_fd_get error %d", error); return -1; + } memset(&daemon_name, 0, sizeof(daemon_name)); strcpy(daemon_name.value, "gfs_controld"); @@ -187,15 +192,17 @@ retry: error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); - if (error != CPG_OK) - log_error("cpg_mcast_joined error %d handle %llx", error, h); if (error == CPG_ERR_TRY_AGAIN) { - /* FIXME: backoff say .25 sec, .5 sec, .75 sec, 1 sec */ + log_debug("cpg_mcast_joined error %d", error); retries++; if (retries > 3) sleep(1); goto retry; } + if (error != CPG_OK) { + log_error("cpg_mcast_joined error %d handle %llx", error, h); + return -1; + } return 0; } --- cluster/group/gfs_controld/lock_dlm.h 2006/07/20 20:19:44 1.7 +++ cluster/group/gfs_controld/lock_dlm.h 2006/07/31 18:37:07 1.8 @@ -141,6 +141,10 @@ int wait_first_done; int low_finished_nodeid; + uint64_t cp_handle; + time_t last_checkpoint_time; + time_t last_plock_time; + int needs_recovery; int our_jid; int spectator; @@ -246,4 +250,8 @@ int send_group_message(struct mountgroup *mg, int len, char *buf); +void store_plocks(struct mountgroup *mg); +void retrieve_plocks(struct mountgroup *mg); +int dump_plocks(char *name, int fd); + #endif --- cluster/group/gfs_controld/main.c 2006/07/20 20:19:44 1.6 +++ cluster/group/gfs_controld/main.c 2006/07/31 18:37:07 1.7 @@ -125,7 +125,7 @@ return write(client[ci].fd, buf, len); } -static int do_dump(int ci) +static int dump_debug(int ci) { int rv, len; @@ -182,7 +182,11 @@ else if (!strcmp(cmd, "remount")) rv = do_remount(ci, dir, argv[3]); else if (!strcmp(cmd, "dump")) { - do_dump(ci); + dump_debug(ci); + return 0; + } else if (!strcmp(cmd, "plocks")) { + dump_plocks(dir, client[ci].fd); + client_dead(ci); return 0; } else rv = -EINVAL; --- cluster/group/gfs_controld/plock.c 2006/07/19 14:44:40 1.2 +++ cluster/group/gfs_controld/plock.c 2006/07/31 18:37:07 1.3 @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include #include "lock_dlm.h" @@ -43,6 +45,25 @@ static int control_fd = -1; extern int our_nodeid; +static int plocks_online = 0; + +static SaCkptHandleT ckpt_handle; +static SaCkptCallbacksT callbacks = { 0, 0 }; +static SaVersionT version = { 'B', 1, 1 }; +static char section_buf[1024 * 1024]; +static uint32_t section_len; + +struct pack_plock { + uint64_t start; + uint64_t end; + uint64_t owner; + uint32_t pid; + uint32_t nodeid; + uint8_t ex; + uint8_t waiter; + uint16_t pad1; + uint32_t pad; +}; struct resource { struct list_head list; /* list of resources */ @@ -54,9 +75,11 @@ struct posix_lock { struct list_head list; /* resource locks or waiters list */ uint32_t pid; + uint64_t owner; uint64_t start; uint64_t end; int ex; + int nodeid; }; struct lock_waiter { @@ -194,8 +217,18 @@ int setup_plocks(void) { + SaAisErrorT err; int rv; + err = saCkptInitialize(&ckpt_handle, &callbacks, &version); + if (err == SA_AIS_OK) + plocks_online = 1; + else + log_error("ckpt init error %d - plocks unavailable", err); + + /* REMOVEME: disable actual use of checkpoints for now */ + plocks_online = 0; + rv = open_control(); if (rv) return rv; @@ -222,6 +255,7 @@ mg = find_mg_id(info.fsid); if (!mg) { + log_debug("process_plocks: no mg id %x", info.fsid); rv = -EEXIST; goto fail; } @@ -232,13 +266,16 @@ rv = -ENOMEM; goto fail; } + memset(buf, 0, len); + + info.nodeid = our_nodeid; /* FIXME: do byte swapping */ hd = (struct gdlm_header *)buf; hd->type = MSG_PLOCK; hd->nodeid = our_nodeid; - hd->to_nodeid = 0; /* to all */ + hd->to_nodeid = 0; memcpy(buf + sizeof(struct gdlm_header), &info, sizeof(info)); rv = send_group_message(mg, len, buf); @@ -418,7 +455,7 @@ struct posix_lock *po; list_for_each_entry(po, &r->locks, list) { - if (po->pid == in->pid) + if (po->nodeid == in->nodeid && po->owner == in->owner) continue; if (!ranges_overlap(po->start, po->end, in->start, in->end)) continue; @@ -429,8 +466,8 @@ return 0; } -static int add_lock(struct resource *r, uint32_t pid, int ex, - uint64_t start, uint64_t end) +static int add_lock(struct resource *r, uint32_t nodeid, uint64_t owner, + uint32_t pid, int ex, uint64_t start, uint64_t end) { struct posix_lock *po; @@ -441,6 +478,8 @@ po->start = start; po->end = end; + po->nodeid = nodeid; + po->owner = owner; po->pid = pid; po->ex = ex; list_add_tail(&po->list, &r->locks); @@ -466,7 +505,7 @@ po->end = in->end; po->ex = in->ex; - add_lock(r, in->pid, !in->ex, start2, end2); + add_lock(r, in->nodeid, in->owner, in->pid, !in->ex, start2, end2); return 0; } @@ -480,8 +519,11 @@ struct gdlm_plock_info *in) { - add_lock(r, in->pid, !in->ex, po->start, in->start - 1); - add_lock(r, in->pid, !in->ex, in->end + 1, po->end); + add_lock(r, in->nodeid, in->owner, in->pid, + !in->ex, po->start, in->start - 1); + + add_lock(r, in->nodeid, in->owner, in->pid, + !in->ex, in->end + 1, po->end); po->start = in->start; po->end = in->end; @@ -497,7 +539,7 @@ int rv = 0; list_for_each_entry_safe(po, safe, &r->locks, list) { - if (po->pid != in->pid) + if (po->nodeid != in->nodeid || po->owner != in->owner) continue; if (!ranges_overlap(po->start, po->end, in->start, in->end)) continue; @@ -546,7 +588,8 @@ } } - rv = add_lock(r, in->pid, in->ex, in->start, in->end); + rv = add_lock(r, in->nodeid, in->owner, in->pid, + in->ex, in->start, in->end); out: return rv; @@ -560,7 +603,7 @@ int rv = 0; list_for_each_entry_safe(po, safe, &r->locks, list) { - if (po->pid != in->pid) + if (po->nodeid != in->nodeid || po->owner != in->owner) continue; if (!ranges_overlap(po->start, po->end, in->start, in->end)) continue; @@ -587,7 +630,8 @@ /* RN within RE - shrink and update RE to be front * fragment, and add a new lock for back fragment */ - add_lock(r, in->pid, po->ex, in->end + 1, po->end); + add_lock(r, in->nodeid, in->owner, in->pid, + po->ex, in->end + 1, po->end); po->end = in->start - 1; goto out; @@ -704,12 +748,15 @@ memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info)); - log_group(mg, "receive_plock %d op %d fs %x num %llx ex %d wait %d", + /* FIXME: do byte swapping */ + + log_group(mg, "receive_plock from %d op %d fs %x num %llx ex %d w %d", from, info.optype, info.fsid, info.number, info.ex, info.wait); - if (from != hd->nodeid) { - log_error("receive_plock from %d vs %d", from, hd->nodeid); + if (from != hd->nodeid || from != info.nodeid) { + log_error("receive_plock from %d header %d info %d", + from, hd->nodeid, info.nodeid); rv = -EINVAL; goto out; } @@ -719,9 +766,11 @@ switch (info.optype) { case GDLM_PLOCK_OP_LOCK: + mg->last_plock_time = time(NULL); rv = do_lock(mg, &info); break; case GDLM_PLOCK_OP_UNLOCK: + mg->last_plock_time = time(NULL); rv = do_unlock(mg, &info); break; case GDLM_PLOCK_OP_GET: @@ -738,3 +787,336 @@ } } +void plock_exit(void) +{ + if (plocks_online) + saCkptFinalize(ckpt_handle); +} + +void pack_section_buf(struct mountgroup *mg, struct resource *r) +{ + struct pack_plock *pp; + struct posix_lock *po; + struct lock_waiter *w; + int count = 0; + + memset(§ion_buf, 0, sizeof(section_buf)); + + pp = (struct pack_plock *) §ion_buf; + + list_for_each_entry(po, &r->locks, list) { + pp->start = po->start; + pp->end = po->end; + pp->pid = po->pid; + pp->nodeid = po->nodeid; + pp->ex = po->ex; + pp->waiter = 0; + pp++; + count++; + } + + list_for_each_entry(w, &r->waiters, list) { + pp->start = w->info.start; + pp->end = w->info.end; + pp->pid = w->info.pid; + pp->nodeid = w->info.nodeid; + pp->ex = w->info.ex; + pp->waiter = 1; + pp++; + count++; + } + + section_len = count * sizeof(struct pack_plock); + + log_group(mg, "pack %llx count %d", r->number, count); +} + +int unpack_section_buf(struct mountgroup *mg, char *numbuf, int buflen) +{ + struct pack_plock *pp; + struct posix_lock *po; + struct lock_waiter *w; + struct resource *r; + int count = section_len / sizeof(struct pack_plock); + int i; + + r = malloc(sizeof(struct resource)); + if (!r) + return -ENOMEM; + memset(r, 0, sizeof(struct resource)); + + sscanf(numbuf, "%llu", &r->number); + + log_group(mg, "unpack %llx count %d", r->number, count); + + pp = (struct pack_plock *) §ion_buf; + + for (i = 0; i < count; i++) { + if (!pp->waiter) { + po = malloc(sizeof(struct posix_lock)); + po->start = pp->start; + po->end = pp->end; + po->pid = pp->pid; + po->ex = pp->ex; + list_add_tail(&po->list, &r->locks); + } else { + w = malloc(sizeof(struct lock_waiter)); + w->info.start = pp->start; + w->info.end = pp->end; + w->info.pid = pp->pid; + w->info.nodeid = pp->nodeid; + w->info.ex = pp->ex; + list_add_tail(&w->list, &r->waiters); + } + pp++; + } + + list_add_tail(&r->list, &mg->resources); + return 0; +} + +/* copy all plock state into a checkpoint so new node can retrieve it */ + +void store_plocks(struct mountgroup *mg) +{ + SaCkptCheckpointCreationAttributesT attr; + SaCkptCheckpointHandleT h; + SaCkptSectionIdT section_id; + SaCkptSectionCreationAttributesT section_attr; + SaNameT name; + SaAisErrorT rv; + char buf[32]; + struct resource *r; + struct posix_lock *po; + struct lock_waiter *w; + int len, r_count, total_size, section_size, max_section_size; + + if (!plocks_online) + return; + + /* no change to plock state since we created the last checkpoint */ + if (mg->last_checkpoint_time > mg->last_plock_time) { + log_group(mg, "store_plocks: ckpt uptodate"); + return; + } + mg->last_checkpoint_time = time(NULL); + + len = snprintf(name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", mg->name); + name.length = len; + + /* unlink an old checkpoint before we create a new one */ + if (mg->cp_handle) { + log_group(mg, "store_plocks: unlink ckpt"); + h = (SaCkptCheckpointHandleT) mg->cp_handle; + rv = saCkptCheckpointUnlink(h, &name); + if (rv != SA_AIS_OK) + log_error("ckpt unlink error %d %s", rv, mg->name); + h = 0; + mg->cp_handle = 0; + } + + /* loop through all plocks to figure out sizes to set in + the attr fields */ + + r_count = 0; + total_size = 0; + max_section_size = 0; + + list_for_each_entry(r, &mg->resources, list) { + r_count++; + section_size = 0; + list_for_each_entry(po, &r->locks, list) + section_size += sizeof(struct pack_plock); + list_for_each_entry(w, &r->waiters, list) + section_size += sizeof(struct pack_plock); + total_size += section_size; + if (section_size > max_section_size) + max_section_size = section_size; + } + + log_group(mg, "store_plocks: r_count %d total %d max_section %d", + r_count, total_size, max_section_size); + + attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS; + attr.checkpointSize = total_size; + attr.retentionDuration = SA_TIME_MAX; + attr.maxSections = r_count; + attr.maxSectionSize = max_section_size; + attr.maxSectionIdSize = 21; /* 20 digits in max uint64 */ + + open_retry: + rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr, + SA_CKPT_CHECKPOINT_CREATE | + SA_CKPT_CHECKPOINT_READ | + SA_CKPT_CHECKPOINT_WRITE, + 0, &h); + if (rv == SA_AIS_ERR_TRY_AGAIN) { + log_group(mg, "store_plocks: ckpt open retry"); + sleep(1); + goto open_retry; + } + if (rv != SA_AIS_OK) { + log_error("store_plocks: ckpt open error %d %s", rv, mg->name); + return; + } + + mg->cp_handle = (uint64_t) h; + + list_for_each_entry(r, &mg->resources, list) { + memset(&buf, 0, 32); + len = snprintf(buf, 32, "%llu", r->number); + + section_id.id = buf; + section_id.idLen = len + 1; + section_attr.sectionId = §ion_id; + section_attr.expirationTime = SA_TIME_END; + + pack_section_buf(mg, r); + + create_retry: + rv = saCkptSectionCreate(h, §ion_attr, §ion_buf, + section_len); + if (rv == SA_AIS_ERR_TRY_AGAIN) { + log_group(mg, "store_plocks: ckpt create retry"); + sleep(1); + goto create_retry; + } + if (rv != SA_AIS_OK) { + log_error("store_plocks: ckpt create error %d %s", + rv, mg->name); + break; + } + } +} + +/* called by a node that's just been added to the group to get existing plock + state */ + +void retrieve_plocks(struct mountgroup *mg) +{ + SaCkptCheckpointHandleT h; + SaCkptSectionIterationHandleT itr; + SaCkptSectionDescriptorT desc; + SaCkptIOVectorElementT iov; + SaNameT name; + SaAisErrorT rv; + int len; + + if (!plocks_online) + return; + + len = snprintf(name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", mg->name); + name.length = len; + + open_retry: + rv = saCkptCheckpointOpen(ckpt_handle, &name, NULL, + SA_CKPT_CHECKPOINT_READ, 0, &h); + if (rv == SA_AIS_ERR_TRY_AGAIN) { + log_group(mg, "retrieve_plocks: ckpt open retry"); + sleep(1); + goto open_retry; + } + if (rv != SA_AIS_OK) { + log_error("retrieve_plocks: ckpt open error %d %s", + rv, mg->name); + return; + } + + init_retry: + rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, 0, &itr); + if (rv == SA_AIS_ERR_TRY_AGAIN) { + log_group(mg, "retrieve_plocks: ckpt iterinit retry"); + sleep(1); + goto init_retry; + } + if (rv != SA_AIS_OK) { + log_error("retrieve_plocks: ckpt iterinit error %d %s", + rv, mg->name); + return; + } + + while (1) { + next_retry: + rv = saCkptSectionIterationNext(itr, &desc); + if (rv == SA_AIS_ERR_NO_SECTIONS) + break; + if (rv == SA_AIS_ERR_TRY_AGAIN) { + log_group(mg, "retrieve_plocks: ckpt iternext retry"); + sleep(1); + goto next_retry; + } + if (rv != SA_AIS_OK) { + log_error("retrieve_plocks: ckpt iternext error %d %s", + rv, mg->name); + break; + } + + iov.sectionId = desc.sectionId; + iov.dataBuffer = §ion_buf; + iov.dataSize = desc.sectionSize; + iov.dataOffset = 0; + + read_retry: + rv = saCkptCheckpointRead(h, &iov, 1, NULL); + if (rv == SA_AIS_ERR_TRY_AGAIN) { + log_group(mg, "retrieve_plocks: ckpt read retry"); + sleep(1); + goto read_retry; + } + if (rv != SA_AIS_OK) { + log_error("retrieve_plocks: ckpt read error %d %s", + rv, mg->name); + break; + } + + unpack_section_buf(mg, desc.sectionId.id, desc.sectionId.idLen); + } + + saCkptSectionIterationFinalize(itr); + saCkptCheckpointClose(h); +} + +int dump_plocks(char *name, int fd) +{ + struct mountgroup *mg; + struct posix_lock *po; + struct lock_waiter *w; + struct resource *r; + char line[MAXLINE]; + int rv; + + if (!name) + return -1; + + mg = find_mg(name); + if (!mg) + return -1; + + list_for_each_entry(r, &mg->resources, list) { + + list_for_each_entry(po, &r->locks, list) { + snprintf(line, MAXLINE, + "%llu %s %llu-%llu nodeid %d pid %u owner %llx\n", + r->number, + po->ex ? "WR" : "RD", + po->start, po->end, + po->nodeid, po->pid, po->owner); + + rv = write(fd, line, strlen(line)); + } + + list_for_each_entry(w, &r->waiters, list) { + snprintf(line, MAXLINE, + "%llu WAITING %s %llu-%llu nodeid %d pid %u owner %llx\n", + r->number, + po->ex ? "WR" : "RD", + po->start, po->end, + po->nodeid, po->pid, po->owner); + + rv = write(fd, line, strlen(line)); + } + } + + return 0; +} + --- cluster/group/gfs_controld/recover.c 2006/07/20 20:19:44 1.4 +++ cluster/group/gfs_controld/recover.c 2006/07/31 18:37:07 1.5 @@ -602,6 +602,7 @@ so the second node won't mount the fs until omm. */ if (mg->low_finished_nodeid == our_nodeid) { + store_plocks(mg); if (mg->first_mounter && !mg->first_mounter_done) { log_group(mg, "delay sending journals to %d", new->nodeid); @@ -1781,6 +1782,9 @@ mg->first_mounter = 1; mg->first_mounter_done = 0; } + + retrieve_plocks(mg); + /* process_saved_plocks(mg); */ out: notify_mount_client(mg); }