From: teigland@sourceware.org <teigland@sourceware.org>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c ...
Date: 2 Aug 2006 18:27:58 -0000 [thread overview]
Message-ID: <20060802182758.29785.qmail@sourceware.org> (raw)
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-08-02 18:27:58
Modified files:
group/gfs_controld: lock_dlm.h plock.c recover.c
Log message:
- checkpoint usage for plocks is getting closer, basic writing/reading
of plock state to/from ckpt's works, but unlinking ckpt's and clearing
open ckpt's from processes that exit don't appear to be working right
in openais
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.8&r2=1.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/plock.c.diff?cvsroot=cluster&r1=1.3&r2=1.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.5&r2=1.6
--- cluster/group/gfs_controld/lock_dlm.h 2006/07/31 18:37:07 1.8
+++ cluster/group/gfs_controld/lock_dlm.h 2006/08/02 18:27:57 1.9
@@ -140,6 +140,7 @@
int emulate_first_mounter;
int wait_first_done;
int low_finished_nodeid;
+ int save_plocks;
uint64_t cp_handle;
time_t last_checkpoint_time;
@@ -224,6 +225,13 @@
char name[MAXNAME];
};
+struct save_msg {
+ struct list_head list;
+ int nodeid;
+ int len;
+ int type;
+ char buf[0];
+};
struct mountgroup *find_mg(char *name);
struct mountgroup *find_mg_id(uint32_t id);
@@ -245,6 +253,7 @@
int do_withdraw(char *name);
int kernel_recovery_done(char *name);
void ping_kernel_mount(char *table);
+void save_message(struct mountgroup *mg, char *buf, int len, int from, int type);
int client_send(int ci, char *buf, int len);
@@ -253,5 +262,6 @@
void store_plocks(struct mountgroup *mg);
void retrieve_plocks(struct mountgroup *mg);
int dump_plocks(char *name, int fd);
+void process_saved_plocks(struct mountgroup *mg);
#endif
--- cluster/group/gfs_controld/plock.c 2006/07/31 18:37:07 1.3
+++ cluster/group/gfs_controld/plock.c 2006/08/02 18:27:57 1.4
@@ -226,9 +226,6 @@
else
log_error("ckpt init error %d - plocks unavailable", err);
- /* REMOVEME: disable actual use of checkpoints for now */
- plocks_online = 0;
-
rv = open_control();
if (rv)
return rv;
@@ -740,7 +737,17 @@
return rv;
}
-void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
+/* When mg members receive our options message (for our mount), one of them
+ saves all plock state received to that point in a checkpoint and then sends
+ us our journals message. We know to retrieve the plock state from the
+ checkpoint when we receive our journals message. Any plock messages that
+ arrive between seeing our options message and our journals message need to
+ be saved and processed after we synchronize our plock state from the
+ checkpoint. Any plock message received while we're mounting but before we
+ set save_plocks (when we see our options message) can be ignored because it
+ should be reflected in the checkpointed state. */
+
+void _receive_plock(struct mountgroup *mg, char *buf, int len, int from)
{
struct gdlm_plock_info info;
struct gdlm_header *hd = (struct gdlm_header *) buf;
@@ -754,6 +761,9 @@
from, info.optype, info.fsid, info.number, info.ex,
info.wait);
+ if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
+ return;
+
if (from != hd->nodeid || from != info.nodeid) {
log_error("receive_plock from %d header %d info %d",
from, hd->nodeid, info.nodeid);
@@ -761,9 +771,6 @@
goto out;
}
- if (info.optype == GDLM_PLOCK_OP_GET && from != our_nodeid)
- return;
-
switch (info.optype) {
case GDLM_PLOCK_OP_LOCK:
mg->last_plock_time = time(NULL);
@@ -787,6 +794,41 @@
}
}
+void receive_plock(struct mountgroup *mg, char *buf, int len, int from)
+{
+ if (mg->save_plocks) {
+ save_message(mg, buf, len, from, MSG_PLOCK);
+ return;
+ }
+
+ if (!mg->got_our_journals) {
+ log_group(mg, "not saving plock messages yet");
+ return;
+ }
+
+ _receive_plock(mg, buf, len, from);
+}
+
+void process_saved_plocks(struct mountgroup *mg)
+{
+ struct save_msg *sm, *sm2;
+
+ mg->save_plocks = 0;
+
+ if (list_empty(&mg->saved_messages))
+ return;
+
+ log_group(mg, "process_saved_plocks");
+
+ list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
+ if (sm->type != MSG_PLOCK)
+ continue;
+ _receive_plock(mg, sm->buf, sm->len, sm->nodeid);
+ list_del(&sm->list);
+ free(sm);
+ }
+}
+
void plock_exit(void)
{
if (plocks_online)
@@ -807,6 +849,7 @@
list_for_each_entry(po, &r->locks, list) {
pp->start = po->start;
pp->end = po->end;
+ pp->owner = po->owner;
pp->pid = po->pid;
pp->nodeid = po->nodeid;
pp->ex = po->ex;
@@ -818,6 +861,7 @@
list_for_each_entry(w, &r->waiters, list) {
pp->start = w->info.start;
pp->end = w->info.end;
+ pp->owner = w->info.owner;
pp->pid = w->info.pid;
pp->nodeid = w->info.nodeid;
pp->ex = w->info.ex;
@@ -844,8 +888,9 @@
if (!r)
return -ENOMEM;
memset(r, 0, sizeof(struct resource));
-
- sscanf(numbuf, "%llu", &r->number);
+ INIT_LIST_HEAD(&r->locks);
+ INIT_LIST_HEAD(&r->waiters);
+ sscanf(numbuf, "r%llu", &r->number);
log_group(mg, "unpack %llx count %d", r->number, count);
@@ -856,13 +901,16 @@
po = malloc(sizeof(struct posix_lock));
po->start = pp->start;
po->end = pp->end;
+ po->owner = pp->owner;
po->pid = pp->pid;
+ po->nodeid = pp->nodeid;
po->ex = pp->ex;
list_add_tail(&po->list, &r->locks);
} else {
w = malloc(sizeof(struct lock_waiter));
w->info.start = pp->start;
w->info.end = pp->end;
+ w->info.owner = pp->owner;
w->info.pid = pp->pid;
w->info.nodeid = pp->nodeid;
w->info.ex = pp->ex;
@@ -875,7 +923,76 @@
return 0;
}
-/* copy all plock state into a checkpoint so new node can retrieve it */
+int unlink_checkpoint(struct mountgroup *mg, SaNameT *name)
+{
+ SaCkptCheckpointHandleT h;
+ SaCkptCheckpointDescriptorT s;
+ SaAisErrorT rv;
+ int ret = 0;
+
+ h = (SaCkptCheckpointHandleT) mg->cp_handle;
+ log_group(mg, "unlink ckpt %llx", h);
+
+ unlink_retry:
+ rv = saCkptCheckpointUnlink(h, name);
+ if (rv == SA_AIS_ERR_TRY_AGAIN) {
+ log_group(mg, "unlink ckpt retry");
+ sleep(1);
+ goto unlink_retry;
+ }
+ if (rv == SA_AIS_OK)
+ goto out_close;
+
+ log_error("unlink ckpt error %d %s", rv, mg->name);
+ ret = -1;
+
+ status_retry:
+ rv = saCkptCheckpointStatusGet(h, &s);
+ if (rv == SA_AIS_ERR_TRY_AGAIN) {
+ log_group(mg, "unlink ckpt status retry");
+ sleep(1);
+ goto status_retry;
+ }
+ if (rv != SA_AIS_OK) {
+ log_error("unlink ckpt status error %d %s", rv, mg->name);
+ goto out_close;
+ }
+
+ log_group(mg, "unlink ckpt status: size %llu, max sections %u, "
+ "max section size %llu, section count %u, mem %u",
+ s.checkpointCreationAttributes.checkpointSize,
+ s.checkpointCreationAttributes.maxSections,
+ s.checkpointCreationAttributes.maxSectionSize,
+ s.numberOfSections, s.memoryUsed);
+
+ out_close:
+ rv = saCkptCheckpointClose(h);
+ if (rv == SA_AIS_ERR_TRY_AGAIN) {
+ log_group(mg, "unlink ckpt close retry");
+ sleep(1);
+ goto out_close;
+ }
+ if (rv != SA_AIS_OK) {
+ log_error("unlink ckpt close error %d %s", rv, mg->name);
+ ret = -1;
+ }
+
+ mg->cp_handle = 0;
+ return ret;
+}
+
+/* Copy all plock state into a checkpoint so new node can retrieve it.
+
+ The low node in the group and the previous node to create the ckpt (with
+ non-zero cp_handle) may be different if a new node joins with a lower nodeid
+ than the previous low node that created the ckpt. In this case, the prev
+ node has the old ckpt open and will reuse it if no plock state has changed,
+ or will unlink it and create a new one. The low node will also attempt to
+ create a new ckpt. That open-create will either fail due to the prev node
+ reusing the old ckpt, or it will race with the open-create on the prev node
+ after the prev node unlinks the old ckpt. Either way, when there are two
+ different nodes in the group calling store_plocks(), one of them will fail
+ at the Open(CREATE) step with ERR_EXIST due to the other. */
void store_plocks(struct mountgroup *mg)
{
@@ -883,13 +1000,15 @@
SaCkptCheckpointHandleT h;
SaCkptSectionIdT section_id;
SaCkptSectionCreationAttributesT section_attr;
+ SaCkptCheckpointOpenFlagsT flags;
SaNameT name;
SaAisErrorT rv;
char buf[32];
struct resource *r;
struct posix_lock *po;
struct lock_waiter *w;
- int len, r_count, total_size, section_size, max_section_size;
+ int r_count, lock_count, total_size, section_size, max_section_size;
+ int len;
if (!plocks_online)
return;
@@ -906,65 +1025,75 @@
/* unlink an old checkpoint before we create a new one */
if (mg->cp_handle) {
- log_group(mg, "store_plocks: unlink ckpt");
- h = (SaCkptCheckpointHandleT) mg->cp_handle;
- rv = saCkptCheckpointUnlink(h, &name);
- if (rv != SA_AIS_OK)
- log_error("ckpt unlink error %d %s", rv, mg->name);
- h = 0;
- mg->cp_handle = 0;
+ if (unlink_checkpoint(mg, &name))
+ return;
}
/* loop through all plocks to figure out sizes to set in
the attr fields */
r_count = 0;
+ lock_count = 0;
total_size = 0;
max_section_size = 0;
list_for_each_entry(r, &mg->resources, list) {
r_count++;
section_size = 0;
- list_for_each_entry(po, &r->locks, list)
+ list_for_each_entry(po, &r->locks, list) {
section_size += sizeof(struct pack_plock);
- list_for_each_entry(w, &r->waiters, list)
+ lock_count++;
+ }
+ list_for_each_entry(w, &r->waiters, list) {
section_size += sizeof(struct pack_plock);
+ lock_count++;
+ }
total_size += section_size;
if (section_size > max_section_size)
max_section_size = section_size;
}
- log_group(mg, "store_plocks: r_count %d total %d max_section %d",
- r_count, total_size, max_section_size);
+ log_group(mg, "store_plocks: r_count %d, lock_count %d, pp %d bytes",
+ r_count, lock_count, sizeof(struct pack_plock));
+
+ log_group(mg, "store_plocks: total %d bytes, max_section %d bytes",
+ total_size, max_section_size);
attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
attr.checkpointSize = total_size;
attr.retentionDuration = SA_TIME_MAX;
- attr.maxSections = r_count;
+ attr.maxSections = r_count + 1; /* don't know why we need +1 */
attr.maxSectionSize = max_section_size;
- attr.maxSectionIdSize = 21; /* 20 digits in max uint64 */
+ attr.maxSectionIdSize = 22;
+
+ /* 22 = 20 digits in max uint64 + "r" prefix + \0 suffix */
+
+ flags = SA_CKPT_CHECKPOINT_READ |
+ SA_CKPT_CHECKPOINT_WRITE |
+ SA_CKPT_CHECKPOINT_CREATE;
open_retry:
- rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr,
- SA_CKPT_CHECKPOINT_CREATE |
- SA_CKPT_CHECKPOINT_READ |
- SA_CKPT_CHECKPOINT_WRITE,
- 0, &h);
+ rv = saCkptCheckpointOpen(ckpt_handle, &name, &attr, flags, 0, &h);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
log_group(mg, "store_plocks: ckpt open retry");
sleep(1);
goto open_retry;
}
+ if (rv == SA_AIS_ERR_EXIST) {
+ log_group(mg, "store_plocks: ckpt already exists");
+ return;
+ }
if (rv != SA_AIS_OK) {
log_error("store_plocks: ckpt open error %d %s", rv, mg->name);
return;
}
+ log_group(mg, "store_plocks: open ckpt handle %llx", h);
mg->cp_handle = (uint64_t) h;
list_for_each_entry(r, &mg->resources, list) {
memset(&buf, 0, 32);
- len = snprintf(buf, 32, "%llu", r->number);
+ len = snprintf(buf, 32, "r%llu", r->number);
section_id.id = buf;
section_id.idLen = len + 1;
@@ -973,7 +1102,7 @@
pack_section_buf(mg, r);
- create_retry:
+ create_retry:
rv = saCkptSectionCreate(h, &section_attr, &section_buf,
section_len);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
@@ -982,7 +1111,7 @@
goto create_retry;
}
if (rv != SA_AIS_OK) {
- log_error("store_plocks: ckpt create error %d %s",
+ log_error("store_plocks: ckpt section create err %d %s",
rv, mg->name);
break;
}
@@ -1005,6 +1134,8 @@
if (!plocks_online)
return;
+ log_group(mg, "retrieve_plocks");
+
len = snprintf(name.value, SA_MAX_NAME_LENGTH, "gfsplock.%s", mg->name);
name.length = len;
@@ -1032,11 +1163,11 @@
if (rv != SA_AIS_OK) {
log_error("retrieve_plocks: ckpt iterinit error %d %s",
rv, mg->name);
- return;
+ goto out;
}
while (1) {
- next_retry:
+ next_retry:
rv = saCkptSectionIterationNext(itr, &desc);
if (rv == SA_AIS_ERR_NO_SECTIONS)
break;
@@ -1048,7 +1179,7 @@
if (rv != SA_AIS_OK) {
log_error("retrieve_plocks: ckpt iternext error %d %s",
rv, mg->name);
- break;
+ goto out_it;
}
iov.sectionId = desc.sectionId;
@@ -1056,7 +1187,7 @@
iov.dataSize = desc.sectionSize;
iov.dataOffset = 0;
- read_retry:
+ read_retry:
rv = saCkptCheckpointRead(h, &iov, 1, NULL);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
log_group(mg, "retrieve_plocks: ckpt read retry");
@@ -1066,13 +1197,19 @@
if (rv != SA_AIS_OK) {
log_error("retrieve_plocks: ckpt read error %d %s",
rv, mg->name);
- break;
+ goto out_it;
}
+ log_group(mg, "retrieve_plocks: ckpt read %llu bytes",
+ iov.readSize);
+ section_len = iov.readSize;
+
unpack_section_buf(mg, desc.sectionId.id, desc.sectionId.idLen);
}
+ out_it:
saCkptSectionIterationFinalize(itr);
+ out:
saCkptCheckpointClose(h);
}
--- cluster/group/gfs_controld/recover.c 2006/07/31 18:37:07 1.5
+++ cluster/group/gfs_controld/recover.c 2006/08/02 18:27:57 1.6
@@ -12,14 +12,6 @@
#include "lock_dlm.h"
-struct save_msg {
- struct list_head list;
- int nodeid;
- int len;
- int type;
- char buf[0];
-};
-
#define SYSFS_DIR "/sys/fs"
#define JID_INIT -9
@@ -597,12 +589,14 @@
log_group(mg, "assign_journal: new member %d got jid %d",
new->nodeid, new->jid);
+ if (mg->low_finished_nodeid == our_nodeid || mg->cp_handle)
+ store_plocks(mg);
+
/* if we're the first mounter and haven't gotten others_may_mount
yet, then don't send journals until kernel_recovery_done_first
so the second node won't mount the fs until omm. */
if (mg->low_finished_nodeid == our_nodeid) {
- store_plocks(mg);
if (mg->first_mounter && !mg->first_mounter_done) {
log_group(mg, "delay sending journals to %d",
new->nodeid);
@@ -655,6 +649,7 @@
if (hd->nodeid == our_nodeid) {
mg->got_our_options = 1;
+ mg->save_plocks = 1;
return;
}
@@ -1784,7 +1779,7 @@
}
retrieve_plocks(mg);
- /* process_saved_plocks(mg); */
+ process_saved_plocks(mg);
out:
notify_mount_client(mg);
}
next reply other threads:[~2006-08-02 18:27 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-08-02 18:27 teigland [this message]
-- strict thread matches above, loose matches on Subject: below --
2006-08-04 21:56 [Cluster-devel] cluster/group/gfs_controld lock_dlm.h plock.c teigland
2006-08-07 16:57 teigland
2006-08-08 21:19 teigland
2006-08-18 16:33 teigland
2006-08-21 17:46 teigland
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060802182758.29785.qmail@sourceware.org \
--to=teigland@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).