All of lore.kernel.org
 help / color / mirror / Atom feed
From: Pete Zaitcev <zaitcev@redhat.com>
To: Jeff Garzik <jeff@garzik.org>
Cc: Project Hail List <hail-devel@vger.kernel.org>
Subject: [Patch 4/7] tabled: retry conflicting locks
Date: Thu, 14 Jan 2010 21:13:12 -0700	[thread overview]
Message-ID: <20100114211312.2df3d349@redhat.com> (raw)

The lock conflict on restart has been with us for a while, and even with this
fix our start-up is not reliable. But at least we will no longer be 100%
guaranteed to hang, as before, when restarting too quickly. So although the
whole area needs some serious reworking, this specific case was just too
annoying to let it continue.

Signed-Off-By: Pete Zaitcev <zaitcev@redhat.com>

---
 server/cldu.c |   38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

commit fa910aacff5118664177f988029cc5f8e6ef886d
Author: Master <zaitcev@lembas.zaitcev.lan>
Date:   Thu Jan 14 19:56:13 2010 -0700

    Retry the lock after a lock conflict.

diff --git a/server/cldu.c b/server/cldu.c
index 273f149..1d61672 100644
--- a/server/cldu.c
+++ b/server/cldu.c
@@ -59,6 +59,7 @@ struct cld_session {
 	 * using sleep(), neither of the timers must ever be active simultane-
 	 * ously with any other. But using one timer structure is too annoying.
 	 */
+	struct event tm_relock;
 	struct event tm_retry;
 	struct event tm_rescan;
 	struct event tm_reopen;
@@ -85,6 +86,7 @@ static int cldu_set_cldc(struct cld_session *sp, int newactive);
 static int cldu_new_sess(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
+static void try_lock(struct cld_session *sp);
 static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_put_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_get_1_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
@@ -99,6 +101,7 @@ static int cldu_close_y_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 static void add_remote(const char *name);
 static void add_chunk_node(struct cld_session *sp, const char *name);
 
+static struct timeval cldu_relock_delay = { 10, 0 };
 static struct timeval cldu_retry_delay = { 5, 0 };
 static struct timeval cldu_rescan_delay = { 50, 0 };
 static struct timeval cldu_reopen_delay = { 3, 0 };
@@ -168,6 +171,15 @@ err_oom:
 	return 0;
 }
 
+static void cldu_tm_relock(int fd, short events, void *userdata)
+{
+	struct cld_session *sp = userdata;
+
+	if (debugging)
+		applog(LOG_DEBUG, "Retrying locking of %s", sp->ffname);
+	try_lock(sp);
+}
+
 static void cldu_tm_retry(int fd, short events, void *userdata)
 {
 	struct cld_session *sp = userdata;
@@ -454,8 +466,6 @@ static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 {
 	struct cld_session *sp = carg->private;
-	struct cldc_call_opts copts;
-	int rc;
 
 	if (errc != CLE_OK) {
 		applog(LOG_ERR, "CLD open(%s) failed: %d", sp->ffname, errc);
@@ -473,6 +483,15 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	if (debugging)
 		applog(LOG_DEBUG, "CLD file \"%s\" created", sp->ffname);
 
+	try_lock(sp);
+	return 0;
+}
+
+static void try_lock(struct cld_session *sp)
+{
+	struct cldc_call_opts copts;
+	int rc;
+
 	/*
 	 * Lock the file, in case two hosts got the same hostname.
 	 */
@@ -483,8 +502,6 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	if (rc) {
 		applog(LOG_ERR, "cldc_lock call error %d", rc);
 	}
-
-	return 0;
 }
 
 static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
@@ -497,6 +514,18 @@ static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 
 	if (errc != CLE_OK) {
 		applog(LOG_ERR, "CLD lock(%s) failed: %d", sp->ffname, errc);
+		if (errc == CLE_LOCK_CONFLICT) {
+			/*
+			 * The usual reason why we get a lock conflict is
+			 * restarting too quickly and hitting the previous lock
+			 * that is going to disappear soon.
+			 *
+			 * FIXME: However, it may also be that a master
+			 * is ok and we should become a slave, e.g. start TDB.
+			 * We do not support multi-node, but we should.
+			 */
+			evtimer_add(&sp->tm_relock, &cldu_relock_delay);
+		}
 		return 0;
 	}
 
@@ -940,6 +969,7 @@ int cld_begin(const char *thishost, const char *thiscell)
 {
 	static struct cld_session *sp = &ses;
 
+	evtimer_set(&ses.tm_relock, cldu_tm_relock, &ses);
 	evtimer_set(&ses.tm_retry, cldu_tm_retry, &ses);
 	evtimer_set(&ses.tm_rescan, cldu_tm_rescan, &ses);
 	evtimer_set(&ses.tm_reopen, cldu_tm_reopen, &ses);

             reply	other threads:[~2010-01-15  4:13 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-01-15  4:13 Pete Zaitcev [this message]
2010-01-20 19:53 ` [Patch 4/7] tabled: retry conflicting locks Jeff Garzik
2010-01-20 20:16   ` Pete Zaitcev
2010-01-20 22:00     ` Jeff Garzik
2010-01-20 22:56       ` Pete Zaitcev
2010-02-03 23:10         ` Jeff Garzik

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100114211312.2df3d349@redhat.com \
    --to=zaitcev@redhat.com \
    --cc=hail-devel@vger.kernel.org \
    --cc=jeff@garzik.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.