All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch 4/7] tabled: retry conflicting locks
@ 2010-01-15  4:13 Pete Zaitcev
  2010-01-20 19:53 ` Jeff Garzik
  0 siblings, 1 reply; 6+ messages in thread
From: Pete Zaitcev @ 2010-01-15  4:13 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Project Hail List

This problem was with us for a while, and even with this fix our start-up
is not reliable. But at least we will not be 100% guaranteed to hang as
before when restarting too quickly. So although the whole area needs some
serious reworking, this specific case was just too annoying to let it
continue.

Signed-Off-By: Pete Zaitcev <zaitcev@redhat.com>

---
 server/cldu.c |   38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

commit fa910aacff5118664177f988029cc5f8e6ef886d
Author: Master <zaitcev@lembas.zaitcev.lan>
Date:   Thu Jan 14 19:56:13 2010 -0700

    Retry the lock conflict.

diff --git a/server/cldu.c b/server/cldu.c
index 273f149..1d61672 100644
--- a/server/cldu.c
+++ b/server/cldu.c
@@ -59,6 +59,7 @@ struct cld_session {
 	 * using sleep(), neither of the timers must ever be active simultane-
 	 * ously with any other. But using one timer structure is too annoying.
 	 */
+	struct event tm_relock;
 	struct event tm_retry;
 	struct event tm_rescan;
 	struct event tm_reopen;
@@ -85,6 +86,7 @@ static int cldu_set_cldc(struct cld_session *sp, int newactive);
 static int cldu_new_sess(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
+static void try_lock(struct cld_session *sp);
 static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_put_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_get_1_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
@@ -99,6 +101,7 @@ static int cldu_close_y_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 static void add_remote(const char *name);
 static void add_chunk_node(struct cld_session *sp, const char *name);
 
+static struct timeval cldu_relock_delay = { 10, 0 };
 static struct timeval cldu_retry_delay = { 5, 0 };
 static struct timeval cldu_rescan_delay = { 50, 0 };
 static struct timeval cldu_reopen_delay = { 3, 0 };
@@ -168,6 +171,15 @@ err_oom:
 	return 0;
 }
 
+static void cldu_tm_relock(int fd, short events, void *userdata)
+{
+	struct cld_session *sp = userdata;
+
+	if (debugging)
+		applog(LOG_DEBUG, "Retrying locking of %s", sp->ffname);
+	try_lock(sp);
+}
+
 static void cldu_tm_retry(int fd, short events, void *userdata)
 {
 	struct cld_session *sp = userdata;
@@ -454,8 +466,6 @@ static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 {
 	struct cld_session *sp = carg->private;
-	struct cldc_call_opts copts;
-	int rc;
 
 	if (errc != CLE_OK) {
 		applog(LOG_ERR, "CLD open(%s) failed: %d", sp->ffname, errc);
@@ -473,6 +483,15 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	if (debugging)
 		applog(LOG_DEBUG, "CLD file \"%s\" created", sp->ffname);
 
+	try_lock(sp);
+	return 0;
+}
+
+static void try_lock(struct cld_session *sp)
+{
+	struct cldc_call_opts copts;
+	int rc;
+
 	/*
 	 * Lock the file, in case two hosts got the same hostname.
 	 */
@@ -483,8 +502,6 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	if (rc) {
 		applog(LOG_ERR, "cldc_lock call error %d", rc);
 	}
-
-	return 0;
 }
 
 static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
@@ -497,6 +514,18 @@ static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 
 	if (errc != CLE_OK) {
 		applog(LOG_ERR, "CLD lock(%s) failed: %d", sp->ffname, errc);
+		if (errc == CLE_LOCK_CONFLICT) {
+			/*
+			 * The usual reason why we get a lock conflict is
+			 * restarting too quickly and hitting the previous lock
+			 * that is going to disappear soon.
+			 *
+			 * FIXME: However, it may also be that a master
+			 * is ok and we should become a slave, e.g. start TDB.
+			 * We do not support multi-node, but we should.
+			 */
+			evtimer_add(&sp->tm_relock, &cldu_relock_delay);
+		}
 		return 0;
 	}
 
@@ -940,6 +969,7 @@ int cld_begin(const char *thishost, const char *thiscell)
 {
 	static struct cld_session *sp = &ses;
 
+	evtimer_set(&ses.tm_relock, cldu_tm_relock, &ses);
 	evtimer_set(&ses.tm_retry, cldu_tm_retry, &ses);
 	evtimer_set(&ses.tm_rescan, cldu_tm_rescan, &ses);
 	evtimer_set(&ses.tm_reopen, cldu_tm_reopen, &ses);

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [Patch 4/7] tabled: retry conflicting locks
  2010-01-15  4:13 [Patch 4/7] tabled: retry conflicting locks Pete Zaitcev
@ 2010-01-20 19:53 ` Jeff Garzik
  2010-01-20 20:16   ` Pete Zaitcev
  0 siblings, 1 reply; 6+ messages in thread
From: Jeff Garzik @ 2010-01-20 19:53 UTC (permalink / raw)
  To: Pete Zaitcev; +Cc: Project Hail List

On 01/14/2010 11:13 PM, Pete Zaitcev wrote:
> This problem was with us for a while, and even with this fix our start-up
> is not reliable. But at least we will not be 100% guaranteed to hang as
> before when restarting too quickly. So although the whole area needs some
> serious reworking, this specific case was just too annoying to let it
> continue.
>
> Signed-Off-By: Pete Zaitcev<zaitcev@redhat.com>
>
> ---
>   server/cldu.c |   38 ++++++++++++++++++++++++++++++++++----
>   1 file changed, 34 insertions(+), 4 deletions(-)

This is not correct.  CLD has blocking locks.  You issue the LOCK op, 
and will be notified when you have acquired the lock, possibly hours or 
days later.  There is no need to retry anything...

	Jeff



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch 4/7] tabled: retry conflicting locks
  2010-01-20 19:53 ` Jeff Garzik
@ 2010-01-20 20:16   ` Pete Zaitcev
  2010-01-20 22:00     ` Jeff Garzik
  0 siblings, 1 reply; 6+ messages in thread
From: Pete Zaitcev @ 2010-01-20 20:16 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Project Hail List

On Wed, 20 Jan 2010 14:53:17 -0500, Jeff Garzik <jeff@garzik.org> wrote:
> On 01/14/2010 11:13 PM, Pete Zaitcev wrote:

> > This problem was with us for a while, and even with this fix our start-up
> > is not reliable. But at least we will not be 100% guaranteed to hang as
> > before when restarting too quickly. So although the whole area needs some
> > serious reworking, this specific case was just too annoying to let it
> > continue.

> This is not correct.  CLD has blocking locks.  You issue the LOCK op, 
> and will be notified when you have acquired the lock, possibly hours or 
> days later.  There is no need to retry anything...

Meanwhile, there's no way to cancel an outstanding lock request
short of blowing off the whole session. I'll switch to LOCK when
you fix that, but currently TRYLOCK is the only way (which BTW you
use in cldcli too).

N.B. ncld continues with this approach. In fact currently it does not
even have a method that performs a blocking lock.

-- Pete

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch 4/7] tabled: retry conflicting locks
  2010-01-20 20:16   ` Pete Zaitcev
@ 2010-01-20 22:00     ` Jeff Garzik
  2010-01-20 22:56       ` Pete Zaitcev
  0 siblings, 1 reply; 6+ messages in thread
From: Jeff Garzik @ 2010-01-20 22:00 UTC (permalink / raw)
  To: Pete Zaitcev; +Cc: Project Hail List

On 01/20/2010 03:16 PM, Pete Zaitcev wrote:
> On Wed, 20 Jan 2010 14:53:17 -0500, Jeff Garzik<jeff@garzik.org>  wrote:
>> On 01/14/2010 11:13 PM, Pete Zaitcev wrote:
>
>>> This problem was with us for a while, and even with this fix our start-up
>>> is not reliable. But at least we will not be 100% guaranteed to hang as
>>> before when restarting too quickly. So although the whole area needs some
>>> serious reworking, this specific case was just too annoying to let it
>>> continue.
>
>> This is not correct.  CLD has blocking locks.  You issue the LOCK op,
>> and will be notified when you have acquired the lock, possibly hours or
>> days later.  There is no need to retry anything...
>
> Meanwhile, there's no way to cancel an outstanding lock request
> short of blowing off the whole session. I'll switch to LOCK when
> you fix that, but currently TRYLOCK is the only way (which BTW you
> use in cldcli too).

Do you mean cancelling someone else's lock request?  That is not 
something that meshes with the design.  If you mean cancelling your own 
lock request, that's probably reasonable.

But the entire logic behind LOCK is central to what needs to be done: 
ensure one and only one session holds a lock, until the lock is released 
or the client dies (thus forcing the server to time out and release the 
dead session's locks).

If you are restarting quickly, a lock-timeout wait does not seem 
unreasonable.


> N.B. ncld continues with this approach. In fact currently it does not
> even have a method that performs a blocking lock.

That's definitely a problem, as blocking locks are pretty central to 
CLD's design.  If you want to own a resource, you get a blocking lock. 
You only own the resource as long as the session is alive, and you have 
not released the lock yourself.  If you do not immediately acquire the 
lock, (1) you should not access the shared resource as master, and (2) 
you will be notified immediately when atomic lock acquisition occurs.

TRYLOCK is painful in the cloud because it encourages programmers, with 
patch #4 being a perfect example, to create racy polling-lock solutions 
where forward [lock] progress is not guaranteed.  IOW, the lock-polling 
loop should be in the server, with the client being asynchronously 
notified of acquisition.  TRYLOCK mainly exists for the less-common 
situation of "if (!trylock) exit(0)" type of cloud client execution.

NFS and other protocols in this space have repeatedly shown that polling 
locks is a painful, racy, byte-heavy solution for lock acquisition.

If there is a problem implementing blocking locks in the protocol or 
client, let me know, and we'll fix it.

	Jeff


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch 4/7] tabled: retry conflicting locks
  2010-01-20 22:00     ` Jeff Garzik
@ 2010-01-20 22:56       ` Pete Zaitcev
  2010-02-03 23:10         ` Jeff Garzik
  0 siblings, 1 reply; 6+ messages in thread
From: Pete Zaitcev @ 2010-01-20 22:56 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Project Hail List

> >> This is not correct.  CLD has blocking locks.  You issue the LOCK op,
> >> and will be notified when you have acquired the lock, possibly hours or
> >> days later.  There is no need to retry anything...
> >
> > Meanwhile, there's no way to cancel an outstanding lock request
> > short of blowing off the whole session. I'll switch to LOCK when
> > you fix that, but currently TRYLOCK is the only way (which BTW you
> > use in cldcli too).
> 
> Do you mean cancelling someone else's lock request?  That is not 
> something that meshes with the design.  If you mean cancelling your own 
> lock request, that's probably reasonable.

Own of course.

> If there is a problem implementing blocking locks in the protocol or 
> client, let me know, and we'll fix it.

Is there a way to cancel an outstanding lock request? How? You seem
to think that there's no problem.

Actually I think a cmo_close on a handle that has outstanding
requests of any kind should drop them, so I was incorrect about
killing the session being the only way. Maybe I can create some
kind of ncld_open_locked() by using that feature. That ought to
be good enough.

-- Pete

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch 4/7] tabled: retry conflicting locks
  2010-01-20 22:56       ` Pete Zaitcev
@ 2010-02-03 23:10         ` Jeff Garzik
  0 siblings, 0 replies; 6+ messages in thread
From: Jeff Garzik @ 2010-02-03 23:10 UTC (permalink / raw)
  To: Pete Zaitcev; +Cc: Project Hail List

On 01/20/2010 05:56 PM, Pete Zaitcev wrote:
> Is there a way to cancel an outstanding lock request? How? You seem
> to think that there's no problem.
>
> Actually I think a cmo_close on a handle that has outstanding
> requests of any kind should drop them, so I was incorrect about
> killing the session being the only way. Maybe I can create some
> kind of ncld_open_locked() by using that feature. That ought to
> be good enough.

CLOSE always removes outstanding locks, FWIW...  always has.

	Jeff



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2010-02-03 23:10 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-15  4:13 [Patch 4/7] tabled: retry conflicting locks Pete Zaitcev
2010-01-20 19:53 ` Jeff Garzik
2010-01-20 20:16   ` Pete Zaitcev
2010-01-20 22:00     ` Jeff Garzik
2010-01-20 22:56       ` Pete Zaitcev
2010-02-03 23:10         ` Jeff Garzik

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.