linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Abhijit Bhopatkar <abhopatk@cisco.com>
To: Goldwyn Rodrigues <rgoldwyn@suse.com>
Cc: linux-raid@vger.kernel.org
Subject: Re: Potential race in dlm based messaging md-cluster.c
Date: Fri, 01 May 2015 00:21:52 +0530	[thread overview]
Message-ID: <554279C8.6070305@cisco.com> (raw)
In-Reply-To: <554278CF.4000401@cisco.com>

On 01/05/15 12:17 am, Abhijit Bhopatkar wrote:
> On 01/05/15 12:06 am, Abhijit Bhopatkar wrote:
>> There is a possibility of a receiver losing out on messages in certain
>> corner conditions. One of the buggy case is if there is are two sender
>> ready with messages to be sent. Sender 1 initially gets the TOKEN lock
>> and proceeds.
>> After initial processing the sender of message 1 _will_ release TOKEN as
>> soon as receiver releases ACK, it does not wait till ACK CR is
>> re-acquired by receiver.
>>
> I could not come up with any solution except to add one more lock
> resource for now we will call it "SYNC"
>
Here is POC patch (completely untested only as an RFC not even compiled)
If the solution is agreed upon I will go ahead and test it.

Abhijit
---
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index fcfc4b9..addbbb4 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -62,6 +62,7 @@ struct md_cluster_info {
  	struct dlm_lock_resource *ack_lockres;
  	struct dlm_lock_resource *message_lockres;
  	struct dlm_lock_resource *token_lockres;
+	struct dlm_lock_resource *sync_lockres;
  	struct dlm_lock_resource *no_new_dev_lockres;
  	struct md_thread *recv_thread;
  	struct completion newdisk_completion;
@@ -94,7 +95,7 @@ static void sync_ast(void *arg)
  	complete(&res->completion);
  }
  
-static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
+static int dlm_lock_queue(struct dlm_lock_resource *res, int mode)
  {
  	int ret = 0;
  
@@ -102,12 +103,26 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
  	ret = dlm_lock(res->ls, mode, &res->lksb,
  			res->flags, res->name, strlen(res->name),
  			0, sync_ast, res, res->bast);
-	if (ret)
-		return ret;
+	return ret;
+}
+
+static int dlm_wait_for_lock_grant(struct dlm_lock_resource *res)
+{
  	wait_for_completion(&res->completion);
  	return res->lksb.sb_status;
  }
  
+static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
+{
+	int ret = 0;
+	ret = dlm_lock_queue(res,mode);
+
+	if (ret)
+		return ret;
+	ret = dlm_wait_for_lock_grant(res);
+	return ret;
+}
+
  static int dlm_unlock_sync(struct dlm_lock_resource *res)
  {
  	return dlm_lock_sync(res, DLM_LOCK_NL);
@@ -466,6 +481,7 @@ static void recv_daemon(struct md_thread *thread)
  	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
  	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
  	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
+	struct dlm_lock_resource *sync_lockres = cinfo->sync_lockres;
  	struct cluster_msg msg;
  
  	/*get CR on Message*/
@@ -478,6 +494,9 @@ static void recv_daemon(struct md_thread *thread)
  	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
  	process_recvd_msg(thread->mddev, &msg);
  
+	/*queue EX on TOKEN blocks new senders till we acquire CR on ACK */
+	dlm_lock_queue(sync_lockres,DLM_LOCK_EX);
+
  	/*release CR on ack_lockres*/
  	dlm_unlock_sync(ack_lockres);
  	/*up-convert to EX on message_lockres*/
@@ -486,6 +505,11 @@ static void recv_daemon(struct md_thread *thread)
  	dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
  	/*release CR on message_lockres*/
  	dlm_unlock_sync(message_lockres);
+
+	/*wait till EX on token is granted */
+	dlm_wait_for_lock_grant(token_lockres);
+	/*release EX on token_lockres*/
+	dlm_unlock_sync(sync_lockres);
  }
  
  /* lock_comm()
@@ -500,11 +524,16 @@ static int lock_comm(struct md_cluster_info *cinfo)
  	if (error)
  		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
  				__func__, __LINE__, error);
+	error = dlm_lock_sync(cinfo->sync_lockres, DLM_LOCK_EX);
+	if (error)
+		pr_err("md-cluster(%s:%d): failed to get EX on SYNC (%d)\n",
+				__func__, __LINE__, error);
  	return error;
  }
  
  static void unlock_comm(struct md_cluster_info *cinfo)
  {
+	dlm_unlock_sync(cinfo->sync_lockres);
  	dlm_unlock_sync(cinfo->token_lockres);
  }
  
@@ -673,6 +702,9 @@ static int join(struct mddev *mddev, int nodes)
  	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
  	if (!cinfo->token_lockres)
  		goto err;
+	cinfo->sync_lockres = lockres_init(mddev, "sync", NULL, 0);
+	if (!cinfo->sync_lockres)
+		goto err;
  	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
  	if (!cinfo->ack_lockres)
  		goto err;
@@ -711,6 +743,7 @@ static int join(struct mddev *mddev, int nodes)
  err:
  	lockres_free(cinfo->message_lockres);
  	lockres_free(cinfo->token_lockres);
+	lockres_free(cinfo->sync_lockres);
  	lockres_free(cinfo->ack_lockres);
  	lockres_free(cinfo->no_new_dev_lockres);
  	lockres_free(cinfo->bitmap_lockres);
@@ -733,6 +766,7 @@ static int leave(struct mddev *mddev)
  	md_unregister_thread(&cinfo->recv_thread);
  	lockres_free(cinfo->message_lockres);
  	lockres_free(cinfo->token_lockres);
+	lockres_free(cinfo->sync_lockres);
  	lockres_free(cinfo->ack_lockres);
  	lockres_free(cinfo->no_new_dev_lockres);
  	lockres_free(cinfo->sb_lock);


  reply	other threads:[~2015-04-30 18:51 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CAE3Hb8oss1JZ2u7g7OQQgrEtgQ1vbQou04isiS6eEqbS=uzbhw@mail.gmail.com>
     [not found] ` <CAE3Hb8qNczD30RrcHFYCR90Jf9QFD-XH=x89MAu4Dpmm80se0A@mail.gmail.com>
     [not found]   ` <554251EA.3000807@suse.com>
     [not found]     ` <CAE3Hb8pJ=0MB6EX5jVch28gj-gnf0Mp1wyzxBfWjzLf=SuV4sQ@mail.gmail.com>
2015-04-30 18:36       ` Potential race in dlm based messaging md-cluster.c Abhijit Bhopatkar
2015-04-30 18:47         ` Abhijit Bhopatkar
2015-04-30 18:51           ` Abhijit Bhopatkar [this message]
2015-05-05  9:22         ` Lidong Zhong
2015-05-05  9:44           ` Abhijit Bhopatkar
2015-05-05 12:10             ` Abhijit Bhopatkar
2015-05-07  2:43               ` Lidong Zhong
2015-05-07  9:14                 ` Abhijit Bhopatkar
2015-05-08  5:06                   ` Lidong Zhong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=554279C8.6070305@cisco.com \
    --to=abhopatk@cisco.com \
    --cc=linux-raid@vger.kernel.org \
    --cc=rgoldwyn@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).